dqlite-1.16.7/.clang-format
BasedOnStyle: Chromium
BreakBeforeBraces: Custom
BraceWrapping:
  AfterFunction: true
  AfterStruct: false
Cpp11BracedListStyle: false
IndentWidth: 8
UseTab: ForContinuationAndIndentation
PointerAlignment: Right
AllowAllParametersOfDeclarationOnNextLine: false

dqlite-1.16.7/.clang-tidy
Checks: '-*,readability-identifier-naming'
HeaderFilterRegex: '.*'
WarningsAsErrors: '*'
CheckOptions:
  - key: readability-identifier-naming.StructCase
    value: lower_case
  - key: readability-identifier-naming.UnionCase
    value: lower_case
  - key: readability-identifier-naming.FunctionCase
    value: lower_case
  - key: readability-identifier-naming.TypedefCase
    value: lower_case

dqlite-1.16.7/.dir-locals.el
((nil . ((fill-column . 80)))
 (c-mode . ((c-file-style . "linux-tabs-only")
            (flycheck-gcc-definitions . ("_GNU_SOURCE"))
            (flycheck-clang-definitions . ("_GNU_SOURCE")))))

dqlite-1.16.7/.github/dependabot.yml
# Set update schedule for GitHub Actions
# for more info see: https://docs.github.com/en/code-security/dependabot/working-with-dependabot/keeping-your-actions-up-to-date-with-dependabot
version: 2
updates:
  - package-ecosystem: "github-actions"
    directory: "/" # checks for workflow files in .github/workflows
    schedule:
      interval: "weekly"

dqlite-1.16.7/.github/workflows/build-and-test.yml
name: CI Tests
on:
  - push
  - pull_request
jobs:
  build-and-test:
    strategy:
      fail-fast: false
      matrix:
        os:
          - ubuntu-20.04
          - ubuntu-22.04
          - ubuntu-24.04
        compiler:
          - gcc
          - clang
        dqlite-next:
          - yes
          - no
    runs-on: ${{ matrix.os }}
    steps:
      - uses: actions/checkout@v4
      - name: Set up dependencies
        run: |
          sudo apt update
          sudo apt install -y lcov libsqlite3-dev liblz4-dev libuv1-dev
      - name: Build dqlite
        env:
          CC: ${{ matrix.compiler }}
        run: |
          autoreconf -i
          ./configure --enable-debug --enable-code-coverage --enable-sanitize \
            --enable-build-raft --enable-dqlite-next=${{ matrix.dqlite-next }}
          make -j4 unit-test integration-test \
            raft-core-fuzzy-test \
            raft-core-integration-test \
            raft-core-unit-test \
            raft-uv-integration-test \
            raft-uv-unit-test
      - name: Test
        env:
          CC: ${{ matrix.compiler }}
          LIBDQLITE_TRACE: 1
        run: |
          make check || (cat ./test-suite.log && false)
      - name: Coverage
        env:
          CC: ${{ matrix.compiler }}
        if: ${{ matrix.os == 'ubuntu-22.04' && matrix.compiler == 'gcc' }}
        run: |
          make code-coverage-capture
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        with:
          verbose: true

dqlite-1.16.7/.github/workflows/cla-check.yml
name: Canonical CLA
on:
  - pull_request
jobs:
  cla-check:
    runs-on: ubuntu-20.04
    steps:
      - name: Check if CLA signed
        uses: canonical/has-signed-canonical-cla@v1
dqlite-1.16.7/.github/workflows/coverity.yml
name: Coverity
on:
  push:
    branches:
      - master
jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Download Coverity Build Tool
        run: |
          wget -q https://scan.coverity.com/download/cxx/linux64 --post-data "token=$TOKEN&project=canonical/dqlite" -O cov-analysis-linux64.tar.gz
          mkdir cov-analysis-linux64
          tar xzf cov-analysis-linux64.tar.gz --strip 1 -C cov-analysis-linux64
        env:
          TOKEN: ${{ secrets.COVERITY_SCAN_TOKEN }}
      - name: Install dependencies
        run: |
          sudo apt-get update -qq
          sudo apt-get install -qq gcc libsqlite3-dev liblz4-dev libuv1-dev
      - name: Run coverity
        run: |
          export PATH="$(pwd)/cov-analysis-linux64/bin:${PATH}"

          # Configure
          autoreconf -i
          mkdir build
          cd build
          ../configure --enable-build-raft

          # Build
          cov-build --dir cov-int make -j4
          tar czvf dqlite.tgz cov-int

          # Submit the results
          curl \
            --form project=canonical/dqlite \
            --form token=${TOKEN} \
            --form email=mathieu.bordere@canonical.com \
            --form file=@dqlite.tgz \
            --form version=master \
            --form description="${GITHUB_SHA}" \
            https://scan.coverity.com/builds?project=canonical/dqlite
        env:
          TOKEN: ${{ secrets.COVERITY_SCAN_TOKEN }}

dqlite-1.16.7/.github/workflows/downstream.yml
name: Downstream checks
on:
  issue_comment:
    types: [created, edited]
jobs:
  dqlite:
    if: ${{ github.event.issue.pull_request && contains(github.event.comment.body, 'please test downstream') }}
    runs-on: ubuntu-22.04
    steps:
      - name: Install apt deps
        run: |
          sudo apt-get update -qq
          sudo apt-get install -qq automake libtool gcc make liblz4-dev libuv1-dev libsqlite3-dev
      - name: Check out libbacktrace
        uses: actions/checkout@v4
        with:
          repository: ianlancetaylor/libbacktrace
          path: libbacktrace
      - name: Install libbacktrace
        run: |
          cd libbacktrace
          autoreconf -i
          ./configure
          sudo make -j$(nproc) install
          sudo ldconfig
      - name: Check out dqlite
        uses: actions/checkout@v4
        with:
          ref: refs/pull/${{ github.event.issue.number }}/merge
          path: dqlite
      - name: Install dqlite
        run: |
          cd dqlite
          autoreconf -i
          ./configure --enable-debug --enable-sanitize --enable-backtrace --enable-build-raft
          sudo make -j$(nproc)
          sudo make install
          sudo ldconfig
      - name: Install Go
        uses: actions/setup-go@v5
      - name: Check out go-dqlite
        uses: actions/checkout@v4
        with:
          repository: canonical/go-dqlite
          path: go-dqlite
      - name: Test go-dqlite
        env:
          GO_DQLITE_MULTITHREAD: '1'
        run: |
          cd go-dqlite
          go get -tags libsqlite3 -t ./...
          go test -asan -v ./...
          VERBOSE=1 ASAN=-asan ./test/dqlite-demo.sh
          VERBOSE=1 ASAN=-asan DISK=1 ./test/dqlite-demo.sh
          VERBOSE=1 ASAN=-asan ./test/roles.sh
          VERBOSE=1 ASAN=-asan DISK=1 ./test/roles.sh
          VERBOSE=1 ASAN=-asan ./test/recover.sh
          VERBOSE=1 ASAN=-asan DISK=1 ./test/recover.sh
  jepsen:
    if: ${{ github.event.issue.pull_request && contains(github.event.comment.body, 'please test downstream') }}
    uses: canonical/jepsen.dqlite/.github/workflows/test-build-run.yml@master
    with:
      dqlite-ref: refs/pull/${{ github.event.issue.number }}/head
      workloads: >
        ['append', 'bank', 'set']
      nemeses: >
        ['none', 'partition', 'kill', 'stop', 'disk', 'member',
         'partition,stop', 'partition,kill', 'partition,member',
         'packet,stop', 'pause']
      disk: >
        ['0']

dqlite-1.16.7/.github/workflows/external-raft.yml
name: CI Tests (external libraft)
on:
  - push
  - pull_request
jobs:
  build-and-test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Setup dependencies
        run: |
          sudo apt update
          sudo apt install -y libsqlite3-dev liblz4-dev libuv1-dev
      - name: Build raft
        run: |
          git clone https://github.com/canonical/raft --depth 1
          cd raft
          autoreconf -i
          ./configure --enable-debug --enable-sanitize
          make -j4
          sudo make install
          sudo ldconfig
      - name: Build dqlite
        run: |
          autoreconf -i
          ./configure --enable-debug --enable-sanitize
          make -j4
      - name: Test
        run: |
          export LIBRAFT_TRACE=1 LIBDQLITE_TRACE=1
          make -j4 check || (cat ./test-suite.log && false)

dqlite-1.16.7/.github/workflows/latest-deps.yml
name: CI Tests (latest deps)
on:
  - push
  - pull_request
jobs:
  build-and-test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Raise aio-max-nr
        run: |
          sysctl fs.aio-max-nr
          sudo sysctl -w fs.aio-max-nr=1000000
      - name: Install latest libuv
        run: |
          version="$(curl -L 'https://dist.libuv.org/dist' | grep -o 'v[0-9]\.[0-9]\{1,2\}\.[0-9]\{1,2\}' | sort -V -r | head -n1)"
          echo "Selected libuv $version"
          curl -LO "https://dist.libuv.org/dist/$version/libuv-$version.tar.gz"
          tar xzf "libuv-$version.tar.gz"
          cd "libuv-$version"
          sh autogen.sh
          ./configure
          make -j4
          sudo make install
      - name: Install latest liblz4
        run: |
          mkdir lz4
          cd lz4
          git init
          git remote add github 'https://github.com/lz4/lz4'
          git fetch github 'refs/tags/*:refs/tags/*'
          version="$(git tag | sort -V -r | head -n1)"
          echo "Selected lz4 $version"
          git checkout "$version"
          make -j4
          sudo make install
      - name: ldconfig
        run: |
          sudo ldconfig
      - name: Get latest SQLite
        run: |
          relative="$(curl -L 'https://sqlite.org/download.html' | grep '^PRODUCT' | grep 'amalgamation' | cut -d',' -f3)"
          curl -LO "https://sqlite.org/$relative"
          name="$(basename "$relative" .zip)"
          echo "Selected $name"
          unzip "$name.zip"
          cd "$name"
          cp sqlite3.{c,h} "$GITHUB_WORKSPACE"
      - name: Build dqlite
        run: |
          autoreconf -i
          ./configure --enable-debug --enable-sanitize --enable-build-raft --enable-build-sqlite
          make -j4 unit-test integration-test \
            raft-core-fuzzy-test \
            raft-core-integration-test \
            raft-core-unit-test \
            raft-uv-integration-test \
            raft-uv-unit-test
          ldd .libs/libdqlite.so
      - name: Test
        run: |
          export LIBDQLITE_TRACE=1
          make check || (cat ./test-suite.log && false)

dqlite-1.16.7/.github/workflows/linting.yml
name: Linting
on:
  - push
  - pull_request
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 2
      - name: Install apt dependencies
        run: |
          sudo apt update
          sudo apt install -y libsqlite3-dev liblz4-dev libuv1-dev bear
      - uses: KyleMayes/install-llvm-action@master
        with:
          version: 17
      - name: Run clang-format
        run: |
          find . \( -name '*.c' -or -name '*.h' \) -not -name 'munit.*' -path ./llvm -prune | xargs ./llvm/bin/clang-format --style=file --dry-run -Werror
      - name: Run clang-tidy
        run: |
          shopt -s globstar
          bear -- cc -D_GNU_SOURCE -DHAVE_LINUX_AIO_ABI_H -c {src,test}/**/*.c
          git show -U0 --first-parent | ./clang-tidy-diff.py -p1 -config-file=.clang-tidy -clang-tidy-binary=./llvm/bin/clang-tidy -use-color

dqlite-1.16.7/.github/workflows/packages.yml
name: Build PPA source packages
on:
  push:
    branches:
      - master
jobs:
  build:
    if: github.repository == 'canonical/dqlite'
    strategy:
      fail-fast: false
      matrix:
        target:
          - focal
          - jammy
          - mantic
          - noble
    runs-on: ubuntu-20.04
    environment:
      name: ppa
    steps:
      - name: Clone the repositories
        run: |
          git clone https://github.com/canonical/dqlite
          git clone https://github.com/canonical/dqlite-ppa -b dqlite --depth 1
      - name: Setup dependencies
        run: |
          sudo apt-get update -qq
          sudo apt-get install -qq debhelper devscripts gnupg
      - name: Setup GPG signing key
        env:
          PPA_SECRET_KEY: ${{ secrets.PPA_SECRET_KEY }}
        run: |
          echo "$PPA_SECRET_KEY" > private-key.asc
          gpg --import --batch private-key.asc
      - name: Delete GPG signing key file
        if: always()
        run: |
          rm -f private-key.asc
      - name: Build source package
        env:
          DEBFULLNAME: "Github Actions"
          DEBEMAIL: "dqlitebot@lists.canonical.com"
          TARGET: ${{ matrix.target }}
        run: |
          cp -R dqlite-ppa/debian dqlite/
          cd dqlite/
          VERSION="$(git describe --tags | sed -e "s/^v//" -e "s/-/+git/")"
          dch --create \
            --distribution ${TARGET} \
            --package dqlite \
            --newversion ${VERSION}~${TARGET}1 \
            "Automatic build from Github"
          debuild -S -sa -d -k${{ vars.PPA_PUBLIC_KEY }}
      - name: Upload to Launchpad
        run: |
          dput -U -u ppa:dqlite/dev *.changes

dqlite-1.16.7/.gitignore
*.a
*.gcda
*.gcno
*.la
*.lo
*.log
*.o
*.so
*.trs
.deps
.dirstamp
.libs
Makefile
Makefile.in
aclocal.m4
aminclude_static.am
autom4te*.cache
confdefs.h
config.status
configure
coverage/
coverage.info
unit-test
integration-test
dqlite.pc
libtool
stamp-h*
sqlite3.c
raft-core-fuzzy-test
raft-core-integration-test
raft-core-unit-test
raft-uv-integration-test
raft-uv-unit-test

dqlite-1.16.7/AUTHORS
Unless mentioned otherwise in a specific file's header, all code in this
project is released under the LGPL v3 license. The list of authors and
contributors can be retrieved from the git commit history and in some cases,
the file headers.

dqlite-1.16.7/CODE_OF_CONDUCT.md
dqlite has adopted the [Ubuntu Code of Conduct][coc].

[coc]: https://ubuntu.com/community/ethos/code-of-conduct

dqlite-1.16.7/CONTRIBUTING.md
# Contributing to dqlite

The dqlite team welcomes external contributions via GitHub pull requests. To
get your PR merged, you need to sign [Canonical's contributor license
agreement (CLA)][cla]. This is straightforward to do once you have an account
on [Launchpad][lp]; if you don't, you can create one [here][signup].
[cla]: https://ubuntu.com/legal/contributors
[lp]: https://launchpad.net
[signup]: https://launchpad.net/+login

dqlite-1.16.7/Dockerfile
# FROM debian:buster-slim as dqlite-lib-builder
FROM ubuntu as dqlite-lib-builder
ARG DEBIAN_FRONTEND="noninteractive"
ENV TZ=Europe/London
ENV LD_LIBRARY_PATH=/usr/local/lib
ENV GOROOT=/usr/local/go
ENV GOPATH=/go
ENV PATH=$GOPATH/bin:$GOROOT/bin:$PATH

RUN apt-get update && apt-get install -y git build-essential dh-autoreconf pkg-config libuv1-dev libsqlite3-dev liblz4-dev tcl8.6 wget

WORKDIR /opt
RUN git clone https://github.com/canonical/raft.git && \
    git clone https://github.com/canonical/go-dqlite.git && \
    wget -c https://golang.org/dl/go1.15.2.linux-amd64.tar.gz -O - | tar -xzf - -C /usr/local

WORKDIR /opt/raft
RUN autoreconf -i && ./configure && make && make install

WORKDIR /opt/dqlite
COPY . .
RUN autoreconf -i && ./configure && make && make install

WORKDIR /opt/go-dqlite
RUN go get -d -v ./... && \
    go install -tags libsqlite3 ./cmd/dqlite-demo && \
    go install -tags libsqlite3 ./cmd/dqlite

# FROM debian:buster-slim
FROM ubuntu
ARG DEBIAN_FRONTEND="noninteractive"
ENV TZ=Europe/London
ENV LD_LIBRARY_PATH=/usr/local/lib
ENV PATH=/opt:$PATH

COPY --from=dqlite-lib-builder /go/bin /opt/
COPY --from=dqlite-lib-builder /usr/local/lib /usr/local/lib
COPY --from=dqlite-lib-builder \
    /usr/lib/x86_64-linux-gnu/libuv.so \
    /usr/lib/x86_64-linux-gnu/libuv.so.1 \
    /usr/lib/x86_64-linux-gnu/libuv.so.1.0.0 \
    /usr/lib/
COPY --from=dqlite-lib-builder \
    /lib/x86_64-linux-gnu/libsqlite3.so \
    /lib/x86_64-linux-gnu/libsqlite3.so.0 \
    /usr/lib/x86_64-linux-gnu/

dqlite-1.16.7/LICENSE
All files in this repository are licensed as follows. If you contribute
to this repository, it is assumed that you license your contribution
under the same license unless you state otherwise.

All files Copyright (C) 2017-2019 Canonical Ltd. unless otherwise specified
in the file.

This software is licensed under the LGPLv3, included below.

As a special exception to the GNU Lesser General Public License version 3
("LGPL3"), the copyright holders of this Library give you permission to
convey to a third party a Combined Work that links statically or dynamically
to this Library without providing any Minimal Corresponding Source or
Minimal Application Code as set out in 4d or providing the installation
information set out in section 4e, provided that you comply with the other
provisions of LGPL3 and provided that you meet, for the Application the
terms and conditions of the license(s) which apply to the Application.

Except as stated in this special exception, the provisions of LGPL3 will
continue to comply in full to this Library. If you modify this Library, you
may apply this exception to your version of this Library, but you are not
obliged to do so. If you do not wish to do so, delete this exception
statement from your version. This exception does not (and cannot) modify any
license terms which apply to the Application, with which you must still
comply.

                   GNU LESSER GENERAL PUBLIC LICENSE
                       Version 3, 29 June 2007

 Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.
  This version of the GNU Lesser General Public License incorporates
the terms and conditions of version 3 of the GNU General Public
License, supplemented by the additional permissions listed below.

  0. Additional Definitions.

  As used herein, "this License" refers to version 3 of the GNU Lesser
General Public License, and the "GNU GPL" refers to version 3 of the GNU
General Public License.

  "The Library" refers to a covered work governed by this License,
other than an Application or a Combined Work as defined below.

  An "Application" is any work that makes use of an interface provided
by the Library, but which is not otherwise based on the Library.
Defining a subclass of a class defined by the Library is deemed a mode
of using an interface provided by the Library.

  A "Combined Work" is a work produced by combining or linking an
Application with the Library.  The particular version of the Library
with which the Combined Work was made is also called the "Linked
Version".

  The "Minimal Corresponding Source" for a Combined Work means the
Corresponding Source for the Combined Work, excluding any source code
for portions of the Combined Work that, considered in isolation, are
based on the Application, and not on the Linked Version.

  The "Corresponding Application Code" for a Combined Work means the
object code and/or source code for the Application, including any data
and utility programs needed for reproducing the Combined Work from the
Application, but excluding the System Libraries of the Combined Work.

  1. Exception to Section 3 of the GNU GPL.

  You may convey a covered work under sections 3 and 4 of this License
without being bound by section 3 of the GNU GPL.

  2. Conveying Modified Versions.

  If you modify a copy of the Library, and, in your modifications, a
facility refers to a function or data to be supplied by an Application
that uses the facility (other than as an argument passed when the
facility is invoked), then you may convey a copy of the modified
version:

   a) under this License, provided that you make a good faith effort to
   ensure that, in the event an Application does not supply the
   function or data, the facility still operates, and performs
   whatever part of its purpose remains meaningful, or

   b) under the GNU GPL, with none of the additional permissions of
   this License applicable to that copy.

  3. Object Code Incorporating Material from Library Header Files.

  The object code form of an Application may incorporate material from
a header file that is part of the Library.  You may convey such object
code under terms of your choice, provided that, if the incorporated
material is not limited to numerical parameters, data structure
layouts and accessors, or small macros, inline functions and templates
(ten or fewer lines in length), you do both of the following:

   a) Give prominent notice with each copy of the object code that the
   Library is used in it and that the Library and its use are
   covered by this License.

   b) Accompany the object code with a copy of the GNU GPL and this license
   document.

  4. Combined Works.

  You may convey a Combined Work under terms of your choice that,
taken together, effectively do not restrict modification of the
portions of the Library contained in the Combined Work and reverse
engineering for debugging such modifications, if you also do each of
the following:

   a) Give prominent notice with each copy of the Combined Work that
   the Library is used in it and that the Library and its use are
   covered by this License.
   b) Accompany the Combined Work with a copy of the GNU GPL and this license
   document.

   c) For a Combined Work that displays copyright notices during
   execution, include the copyright notice for the Library among
   these notices, as well as a reference directing the user to the
   copies of the GNU GPL and this license document.

   d) Do one of the following:

       0) Convey the Minimal Corresponding Source under the terms of this
       License, and the Corresponding Application Code in a form
       suitable for, and under terms that permit, the user to
       recombine or relink the Application with a modified version of
       the Linked Version to produce a modified Combined Work, in the
       manner specified by section 6 of the GNU GPL for conveying
       Corresponding Source.

       1) Use a suitable shared library mechanism for linking with the
       Library.  A suitable mechanism is one that (a) uses at run time
       a copy of the Library already present on the user's computer
       system, and (b) will operate properly with a modified version
       of the Library that is interface-compatible with the Linked
       Version.

   e) Provide Installation Information, but only if you would otherwise
   be required to provide such information under section 6 of the
   GNU GPL, and only to the extent that such information is
   necessary to install and execute a modified version of the
   Combined Work produced by recombining or relinking the
   Application with a modified version of the Linked Version. (If
   you use option 4d0, the Installation Information must accompany
   the Minimal Corresponding Source and Corresponding Application
   Code. If you use option 4d1, you must provide the Installation
   Information in the manner specified by section 6 of the GNU GPL
   for conveying Corresponding Source.)

  5. Combined Libraries.

  You may place library facilities that are a work based on the
Library side by side in a single library together with other library
facilities that are not Applications and are not covered by this
License, and convey such a combined library under terms of your
choice, if you do both of the following:

   a) Accompany the combined library with a copy of the same work based
   on the Library, uncombined with any other library facilities,
   conveyed under the terms of this License.

   b) Give prominent notice with the combined library that part of it
   is a work based on the Library, and explaining where to find the
   accompanying uncombined form of the same work.

  6. Revised Versions of the GNU Lesser General Public License.

  The Free Software Foundation may publish revised and/or new versions
of the GNU Lesser General Public License from time to time. Such new
versions will be similar in spirit to the present version, but may
differ in detail to address new problems or concerns.

  Each version is given a distinguishing version number. If the
Library as you received it specifies that a certain numbered version
of the GNU Lesser General Public License "or any later version"
applies to it, you have the option of following the terms and
conditions either of that published version or of any later version
published by the Free Software Foundation. If the Library as you
received it does not specify a version number of the GNU Lesser
General Public License, you may choose any version of the GNU Lesser
General Public License ever published by the Free Software Foundation.
  If the Library as you received it specifies that a proxy can decide
whether future versions of the GNU Lesser General Public License shall
apply, that proxy's public statement of acceptance of any version is
permanent authorization for you to choose that version for the
Library.

dqlite-1.16.7/Makefile.am
ACLOCAL_AMFLAGS = -I m4

AM_CFLAGS += $(CODE_COVERAGE_CFLAGS)
AM_CFLAGS += $(SQLITE_CFLAGS) $(UV_CFLAGS) $(PTHREAD_CFLAGS)
AM_LDFLAGS = $(UV_LIBS) $(PTHREAD_LIBS)

if DQLITE_NEXT_ENABLED
AM_CFLAGS += -DDQLITE_NEXT
endif

if !BUILD_RAFT_ENABLED
AM_CFLAGS += $(RAFT_CFLAGS) -DUSE_SYSTEM_RAFT
AM_LDFLAGS += $(RAFT_LIBS)
endif

if DEBUG_ENABLED
AM_CFLAGS += -O0
else
AM_CFLAGS += -O2
endif

if SANITIZE_ENABLED
AM_CFLAGS += -fsanitize=address
endif

if BACKTRACE_ENABLED
AM_CFLAGS += -DDQLITE_ASSERT_WITH_BACKTRACE -DRAFT_ASSERT_WITH_BACKTRACE
AM_LDFLAGS += -lbacktrace
endif

include_HEADERS = include/dqlite.h

basic_dqlite_sources = \
        src/bind.c \
        src/client/protocol.c \
        src/command.c \
        src/conn.c \
        src/db.c \
        src/dqlite.c \
        src/error.c \
        src/format.c \
        src/fsm.c \
        src/gateway.c \
        src/id.c \
        src/leader.c \
        src/lib/addr.c \
        src/lib/buffer.c \
        src/lib/fs.c \
        src/lib/sm.c \
        src/lib/threadpool.c \
        src/lib/transport.c \
        src/logger.c \
        src/message.c \
        src/metrics.c \
        src/config.c \
        src/query.c \
        src/registry.c \
        src/request.c \
        src/response.c \
        src/roles.c \
        src/server.c \
        src/stmt.c \
        src/tracing.c \
        src/transport.c \
        src/translate.c \
        src/tuple.c \
        src/vfs.c \
        src/vfs2.c

lib_LTLIBRARIES = libdqlite.la
libdqlite_la_CFLAGS = $(AM_CFLAGS) -fvisibility=hidden -DRAFT_API=''
libdqlite_la_LDFLAGS = $(AM_LDFLAGS) -version-info 0:1:0
libdqlite_la_SOURCES = $(basic_dqlite_sources)

if BUILD_RAFT_ENABLED
libraft_la_SOURCES = \
        src/raft/byte.c \
        src/raft/callbacks.c \
        src/raft/client.c \
        src/raft/compress.c \
        src/raft/configuration.c \
        src/raft/convert.c \
        src/raft/election.c \
        src/raft/entry.c \
        src/raft/err.c \
        src/raft/fixture.c \
        src/raft/flags.c \
        src/raft/heap.c \
        src/raft/lifecycle.c \
        src/raft/log.c \
        src/raft/membership.c \
        src/raft/progress.c \
        src/raft/raft.c \
        src/raft/recv.c \
        src/raft/recv_append_entries.c \
        src/raft/recv_append_entries_result.c \
        src/raft/recv_request_vote.c \
        src/raft/recv_request_vote_result.c \
        src/raft/recv_install_snapshot.c \
        src/raft/recv_timeout_now.c \
        src/raft/replication.c \
        src/raft/snapshot.c \
        src/raft/start.c \
        src/raft/state.c \
        src/raft/syscall.c \
        src/raft/tick.c \
        src/raft/uv.c \
        src/raft/uv_append.c \
        src/raft/uv_encoding.c \
        src/raft/uv_finalize.c \
        src/raft/uv_fs.c \
        src/raft/uv_ip.c \
        src/raft/uv_list.c \
        src/raft/uv_metadata.c \
        src/raft/uv_os.c \
        src/raft/uv_prepare.c \
        src/raft/uv_recv.c \
        src/raft/uv_segment.c \
        src/raft/uv_send.c \
        src/raft/uv_snapshot.c \
        src/raft/uv_tcp.c \
        src/raft/uv_tcp_listen.c \
        src/raft/uv_tcp_connect.c \
        src/raft/uv_truncate.c \
        src/raft/uv_work.c \
        src/raft/uv_writer.c
libdqlite_la_SOURCES += $(libraft_la_SOURCES)
endif # BUILD_RAFT_ENABLED

check_PROGRAMS = unit-test integration-test
check_LTLIBRARIES = libtest.la

libtest_la_CFLAGS = $(AM_CFLAGS) -DMUNIT_TEST_NAME_LEN=60 -Wno-unknown-warning-option -Wno-unused-result -Wno-conversion -Wno-uninitialized -Wno-maybe-uninitialized -Wno-strict-prototypes -Wno-old-style-definition
libtest_la_SOURCES = \
        test/lib/endpoint.c \
        test/lib/fault.c \
        test/lib/fs.c \
        test/lib/heap.c \
        test/lib/logger.c \
        test/lib/munit.c \
        test/lib/raft_heap.c \
        test/lib/server.c \
        test/lib/sqlite.c \
        test/lib/uv.c

unit_test_SOURCES = $(basic_dqlite_sources)
unit_test_SOURCES += \
        test/test_error.c \
        test/test_integration.c \
        test/unit/ext/test_uv.c \
        test/unit/ext/test_uv_pool.c \
        test/unit/lib/test_addr.c \
        test/unit/lib/test_buffer.c \
        test/unit/lib/test_byte.c \
        test/unit/lib/test_registry.c \
        test/unit/lib/test_serialize.c \
        test/unit/lib/test_transport.c \
        test/unit/test_command.c \
        test/unit/test_conn.c \
        test/unit/test_gateway.c \
        test/unit/test_concurrency.c \
        test/unit/test_registry.c \
        test/unit/test_replication.c \
        test/unit/test_request.c \
        test/unit/test_role_management.c \
        test/unit/test_sm.c \
        test/unit/test_tuple.c \
        test/unit/test_vfs.c \
        test/unit/test_vfs2.c \
        test/unit/main.c
unit_test_CFLAGS = $(AM_CFLAGS) -Wno-unknown-warning-option -Wno-uninitialized -Wno-maybe-uninitialized -Wno-float-equal -Wno-conversion
unit_test_LDFLAGS = $(AM_LDFLAGS)
unit_test_LDADD = libtest.la
if BUILD_RAFT_ENABLED
unit_test_LDADD += libraft.la
endif

integration_test_SOURCES = \
        test/integration/test_client.c \
        test/integration/test_cluster.c \
        test/integration/test_fsm.c \
        test/integration/test_membership.c \
        test/integration/test_node.c \
        test/integration/test_role_management.c \
        test/integration/test_server.c \
        test/integration/test_vfs.c \
        test/integration/main.c
integration_test_CFLAGS = $(AM_CFLAGS) -Wno-conversion
integration_test_LDFLAGS = $(AM_LDFLAGS) -no-install
integration_test_LDADD = libtest.la libdqlite.la

if BUILD_RAFT_ENABLED
check_LTLIBRARIES += libraft.la
check_PROGRAMS += \
        raft-core-unit-test \
        raft-core-integration-test \
        raft-uv-unit-test \
        raft-uv-integration-test \
        raft-core-fuzzy-test

libtest_la_SOURCES += \
        test/raft/lib/addrinfo.c \
        test/raft/lib/fault.c \
        test/raft/lib/fsm.c \
        test/raft/lib/heap.c \
        test/raft/lib/munit.c \
        test/raft/lib/tcp.c \
        test/raft/lib/cluster.c \
        test/raft/lib/aio.c \
        test/raft/lib/dir.c \
        test/raft/lib/tcp.c \
        test/raft/lib/loop.c

libraft_la_CFLAGS = $(AM_CFLAGS)
libraft_la_LDFLAGS = $(UV_LIBS)

raft_core_unit_test_SOURCES = \
        $(libraft_la_SOURCES) \
        src/lib/sm.c \
        src/tracing.c \
        test/raft/unit/main_core.c \
        test/raft/unit/test_byte.c \
        test/raft/unit/test_compress.c \
        test/raft/unit/test_configuration.c \
        test/raft/unit/test_err.c \
        test/raft/unit/test_flags.c \
        test/raft/unit/test_log.c \
        test/raft/unit/test_queue.c \
        test/raft/unit/test_snapshot.c
raft_core_unit_test_CFLAGS = $(AM_CFLAGS) -Wno-conversion
raft_core_unit_test_LDADD = libtest.la

raft_core_integration_test_SOURCES = \
        src/tracing.c \
        src/lib/sm.c \
        test/raft/integration/main_core.c \
        test/raft/integration/test_apply.c \
        test/raft/integration/test_assign.c \
        test/raft/integration/test_barrier.c \
        test/raft/integration/test_bootstrap.c \
        test/raft/integration/test_digest.c \
        test/raft/integration/test_election.c \
        test/raft/integration/test_fixture.c \
        test/raft/integration/test_heap.c \
        test/raft/integration/test_init.c \
        test/raft/integration/test_membership.c \
        test/raft/integration/test_recover.c \
        test/raft/integration/test_replication.c \
        test/raft/integration/test_snapshot.c \
        test/raft/integration/test_start.c \
        test/raft/integration/test_strerror.c \
        test/raft/integration/test_tick.c \
        test/raft/integration/test_transfer.c \
        test/raft/integration/test_voter_contacts.c
raft_core_integration_test_CFLAGS = $(AM_CFLAGS) -Wno-conversion
raft_core_integration_test_LDFLAGS = -no-install
raft_core_integration_test_LDADD = libtest.la libraft.la

raft_core_fuzzy_test_SOURCES = \
        src/lib/sm.c \
        src/tracing.c \
        test/raft/fuzzy/main_core.c \
        test/raft/fuzzy/test_election.c \
        test/raft/fuzzy/test_liveness.c \
        test/raft/fuzzy/test_membership.c \
        test/raft/fuzzy/test_replication.c
raft_core_fuzzy_test_CFLAGS = $(AM_CFLAGS) -Wno-conversion
raft_core_fuzzy_test_LDFLAGS = -no-install
raft_core_fuzzy_test_LDADD = libtest.la libraft.la

raft_uv_unit_test_SOURCES = \
        src/tracing.c \
        src/raft/err.c \
        src/raft/heap.c \
        src/raft/syscall.c \
        src/raft/uv_fs.c \
        src/raft/uv_os.c \
        src/raft/uv_writer.c \
        test/raft/unit/main_uv.c \
        test/raft/unit/test_uv_fs.c \
        test/raft/unit/test_uv_os.c \
        test/raft/unit/test_uv_writer.c
raft_uv_unit_test_CFLAGS = $(AM_CFLAGS) -Wno-conversion
raft_uv_unit_test_LDADD = libtest.la $(UV_LIBS)

# The integration/uv test is not linked to libraft, but built
# directly against the libraft sources in order to test some
# non-visible, non-API functions.
raft_uv_integration_test_SOURCES = \
        $(libraft_la_SOURCES) \
        src/tracing.c \
        src/lib/sm.c \
        test/raft/integration/main_uv.c \
        test/raft/integration/test_uv_init.c \
        test/raft/integration/test_uv_append.c \
        test/raft/integration/test_uv_bootstrap.c \
        test/raft/integration/test_uv_load.c \
        test/raft/integration/test_uv_recover.c \
        test/raft/integration/test_uv_recv.c \
        test/raft/integration/test_uv_send.c \
        test/raft/integration/test_uv_set_term.c \
        test/raft/integration/test_uv_tcp_connect.c \
        test/raft/integration/test_uv_tcp_listen.c \
        test/raft/integration/test_uv_snapshot_put.c \
        test/raft/integration/test_uv_truncate.c \
        test/raft/integration/test_uv_truncate_snapshot.c \
        test/raft/integration/test_uv_work.c
raft_uv_integration_test_CFLAGS = $(AM_CFLAGS) -Wno-type-limits -Wno-conversion
raft_uv_integration_test_LDFLAGS = -no-install
raft_uv_integration_test_LDADD = libtest.la $(UV_LIBS)

if LZ4_AVAILABLE
libdqlite_la_CFLAGS += -DLZ4_AVAILABLE $(LZ4_CFLAGS)
libdqlite_la_LDFLAGS += $(LZ4_LIBS)
raft_core_unit_test_CFLAGS += -DLZ4_AVAILABLE $(LZ4_CFLAGS)
raft_core_unit_test_LDFLAGS = $(LZ4_LIBS)
libraft_la_CFLAGS += -DLZ4_AVAILABLE $(LZ4_CFLAGS)
libraft_la_LDFLAGS += $(LZ4_LIBS)
raft_uv_integration_test_CFLAGS += -DLZ4_AVAILABLE
raft_uv_integration_test_LDFLAGS += $(LZ4_LIBS)
endif

if LZ4_ENABLED
libdqlite_la_CFLAGS += -DLZ4_ENABLED
raft_uv_integration_test_CFLAGS += -DLZ4_ENABLED
raft_core_unit_test_CFLAGS += -DLZ4_ENABLED
libraft_la_CFLAGS += -DLZ4_ENABLED
endif

endif # BUILD_RAFT_ENABLED

if BUILD_SQLITE_ENABLED
noinst_LTLIBRARIES = libsqlite3.la
libsqlite3_la_SOURCES = sqlite3.c
libsqlite3_la_CFLAGS = -g3
unit_test_LDADD += libsqlite3.la
libdqlite_la_LIBADD = libsqlite3.la
else
AM_LDFLAGS += $(SQLITE_LIBS)
endif

TESTS = $(check_PROGRAMS)

if CODE_COVERAGE_ENABLED
include $(top_srcdir)/aminclude_static.am
CODE_COVERAGE_DIRECTORY=./src
CODE_COVERAGE_OUTPUT_DIRECTORY=coverage
CODE_COVERAGE_OUTPUT_FILE=coverage.info
CODE_COVERAGE_IGNORE_PATTERN="/usr/include/*"
CODE_COVERAGE_BRANCH_COVERAGE=1
CODE_COVERAGE_LCOV_OPTIONS=$(CODE_COVERAGE_LCOV_OPTIONS_DEFAULT) --rc lcov_excl_br_line="assert\("
clean-local: code-coverage-clean
distclean-local: code-coverage-dist-clean
endif # CODE_COVERAGE_ENABLED

pkgconfigdir = $(libdir)/pkgconfig
pkgconfig_DATA = dqlite.pc

dqlite-1.16.7/README.md
dqlite [![CI Tests](https://github.com/canonical/dqlite/actions/workflows/build-and-test.yml/badge.svg)](https://github.com/canonical/dqlite/actions/workflows/build-and-test.yml) [![codecov](https://codecov.io/gh/canonical/dqlite/branch/master/graph/badge.svg)](https://codecov.io/gh/canonical/dqlite)
======

[English](./README.md)|[简体中文](./README_CH.md)
[dqlite](https://dqlite.io) is a C library that implements an embeddable and
replicated SQL database engine with high availability and automatic failover.

The acronym "dqlite" stands for "distributed SQLite", meaning that dqlite
extends [SQLite](https://sqlite.org/) with a network protocol that can connect
together various instances of your application and have them act as a
highly-available cluster, with no dependency on external databases.

Design highlights
----------------

* Asynchronous single-threaded implementation using [libuv](https://libuv.org/)
  as event loop.
* Custom wire protocol optimized for SQLite primitives and data types.
* Data replication based on the [Raft](https://raft.github.io/) algorithm.

License
-------

The dqlite library is released under a slightly modified version of LGPLv3,
which includes a copyright exception allowing users to statically link the
library code in their project and release the final work under their own
terms. See the full
[license](https://github.com/canonical/dqlite/blob/master/LICENSE) text.

Compatibility
-------------

dqlite runs on Linux and requires a kernel with support for [native async
I/O](https://man7.org/linux/man-pages/man2/io_setup.2.html) (not to be
confused with [POSIX AIO](https://man7.org/linux/man-pages/man7/aio.7.html)).
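If you want to verify that requirement on a given machine, the following is a
minimal sketch (not shipped with dqlite) that probes `io_setup(2)` directly;
the file name `aio_check.c` is just an example:

```c
/* aio_check.c: probe for kernel-native async I/O (io_setup(2)),
 * the facility dqlite's libuv-based raft backend relies on.
 * Build with: cc aio_check.c -o aio_check */
#include <linux/aio_abi.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	aio_context_t ctx = 0;
	/* Ask the kernel for a tiny AIO context; failure means native
	 * async I/O is not usable on this system. */
	if (syscall(SYS_io_setup, 1, &ctx) != 0) {
		perror("io_setup");
		return 1;
	}
	syscall(SYS_io_destroy, ctx);
	printf("native async I/O available\n");
	return 0;
}
```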
Try it
-------

The simplest way to see dqlite in action is to use the demo program that comes
with the Go dqlite bindings. Please see the [relevant
documentation](https://github.com/canonical/go-dqlite#demo) in that project.

Media
-----

A talk about dqlite was given at FOSDEM 2020, you can watch it
[here](https://fosdem.org/2020/schedule/event/dqlite/).

[Here](https://gcore.com/blog/comparing-litestream-rqlite-dqlite/) is a blog
post from 2022 comparing dqlite with rqlite and Litestream, other replication
software for SQLite.

Wire protocol
-------------

If you wish to write a client, please refer to the [wire
protocol](https://dqlite.io/docs/protocol) documentation.

Install
-------

If you are on a Debian-based system, you can get the latest development
release from dqlite's [dev
PPA](https://launchpad.net/~dqlite/+archive/ubuntu/dev):

```
sudo add-apt-repository ppa:dqlite/dev
sudo apt update
sudo apt install libdqlite-dev
```

Contributing
------------

See [CONTRIBUTING.md](./CONTRIBUTING.md).

Build
-----

To build libdqlite from source you'll need:

* Build dependencies: pkg-config and GNU Autoconf, Automake, libtool, and make
* A reasonably recent version of [libuv](https://libuv.org/) (v1.34.0 or
  later, matching the version check in `configure.ac`), with headers.
* A reasonably recent version of [SQLite](https://sqlite.org/) (v3.22.0 or
  later), with headers.
* Optionally, a reasonably recent version of [LZ4](https://lz4.org/) (v1.7.1
  or later), with headers.

Your distribution should already provide you with these dependencies. For
example, on Debian-based distros:

```
sudo apt install pkg-config autoconf automake libtool make libuv1-dev libsqlite3-dev liblz4-dev
```

With these dependencies installed, you can build and install the dqlite shared
library and headers as follows:

```
$ autoreconf -i
$ ./configure --enable-build-raft
$ make
$ sudo make install
```

The default installation prefix is `/usr/local`; you may need to run

```
$ sudo ldconfig
```

to enable the linker to find `libdqlite.so`. To install to a different prefix,
replace the configure step with something like

```
$ ./configure --enable-build-raft --prefix=/usr
```

The `--enable-build-raft` option causes dqlite to use its bundled Raft
implementation instead of linking to an external libraft; the latter is a
legacy configuration that should not be used for new development.

Usage Notes
-----------

Detailed tracing is enabled when the environment variable `LIBDQLITE_TRACE` is
set before startup. Its value can be in the `[0..5]` range and represents a
tracing level, where `0` means no traces are emitted, `5` enables minimum
verbosity (FATAL records only), and `1` enables maximum verbosity (all
records: DEBUG, INFO, WARN, ERROR, FATAL).
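For orientation, here is a minimal sketch of the embedding flow using the
experimental `dqlite_server` API declared in `include/dqlite.h`. The data
directory `/tmp/dqlite-demo` (which must already exist) and the address
`127.0.0.1:9001` are arbitrary example values, and real code would handle each
error individually instead of calling `abort()`:

```c
/* demo.c: a minimal single-node sketch of the experimental dqlite_server
 * API. Build with: cc demo.c -o demo -ldqlite */
#include <dqlite.h>
#include <stdbool.h>
#include <stdlib.h>

int main(void)
{
	dqlite_server *server;

	/* The data directory must exist before this call. */
	if (dqlite_server_create("/tmp/dqlite-demo", &server) != 0) {
		abort();
	}
	/* First node of a new cluster: give it an address and let it
	 * bootstrap itself; all functions return zero on success. */
	if (dqlite_server_set_address(server, "127.0.0.1:9001") != 0 ||
	    dqlite_server_set_auto_bootstrap(server, true) != 0 ||
	    dqlite_server_start(server) != 0) {
		dqlite_server_destroy(server);
		abort();
	}

	/* ... serve application traffic here ... */

	dqlite_server_handover(server);
	dqlite_server_stop(server);
	dqlite_server_destroy(server);
	return 0;
}
```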
dqlite-1.16.7/README_CH.md
# dqlite

[![CI Tests](https://github.com/canonical/dqlite/actions/workflows/build-and-test.yml/badge.svg)](https://github.com/canonical/dqlite/actions/workflows/build-and-test.yml) [![codecov](https://codecov.io/gh/canonical/dqlite/branch/master/graph/badge.svg)](https://codecov.io/gh/canonical/dqlite)

**Note**: this translated document may lag behind; please treat the latest English [readme](./README.md) as authoritative.

[dqlite](https://dqlite.io) is an embeddable database engine written in C with streaming replication, high availability and automatic failover.

"dqlite" is short for "distributed SQLite": dqlite extends SQLite with a network protocol that connects the instances of your application together into a highly available cluster, with no dependency on external databases.

## Design highlights

- Asynchronous single-threaded event loop implemented with [libuv](https://libuv.org/)
- A custom network protocol optimized for SQLite primitive data types
- Data replication based on the [Raft](https://raft.github.io/) algorithm and its efficient [C-raft](https://github.com/canonical/raft) implementation

## License

The dqlite library is released under a slightly modified version of LGPLv3, including a copyright exception that allows users to statically link the library's code into their project and release the final work under their own terms. See the full [license](https://github.com/canonical/dqlite/blob/master/LICENSE) file for details.

## Compatibility

dqlite runs on Linux and, because of the libuv backend of C-raft, requires a kernel with support for [native async I/O](https://man7.org/linux/man-pages/man2/io_setup.2.html) (not to be confused with [POSIX AIO](https://man7.org/linux/man-pages/man7/aio.7.html)).

## Try it

The simplest way to see and learn dqlite in action is the demo program that comes with the Go dqlite bindings; see the [relevant documentation](https://github.com/canonical/go-dqlite#demo) of that project.

## Media

A talk about dqlite was given at FOSDEM 2020; you can watch it [here](https://fosdem.org/2020/schedule/event/dqlite/).

## Wire protocol

If you want to write a client, please refer to the [wire protocol](https://dqlite.io/docs/protocol) documentation.

## Install

If you are on a Debian-based system, you can get the latest development release from dqlite's [dev PPA](https://launchpad.net/~dqlite/+archive/ubuntu/dev):

```bash
sudo add-apt-repository ppa:dqlite/dev
sudo apt-get update
sudo apt-get install libdqlite-dev
```

## Build from source

To build libdqlite you need:

- a reasonably recent version of libuv (v1.18.0 or later)
- a reasonably recent version of sqlite3-dev
- a built [C-raft](https://github.com/canonical/raft) library

Your Linux distribution should already provide prebuilt libuv shared libraries and libsqlite3-dev, in which case those need no extra download; otherwise install these two dependencies as well.

On Debian-based Linux distributions you can install the build dependencies with:

```
sudo apt install autoconf libuv1-dev liblz4-dev libtool pkg-config build-essential libsqlite3-dev
```

To build the raft library, run:

```bash
git clone https://github.com/canonical/raft.git
cd raft
autoreconf -i
./configure
make
sudo make install
cd ..
```

Once all the dependent libraries are in place, build the dqlite library manually with:

```bash
autoreconf -i
./configure
make
sudo make install
```

## Usage notes

Detailed tracing is enabled when the environment variable LIBRAFT_TRACE is set at startup.

dqlite-1.16.7/VERSION
0.1.0

dqlite-1.16.7/ac/.gitignore
*
!.gitignore

dqlite-1.16.7/bt/request
#!/bin/sh
set -o errexit

libraft_path="${LIBRAFT_SO_PATH:-/usr/local/lib/libraft.so.2}"

exec bpftrace -I resources -I include $@ - <<EOF
#include <raft.h>

struct request {
	void *data;
	int type;
	raft_index index;
	queue queue;
};

uprobe:$libraft_path:lifecycleRequestStart
{
	\$req = (struct request *)arg1;
	@start_request[\$req->data, \$req->type, \$req->index] = nsecs;
}

uprobe:$libraft_path:lifecycleRequestEnd
{
	\$req = (struct request *)arg1;
	\$start = @start_request[\$req->data, \$req->type, \$req->index];
	\$end = nsecs;
	@full[\$req->data, \$req->type, \$req->index] = (\$start, \$end);
	\$elapsed_msecs = (\$end - \$start) / 1000;
	@hist = lhist(\$elapsed_msecs, 100, 1000, 10);
	delete(@start_request[\$req->data, \$req->type, \$req->index]);
}
EOF

dqlite-1.16.7/clang-tidy-diff.py
#!/usr/bin/env python3
#
# ===- clang-tidy-diff.py - ClangTidy Diff Checker -----------*- python -*--===#
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===-----------------------------------------------------------------------===#

r"""
ClangTidy Diff Checker
======================

This script reads input from a unified diff, runs clang-tidy on all changed
files and outputs clang-tidy warnings in changed lines only. This is useful to
detect clang-tidy regressions in the lines touched by a specific patch.
Example usage for git/svn users:

  git diff -U0 HEAD^ | clang-tidy-diff.py -p1
  svn diff --diff-cmd=diff -x-U0 | \
      clang-tidy-diff.py -fix -checks=-*,modernize-use-override
"""

import argparse
import glob
import json
import multiprocessing
import os
import re
import shutil
import subprocess
import sys
import tempfile
import threading
import traceback

try:
    import yaml
except ImportError:
    yaml = None

is_py2 = sys.version[0] == "2"

if is_py2:
    import Queue as queue
else:
    import queue as queue


def run_tidy(task_queue, lock, timeout, failed_files):
    watchdog = None
    while True:
        command = task_queue.get()
        try:
            proc = subprocess.Popen(
                command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
            )

            if timeout is not None:
                watchdog = threading.Timer(timeout, proc.kill)
                watchdog.start()

            stdout, stderr = proc.communicate()
            if proc.returncode != 0:
                if proc.returncode < 0:
                    msg = "Terminated by signal %d : %s\n" % (
                        -proc.returncode,
                        " ".join(command),
                    )
                    stderr += msg.encode("utf-8")
                failed_files.append(command)

            with lock:
                sys.stdout.write(stdout.decode("utf-8") + "\n")
                sys.stdout.flush()
                if stderr:
                    sys.stderr.write(stderr.decode("utf-8") + "\n")
                    sys.stderr.flush()
        except Exception as e:
            with lock:
                sys.stderr.write("Failed: " + str(e) + ": ".join(command) + "\n")
        finally:
            with lock:
                if not (timeout is None or watchdog is None):
                    if not watchdog.is_alive():
                        sys.stderr.write(
                            "Terminated by timeout: " + " ".join(command) + "\n"
                        )
                    watchdog.cancel()
            task_queue.task_done()


def start_workers(max_tasks, tidy_caller, arguments):
    for _ in range(max_tasks):
        t = threading.Thread(target=tidy_caller, args=arguments)
        t.daemon = True
        t.start()


def merge_replacement_files(tmpdir, mergefile):
    """Merge all replacement files in a directory into a single file"""
    # The fixes suggested by clang-tidy >= 4.0.0 are given under
    # the top level key 'Diagnostics' in the output yaml files
    mergekey = "Diagnostics"
    merged = []
    for replacefile in glob.iglob(os.path.join(tmpdir, "*.yaml")):
        content = yaml.safe_load(open(replacefile, "r"))
        if not content:
            continue  # Skip empty files.
        merged.extend(content.get(mergekey, []))

    if merged:
        # MainSourceFile: The key is required by the definition inside
        # include/clang/Tooling/ReplacementsYaml.h, but the value
        # is actually never used inside clang-apply-replacements,
        # so we set it to '' here.
        output = {"MainSourceFile": "", mergekey: merged}
        with open(mergefile, "w") as out:
            yaml.safe_dump(output, out)
    else:
        # Empty the file:
        open(mergefile, "w").close()


def main():
    parser = argparse.ArgumentParser(
        description="Run clang-tidy against changed files, and "
        "output diagnostics only for modified "
        "lines."
    )
    parser.add_argument(
        "-clang-tidy-binary",
        metavar="PATH",
        default="clang-tidy",
        help="path to clang-tidy binary",
    )
    parser.add_argument(
        "-p",
        metavar="NUM",
        default=0,
        help="strip the smallest prefix containing P slashes",
    )
    parser.add_argument(
        "-regex",
        metavar="PATTERN",
        default=None,
        help="custom pattern selecting file paths to check "
        "(case sensitive, overrides -iregex)",
    )
    parser.add_argument(
        "-iregex",
        metavar="PATTERN",
        default=r".*\.(cpp|cc|c\+\+|cxx|c|cl|h|hpp|m|mm|inc)",
        help="custom pattern selecting file paths to check "
        "(case insensitive, overridden by -regex)",
    )
    parser.add_argument(
        "-j",
        type=int,
        default=1,
        help="number of tidy instances to be run in parallel.",
    )
    parser.add_argument(
        "-timeout", type=int, default=None, help="timeout per each file in seconds."
    )
    parser.add_argument(
        "-fix", action="store_true", default=False, help="apply suggested fixes"
    )
    parser.add_argument(
        "-checks",
        help="checks filter, when not specified, use clang-tidy " "default",
        default="",
    )
    parser.add_argument(
        "-config-file",
        dest="config_file",
        help="Specify the path of .clang-tidy or custom config file",
        default="",
    )
    parser.add_argument("-use-color", action="store_true", help="Use colors in output")
    parser.add_argument(
        "-path", dest="build_path", help="Path used to read a compile command database."
    )
    if yaml:
        parser.add_argument(
            "-export-fixes",
            metavar="FILE_OR_DIRECTORY",
            dest="export_fixes",
            help="A directory or a yaml file to store suggested fixes in, "
            "which can be applied with clang-apply-replacements. If the "
            "parameter is a directory, the fixes of each compilation unit are "
            "stored in individual yaml files in the directory.",
        )
    else:
        parser.add_argument(
            "-export-fixes",
            metavar="DIRECTORY",
            dest="export_fixes",
            help="A directory to store suggested fixes in, which can be applied "
            "with clang-apply-replacements. The fixes of each compilation unit are "
            "stored in individual yaml files in the directory.",
        )
    parser.add_argument(
        "-extra-arg",
        dest="extra_arg",
        action="append",
        default=[],
        help="Additional argument to append to the compiler " "command line.",
    )
    parser.add_argument(
        "-extra-arg-before",
        dest="extra_arg_before",
        action="append",
        default=[],
        help="Additional argument to prepend to the compiler " "command line.",
    )
    parser.add_argument(
        "-quiet",
        action="store_true",
        default=False,
        help="Run clang-tidy in quiet mode",
    )
    parser.add_argument(
        "-load",
        dest="plugins",
        action="append",
        default=[],
        help="Load the specified plugin in clang-tidy.",
    )

    clang_tidy_args = []
    argv = sys.argv[1:]
    if "--" in argv:
        clang_tidy_args.extend(argv[argv.index("--") :])
        argv = argv[: argv.index("--")]

    args = parser.parse_args(argv)

    # Extract changed lines for each file.
    filename = None
    lines_by_file = {}
    for line in sys.stdin:
        match = re.search('^\+\+\+\ "?(.*?/){%s}([^ \t\n"]*)' % args.p, line)
        if match:
            filename = match.group(2)
        if filename is None:
            continue

        if args.regex is not None:
            if not re.match("^%s$" % args.regex, filename):
                continue
        else:
            if not re.match("^%s$" % args.iregex, filename, re.IGNORECASE):
                continue

        match = re.search("^@@.*\+(\d+)(,(\d+))?", line)
        if match:
            start_line = int(match.group(1))
            line_count = 1
            if match.group(3):
                line_count = int(match.group(3))
            if line_count == 0:
                continue
            end_line = start_line + line_count - 1
            lines_by_file.setdefault(filename, []).append([start_line, end_line])

    if not any(lines_by_file):
        print("No relevant changes found.")
        sys.exit(0)

    max_task_count = args.j
    if max_task_count == 0:
        max_task_count = multiprocessing.cpu_count()
    max_task_count = min(len(lines_by_file), max_task_count)

    combine_fixes = False
    export_fixes_dir = None
    delete_fixes_dir = False
    if args.export_fixes is not None:
        # if a directory is given, create it if it does not exist
        if args.export_fixes.endswith(os.path.sep) and not os.path.isdir(
            args.export_fixes
        ):
            os.makedirs(args.export_fixes)

        if not os.path.isdir(args.export_fixes):
            if not yaml:
                raise RuntimeError(
                    "Cannot combine fixes in one yaml file. Either install PyYAML or specify an output directory."
                )

            combine_fixes = True

        if os.path.isdir(args.export_fixes):
            export_fixes_dir = args.export_fixes

    if combine_fixes:
        export_fixes_dir = tempfile.mkdtemp()
        delete_fixes_dir = True

    # Tasks for clang-tidy.
    task_queue = queue.Queue(max_task_count)
    # A lock for console output.
    lock = threading.Lock()

    # List of files with a non-zero return code.
    failed_files = []

    # Run a pool of clang-tidy workers.
    start_workers(
        max_task_count, run_tidy, (task_queue, lock, args.timeout, failed_files)
    )

    # Form the common args list.
    common_clang_tidy_args = []
    if args.fix:
        common_clang_tidy_args.append("-fix")
    if args.checks != "":
        common_clang_tidy_args.append("-checks=" + args.checks)
    if args.config_file != "":
        common_clang_tidy_args.append("-config-file=" + args.config_file)
    if args.quiet:
        common_clang_tidy_args.append("-quiet")
    if args.build_path is not None:
        common_clang_tidy_args.append("-p=%s" % args.build_path)
    if args.use_color:
        common_clang_tidy_args.append("--use-color")
    for arg in args.extra_arg:
        common_clang_tidy_args.append("-extra-arg=%s" % arg)
    for arg in args.extra_arg_before:
        common_clang_tidy_args.append("-extra-arg-before=%s" % arg)
    for plugin in args.plugins:
        common_clang_tidy_args.append("-load=%s" % plugin)

    for name in lines_by_file:
        line_filter_json = json.dumps(
            [{"name": name, "lines": lines_by_file[name]}], separators=(",", ":")
        )

        # Run clang-tidy on files containing changes.
        command = [args.clang_tidy_binary]
        command.append("-line-filter=" + line_filter_json)
        if args.export_fixes is not None:
            # Get a temporary file. We immediately close the handle so clang-tidy can
            # overwrite it.
            (handle, tmp_name) = tempfile.mkstemp(suffix=".yaml", dir=export_fixes_dir)
            os.close(handle)
            command.append("-export-fixes=" + tmp_name)
        command.extend(common_clang_tidy_args)
        command.append(name)
        command.extend(clang_tidy_args)

        task_queue.put(command)

    # Application return code
    return_code = 0

    # Wait for all threads to be done.
    task_queue.join()
    # Application return code
    return_code = 0
    if failed_files:
        return_code = 1

    if combine_fixes:
        print("Writing fixes to " + args.export_fixes + " ...")
        try:
            merge_replacement_files(export_fixes_dir, args.export_fixes)
        except:
            sys.stderr.write("Error exporting fixes.\n")
            traceback.print_exc()
            return_code = 1

    if delete_fixes_dir:
        shutil.rmtree(export_fixes_dir)
    sys.exit(return_code)


if __name__ == "__main__":
    main()

dqlite-1.16.7/configure.ac
AC_PREREQ(2.60)
AC_INIT([libdqlite], [1.16.7], [https://github.com/canonical/dqlite])
AC_CONFIG_MACRO_DIR([m4])
AC_CONFIG_AUX_DIR([ac])
AM_INIT_AUTOMAKE([subdir-objects -Wall -Werror -Wno-portability foreign])
AM_SILENT_RULES([yes])

# Without this line, AC_PROG_CC boneheadedly adds `-g -O2` to our CFLAGS.
AC_SUBST(CFLAGS, "")
AC_PROG_CC
AC_USE_SYSTEM_EXTENSIONS
AX_PTHREAD

LT_INIT

# TODO: eventually enable this
# AX_CHECK_COMPILE_FLAG([-Weverything], AM_CFLAGS+=" -Weverything")

# Whether to enable debugging code.
AC_ARG_ENABLE(debug, AS_HELP_STRING([--enable-debug[=ARG]], [enable debugging [default=no]]))
AM_CONDITIONAL(DEBUG_ENABLED, test "x$enable_debug" = "xyes")

# Whether to enable memory sanitizer.
AC_ARG_ENABLE(sanitize, AS_HELP_STRING([--enable-sanitize[=ARG]], [enable code sanitizers [default=no]]))
AM_CONDITIONAL(SANITIZE_ENABLED, test x"$enable_sanitize" = x"yes")
AM_COND_IF(SANITIZE_ENABLED,
  AX_CHECK_COMPILE_FLAG([-fsanitize=address],
   [true],
   [AC_MSG_ERROR([address sanitizer not supported])]))

AC_ARG_ENABLE(backtrace, AS_HELP_STRING([--enable-backtrace[=ARG]], [print backtrace on assertion failure [default=no]]))
AM_CONDITIONAL(BACKTRACE_ENABLED, test "x$enable_backtrace" = "xyes")

AC_ARG_ENABLE(build-sqlite, AS_HELP_STRING([--enable-build-sqlite[=ARG]], [build libsqlite3 from sqlite3.c in the build root [default=no]]))
AM_CONDITIONAL(BUILD_SQLITE_ENABLED, test "x$enable_build_sqlite" = "xyes")

AC_ARG_ENABLE(build-raft, AS_HELP_STRING([--enable-build-raft[=ARG]], [use the bundled raft sources instead of linking to libraft [default=no]]))
AM_CONDITIONAL(BUILD_RAFT_ENABLED, test "x$enable_build_raft" = "xyes")

AC_ARG_ENABLE(dqlite-next, AS_HELP_STRING([--enable-dqlite-next[=ARG]], [build with the experimental dqlite backend [default=no]]))
AM_CONDITIONAL(DQLITE_NEXT_ENABLED, test "x$enable_dqlite_next" = "xyes")
AS_IF([test "x$enable_build_raft" != "xyes" -a "x$enable_dqlite_next" = "xyes"],
      [AC_MSG_ERROR([dqlite-next requires bundled raft])],
      [])

# Whether to enable code coverage.
AX_CODE_COVERAGE

# Checks for header files.
AC_CHECK_HEADERS([linux/io_uring.h linux/aio_abi.h])

# Checks for library functions and definitions.
AC_CHECK_DECLS(RWF_NOWAIT, [], [AC_MSG_ERROR(Linux kernel >= 4.14 required.)], [#include <linux/fs.h>])

# Enable large file support. This is mandatory in order to interoperate with
# libuv, which enables large file support by default, making the size of 'off_t'
# on 32-bit architecture be 8 bytes instead of the normal 4.
AC_SYS_LARGEFILE

# Checks for libraries
PKG_CHECK_MODULES(SQLITE, [sqlite3 >= 3.22.0], [], [])
PKG_CHECK_MODULES(UV, [libuv >= 1.34.0], [], [])
AS_IF([test "x$enable_build_raft" != "xyes"],
      [PKG_CHECK_MODULES(RAFT, [raft >= 0.18.1], [], [])],
      [])

# Allow not linking to liblz4 even if it's present.
AC_ARG_WITH([lz4], AS_HELP_STRING([--without-lz4], [never link to liblz4]))
AS_IF([test "x$enable_build_raft" = "xyes"],
      # Building raft
      [AS_IF([test "x$with_lz4" != "xno"],
             [PKG_CHECK_MODULES(LZ4, [liblz4 >= 1.7.1], [have_lz4=yes], [have_lz4=no])],
             [have_lz4=no])
       AS_IF([test "x$with_lz4" != "xno" -a "x$have_lz4" = "xno"],
             [AC_MSG_ERROR([liblz4 required but not found])],
             [])],
      # Not building raft
      [AS_IF([test "x$with_lz4" = "xyes"],
             [AC_MSG_ERROR([linking lz4 doesn't make sense unless building raft])],
             [])
       have_lz4=no])
AM_CONDITIONAL(LZ4_AVAILABLE, test "x$have_lz4" = "xyes")

AC_ARG_ENABLE(lz4, AS_HELP_STRING([--disable-lz4], [when building with lz4, do not compress snapshots by default]))
AS_IF([test "x$enable_lz4" != "x" -a "x$have_lz4" = "xno"],
      [AC_MSG_ERROR([snapshot compression (either by default or not) requires liblz4])],
      [])
AM_CONDITIONAL(LZ4_ENABLED, test "x$enable_lz4" != "xno" -a "x$have_lz4" = "xyes")

CC_CHECK_FLAGS_APPEND([AM_CFLAGS],[CFLAGS],[ \
  -std=c11 \
  -g3 \
  --mcet \
  -fcf-protection \
  --param=ssp-buffer-size=4 \
  -pipe \
  -fno-strict-aliasing \
  -fdiagnostics-color \
  -fexceptions \
  -fstack-clash-protection \
  -fstack-protector-strong \
  -fasynchronous-unwind-tables \
  -fdiagnostics-show-option \
  -Wall \
  -Wextra \
  -Wimplicit-fallthrough=5 \
  -Wcast-align \
  -Wstrict-prototypes \
  -Wlogical-op \
  -Wmissing-include-dirs \
  -Wold-style-definition \
  -Winit-self \
  -Wfloat-equal \
  -Wsuggest-attribute=noreturn \
  -Wformat=2 \
  -Wshadow \
  -Wendif-labels \
  -Wdate-time \
  -Wnested-externs \
  -Wconversion \
  -Werror \
])
# To enable:
#
#  -Wpedantic \
AC_SUBST(AM_CFLAGS)

AC_CONFIG_FILES([dqlite.pc Makefile])
AC_OUTPUT

dqlite-1.16.7/doc/faq.md
Moved to the [website project](https://dqlite.io/docs/faq).

dqlite-1.16.7/doc/index.md
Moved to the [website project](https://dqlite.io/docs).

dqlite-1.16.7/doc/protocol.md
Moved to the [website project](https://dqlite.io/docs/protocol).

dqlite-1.16.7/dqlite.pc.in
prefix=@prefix@
exec_prefix=@exec_prefix@
libdir=@libdir@
includedir=@includedir@

Name: dqlite
Description: Distributed SQLite engine
Version: @PACKAGE_VERSION@
Libs: -L${libdir} -ldqlite
Libs.private: @SQLITE_LIBS@ @UV_LIBS@ @RAFT_LIBS@
Cflags: -I${includedir}

dqlite-1.16.7/include/dqlite.h
#ifndef DQLITE_H
#define DQLITE_H

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <sys/types.h>

#ifndef DQLITE_API
#define DQLITE_API __attribute__((visibility("default")))
#endif

/**
 * This "pseudo-attribute" marks declarations that are only a provisional part
 * of the dqlite public API. These declarations may change or be removed
 * entirely in minor or point releases of dqlite, without bumping the soversion
 * of libdqlite.so. Consumers of dqlite who use these declarations are
 * responsible for updating their code in response to such breaking changes.
 */
#define DQLITE_EXPERIMENTAL

#ifndef DQLITE_VISIBLE_TO_TESTS
#define DQLITE_VISIBLE_TO_TESTS DQLITE_API
#endif

/**
 * Version.
 */
#define DQLITE_VERSION_MAJOR 1
#define DQLITE_VERSION_MINOR 16
#define DQLITE_VERSION_RELEASE 7
#define DQLITE_VERSION_NUMBER \
	(DQLITE_VERSION_MAJOR * 100 * 100 + DQLITE_VERSION_MINOR * 100 + \
	 DQLITE_VERSION_RELEASE)

DQLITE_API int dqlite_version_number(void);
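/*
 * For illustration (not part of the original header): the version macros
 * combine into a single comparable integer, so dqlite 1.16.7 yields
 * 1 * 100 * 100 + 16 * 100 + 7 = 11607. A hedged sketch of a compile-time
 * guard built on that encoding:
 *
 *     #if DQLITE_VERSION_NUMBER < 11607
 *     #error "dqlite >= 1.16.7 required"
 *     #endif
 *
 * At run time, dqlite_version_number() returns the same encoding for the
 * library actually linked, which may differ from the headers the caller
 * was compiled against.
 */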
*/ #define DQLITE_VERSION_MAJOR 1 #define DQLITE_VERSION_MINOR 16 #define DQLITE_VERSION_RELEASE 7 #define DQLITE_VERSION_NUMBER \ (DQLITE_VERSION_MAJOR * 100 * 100 + DQLITE_VERSION_MINOR * 100 + \ DQLITE_VERSION_RELEASE) DQLITE_API int dqlite_version_number(void); /** * Hold the value of a dqlite node ID. Guaranteed to be at least 64-bit long. */ typedef unsigned long long dqlite_node_id; DQLITE_EXPERIMENTAL typedef struct dqlite_server dqlite_server; /** * Signature of a custom callback used to establish network connections * to dqlite servers. * * @arg is a user data parameter, copied from the third argument of * dqlite_server_set_connect_func. @addr is a (borrowed) abstract address * string, as passed to dqlite_server_create or dqlite_server_set_auto_join. @fd * is an address where a socket representing the connection should be stored. * The callback should return zero if a connection was established successfully * or nonzero if the attempt failed. */ DQLITE_EXPERIMENTAL typedef int (*dqlite_connect_func)(void *arg, const char *addr, int *fd); /* The following dqlite_server functions return zero on success or nonzero on * error. More specific error codes may be specified in the future. */ /** * Start configuring a dqlite server. * * The server will not start running until dqlite_server_start is called. @path * is the path to a directory where the server (and attached client) will store * its persistent state; the directory must exist. A pointer to the new server * object is stored in @server on success. * * Whether or not this function succeeds, you should call dqlite_server_destroy * to release resources owned by the server object. * * No reference to @path is kept after this function returns. */ DQLITE_API DQLITE_EXPERIMENTAL int dqlite_server_create(const char *path, dqlite_server **server); /** * Set the abstract address of this server. * * This function must be called when the server starts for the first time, and * is a no-op when the server is restarting. The abstract address is recorded in * the Raft log and passed to the connect function on each server (see * dqlite_server_set_connect_func). The server will also bind to this address to * listen for incoming connections from clients and other servers, unless * dqlite_server_set_bind_address is used. For the address syntax accepted by * the default connect function (and for binding/listening), see * dqlite_server_set_bind_address. */ DQLITE_API DQLITE_EXPERIMENTAL int dqlite_server_set_address( dqlite_server *server, const char *address); /** * Turn on or off automatic bootstrap for this server. * * The bootstrap server should be the first to start up. It automatically * becomes the leader in the first term, and is responsible for adding all other * servers to the cluster configuration. There must be exactly one bootstrap * server in each cluster. After the first startup, the bootstrap server is no * longer special and this function is a no-op. */ DQLITE_API DQLITE_EXPERIMENTAL int dqlite_server_set_auto_bootstrap( dqlite_server *server, bool on); /** * Declare the addresses of existing servers in the cluster, which should * already be running. * * The server addresses declared with this function will not be used unless * @server is starting up for the first time; after the first startup, the list * of servers stored on disk will be used instead. (It is harmless to call this * function unconditionally.) 
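 *
 * A minimal sketch of a joining server's configuration (the data directory
 * and addresses here are hypothetical):
 *
 *   const char *peers[] = { "10.0.0.1:8080", "10.0.0.2:8080" };
 *   dqlite_server *server;
 *   int rv = dqlite_server_create("/var/lib/myapp/dqlite", &server);
 *   if (rv == 0) {
 *           rv = dqlite_server_set_address(server, "10.0.0.3:8080");
 *   }
 *   if (rv == 0) {
 *           rv = dqlite_server_set_auto_join(server, peers, 2);
 *   }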
*/ DQLITE_API DQLITE_EXPERIMENTAL int dqlite_server_set_auto_join( dqlite_server *server, const char *const *addrs, unsigned n); /** * Configure @server to listen on the address @addr for incoming connections * (from clients and other servers). * * If no bind address is configured with this function, the abstract address * passed to dqlite_server_create will be used. The point of this function is to * support decoupling the abstract address from the networking implementation * (for example, if a proxy is going to be used). * * @addr must use one of the following formats: * * 1. "<HOST>" * 2. "<HOST>:<PORT>" * 3. "@<PATH>" * * Where <HOST> is a numeric IPv4/IPv6 address, <PORT> is a port number, and * <PATH> is an abstract Unix socket path. The port number defaults to 8080 if * not specified. In the second form, if <HOST> is an IPv6 address, it must be * enclosed in square brackets "[]". In the third form, if <PATH> is empty, the * implementation will automatically select an available abstract Unix socket * path. * * If an abstract Unix socket is used, the server will accept only * connections originating from the same process. */ DQLITE_API DQLITE_EXPERIMENTAL int dqlite_server_set_bind_address( dqlite_server *server, const char *addr); /** * Configure the function that this server will use to connect to other servers. * * The same function will be used by the server's attached client to establish * connections to all servers in the cluster. @arg is a user data parameter that * will be passed to all invocations of the connect function. */ DQLITE_API DQLITE_EXPERIMENTAL int dqlite_server_set_connect_func( dqlite_server *server, dqlite_connect_func f, void *arg); /** * Start running the server. * * Once this function returns successfully, the server will be ready to accept * client requests using the functions below. */ DQLITE_API DQLITE_EXPERIMENTAL int dqlite_server_start(dqlite_server *server); /** * Get the ID of the server. * * This will return 0 (an invalid ID) if the server has not been started. */ DQLITE_API DQLITE_EXPERIMENTAL dqlite_node_id dqlite_server_get_id(dqlite_server *server); /** * Hand over the server's privileges to other servers. * * This is intended to be called before dqlite_server_stop. The server will try * to surrender leadership and voting rights to other nodes in the cluster, if * applicable. This avoids some disruptions that can result when a privileged * server stops suddenly. */ DQLITE_API DQLITE_EXPERIMENTAL int dqlite_server_handover( dqlite_server *server); /** * Stop the server. * * The server will stop processing requests from clients or other servers. To * smooth over some possible disruptions to the cluster, call * dqlite_server_handover before this function. After this function returns * (successfully or not), you should call dqlite_server_destroy to free * resources owned by the server. */ DQLITE_API DQLITE_EXPERIMENTAL int dqlite_server_stop(dqlite_server *server); /** * Free resources owned by the server. * * You should always call this function to finalize a server created with * dqlite_server_create, whether or not that function returned successfully. * If the server has been successfully started with dqlite_server_start, * then you must stop it with dqlite_server_stop before calling this function. */ DQLITE_API DQLITE_EXPERIMENTAL void dqlite_server_destroy( dqlite_server *server); /** * Error codes. * * These are used only with the dqlite_node family of functions.
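 *
 * For example (a sketch; the variables shown are illustrative), a caller can
 * compare the return value of a dqlite_node function against these codes:
 *
 *   int rv = dqlite_node_create(1, address, data_dir, &node);
 *   if (rv == DQLITE_NOMEM) {
 *           fprintf(stderr, "dqlite: out of memory\n");
 *   }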
*/ enum { DQLITE_ERROR = 1, /* Generic error */ DQLITE_MISUSE, /* Library used incorrectly */ DQLITE_NOMEM /* A malloc() failed */ }; /** * Dqlite node handle. * * Opaque handle to a single dqlite node that can serve database requests from * connected clients and exchange data replication messages with other dqlite * nodes. */ typedef struct dqlite_node dqlite_node; /** * Create a new dqlite node object. * * The @id argument is a positive number that identifies this particular dqlite * node in the cluster. Each dqlite node that is part of the same cluster must be * created with a different ID. The very first node, used to bootstrap a new * cluster, must have ID #1. Every time a node is started again, it must be * passed the same ID. * The @address argument is the network address that clients or other nodes in * the cluster must use to connect to this dqlite node. If no custom connect * function is going to be set using dqlite_node_set_connect_func(), then the * format of the string must be "<HOST>" or "<HOST>:<PORT>", where <HOST> is a * numeric IPv4/IPv6 address and <PORT> is a port number. The port number * defaults to 8080 if not specified. If a port number is specified with an * IPv6 address, the address must be enclosed in square brackets "[]". * * If a custom connect function is used, then the format of the string must be * whatever the custom connect function accepts. * * The @data_dir argument is the file system path where the node should store its * durable data, such as Raft log entries containing WAL frames of the SQLite * databases being replicated. * * No reference to the memory pointed to by @address and @data_dir is kept by * the dqlite library, so any memory associated with them can be released after * the function returns. * * Even if an error is returned, the caller should call dqlite_node_destroy() * on the dqlite_node* value pointed to by @n, and calling dqlite_node_errmsg() * with that value will return a valid error string. (In some cases *n will be * set to NULL, but dqlite_node_destroy() and dqlite_node_errmsg() will handle * this gracefully.) */ DQLITE_API int dqlite_node_create(dqlite_node_id id, const char *address, const char *data_dir, dqlite_node **n); /** * Destroy a dqlite node object. * * This will release all memory that was allocated by the node. If * dqlite_node_start() was successfully invoked, then dqlite_node_stop() must be * invoked before destroying the node. */ DQLITE_API void dqlite_node_destroy(dqlite_node *n); /** * Instruct the dqlite node to bind a network address when starting, and * listen for incoming client connections. * * The given address might match the one passed to @dqlite_node_create or be a * different one (for example if the application wants to proxy it). * * The format of the @address argument must be one of * * 1. "<HOST>" * 2. "<HOST>:<PORT>" * 3. "@<PATH>" * * Where <HOST> is a numeric IPv4/IPv6 address, <PORT> is a port number, and * <PATH> is an abstract Unix socket path. The port number defaults to 8080 if * not specified. In the second form, if <HOST> is an IPv6 address, it must be * enclosed in square brackets "[]". In the third form, if <PATH> is empty, the * implementation will automatically select an available abstract Unix socket * path, which can then be retrieved with dqlite_node_get_bind_address(). * * If an abstract Unix socket is used, the dqlite node will accept only * connections originating from the same process. * * No reference to the memory pointed to by @address is kept, so any memory * associated with it can be released after the function returns.
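 *
 * As an illustration (the ID, addresses, and directory are hypothetical), a
 * typical startup sequence is:
 *
 *   dqlite_node *node;
 *   int rv = dqlite_node_create(1, "192.168.1.10:9001", "/var/lib/myapp",
 *                               &node);
 *   if (rv == 0) {
 *           rv = dqlite_node_set_bind_address(node, "192.168.1.10:9001");
 *   }
 *   if (rv == 0) {
 *           rv = dqlite_node_start(node);
 *   }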
* * This function must be called before calling dqlite_node_start(). */ DQLITE_API int dqlite_node_set_bind_address(dqlite_node *n, const char *address); /** * Get the network address that the dqlite node is using to accept incoming * connections. */ DQLITE_API const char *dqlite_node_get_bind_address(dqlite_node *n); /** * Set a custom connect function. * * The function should block until a network connection with the dqlite node at * the given @address is established, or an error occurs. * * In case of success, the file descriptor of the connected socket must be saved * into the location pointed to by the @fd argument. The socket must be either a * TCP or a Unix socket. * * This function must be called before calling dqlite_node_start(). */ DQLITE_API int dqlite_node_set_connect_func(dqlite_node *n, int (*f)(void *arg, const char *address, int *fd), void *arg); /** * DEPRECATED - USE `dqlite_node_set_network_latency_ms` * Set the average one-way network latency, expressed in nanoseconds. * * This value is used internally by dqlite to decide how frequently the leader * node should send heartbeats to other nodes in order to maintain its * leadership, and how long other nodes should wait before deciding that the * leader has died and initiating a failover. * * This function must be called before calling dqlite_node_start(). */ DQLITE_API int dqlite_node_set_network_latency(dqlite_node *n, unsigned long long nanoseconds); /** * Set the average one-way network latency, expressed in milliseconds. * * This value is used internally by dqlite to decide how frequently the leader * node should send heartbeats to other nodes in order to maintain its * leadership, and how long other nodes should wait before deciding that the * leader has died and initiating a failover. * * This function must be called before calling dqlite_node_start(). * * Latency should not be 0 or larger than 3600000 milliseconds. */ DQLITE_API int dqlite_node_set_network_latency_ms(dqlite_node *n, unsigned milliseconds); /** * Set the failure domain associated with this node. * * This is effectively a tag applied to the node that can be inspected later * with the "Describe node" client request. */ DQLITE_API int dqlite_node_set_failure_domain(dqlite_node *n, unsigned long long code); /** * Set the snapshot parameters for this node. * * This function determines how frequently a node will snapshot the state * of the database and how many raft log entries will be kept around after * a snapshot has been taken. * * `snapshot_threshold` : Determines the frequency of taking a snapshot: the * lower the number, the higher the frequency. * * `snapshot_trailing` : Determines the number of log entries kept around after * taking a snapshot. Lowering this number decreases disk and memory footprint * but increases the chance of having to send a full snapshot (instead of a * number of log entries) to a node that has fallen behind. * * This function must be called before calling dqlite_node_start(). */ DQLITE_API int dqlite_node_set_snapshot_params(dqlite_node *n, unsigned snapshot_threshold, unsigned snapshot_trailing); /** * Set the block size used for performing disk IO when writing raft log segments * to disk. @size is limited to a list of preset values. * * This function must be called before calling dqlite_node_start(). */ DQLITE_API int dqlite_node_set_block_size(dqlite_node *n, size_t size); /** * WARNING: This is an experimental API. * * By default dqlite holds the SQLite database file and WAL in memory.
By * enabling disk-mode, dqlite will hold the SQLite database file on disk while * keeping the WAL in memory. It must be called after `dqlite_node_create` and * before `dqlite_node_start`. */ DQLITE_API int dqlite_node_enable_disk_mode(dqlite_node *n); /** * Set the target number of voting nodes for the cluster. * * If automatic role management is enabled, the cluster leader will attempt to * promote nodes to reach the target. If automatic role management is disabled, * this has no effect. * * The default target is 3 voters. */ DQLITE_API int dqlite_node_set_target_voters(dqlite_node *n, int voters); /** * Set the target number of standby nodes for the cluster. * * If automatic role management is enabled, the cluster leader will attempt to * promote nodes to reach the target. If automatic role management is disabled, * this has no effect. * * The default target is 0 standbys. */ DQLITE_API int dqlite_node_set_target_standbys(dqlite_node *n, int standbys); /** * Set the target number of threads in the thread pool processing sqlite3 disk * operations. * * The default pool thread count is 4. */ DQLITE_API int dqlite_node_set_pool_thread_count(dqlite_node *n, unsigned thread_count); /** * Enable or disable auto-recovery for corrupted disk files. * * When auto-recovery is enabled, files in the data directory that are * determined to be corrupt may be removed by dqlite at startup. This allows * the node to start up successfully in more situations, but comes at the cost * of possible data loss, and may mask bugs. * * This must be called before dqlite_node_start. * * Auto-recovery is enabled by default. */ DQLITE_API int dqlite_node_set_auto_recovery(dqlite_node *n, bool enabled); /** * Enable or disable raft snapshot compression. */ DQLITE_API int dqlite_node_set_snapshot_compression(dqlite_node *n, bool enabled); /** * Enable automatic role management on the server side for this node. * * When automatic role management is enabled, servers in a dqlite cluster will * autonomously (without client intervention) promote and demote each other * to maintain a specified number of voters and standbys, taking into account * the health, failure domain, and weight of each server. * * By default, no automatic role management is performed. */ DQLITE_API int dqlite_node_enable_role_management(dqlite_node *n); /** * Start a dqlite node. * * A background thread will be spawned which will run the node's main loop. If * this function returns successfully, the dqlite node is ready to accept new * connections. */ DQLITE_API int dqlite_node_start(dqlite_node *n); /** * Attempt to hand over this node's privileges to other nodes in preparation * for a graceful shutdown. * * Specifically, if this node is the cluster leader, this will cause another * voting node (if one exists) to be elected leader; then, if this node is a * voter, another non-voting node (if one exists) will be promoted to voter, and * then this node will be demoted to spare. * * This function returns 0 if all privileges were handed over successfully, * and nonzero otherwise. Callers can continue to dqlite_node_stop immediately * after this function returns (whether or not it succeeded), or include their * own graceful shutdown logic before dqlite_node_stop. */ DQLITE_API int dqlite_node_handover(dqlite_node *n); /** * Stop a dqlite node. * * The background thread running the main loop will be notified and the node * will not accept any new client connections.
Once inflight requests are * completed, open client connections get closed and then the thread exits. */ DQLITE_API int dqlite_node_stop(dqlite_node *n); struct dqlite_node_info { dqlite_node_id id; const char *address; }; typedef struct dqlite_node_info dqlite_node_info; /* Defined to be an extensible struct; future additions to this struct should be * 64-bits wide and 0 should not be used as a valid value. */ struct dqlite_node_info_ext { uint64_t size; /* The size of this struct */ uint64_t id; /* dqlite_node_id */ uint64_t address; uint64_t dqlite_role; }; typedef struct dqlite_node_info_ext dqlite_node_info_ext; #define DQLITE_NODE_INFO_EXT_SZ_ORIG 32U /* (4 * 64) / 8 */ /** * !!! Deprecated, use `dqlite_node_recover_ext` instead, which also includes * dqlite roles. !!! * * Force recovering a dqlite node which is part of a cluster in which a majority * of nodes have died, and which has therefore become unavailable. * * In order for this operation to be safe you must follow these steps: * * 1. Make sure no dqlite node in the cluster is running. * * 2. Identify all dqlite nodes that have survived and that you want to be part * of the recovered cluster. * * 3. Among the surviving dqlite nodes, find the one with the most up-to-date * raft term and log. * * 4. Invoke @dqlite_node_recover exactly one time, on the node you found in * step 3, and pass it an array of #dqlite_node_info filled with the IDs and * addresses of the surviving nodes, including the one being recovered. * * 5. Copy the data directory of the node you ran @dqlite_node_recover on to all * other surviving nodes in the cluster, replacing their current data * directory. * * 6. Restart all nodes. */ DQLITE_API int dqlite_node_recover(dqlite_node *n, dqlite_node_info infos[], int n_info); /** * Force recovering a dqlite node which is part of a cluster in which a majority * of nodes have died, and which has therefore become unavailable. * * In order for this operation to be safe you must follow these steps: * * 1. Make sure no dqlite node in the cluster is running. * * 2. Identify all dqlite nodes that have survived and that you want to be part * of the recovered cluster. * * 3. Among the surviving dqlite nodes, find the one with the most up-to-date * raft term and log. * * 4. Invoke @dqlite_node_recover_ext exactly one time, on the node you found in * step 3, and pass it an array of #dqlite_node_info filled with the IDs, * addresses, and roles of the surviving nodes, including the one being * recovered. * * 5. Copy the data directory of the node you ran @dqlite_node_recover_ext on to * all other surviving nodes in the cluster, replacing their current data * directory. * * 6. Restart all nodes. */ DQLITE_API int dqlite_node_recover_ext(dqlite_node *n, dqlite_node_info_ext infos[], int n_info); /** * Return a human-readable description of the last error that occurred. */ DQLITE_API const char *dqlite_node_errmsg(dqlite_node *n); /** * Generate a unique ID for the given address. */ DQLITE_API dqlite_node_id dqlite_generate_node_id(const char *address); /** * This function is DEPRECATED and will be removed in a future major release. * * Initialize the given SQLite VFS interface object with dqlite's custom * implementation, which can be used for replication. */ DQLITE_API int dqlite_vfs_init(sqlite3_vfs *vfs, const char *name); DQLITE_API int dqlite_vfs_enable_disk(sqlite3_vfs *vfs); /** * This function is DEPRECATED and will be removed in a future major release.
* * Release all memory used internally by a SQLite VFS object that was * initialized using @dqlite_vfs_init. */ DQLITE_API void dqlite_vfs_close(sqlite3_vfs *vfs); /** * This type is DEPRECATED and will be removed in a future major release. * * A single WAL frame to be replicated. */ struct dqlite_vfs_frame { unsigned long page_number; /* Database page number. */ void *data; /* Content of the database page. */ }; typedef struct dqlite_vfs_frame dqlite_vfs_frame; /** * This function is DEPRECATED and will be removed in a future major release. * * Check if the last call to sqlite3_step() has triggered a write transaction on * the database with the given filename. In that case, acquire a WAL write lock * to prevent further write transactions, and return all new WAL frames * generated by the transaction. These frames are meant to be replicated across * nodes and then actually added to the WAL with dqlite_vfs_apply() once a * quorum is reached. If a quorum is not reached within a given time, then * dqlite_vfs_abort() can be used to abort and release the WAL write lock. */ DQLITE_API int dqlite_vfs_poll(sqlite3_vfs *vfs, const char *filename, dqlite_vfs_frame **frames, unsigned *n); /** * This function is DEPRECATED and will be removed in a future major release. * * Add to the WAL all frames that were generated by a write transaction * triggered by sqlite3_step() and that were obtained via dqlite_vfs_poll(). * * This interface is designed to match the typical use case of a node receiving * the frames by sequentially reading a byte stream from a network socket and * passing the data to this routine directly without any copy or further * allocation, except possibly for integer encoding/decoding. */ DQLITE_API int dqlite_vfs_apply(sqlite3_vfs *vfs, const char *filename, unsigned n, unsigned long *page_numbers, void *frames); /** * This function is DEPRECATED and will be removed in a future major release. * * Abort a pending write transaction that was triggered by sqlite3_step() and * whose frames were obtained via dqlite_vfs_poll(). * * This should be called if the transaction could not be safely replicated. In * particular it will release the write lock acquired by dqlite_vfs_poll(). */ DQLITE_API int dqlite_vfs_abort(sqlite3_vfs *vfs, const char *filename); /** * This function is DEPRECATED and will be removed in a future major release. * * Return a snapshot of the main database file and of the WAL file. */ DQLITE_API int dqlite_vfs_snapshot(sqlite3_vfs *vfs, const char *filename, void **data, size_t *n); /** * This type is DEPRECATED and will be removed in a future major release. * * A data buffer. */ struct dqlite_buffer { void *base; /* Pointer to the buffer data. */ size_t len; /* Length of the buffer. */ }; /** * This function is DEPRECATED and will be removed in a future major release. * * Return a shallow snapshot of the main database file and of the WAL file. * Expects a bufs array of size x + 1, where x is obtained from * `dqlite_vfs_num_pages`. */ DQLITE_API int dqlite_vfs_shallow_snapshot(sqlite3_vfs *vfs, const char *filename, struct dqlite_buffer bufs[], unsigned n); /** * This function is DEPRECATED and will be removed in a future major release. */ DQLITE_API int dqlite_vfs_snapshot_disk(sqlite3_vfs *vfs, const char *filename, struct dqlite_buffer bufs[], unsigned n); /** * This function is DEPRECATED and will be removed in a future major release. * * Return the number of database pages (excluding WAL).
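 *
 * For instance (a sketch with error handling omitted; "test.db" is an
 * illustrative filename), callers of dqlite_vfs_shallow_snapshot() size
 * their buffer array from this count:
 *
 *   unsigned n_pages;
 *   dqlite_vfs_num_pages(vfs, "test.db", &n_pages);
 *   struct dqlite_buffer *bufs = calloc(n_pages + 1, sizeof *bufs);
 *   dqlite_vfs_shallow_snapshot(vfs, "test.db", bufs, n_pages + 1);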
*/ DQLITE_API int dqlite_vfs_num_pages(sqlite3_vfs *vfs, const char *filename, unsigned *n); /** * This function is DEPRECATED and will be removed in a future major release. * * Restore a snapshot of the main database file and of the WAL file. */ DQLITE_API int dqlite_vfs_restore(sqlite3_vfs *vfs, const char *filename, const void *data, size_t n); /** * This function is DEPRECATED and will be removed in a future major release. * * Restore a snapshot of the main database file and of the WAL file. */ DQLITE_API int dqlite_vfs_restore_disk(sqlite3_vfs *vfs, const char *filename, const void *data, size_t main_size, size_t wal_size); #endif /* DQLITE_H */ dqlite-1.16.7/m4/000077500000000000000000000000001465252713400134215ustar00rootroot00000000000000dqlite-1.16.7/m4/.gitignore000066400000000000000000000003771465252713400154200ustar00rootroot00000000000000*.m4 !attributes.m4 !ax_ac_append_to_file.m4 !ax_ac_print_to_file.m4 !ax_add_am_macro_static.m4 !ax_am_macros_static.m4 !ax_check_compile_flag.m4 !ax_check_gnu_make.m4 !ax_code_coverage.m4 !ax_compare_version.m4 !ax_file_escapes.m4 !ax_pthread.m4 !pkg.m4 dqlite-1.16.7/m4/attributes.m4000066400000000000000000000240211465252713400160500ustar00rootroot00000000000000dnl Macros to check the presence of generic (non-typed) symbols. dnl Copyright (c) 2006-2008 Diego Pettenò dnl Copyright (c) 2006-2008 xine project dnl Copyright (c) 2012 Lucas De Marchi dnl dnl This program is free software; you can redistribute it and/or modify dnl it under the terms of the GNU General Public License as published by dnl the Free Software Foundation; either version 2, or (at your option) dnl any later version. dnl dnl This program is distributed in the hope that it will be useful, dnl but WITHOUT ANY WARRANTY; without even the implied warranty of dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the dnl GNU General Public License for more details. dnl dnl You should have received a copy of the GNU General Public License dnl along with this program; if not, write to the Free Software dnl Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA dnl 02110-1301, USA. dnl dnl As a special exception, the copyright owners of the dnl macro gives unlimited permission to copy, distribute and modify the dnl configure scripts that are the output of Autoconf when processing the dnl Macro. You need not follow the terms of the GNU General Public dnl License when using or distributing such scripts, even though portions dnl of the text of the Macro appear in them. The GNU General Public dnl License (GPL) does govern all other use of the material that dnl constitutes the Autoconf Macro. dnl dnl This special exception to the GPL applies to versions of the dnl Autoconf Macro released by this project. When you make and dnl distribute a modified version of the Autoconf Macro, you may extend dnl this special exception to the GPL to apply to your modified version as dnl well. dnl Check if FLAG in ENV-VAR is supported by compiler and append it dnl to WHERE-TO-APPEND variable. Note that we invert -Wno-* checks to dnl -W* as gcc cannot test for negated warnings. If a C snippet is passed, dnl use it, otherwise use a simple main() definition that just returns 0. 
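dnl For instance, this package's configure.ac (earlier in this tree) gathers
dnl its warning flags with the plural wrapper, roughly
dnl CC_CHECK_FLAGS_APPEND([AM_CFLAGS],[CFLAGS],[-Wall -Wextra]); the flag list
dnl shown here is abbreviated for illustration. Usage: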
dnl CC_CHECK_FLAG_APPEND([WHERE-TO-APPEND], [ENV-VAR], [FLAG], [C-SNIPPET]) AC_DEFUN([CC_CHECK_FLAG_APPEND], [ AC_CACHE_CHECK([if $CC supports flag $3 in envvar $2], AS_TR_SH([cc_cv_$2_$3]), [eval "AS_TR_SH([cc_save_$2])='${$2}'" eval "AS_TR_SH([$2])='${cc_save_$2} -Werror `echo "$3" | sed 's/^-Wno-/-W/'`'" AC_LINK_IFELSE([AC_LANG_SOURCE(ifelse([$4], [], [int main(void) { return 0; } ], [$4]))], [eval "AS_TR_SH([cc_cv_$2_$3])='yes'"], [eval "AS_TR_SH([cc_cv_$2_$3])='no'"]) eval "AS_TR_SH([$2])='$cc_save_$2'"]) AS_IF([eval test x$]AS_TR_SH([cc_cv_$2_$3])[ = xyes], [eval "$1='${$1} $3'"]) ]) dnl CC_CHECK_FLAGS_APPEND([WHERE-TO-APPEND], [ENV-VAR], [FLAG1 FLAG2], [C-SNIPPET]) AC_DEFUN([CC_CHECK_FLAGS_APPEND], [ for flag in [$3]; do CC_CHECK_FLAG_APPEND([$1], [$2], $flag, [$4]) done ]) dnl Check if the flag is supported by linker (cacheable) dnl CC_CHECK_LDFLAGS([FLAG], [ACTION-IF-FOUND],[ACTION-IF-NOT-FOUND]) AC_DEFUN([CC_CHECK_LDFLAGS], [ AC_CACHE_CHECK([if $CC supports $1 flag], AS_TR_SH([cc_cv_ldflags_$1]), [ac_save_LDFLAGS="$LDFLAGS" LDFLAGS="$LDFLAGS $1" AC_LINK_IFELSE([int main() { return 1; }], [eval "AS_TR_SH([cc_cv_ldflags_$1])='yes'"], [eval "AS_TR_SH([cc_cv_ldflags_$1])="]) LDFLAGS="$ac_save_LDFLAGS" ]) AS_IF([eval test x$]AS_TR_SH([cc_cv_ldflags_$1])[ = xyes], [$2], [$3]) ]) dnl define the LDFLAGS_NOUNDEFINED variable with the correct value for dnl the current linker to avoid undefined references in a shared object. AC_DEFUN([CC_NOUNDEFINED], [ dnl We check $host for which systems to enable this for. AC_REQUIRE([AC_CANONICAL_HOST]) case $host in dnl FreeBSD (et al.) does not complete linking for shared objects when pthreads dnl are requested, as different implementations are present; to avoid problems dnl use -Wl,-z,defs only for those platform not behaving this way. *-freebsd* | *-openbsd*) ;; *) dnl First of all check for the --no-undefined variant of GNU ld. This allows dnl for a much more readable command line, so that people can understand what dnl it does without going to look for what the heck -z defs does. for possible_flags in "-Wl,--no-undefined" "-Wl,-z,defs"; do CC_CHECK_LDFLAGS([$possible_flags], [LDFLAGS_NOUNDEFINED="$possible_flags"]) break done ;; esac AC_SUBST([LDFLAGS_NOUNDEFINED]) ]) dnl Check for a -Werror flag or equivalent. -Werror is the GCC dnl and ICC flag that tells the compiler to treat all the warnings dnl as fatal. We usually need this option to make sure that some dnl constructs (like attributes) are not simply ignored. 
dnl dnl Other compilers don't support -Werror per se, but they support dnl an equivalent flag: dnl - Sun Studio compiler supports -errwarn=%all AC_DEFUN([CC_CHECK_WERROR], [ AC_CACHE_CHECK( [for $CC way to treat warnings as errors], [cc_cv_werror], [CC_CHECK_CFLAGS_SILENT([-Werror], [cc_cv_werror=-Werror], [CC_CHECK_CFLAGS_SILENT([-errwarn=%all], [cc_cv_werror=-errwarn=%all])]) ]) ]) AC_DEFUN([CC_CHECK_ATTRIBUTE], [ AC_REQUIRE([CC_CHECK_WERROR]) AC_CACHE_CHECK([if $CC supports __attribute__(( ifelse([$2], , [$1], [$2]) ))], AS_TR_SH([cc_cv_attribute_$1]), [ac_save_CFLAGS="$CFLAGS" CFLAGS="$CFLAGS $cc_cv_werror" AC_COMPILE_IFELSE([AC_LANG_SOURCE([$3])], [eval "AS_TR_SH([cc_cv_attribute_$1])='yes'"], [eval "AS_TR_SH([cc_cv_attribute_$1])='no'"]) CFLAGS="$ac_save_CFLAGS" ]) AS_IF([eval test x$]AS_TR_SH([cc_cv_attribute_$1])[ = xyes], [AC_DEFINE( AS_TR_CPP([SUPPORT_ATTRIBUTE_$1]), 1, [Define this if the compiler supports __attribute__(( ifelse([$2], , [$1], [$2]) ))] ) $4], [$5]) ]) AC_DEFUN([CC_ATTRIBUTE_CONSTRUCTOR], [ CC_CHECK_ATTRIBUTE( [constructor],, [void __attribute__((constructor)) ctor() { int a; }], [$1], [$2]) ]) AC_DEFUN([CC_ATTRIBUTE_FORMAT], [ CC_CHECK_ATTRIBUTE( [format], [format(printf, n, n)], [void __attribute__((format(printf, 1, 2))) printflike(const char *fmt, ...) { fmt = (void *)0; }], [$1], [$2]) ]) AC_DEFUN([CC_ATTRIBUTE_FORMAT_ARG], [ CC_CHECK_ATTRIBUTE( [format_arg], [format_arg(printf)], [char *__attribute__((format_arg(1))) gettextlike(const char *fmt) { fmt = (void *)0; }], [$1], [$2]) ]) AC_DEFUN([CC_ATTRIBUTE_VISIBILITY], [ CC_CHECK_ATTRIBUTE( [visibility_$1], [visibility("$1")], [void __attribute__((visibility("$1"))) $1_function() { }], [$2], [$3]) ]) AC_DEFUN([CC_ATTRIBUTE_NONNULL], [ CC_CHECK_ATTRIBUTE( [nonnull], [nonnull()], [void __attribute__((nonnull())) some_function(void *foo, void *bar) { foo = (void*)0; bar = (void*)0; }], [$1], [$2]) ]) AC_DEFUN([CC_ATTRIBUTE_UNUSED], [ CC_CHECK_ATTRIBUTE( [unused], , [void some_function(void *foo, __attribute__((unused)) void *bar);], [$1], [$2]) ]) AC_DEFUN([CC_ATTRIBUTE_SENTINEL], [ CC_CHECK_ATTRIBUTE( [sentinel], , [void some_function(void *foo, ...) __attribute__((sentinel));], [$1], [$2]) ]) AC_DEFUN([CC_ATTRIBUTE_DEPRECATED], [ CC_CHECK_ATTRIBUTE( [deprecated], , [void some_function(void *foo, ...) 
__attribute__((deprecated));], [$1], [$2]) ]) AC_DEFUN([CC_ATTRIBUTE_ALIAS], [ CC_CHECK_ATTRIBUTE( [alias], [weak, alias], [void other_function(void *foo) { } void some_function(void *foo) __attribute__((weak, alias("other_function")));], [$1], [$2]) ]) AC_DEFUN([CC_ATTRIBUTE_MALLOC], [ CC_CHECK_ATTRIBUTE( [malloc], , [void * __attribute__((malloc)) my_alloc(int n);], [$1], [$2]) ]) AC_DEFUN([CC_ATTRIBUTE_PACKED], [ CC_CHECK_ATTRIBUTE( [packed], , [struct astructure { char a; int b; long c; void *d; } __attribute__((packed));], [$1], [$2]) ]) AC_DEFUN([CC_ATTRIBUTE_CONST], [ CC_CHECK_ATTRIBUTE( [const], , [int __attribute__((const)) twopow(int n) { return 1 << n; } ], [$1], [$2]) ]) AC_DEFUN([CC_FLAG_VISIBILITY], [ AC_REQUIRE([CC_CHECK_WERROR]) AC_CACHE_CHECK([if $CC supports -fvisibility=hidden], [cc_cv_flag_visibility], [cc_flag_visibility_save_CFLAGS="$CFLAGS" CFLAGS="$CFLAGS $cc_cv_werror" CC_CHECK_CFLAGS_SILENT([-fvisibility=hidden], cc_cv_flag_visibility='yes', cc_cv_flag_visibility='no') CFLAGS="$cc_flag_visibility_save_CFLAGS"]) AS_IF([test "x$cc_cv_flag_visibility" = "xyes"], [AC_DEFINE([SUPPORT_FLAG_VISIBILITY], 1, [Define this if the compiler supports the -fvisibility flag]) $1], [$2]) ]) AC_DEFUN([CC_FUNC_EXPECT], [ AC_REQUIRE([CC_CHECK_WERROR]) AC_CACHE_CHECK([if compiler has __builtin_expect function], [cc_cv_func_expect], [ac_save_CFLAGS="$CFLAGS" CFLAGS="$CFLAGS $cc_cv_werror" AC_COMPILE_IFELSE([AC_LANG_SOURCE( [int some_function() { int a = 3; return (int)__builtin_expect(a, 3); }])], [cc_cv_func_expect=yes], [cc_cv_func_expect=no]) CFLAGS="$ac_save_CFLAGS" ]) AS_IF([test "x$cc_cv_func_expect" = "xyes"], [AC_DEFINE([SUPPORT__BUILTIN_EXPECT], 1, [Define this if the compiler supports __builtin_expect() function]) $1], [$2]) ]) AC_DEFUN([CC_ATTRIBUTE_ALIGNED], [ AC_REQUIRE([CC_CHECK_WERROR]) AC_CACHE_CHECK([highest __attribute__ ((aligned ())) supported], [cc_cv_attribute_aligned], [ac_save_CFLAGS="$CFLAGS" CFLAGS="$CFLAGS $cc_cv_werror" for cc_attribute_align_try in 64 32 16 8 4 2; do AC_COMPILE_IFELSE([AC_LANG_SOURCE([ int main() { static char c __attribute__ ((aligned($cc_attribute_align_try))) = 0; return c; }])], [cc_cv_attribute_aligned=$cc_attribute_align_try; break]) done CFLAGS="$ac_save_CFLAGS" ]) if test "x$cc_cv_attribute_aligned" != "x"; then AC_DEFINE_UNQUOTED([ATTRIBUTE_ALIGNED_MAX], [$cc_cv_attribute_aligned], [Define the highest alignment supported]) fi ]) dqlite-1.16.7/m4/ax_ac_append_to_file.m4000066400000000000000000000016221465252713400177670ustar00rootroot00000000000000# =========================================================================== # https://www.gnu.org/software/autoconf-archive/ax_ac_append_to_file.html # =========================================================================== # # SYNOPSIS # # AX_AC_APPEND_TO_FILE([FILE],[DATA]) # # DESCRIPTION # # Appends the specified data to the specified file when Autoconf is run. If # you want to append to a file when configure is run use AX_APPEND_TO_FILE # instead. # # LICENSE # # Copyright (c) 2009 Allan Caffee # # Copying and distribution of this file, with or without modification, are # permitted in any medium without royalty provided the copyright notice # and this notice are preserved. This file is offered as-is, without any # warranty.
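#
# For example (an illustrative invocation), a macro can append an Automake
# fragment to the static include file at autoconf time:
#
#   AX_AC_APPEND_TO_FILE([aminclude_static.am],[# extra fragment])
#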
#serial 10 AC_DEFUN([AX_AC_APPEND_TO_FILE],[ AC_REQUIRE([AX_FILE_ESCAPES]) m4_esyscmd( AX_FILE_ESCAPES [ printf "%s" "$2" >> "$1" ]) ]) dqlite-1.16.7/m4/ax_ac_print_to_file.m4000066400000000000000000000016111465252713400176520ustar00rootroot00000000000000# =========================================================================== # https://www.gnu.org/software/autoconf-archive/ax_ac_print_to_file.html # =========================================================================== # # SYNOPSIS # # AX_AC_PRINT_TO_FILE([FILE],[DATA]) # # DESCRIPTION # # Writes the specified data to the specified file when Autoconf is run. If # you want to print to a file when configure is run use AX_PRINT_TO_FILE # instead. # # LICENSE # # Copyright (c) 2009 Allan Caffee # # Copying and distribution of this file, with or without modification, are # permitted in any medium without royalty provided the copyright notice # and this notice are preserved. This file is offered as-is, without any # warranty. #serial 10 AC_DEFUN([AX_AC_PRINT_TO_FILE],[ m4_esyscmd( AC_REQUIRE([AX_FILE_ESCAPES]) [ printf "%s" "$2" > "$1" ]) ]) dqlite-1.16.7/m4/ax_add_am_macro_static.m4000066400000000000000000000015251465252713400203130ustar00rootroot00000000000000# =========================================================================== # https://www.gnu.org/software/autoconf-archive/ax_add_am_macro_static.html # =========================================================================== # # SYNOPSIS # # AX_ADD_AM_MACRO_STATIC([RULE]) # # DESCRIPTION # # Adds the specified rule to $AMINCLUDE. # # LICENSE # # Copyright (c) 2009 Tom Howard # Copyright (c) 2009 Allan Caffee # # Copying and distribution of this file, with or without modification, are # permitted in any medium without royalty provided the copyright notice # and this notice are preserved. This file is offered as-is, without any # warranty. #serial 8 AC_DEFUN([AX_ADD_AM_MACRO_STATIC],[ AC_REQUIRE([AX_AM_MACROS_STATIC]) AX_AC_APPEND_TO_FILE(AMINCLUDE_STATIC,[$1]) ]) dqlite-1.16.7/m4/ax_am_macros_static.m4000066400000000000000000000021251465252713400176630ustar00rootroot00000000000000# =========================================================================== # https://www.gnu.org/software/autoconf-archive/ax_am_macros_static.html # =========================================================================== # # SYNOPSIS # # AX_AM_MACROS_STATIC # # DESCRIPTION # # Adds support for macros that create Automake rules. You must manually # add the following line # # include $(top_srcdir)/aminclude_static.am # # to your Makefile.am files. # # LICENSE # # Copyright (c) 2009 Tom Howard # Copyright (c) 2009 Allan Caffee # # Copying and distribution of this file, with or without modification, are # permitted in any medium without royalty provided the copyright notice # and this notice are preserved. This file is offered as-is, without any # warranty. 
#serial 11 AC_DEFUN([AMINCLUDE_STATIC],[aminclude_static.am]) AC_DEFUN([AX_AM_MACROS_STATIC], [ AX_AC_PRINT_TO_FILE(AMINCLUDE_STATIC,[ # ]AMINCLUDE_STATIC[ generated automatically by Autoconf # from AX_AM_MACROS_STATIC on ]m4_esyscmd([LC_ALL=C date])[ ]) ]) dqlite-1.16.7/m4/ax_check_compile_flag.m4000066400000000000000000000040701465252713400201320ustar00rootroot00000000000000# =========================================================================== # https://www.gnu.org/software/autoconf-archive/ax_check_compile_flag.html # =========================================================================== # # SYNOPSIS # # AX_CHECK_COMPILE_FLAG(FLAG, [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS], [INPUT]) # # DESCRIPTION # # Check whether the given FLAG works with the current language's compiler # or gives an error. (Warnings, however, are ignored) # # ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on # success/failure. # # If EXTRA-FLAGS is defined, it is added to the current language's default # flags (e.g. CFLAGS) when the check is done. The check is thus made with # the flags: "CFLAGS EXTRA-FLAGS FLAG". This can for example be used to # force the compiler to issue an error when a bad flag is given. # # INPUT gives an alternative input source to AC_COMPILE_IFELSE. # # NOTE: Implementation based on AX_CFLAGS_GCC_OPTION. Please keep this # macro in sync with AX_CHECK_{PREPROC,LINK}_FLAG. # # LICENSE # # Copyright (c) 2008 Guido U. Draheim # Copyright (c) 2011 Maarten Bosmans # # Copying and distribution of this file, with or without modification, are # permitted in any medium without royalty provided the copyright notice # and this notice are preserved. This file is offered as-is, without any # warranty. #serial 6 AC_DEFUN([AX_CHECK_COMPILE_FLAG], [AC_PREREQ(2.64)dnl for _AC_LANG_PREFIX and AS_VAR_IF AS_VAR_PUSHDEF([CACHEVAR],[ax_cv_check_[]_AC_LANG_ABBREV[]flags_$4_$1])dnl AC_CACHE_CHECK([whether _AC_LANG compiler accepts $1], CACHEVAR, [ ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $4 $1" AC_COMPILE_IFELSE([m4_default([$5],[AC_LANG_PROGRAM()])], [AS_VAR_SET(CACHEVAR,[yes])], [AS_VAR_SET(CACHEVAR,[no])]) _AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags]) AS_VAR_IF(CACHEVAR,yes, [m4_default([$2], :)], [m4_default([$3], :)]) AS_VAR_POPDEF([CACHEVAR])dnl ])dnl AX_CHECK_COMPILE_FLAGS dqlite-1.16.7/m4/ax_check_gnu_make.m4000066400000000000000000000077271465252713400173050ustar00rootroot00000000000000# =========================================================================== # https://www.gnu.org/software/autoconf-archive/ax_check_gnu_make.html # =========================================================================== # # SYNOPSIS # # AX_CHECK_GNU_MAKE([run-if-true],[run-if-false]) # # DESCRIPTION # # This macro searches for a GNU version of make. If a match is found: # # * The makefile variable `ifGNUmake' is set to the empty string, otherwise # it is set to "#". This is useful for including special features in a # Makefile, which cannot be handled by other versions of make. # * The makefile variable `ifnGNUmake' is set to #, otherwise # it is set to the empty string. This is useful for including special # features in a Makefile, which can be handled # by other versions of make, or to specify an else-like clause. # * The variable `_cv_gnu_make_command` is set to the command to invoke # GNU make if it exists, the empty string otherwise.
# * The variable `ax_cv_gnu_make_command` is set to the command to invoke # GNU make by copying `_cv_gnu_make_command`, otherwise it is unset. # * If GNU Make is found, its version is extracted from the output of # `make --version` as the last field of a record of space-separated # columns and saved into the variable `ax_check_gnu_make_version`. # * Additionally if GNU Make is found, run shell code run-if-true # else run shell code run-if-false. # # Here is an example of its use: # # Makefile.in might contain: # # # A failsafe way of putting a dependency rule into a makefile # $(DEPEND): # $(CC) -MM $(srcdir)/*.c > $(DEPEND) # # @ifGNUmake@ ifeq ($(DEPEND),$(wildcard $(DEPEND))) # @ifGNUmake@ include $(DEPEND) # @ifGNUmake@ else # fallback code # @ifGNUmake@ endif # # Then configure.in would normally contain: # # AX_CHECK_GNU_MAKE() # AC_OUTPUT(Makefile) # # Then perhaps to cause gnu make to override any other make, we could do # something like this (note that GNU make always looks for GNUmakefile # first): # # if ! test x$_cv_gnu_make_command = x ; then # mv Makefile GNUmakefile # echo .DEFAULT: > Makefile ; # echo \ $_cv_gnu_make_command \$@ >> Makefile; # fi # # Then, if any (well almost any) other make is called, and GNU make also # exists, then the other make wraps the GNU make. # # LICENSE # # Copyright (c) 2008 John Darrington # Copyright (c) 2015 Enrico M. Crisostomo # # Copying and distribution of this file, with or without modification, are # permitted in any medium without royalty provided the copyright notice # and this notice are preserved. This file is offered as-is, without any # warranty. #serial 12 AC_DEFUN([AX_CHECK_GNU_MAKE],dnl [AC_PROG_AWK AC_CACHE_CHECK([for GNU make],[_cv_gnu_make_command],[dnl _cv_gnu_make_command="" ; dnl Search all the common names for GNU make for a in "$MAKE" make gmake gnumake ; do if test -z "$a" ; then continue ; fi ; if "$a" --version 2> /dev/null | grep GNU 2>&1 > /dev/null ; then _cv_gnu_make_command=$a ; AX_CHECK_GNU_MAKE_HEADLINE=$("$a" --version 2> /dev/null | grep "GNU Make") ax_check_gnu_make_version=$(echo ${AX_CHECK_GNU_MAKE_HEADLINE} | ${AWK} -F " " '{ print $(NF); }') break ; fi done ;]) dnl If there was a GNU version, then set @ifGNUmake@ to the empty string, '#' otherwise AS_VAR_IF([_cv_gnu_make_command], [""], [AS_VAR_SET([ifGNUmake], ["#"])], [AS_VAR_SET([ifGNUmake], [""])]) AS_VAR_IF([_cv_gnu_make_command], [""], [AS_VAR_SET([ifnGNUmake], [""])], [AS_VAR_SET([ifnGNUmake], ["#"])]) AS_VAR_IF([_cv_gnu_make_command], [""], [AS_UNSET(ax_cv_gnu_make_command)], [AS_VAR_SET([ax_cv_gnu_make_command], [${_cv_gnu_make_command}])]) AS_VAR_IF([_cv_gnu_make_command], [""],[$2],[$1]) AC_SUBST([ifGNUmake]) AC_SUBST([ifnGNUmake]) ]) dqlite-1.16.7/m4/ax_code_coverage.m4000066400000000000000000000276161465252713400171540ustar00rootroot00000000000000# =========================================================================== # https://www.gnu.org/software/autoconf-archive/ax_code_coverage.html # =========================================================================== # # SYNOPSIS # # AX_CODE_COVERAGE() # # DESCRIPTION # # Defines CODE_COVERAGE_CPPFLAGS, CODE_COVERAGE_CFLAGS, # CODE_COVERAGE_CXXFLAGS and CODE_COVERAGE_LIBS which should be included # in the CPPFLAGS, CFLAGS CXXFLAGS and LIBS/LIBADD variables of every # build target (program or library) which should be built with code # coverage support. Also add rules using AX_ADD_AM_MACRO_STATIC; and # $enable_code_coverage which can be used in subsequent configure output. 
# CODE_COVERAGE_ENABLED is defined and substituted, and corresponds to the # value of the --enable-code-coverage option, which defaults to being # disabled. # # Test also for gcov program and create GCOV variable that could be # substituted. # # Note that all optimization flags in CFLAGS must be disabled when code # coverage is enabled. # # Usage example: # # configure.ac: # # AX_CODE_COVERAGE # # Makefile.am: # # include $(top_srcdir)/aminclude_static.am # # my_program_LIBS = ... $(CODE_COVERAGE_LIBS) ... # my_program_CPPFLAGS = ... $(CODE_COVERAGE_CPPFLAGS) ... # my_program_CFLAGS = ... $(CODE_COVERAGE_CFLAGS) ... # my_program_CXXFLAGS = ... $(CODE_COVERAGE_CXXFLAGS) ... # # clean-local: code-coverage-clean # distclean-local: code-coverage-dist-clean # # This results in a "check-code-coverage" rule being added to any # Makefile.am which do "include $(top_srcdir)/aminclude_static.am" # (assuming the module has been configured with --enable-code-coverage). # Running `make check-code-coverage` in that directory will run the # module's test suite (`make check`) and build a code coverage report # detailing the code which was touched, then print the URI for the report. # # This code was derived from Makefile.decl in GLib, originally licensed # under LGPLv2.1+. # # LICENSE # # Copyright (c) 2012, 2016 Philip Withnall # Copyright (c) 2012 Xan Lopez # Copyright (c) 2012 Christian Persch # Copyright (c) 2012 Paolo Borelli # Copyright (c) 2012 Dan Winship # Copyright (c) 2015,2018 Bastien ROUCARIES # # This library is free software; you can redistribute it and/or modify it # under the terms of the GNU Lesser General Public License as published by # the Free Software Foundation; either version 2.1 of the License, or (at # your option) any later version. # # This library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser # General Public License for more details. # # You should have received a copy of the GNU Lesser General Public License # along with this program. If not, see . #serial 34 m4_define(_AX_CODE_COVERAGE_RULES,[ AX_ADD_AM_MACRO_STATIC([ # Code coverage # # Optional: # - CODE_COVERAGE_DIRECTORY: Top-level directory for code coverage reporting. # Multiple directories may be specified, separated by whitespace. # (Default: \$(top_builddir)) # - CODE_COVERAGE_OUTPUT_FILE: Filename and path for the .info file generated # by lcov for code coverage. (Default: # \$(PACKAGE_NAME)-\$(PACKAGE_VERSION)-coverage.info) # - CODE_COVERAGE_OUTPUT_DIRECTORY: Directory for generated code coverage # reports to be created. (Default: # \$(PACKAGE_NAME)-\$(PACKAGE_VERSION)-coverage) # - CODE_COVERAGE_BRANCH_COVERAGE: Set to 1 to enforce branch coverage, # set to 0 to disable it and leave empty to stay with the default. # (Default: empty) # - CODE_COVERAGE_LCOV_SHOPTS_DEFAULT: Extra options shared between both lcov # instances. (Default: based on $CODE_COVERAGE_BRANCH_COVERAGE) # - CODE_COVERAGE_LCOV_SHOPTS: Extra options to shared between both lcov # instances. (Default: $CODE_COVERAGE_LCOV_SHOPTS_DEFAULT) # - CODE_COVERAGE_LCOV_OPTIONS_GCOVPATH: --gcov-tool pathtogcov # - CODE_COVERAGE_LCOV_OPTIONS_DEFAULT: Extra options to pass to the # collecting lcov instance. (Default: $CODE_COVERAGE_LCOV_OPTIONS_GCOVPATH) # - CODE_COVERAGE_LCOV_OPTIONS: Extra options to pass to the collecting lcov # instance. 
(Default: $CODE_COVERAGE_LCOV_OPTIONS_DEFAULT) # - CODE_COVERAGE_LCOV_RMOPTS_DEFAULT: Extra options to pass to the filtering # lcov instance. (Default: empty) # - CODE_COVERAGE_LCOV_RMOPTS: Extra options to pass to the filtering lcov # instance. (Default: $CODE_COVERAGE_LCOV_RMOPTS_DEFAULT) # - CODE_COVERAGE_GENHTML_OPTIONS_DEFAULT: Extra options to pass to the # genhtml instance. (Default: based on $CODE_COVERAGE_BRANCH_COVERAGE) # - CODE_COVERAGE_GENHTML_OPTIONS: Extra options to pass to the genhtml # instance. (Default: $CODE_COVERAGE_GENHTML_OPTIONS_DEFAULT) # - CODE_COVERAGE_IGNORE_PATTERN: Extra glob pattern of files to ignore # # The generated report will be titled using the \$(PACKAGE_NAME) and # \$(PACKAGE_VERSION). In order to add the current git hash to the title, # use the git-version-gen script, available online. # Optional variables # run only on top dir if CODE_COVERAGE_ENABLED ifeq (\$(abs_builddir), \$(abs_top_builddir)) CODE_COVERAGE_DIRECTORY ?= \$(top_builddir) CODE_COVERAGE_OUTPUT_FILE ?= \$(PACKAGE_NAME)-\$(PACKAGE_VERSION)-coverage.info CODE_COVERAGE_OUTPUT_DIRECTORY ?= \$(PACKAGE_NAME)-\$(PACKAGE_VERSION)-coverage CODE_COVERAGE_BRANCH_COVERAGE ?= CODE_COVERAGE_LCOV_SHOPTS_DEFAULT ?= \$(if \$(CODE_COVERAGE_BRANCH_COVERAGE),\ --rc lcov_branch_coverage=\$(CODE_COVERAGE_BRANCH_COVERAGE)) CODE_COVERAGE_LCOV_SHOPTS ?= \$(CODE_COVERAGE_LCOV_SHOPTS_DEFAULT) CODE_COVERAGE_LCOV_OPTIONS_GCOVPATH ?= --gcov-tool \"\$(GCOV)\" CODE_COVERAGE_LCOV_OPTIONS_DEFAULT ?= \$(CODE_COVERAGE_LCOV_OPTIONS_GCOVPATH) CODE_COVERAGE_LCOV_OPTIONS ?= \$(CODE_COVERAGE_LCOV_OPTIONS_DEFAULT) CODE_COVERAGE_LCOV_RMOPTS_DEFAULT ?= CODE_COVERAGE_LCOV_RMOPTS ?= \$(CODE_COVERAGE_LCOV_RMOPTS_DEFAULT) CODE_COVERAGE_GENHTML_OPTIONS_DEFAULT ?=\ \$(if \$(CODE_COVERAGE_BRANCH_COVERAGE),\ --rc genhtml_branch_coverage=\$(CODE_COVERAGE_BRANCH_COVERAGE)) CODE_COVERAGE_GENHTML_OPTIONS ?= \$(CODE_COVERAGE_GENHTML_OPTIONS_DEFAULT) CODE_COVERAGE_IGNORE_PATTERN ?= GITIGNOREFILES := \$(GITIGNOREFILES) \$(CODE_COVERAGE_OUTPUT_FILE) \$(CODE_COVERAGE_OUTPUT_DIRECTORY) code_coverage_v_lcov_cap = \$(code_coverage_v_lcov_cap_\$(V)) code_coverage_v_lcov_cap_ = \$(code_coverage_v_lcov_cap_\$(AM_DEFAULT_VERBOSITY)) code_coverage_v_lcov_cap_0 = @echo \" LCOV --capture\" \$(CODE_COVERAGE_OUTPUT_FILE); code_coverage_v_lcov_ign = \$(code_coverage_v_lcov_ign_\$(V)) code_coverage_v_lcov_ign_ = \$(code_coverage_v_lcov_ign_\$(AM_DEFAULT_VERBOSITY)) code_coverage_v_lcov_ign_0 = @echo \" LCOV --remove /tmp/*\" \$(CODE_COVERAGE_IGNORE_PATTERN); code_coverage_v_genhtml = \$(code_coverage_v_genhtml_\$(V)) code_coverage_v_genhtml_ = \$(code_coverage_v_genhtml_\$(AM_DEFAULT_VERBOSITY)) code_coverage_v_genhtml_0 = @echo \" GEN \" \"\$(CODE_COVERAGE_OUTPUT_DIRECTORY)\"; code_coverage_quiet = \$(code_coverage_quiet_\$(V)) code_coverage_quiet_ = \$(code_coverage_quiet_\$(AM_DEFAULT_VERBOSITY)) code_coverage_quiet_0 = --quiet # sanitizes the test-name: replaces with underscores: dashes and dots code_coverage_sanitize = \$(subst -,_,\$(subst .,_,\$(1))) # Use recursive makes in order to ignore errors during check check-code-coverage: -\$(AM_V_at)\$(MAKE) \$(AM_MAKEFLAGS) -k check \$(AM_V_at)\$(MAKE) \$(AM_MAKEFLAGS) code-coverage-capture # Capture code coverage data code-coverage-capture: code-coverage-capture-hook \$(code_coverage_v_lcov_cap)\$(LCOV) \$(code_coverage_quiet) \$(addprefix --directory ,\$(CODE_COVERAGE_DIRECTORY)) --capture --output-file \"\$(CODE_COVERAGE_OUTPUT_FILE).tmp\" --test-name \"\$(call 
code_coverage_sanitize,\$(PACKAGE_NAME)-\$(PACKAGE_VERSION))\" --no-checksum --compat-libtool \$(CODE_COVERAGE_LCOV_SHOPTS) \$(CODE_COVERAGE_LCOV_OPTIONS) \$(code_coverage_v_lcov_ign)\$(LCOV) \$(code_coverage_quiet) \$(addprefix --directory ,\$(CODE_COVERAGE_DIRECTORY)) --remove \"\$(CODE_COVERAGE_OUTPUT_FILE).tmp\" \"/tmp/*\" \$(CODE_COVERAGE_IGNORE_PATTERN) --output-file \"\$(CODE_COVERAGE_OUTPUT_FILE)\" \$(CODE_COVERAGE_LCOV_SHOPTS) \$(CODE_COVERAGE_LCOV_RMOPTS) -@rm -f \"\$(CODE_COVERAGE_OUTPUT_FILE).tmp\" \$(code_coverage_v_genhtml)LANG=C \$(GENHTML) \$(code_coverage_quiet) \$(addprefix --prefix ,\$(CODE_COVERAGE_DIRECTORY)) --output-directory \"\$(CODE_COVERAGE_OUTPUT_DIRECTORY)\" --title \"\$(PACKAGE_NAME)-\$(PACKAGE_VERSION) Code Coverage\" --legend --show-details \"\$(CODE_COVERAGE_OUTPUT_FILE)\" \$(CODE_COVERAGE_GENHTML_OPTIONS) @echo \"file://\$(abs_builddir)/\$(CODE_COVERAGE_OUTPUT_DIRECTORY)/index.html\" code-coverage-clean: -\$(LCOV) --directory \$(top_builddir) -z -rm -rf \"\$(CODE_COVERAGE_OUTPUT_FILE)\" \"\$(CODE_COVERAGE_OUTPUT_FILE).tmp\" \"\$(CODE_COVERAGE_OUTPUT_DIRECTORY)\" -find . \\( -name \"*.gcda\" -o -name \"*.gcno\" -o -name \"*.gcov\" \\) -delete code-coverage-dist-clean: A][M_DISTCHECK_CONFIGURE_FLAGS := \$(A][M_DISTCHECK_CONFIGURE_FLAGS) --disable-code-coverage else # ifneq (\$(abs_builddir), \$(abs_top_builddir)) check-code-coverage: code-coverage-capture: code-coverage-capture-hook code-coverage-clean: code-coverage-dist-clean: endif # ifeq (\$(abs_builddir), \$(abs_top_builddir)) else #! CODE_COVERAGE_ENABLED # Use recursive makes in order to ignore errors during check check-code-coverage: @echo \"Need to reconfigure with --enable-code-coverage\" # Capture code coverage data code-coverage-capture: code-coverage-capture-hook @echo \"Need to reconfigure with --enable-code-coverage\" code-coverage-clean: code-coverage-dist-clean: endif #CODE_COVERAGE_ENABLED # Hook rule executed before code-coverage-capture, overridable by the user code-coverage-capture-hook: .PHONY: check-code-coverage code-coverage-capture code-coverage-dist-clean code-coverage-clean code-coverage-capture-hook ]) ]) AC_DEFUN([_AX_CODE_COVERAGE_ENABLED],[ AX_CHECK_GNU_MAKE([],[AC_MSG_ERROR([not using GNU make that is needed for coverage])]) AC_REQUIRE([AX_ADD_AM_MACRO_STATIC]) # check for gcov AC_CHECK_TOOL([GCOV], [$_AX_CODE_COVERAGE_GCOV_PROG_WITH], [:]) AS_IF([test "X$GCOV" = "X:"], [AC_MSG_ERROR([gcov is needed to do coverage])]) AC_SUBST([GCOV]) dnl Check if gcc is being used AS_IF([ test "$GCC" = "no" ], [ AC_MSG_ERROR([not compiling with gcc, which is required for gcov code coverage]) ]) AC_CHECK_PROG([LCOV], [lcov], [lcov]) AC_CHECK_PROG([GENHTML], [genhtml], [genhtml]) AS_IF([ test x"$LCOV" = x ], [ AC_MSG_ERROR([To enable code coverage reporting you must have lcov installed]) ]) AS_IF([ test x"$GENHTML" = x ], [ AC_MSG_ERROR([Could not find genhtml from the lcov package]) ]) dnl Build the code coverage flags dnl Define CODE_COVERAGE_LDFLAGS for backwards compatibility CODE_COVERAGE_CPPFLAGS="-DNDEBUG" CODE_COVERAGE_CFLAGS="-O0 -g -fprofile-arcs -ftest-coverage" CODE_COVERAGE_CXXFLAGS="-O0 -g -fprofile-arcs -ftest-coverage" CODE_COVERAGE_LIBS="-lgcov" AC_SUBST([CODE_COVERAGE_CPPFLAGS]) AC_SUBST([CODE_COVERAGE_CFLAGS]) AC_SUBST([CODE_COVERAGE_CXXFLAGS]) AC_SUBST([CODE_COVERAGE_LIBS]) ]) AC_DEFUN([AX_CODE_COVERAGE],[ dnl Check for --enable-code-coverage # allow to override gcov location AC_ARG_WITH([gcov], [AS_HELP_STRING([--with-gcov[=GCOV]], [use given GCOV for coverage 
(GCOV=gcov).])], [_AX_CODE_COVERAGE_GCOV_PROG_WITH=$with_gcov], [_AX_CODE_COVERAGE_GCOV_PROG_WITH=gcov]) AC_MSG_CHECKING([whether to build with code coverage support]) AC_ARG_ENABLE([code-coverage], AS_HELP_STRING([--enable-code-coverage], [Whether to enable code coverage support]),, enable_code_coverage=no) AM_CONDITIONAL([CODE_COVERAGE_ENABLED], [test "x$enable_code_coverage" = xyes]) AC_SUBST([CODE_COVERAGE_ENABLED], [$enable_code_coverage]) AC_MSG_RESULT($enable_code_coverage) AS_IF([ test "x$enable_code_coverage" = xyes ], [ _AX_CODE_COVERAGE_ENABLED ]) _AX_CODE_COVERAGE_RULES ]) dqlite-1.16.7/m4/ax_compare_version.m4000066400000000000000000000146531465252713400175570ustar00rootroot00000000000000# =========================================================================== # https://www.gnu.org/software/autoconf-archive/ax_compare_version.html # =========================================================================== # # SYNOPSIS # # AX_COMPARE_VERSION(VERSION_A, OP, VERSION_B, [ACTION-IF-TRUE], [ACTION-IF-FALSE]) # # DESCRIPTION # # This macro compares two version strings. Due to the various number of # minor-version numbers that can exist, and the fact that string # comparisons are not compatible with numeric comparisons, this is not # necessarily trivial to do in a autoconf script. This macro makes doing # these comparisons easy. # # The six basic comparisons are available, as well as checking equality # limited to a certain number of minor-version levels. # # The operator OP determines what type of comparison to do, and can be one # of: # # eq - equal (test A == B) # ne - not equal (test A != B) # le - less than or equal (test A <= B) # ge - greater than or equal (test A >= B) # lt - less than (test A < B) # gt - greater than (test A > B) # # Additionally, the eq and ne operator can have a number after it to limit # the test to that number of minor versions. # # eq0 - equal up to the length of the shorter version # ne0 - not equal up to the length of the shorter version # eqN - equal up to N sub-version levels # neN - not equal up to N sub-version levels # # When the condition is true, shell commands ACTION-IF-TRUE are run, # otherwise shell commands ACTION-IF-FALSE are run. The environment # variable 'ax_compare_version' is always set to either 'true' or 'false' # as well. # # Examples: # # AX_COMPARE_VERSION([3.15.7],[lt],[3.15.8]) # AX_COMPARE_VERSION([3.15],[lt],[3.15.8]) # # would both be true. # # AX_COMPARE_VERSION([3.15.7],[eq],[3.15.8]) # AX_COMPARE_VERSION([3.15],[gt],[3.15.8]) # # would both be false. # # AX_COMPARE_VERSION([3.15.7],[eq2],[3.15.8]) # # would be true because it is only comparing two minor versions. # # AX_COMPARE_VERSION([3.15.7],[eq0],[3.15]) # # would be true because it is only comparing the lesser number of minor # versions of the two values. # # Note: The characters that separate the version numbers do not matter. An # empty string is the same as version 0. OP is evaluated by autoconf, not # configure, so must be a string, not a variable. # # The author would like to acknowledge Guido Draheim whose advice about # the m4_case and m4_ifvaln functions make this macro only include the # portions necessary to perform the specific comparison specified by the # OP argument in the final configure script. # # LICENSE # # Copyright (c) 2008 Tim Toolan # # Copying and distribution of this file, with or without modification, are # permitted in any medium without royalty provided the copyright notice # and this notice are preserved. 
This file is offered as-is, without any # warranty. #serial 13 dnl ######################################################################### AC_DEFUN([AX_COMPARE_VERSION], [ AC_REQUIRE([AC_PROG_AWK]) # Used to indicate true or false condition ax_compare_version=false # Convert the two version strings to be compared into a format that # allows a simple string comparison. The end result is that a version # string of the form 1.12.5-r617 will be converted to the form # 0001001200050617. In other words, each number is zero padded to four # digits, and non digits are removed. AS_VAR_PUSHDEF([A],[ax_compare_version_A]) A=`echo "$1" | sed -e 's/\([[0-9]]*\)/Z\1Z/g' \ -e 's/Z\([[0-9]]\)Z/Z0\1Z/g' \ -e 's/Z\([[0-9]][[0-9]]\)Z/Z0\1Z/g' \ -e 's/Z\([[0-9]][[0-9]][[0-9]]\)Z/Z0\1Z/g' \ -e 's/[[^0-9]]//g'` AS_VAR_PUSHDEF([B],[ax_compare_version_B]) B=`echo "$3" | sed -e 's/\([[0-9]]*\)/Z\1Z/g' \ -e 's/Z\([[0-9]]\)Z/Z0\1Z/g' \ -e 's/Z\([[0-9]][[0-9]]\)Z/Z0\1Z/g' \ -e 's/Z\([[0-9]][[0-9]][[0-9]]\)Z/Z0\1Z/g' \ -e 's/[[^0-9]]//g'` dnl # In the case of le, ge, lt, and gt, the strings are sorted as necessary dnl # then the first line is used to determine if the condition is true. dnl # The sed right after the echo is to remove any indented white space. m4_case(m4_tolower($2), [lt],[ ax_compare_version=`echo "x$A x$B" | sed 's/^ *//' | sort -r | sed "s/x${A}/false/;s/x${B}/true/;1q"` ], [gt],[ ax_compare_version=`echo "x$A x$B" | sed 's/^ *//' | sort | sed "s/x${A}/false/;s/x${B}/true/;1q"` ], [le],[ ax_compare_version=`echo "x$A x$B" | sed 's/^ *//' | sort | sed "s/x${A}/true/;s/x${B}/false/;1q"` ], [ge],[ ax_compare_version=`echo "x$A x$B" | sed 's/^ *//' | sort -r | sed "s/x${A}/true/;s/x${B}/false/;1q"` ],[ dnl Split the operator from the subversion count if present. m4_bmatch(m4_substr($2,2), [0],[ # A count of zero means use the length of the shorter version. # Determine the number of characters in A and B. ax_compare_version_len_A=`echo "$A" | $AWK '{print(length)}'` ax_compare_version_len_B=`echo "$B" | $AWK '{print(length)}'` # Set A to no more than B's length and B to no more than A's length. A=`echo "$A" | sed "s/\(.\{$ax_compare_version_len_B\}\).*/\1/"` B=`echo "$B" | sed "s/\(.\{$ax_compare_version_len_A\}\).*/\1/"` ], [[0-9]+],[ # A count greater than zero means use only that many subversions A=`echo "$A" | sed "s/\(\([[0-9]]\{4\}\)\{m4_substr($2,2)\}\).*/\1/"` B=`echo "$B" | sed "s/\(\([[0-9]]\{4\}\)\{m4_substr($2,2)\}\).*/\1/"` ], [.+],[ AC_WARNING( [invalid OP numeric parameter: $2]) ],[]) # Pad zeros at end of numbers to make same length. ax_compare_version_tmp_A="$A`echo $B | sed 's/./0/g'`" B="$B`echo $A | sed 's/./0/g'`" A="$ax_compare_version_tmp_A" # Check for equality or inequality as necessary. m4_case(m4_tolower(m4_substr($2,0,2)), [eq],[ test "x$A" = "x$B" && ax_compare_version=true ], [ne],[ test "x$A" != "x$B" && ax_compare_version=true ],[ AC_WARNING([invalid OP parameter: $2]) ]) ]) AS_VAR_POPDEF([A])dnl AS_VAR_POPDEF([B])dnl dnl # Execute ACTION-IF-TRUE / ACTION-IF-FALSE. 
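dnl # A purely illustrative (non-upstream) example of supplying the two
dnl # action arguments from configure.ac, using hypothetical names:
dnl #   AX_COMPARE_VERSION([$sqlite_version],[ge],[3.24.0],
dnl #                      [have_upsert=yes],[have_upsert=no])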
if test "$ax_compare_version" = "true" ; then m4_ifvaln([$4],[$4],[:])dnl m4_ifvaln([$5],[else $5])dnl fi ]) dnl AX_COMPARE_VERSION dqlite-1.16.7/m4/ax_file_escapes.m4000066400000000000000000000013731465252713400170010ustar00rootroot00000000000000# =========================================================================== # https://www.gnu.org/software/autoconf-archive/ax_file_escapes.html # =========================================================================== # # SYNOPSIS # # AX_FILE_ESCAPES # # DESCRIPTION # # Writes the specified data to the specified file. # # LICENSE # # Copyright (c) 2008 Tom Howard # # Copying and distribution of this file, with or without modification, are # permitted in any medium without royalty provided the copyright notice # and this notice are preserved. This file is offered as-is, without any # warranty. #serial 8 AC_DEFUN([AX_FILE_ESCAPES],[ AX_DOLLAR="\$" AX_SRB="\\135" AX_SLB="\\133" AX_BS="\\\\" AX_DQ="\"" ]) dqlite-1.16.7/m4/ax_pthread.m4000066400000000000000000000540341465252713400160100ustar00rootroot00000000000000# =========================================================================== # https://www.gnu.org/software/autoconf-archive/ax_pthread.html # =========================================================================== # # SYNOPSIS # # AX_PTHREAD([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]]) # # DESCRIPTION # # This macro figures out how to build C programs using POSIX threads. It # sets the PTHREAD_LIBS output variable to the threads library and linker # flags, and the PTHREAD_CFLAGS output variable to any special C compiler # flags that are needed. (The user can also force certain compiler # flags/libs to be tested by setting these environment variables.) # # Also sets PTHREAD_CC and PTHREAD_CXX to any special C compiler that is # needed for multi-threaded programs (defaults to the value of CC # respectively CXX otherwise). (This is necessary on e.g. AIX to use the # special cc_r/CC_r compiler alias.) # # NOTE: You are assumed to not only compile your program with these flags, # but also to link with them as well. For example, you might link with # $PTHREAD_CC $CFLAGS $PTHREAD_CFLAGS $LDFLAGS ... $PTHREAD_LIBS $LIBS # $PTHREAD_CXX $CXXFLAGS $PTHREAD_CFLAGS $LDFLAGS ... $PTHREAD_LIBS $LIBS # # If you are only building threaded programs, you may wish to use these # variables in your default LIBS, CFLAGS, and CC: # # LIBS="$PTHREAD_LIBS $LIBS" # CFLAGS="$CFLAGS $PTHREAD_CFLAGS" # CXXFLAGS="$CXXFLAGS $PTHREAD_CFLAGS" # CC="$PTHREAD_CC" # CXX="$PTHREAD_CXX" # # In addition, if the PTHREAD_CREATE_JOINABLE thread-attribute constant # has a nonstandard name, this macro defines PTHREAD_CREATE_JOINABLE to # that name (e.g. PTHREAD_CREATE_UNDETACHED on AIX). # # Also HAVE_PTHREAD_PRIO_INHERIT is defined if pthread is found and the # PTHREAD_PRIO_INHERIT symbol is defined when compiling with # PTHREAD_CFLAGS. # # ACTION-IF-FOUND is a list of shell commands to run if a threads library # is found, and ACTION-IF-NOT-FOUND is a list of commands to run it if it # is not found. If ACTION-IF-FOUND is not specified, the default action # will define HAVE_PTHREAD. # # Please let the authors know if this macro fails on any platform, or if # you have any other suggestions or comments. This macro was based on work # by SGJ on autoconf scripts for FFTW (http://www.fftw.org/) (with help # from M. Frigo), as well as ac_pthread and hb_pthread macros posted by # Alejandro Forero Cuervo to the autoconf macro repository. 
We are also # grateful for the helpful feedback of numerous users. # # Updated for Autoconf 2.68 by Daniel Richard G. # # LICENSE # # Copyright (c) 2008 Steven G. Johnson # Copyright (c) 2011 Daniel Richard G. # Copyright (c) 2019 Marc Stevens # # This program is free software: you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General # Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program. If not, see <https://www.gnu.org/licenses/>. # # As a special exception, the respective Autoconf Macro's copyright owner # gives unlimited permission to copy, distribute and modify the configure # scripts that are the output of Autoconf when processing the Macro. You # need not follow the terms of the GNU General Public License when using # or distributing such scripts, even though portions of the text of the # Macro appear in them. The GNU General Public License (GPL) does govern # all other use of the material that constitutes the Autoconf Macro. # # This special exception to the GPL applies to versions of the Autoconf # Macro released by the Autoconf Archive. When you make and distribute a # modified version of the Autoconf Macro, you may extend this special # exception to the GPL to apply to your modified version as well. #serial 31 AU_ALIAS([ACX_PTHREAD], [AX_PTHREAD]) AC_DEFUN([AX_PTHREAD], [ AC_REQUIRE([AC_CANONICAL_HOST]) AC_REQUIRE([AC_PROG_CC]) AC_REQUIRE([AC_PROG_SED]) AC_LANG_PUSH([C]) ax_pthread_ok=no # We used to check for pthread.h first, but this fails if pthread.h # requires special compiler flags (e.g. on Tru64 or Sequent). # It gets checked for in the link test anyway. # First of all, check if the user has set any of the PTHREAD_LIBS, # etcetera environment variables, and if threads linking works using # them: if test "x$PTHREAD_CFLAGS$PTHREAD_LIBS" != "x"; then ax_pthread_save_CC="$CC" ax_pthread_save_CFLAGS="$CFLAGS" ax_pthread_save_LIBS="$LIBS" AS_IF([test "x$PTHREAD_CC" != "x"], [CC="$PTHREAD_CC"]) AS_IF([test "x$PTHREAD_CXX" != "x"], [CXX="$PTHREAD_CXX"]) CFLAGS="$CFLAGS $PTHREAD_CFLAGS" LIBS="$PTHREAD_LIBS $LIBS" AC_MSG_CHECKING([for pthread_join using $CC $PTHREAD_CFLAGS $PTHREAD_LIBS]) AC_LINK_IFELSE([AC_LANG_CALL([], [pthread_join])], [ax_pthread_ok=yes]) AC_MSG_RESULT([$ax_pthread_ok]) if test "x$ax_pthread_ok" = "xno"; then PTHREAD_LIBS="" PTHREAD_CFLAGS="" fi CC="$ax_pthread_save_CC" CFLAGS="$ax_pthread_save_CFLAGS" LIBS="$ax_pthread_save_LIBS" fi # We must check for the threads library under a number of different # names; the ordering is very important because some systems # (e.g. DEC) have both -lpthread and -lpthreads, where one of the # libraries is broken (non-POSIX). # Create a list of thread flags to try. Items with a "," contain both # C compiler flags (before ",") and linker flags (after ","). Other items # starting with a "-" are C compiler flags, and remaining items are # library names, except for "none" which indicates that we try without # any flags at all, and "pthread-config" which is a program returning # the flags for the Pth emulation library.
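# (Each candidate in the list assembled below is tried in turn by the link
# test further down, until a test program calling pthread_join links.)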
ax_pthread_flags="pthreads none -Kthread -pthread -pthreads -mthreads pthread --thread-safe -mt pthread-config" # The ordering *is* (sometimes) important. Some notes on the # individual items follow: # pthreads: AIX (must check this before -lpthread) # none: in case threads are in libc; should be tried before -Kthread and # other compiler flags to prevent continual compiler warnings # -Kthread: Sequent (threads in libc, but -Kthread needed for pthread.h) # -pthread: Linux/gcc (kernel threads), BSD/gcc (userland threads), Tru64 # (Note: HP C rejects this with "bad form for `-t' option") # -pthreads: Solaris/gcc (Note: HP C also rejects) # -mt: Sun Workshop C (may only link SunOS threads [-lthread], but it # doesn't hurt to check since this sometimes defines pthreads and # -D_REENTRANT too), HP C (must be checked before -lpthread, which # is present but should not be used directly; and before -mthreads, # because the compiler interprets this as "-mt" + "-hreads") # -mthreads: Mingw32/gcc, Lynx/gcc # pthread: Linux, etcetera # --thread-safe: KAI C++ # pthread-config: use pthread-config program (for GNU Pth library) case $host_os in freebsd*) # -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able) # lthread: LinuxThreads port on FreeBSD (also preferred to -pthread) ax_pthread_flags="-kthread lthread $ax_pthread_flags" ;; hpux*) # From the cc(1) man page: "[-mt] Sets various -D flags to enable # multi-threading and also sets -lpthread." ax_pthread_flags="-mt -pthread pthread $ax_pthread_flags" ;; openedition*) # IBM z/OS requires a feature-test macro to be defined in order to # enable POSIX threads at all, so give the user a hint if this is # not set. (We don't define these ourselves, as they can affect # other portions of the system API in unpredictable ways.) AC_EGREP_CPP([AX_PTHREAD_ZOS_MISSING], [ # if !defined(_OPEN_THREADS) && !defined(_UNIX03_THREADS) AX_PTHREAD_ZOS_MISSING # endif ], [AC_MSG_WARN([IBM z/OS requires -D_OPEN_THREADS or -D_UNIX03_THREADS to enable pthreads support.])]) ;; solaris*) # On Solaris (at least, for some versions), libc contains stubbed # (non-functional) versions of the pthreads routines, so link-based # tests will erroneously succeed. (N.B.: The stubs are missing # pthread_cleanup_push, or rather a function called by this macro, # so we could check for that, but who knows whether they'll stub # that too in a future libc.) So we'll check first for the # standard Solaris way of linking pthreads (-mt -lpthread). ax_pthread_flags="-mt,-lpthread pthread $ax_pthread_flags" ;; esac # Are we compiling with Clang? AC_CACHE_CHECK([whether $CC is Clang], [ax_cv_PTHREAD_CLANG], [ax_cv_PTHREAD_CLANG=no # Note that Autoconf sets GCC=yes for Clang as well as GCC if test "x$GCC" = "xyes"; then AC_EGREP_CPP([AX_PTHREAD_CC_IS_CLANG], [/* Note: Clang 2.7 lacks __clang_[a-z]+__ */ # if defined(__clang__) && defined(__llvm__) AX_PTHREAD_CC_IS_CLANG # endif ], [ax_cv_PTHREAD_CLANG=yes]) fi ]) ax_pthread_clang="$ax_cv_PTHREAD_CLANG" # GCC generally uses -pthread, or -pthreads on some platforms (e.g. SPARC) # Note that for GCC and Clang -pthread generally implies -lpthread, # except when -nostdlib is passed. 
# This is problematic using libtool to build C++ shared libraries with pthread: # [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=25460 # [2] https://bugzilla.redhat.com/show_bug.cgi?id=661333 # [3] https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=468555 # To solve this, first try -pthread together with -lpthread for GCC AS_IF([test "x$GCC" = "xyes"], [ax_pthread_flags="-pthread,-lpthread -pthread -pthreads $ax_pthread_flags"]) # Clang takes -pthread (never supported any other flag), but we'll try with -lpthread first AS_IF([test "x$ax_pthread_clang" = "xyes"], [ax_pthread_flags="-pthread,-lpthread -pthread"]) # The presence of a feature test macro requesting re-entrant function # definitions is, on some systems, a strong hint that pthreads support is # correctly enabled case $host_os in darwin* | hpux* | linux* | osf* | solaris*) ax_pthread_check_macro="_REENTRANT" ;; aix*) ax_pthread_check_macro="_THREAD_SAFE" ;; *) ax_pthread_check_macro="--" ;; esac AS_IF([test "x$ax_pthread_check_macro" = "x--"], [ax_pthread_check_cond=0], [ax_pthread_check_cond="!defined($ax_pthread_check_macro)"]) if test "x$ax_pthread_ok" = "xno"; then for ax_pthread_try_flag in $ax_pthread_flags; do case $ax_pthread_try_flag in none) AC_MSG_CHECKING([whether pthreads work without any flags]) ;; *,*) PTHREAD_CFLAGS=`echo $ax_pthread_try_flag | sed "s/^\(.*\),\(.*\)$/\1/"` PTHREAD_LIBS=`echo $ax_pthread_try_flag | sed "s/^\(.*\),\(.*\)$/\2/"` AC_MSG_CHECKING([whether pthreads work with "$PTHREAD_CFLAGS" and "$PTHREAD_LIBS"]) ;; -*) AC_MSG_CHECKING([whether pthreads work with $ax_pthread_try_flag]) PTHREAD_CFLAGS="$ax_pthread_try_flag" ;; pthread-config) AC_CHECK_PROG([ax_pthread_config], [pthread-config], [yes], [no]) AS_IF([test "x$ax_pthread_config" = "xno"], [continue]) PTHREAD_CFLAGS="`pthread-config --cflags`" PTHREAD_LIBS="`pthread-config --ldflags` `pthread-config --libs`" ;; *) AC_MSG_CHECKING([for the pthreads library -l$ax_pthread_try_flag]) PTHREAD_LIBS="-l$ax_pthread_try_flag" ;; esac ax_pthread_save_CFLAGS="$CFLAGS" ax_pthread_save_LIBS="$LIBS" CFLAGS="$CFLAGS $PTHREAD_CFLAGS" LIBS="$PTHREAD_LIBS $LIBS" # Check for various functions. We must include pthread.h, # since some functions may be macros. (On the Sequent, we # need a special flag -Kthread to make this header compile.) # We check for pthread_join because it is in -lpthread on IRIX # while pthread_create is in libc. We check for pthread_attr_init # due to DEC craziness with -lpthreads. We check for # pthread_cleanup_push because it is one of the few pthread # functions on Solaris that doesn't have a non-functional libc stub. # We try pthread_create on general principles. AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <pthread.h> # if $ax_pthread_check_cond # error "$ax_pthread_check_macro must be defined" # endif static void *some_global = NULL; static void routine(void *a) { /* To avoid any unused-parameter or unused-but-set-parameter warning. */ some_global = a; } static void *start_routine(void *a) { return a; }], [pthread_t th; pthread_attr_t attr; pthread_create(&th, 0, start_routine, 0); pthread_join(th, 0); pthread_attr_init(&attr); pthread_cleanup_push(routine, 0); pthread_cleanup_pop(0) /* ; */])], [ax_pthread_ok=yes], []) CFLAGS="$ax_pthread_save_CFLAGS" LIBS="$ax_pthread_save_LIBS" AC_MSG_RESULT([$ax_pthread_ok]) AS_IF([test "x$ax_pthread_ok" = "xyes"], [break]) PTHREAD_LIBS="" PTHREAD_CFLAGS="" done fi # Clang needs special handling, because older versions handle the -pthread # option in a rather...
idiosyncratic way if test "x$ax_pthread_clang" = "xyes"; then # Clang takes -pthread; it has never supported any other flag # (Note 1: This will need to be revisited if a system that Clang # supports has POSIX threads in a separate library. This tends not # to be the way of modern systems, but it's conceivable.) # (Note 2: On some systems, notably Darwin, -pthread is not needed # to get POSIX threads support; the API is always present and # active. We could reasonably leave PTHREAD_CFLAGS empty. But # -pthread does define _REENTRANT, and while the Darwin headers # ignore this macro, third-party headers might not.) # However, older versions of Clang make a point of warning the user # that, in an invocation where only linking and no compilation is # taking place, the -pthread option has no effect ("argument unused # during compilation"). They expect -pthread to be passed in only # when source code is being compiled. # # Problem is, this is at odds with the way Automake and most other # C build frameworks function, which is that the same flags used in # compilation (CFLAGS) are also used in linking. Many systems # supported by AX_PTHREAD require exactly this for POSIX threads # support, and in fact it is often not straightforward to specify a # flag that is used only in the compilation phase and not in # linking. Such a scenario is extremely rare in practice. # # Even though use of the -pthread flag in linking would only print # a warning, this can be a nuisance for well-run software projects # that build with -Werror. So if the active version of Clang has # this misfeature, we search for an option to squash it. AC_CACHE_CHECK([whether Clang needs flag to prevent "argument unused" warning when linking with -pthread], [ax_cv_PTHREAD_CLANG_NO_WARN_FLAG], [ax_cv_PTHREAD_CLANG_NO_WARN_FLAG=unknown # Create an alternate version of $ac_link that compiles and # links in two steps (.c -> .o, .o -> exe) instead of one # (.c -> exe), because the warning occurs only in the second # step ax_pthread_save_ac_link="$ac_link" ax_pthread_sed='s/conftest\.\$ac_ext/conftest.$ac_objext/g' ax_pthread_link_step=`AS_ECHO(["$ac_link"]) | sed "$ax_pthread_sed"` ax_pthread_2step_ac_link="($ac_compile) && (echo ==== >&5) && ($ax_pthread_link_step)" ax_pthread_save_CFLAGS="$CFLAGS" for ax_pthread_try in '' -Qunused-arguments -Wno-unused-command-line-argument unknown; do AS_IF([test "x$ax_pthread_try" = "xunknown"], [break]) CFLAGS="-Werror -Wunknown-warning-option $ax_pthread_try -pthread $ax_pthread_save_CFLAGS" ac_link="$ax_pthread_save_ac_link" AC_LINK_IFELSE([AC_LANG_SOURCE([[int main(void){return 0;}]])], [ac_link="$ax_pthread_2step_ac_link" AC_LINK_IFELSE([AC_LANG_SOURCE([[int main(void){return 0;}]])], [break]) ]) done ac_link="$ax_pthread_save_ac_link" CFLAGS="$ax_pthread_save_CFLAGS" AS_IF([test "x$ax_pthread_try" = "x"], [ax_pthread_try=no]) ax_cv_PTHREAD_CLANG_NO_WARN_FLAG="$ax_pthread_try" ]) case "$ax_cv_PTHREAD_CLANG_NO_WARN_FLAG" in no | unknown) ;; *) PTHREAD_CFLAGS="$ax_cv_PTHREAD_CLANG_NO_WARN_FLAG $PTHREAD_CFLAGS" ;; esac fi # $ax_pthread_clang = yes # Various other checks: if test "x$ax_pthread_ok" = "xyes"; then ax_pthread_save_CFLAGS="$CFLAGS" ax_pthread_save_LIBS="$LIBS" CFLAGS="$CFLAGS $PTHREAD_CFLAGS" LIBS="$PTHREAD_LIBS $LIBS" # Detect AIX lossage: JOINABLE attribute is called UNDETACHED. 
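# (The AC_CACHE_CHECK results below are stored in ax_cv_PTHREAD_* cache
# variables, so a rerun of configure can skip the link tests.)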
AC_CACHE_CHECK([for joinable pthread attribute], [ax_cv_PTHREAD_JOINABLE_ATTR], [ax_cv_PTHREAD_JOINABLE_ATTR=unknown for ax_pthread_attr in PTHREAD_CREATE_JOINABLE PTHREAD_CREATE_UNDETACHED; do AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <pthread.h>], [int attr = $ax_pthread_attr; return attr /* ; */])], [ax_cv_PTHREAD_JOINABLE_ATTR=$ax_pthread_attr; break], []) done ]) AS_IF([test "x$ax_cv_PTHREAD_JOINABLE_ATTR" != "xunknown" && \ test "x$ax_cv_PTHREAD_JOINABLE_ATTR" != "xPTHREAD_CREATE_JOINABLE" && \ test "x$ax_pthread_joinable_attr_defined" != "xyes"], [AC_DEFINE_UNQUOTED([PTHREAD_CREATE_JOINABLE], [$ax_cv_PTHREAD_JOINABLE_ATTR], [Define to necessary symbol if this constant uses a non-standard name on your system.]) ax_pthread_joinable_attr_defined=yes ]) AC_CACHE_CHECK([whether more special flags are required for pthreads], [ax_cv_PTHREAD_SPECIAL_FLAGS], [ax_cv_PTHREAD_SPECIAL_FLAGS=no case $host_os in solaris*) ax_cv_PTHREAD_SPECIAL_FLAGS="-D_POSIX_PTHREAD_SEMANTICS" ;; esac ]) AS_IF([test "x$ax_cv_PTHREAD_SPECIAL_FLAGS" != "xno" && \ test "x$ax_pthread_special_flags_added" != "xyes"], [PTHREAD_CFLAGS="$ax_cv_PTHREAD_SPECIAL_FLAGS $PTHREAD_CFLAGS" ax_pthread_special_flags_added=yes]) AC_CACHE_CHECK([for PTHREAD_PRIO_INHERIT], [ax_cv_PTHREAD_PRIO_INHERIT], [AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <pthread.h>]], [[int i = PTHREAD_PRIO_INHERIT; return i;]])], [ax_cv_PTHREAD_PRIO_INHERIT=yes], [ax_cv_PTHREAD_PRIO_INHERIT=no]) ]) AS_IF([test "x$ax_cv_PTHREAD_PRIO_INHERIT" = "xyes" && \ test "x$ax_pthread_prio_inherit_defined" != "xyes"], [AC_DEFINE([HAVE_PTHREAD_PRIO_INHERIT], [1], [Have PTHREAD_PRIO_INHERIT.]) ax_pthread_prio_inherit_defined=yes ]) CFLAGS="$ax_pthread_save_CFLAGS" LIBS="$ax_pthread_save_LIBS" # More AIX lossage: compile with *_r variant if test "x$GCC" != "xyes"; then case $host_os in aix*) AS_CASE(["x/$CC"], [x*/c89|x*/c89_128|x*/c99|x*/c99_128|x*/cc|x*/cc128|x*/xlc|x*/xlc_v6|x*/xlc128|x*/xlc128_v6], [#handle absolute path differently from PATH based program lookup AS_CASE(["x$CC"], [x/*], [ AS_IF([AS_EXECUTABLE_P([${CC}_r])],[PTHREAD_CC="${CC}_r"]) AS_IF([test "x${CXX}" != "x"], [AS_IF([AS_EXECUTABLE_P([${CXX}_r])],[PTHREAD_CXX="${CXX}_r"])]) ], [ AC_CHECK_PROGS([PTHREAD_CC],[${CC}_r],[$CC]) AS_IF([test "x${CXX}" != "x"], [AC_CHECK_PROGS([PTHREAD_CXX],[${CXX}_r],[$CXX])]) ] ) ]) ;; esac fi fi test -n "$PTHREAD_CC" || PTHREAD_CC="$CC" test -n "$PTHREAD_CXX" || PTHREAD_CXX="$CXX" AC_SUBST([PTHREAD_LIBS]) AC_SUBST([PTHREAD_CFLAGS]) AC_SUBST([PTHREAD_CC]) AC_SUBST([PTHREAD_CXX]) # Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND: if test "x$ax_pthread_ok" = "xyes"; then ifelse([$1],,[AC_DEFINE([HAVE_PTHREAD],[1],[Define if you have POSIX threads libraries and header files.])],[$1]) : else ax_pthread_ok=no $2 fi AC_LANG_POP ])dnl AX_PTHREAD dqlite-1.16.7/m4/pkg.m4000066400000000000000000000240111465252713400144420ustar00rootroot00000000000000dnl pkg.m4 - Macros to locate and utilise pkg-config. -*- Autoconf -*- dnl serial 11 (pkg-config-0.29.1) dnl dnl Copyright © 2004 Scott James Remnant . dnl Copyright © 2012-2015 Dan Nicholson dnl dnl This program is free software; you can redistribute it and/or modify dnl it under the terms of the GNU General Public License as published by dnl the Free Software Foundation; either version 2 of the License, or dnl (at your option) any later version. dnl dnl This program is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
See the GNU dnl General Public License for more details. dnl dnl You should have received a copy of the GNU General Public License dnl along with this program; if not, write to the Free Software dnl Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA dnl 02111-1307, USA. dnl dnl As a special exception to the GNU General Public License, if you dnl distribute this file as part of a program that contains a dnl configuration script generated by Autoconf, you may include it under dnl the same distribution terms that you use for the rest of that dnl program. dnl PKG_PREREQ(MIN-VERSION) dnl ----------------------- dnl Since: 0.29 dnl dnl Verify that the version of the pkg-config macros are at least dnl MIN-VERSION. Unlike PKG_PROG_PKG_CONFIG, which checks the user's dnl installed version of pkg-config, this checks the developer's version dnl of pkg.m4 when generating configure. dnl dnl To ensure that this macro is defined, also add: dnl m4_ifndef([PKG_PREREQ], dnl [m4_fatal([must install pkg-config 0.29 or later before running autoconf/autogen])]) dnl dnl See the "Since" comment for each macro you use to see what version dnl of the macros you require. m4_defun([PKG_PREREQ], [m4_define([PKG_MACROS_VERSION], [0.29.1]) m4_if(m4_version_compare(PKG_MACROS_VERSION, [$1]), -1, [m4_fatal([pkg.m4 version $1 or higher is required but ]PKG_MACROS_VERSION[ found])]) ])dnl PKG_PREREQ dnl PKG_PROG_PKG_CONFIG([MIN-VERSION]) dnl ---------------------------------- dnl Since: 0.16 dnl dnl Search for the pkg-config tool and set the PKG_CONFIG variable to dnl first found in the path. Checks that the version of pkg-config found dnl is at least MIN-VERSION. If MIN-VERSION is not specified, 0.9.0 is dnl used since that's the first version where most current features of dnl pkg-config existed. AC_DEFUN([PKG_PROG_PKG_CONFIG], [m4_pattern_forbid([^_?PKG_[A-Z_]+$]) m4_pattern_allow([^PKG_CONFIG(_(PATH|LIBDIR|SYSROOT_DIR|ALLOW_SYSTEM_(CFLAGS|LIBS)))?$]) m4_pattern_allow([^PKG_CONFIG_(DISABLE_UNINSTALLED|TOP_BUILD_DIR|DEBUG_SPEW)$]) AC_ARG_VAR([PKG_CONFIG], [path to pkg-config utility]) AC_ARG_VAR([PKG_CONFIG_PATH], [directories to add to pkg-config's search path]) AC_ARG_VAR([PKG_CONFIG_LIBDIR], [path overriding pkg-config's built-in search path]) if test "x$ac_cv_env_PKG_CONFIG_set" != "xset"; then AC_PATH_TOOL([PKG_CONFIG], [pkg-config]) fi if test -n "$PKG_CONFIG"; then _pkg_min_version=m4_default([$1], [0.9.0]) AC_MSG_CHECKING([pkg-config is at least version $_pkg_min_version]) if $PKG_CONFIG --atleast-pkgconfig-version $_pkg_min_version; then AC_MSG_RESULT([yes]) else AC_MSG_RESULT([no]) PKG_CONFIG="" fi fi[]dnl ])dnl PKG_PROG_PKG_CONFIG dnl PKG_CHECK_EXISTS(MODULES, [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) dnl ------------------------------------------------------------------- dnl Since: 0.18 dnl dnl Check to see whether a particular set of modules exists. Similar to dnl PKG_CHECK_MODULES(), but does not set variables or print errors. 
dnl dnl Please remember that m4 expands AC_REQUIRE([PKG_PROG_PKG_CONFIG]) dnl only at the first occurence in configure.ac, so if the first place dnl it's called might be skipped (such as if it is within an "if", you dnl have to call PKG_CHECK_EXISTS manually AC_DEFUN([PKG_CHECK_EXISTS], [AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl if test -n "$PKG_CONFIG" && \ AC_RUN_LOG([$PKG_CONFIG --exists --print-errors "$1"]); then m4_default([$2], [:]) m4_ifvaln([$3], [else $3])dnl fi]) dnl _PKG_CONFIG([VARIABLE], [COMMAND], [MODULES]) dnl --------------------------------------------- dnl Internal wrapper calling pkg-config via PKG_CONFIG and setting dnl pkg_failed based on the result. m4_define([_PKG_CONFIG], [if test -n "$$1"; then pkg_cv_[]$1="$$1" elif test -n "$PKG_CONFIG"; then PKG_CHECK_EXISTS([$3], [pkg_cv_[]$1=`$PKG_CONFIG --[]$2 "$3" 2>/dev/null` test "x$?" != "x0" && pkg_failed=yes ], [pkg_failed=yes]) else pkg_failed=untried fi[]dnl ])dnl _PKG_CONFIG dnl _PKG_SHORT_ERRORS_SUPPORTED dnl --------------------------- dnl Internal check to see if pkg-config supports short errors. AC_DEFUN([_PKG_SHORT_ERRORS_SUPPORTED], [AC_REQUIRE([PKG_PROG_PKG_CONFIG]) if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then _pkg_short_errors_supported=yes else _pkg_short_errors_supported=no fi[]dnl ])dnl _PKG_SHORT_ERRORS_SUPPORTED dnl PKG_CHECK_MODULES(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND], dnl [ACTION-IF-NOT-FOUND]) dnl -------------------------------------------------------------- dnl Since: 0.4.0 dnl dnl Note that if there is a possibility the first call to dnl PKG_CHECK_MODULES might not happen, you should be sure to include an dnl explicit call to PKG_PROG_PKG_CONFIG in your configure.ac AC_DEFUN([PKG_CHECK_MODULES], [AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl AC_ARG_VAR([$1][_CFLAGS], [C compiler flags for $1, overriding pkg-config])dnl AC_ARG_VAR([$1][_LIBS], [linker flags for $1, overriding pkg-config])dnl pkg_failed=no AC_MSG_CHECKING([for $1]) _PKG_CONFIG([$1][_CFLAGS], [cflags], [$2]) _PKG_CONFIG([$1][_LIBS], [libs], [$2]) m4_define([_PKG_TEXT], [Alternatively, you may set the environment variables $1[]_CFLAGS and $1[]_LIBS to avoid the need to call pkg-config. See the pkg-config man page for more details.]) if test $pkg_failed = yes; then AC_MSG_RESULT([no]) _PKG_SHORT_ERRORS_SUPPORTED if test $_pkg_short_errors_supported = yes; then $1[]_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "$2" 2>&1` else $1[]_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "$2" 2>&1` fi # Put the nasty error message in config.log where it belongs echo "$$1[]_PKG_ERRORS" >&AS_MESSAGE_LOG_FD m4_default([$4], [AC_MSG_ERROR( [Package requirements ($2) were not met: $$1_PKG_ERRORS Consider adjusting the PKG_CONFIG_PATH environment variable if you installed software in a non-standard prefix. _PKG_TEXT])[]dnl ]) elif test $pkg_failed = untried; then AC_MSG_RESULT([no]) m4_default([$4], [AC_MSG_FAILURE( [The pkg-config script could not be found or is too old. Make sure it is in your PATH or set the PKG_CONFIG environment variable to the full path to pkg-config. 
_PKG_TEXT To get pkg-config, see <http://pkg-config.freedesktop.org/>.])[]dnl ]) else $1[]_CFLAGS=$pkg_cv_[]$1[]_CFLAGS $1[]_LIBS=$pkg_cv_[]$1[]_LIBS AC_MSG_RESULT([yes]) $3 fi[]dnl ])dnl PKG_CHECK_MODULES dnl PKG_CHECK_MODULES_STATIC(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND], dnl [ACTION-IF-NOT-FOUND]) dnl --------------------------------------------------------------------- dnl Since: 0.29 dnl dnl Checks for existence of MODULES and gathers its build flags with dnl static libraries enabled. Sets VARIABLE-PREFIX_CFLAGS from --cflags dnl and VARIABLE-PREFIX_LIBS from --libs. dnl dnl Note that if there is a possibility the first call to dnl PKG_CHECK_MODULES_STATIC might not happen, you should be sure to dnl include an explicit call to PKG_PROG_PKG_CONFIG in your dnl configure.ac. AC_DEFUN([PKG_CHECK_MODULES_STATIC], [AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl _save_PKG_CONFIG=$PKG_CONFIG PKG_CONFIG="$PKG_CONFIG --static" PKG_CHECK_MODULES($@) PKG_CONFIG=$_save_PKG_CONFIG[]dnl ])dnl PKG_CHECK_MODULES_STATIC dnl PKG_INSTALLDIR([DIRECTORY]) dnl ------------------------- dnl Since: 0.27 dnl dnl Substitutes the variable pkgconfigdir as the location where a module dnl should install pkg-config .pc files. By default the directory is dnl $libdir/pkgconfig, but the default can be changed by passing dnl DIRECTORY. The user can override through the --with-pkgconfigdir dnl parameter. AC_DEFUN([PKG_INSTALLDIR], [m4_pushdef([pkg_default], [m4_default([$1], ['${libdir}/pkgconfig'])]) m4_pushdef([pkg_description], [pkg-config installation directory @<:@]pkg_default[@:>@]) AC_ARG_WITH([pkgconfigdir], [AS_HELP_STRING([--with-pkgconfigdir], pkg_description)],, [with_pkgconfigdir=]pkg_default) AC_SUBST([pkgconfigdir], [$with_pkgconfigdir]) m4_popdef([pkg_default]) m4_popdef([pkg_description]) ])dnl PKG_INSTALLDIR dnl PKG_NOARCH_INSTALLDIR([DIRECTORY]) dnl -------------------------------- dnl Since: 0.27 dnl dnl Substitutes the variable noarch_pkgconfigdir as the location where a dnl module should install arch-independent pkg-config .pc files. By dnl default the directory is $datadir/pkgconfig, but the default can be dnl changed by passing DIRECTORY. The user can override through the dnl --with-noarch-pkgconfigdir parameter. AC_DEFUN([PKG_NOARCH_INSTALLDIR], [m4_pushdef([pkg_default], [m4_default([$1], ['${datadir}/pkgconfig'])]) m4_pushdef([pkg_description], [pkg-config arch-independent installation directory @<:@]pkg_default[@:>@]) AC_ARG_WITH([noarch-pkgconfigdir], [AS_HELP_STRING([--with-noarch-pkgconfigdir], pkg_description)],, [with_noarch_pkgconfigdir=]pkg_default) AC_SUBST([noarch_pkgconfigdir], [$with_noarch_pkgconfigdir]) m4_popdef([pkg_default]) m4_popdef([pkg_description]) ])dnl PKG_NOARCH_INSTALLDIR dnl PKG_CHECK_VAR(VARIABLE, MODULE, CONFIG-VARIABLE, dnl [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) dnl ------------------------------------------- dnl Since: 0.28 dnl dnl Retrieves the value of the pkg-config variable for the given module.
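dnl
dnl A purely illustrative (non-upstream) example, using hypothetical names:
dnl   PKG_CHECK_VAR([SQLITE3_LIBDIR], [sqlite3], [libdir])
dnl would copy the module's "libdir" variable into SQLITE3_LIBDIR.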
AC_DEFUN([PKG_CHECK_VAR], [AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl AC_ARG_VAR([$1], [value of $3 for $2, overriding pkg-config])dnl _PKG_CONFIG([$1], [variable="][$3]["], [$2]) AS_VAR_COPY([$1], [pkg_cv_][$1]) AS_VAR_IF([$1], [""], [$5], [$4])dnl ])dnl PKG_CHECK_VAR dqlite-1.16.7/resources/000077500000000000000000000000001465252713400151135ustar00rootroot00000000000000dqlite-1.16.7/resources/stdbool.h000066400000000000000000000020461465252713400167340ustar00rootroot00000000000000/*===---- stdbool.h - Standard header for booleans -------------------------=== * * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ #ifndef __STDBOOL_H #define __STDBOOL_H #define __bool_true_false_are_defined 1 #if defined(__STDC_VERSION__) && __STDC_VERSION__ > 201710L /* FIXME: We should be issuing a deprecation warning here, but cannot yet due * to system headers which include this header file unconditionally. */ #elif !defined(__cplusplus) #define bool _Bool #define true 1 #define false 0 #elif defined(__GNUC__) && !defined(__STRICT_ANSI__) /* Define _Bool as a GNU extension. */ #define _Bool bool #if defined(__cplusplus) && __cplusplus < 201103L /* For C++98, define bool, false, true as a GNU extension. */ #define bool bool #define false false #define true true #endif #endif #endif /* __STDBOOL_H */ dqlite-1.16.7/src/000077500000000000000000000000001465252713400136705ustar00rootroot00000000000000dqlite-1.16.7/src/bind.c000066400000000000000000000040521465252713400147510ustar00rootroot00000000000000#include "bind.h" #include "tuple.h" /* Bind a single parameter. */ static int bind_one(sqlite3_stmt *stmt, int n, struct value *value) { int rc; /* TODO: the binding calls below currently use SQLITE_TRANSIENT when * passing pointers to data (for TEXT or BLOB datatypes). This way * SQLite makes its private copy of the data before the bind call * returns, and we can reuse the message body buffer. The overhead of * the copy is typically low, but if it becomes a concern, this could be * optimized to make no copy and instead prevent the message body from * being reused. */ switch (value->type) { case SQLITE_INTEGER: rc = sqlite3_bind_int64(stmt, n, value->integer); break; case SQLITE_FLOAT: rc = sqlite3_bind_double(stmt, n, value->float_); break; case SQLITE_BLOB: rc = sqlite3_bind_blob(stmt, n, value->blob.base, (int)value->blob.len, SQLITE_TRANSIENT); break; case SQLITE_NULL: rc = sqlite3_bind_null(stmt, n); break; case SQLITE_TEXT: rc = sqlite3_bind_text(stmt, n, value->text, -1, SQLITE_TRANSIENT); break; case DQLITE_ISO8601: rc = sqlite3_bind_text(stmt, n, value->text, -1, SQLITE_TRANSIENT); break; case DQLITE_BOOLEAN: rc = sqlite3_bind_int64(stmt, n, value->boolean == 0 ? 0 : 1); break; default: rc = DQLITE_PROTO; break; } return rc; } int bind__params(sqlite3_stmt *stmt, struct cursor *cursor, int format) { struct tuple_decoder decoder; unsigned long i; int rc; assert(format == TUPLE__PARAMS || format == TUPLE__PARAMS32); sqlite3_reset(stmt); /* If the payload has been fully consumed, it means there are no * parameters to bind. 
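* Note that the sqlite3_reset() call above does not clear previously bound
* values; it only readies the statement for re-execution.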
*/ if (cursor->cap == 0) { return 0; } rc = tuple_decoder__init(&decoder, 0, format, cursor); if (rc != 0) { return rc; } for (i = 0; i < tuple_decoder__n(&decoder); i++) { struct value value; rc = tuple_decoder__next(&decoder, &value); if (rc != 0) { return rc; } rc = bind_one(stmt, (int)(i + 1), &value); if (rc != 0) { return rc; } } return 0; } dqlite-1.16.7/src/bind.h000066400000000000000000000005311465252713400147560ustar00rootroot00000000000000/** * Bind statement parameters decoding them from a client request payload. */ #ifndef BIND_H_ #define BIND_H_ #include <sqlite3.h> #include "lib/serialize.h" /** * Bind the parameters of the given statement by decoding the given payload. */ int bind__params(sqlite3_stmt *stmt, struct cursor *cursor, int format); #endif /* BIND_H_*/ dqlite-1.16.7/src/client/000077500000000000000000000000001465252713400151465ustar00rootroot00000000000000dqlite-1.16.7/src/client/protocol.c000066400000000000000000000633051465252713400171600ustar00rootroot00000000000000#include <errno.h> #include <inttypes.h> #include <limits.h> #include <poll.h> #include <stdlib.h> #include <string.h> #include <time.h> #include <unistd.h> #include "../lib/assert.h" #include "../message.h" #include "../protocol.h" #include "../request.h" #include "../response.h" #include "../tracing.h" #include "../tuple.h" #include "protocol.h" static void oom(void) { abort(); } void *mallocChecked(size_t n) { void *p = malloc(n); if (p == NULL) { oom(); } return p; } void *callocChecked(size_t count, size_t n) { void *p = calloc(count, n); if (p == NULL) { oom(); } return p; } char *strdupChecked(const char *s) { char *p = strdup(s); if (p == NULL) { oom(); } return p; } char *strndupChecked(const char *s, size_t n) { char *p = strndup(s, n); if (p == NULL) { oom(); } return p; } /* Convert a value that potentially borrows data from the client_proto read * buffer into one that owns its data. The owned data must be freed with * freeOwnedValue. */ static void makeValueOwned(struct value *val) { char *p; switch (val->type) { case SQLITE_TEXT: val->text = strdupChecked(val->text); break; case DQLITE_ISO8601: val->iso8601 = strdupChecked(val->iso8601); break; case SQLITE_BLOB: p = mallocChecked(val->blob.len); memcpy(p, val->blob.base, val->blob.len); val->blob.base = p; break; default:; } } /* Free the owned data of a value, which must have had makeValueOwned called * on it previously. This takes its argument by value because it does *not* * free the memory that stores the `struct value` itself, only the pointers * held by `struct value`. */ static void freeOwnedValue(struct value val) { switch (val.type) { case SQLITE_TEXT: free((char *)val.text); break; case DQLITE_ISO8601: free((char *)val.iso8601); break; case SQLITE_BLOB: free(val.blob.base); break; default:; } } static int peekUint64(struct cursor cursor, uint64_t *val) { if (cursor.cap < 8) { return DQLITE_CLIENT_PROTO_ERROR; } memcpy(val, cursor.p, sizeof(*val)); *val = ByteFlipLe64(*val); return 0; } /* Read data from fd into buf until one of the following occurs: * * - The full count n of bytes is read. * - A read returns 0 (EOF). * - The context's deadline is reached. * - An error occurs. * * On error, -1 is returned. Otherwise the return value is the count * of bytes read. This may be less than n if either EOF happened or * the deadline kicked in.
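* Callers distinguish these cases by comparing the return value with the
* requested count; readMessage() below, for example, maps a short read to
* DQLITE_CLIENT_PROTO_SHORT.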
*/ static ssize_t doRead(int fd, void *buf, size_t buf_len, struct client_context *context) { ssize_t total; struct pollfd pfd; struct timespec now; long long millis; ssize_t n; int rv; pfd.fd = fd; pfd.events = POLLIN; pfd.revents = 0; total = 0; while ((size_t)total < buf_len) { rv = clock_gettime(CLOCK_REALTIME, &now); assert(rv == 0); if (context != NULL) { millis = (context->deadline.tv_sec - now.tv_sec) * 1000 + (context->deadline.tv_nsec - now.tv_nsec) / 1000000; if (millis < 0) { /* poll(2) will block indefinitely if the * timeout argument is negative, and we don't * want that here. Signal a timeout. */ break; } } else { /* The caller has explicitly asked us to block * indefinitely. */ millis = -1; } rv = poll(&pfd, 1, (millis > INT_MAX) ? INT_MAX : (int)millis); if (rv < 0) { if (errno == EINTR) { continue; } else { return -1; } } else if (rv == 0) { /* Timeout */ break; } assert(rv == 1); if (pfd.revents != POLLIN) { /* If some other bits are set in the out parameter, an * error occurred. */ return -1; } n = read(fd, (char *)buf + (size_t)total, buf_len - (size_t)total); if (n < 0) { if (errno == EINTR) { continue; } else { return -1; } } else if (n == 0) { /* EOF */ break; } total += n; } return total; } /* Write data into fd from buf until one of the following occurs: * * - The full count n of bytes is written. * - A write returns 0 (EOF). * - The context's deadline is reached. * - An error occurs. * * On error, -1 is returned. Otherwise the return value is the count * of bytes written. This may be less than n if either EOF happened or * the deadline kicked in. */ static ssize_t doWrite(int fd, void *buf, size_t buf_len, struct client_context *context) { ssize_t total; struct pollfd pfd; struct timespec now; long long millis; ssize_t n; int rv; pfd.fd = fd; pfd.events = POLLOUT; pfd.revents = 0; total = 0; while ((size_t)total < buf_len) { rv = clock_gettime(CLOCK_REALTIME, &now); assert(rv == 0); if (context != NULL) { millis = (context->deadline.tv_sec - now.tv_sec) * 1000 + (context->deadline.tv_nsec - now.tv_nsec) / 1000000; if (millis < 0) { /* poll(2) will block indefinitely if the * timeout argument is negative, and we don't * want that here. Signal a timeout. */ break; } } else { /* The caller has explicitly asked us to block * indefinitely. */ millis = -1; } rv = poll(&pfd, 1, (millis > INT_MAX) ? INT_MAX : (int)millis); if (rv < 0) { if (errno == EINTR) { continue; } else { return -1; } } else if (rv == 0) { /* Timeout */ break; } assert(rv == 1); if (pfd.revents != POLLOUT) { /* If some other bits are set in the out parameter, an * error occurred. 
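* (for example POLLERR, POLLHUP or POLLNVAL).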
*/ return -1; } n = write(fd, (char *)buf + (size_t)total, buf_len - (size_t)total); if (n < 0) { if (errno == EINTR) { continue; } else { return -1; } } else if (n == 0) { /* EOF */ break; } total += n; } return total; } static int handleFailure(struct client_proto *c) { struct response_failure failure; struct cursor cursor; int rv; cursor.p = buffer__cursor(&c->read, 0); cursor.cap = buffer__offset(&c->read); rv = response_failure__decode(&cursor, &failure); if (rv != 0) { tracef("decode as failure failed rv:%d", rv); return DQLITE_CLIENT_PROTO_ERROR; } c->errcode = failure.code; if (c->errmsg != NULL) { free(c->errmsg); } c->errmsg = strdupChecked(failure.message); return DQLITE_CLIENT_PROTO_RECEIVED_FAILURE; } void clientContextMillis(struct client_context *context, long millis) { int rv; rv = clock_gettime(CLOCK_REALTIME, &context->deadline); assert(rv == 0); context->deadline.tv_nsec += millis * 1000000; while (context->deadline.tv_nsec >= 1000000000) { context->deadline.tv_nsec -= 1000000000; context->deadline.tv_sec += 1; } } /* TODO accept a context here? */ int clientOpen(struct client_proto *c, const char *addr, uint64_t server_id) { int rv; rv = c->connect(c->connect_arg, addr, &c->fd); if (rv != 0) { c->fd = -1; return DQLITE_CLIENT_PROTO_ERROR; } c->server_id = server_id; rv = buffer__init(&c->read); if (rv != 0) { oom(); } rv = buffer__init(&c->write); if (rv != 0) { oom(); } c->errcode = 0; c->errmsg = NULL; return 0; } void clientClose(struct client_proto *c) { tracef("client close"); if (c->fd == -1) { return; } close(c->fd); c->fd = -1; buffer__close(&c->write); buffer__close(&c->read); free(c->db_name); c->db_name = NULL; free(c->errmsg); c->errmsg = NULL; c->server_id = 0; } int clientSendHandshake(struct client_proto *c, struct client_context *context) { uint64_t protocol; ssize_t rv; tracef("client send handshake"); protocol = ByteFlipLe64(DQLITE_PROTOCOL_VERSION); rv = doWrite(c->fd, &protocol, sizeof protocol, context); if (rv < 0) { tracef("client send handshake failed %zd", rv); return DQLITE_CLIENT_PROTO_ERROR; } else if ((size_t)rv < sizeof protocol) { return DQLITE_CLIENT_PROTO_SHORT; } return 0; } static int writeMessage(struct client_proto *c, uint8_t type, uint8_t schema, struct client_context *context) { struct message message = {0}; size_t n; size_t words; char *cursor; ssize_t rv; n = buffer__offset(&c->write); words = (n - message__sizeof(&message)) / 8; message.words = (uint32_t)words; message.type = type; message.schema = schema; cursor = buffer__cursor(&c->write, 0); message__encode(&message, &cursor); rv = doWrite(c->fd, buffer__cursor(&c->write, 0), n, context); if (rv < 0) { tracef("request write failed rv:%zd", rv); return DQLITE_CLIENT_PROTO_ERROR; } else if ((size_t)rv < n) { return DQLITE_CLIENT_PROTO_SHORT; } return 0; } #define BUFFER_REQUEST(LOWER, UPPER) \ { \ struct message _message = {0}; \ size_t _n1; \ size_t _n2; \ char *_cursor; \ _n1 = message__sizeof(&_message); \ _n2 = request_##LOWER##__sizeof(&request); \ buffer__reset(&c->write); \ _cursor = buffer__advance(&c->write, _n1 + _n2); \ if (_cursor == NULL) { \ oom(); \ } \ assert(_n2 % 8 == 0); \ message__encode(&_message, &_cursor); \ request_##LOWER##__encode(&request, &_cursor); \ } /* Write out a request. 
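* The buffer prepared by BUFFER_REQUEST holds the fixed message header
* followed by the request body; the body must be a whole number of 8-byte
* words, since the header expresses the body length as a word count.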
*/ #define REQUEST(LOWER, UPPER, SCHEMA) \ { \ int _rv; \ BUFFER_REQUEST(LOWER, UPPER); \ _rv = \ writeMessage(c, DQLITE_REQUEST_##UPPER, SCHEMA, context); \ if (_rv != 0) { \ return _rv; \ } \ } static int readMessage(struct client_proto *c, uint8_t *type, struct client_context *context) { struct message message = {0}; struct cursor cursor; void *p; size_t n; ssize_t rv; buffer__reset(&c->read); n = message__sizeof(&message); p = buffer__advance(&c->read, n); if (p == NULL) { oom(); } rv = doRead(c->fd, p, n, context); if (rv < 0) { return DQLITE_CLIENT_PROTO_ERROR; } else if (rv < (ssize_t)n) { return DQLITE_CLIENT_PROTO_SHORT; } cursor.p = p; cursor.cap = n; rv = message__decode(&cursor, &message); if (rv != 0) { tracef("message decode failed rv:%zd", rv); return DQLITE_CLIENT_PROTO_ERROR; } buffer__reset(&c->read); n = message.words * 8; p = buffer__advance(&c->read, n); if (p == NULL) { oom(); } rv = doRead(c->fd, p, n, context); if (rv < 0) { return DQLITE_ERROR; } else if (rv < (ssize_t)n) { return DQLITE_CLIENT_PROTO_SHORT; } *type = message.type; return 0; } /* Read and decode a response. */ #define RESPONSE(LOWER, UPPER) \ { \ uint8_t _type; \ int _rv; \ _rv = readMessage(c, &_type, context); \ if (_rv != 0) { \ return _rv; \ } \ if (_type == DQLITE_RESPONSE_FAILURE && \ _type != DQLITE_RESPONSE_##UPPER) { \ _rv = handleFailure(c); \ return _rv; \ } else if (_type != DQLITE_RESPONSE_##UPPER) { \ return DQLITE_CLIENT_PROTO_ERROR; \ } \ cursor.p = buffer__cursor(&c->read, 0); \ cursor.cap = buffer__offset(&c->read); \ _rv = response_##LOWER##__decode(&cursor, &response); \ if (_rv != 0) { \ return DQLITE_CLIENT_PROTO_ERROR; \ } \ } int clientSendLeader(struct client_proto *c, struct client_context *context) { tracef("client send leader"); struct request_leader request = {0}; REQUEST(leader, LEADER, 0); return 0; } int clientSendClient(struct client_proto *c, uint64_t id, struct client_context *context) { tracef("client send client"); struct request_client request; request.id = id; REQUEST(client, CLIENT, 0); return 0; } int clientSendOpen(struct client_proto *c, const char *name, struct client_context *context) { tracef("client send open name %s", name); struct request_open request; c->db_name = strdupChecked(name); request.filename = name; request.flags = 0; /* unused */ request.vfs = "test"; /* unused */ REQUEST(open, OPEN, 0); return 0; } int clientRecvDb(struct client_proto *c, struct client_context *context) { tracef("client recvdb"); struct cursor cursor; struct response_db response; RESPONSE(db, DB); c->db_id = response.id; c->db_is_init = true; return 0; } int clientSendPrepare(struct client_proto *c, const char *sql, struct client_context *context) { tracef("client send prepare"); struct request_prepare request; request.db_id = c->db_id; request.sql = sql; REQUEST(prepare, PREPARE, DQLITE_PREPARE_STMT_SCHEMA_V1); return 0; } int clientRecvStmt(struct client_proto *c, uint32_t *stmt_id, uint64_t *n_params, uint64_t *offset, struct client_context *context) { struct cursor cursor; struct response_stmt_with_offset response; RESPONSE(stmt_with_offset, STMT_WITH_OFFSET); if (stmt_id != NULL) { *stmt_id = response.id; } if (n_params != NULL) { *n_params = response.params; } if (offset != NULL) { *offset = response.offset; } return 0; } static int bufferParams(struct client_proto *c, struct value *params, unsigned n_params) { struct tuple_encoder tup; size_t i; int rv; if (n_params == 0) { return 0; } rv = tuple_encoder__init(&tup, n_params, TUPLE__PARAMS32, &c->write); if (rv != 
0) { return DQLITE_CLIENT_PROTO_ERROR; } for (i = 0; i < n_params; ++i) { rv = tuple_encoder__next(&tup, ¶ms[i]); if (rv != 0) { return DQLITE_CLIENT_PROTO_ERROR; } } return 0; } int clientSendExec(struct client_proto *c, uint32_t stmt_id, struct value *params, unsigned n_params, struct client_context *context) { tracef("client send exec id %" PRIu32, stmt_id); struct request_exec request; int rv; request.db_id = c->db_id; request.stmt_id = stmt_id; BUFFER_REQUEST(exec, EXEC); rv = bufferParams(c, params, n_params); if (rv != 0) { return rv; } rv = writeMessage(c, DQLITE_REQUEST_EXEC, 1, context); return rv; } int clientSendExecSQL(struct client_proto *c, const char *sql, struct value *params, unsigned n_params, struct client_context *context) { tracef("client send exec sql"); struct request_exec_sql request; int rv; request.db_id = c->db_id; request.sql = sql; BUFFER_REQUEST(exec_sql, EXEC_SQL); rv = bufferParams(c, params, n_params); if (rv != 0) { return rv; } rv = writeMessage(c, DQLITE_REQUEST_EXEC_SQL, 1, context); return rv; } int clientRecvResult(struct client_proto *c, uint64_t *last_insert_id, uint64_t *rows_affected, struct client_context *context) { struct cursor cursor; struct response_result response; RESPONSE(result, RESULT); if (last_insert_id != NULL) { *last_insert_id = response.last_insert_id; } if (rows_affected != NULL) { *rows_affected = response.rows_affected; } return 0; } int clientSendQuery(struct client_proto *c, uint32_t stmt_id, struct value *params, unsigned n_params, struct client_context *context) { tracef("client send query stmt_id %" PRIu32, stmt_id); struct request_query request; int rv; request.db_id = c->db_id; request.stmt_id = stmt_id; BUFFER_REQUEST(query, QUERY); rv = bufferParams(c, params, n_params); if (rv != 0) { return rv; } rv = writeMessage(c, DQLITE_REQUEST_QUERY, 1, context); return rv; } int clientSendQuerySQL(struct client_proto *c, const char *sql, struct value *params, unsigned n_params, struct client_context *context) { tracef("client send query sql sql %s", sql); struct request_query_sql request; int rv; request.db_id = c->db_id; request.sql = sql; BUFFER_REQUEST(query_sql, QUERY_SQL); rv = bufferParams(c, params, n_params); if (rv != 0) { return rv; } rv = writeMessage(c, DQLITE_REQUEST_QUERY_SQL, 1, context); return rv; } int clientRecvRows(struct client_proto *c, struct rows *rows, bool *done, struct client_context *context) { tracef("client recv rows"); struct cursor cursor; uint8_t type; uint64_t column_count; unsigned i; unsigned j; const char *raw; struct row *row; struct row *last; uint64_t eof; struct tuple_decoder tup; int rv; rv = readMessage(c, &type, context); if (rv != 0) { return rv; } if (type == DQLITE_RESPONSE_FAILURE) { rv = handleFailure(c); return rv; } else if (type != DQLITE_RESPONSE_ROWS) { return DQLITE_CLIENT_PROTO_ERROR; } cursor.p = buffer__cursor(&c->read, 0); cursor.cap = buffer__offset(&c->read); rv = uint64__decode(&cursor, &column_count); if (rv != 0) { return DQLITE_CLIENT_PROTO_ERROR; } rows->column_count = (unsigned)column_count; assert((uint64_t)rows->column_count == column_count); rows->column_names = callocChecked(rows->column_count, sizeof *rows->column_names); for (i = 0; i < rows->column_count; ++i) { rv = text__decode(&cursor, &raw); if (rv != 0) { rv = DQLITE_CLIENT_PROTO_ERROR; goto err_after_alloc_column_names; } rows->column_names[i] = strdupChecked(raw); } rows->next = NULL; last = NULL; while (1) { rv = peekUint64(cursor, &eof); if (rv != 0) { goto err_after_alloc_column_names; } if 
(eof == DQLITE_RESPONSE_ROWS_DONE || eof == DQLITE_RESPONSE_ROWS_PART) { break; } row = mallocChecked(sizeof *row); row->values = callocChecked(rows->column_count, sizeof *row->values); row->next = NULL; /* Make sure that `goto err_after_alloc_row_values` will do the * right thing even before we enter the for loop. */ i = 0; rv = tuple_decoder__init(&tup, rows->column_count, TUPLE__ROW, &cursor); if (rv != 0) { rv = DQLITE_CLIENT_PROTO_ERROR; goto err_after_alloc_row_values; } for (; i < rows->column_count; ++i) { rv = tuple_decoder__next(&tup, &row->values[i]); if (rv != 0) { rv = DQLITE_CLIENT_PROTO_ERROR; goto err_after_alloc_row_values; } makeValueOwned(&row->values[i]); } if (last == NULL) { rows->next = row; } else { last->next = row; } last = row; } assert(eof == DQLITE_RESPONSE_ROWS_DONE || eof == DQLITE_RESPONSE_ROWS_PART); if (done != NULL) { *done = eof == DQLITE_RESPONSE_ROWS_DONE; } return 0; err_after_alloc_row_values: for (j = 0; j < i; ++j) { freeOwnedValue(row->values[j]); } free(row->values); free(row); err_after_alloc_column_names: clientCloseRows(rows); return rv; } void clientCloseRows(struct rows *rows) { uint64_t i; struct row *row = rows->next; struct row *next; /* Note that we take care to still do the right thing if this was * called before clientRecvRows completed. */ for (row = rows->next; row != NULL; row = next) { next = row->next; row->next = NULL; for (i = 0; i < rows->column_count; ++i) { freeOwnedValue(row->values[i]); } free(row->values); row->values = NULL; free(row); } rows->next = NULL; if (rows->column_names != NULL) { for (i = 0; i < rows->column_count; ++i) { free(rows->column_names[i]); rows->column_names[i] = NULL; } } free(rows->column_names); } int clientSendInterrupt(struct client_proto *c, struct client_context *context) { tracef("client send interrupt"); struct request_interrupt request; request.db_id = c->db_id; REQUEST(interrupt, INTERRUPT, 0); return 0; } int clientSendFinalize(struct client_proto *c, uint32_t stmt_id, struct client_context *context) { tracef("client send finalize %u", stmt_id); struct request_finalize request; request.db_id = c->db_id; request.stmt_id = stmt_id; REQUEST(finalize, FINALIZE, 0); return 0; } int clientSendAdd(struct client_proto *c, uint64_t id, const char *address, struct client_context *context) { tracef("client send add id %" PRIu64 " address %s", id, address); struct request_add request; request.id = id; request.address = address; REQUEST(add, ADD, 0); return 0; } int clientSendAssign(struct client_proto *c, uint64_t id, int role, struct client_context *context) { tracef("client send assign id %" PRIu64 " role %d", id, role); assert(role == DQLITE_VOTER || role == DQLITE_STANDBY || role == DQLITE_SPARE); struct request_assign request; request.id = id; request.role = (uint64_t)role; REQUEST(assign, ASSIGN, 0); return 0; } int clientSendRemove(struct client_proto *c, uint64_t id, struct client_context *context) { tracef("client send remove id %" PRIu64, id); struct request_remove request; request.id = id; REQUEST(remove, REMOVE, 0); return 0; } int clientSendDump(struct client_proto *c, struct client_context *context) { tracef("client send dump"); struct request_dump request; assert(c->db_is_init); assert(c->db_name != NULL); request.filename = c->db_name; REQUEST(dump, DUMP, 0); return 0; } int clientSendCluster(struct client_proto *c, struct client_context *context) { tracef("client send cluster"); struct request_cluster request; request.format = DQLITE_REQUEST_CLUSTER_FORMAT_V1; REQUEST(cluster, 
CLUSTER, 0); return 0; } int clientSendTransfer(struct client_proto *c, uint64_t id, struct client_context *context) { tracef("client send transfer id %" PRIu64, id); struct request_transfer request; request.id = id; REQUEST(transfer, TRANSFER, 0); return 0; } int clientSendDescribe(struct client_proto *c, struct client_context *context) { tracef("client send describe"); struct request_describe request; request.format = DQLITE_REQUEST_DESCRIBE_FORMAT_V0; REQUEST(describe, DESCRIBE, 0); return 0; } int clientSendWeight(struct client_proto *c, uint64_t weight, struct client_context *context) { tracef("client send weight %" PRIu64, weight); struct request_weight request; request.weight = weight; REQUEST(weight, WEIGHT, 0); return 0; } int clientRecvServer(struct client_proto *c, uint64_t *id, char **address, struct client_context *context) { tracef("client recv server"); struct cursor cursor; struct response_server response; *id = 0; *address = NULL; RESPONSE(server, SERVER); *address = strdupChecked(response.address); *id = response.id; return 0; } int clientRecvWelcome(struct client_proto *c, struct client_context *context) { tracef("client recv welcome"); struct cursor cursor; struct response_welcome response; RESPONSE(welcome, WELCOME); return 0; } int clientRecvEmpty(struct client_proto *c, struct client_context *context) { tracef("client recv empty"); struct cursor cursor; struct response_empty response; RESPONSE(empty, EMPTY); return 0; } int clientRecvFailure(struct client_proto *c, uint64_t *code, char **msg, struct client_context *context) { tracef("client recv failure"); struct cursor cursor; struct response_failure response; RESPONSE(failure, FAILURE); *code = response.code; *msg = strdupChecked(response.message); return 0; } int clientRecvServers(struct client_proto *c, struct client_node_info **servers, uint64_t *n_servers, struct client_context *context) { tracef("client recv servers"); struct cursor cursor; size_t n; uint64_t i = 0; uint64_t j; uint64_t raw_role; const char *raw_addr; struct response_servers response; int rv; *servers = NULL; *n_servers = 0; RESPONSE(servers, SERVERS); n = (size_t)response.n; assert((uint64_t)n == response.n); struct client_node_info *srvs = callocChecked(n, sizeof *srvs); for (; i < response.n; ++i) { rv = uint64__decode(&cursor, &srvs[i].id); if (rv != 0) { goto err_after_alloc_srvs; } rv = text__decode(&cursor, &raw_addr); if (rv != 0) { goto err_after_alloc_srvs; } srvs[i].addr = strdupChecked(raw_addr); rv = uint64__decode(&cursor, &raw_role); if (rv != 0) { free(srvs[i].addr); goto err_after_alloc_srvs; } srvs[i].role = (int)raw_role; } *n_servers = n; *servers = srvs; return 0; err_after_alloc_srvs: for (j = 0; j < i; ++j) { free(srvs[j].addr); } free(srvs); return rv; } int clientRecvFiles(struct client_proto *c, struct client_file **files, size_t *n_files, struct client_context *context) { tracef("client recv files"); struct cursor cursor; struct response_files response; struct client_file *fs; size_t n; size_t z; size_t i = 0; size_t j; const char *raw_name; int rv; *files = NULL; *n_files = 0; RESPONSE(files, FILES); n = (size_t)response.n; assert((uint64_t)n == response.n); fs = callocChecked(n, sizeof *fs); for (; i < response.n; ++i) { rv = text__decode(&cursor, &raw_name); if (rv != 0) { goto err_after_alloc_fs; } fs[i].name = strdupChecked(raw_name); rv = uint64__decode(&cursor, &fs[i].size); if (rv != 0) { free(fs[i].name); goto err_after_alloc_fs; } if (cursor.cap != fs[i].size) { free(fs[i].name); rv = DQLITE_PARSE; goto
err_after_alloc_fs; } z = (size_t)fs[i].size; assert((uint64_t)z == fs[i].size); fs[i].blob = mallocChecked(z); memcpy(fs[i].blob, cursor.p, z); } *files = fs; *n_files = n; return 0; err_after_alloc_fs: for (j = 0; j < i; ++j) { free(fs[j].name); free(fs[j].blob); } free(fs); return rv; } int clientRecvMetadata(struct client_proto *c, uint64_t *failure_domain, uint64_t *weight, struct client_context *context) { tracef("client recv metadata"); struct cursor cursor; struct response_metadata response; RESPONSE(metadata, METADATA); *failure_domain = response.failure_domain; *weight = response.weight; return 0; } dqlite-1.16.7/src/client/protocol.h000066400000000000000000000233631465252713400171670ustar00rootroot00000000000000/* Core dqlite client logic for encoding requests and decoding responses. */ #ifndef DQLITE_CLIENT_PROTOCOL_H_ #define DQLITE_CLIENT_PROTOCOL_H_ #include "../../include/dqlite.h" #include "../lib/buffer.h" #include "../tuple.h" /* All functions declared in this header file return 0 for success or one * of the following error codes on failure. */ enum { /* We received a FAILURE response when we expected another response. * * The data carried by the FAILURE response can be retrieved from the * errcode and errmsg fields of struct client_proto. * * It's safe to continue using the client_proto object after receiving * this error code. */ DQLITE_CLIENT_PROTO_RECEIVED_FAILURE = 1, /* We timed out while reading from or writing to our fd, or a read/write * returned EOF before the expected number of bytes were read/written. * * It is not generally safe to continue using the client_proto object * after receiving this error code. */ DQLITE_CLIENT_PROTO_SHORT, /* Another kind of error occurred, like a syscall failure. * * It is not generally safe to continue using the client_proto object * after receiving this error code. */ DQLITE_CLIENT_PROTO_ERROR }; struct client_proto { /* TODO find a better approach to initializing these fields? */ int (*connect)(void *, const char *, int *); void *connect_arg; int fd; /* Connected socket */ uint32_t db_id; /* Database ID provided by the server */ char *db_name; /* Database filename (owned) */ bool db_is_init; /* Whether the database ID has been initialized */ uint64_t server_id; struct buffer read; /* Read buffer */ struct buffer write; /* Write buffer */ uint64_t errcode; /* Last error code returned by the server */ char *errmsg; /* Last error string returned by the server (owned) */ }; /* All of the Send and Recv functions take a `struct client_context *context` * argument, which controls timeouts for read and write operations (and possibly * other knobs in the future). * * Passing NULL for the context argument is permitted and disables all timeouts. */ struct client_context { /* An absolute CLOCK_REALTIME timestamp that limits how long will be * spent trying to complete the requested send or receive operation. * Whenever we are about to make a blocking syscall (read or write), we * first poll(2) using a timeout computed based on how much time remains * before the deadline. If the poll times out, we return early instead * of completing the operation. */ struct timespec deadline; }; /* TODO Consider using a dynamic array instead of a linked list here?
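 *
 * For orientation, a minimal consumer of the rows API might look like the
 * following sketch (consume() is a hypothetical callback, and the
 * clientSendQuery call that precedes this is assumed, not shown):
 *
 *   struct rows rows;
 *   bool done;
 *   int rv = clientRecvRows(c, &rows, &done, NULL);
 *   if (rv == 0) {
 *           for (struct row *r = rows.next; r != NULL; r = r->next) {
 *                   consume(r->values, rows.column_count);
 *           }
 *           clientCloseRows(&rows);
 *   }
 *
 * When *done is false, the server has more row batches to send, and
 * further clientRecvRows calls will receive them.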
*/ struct row { struct value *values; struct row *next; }; struct rows { unsigned column_count; char **column_names; struct row *next; }; struct client_node_info { uint64_t id; char *addr; int role; }; struct client_file { char *name; uint64_t size; void *blob; }; /* Checked allocation functions that abort the process on allocation failure. */ void *mallocChecked(size_t n); void *callocChecked(size_t nmemb, size_t size); char *strdupChecked(const char *s); char *strndupChecked(const char *s, size_t n); /* Initialize a context whose deadline will fall after the given duration * in milliseconds. */ DQLITE_VISIBLE_TO_TESTS void clientContextMillis(struct client_context *context, long millis); /* Initialize a new client. */ DQLITE_VISIBLE_TO_TESTS int clientOpen(struct client_proto *c, const char *addr, uint64_t server_id); /* Release all memory used by the client, and close the client socket. */ DQLITE_VISIBLE_TO_TESTS void clientClose(struct client_proto *c); /* Initialize the connection by writing the protocol version. This must be * called before using any other API. */ DQLITE_VISIBLE_TO_TESTS int clientSendHandshake(struct client_proto *c, struct client_context *context); /* Send a request to get the current leader. */ DQLITE_VISIBLE_TO_TESTS int clientSendLeader(struct client_proto *c, struct client_context *context); /* Send a request identifying this client to the attached server. */ DQLITE_VISIBLE_TO_TESTS int clientSendClient(struct client_proto *c, uint64_t id, struct client_context *context); /* Send a request to open a database. */ DQLITE_VISIBLE_TO_TESTS int clientSendOpen(struct client_proto *c, const char *name, struct client_context *context); /* Receive the response to an open request. */ DQLITE_VISIBLE_TO_TESTS int clientRecvDb(struct client_proto *c, struct client_context *context); /* Send a request to prepare a statement. */ DQLITE_VISIBLE_TO_TESTS int clientSendPrepare(struct client_proto *c, const char *sql, struct client_context *context); /* Receive the response to a prepare request. */ DQLITE_VISIBLE_TO_TESTS int clientRecvStmt(struct client_proto *c, uint32_t *stmt_id, uint64_t *n_params, uint64_t *offset, struct client_context *context); /* Send a request to execute a statement. */ DQLITE_VISIBLE_TO_TESTS int clientSendExec(struct client_proto *c, uint32_t stmt_id, struct value *params, unsigned n_params, struct client_context *context); /* Send a request to execute a non-prepared statement. */ DQLITE_VISIBLE_TO_TESTS int clientSendExecSQL(struct client_proto *c, const char *sql, struct value *params, unsigned n_params, struct client_context *context); /* Receive the response to an exec request. */ DQLITE_VISIBLE_TO_TESTS int clientRecvResult(struct client_proto *c, uint64_t *last_insert_id, uint64_t *rows_affected, struct client_context *context); /* Send a request to perform a query. */ DQLITE_VISIBLE_TO_TESTS int clientSendQuery(struct client_proto *c, uint32_t stmt_id, struct value *params, unsigned n_params, struct client_context *context); /* Send a request to perform a non-prepared query. */ DQLITE_VISIBLE_TO_TESTS int clientSendQuerySQL(struct client_proto *c, const char *sql, struct value *params, unsigned n_params, struct client_context *context); /* Receive the response to a query request. */ DQLITE_VISIBLE_TO_TESTS int clientRecvRows(struct client_proto *c, struct rows *rows, bool *done, struct client_context *context); /* Release all memory used in the given rows object.
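 *
 * This is safe to call even if clientRecvRows failed partway or never ran
 * to completion: partially-constructed row lists are handled correctly.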
*/ DQLITE_VISIBLE_TO_TESTS void clientCloseRows(struct rows *rows); /* Send a request to interrupt a server that's sending rows. */ DQLITE_VISIBLE_TO_TESTS int clientSendInterrupt(struct client_proto *c, struct client_context *context); /* Send a request to finalize a prepared statement. */ DQLITE_VISIBLE_TO_TESTS int clientSendFinalize(struct client_proto *c, uint32_t stmt_id, struct client_context *context); /* Send a request to add a dqlite node. */ DQLITE_VISIBLE_TO_TESTS int clientSendAdd(struct client_proto *c, uint64_t id, const char *address, struct client_context *context); /* Send a request to assign a role to a node. */ DQLITE_VISIBLE_TO_TESTS int clientSendAssign(struct client_proto *c, uint64_t id, int role, struct client_context *context); /* Send a request to remove a server from the cluster. */ DQLITE_VISIBLE_TO_TESTS int clientSendRemove(struct client_proto *c, uint64_t id, struct client_context *context); /* Send a request to dump the contents of the attached database. */ DQLITE_VISIBLE_TO_TESTS int clientSendDump(struct client_proto *c, struct client_context *context); /* Send a request to list the nodes of the cluster with their addresses and * roles. */ DQLITE_VISIBLE_TO_TESTS int clientSendCluster(struct client_proto *c, struct client_context *context); /* Send a request to transfer leadership to node with id `id`. */ DQLITE_VISIBLE_TO_TESTS int clientSendTransfer(struct client_proto *c, uint64_t id, struct client_context *context); /* Send a request to retrieve metadata about the attached server. */ DQLITE_VISIBLE_TO_TESTS int clientSendDescribe(struct client_proto *c, struct client_context *context); /* Send a request to set the weight metadata for the attached server. */ DQLITE_VISIBLE_TO_TESTS int clientSendWeight(struct client_proto *c, uint64_t weight, struct client_context *context); /* Receive a response with the ID and address of a single node. */ DQLITE_VISIBLE_TO_TESTS int clientRecvServer(struct client_proto *c, uint64_t *id, char **address, struct client_context *context); /* Receive a "welcome" handshake response. */ DQLITE_VISIBLE_TO_TESTS int clientRecvWelcome(struct client_proto *c, struct client_context *context); /* Receive an empty response. */ DQLITE_VISIBLE_TO_TESTS int clientRecvEmpty(struct client_proto *c, struct client_context *context); /* Receive a failure response. */ DQLITE_VISIBLE_TO_TESTS int clientRecvFailure(struct client_proto *c, uint64_t *code, char **msg, struct client_context *context); /* Receive a list of nodes in the cluster. */ DQLITE_VISIBLE_TO_TESTS int clientRecvServers(struct client_proto *c, struct client_node_info **servers, uint64_t *n_servers, struct client_context *context); /* Receive a list of files that make up a database. */ DQLITE_VISIBLE_TO_TESTS int clientRecvFiles(struct client_proto *c, struct client_file **files, size_t *n_files, struct client_context *context); /* Receive metadata for a single server. */ DQLITE_VISIBLE_TO_TESTS int clientRecvMetadata(struct client_proto *c, uint64_t *failure_domain, uint64_t *weight, struct client_context *context); #endif /* DQLITE_CLIENT_PROTOCOL_H_ */ dqlite-1.16.7/src/command.c000066400000000000000000000103001465252713400154440ustar00rootroot00000000000000#include #include "../include/dqlite.h" #include "lib/serialize.h" #include "command.h" #include "protocol.h" #define FORMAT 1 /* Format version */ #define HEADER(X, ...) 
\ X(uint8, format, ##__VA_ARGS__) \ X(uint8, type, ##__VA_ARGS__) \ X(uint8, _unused1, ##__VA_ARGS__) \ X(uint8, _unused2, ##__VA_ARGS__) \ X(uint32, _unused3, ##__VA_ARGS__) SERIALIZE__DEFINE(header, HEADER); SERIALIZE__IMPLEMENT(header, HEADER); static size_t frames__sizeof(const frames_t *frames) { size_t s = uint32__sizeof(&frames->n_pages) + uint16__sizeof(&frames->page_size) + uint16__sizeof(&frames->__unused__) + sizeof(uint64_t) * frames->n_pages + /* Page numbers */ frames->page_size * frames->n_pages; /* Page data */ return s; } static void frames__encode(const frames_t *frames, char **cursor) { const dqlite_vfs_frame *list; unsigned i; uint32__encode(&frames->n_pages, cursor); uint16__encode(&frames->page_size, cursor); uint16__encode(&frames->__unused__, cursor); list = frames->data; for (i = 0; i < frames->n_pages; i++) { uint64_t pgno = list[i].page_number; uint64__encode(&pgno, cursor); } for (i = 0; i < frames->n_pages; i++) { memcpy(*cursor, list[i].data, frames->page_size); *cursor += frames->page_size; } } static int frames__decode(struct cursor *cursor, frames_t *frames) { int rc; rc = uint32__decode(cursor, &frames->n_pages); if (rc != 0) { return rc; } rc = uint16__decode(cursor, &frames->page_size); if (rc != 0) { return rc; } rc = uint16__decode(cursor, &frames->__unused__); if (rc != 0) { return rc; } frames->data = cursor->p; return 0; } #define COMMAND__IMPLEMENT(LOWER, UPPER, _) \ SERIALIZE__IMPLEMENT(command_##LOWER, COMMAND__##UPPER); COMMAND__TYPES(COMMAND__IMPLEMENT, ); #define ENCODE(LOWER, UPPER, _) \ case COMMAND_##UPPER: \ h.type = COMMAND_##UPPER; \ buf->len = header__sizeof(&h); \ buf->len += command_##LOWER##__sizeof(command); \ buf->base = raft_malloc(buf->len); \ if (buf->base == NULL) { \ return DQLITE_NOMEM; \ } \ cursor = buf->base; \ header__encode(&h, &cursor); \ command_##LOWER##__encode(command, &cursor); \ break; int command__encode(int type, const void *command, struct raft_buffer *buf) { struct header h = {0}; char *cursor; int rc = 0; h.format = FORMAT; switch (type) { COMMAND__TYPES(ENCODE, ) }; return rc; } #define DECODE(LOWER, UPPER, _) \ case COMMAND_##UPPER: \ *command = raft_malloc(sizeof(struct command_##LOWER)); \ if (*command == NULL) { \ return DQLITE_NOMEM; \ } \ rc = command_##LOWER##__decode(&cursor, *command); \ break; int command__decode(const struct raft_buffer *buf, int *type, void **command) { struct header h; struct cursor cursor; int rc; cursor.p = buf->base; cursor.cap = buf->len; rc = header__decode(&cursor, &h); if (rc != 0) { return rc; } if (h.format != FORMAT) { return DQLITE_PROTO; } switch (h.type) { COMMAND__TYPES(DECODE, ) default: rc = DQLITE_PROTO; break; }; if (rc != 0) { return rc; } *type = h.type; return 0; } int command_frames__page_numbers(const struct command_frames *c, unsigned long *page_numbers[]) { unsigned i; struct cursor cursor; cursor.p = c->frames.data; cursor.cap = sizeof(uint64_t) * c->frames.n_pages; *page_numbers = sqlite3_malloc64(sizeof **page_numbers * c->frames.n_pages); if (*page_numbers == NULL) { return DQLITE_NOMEM; } for (i = 0; i < c->frames.n_pages; i++) { uint64_t pgno; int r = uint64__decode(&cursor, &pgno); if (r != 0) { return r; } (*page_numbers)[i] = (unsigned long)pgno; } return 0; } void command_frames__pages(const struct command_frames *c, void **pages) { *pages = (void *)(c->frames.data + (sizeof(uint64_t) * c->frames.n_pages)); } dqlite-1.16.7/src/command.h000066400000000000000000000042771465252713400154710ustar00rootroot00000000000000/** * Encode and decode dqlite 
Raft FSM commands. */ #ifndef COMMAND_H_ #define COMMAND_H_ #include "../include/dqlite.h" #include "lib/serialize.h" #include "raft.h" /* Command type codes */ enum { COMMAND_OPEN = 1, COMMAND_FRAMES, COMMAND_UNDO, COMMAND_CHECKPOINT }; /* Hold information about an array of WAL frames. */ struct frames { uint32_t n_pages; uint16_t page_size; uint16_t __unused__; /* TODO: because the sqlite3 replication APIs are asymmetrics, the * format differs between encode and decode. When encoding data is * expected to be a sqlite3_wal_replication_frame* array, and when * decoding it will be a pointer to raw memory which can be further * decoded with the command_frames__page_numbers() and * command_frames__pages() helpers. */ const void *data; }; typedef struct frames frames_t; /* Serialization definitions for a raft FSM command. */ #define COMMAND__DEFINE(LOWER, UPPER, _) \ SERIALIZE__DEFINE_STRUCT(command_##LOWER, COMMAND__##UPPER); #define COMMAND__OPEN(X, ...) X(text, filename, ##__VA_ARGS__) #define COMMAND__FRAMES(X, ...) \ X(text, filename, ##__VA_ARGS__) \ X(uint64, tx_id, ##__VA_ARGS__) \ X(uint32, truncate, ##__VA_ARGS__) \ X(uint8, is_commit, ##__VA_ARGS__) \ X(uint8, __unused1__, ##__VA_ARGS__) \ X(uint16, __unused2__, ##__VA_ARGS__) \ X(frames, frames, ##__VA_ARGS__) #define COMMAND__UNDO(X, ...) X(uint64, tx_id, ##__VA_ARGS__) #define COMMAND__CHECKPOINT(X, ...) X(text, filename, ##__VA_ARGS__) #define COMMAND__TYPES(X, ...) \ X(open, OPEN, __VA_ARGS__) \ X(frames, FRAMES, __VA_ARGS__) \ X(undo, UNDO, __VA_ARGS__) \ X(checkpoint, CHECKPOINT, __VA_ARGS__) COMMAND__TYPES(COMMAND__DEFINE); DQLITE_VISIBLE_TO_TESTS int command__encode(int type, const void *command, struct raft_buffer *buf); DQLITE_VISIBLE_TO_TESTS int command__decode(const struct raft_buffer *buf, int *type, void **command); DQLITE_VISIBLE_TO_TESTS int command_frames__page_numbers( const struct command_frames *c, unsigned long *page_numbers[]); DQLITE_VISIBLE_TO_TESTS void command_frames__pages( const struct command_frames *c, void **pages); #endif /* COMMAND_H_*/ dqlite-1.16.7/src/config.c000066400000000000000000000032221465252713400153000ustar00rootroot00000000000000#include #include #include #include "../include/dqlite.h" #include "./lib/assert.h" #include "config.h" #include "logger.h" /* Default heartbeat timeout in milliseconds. * * Clients will be disconnected if the server does not receive a heartbeat * message within this time. */ #define DEFAULT_HEARTBEAT_TIMEOUT 15000 /* Default database page size in bytes. */ #define DEFAULT_PAGE_SIZE 4096 /* Number of outstanding WAL frames after which a checkpoint is triggered as * soon as possible. */ #define DEFAULT_CHECKPOINT_THRESHOLD 1000 /* For generating unique replication/VFS registration names. * * TODO: make this thread safe. */ static unsigned serial = 1; int config__init(struct config *c, dqlite_node_id id, const char *address, const char *raft_dir, const char *database_dir) { int rv; c->id = id; c->address = sqlite3_malloc((int)strlen(address) + 1); if (c->address == NULL) { return DQLITE_NOMEM; } strcpy(c->address, address); c->heartbeat_timeout = DEFAULT_HEARTBEAT_TIMEOUT; c->page_size = DEFAULT_PAGE_SIZE; c->checkpoint_threshold = DEFAULT_CHECKPOINT_THRESHOLD; rv = snprintf(c->name, sizeof c->name, "dqlite-%u", serial); assert(rv < (int)(sizeof c->name)); c->logger.data = NULL; c->logger.emit = loggerDefaultEmit; c->failure_domain = 0; c->weight = 0; snprintf(c->raft_dir, sizeof(c->raft_dir), "%s", (raft_dir != NULL) ? 
raft_dir : ""); snprintf(c->database_dir, sizeof(c->database_dir), "%s", database_dir); c->disk = false; c->voters = 3; c->standbys = 0; c->pool_thread_count = 4; serial++; return 0; } void config__close(struct config *c) { sqlite3_free(c->address); } dqlite-1.16.7/src/config.h000066400000000000000000000027201465252713400153070ustar00rootroot00000000000000#ifndef CONFIG_H_ #define CONFIG_H_ #include "logger.h" /** * Value object holding dqlite configuration. */ struct config { dqlite_node_id id; /* Unique instance ID */ char *address; /* Instance address */ unsigned heartbeat_timeout; /* In milliseconds */ unsigned page_size; /* Database page size */ unsigned checkpoint_threshold; /* In outstanding WAL frames */ struct logger logger; /* Custom logger */ char name[256]; /* VFS/replication registriatio name */ unsigned long long failure_domain; /* User-provided failure domain */ unsigned long long int weight; /* User-provided node weight */ char raft_dir[1024]; /* Directory used by raft */ char database_dir[1024]; /* Data dir for on-disk database */ bool disk; /* Disk-mode or not */ int voters; /* Target number of voters */ int standbys; /* Target number of standbys */ unsigned pool_thread_count; /* Number of threads in thread pool */ }; /** * Initialize the config object with required values and set the rest to sane * defaults. A copy will be made of the given @address. */ int config__init(struct config *c, dqlite_node_id id, const char *address, const char *raft_dir, const char *database_dir); /** * Release any memory held by the config object. */ void config__close(struct config *c); #endif /* DQLITE_OPTIONS_H */ dqlite-1.16.7/src/conn.c000066400000000000000000000170371465252713400150010ustar00rootroot00000000000000#include "conn.h" #include "message.h" #include "protocol.h" #include "request.h" #include "tracing.h" #include "transport.h" #include /* Initialize the given buffer for reading, ensure it has the given size. */ static int init_read(struct conn *c, uv_buf_t *buf, size_t size) { buffer__reset(&c->read); buf->base = buffer__advance(&c->read, size); if (buf->base == NULL) { return DQLITE_NOMEM; } buf->len = size; return 0; } static int read_message(struct conn *c); static void conn_write_cb(struct transport *transport, int status) { struct conn *c = transport->data; bool finished; int rv; if (status != 0) { tracef("write cb status %d", status); goto abort; } buffer__reset(&c->write); buffer__advance(&c->write, message__sizeof(&c->response)); /* Header */ rv = gateway__resume(&c->gateway, &finished); if (rv != 0) { goto abort; } if (!finished) { return; } /* Start reading the next request */ rv = read_message(c); if (rv != 0) { goto abort; } return; abort: conn__stop(c); } static void gateway_handle_cb(struct handle *req, int status, uint8_t type, uint8_t schema) { struct conn *c = req->data; size_t n; char *cursor; uv_buf_t buf; int rv; assert(schema <= req->schema); /* Ignore results firing after we started closing. TODO: instead, we * should make gateway__close() asynchronous. 
*/ if (c->closed) { tracef("gateway handle cb closed"); return; } if (status != 0) { tracef("gateway handle cb status %d", status); goto abort; } n = buffer__offset(&c->write) - message__sizeof(&c->response); assert(n % 8 == 0); c->response.type = type; c->response.words = (uint32_t)(n / 8); c->response.schema = schema; c->response.extra = 0; cursor = buffer__cursor(&c->write, 0); message__encode(&c->response, &cursor); buf.base = buffer__cursor(&c->write, 0); buf.len = buffer__offset(&c->write); rv = transport__write(&c->transport, &buf, conn_write_cb); if (rv != 0) { tracef("transport write failed %d", rv); goto abort; } return; abort: conn__stop(c); } static void closeCb(struct transport *transport) { struct conn *c = transport->data; buffer__close(&c->write); buffer__close(&c->read); if (c->close_cb != NULL) { c->close_cb(c); } } static void raft_connect(struct conn *c) { struct cursor *cursor = &c->handle.cursor; struct request_connect request; int rv; tracef("raft_connect"); rv = request_connect__decode(cursor, &request); if (rv != 0) { tracef("request connect decode failed %d", rv); conn__stop(c); return; } raftProxyAccept(c->uv_transport, request.id, request.address, c->transport.stream); /* Close the connection without actually closing the transport, since * the stream will be used by raft */ c->closed = true; closeCb(&c->transport); } static void read_request_cb(struct transport *transport, int status) { struct conn *c = transport->data; struct cursor *cursor = &c->handle.cursor; int rv; if (status != 0) { tracef("read error %d", status); // errorf(c->logger, "read error"); conn__stop(c); return; } cursor->p = buffer__cursor(&c->read, 0); cursor->cap = buffer__offset(&c->read); buffer__reset(&c->write); buffer__advance(&c->write, message__sizeof(&c->response)); /* Header */ switch (c->request.type) { case DQLITE_REQUEST_CONNECT: raft_connect(c); return; } rv = gateway__handle(&c->gateway, &c->handle, c->request.type, c->request.schema, &c->write, gateway_handle_cb); if (rv != 0) { tracef("read gateway handle error %d", rv); conn__stop(c); } } /* Start reading the body of the next request */ static int read_request(struct conn *c) { uv_buf_t buf; int rv; if (UINT64_C(8) * (uint64_t)c->request.words > (uint64_t)UINT32_MAX) { return DQLITE_ERROR; } rv = init_read(c, &buf, c->request.words * 8); if (rv != 0) { tracef("init read failed %d", rv); return rv; } if (c->request.words == 0) { return 0; } rv = transport__read(&c->transport, &buf, read_request_cb); if (rv != 0) { tracef("transport read failed %d", rv); return rv; } return 0; } static void read_message_cb(struct transport *transport, int status) { struct conn *c = transport->data; struct cursor cursor; int rv; if (status != 0) { // errorf(c->logger, "read error"); tracef("read error %d", status); conn__stop(c); return; } cursor.p = buffer__cursor(&c->read, 0); cursor.cap = buffer__offset(&c->read); rv = message__decode(&cursor, &c->request); assert(rv == 0); /* Can't fail, we know we have enough bytes */ rv = read_request(c); if (rv != 0) { tracef("read request error %d", rv); conn__stop(c); return; } } /* Start reading metadata about the next message */ static int read_message(struct conn *c) { uv_buf_t buf; int rv; rv = init_read(c, &buf, message__sizeof(&c->request)); if (rv != 0) { tracef("init read failed %d", rv); return rv; } rv = transport__read(&c->transport, &buf, read_message_cb); if (rv != 0) { tracef("transport read failed %d", rv); return rv; } return 0; } static void read_protocol_cb(struct transport *transport, 
int status) { struct conn *c = transport->data; struct cursor cursor; int rv; if (status != 0) { // errorf(c->logger, "read error"); tracef("read error %d", status); goto abort; } cursor.p = buffer__cursor(&c->read, 0); cursor.cap = buffer__offset(&c->read); rv = uint64__decode(&cursor, &c->protocol); assert(rv == 0); /* Can't fail, we know we have enough bytes */ if (c->protocol != DQLITE_PROTOCOL_VERSION && c->protocol != DQLITE_PROTOCOL_VERSION_LEGACY) { /* errorf(c->logger, "unknown protocol version: %lx", */ /* c->protocol); */ /* TODO: instead of closing the connection we should return * error messages */ tracef("unknown protocol version %" PRIu64, c->protocol); goto abort; } c->gateway.protocol = c->protocol; rv = read_message(c); if (rv != 0) { goto abort; } return; abort: conn__stop(c); } /* Start reading the protocol format version */ static int read_protocol(struct conn *c) { uv_buf_t buf; int rv; rv = init_read(c, &buf, sizeof c->protocol); if (rv != 0) { tracef("init read failed %d", rv); return rv; } rv = transport__read(&c->transport, &buf, read_protocol_cb); if (rv != 0) { tracef("transport read failed %d", rv); return rv; } return 0; } int conn__start(struct conn *c, struct config *config, struct uv_loop_s *loop, struct registry *registry, struct raft *raft, struct uv_stream_s *stream, struct raft_uv_transport *uv_transport, struct id_state seed, conn_close_cb close_cb) { int rv; (void)loop; tracef("conn start"); rv = transport__init(&c->transport, stream); if (rv != 0) { tracef("conn start - transport init failed %d", rv); goto err; } c->config = config; c->transport.data = c; c->uv_transport = uv_transport; c->close_cb = close_cb; gateway__init(&c->gateway, config, registry, raft, seed); rv = buffer__init(&c->read); if (rv != 0) { goto err_after_transport_init; } rv = buffer__init(&c->write); if (rv != 0) { goto err_after_read_buffer_init; } c->handle.data = c; c->closed = false; /* First, we expect the client to send us the protocol version. */ rv = read_protocol(c); if (rv != 0) { goto err_after_write_buffer_init; } return 0; err_after_write_buffer_init: buffer__close(&c->write); err_after_read_buffer_init: buffer__close(&c->read); err_after_transport_init: transport__close(&c->transport, NULL); err: return rv; } void conn__stop(struct conn *c) { tracef("conn stop"); if (c->closed) { return; } c->closed = true; gateway__close(&c->gateway); transport__close(&c->transport, closeCb); } dqlite-1.16.7/src/conn.h000066400000000000000000000032161465252713400150000ustar00rootroot00000000000000/** * Handle a single client connection. */ #ifndef DQLITE_CONN_H_ #define DQLITE_CONN_H_ #include "lib/buffer.h" #include "lib/queue.h" #include "lib/transport.h" #include "gateway.h" #include "id.h" #include "message.h" #include "raft.h" /** * Callbacks. */ struct conn; typedef void (*conn_close_cb)(struct conn *c); struct conn { struct config *config; struct raft_uv_transport *uv_transport; /* Raft transport */ conn_close_cb close_cb; /* Close callback */ struct transport transport; /* Async network read/write */ struct gateway gateway; /* Request handler */ struct buffer read; /* Read buffer */ struct buffer write; /* Write buffer */ uint64_t protocol; /* Protocol format version */ struct message request; /* Request message meta data */ struct message response; /* Response message meta data */ struct handle handle; bool closed; queue queue; }; /** * Initialize and start a connection. * * If no error is returned, the connection should be considered started. 
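 * From that point on, the connection drives itself through libuv
 * read/write callbacks.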
Any * error occurring after this point will trigger the @close_cb callback. */ int conn__start(struct conn *c, struct config *config, struct uv_loop_s *loop, struct registry *registry, struct raft *raft, struct uv_stream_s *stream, struct raft_uv_transport *uv_transport, struct id_state seed, conn_close_cb close_cb); /** * Force closing the connection. The close callback will be invoked when it's * safe to release the memory of the connection object. */ void conn__stop(struct conn *c); #endif /* DQLITE_CONN_H_ */ dqlite-1.16.7/src/db.c000066400000000000000000000073341465252713400144300ustar00rootroot00000000000000#include #include #include "../include/dqlite.h" #include "./lib/assert.h" #include "db.h" #include "tracing.h" /* Limit taken from sqlite unix vfs. */ #define MAX_PATHNAME 512 /* Open a SQLite connection and set it to follower mode. */ static int open_follower_conn(const char *filename, const char *vfs, unsigned page_size, sqlite3 **conn); static uint32_t str_hash(const char* name) { const unsigned char *p; uint32_t h = 5381U; for (p = (const unsigned char *) name; *p != '\0'; p++) { h = (h << 5) + h + *p; } return h; } int db__init(struct db *db, struct config *config, const char *filename) { tracef("db init filename=`%s'", filename); int rv; db->config = config; db->cookie = str_hash(filename); db->filename = sqlite3_malloc((int)(strlen(filename) + 1)); if (db->filename == NULL) { rv = DQLITE_NOMEM; goto err; } strcpy(db->filename, filename); db->path = sqlite3_malloc(MAX_PATHNAME + 1); if (db->path == NULL) { rv = DQLITE_NOMEM; goto err_after_filename_alloc; } if (db->config->disk) { rv = snprintf(db->path, MAX_PATHNAME + 1, "%s/%s", db->config->database_dir, db->filename); } else { rv = snprintf(db->path, MAX_PATHNAME + 1, "%s", db->filename); } if (rv < 0 || rv >= MAX_PATHNAME + 1) { goto err_after_path_alloc; } db->follower = NULL; db->tx_id = 0; db->read_lock = 0; queue_init(&db->leaders); return 0; err_after_path_alloc: sqlite3_free(db->path); err_after_filename_alloc: sqlite3_free(db->filename); err: return rv; } void db__close(struct db *db) { assert(queue_empty(&db->leaders)); if (db->follower != NULL) { int rc; rc = sqlite3_close(db->follower); assert(rc == SQLITE_OK); } sqlite3_free(db->path); sqlite3_free(db->filename); } int db__open_follower(struct db *db) { int rc; assert(db->follower == NULL); rc = open_follower_conn(db->path, db->config->name, db->config->page_size, &db->follower); return rc; } static int open_follower_conn(const char *filename, const char *vfs, unsigned page_size, sqlite3 **conn) { char pragma[255]; int flags = SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE; char *msg = NULL; int rc; tracef("open follower conn: %s page_size:%u", filename, page_size); rc = sqlite3_open_v2(filename, conn, flags, vfs); if (rc != SQLITE_OK) { tracef("open_v2 failed %d", rc); goto err; } /* Enable extended result codes */ rc = sqlite3_extended_result_codes(*conn, 1); if (rc != SQLITE_OK) { goto err; } /* The vfs, db, gateway, and leader code currently assumes that * each connection will operate on only one DB file/WAL file * pair. Make sure that the client can't use ATTACH DATABASE to * break this assumption. We apply the same limit in openConnection * in leader.c. * * Note, 0 instead of 1 -- apparently the "initial database" is not * counted when evaluating this limit. */ sqlite3_limit(*conn, SQLITE_LIMIT_ATTACHED, 0); /* Set the page size. 
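 * Every connection must use the same value, which comes from
 * db->config->page_size: replicated FRAMES commands carry whole pages and
 * record this page size (see struct frames in command.h).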
*/ sprintf(pragma, "PRAGMA page_size=%d", page_size); rc = sqlite3_exec(*conn, pragma, NULL, NULL, &msg); if (rc != SQLITE_OK) { tracef("page_size=%d failed", page_size); goto err; } /* Disable syncs. */ rc = sqlite3_exec(*conn, "PRAGMA synchronous=OFF", NULL, NULL, &msg); if (rc != SQLITE_OK) { tracef("synchronous=OFF failed"); goto err; } /* Set WAL journaling. */ rc = sqlite3_exec(*conn, "PRAGMA journal_mode=WAL", NULL, NULL, &msg); if (rc != SQLITE_OK) { tracef("journal_mode=WAL failed"); goto err; } rc = sqlite3_db_config(*conn, SQLITE_DBCONFIG_NO_CKPT_ON_CLOSE, 1, NULL); if (rc != SQLITE_OK) { goto err; } return 0; err: if (*conn != NULL) { sqlite3_close(*conn); *conn = NULL; } if (msg != NULL) { sqlite3_free(msg); } return rc; } dqlite-1.16.7/src/db.h000066400000000000000000000021711465252713400144270ustar00rootroot00000000000000/** * State of a single database. */ #ifndef DB_H_ #define DB_H_ #include #include "lib/queue.h" #include "config.h" struct db { struct config *config; /* Dqlite configuration */ char *filename; /* Database filename */ char *path; /* Used for on-disk db */ uint32_t cookie; /* Used to bind to the pool's thread */ sqlite3 *follower; /* Follower connection */ queue leaders; /* Open leader connections */ unsigned tx_id; /* Current ongoing transaction ID, if any */ queue queue; /* Prev/next database, used by the registry */ int read_lock; /* Lock used by snapshots & checkpoints */ }; /** * Initialize a database object. * * The given @filename will be copied. * Return 0 on success. */ int db__init(struct db *db, struct config *config, const char *filename); /** * Release all memory associated with a database object. * * If the follower connection was opened, it will be closed. */ void db__close(struct db *db); /** * Open the follower connection associated with this database. 
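 *
 * Return 0 on success.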
*/ int db__open_follower(struct db *db); #endif /* DB_H_*/ dqlite-1.16.7/src/dqlite.c000066400000000000000000000037051465252713400153230ustar00rootroot00000000000000#include "../include/dqlite.h" #include "vfs.h" int dqlite_version_number(void) { return DQLITE_VERSION_NUMBER; } int dqlite_vfs_init(sqlite3_vfs *vfs, const char *name) { return VfsInit(vfs, name); } int dqlite_vfs_enable_disk(sqlite3_vfs *vfs) { return VfsEnableDisk(vfs); } void dqlite_vfs_close(sqlite3_vfs *vfs) { VfsClose(vfs); } int dqlite_vfs_poll(sqlite3_vfs *vfs, const char *filename, dqlite_vfs_frame **frames, unsigned *n) { return VfsPoll(vfs, filename, frames, n); } int dqlite_vfs_apply(sqlite3_vfs *vfs, const char *filename, unsigned n, unsigned long *page_numbers, void *frames) { return VfsApply(vfs, filename, n, page_numbers, frames); } int dqlite_vfs_abort(sqlite3_vfs *vfs, const char *filename) { return VfsAbort(vfs, filename); } int dqlite_vfs_snapshot(sqlite3_vfs *vfs, const char *filename, void **data, size_t *n) { return VfsSnapshot(vfs, filename, data, n); } int dqlite_vfs_snapshot_disk(sqlite3_vfs *vfs, const char *filename, struct dqlite_buffer bufs[], unsigned n) { int rv; if (n != 2) { return -1; } rv = VfsDiskSnapshotDb(vfs, filename, &bufs[0]); if (rv != 0) { return rv; } rv = VfsDiskSnapshotWal(vfs, filename, &bufs[1]); return rv; } int dqlite_vfs_num_pages(sqlite3_vfs *vfs, const char *filename, unsigned *n) { return VfsDatabaseNumPages(vfs, filename, n); } int dqlite_vfs_shallow_snapshot(sqlite3_vfs *vfs, const char *filename, struct dqlite_buffer bufs[], unsigned n) { return VfsShallowSnapshot(vfs, filename, bufs, n); } int dqlite_vfs_restore(sqlite3_vfs *vfs, const char *filename, const void *data, size_t n) { return VfsRestore(vfs, filename, data, n); } int dqlite_vfs_restore_disk(sqlite3_vfs *vfs, const char *filename, const void *data, size_t main_size, size_t wal_size) { return VfsDiskRestore(vfs, filename, data, main_size, wal_size); } dqlite-1.16.7/src/error.c000066400000000000000000000065701465252713400151750ustar00rootroot00000000000000#include #include #include #include #include #include #include "../include/dqlite.h" #include "./lib/assert.h" #include "error.h" /* Fallback message returned when failing to allocate the error message * itself. */ static char *dqlite__error_oom_msg = "error message unavailable (out of memory)"; void dqlite__error_init(dqlite__error *e) { *e = NULL; } void dqlite__error_close(dqlite__error *e) { if (*e != NULL && *e != dqlite__error_oom_msg) { sqlite3_free(*e); } } /* Set an error message by rendering the given format against the given * parameters. * * Any previously set error message will be cleared. */ static void dqlite__error_vprintf(dqlite__error *e, const char *fmt, va_list args) { assert(fmt != NULL); /* If a previous error was set (other than the hard-coded OOM fallback * fallback), let's free it. */ if (*e != NULL && *e != dqlite__error_oom_msg) { sqlite3_free(*e); } /* Render the message. In case of error we fallback to the hard-coded * OOM fallback message. */ *e = sqlite3_vmprintf(fmt, args); if (*e == NULL) { *e = dqlite__error_oom_msg; } } void dqlite__error_printf(dqlite__error *e, const char *fmt, ...) { va_list args; va_start(args, fmt); dqlite__error_vprintf(e, fmt, args); va_end(args); } static void dqlite__error_vwrapf(dqlite__error *e, const char *cause, const char *fmt, va_list args) { dqlite__error tmp; char *msg; /* First, print the format and arguments, using a temporary error. 
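 * The rendered text is then combined with the cause into a single
 * "message: cause" string.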
*/ dqlite__error_init(&tmp); dqlite__error_vprintf(&tmp, fmt, args); if (cause == NULL) { /* Special case the cause error being empty. */ dqlite__error_printf(e, "%s: (null)", tmp); } else if (cause == *e) { /* When the error is wrapping itself, we need to make a copy */ dqlite__error_copy(e, &msg); dqlite__error_printf(e, "%s: %s", tmp, msg); sqlite3_free(msg); } else { dqlite__error_printf(e, "%s: %s", tmp, cause); } dqlite__error_close(&tmp); } void dqlite__error_wrapf(dqlite__error *e, const dqlite__error *cause, const char *fmt, ...) { va_list args; va_start(args, fmt); dqlite__error_vwrapf(e, (const char *)(*cause), fmt, args); va_end(args); } void dqlite__error_oom(dqlite__error *e, const char *msg, ...) { va_list args; va_start(args, msg); dqlite__error_vwrapf(e, "out of memory", msg, args); va_end(args); } void dqlite__error_sys(dqlite__error *e, const char *msg) { dqlite__error_printf(e, "%s: %s", msg, strerror(errno)); } void dqlite__error_uv(dqlite__error *e, int err, const char *msg) { dqlite__error_printf(e, "%s: %s (%s)", msg, uv_strerror(err), uv_err_name(err)); } int dqlite__error_copy(dqlite__error *e, char **msg) { char *copy; size_t len; assert(e != NULL); assert(msg != NULL); /* Trying to copy an empty error message is an error. */ if (*e == NULL) { *msg = NULL; return DQLITE_ERROR; } len = strlen(*e) + 1; copy = sqlite3_malloc((int)(len * sizeof *copy)); if (copy == NULL) { *msg = NULL; return DQLITE_NOMEM; } memcpy(copy, *e, len); *msg = copy; return 0; } int dqlite__error_is_null(dqlite__error *e) { return *e == NULL; } int dqlite__error_is_disconnect(dqlite__error *e) { if (*e == NULL) return 0; if (strstr(*e, uv_err_name(UV_EOF)) != NULL) return 1; if (strstr(*e, uv_err_name(UV_ECONNRESET)) != NULL) return 1; return 0; } dqlite-1.16.7/src/error.h000066400000000000000000000024461465252713400152000ustar00rootroot00000000000000#ifndef DQLITE_ERROR_H #define DQLITE_ERROR_H #include #include /* A message describing the last error occurred on an object */ typedef char *dqlite__error; /* Initialize the error with an empty message */ void dqlite__error_init(dqlite__error *e); /* Release the memory of the error message, if any is set */ void dqlite__error_close(dqlite__error *e); /* Set the error message */ void dqlite__error_printf(dqlite__error *e, const char *fmt, ...); /* Wrap an error with an additional message */ void dqlite__error_wrapf(dqlite__error *e, const dqlite__error *cause, const char *fmt, ...); /* Out of memory error */ void dqlite__error_oom(dqlite__error *e, const char *msg, ...); /* Wrap a system error */ void dqlite__error_sys(dqlite__error *e, const char *msg); /* Wrap an error from libuv */ void dqlite__error_uv(dqlite__error *e, int err, const char *msg); /* Copy the underlying error message. * * Client code is responsible of invoking sqlite3_free to deallocate the * returned string. */ int dqlite__error_copy(dqlite__error *e, char **msg); /* Whether the error is not set */ int dqlite__error_is_null(dqlite__error *e); /* Whether the error is due to client disconnection */ int dqlite__error_is_disconnect(dqlite__error *e); #endif /* DQLITE_ERROR_H */ dqlite-1.16.7/src/format.c000066400000000000000000000067531465252713400153370ustar00rootroot00000000000000#include #include #include #include #include "./lib/assert.h" #include "format.h" /* tinycc doesn't have this builtin, nor the warning that it's meant to silence. */ #ifdef __TINYC__ #define __builtin_assume_aligned(x, y) x #endif /* WAL magic value. 
Either this value, or the same value with the least * significant bit also set (FORMAT__WAL_MAGIC | 0x00000001) is stored in 32-bit * big-endian format in the first 4 bytes of a WAL file. * * If the LSB is set, then the checksums for each frame within the WAL file are * calculated by treating all data as an array of 32-bit big-endian * words. Otherwise, they are calculated by interpreting all data as 32-bit * little-endian words. */ #define FORMAT__WAL_MAGIC 0x377f0682 #define FORMAT__WAL_MAX_VERSION 3007000 static void formatGet32(const uint8_t buf[4], uint32_t *v) { *v = 0; *v += (uint32_t)(buf[0] << 24); *v += (uint32_t)(buf[1] << 16); *v += (uint32_t)(buf[2] << 8); *v += (uint32_t)(buf[3]); } /* Encode a 32-bit number to big endian format */ static void formatPut32(uint32_t v, uint8_t *buf) { buf[0] = (uint8_t)(v >> 24); buf[1] = (uint8_t)(v >> 16); buf[2] = (uint8_t)(v >> 8); buf[3] = (uint8_t)v; } /* * Generate or extend an 8 byte checksum based on the data in array data[] and * the initial values of in[0] and in[1] (or initial values of 0 and 0 if * in==NULL). * * The checksum is written back into out[] before returning. * * n must be a positive multiple of 8. */ static void formatWalChecksumBytes( bool native, /* True for native byte-order, false for non-native */ uint8_t *data, /* Content to be checksummed */ unsigned n, /* Bytes of content in a[]. Must be a multiple of 8. */ const uint32_t in[2], /* Initial checksum value input */ uint32_t out[2] /* OUT: Final checksum value output */ ) { uint32_t s1, s2; /* `data` is an alias for the `hdr` member of a `struct vfsWal`. `hdr` * is the first member of this struct. Because `struct vfsWal` contains * pointer members, the struct itself will have the alignment of the * pointer members. As `hdr` is the first member, it will have this * alignment too. Therefore it is safe to assume pointer alignment (and * silence the compiler). more info -> * http://www.catb.org/esr/structure-packing/ */ uint32_t *cur = (uint32_t *)__builtin_assume_aligned(data, sizeof(void *)); uint32_t *end = (uint32_t *)__builtin_assume_aligned(&data[n], sizeof(void *)); if (in) { s1 = in[0]; s2 = in[1]; } else { s1 = s2 = 0; } assert(n >= 8); assert((n & 0x00000007) == 0); assert(n <= 65536); if (native) { do { s1 += *cur++ + s2; s2 += *cur++ + s1; } while (cur < end); } else { do { uint32_t d; formatPut32(cur[0], (uint8_t *)&d); s1 += d + s2; formatPut32(cur[1], (uint8_t *)&d); s2 += d + s1; cur += 2; } while (cur < end); } out[0] = s1; out[1] = s2; } void formatWalRestartHeader(uint8_t *header) { uint32_t checksum[2] = {0, 0}; uint32_t checkpoint; uint32_t salt1; /* Increase the checkpoint sequence. */ formatGet32(&header[12], &checkpoint); checkpoint++; formatPut32(checkpoint, &header[12]); /* Increase salt1. */ formatGet32(&header[16], &salt1); salt1++; formatPut32(salt1, &header[16]); /* Generate a random salt2. */ sqlite3_randomness(4, &header[20]); /* Update the checksum. */ formatWalChecksumBytes(true, header, 24, checksum, checksum); formatPut32(checksum[0], header + 24); formatPut32(checksum[1], header + 28); } dqlite-1.16.7/src/format.h000066400000000000000000000024361465252713400153360ustar00rootroot00000000000000/* Utilities around SQLite file formats. * * See https://sqlite.org/fileformat.html. */ #ifndef FORMAT_H_ #define FORMAT_H_ #include #include /* Minumum and maximum page size. */ #define FORMAT__PAGE_SIZE_MIN 512 #define FORMAT__PAGE_SIZE_MAX 65536 /* Database header size. */ #define FORMAT__DB_HDR_SIZE 100 /* Write ahead log header size. 
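 * The header consists of eight 32-bit big-endian fields (magic, version,
 * page size, checkpoint sequence, salt-1, salt-2 and a two-word
 * checksum), 32 bytes in total; formatWalRestartHeader() in format.c
 * relies on these offsets.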
*/ #define FORMAT__WAL_HDR_SIZE 32 /* Write ahead log frame header size. */ #define FORMAT__WAL_FRAME_HDR_SIZE 24 /* Number of reader marks in the wal index header. */ #define FORMAT__WAL_NREADER 5 /* Given the page size, calculate the size of a full WAL frame (frame header * plus page data). */ #define formatWalCalcFrameSize(PAGE_SIZE) \ (FORMAT__WAL_FRAME_HDR_SIZE + PAGE_SIZE) /* Given the page size and the WAL file size, calculate the number of frames it * has. */ #define formatWalCalcFramesNumber(PAGE_SIZE, SIZE) \ ((SIZE - FORMAT__WAL_HDR_SIZE) / formatWalCalcFrameSize(PAGE_SIZE)) /* Given the page size, calculate the WAL page number of the frame starting at * the given offset. */ #define formatWalCalcFrameIndex(PAGE_SIZE, OFFSET) \ (formatWalCalcFramesNumber(PAGE_SIZE, OFFSET) + 1) /* Restart the header of a WAL file after a checkpoint. */ void formatWalRestartHeader(uint8_t *header); #endif /* FORMAT_H */ dqlite-1.16.7/src/fsm.c000066400000000000000000000636431465252713400146350ustar00rootroot00000000000000#include "lib/assert.h" #include "lib/serialize.h" #include "command.h" #include "fsm.h" #include "raft.h" #include "tracing.h" #include "vfs.h" #include struct fsm { struct logger *logger; struct registry *registry; struct { unsigned n_pages; unsigned long *page_numbers; uint8_t *pages; } pending; /* For upgrades from V1 */ }; static int apply_open(struct fsm *f, const struct command_open *c) { tracef("fsm apply open"); (void)f; (void)c; return 0; } static int add_pending_pages(struct fsm *f, unsigned long *page_numbers, uint8_t *pages, unsigned n_pages, unsigned page_size) { unsigned n = f->pending.n_pages + n_pages; unsigned i; f->pending.page_numbers = sqlite3_realloc64( f->pending.page_numbers, n * sizeof *f->pending.page_numbers); if (f->pending.page_numbers == NULL) { return DQLITE_NOMEM; } f->pending.pages = sqlite3_realloc64(f->pending.pages, n * page_size); if (f->pending.pages == NULL) { return DQLITE_NOMEM; } for (i = 0; i < n_pages; i++) { unsigned j = f->pending.n_pages + i; f->pending.page_numbers[j] = page_numbers[i]; memcpy(f->pending.pages + j * page_size, (uint8_t *)pages + i * page_size, page_size); } f->pending.n_pages = n; return 0; } static int databaseReadLock(struct db *db) { if (!db->read_lock) { db->read_lock = 1; return 0; } else { return -1; } } static int databaseReadUnlock(struct db *db) { if (db->read_lock) { db->read_lock = 0; return 0; } else { return -1; } } static void maybeCheckpoint(struct db *db) { tracef("maybe checkpoint"); struct sqlite3_file *main_f; struct sqlite3_file *wal; volatile void *region; sqlite3_int64 size; unsigned page_size; unsigned pages; int wal_size; int ckpt; int i; int rv; /* Don't run when a snapshot is busy. Running a checkpoint while a * snapshot is busy will result in illegal memory accesses by the * routines that try to access database page pointers contained in the * snapshot. */ rv = databaseReadLock(db); if (rv != 0) { tracef("busy snapshot %d", rv); return; } assert(db->follower == NULL); rv = db__open_follower(db); if (rv != 0) { tracef("open follower failed %d", rv); goto err_after_db_lock; } page_size = db->config->page_size; /* Get the database wal file associated with this connection */ rv = sqlite3_file_control(db->follower, "main", SQLITE_FCNTL_JOURNAL_POINTER, &wal); assert(rv == SQLITE_OK); /* Should never fail */ rv = wal->pMethods->xFileSize(wal, &size); assert(rv == SQLITE_OK); /* Should never fail */ /* Calculate the number of frames. 
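 * This open-codes formatWalCalcFramesNumber() from format.h: strip the
 * 32-byte WAL header, then divide by the size of one frame (a 24-byte
 * frame header plus one page).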
*/ pages = (unsigned)((size - 32) / (24 + page_size)); /* Check if the size of the WAL is beyond the threshold. */ if (pages < db->config->checkpoint_threshold) { tracef("wal size (%u) < threshold (%u)", pages, db->config->checkpoint_threshold); goto err_after_db_open; } /* Get the database file associated with this db->follower connection */ rv = sqlite3_file_control(db->follower, "main", SQLITE_FCNTL_FILE_POINTER, &main_f); assert(rv == SQLITE_OK); /* Should never fail */ /* Get the first SHM region, which contains the WAL header. */ rv = main_f->pMethods->xShmMap(main_f, 0, 0, 0, ®ion); assert(rv == SQLITE_OK); /* Should never fail */ rv = main_f->pMethods->xShmUnmap(main_f, 0); assert(rv == SQLITE_OK); /* Should never fail */ /* Try to acquire all locks. */ for (i = 0; i < SQLITE_SHM_NLOCK; i++) { int flags = SQLITE_SHM_LOCK | SQLITE_SHM_EXCLUSIVE; rv = main_f->pMethods->xShmLock(main_f, i, 1, flags); if (rv == SQLITE_BUSY) { tracef("busy reader or writer - retry next time"); goto err_after_db_open; } /* Not locked. Let's release the lock we just * acquired. */ flags = SQLITE_SHM_UNLOCK | SQLITE_SHM_EXCLUSIVE; main_f->pMethods->xShmLock(main_f, i, 1, flags); } rv = sqlite3_wal_checkpoint_v2( db->follower, "main", SQLITE_CHECKPOINT_TRUNCATE, &wal_size, &ckpt); /* TODO assert(rv == 0) here? Which failure modes do we expect? */ if (rv != 0) { tracef("sqlite3_wal_checkpoint_v2 failed %d", rv); goto err_after_db_open; } tracef("sqlite3_wal_checkpoint_v2 success"); /* Since no reader transaction is in progress, we must be able to * checkpoint the entire WAL */ assert(wal_size == 0); assert(ckpt == 0); err_after_db_open: sqlite3_close(db->follower); db->follower = NULL; err_after_db_lock: rv = databaseReadUnlock(db); assert(rv == 0); } static int apply_frames(struct fsm *f, const struct command_frames *c) { tracef("fsm apply frames"); struct db *db; sqlite3_vfs *vfs; unsigned long *page_numbers = NULL; void *pages; int exists; int rv; rv = registry__db_get(f->registry, c->filename, &db); if (rv != 0) { tracef("db get failed %d", rv); return rv; } vfs = sqlite3_vfs_find(db->config->name); /* Check if the database file exists, and create it by opening a * connection if it doesn't. */ rv = vfs->xAccess(vfs, db->path, 0, &exists); assert(rv == 0); if (!exists) { rv = db__open_follower(db); if (rv != 0) { tracef("open follower failed %d", rv); return rv; } sqlite3_close(db->follower); db->follower = NULL; } rv = command_frames__page_numbers(c, &page_numbers); if (rv != 0) { if (page_numbers != NULL) { sqlite3_free(page_numbers); } tracef("page numbers failed %d", rv); return rv; } command_frames__pages(c, &pages); /* If the commit marker is set, we apply the changes directly to the * VFS. Otherwise, if the commit marker is not set, this must be an * upgrade from V1, we accumulate uncommitted frames in memory until the * final commit or a rollback. 
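 * (Under V1 a single log entry could carry only part of a transaction,
 * which is why the is_commit marker exists in the FRAMES command.)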
*/ if (c->is_commit) { if (f->pending.n_pages > 0) { rv = add_pending_pages(f, page_numbers, pages, c->frames.n_pages, db->config->page_size); if (rv != 0) { tracef("malloc"); sqlite3_free(page_numbers); return DQLITE_NOMEM; } rv = VfsApply(vfs, db->path, f->pending.n_pages, f->pending.page_numbers, f->pending.pages); if (rv != 0) { tracef("VfsApply failed %d", rv); sqlite3_free(page_numbers); return rv; } sqlite3_free(f->pending.page_numbers); sqlite3_free(f->pending.pages); f->pending.n_pages = 0; f->pending.page_numbers = NULL; f->pending.pages = NULL; } else { rv = VfsApply(vfs, db->path, c->frames.n_pages, page_numbers, pages); if (rv != 0) { tracef("VfsApply failed %d", rv); sqlite3_free(page_numbers); return rv; } } } else { rv = add_pending_pages(f, page_numbers, pages, c->frames.n_pages, db->config->page_size); if (rv != 0) { tracef("add pending pages failed %d", rv); sqlite3_free(page_numbers); return DQLITE_NOMEM; } } sqlite3_free(page_numbers); maybeCheckpoint(db); return 0; } static int apply_undo(struct fsm *f, const struct command_undo *c) { tracef("apply undo %" PRIu64, c->tx_id); (void)c; if (f->pending.n_pages == 0) { return 0; } sqlite3_free(f->pending.page_numbers); sqlite3_free(f->pending.pages); f->pending.n_pages = 0; f->pending.page_numbers = NULL; f->pending.pages = NULL; return 0; } /* Checkpoints used to be coordinated cluster-wide, these days a node * checkpoints independently in `apply_frames`, the checkpoint command becomes a * no-op for modern nodes. */ static int apply_checkpoint(struct fsm *f, const struct command_checkpoint *c) { (void)f; (void)c; tracef("apply no-op checkpoint"); return 0; } static int fsm__apply(struct raft_fsm *fsm, const struct raft_buffer *buf, void **result) { tracef("fsm apply"); struct fsm *f = fsm->data; int type; void *command; int rc; rc = command__decode(buf, &type, &command); if (rc != 0) { tracef("fsm: decode command: %d", rc); goto err; } switch (type) { case COMMAND_OPEN: rc = apply_open(f, command); break; case COMMAND_FRAMES: rc = apply_frames(f, command); break; case COMMAND_UNDO: rc = apply_undo(f, command); break; case COMMAND_CHECKPOINT: rc = apply_checkpoint(f, command); break; default: rc = RAFT_MALFORMED; break; } raft_free(command); err: *result = NULL; return rc; } #define SNAPSHOT_FORMAT 1 #define SNAPSHOT_HEADER(X, ...) \ X(uint64, format, ##__VA_ARGS__) \ X(uint64, n, ##__VA_ARGS__) SERIALIZE__DEFINE(snapshotHeader, SNAPSHOT_HEADER); SERIALIZE__IMPLEMENT(snapshotHeader, SNAPSHOT_HEADER); #define SNAPSHOT_DATABASE(X, ...) \ X(text, filename, ##__VA_ARGS__) \ X(uint64, main_size, ##__VA_ARGS__) \ X(uint64, wal_size, ##__VA_ARGS__) SERIALIZE__DEFINE(snapshotDatabase, SNAPSHOT_DATABASE); SERIALIZE__IMPLEMENT(snapshotDatabase, SNAPSHOT_DATABASE); /* Encode the global snapshot header. */ static int encodeSnapshotHeader(unsigned n, struct raft_buffer *buf) { struct snapshotHeader header; char *cursor; header.format = SNAPSHOT_FORMAT; header.n = n; buf->len = snapshotHeader__sizeof(&header); buf->base = sqlite3_malloc64(buf->len); if (buf->base == NULL) { return RAFT_NOMEM; } cursor = buf->base; snapshotHeader__encode(&header, &cursor); return 0; } /* Encode the given database. 
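 *
 * Buffer layout (see also the diagram before freeSnapshotBufs below):
 * bufs[0] receives the encoded database header, bufs[1] through
 * bufs[n - 2] point directly at database pages (zero copy), and
 * bufs[n - 1] receives a copy of the WAL.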
*/ static int encodeDatabase(struct db *db, struct raft_buffer r_bufs[], uint32_t n) { struct snapshotDatabase header; sqlite3_vfs *vfs; uint32_t database_size = 0; uint8_t *page; char *cursor; struct dqlite_buffer *bufs = (struct dqlite_buffer *)r_bufs; int rv; header.filename = db->filename; vfs = sqlite3_vfs_find(db->config->name); rv = VfsShallowSnapshot(vfs, db->filename, &bufs[1], n - 1); if (rv != 0) { goto err; } /* Extract the database size from the first page. */ page = bufs[1].base; database_size += (uint32_t)(page[28] << 24); database_size += (uint32_t)(page[29] << 16); database_size += (uint32_t)(page[30] << 8); database_size += (uint32_t)(page[31]); header.main_size = (uint64_t)database_size * (uint64_t)db->config->page_size; header.wal_size = bufs[n - 1].len; /* Database header. */ bufs[0].len = snapshotDatabase__sizeof(&header); bufs[0].base = sqlite3_malloc64(bufs[0].len); if (bufs[0].base == NULL) { rv = RAFT_NOMEM; goto err_after_snapshot; } cursor = bufs[0].base; snapshotDatabase__encode(&header, &cursor); return 0; err_after_snapshot: /* Free the wal buffer */ sqlite3_free(bufs[n - 1].base); err: assert(rv != 0); return rv; } /* Decode the database contained in a snapshot. */ static int decodeDatabase(struct fsm *f, struct cursor *cursor) { struct snapshotDatabase header; struct db *db; sqlite3_vfs *vfs; size_t n; int exists; int rv; rv = snapshotDatabase__decode(cursor, &header); if (rv != 0) { return rv; } rv = registry__db_get(f->registry, header.filename, &db); if (rv != 0) { return rv; } vfs = sqlite3_vfs_find(db->config->name); /* Check if the database file exists, and create it by opening a * connection if it doesn't. */ rv = vfs->xAccess(vfs, header.filename, 0, &exists); assert(rv == 0); if (!exists) { rv = db__open_follower(db); if (rv != 0) { return rv; } sqlite3_close(db->follower); db->follower = NULL; } tracef("main_size:%" PRIu64 " wal_size:%" PRIu64, header.main_size, header.wal_size); if (header.main_size + header.wal_size > SIZE_MAX) { tracef("main_size + wal_size would overflow max DB size"); return -1; } /* Due to the check above, this cast is safe. 
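 * (The check matters on 32-bit platforms, where SIZE_MAX is smaller than
 * UINT64_MAX and an oversized snapshot could otherwise wrap the size_t
 * arithmetic.)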
*/ n = (size_t)(header.main_size + header.wal_size); rv = VfsRestore(vfs, db->filename, cursor->p, n); if (rv != 0) { return rv; } cursor->p += n; return 0; } static unsigned dbNumPages(struct db *db) { sqlite3_vfs *vfs; int rv; uint32_t n; vfs = sqlite3_vfs_find(db->config->name); rv = VfsDatabaseNumPages(vfs, db->filename, &n); assert(rv == 0); return n; } /* Determine the total number of raft buffers needed for a snapshot */ static unsigned snapshotNumBufs(struct fsm *f) { struct db *db; queue *head; unsigned n = 1; /* snapshot header */ QUEUE_FOREACH(head, &f->registry->dbs) { n += 2; /* database header & wal */ db = QUEUE_DATA(head, struct db, queue); n += dbNumPages(db); /* 1 buffer per page (zero copy) */ } return n; } /* An example array of snapshot buffers looks like this: * * bufs: SH DH1 P1 P2 P3 WAL1 DH2 P1 P2 WAL2 * index: 0 1 2 3 4 5 6 7 8 9 * * SH: Snapshot Header * DHx: Database Header * Px: Database Page (not to be freed) * WALx: a WAL * */ static void freeSnapshotBufs(struct fsm *f, struct raft_buffer bufs[], unsigned n_bufs) { queue *head; struct db *db; unsigned i; if (bufs == NULL || n_bufs == 0) { return; } /* Free snapshot header */ sqlite3_free(bufs[0].base); i = 1; /* Free all database headers & WAL buffers */ QUEUE_FOREACH(head, &f->registry->dbs) { if (i == n_bufs) { break; } db = QUEUE_DATA(head, struct db, queue); /* i is the index of the database header */ sqlite3_free(bufs[i].base); /* i is now the index of the next database header (if any) */ i += 1 /* db header */ + dbNumPages(db) + 1 /* WAL */; /* free WAL buffer */ sqlite3_free(bufs[i - 1].base); } } static int fsm__snapshot(struct raft_fsm *fsm, struct raft_buffer *bufs[], unsigned *n_bufs) { struct fsm *f = fsm->data; queue *head; struct db *db; unsigned n_db = 0; unsigned i; int rv; /* First count how many databases we have and check that no transaction * nor checkpoint nor other snapshot is in progress. */ QUEUE_FOREACH(head, &f->registry->dbs) { db = QUEUE_DATA(head, struct db, queue); if (db->tx_id != 0 || db->read_lock) { return RAFT_BUSY; } n_db++; } /* Lock all databases, preventing the checkpoint from running */ QUEUE_FOREACH(head, &f->registry->dbs) { db = QUEUE_DATA(head, struct db, queue); rv = databaseReadLock(db); assert(rv == 0); } *n_bufs = snapshotNumBufs(f); *bufs = sqlite3_malloc64(*n_bufs * sizeof **bufs); if (*bufs == NULL) { rv = RAFT_NOMEM; goto err; } rv = encodeSnapshotHeader(n_db, &(*bufs)[0]); if (rv != 0) { goto err_after_bufs_alloc; } /* Encode individual databases. */ i = 1; QUEUE_FOREACH(head, &f->registry->dbs) { db = QUEUE_DATA(head, struct db, queue); /* database_header + num_pages + wal */ unsigned n = 1 + dbNumPages(db) + 1; rv = encodeDatabase(db, &(*bufs)[i], n); if (rv != 0) { goto err_after_encode_header; } i += n; } assert(i == *n_bufs); return 0; err_after_encode_header: freeSnapshotBufs(f, *bufs, i); err_after_bufs_alloc: sqlite3_free(*bufs); err: QUEUE_FOREACH(head, &f->registry->dbs) { db = QUEUE_DATA(head, struct db, queue); databaseReadUnlock(db); } assert(rv != 0); return rv; } static int fsm__snapshot_finalize(struct raft_fsm *fsm, struct raft_buffer *bufs[], unsigned *n_bufs) { struct fsm *f = fsm->data; queue *head; struct db *db; unsigned n_db; struct snapshotHeader header; int rv; if (bufs == NULL) { return 0; } /* Decode the header to determine the number of databases. 
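 * The first header.n databases in the registry are exactly the ones that
 * were locked when the snapshot was taken (databases are only ever
 * appended), so only those read locks are released below.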
*/ struct cursor cursor = {(*bufs)[0].base, (*bufs)[0].len}; rv = snapshotHeader__decode(&cursor, &header); if (rv != 0) { tracef("decode failed %d", rv); return -1; } if (header.format != SNAPSHOT_FORMAT) { tracef("bad format"); return -1; } /* Free allocated buffers */ freeSnapshotBufs(f, *bufs, *n_bufs); sqlite3_free(*bufs); *bufs = NULL; *n_bufs = 0; /* Unlock all databases that were locked for the snapshot, this is safe * because DB's are only ever added at the back of the queue. */ n_db = 0; QUEUE_FOREACH(head, &f->registry->dbs) { if (n_db == header.n) { break; } db = QUEUE_DATA(head, struct db, queue); rv = databaseReadUnlock(db); assert(rv == 0); n_db++; } return 0; } static int fsm__restore(struct raft_fsm *fsm, struct raft_buffer *buf) { tracef("fsm restore"); struct fsm *f = fsm->data; struct cursor cursor = {buf->base, buf->len}; struct snapshotHeader header; unsigned i; int rv; rv = snapshotHeader__decode(&cursor, &header); if (rv != 0) { tracef("decode failed %d", rv); return rv; } if (header.format != SNAPSHOT_FORMAT) { tracef("bad format"); return RAFT_MALFORMED; } for (i = 0; i < header.n; i++) { rv = decodeDatabase(f, &cursor); if (rv != 0) { tracef("decode failed"); return rv; } } /* Don't use sqlite3_free as this buffer is allocated by raft. */ raft_free(buf->base); return 0; } int fsm__init(struct raft_fsm *fsm, struct config *config, struct registry *registry) { tracef("fsm init"); struct fsm *f = raft_malloc(sizeof *f); if (f == NULL) { return DQLITE_NOMEM; } f->logger = &config->logger; f->registry = registry; f->pending.n_pages = 0; f->pending.page_numbers = NULL; f->pending.pages = NULL; fsm->version = 2; fsm->data = f; fsm->apply = fsm__apply; fsm->snapshot = fsm__snapshot; fsm->snapshot_finalize = fsm__snapshot_finalize; fsm->restore = fsm__restore; return 0; } void fsm__close(struct raft_fsm *fsm) { tracef("fsm close"); struct fsm *f = fsm->data; raft_free(f); } /****************************************************************************** Disk-based FSM *****************************************************************************/ /* The synchronous part of the database encoding */ static int encodeDiskDatabaseSync(struct db *db, struct raft_buffer *r_buf) { sqlite3_vfs *vfs; struct dqlite_buffer *buf = (struct dqlite_buffer *)r_buf; int rv; vfs = sqlite3_vfs_find(db->config->name); rv = VfsDiskSnapshotWal(vfs, db->path, buf); if (rv != 0) { goto err; } return 0; err: assert(rv != 0); return rv; } /* The asynchronous part of the database encoding */ static int encodeDiskDatabaseAsync(struct db *db, struct raft_buffer r_bufs[], uint32_t n) { struct snapshotDatabase header; sqlite3_vfs *vfs; char *cursor; struct dqlite_buffer *bufs = (struct dqlite_buffer *)r_bufs; int rv; assert(n == 3); vfs = sqlite3_vfs_find(db->config->name); rv = VfsDiskSnapshotDb(vfs, db->path, &bufs[1]); if (rv != 0) { goto err; } /* Database header. 
*/
	header.filename = db->filename;
	header.main_size = bufs[1].len;
	header.wal_size = bufs[2].len;
	bufs[0].len = snapshotDatabase__sizeof(&header);
	bufs[0].base = sqlite3_malloc64(bufs[0].len);
	if (bufs[0].base == NULL) {
		rv = RAFT_NOMEM;
		goto err;
	}
	cursor = bufs[0].base;
	snapshotDatabase__encode(&header, &cursor);

	return 0;
	/* Cleanup is performed by call to snapshot_finalize */
err:
	assert(rv != 0);
	return rv;
}

/* Determine the total number of raft buffers needed
 * for a snapshot in disk-mode */
static unsigned snapshotNumBufsDisk(struct fsm *f)
{
	queue *head;
	unsigned n = 1; /* snapshot header */

	QUEUE_FOREACH(head, &f->registry->dbs) {
		n += 3; /* database header, database file and wal */
	}

	return n;
}

/* An example array of snapshot buffers looks like this:
 *
 * bufs:  SH DH1 DBMMAP1 WAL1 DH2 DBMMAP2 WAL2
 * index:  0   1     2     3   4     5     6
 *
 * SH:      Snapshot Header
 * DHx:     Database Header
 * DBMMAPx: Pointer to mmap'ed database file
 * WALx:    a WAL
 * */
static void freeSnapshotBufsDisk(struct fsm *f,
				 struct raft_buffer bufs[],
				 unsigned n_bufs)
{
	queue *head;
	unsigned i;

	if (bufs == NULL || n_bufs == 0) {
		return;
	}

	/* Free snapshot header */
	sqlite3_free(bufs[0].base);

	i = 1;
	/* Free all database headers & WAL buffers. Unmap the DB file. */
	QUEUE_FOREACH(head, &f->registry->dbs) {
		if (i == n_bufs) {
			break;
		}
		/* i is the index of the database header */
		sqlite3_free(bufs[i].base);
		if (bufs[i + 1].base != NULL) {
			munmap(bufs[i + 1].base, bufs[i + 1].len);
		}
		sqlite3_free(bufs[i + 2].base);
		/* i is now the index of the next database header (if any) */
		i += 3;
	}
}

static int fsm__snapshot_disk(struct raft_fsm *fsm,
			      struct raft_buffer *bufs[],
			      unsigned *n_bufs)
{
	struct fsm *f = fsm->data;
	queue *head;
	struct db *db = NULL;
	unsigned n_db = 0;
	unsigned i;
	int rv;

	/* First count how many databases we have and check that no transaction
	 * nor checkpoint nor other snapshot is in progress. */
	QUEUE_FOREACH(head, &f->registry->dbs) {
		db = QUEUE_DATA(head, struct db, queue);
		if (db->tx_id != 0 || db->read_lock) {
			return RAFT_BUSY;
		}
		n_db++;
	}

	/* Lock all databases, preventing the checkpoint from running. This
	 * ensures the database is not written while it is mmap'ed and copied
	 * by raft. */
	QUEUE_FOREACH(head, &f->registry->dbs) {
		db = QUEUE_DATA(head, struct db, queue);
		rv = databaseReadLock(db);
		assert(rv == 0);
	}

	*n_bufs = snapshotNumBufsDisk(f);
	*bufs = sqlite3_malloc64(*n_bufs * sizeof **bufs);
	if (*bufs == NULL) {
		rv = RAFT_NOMEM;
		goto err;
	}
	/* zero-init buffers, helps with cleanup */
	for (unsigned j = 0; j < *n_bufs; j++) {
		(*bufs)[j].base = NULL;
		(*bufs)[j].len = 0;
	}

	rv = encodeSnapshotHeader(n_db, &(*bufs)[0]);
	if (rv != 0) {
		goto err_after_bufs_alloc;
	}

	/* Copy WAL of all databases. */
	i = 1;
	QUEUE_FOREACH(head, &f->registry->dbs) {
		db = QUEUE_DATA(head, struct db, queue);
		/* database_header + db + WAL */
		unsigned n = 3;
		/* pass pointer to buffer that will contain WAL. */
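		/* Illustrative arithmetic: with two registered databases the
		 * disk-mode layout pictured above yields 1 + 3 * 2 = 7
		 * buffers, matching indices 0..6 in the diagram. */
		/*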
*/ rv = encodeDiskDatabaseSync(db, &(*bufs)[i + n - 1]); if (rv != 0) { goto err_after_encode_sync; } i += n; } assert(i == *n_bufs); return 0; err_after_encode_sync: freeSnapshotBufsDisk(f, *bufs, i); err_after_bufs_alloc: sqlite3_free(*bufs); err: QUEUE_FOREACH(head, &f->registry->dbs) { db = QUEUE_DATA(head, struct db, queue); databaseReadUnlock(db); } assert(rv != 0); return rv; } static int fsm__snapshot_async_disk(struct raft_fsm *fsm, struct raft_buffer *bufs[], unsigned *n_bufs) { struct fsm *f = fsm->data; queue *head; struct snapshotHeader header; struct db *db = NULL; unsigned i; int rv; /* Decode the header to determine the number of databases. */ struct cursor cursor = {(*bufs)[0].base, (*bufs)[0].len}; rv = snapshotHeader__decode(&cursor, &header); if (rv != 0) { tracef("decode failed %d", rv); return -1; } if (header.format != SNAPSHOT_FORMAT) { tracef("bad format"); return -1; } /* Encode individual databases. */ i = 1; QUEUE_FOREACH(head, &f->registry->dbs) { if (i == *n_bufs) { /* In case a db was added in meanwhile */ break; } db = QUEUE_DATA(head, struct db, queue); /* database_header + database file + wal */ unsigned n = 3; rv = encodeDiskDatabaseAsync(db, &(*bufs)[i], n); if (rv != 0) { goto err; } i += n; } return 0; err: assert(rv != 0); return rv; } static int fsm__snapshot_finalize_disk(struct raft_fsm *fsm, struct raft_buffer *bufs[], unsigned *n_bufs) { struct fsm *f = fsm->data; queue *head; struct db *db; unsigned n_db; struct snapshotHeader header; int rv; if (bufs == NULL) { return 0; } /* Decode the header to determine the number of databases. */ struct cursor cursor = {(*bufs)[0].base, (*bufs)[0].len}; rv = snapshotHeader__decode(&cursor, &header); if (rv != 0) { tracef("decode failed %d", rv); return -1; } if (header.format != SNAPSHOT_FORMAT) { tracef("bad format"); return -1; } /* Free allocated buffers */ freeSnapshotBufsDisk(f, *bufs, *n_bufs); sqlite3_free(*bufs); *bufs = NULL; *n_bufs = 0; /* Unlock all databases that were locked for the snapshot, this is safe * because DB's are only ever added at the back of the queue. */ n_db = 0; QUEUE_FOREACH(head, &f->registry->dbs) { if (n_db == header.n) { break; } db = QUEUE_DATA(head, struct db, queue); databaseReadUnlock(db); n_db++; } return 0; } /* Decode the disk database contained in a snapshot. */ static int decodeDiskDatabase(struct fsm *f, struct cursor *cursor) { struct snapshotDatabase header; struct db *db; sqlite3_vfs *vfs; int exists; int rv; rv = snapshotDatabase__decode(cursor, &header); if (rv != 0) { return rv; } rv = registry__db_get(f->registry, header.filename, &db); if (rv != 0) { return rv; } vfs = sqlite3_vfs_find(db->config->name); /* Check if the database file exists, and create it by opening a * connection if it doesn't. */ rv = vfs->xAccess(vfs, db->path, 0, &exists); assert(rv == 0); if (!exists) { rv = db__open_follower(db); if (rv != 0) { return rv; } sqlite3_close(db->follower); db->follower = NULL; } /* The last check can overflow, but we would already be lost anyway, as * the raft snapshot restore API only supplies one buffer and the data * has to fit in size_t bytes anyway. */ if (header.main_size > SIZE_MAX || header.wal_size > SIZE_MAX || header.main_size + header.wal_size > SIZE_MAX) { tracef("main_size:%" PRIu64 "B wal_size:%" PRIu64 "B would overflow max DB size (%zuB)", header.main_size, header.wal_size, SIZE_MAX); return -1; } /* Due to the check above, these casts are safe. 
*/ rv = VfsDiskRestore(vfs, db->path, cursor->p, (size_t)header.main_size, (size_t)header.wal_size); if (rv != 0) { tracef("VfsDiskRestore %d", rv); return rv; } cursor->p += header.main_size + header.wal_size; return 0; } static int fsm__restore_disk(struct raft_fsm *fsm, struct raft_buffer *buf) { tracef("fsm restore disk"); struct fsm *f = fsm->data; struct cursor cursor = {buf->base, buf->len}; struct snapshotHeader header; unsigned i; int rv; rv = snapshotHeader__decode(&cursor, &header); if (rv != 0) { tracef("decode failed %d", rv); return rv; } if (header.format != SNAPSHOT_FORMAT) { tracef("bad format"); return RAFT_MALFORMED; } for (i = 0; i < header.n; i++) { rv = decodeDiskDatabase(f, &cursor); if (rv != 0) { tracef("decode failed"); return rv; } } /* Don't use sqlite3_free as this buffer is allocated by raft. */ raft_free(buf->base); return 0; } int fsm__init_disk(struct raft_fsm *fsm, struct config *config, struct registry *registry) { tracef("fsm init"); struct fsm *f = raft_malloc(sizeof *f); if (f == NULL) { return DQLITE_NOMEM; } f->logger = &config->logger; f->registry = registry; f->pending.n_pages = 0; f->pending.page_numbers = NULL; f->pending.pages = NULL; fsm->version = 3; fsm->data = f; fsm->apply = fsm__apply; fsm->snapshot = fsm__snapshot_disk; fsm->snapshot_async = fsm__snapshot_async_disk; fsm->snapshot_finalize = fsm__snapshot_finalize_disk; fsm->restore = fsm__restore_disk; return 0; } dqlite-1.16.7/src/fsm.h000066400000000000000000000012001465252713400146170ustar00rootroot00000000000000/** * Dqlite Raft FSM */ #ifndef DQLITE_FSM_H_ #define DQLITE_FSM_H_ #include "config.h" #include "raft.h" #include "registry.h" /** * Initialize the given SQLite replication interface with dqlite's raft based * implementation. */ int fsm__init(struct raft_fsm *fsm, struct config *config, struct registry *registry); /** * Initialize the given SQLite replication interface with dqlite's on-disk * raft based implementation. */ int fsm__init_disk(struct raft_fsm *fsm, struct config *config, struct registry *registry); void fsm__close(struct raft_fsm *fsm); #endif /* DQLITE_REPLICATION_METHODS_H_ */ dqlite-1.16.7/src/gateway.c000066400000000000000000001141211465252713400154750ustar00rootroot00000000000000#include "gateway.h" #include "bind.h" #include "conn.h" #include "id.h" #include "lib/threadpool.h" #include "protocol.h" #include "query.h" #include "request.h" #include "response.h" #include "server.h" #include "tracing.h" #include "translate.h" #include "tuple.h" #include "vfs.h" void gateway__init(struct gateway *g, struct config *config, struct registry *registry, struct raft *raft, struct id_state seed) { tracef("gateway init"); g->config = config; g->registry = registry; g->raft = raft; g->leader = NULL; g->req = NULL; g->exec.data = g; stmt__registry_init(&g->stmts); g->barrier.data = g; g->barrier.cb = NULL; g->barrier.leader = NULL; g->protocol = DQLITE_PROTOCOL_VERSION; g->client_id = 0; g->random_state = seed; } void gateway__leader_close(struct gateway *g, int reason) { if (g == NULL || g->leader == NULL) { tracef("gateway:%p or gateway->leader are NULL", g); return; } if (g->req != NULL) { if (g->leader->inflight != NULL) { tracef("finish inflight apply request"); struct raft_apply *req = &g->leader->inflight->req; req->cb(req, reason, NULL); assert(g->req == NULL); } else if (g->barrier.cb != NULL) { tracef("finish inflight barrier"); /* This is not a typo, g->barrier.req.cb is a wrapper * around g->barrier.cb and will set g->barrier.cb to * NULL when called. 
*/
			struct raft_barrier *b = &g->barrier.req;
			b->cb(b, reason);
			assert(g->barrier.cb == NULL);
		} else if (g->leader->exec != NULL &&
			   g->leader->exec->barrier.cb != NULL) {
			tracef("finish inflight exec barrier");
			struct raft_barrier *b = &g->leader->exec->barrier.req;
			b->cb(b, reason);
			assert(g->leader->exec == NULL);
		} else if (g->req->type == DQLITE_REQUEST_QUERY_SQL) {
			/* Finalize the statement that was in the process of
			 * yielding rows. We only need to handle QUERY_SQL
			 * because for QUERY and EXEC the statement is
			 * finalized by the call to stmt__registry_close, below
			 * (and for EXEC_SQL the lifetimes of the statements
			 * are managed by leader__exec and the associated
			 * callback).
			 *
			 * It's okay if g->req->stmt is NULL since
			 * sqlite3_finalize(NULL) is documented to be a
			 * no-op. */
			sqlite3_finalize(g->req->stmt);
			g->req = NULL;
		} else if (g->req->type == DQLITE_REQUEST_QUERY) {
			/* In case the statement is a prepared one, it
			 * will be finalized by the stmt__registry_close
			 * call below. Nevertheless, we must signal that
			 * the request is not in place anymore so that any
			 * callback which is already in the queue will not
			 * attempt to execute a finalized statement. */
			g->req = NULL;
		}
	}
	stmt__registry_close(&g->stmts);
	leader__close(g->leader);
	sqlite3_free(g->leader);
	g->leader = NULL;
}

void gateway__close(struct gateway *g)
{
	tracef("gateway close");
	if (g->leader == NULL) {
		stmt__registry_close(&g->stmts);
		return;
	}
	gateway__leader_close(g, RAFT_SHUTDOWN);
}

/* Declare a request struct and a response struct of the appropriate types and
 * decode the request. This is used in the common case where only one schema
 * version is extant. */
#define START_V0(REQ, RES, ...)                                       \
	struct request_##REQ request = { 0 };                         \
	struct response_##RES response = { 0 };                       \
	{                                                             \
		int rv_;                                              \
		if (req->schema != 0) {                               \
			tracef("bad schema version %d", req->schema); \
			failure(req, DQLITE_PARSE,                    \
				"unrecognized schema version");       \
			return 0;                                     \
		}                                                     \
		rv_ = request_##REQ##__decode(cursor, &request);      \
		if (rv_ != 0) {                                       \
			return rv_;                                   \
		}                                                     \
	}

#define CHECK_LEADER(REQ)                                            \
	if (raft_state(g->raft) != RAFT_LEADER) {                    \
		failure(REQ, SQLITE_IOERR_NOT_LEADER, "not leader"); \
		return 0;                                            \
	}

#define SUCCESS(LOWER, UPPER, RESP, SCHEMA)                                  \
	{                                                                    \
		size_t _n = response_##LOWER##__sizeof(&RESP);               \
		char *_cursor;                                               \
		assert(_n % 8 == 0);                                         \
		_cursor = buffer__advance(req->buffer, _n);                  \
		/* Since responses are small and the buffer is at least     \
		 * 4096 bytes, this can't fail. */                           \
		assert(_cursor != NULL);                                     \
		response_##LOWER##__encode(&RESP, &_cursor);                 \
		req->cb(req, 0, DQLITE_RESPONSE_##UPPER, SCHEMA);            \
	}

/* Encode the given success response and invoke the request callback,
 * using schema version 0. */
#define SUCCESS_V0(LOWER, UPPER) SUCCESS(LOWER, UPPER, response, 0)

/* Lookup the database with the given ID.
 *
 * TODO: support more than one database per connection? */
#define LOOKUP_DB(ID)                                                \
	if (ID != 0 || g->leader == NULL) {                          \
		failure(req, SQLITE_NOTFOUND, "no database opened"); \
		return 0;                                            \
	}
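/* Illustrative only: the macros above compose into the skeleton shared by
 * the request handlers below. A hypothetical handler would look roughly like
 * this (the `example` request type is made up for the sake of the sketch):
 *
 *	static int handle_example(struct gateway *g, struct handle *req)
 *	{
 *		struct cursor *cursor = &req->cursor;
 *		START_V0(example, empty);  // decode into `request`
 *		CHECK_LEADER(req);         // bail out if we're not the leader
 *		... do the actual work ...
 *		SUCCESS_V0(empty, EMPTY);  // encode response, invoke callback
 *		return 0;
 *	}
 */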
/* Lookup the statement with the given ID. */
#define LOOKUP_STMT(ID)                                    \
	stmt = stmt__registry_get(&g->stmts, ID);          \
	if (stmt == NULL) {                                \
		failure(req, SQLITE_NOTFOUND,              \
			"no statement with the given id"); \
		return 0;                                  \
	}

#define FAIL_IF_CHECKPOINTING                                                  \
	{                                                                      \
		struct sqlite3_file *_file;                                    \
		int _rv;                                                       \
		_rv = sqlite3_file_control(g->leader->conn, "main",            \
					   SQLITE_FCNTL_FILE_POINTER, &_file); \
		assert(_rv == SQLITE_OK); /* Should never fail */              \
									       \
		_rv = _file->pMethods->xShmLock(                               \
		    _file, 1 /* checkpoint lock */, 1,                         \
		    SQLITE_SHM_LOCK | SQLITE_SHM_EXCLUSIVE);                   \
		if (_rv != 0) {                                                \
			assert(_rv == SQLITE_BUSY);                            \
			failure(req, SQLITE_BUSY, "checkpoint in progress");   \
			return 0;                                              \
		}                                                              \
		_file->pMethods->xShmLock(                                     \
		    _file, 1 /* checkpoint lock */, 1,                         \
		    SQLITE_SHM_UNLOCK | SQLITE_SHM_EXCLUSIVE);                 \
	}

/* Encode a failure response and invoke the request callback */
static void failure(struct handle *req, int code, const char *message)
{
	struct response_failure failure;
	size_t n;
	char *cursor;
	failure.code = (uint64_t)code;
	failure.message = message;
	n = response_failure__sizeof(&failure);
	assert(n % 8 == 0);
	cursor = buffer__advance(req->buffer, n);
	/* The buffer has at least 4096 bytes, and error messages are shorter
	 * than that. So this can't fail. */
	assert(cursor != NULL);
	response_failure__encode(&failure, &cursor);
	req->cb(req, 0, DQLITE_RESPONSE_FAILURE, 0);
}

static void emptyRows(struct handle *req)
{
	char *cursor = buffer__advance(req->buffer, 8 + 8);
	uint64_t val;
	assert(cursor != NULL);
	val = 0;
	uint64__encode(&val, &cursor);
	val = DQLITE_RESPONSE_ROWS_DONE;
	uint64__encode(&val, &cursor);
	req->cb(req, 0, DQLITE_RESPONSE_ROWS, 0);
}

static int handle_leader_legacy(struct gateway *g, struct handle *req)
{
	tracef("handle leader legacy");
	struct cursor *cursor = &req->cursor;
	START_V0(leader, server_legacy);
	raft_id id;
	raft_leader(g->raft, &id, &response.address);
	if (response.address == NULL) {
		response.address = "";
	}
	SUCCESS_V0(server_legacy, SERVER_LEGACY);
	return 0;
}

static int handle_leader(struct gateway *g, struct handle *req)
{
	tracef("handle leader");
	struct cursor *cursor = &req->cursor;
	raft_id id = 0;
	const char *address = NULL;
	unsigned i;
	if (g->protocol == DQLITE_PROTOCOL_VERSION_LEGACY) {
		return handle_leader_legacy(g, req);
	}
	START_V0(leader, server);

	/* Only voters might know who the leader is.
*/ for (i = 0; i < g->raft->configuration.n; i++) { struct raft_server *server = &g->raft->configuration.servers[i]; if (server->id == g->raft->id && server->role == RAFT_VOTER) { tracef("handle leader - dispatch to %llu", server->id); raft_leader(g->raft, &id, &address); break; } } response.id = id; response.address = address; if (response.address == NULL) { response.address = ""; } SUCCESS_V0(server, SERVER); return 0; } static int handle_client(struct gateway *g, struct handle *req) { tracef("handle client"); struct cursor *cursor = &req->cursor; START_V0(client, welcome); g->client_id = request.id; response.heartbeat_timeout = g->config->heartbeat_timeout; SUCCESS_V0(welcome, WELCOME); return 0; } static int handle_open(struct gateway *g, struct handle *req) { tracef("handle open"); struct cursor *cursor = &req->cursor; struct db *db; int rc; START_V0(open, db); if (g->leader != NULL) { tracef("already open"); failure(req, SQLITE_BUSY, "a database for this connection is already open"); return 0; } rc = registry__db_get(g->registry, request.filename, &db); if (rc != 0) { tracef("registry db get failed %d", rc); return rc; } g->leader = sqlite3_malloc(sizeof *g->leader); if (g->leader == NULL) { tracef("malloc failed"); return DQLITE_NOMEM; } rc = leader__init(g->leader, db, g->raft); if (rc != 0) { tracef("leader init failed %d", rc); sqlite3_free(g->leader); g->leader = NULL; return rc; } response.id = 0; SUCCESS_V0(db, DB); return 0; } static void prepareBarrierCb(struct barrier *barrier, int status) { tracef("prepare barrier cb status:%d", status); struct gateway *g = barrier->data; struct handle *req = g->req; struct response_stmt response_v0 = { 0 }; struct response_stmt_with_offset response_v1 = { 0 }; const char *sql = req->sql; struct stmt *stmt; const char *tail; sqlite3_stmt *tail_stmt; int rc; assert(req != NULL); stmt = stmt__registry_get(&g->stmts, req->stmt_id); assert(stmt != NULL); g->req = NULL; if (status != 0) { stmt__registry_del(&g->stmts, stmt); failure(req, status, "barrier error"); return; } rc = sqlite3_prepare_v2(g->leader->conn, sql, -1, &stmt->stmt, &tail); if (rc != SQLITE_OK) { failure(req, rc, sqlite3_errmsg(g->leader->conn)); stmt__registry_del(&g->stmts, stmt); return; } if (stmt->stmt == NULL) { tracef("prepare barrier cb empty statement"); stmt__registry_del(&g->stmts, stmt); /* FIXME Should we use a code other than 0 here? 
*/ failure(req, 0, "empty statement"); return; } if (req->schema == DQLITE_PREPARE_STMT_SCHEMA_V0) { rc = sqlite3_prepare_v2(g->leader->conn, tail, -1, &tail_stmt, NULL); if (rc != 0 || tail_stmt != NULL) { stmt__registry_del(&g->stmts, stmt); sqlite3_finalize(tail_stmt); failure(req, SQLITE_ERROR, "nonempty statement tail"); return; } } switch (req->schema) { case DQLITE_PREPARE_STMT_SCHEMA_V0: response_v0.db_id = (uint32_t)req->db_id; response_v0.id = (uint32_t)stmt->id; response_v0.params = (uint64_t)sqlite3_bind_parameter_count(stmt->stmt); SUCCESS(stmt, STMT, response_v0, DQLITE_PREPARE_STMT_SCHEMA_V0); break; case DQLITE_PREPARE_STMT_SCHEMA_V1: response_v1.db_id = (uint32_t)req->db_id; response_v1.id = (uint32_t)stmt->id; response_v1.params = (uint64_t)sqlite3_bind_parameter_count(stmt->stmt); response_v1.offset = (uint64_t)(tail - sql); SUCCESS(stmt_with_offset, STMT_WITH_OFFSET, response_v1, DQLITE_PREPARE_STMT_SCHEMA_V1); break; default: assert(0); } } static int handle_prepare(struct gateway *g, struct handle *req) { tracef("handle prepare"); struct cursor *cursor = &req->cursor; struct stmt *stmt; struct request_prepare request = { 0 }; int rc; if (req->schema != DQLITE_PREPARE_STMT_SCHEMA_V0 && req->schema != DQLITE_PREPARE_STMT_SCHEMA_V1) { failure(req, SQLITE_ERROR, "unrecognized schema version"); return 0; } rc = request_prepare__decode(cursor, &request); if (rc != 0) { return rc; } CHECK_LEADER(req); LOOKUP_DB(request.db_id); rc = stmt__registry_add(&g->stmts, &stmt); if (rc != 0) { tracef("handle prepare registry add failed %d", rc); return rc; } assert(stmt != NULL); /* This cast is safe as long as the TODO in LOOKUP_DB is not * implemented. */ req->db_id = (size_t)request.db_id; req->stmt_id = stmt->id; req->sql = request.sql; g->req = req; rc = leader__barrier(g->leader, &g->barrier, prepareBarrierCb); if (rc != 0) { tracef("handle prepare barrier failed %d", rc); stmt__registry_del(&g->stmts, stmt); g->req = NULL; return rc; } return 0; } /* Fill a result response with the last inserted ID and number of rows * affected. 
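*/

/* For example (illustrative): after executing
 *
 *	INSERT INTO test VALUES (1, 'hello');
 *
 * on a table with a rowid primary key, last_insert_id holds the rowid
 * assigned to the new row and rows_affected is 1. */

/*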
*/ static void fill_result(struct gateway *g, struct response_result *response) { assert(g->leader != NULL); response->last_insert_id = (uint64_t)sqlite3_last_insert_rowid(g->leader->conn); /* FIXME eventually we should consider using sqlite3_changes64 here */ response->rows_affected = (uint64_t)sqlite3_changes(g->leader->conn); } static const char *error_message(sqlite3 *db, int rc) { switch (rc) { case SQLITE_IOERR_LEADERSHIP_LOST: return "disk I/O error"; case SQLITE_IOERR_WRITE: return "disk I/O error"; case SQLITE_ABORT: return "abort"; case SQLITE_ROW: return "rows yielded when none expected for EXEC " "request"; } return sqlite3_errmsg(db); } static void leader_exec_cb(struct exec *exec, int status) { struct gateway *g = exec->data; struct handle *req = g->req; struct stmt *stmt = stmt__registry_get(&g->stmts, req->stmt_id); assert(stmt != NULL); struct response_result response; g->req = NULL; if (status == SQLITE_DONE) { fill_result(g, &response); SUCCESS_V0(result, RESULT); } else { assert(g->leader != NULL); failure(req, status, error_message(g->leader->conn, status)); sqlite3_reset(stmt->stmt); } } static int handle_exec(struct gateway *g, struct handle *req) { tracef("handle exec schema:%" PRIu8, req->schema); struct cursor *cursor = &req->cursor; struct stmt *stmt; struct request_exec request = { 0 }; int tuple_format; uint64_t req_id; int rv; switch (req->schema) { case DQLITE_REQUEST_PARAMS_SCHEMA_V0: tuple_format = TUPLE__PARAMS; break; case DQLITE_REQUEST_PARAMS_SCHEMA_V1: tuple_format = TUPLE__PARAMS32; break; default: tracef("bad schema version %d", req->schema); failure(req, DQLITE_PARSE, "unrecognized schema version"); return 0; } /* The v0 and v1 schemas only differ in the layout of the tuple, * so we can use the same decode function for both. */ rv = request_exec__decode(cursor, &request); if (rv != 0) { return rv; } CHECK_LEADER(req); LOOKUP_DB(request.db_id); LOOKUP_STMT(request.stmt_id); FAIL_IF_CHECKPOINTING; rv = bind__params(stmt->stmt, cursor, tuple_format); if (rv != 0) { tracef("handle exec bind failed %d", rv); failure(req, rv, "bind parameters"); return 0; } req->stmt_id = stmt->id; g->req = req; req_id = idNext(&g->random_state); rv = leader__exec(g->leader, &g->exec, stmt->stmt, req_id, leader_exec_cb); if (rv != 0) { tracef("handle exec leader exec failed %d", rv); g->req = NULL; return rv; } return 0; } /* Step through the given statement and populate the response buffer of the * given request with a single batch of rows. * * A single batch of rows is typically about the size of a memory page. 
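*/

/* A client is expected to drain the batches in a loop, roughly as follows
 * (pseudo-code, the helper names are illustrative only):
 *
 *	do {
 *		batch = read_rows_response();
 *		consume(batch);
 *	} while (batch.eof == DQLITE_RESPONSE_ROWS_PART);
 *
 * A batch ending in the DQLITE_RESPONSE_ROWS_DONE sentinel is the last one,
 * while ROWS_PART means another batch must be requested. */

/*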
*/ static void query_batch_async(struct handle *req, enum pool_half half) { struct gateway *g = req->gw; sqlite3_stmt *stmt = req->stmt; assert(stmt != NULL); struct response_rows response; int rc; if (half == POOL_TOP_HALF) { req->work.rc = query__batch(stmt, req->buffer); return; } /* else POOL_BOTTOM_HALF => */ rc = req->work.rc; if (rc != SQLITE_ROW && rc != SQLITE_DONE) { assert(g->leader != NULL); failure(req, rc, sqlite3_errmsg(g->leader->conn)); sqlite3_reset(stmt); goto done; } if (rc == SQLITE_ROW) { response.eof = DQLITE_RESPONSE_ROWS_PART; g->req = req; SUCCESS_V0(rows, ROWS); return; } else { response.eof = DQLITE_RESPONSE_ROWS_DONE; SUCCESS_V0(rows, ROWS); } done: if (req->type == DQLITE_REQUEST_QUERY_SQL) { sqlite3_finalize(stmt); } } #ifdef DQLITE_NEXT static void qb_top(pool_work_t *w) { struct handle *req = CONTAINER_OF(w, struct handle, work); query_batch_async(req, POOL_TOP_HALF); } static void qb_bottom(pool_work_t *w) { struct handle *req = CONTAINER_OF(w, struct handle, work); query_batch_async(req, POOL_BOTTOM_HALF); } #endif static void query_batch(struct gateway *g) { struct handle *req = g->req; assert(req != NULL); g->req = NULL; req->gw = g; #ifdef DQLITE_NEXT struct dqlite_node *node = g->raft->data; pool_t *pool = !!(pool_ut_fallback()->flags & POOL_FOR_UT) ? pool_ut_fallback() : &node->pool; pool_queue_work(pool, &req->work, g->leader->db->cookie, WT_UNORD, qb_top, qb_bottom); #else query_batch_async(req, POOL_TOP_HALF); query_batch_async(req, POOL_BOTTOM_HALF); #endif } static void query_barrier_cb(struct barrier *barrier, int status) { tracef("query barrier cb status:%d", status); struct gateway *g = barrier->data; struct handle *req = g->req; assert(req != NULL); g->req = NULL; struct stmt *stmt = stmt__registry_get(&g->stmts, req->stmt_id); assert(stmt != NULL); if (status != 0) { failure(req, status, "barrier error"); return; } req->stmt = stmt->stmt; g->req = req; query_batch(g); } static void leaderModifyingQueryCb(struct exec *exec, int status) { struct gateway *g = exec->data; struct handle *req = g->req; assert(req != NULL); g->req = NULL; struct stmt *stmt = stmt__registry_get(&g->stmts, req->stmt_id); assert(stmt != NULL); if (status == SQLITE_DONE) { emptyRows(req); } else { assert(g->leader != NULL); failure(req, status, error_message(g->leader->conn, status)); sqlite3_reset(stmt->stmt); } } static int handle_query(struct gateway *g, struct handle *req) { tracef("handle query schema:%" PRIu8, req->schema); struct cursor *cursor = &req->cursor; struct stmt *stmt; struct request_query request = { 0 }; int tuple_format; bool is_readonly; uint64_t req_id; int rv; switch (req->schema) { case DQLITE_REQUEST_PARAMS_SCHEMA_V0: tuple_format = TUPLE__PARAMS; break; case DQLITE_REQUEST_PARAMS_SCHEMA_V1: tuple_format = TUPLE__PARAMS32; break; default: tracef("bad schema version %d", req->schema); failure(req, DQLITE_PARSE, "unrecognized schema version"); return 0; } /* The only difference in layout between the v0 and v1 requests is in * the tuple, which isn't parsed until bind__params later on. 
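*/

	/* Schema note (as the TUPLE__PARAMS32 name suggests): the v0 tuple
	 * header encodes the number of parameters in 8 bits, while v1 widens
	 * it to 32 bits so that statements with more than 255 parameters can
	 * be bound. */

	/*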
*/ rv = request_query__decode(cursor, &request); if (rv != 0) { return rv; } CHECK_LEADER(req); LOOKUP_DB(request.db_id); LOOKUP_STMT(request.stmt_id); FAIL_IF_CHECKPOINTING; rv = bind__params(stmt->stmt, cursor, tuple_format); if (rv != 0) { tracef("handle query bind failed %d", rv); failure(req, rv, "bind parameters"); return 0; } req->stmt_id = stmt->id; g->req = req; is_readonly = (bool)sqlite3_stmt_readonly(stmt->stmt); if (is_readonly) { rv = leader__barrier(g->leader, &g->barrier, query_barrier_cb); } else { req_id = idNext(&g->random_state); rv = leader__exec(g->leader, &g->exec, stmt->stmt, req_id, leaderModifyingQueryCb); } if (rv != 0) { g->req = NULL; return rv; } return 0; } static int handle_finalize(struct gateway *g, struct handle *req) { tracef("handle finalize"); struct cursor *cursor = &req->cursor; struct stmt *stmt; int rv; START_V0(finalize, empty); LOOKUP_DB(request.db_id); LOOKUP_STMT(request.stmt_id); rv = stmt__registry_del(&g->stmts, stmt); if (rv != 0) { tracef("handle finalize registry del failed %d", rv); failure(req, rv, "finalize statement"); return 0; } SUCCESS_V0(empty, EMPTY); return 0; } static void handle_exec_sql_next(struct gateway *g, struct handle *req, bool done); static void handle_exec_sql_cb(struct exec *exec, int status) { tracef("handle exec sql cb status %d", status); struct gateway *g = exec->data; struct handle *req = g->req; req->exec_count += 1; sqlite3_finalize(exec->stmt); if (status == SQLITE_DONE) { handle_exec_sql_next(g, req, true); } else { assert(g->leader != NULL); failure(req, status, error_message(g->leader->conn, status)); g->req = NULL; } } static void handle_exec_sql_next(struct gateway *g, struct handle *req, bool done) { tracef("handle exec sql next"); struct cursor *cursor = &req->cursor; struct response_result response = { 0 }; sqlite3_stmt *stmt = NULL; const char *tail; int tuple_format; uint64_t req_id; int rv; if (req->sql == NULL || strcmp(req->sql, "") == 0) { goto success; } /* stmt will be set to NULL by sqlite when an error occurs. 
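*/

	/* Note also the converse case: sqlite3_prepare_v2 returns SQLITE_OK
	 * with *stmt set to NULL when the input is empty or contains only
	 * whitespace or comments, which is why a NULL stmt below is treated
	 * as success rather than as an error. */

	/*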
*/ assert(g->leader != NULL); rv = sqlite3_prepare_v2(g->leader->conn, req->sql, -1, &stmt, &tail); if (rv != SQLITE_OK) { tracef("exec sql prepare failed %d", rv); failure(req, rv, sqlite3_errmsg(g->leader->conn)); goto done; } if (stmt == NULL) { goto success; } if (!done) { switch (req->schema) { case DQLITE_REQUEST_PARAMS_SCHEMA_V0: tuple_format = TUPLE__PARAMS; break; case DQLITE_REQUEST_PARAMS_SCHEMA_V1: tuple_format = TUPLE__PARAMS32; break; default: /* Should have been caught by handle_exec_sql */ assert(0); } rv = bind__params(stmt, cursor, tuple_format); if (rv != SQLITE_OK) { failure(req, rv, "bind parameters"); goto done_after_prepare; } } req->sql = tail; g->req = req; req_id = idNext(&g->random_state); /* At this point, leader__exec takes ownership of stmt */ rv = leader__exec(g->leader, &g->exec, stmt, req_id, handle_exec_sql_cb); if (rv != SQLITE_OK) { failure(req, rv, sqlite3_errmsg(g->leader->conn)); goto done_after_prepare; } return; success: tracef("handle exec sql next success"); if (req->exec_count > 0) { fill_result(g, &response); } SUCCESS_V0(result, RESULT); done_after_prepare: sqlite3_finalize(stmt); done: g->req = NULL; } static void execSqlBarrierCb(struct barrier *barrier, int status) { tracef("exec sql barrier cb status:%d", status); struct gateway *g = barrier->data; struct handle *req = g->req; assert(req != NULL); g->req = NULL; if (status != 0) { failure(req, status, "barrier error"); return; } handle_exec_sql_next(g, req, false); } static int handle_exec_sql(struct gateway *g, struct handle *req) { tracef("handle exec sql schema:%" PRIu8, req->schema); struct cursor *cursor = &req->cursor; struct request_exec_sql request = { 0 }; int rc; /* Fail early if the schema version isn't recognized, even though we * won't use it until later. */ if (req->schema != 0 && req->schema != 1) { tracef("bad schema version %d", req->schema); failure(req, DQLITE_PARSE, "unrecognized schema version"); return 0; } /* The only difference in layout between the v0 and v1 requests is in * the tuple, which isn't parsed until bind__params later on. 
*/ rc = request_exec_sql__decode(cursor, &request); if (rc != 0) { return rc; } CHECK_LEADER(req); LOOKUP_DB(request.db_id); FAIL_IF_CHECKPOINTING; req->sql = request.sql; req->exec_count = 0; g->req = req; rc = leader__barrier(g->leader, &g->barrier, execSqlBarrierCb); if (rc != 0) { tracef("handle exec sql barrier failed %d", rc); g->req = NULL; return rc; } return 0; } static void leaderModifyingQuerySqlCb(struct exec *exec, int status) { struct gateway *g = exec->data; struct handle *req = g->req; assert(req != NULL); g->req = NULL; sqlite3_stmt *stmt = exec->stmt; assert(stmt != NULL); sqlite3_finalize(stmt); if (status == SQLITE_DONE) { emptyRows(req); } else { assert(g->leader != NULL); failure(req, status, error_message(g->leader->conn, status)); } } static void querySqlBarrierCb(struct barrier *barrier, int status) { tracef("query sql barrier cb status:%d", status); struct gateway *g = barrier->data; struct handle *req = g->req; assert(req != NULL); g->req = NULL; struct cursor *cursor = &req->cursor; const char *sql = req->sql; sqlite3_stmt *stmt; const char *tail; sqlite3_stmt *tail_stmt; int tuple_format; bool is_readonly; uint64_t req_id; int rv; if (status != 0) { failure(req, status, "barrier error"); return; } rv = sqlite3_prepare_v2(g->leader->conn, sql, -1, &stmt, &tail); if (rv != SQLITE_OK) { tracef("handle query sql prepare failed %d", rv); failure(req, rv, sqlite3_errmsg(g->leader->conn)); return; } if (stmt == NULL) { tracef("handle query sql empty statement"); failure(req, rv, "empty statement"); return; } rv = sqlite3_prepare_v2(g->leader->conn, tail, -1, &tail_stmt, NULL); if (rv != 0 || tail_stmt != NULL) { sqlite3_finalize(stmt); sqlite3_finalize(tail_stmt); failure(req, SQLITE_ERROR, "nonempty statement tail"); return; } switch (req->schema) { case DQLITE_REQUEST_PARAMS_SCHEMA_V0: tuple_format = TUPLE__PARAMS; break; case DQLITE_REQUEST_PARAMS_SCHEMA_V1: tuple_format = TUPLE__PARAMS32; break; default: /* Should have been caught by handle_query_sql */ assert(0); } rv = bind__params(stmt, cursor, tuple_format); if (rv != 0) { tracef("handle query sql bind failed %d", rv); sqlite3_finalize(stmt); failure(req, rv, "bind parameters"); return; } req->stmt = stmt; g->req = req; is_readonly = (bool)sqlite3_stmt_readonly(stmt); if (is_readonly) { query_batch(g); } else { req_id = idNext(&g->random_state); rv = leader__exec(g->leader, &g->exec, stmt, req_id, leaderModifyingQuerySqlCb); if (rv != 0) { sqlite3_finalize(stmt); g->req = NULL; failure(req, rv, "leader exec"); } } } static int handle_query_sql(struct gateway *g, struct handle *req) { tracef("handle query sql schema:%" PRIu8, req->schema); struct cursor *cursor = &req->cursor; struct request_query_sql request = { 0 }; int rv; /* Fail early if the schema version isn't recognized. */ if (req->schema != 0 && req->schema != 1) { tracef("bad schema version %d", req->schema); failure(req, DQLITE_PARSE, "unrecognized schema version"); return 0; } /* Schema version only affect the tuple format, which is parsed later */ rv = request_query_sql__decode(cursor, &request); if (rv != 0) { return rv; } CHECK_LEADER(req); LOOKUP_DB(request.db_id); FAIL_IF_CHECKPOINTING; req->sql = request.sql; g->req = req; rv = leader__barrier(g->leader, &g->barrier, querySqlBarrierCb); if (rv != 0) { tracef("handle query sql barrier failed %d", rv); g->req = NULL; return rv; } return 0; } /* * An interrupt can only be handled when a query is already yielding rows. 
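*/

/* Concretely (see gateway__handle below): an INTERRUPT request is accepted
 * only while g->req holds an in-progress QUERY or QUERY_SQL; receiving any
 * other request while one is in flight is a hard error on the connection.
 * The handler finalizes the yielding statement (a no-op when req->stmt is
 * NULL) and replies with an EMPTY response. */

/*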
*/ static int handle_interrupt(struct gateway *g, struct handle *req) { tracef("handle interrupt"); g->req = NULL; struct cursor *cursor = &req->cursor; START_V0(interrupt, empty); sqlite3_finalize(req->stmt); req->stmt = NULL; SUCCESS_V0(empty, EMPTY); return 0; } struct change { struct gateway *gateway; struct raft_change req; }; static void raftChangeCb(struct raft_change *change, int status) { tracef("raft change cb id:%" PRIu64 " status:%d", idExtract(change->req_id), status); struct change *r = change->data; struct gateway *g = r->gateway; struct handle *req = g->req; struct response_empty response = { 0 }; g->req = NULL; sqlite3_free(r); if (status != 0) { failure(req, translateRaftErrCode(status), raft_strerror(status)); } else { SUCCESS_V0(empty, EMPTY); } } static int handle_add(struct gateway *g, struct handle *req) { tracef("handle add"); struct cursor *cursor = &req->cursor; struct change *r; uint64_t req_id; int rv; START_V0(add, empty); (void)response; CHECK_LEADER(req); r = sqlite3_malloc(sizeof *r); if (r == NULL) { return DQLITE_NOMEM; } r->gateway = g; r->req.data = r; req_id = idNext(&g->random_state); idSet(r->req.req_id, req_id); g->req = req; rv = raft_add(g->raft, &r->req, request.id, request.address, raftChangeCb); if (rv != 0) { tracef("raft add failed %d", rv); g->req = NULL; sqlite3_free(r); failure(req, translateRaftErrCode(rv), raft_strerror(rv)); return 0; } return 0; } static int handle_promote_or_assign(struct gateway *g, struct handle *req) { tracef("handle assign"); struct cursor *cursor = &req->cursor; struct change *r; uint64_t role = DQLITE_VOTER; uint64_t req_id; int rv; START_V0(promote_or_assign, empty); (void)response; CHECK_LEADER(req); /* Detect if this is an assign role request, instead of the former * promote request. 
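*/

	/* Wire detail: a legacy PROMOTE request carries only the node ID,
	 * while the newer ASSIGN request appends a 64-bit role code. A
	 * non-empty cursor therefore distinguishes the two, and the role
	 * defaults to DQLITE_VOTER for legacy clients. */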
	if (cursor->cap > 0) {
		rv = uint64__decode(cursor, &role);
		if (rv != 0) {
			tracef("handle assign promote rv %d", rv);
			return rv;
		}
	}

	r = sqlite3_malloc(sizeof *r);
	if (r == NULL) {
		tracef("malloc failed");
		return DQLITE_NOMEM;
	}
	r->gateway = g;
	r->req.data = r;
	req_id = idNext(&g->random_state);
	idSet(r->req.req_id, req_id);

	g->req = req;
	rv = raft_assign(g->raft, &r->req, request.id,
			 translateDqliteRole((int)role), raftChangeCb);
	if (rv != 0) {
		tracef("raft_assign failed %d", rv);
		g->req = NULL;
		sqlite3_free(r);
		failure(req, translateRaftErrCode(rv), raft_strerror(rv));
		return 0;
	}

	return 0;
}

static int handle_remove(struct gateway *g, struct handle *req)
{
	tracef("handle remove");
	struct cursor *cursor = &req->cursor;
	struct change *r;
	uint64_t req_id;
	int rv;
	START_V0(remove, empty);
	(void)response;
	CHECK_LEADER(req);
	r = sqlite3_malloc(sizeof *r);
	if (r == NULL) {
		tracef("malloc failed");
		return DQLITE_NOMEM;
	}
	r->gateway = g;
	r->req.data = r;
	req_id = idNext(&g->random_state);
	idSet(r->req.req_id, req_id);

	g->req = req;
	rv = raft_remove(g->raft, &r->req, request.id, raftChangeCb);
	if (rv != 0) {
		tracef("raft_remove failed %d", rv);
		g->req = NULL;
		sqlite3_free(r);
		failure(req, translateRaftErrCode(rv), raft_strerror(rv));
		return 0;
	}

	return 0;
}

static int dumpFile(const char *filename,
		    uint8_t *data,
		    size_t n,
		    struct buffer *buffer)
{
	char *cur;
	uint64_t len = n;

	cur = buffer__advance(buffer, text__sizeof(&filename));
	if (cur == NULL) {
		goto oom;
	}
	text__encode(&filename, &cur);
	cur = buffer__advance(buffer, uint64__sizeof(&len));
	if (cur == NULL) {
		goto oom;
	}
	uint64__encode(&len, &cur);

	if (n == 0) {
		return 0;
	}

	assert(n % 8 == 0);
	assert(data != NULL);

	cur = buffer__advance(buffer, n);
	if (cur == NULL) {
		goto oom;
	}
	memcpy(cur, data, n);

	return 0;

oom:
	return DQLITE_NOMEM;
}

static int handle_dump(struct gateway *g, struct handle *req)
{
	tracef("handle dump");
	struct cursor *cursor = &req->cursor;
	bool err = true;
	sqlite3_vfs *vfs;
	char *cur;
	char filename[1024] = { 0 };
	void *data;
	size_t n;
	uint8_t *page;
	uint32_t database_size = 0;
	uint8_t *database;
	uint8_t *wal;
	size_t n_database;
	size_t n_wal;
	int rv;
	START_V0(dump, files);

	response.n = 2;
	cur = buffer__advance(req->buffer, response_files__sizeof(&response));
	assert(cur != NULL);
	response_files__encode(&response, &cur);

	vfs = sqlite3_vfs_find(g->config->name);
	rv = VfsSnapshot(vfs, request.filename, &data, &n);
	if (rv != 0) {
		tracef("dump failed");
		failure(req, rv, "failed to dump database");
		return 0;
	}

	if (data != NULL) {
		/* Extract the database size from the first page. */
		page = data;
		database_size += (uint32_t)(page[28] << 24);
		database_size += (uint32_t)(page[29] << 16);
		database_size += (uint32_t)(page[30] << 8);
		database_size += (uint32_t)(page[31]);

		n_database = database_size * g->config->page_size;
		n_wal = n - n_database;

		database = data;
		wal = database + n_database;
	} else {
		assert(n == 0);
		n_database = 0;
		n_wal = 0;
		database = NULL;
		wal = NULL;
	}

	rv = dumpFile(request.filename, database, n_database, req->buffer);
	if (rv != 0) {
		tracef("dump failed");
		failure(req, rv, "failed to dump database");
		goto out_free_data;
	}

	/* filename is zero-initialized and initially we allow writing only
	 * 1024 - 4 - 1 bytes to it, so after strncpy filename will be
	 * zero-terminated and will not have overflowed. strcat then adds the
	 * 4-byte suffix and also zero-terminates the resulting string. */
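	/* Worked example of the bound: sizeof(filename) is 1024 and
	 * strlen("-wal") is 4, so at most 1019 bytes of the client-supplied
	 * name are copied, leaving room for the 4-byte suffix plus the NUL
	 * terminator within the 1024-byte buffer. */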
	const char *wal_suffix = "-wal";
	strncpy(filename, request.filename,
		sizeof(filename) - strlen(wal_suffix) - 1);
	strcat(filename, wal_suffix);
	rv = dumpFile(filename, wal, n_wal, req->buffer);
	if (rv != 0) {
		tracef("wal dump failed");
		failure(req, rv, "failed to dump wal file");
		goto out_free_data;
	}

	err = false;

out_free_data:
	if (data != NULL) {
		raft_free(data);
	}

	if (!err) {
		req->cb(req, 0, DQLITE_RESPONSE_FILES, 0);
	}

	return 0;
}

static int encodeServer(struct gateway *g,
			unsigned i,
			struct buffer *buffer,
			int format)
{
	char *cur;
	uint64_t id;
	uint64_t role;
	text_t address;

	assert(format == DQLITE_REQUEST_CLUSTER_FORMAT_V0 ||
	       format == DQLITE_REQUEST_CLUSTER_FORMAT_V1);

	id = g->raft->configuration.servers[i].id;
	address = g->raft->configuration.servers[i].address;
	role = (uint64_t)translateRaftRole(
	    g->raft->configuration.servers[i].role);

	cur = buffer__advance(buffer, uint64__sizeof(&id));
	if (cur == NULL) {
		return DQLITE_NOMEM;
	}
	uint64__encode(&id, &cur);

	cur = buffer__advance(buffer, text__sizeof(&address));
	if (cur == NULL) {
		return DQLITE_NOMEM;
	}
	text__encode(&address, &cur);

	if (format == DQLITE_REQUEST_CLUSTER_FORMAT_V0) {
		return 0;
	}

	cur = buffer__advance(buffer, uint64__sizeof(&role));
	if (cur == NULL) {
		return DQLITE_NOMEM;
	}
	uint64__encode(&role, &cur);

	return 0;
}

static int handle_cluster(struct gateway *g, struct handle *req)
{
	tracef("handle cluster");
	struct cursor *cursor = &req->cursor;
	unsigned i;
	char *cur;
	int rv;
	START_V0(cluster, servers);

	if (request.format != DQLITE_REQUEST_CLUSTER_FORMAT_V0 &&
	    request.format != DQLITE_REQUEST_CLUSTER_FORMAT_V1) {
		tracef("bad cluster format");
		failure(req, DQLITE_PARSE, "unrecognized cluster format");
		return 0;
	}

	response.n = g->raft->configuration.n;
	cur = buffer__advance(req->buffer, response_servers__sizeof(&response));
	assert(cur != NULL);
	response_servers__encode(&response, &cur);

	for (i = 0; i < response.n; i++) {
		rv = encodeServer(g, i, req->buffer, (int)request.format);
		if (rv != 0) {
			tracef("encode failed");
			failure(req, rv, "failed to encode server");
			return 0;
		}
	}

	req->cb(req, 0, DQLITE_RESPONSE_SERVERS, 0);

	return 0;
}

void raftTransferCb(struct raft_transfer *r)
{
	struct gateway *g = r->data;
	struct handle *req = g->req;
	struct response_empty response = { 0 };
	g->req = NULL;
	sqlite3_free(r);
	if (g->raft->state == RAFT_LEADER) {
		tracef("transfer failed");
		failure(req, DQLITE_ERROR, "leadership transfer failed");
	} else {
		SUCCESS_V0(empty, EMPTY);
	}
}

static int handle_transfer(struct gateway *g, struct handle *req)
{
	tracef("handle transfer");
	struct cursor *cursor = &req->cursor;
	struct raft_transfer *r;
	int rv;
	START_V0(transfer, empty);
	(void)response;
	CHECK_LEADER(req);

	r = sqlite3_malloc(sizeof *r);
	if (r == NULL) {
		tracef("malloc failed");
		return DQLITE_NOMEM;
	}
	r->data = g;

	g->req = req;
	rv = raft_transfer(g->raft, r, request.id, raftTransferCb);
	if (rv != 0) {
		tracef("raft_transfer failed %d", rv);
		g->req = NULL;
		sqlite3_free(r);
		failure(req, translateRaftErrCode(rv), raft_strerror(rv));
		return 0;
	}

	return 0;
}

static int handle_describe(struct gateway *g, struct handle *req)
{
	tracef("handle describe");
	struct cursor *cursor = &req->cursor;
	START_V0(describe, metadata);
	if (request.format != DQLITE_REQUEST_DESCRIBE_FORMAT_V0) {
		tracef("bad format");
		failure(req, SQLITE_PROTOCOL, "bad format version");
		/* Bail out so we don't also encode a success response. */
		return 0;
	}
	response.failure_domain = g->config->failure_domain;
	response.weight = g->config->weight;
	SUCCESS_V0(metadata, METADATA);
	return 0;
}

static int handle_weight(struct gateway *g, struct handle *req)
{
tracef("handle weight"); struct cursor *cursor = &req->cursor; START_V0(weight, empty); g->config->weight = request.weight; SUCCESS_V0(empty, EMPTY); return 0; } int gateway__handle(struct gateway *g, struct handle *req, int type, int schema, struct buffer *buffer, handle_cb cb) { tracef("gateway handle"); int rc = 0; sqlite3_stmt *stmt = NULL; // used for DQLITE_REQUEST_INTERRUPT if (g->req == NULL) { goto handle; } /* Request in progress. TODO The current implementation doesn't allow * reading a new request while a query is yielding rows, in that case * gateway__resume in write_cb will indicate it has not finished * returning results and a new request (in this case, the interrupt) * will not be read. */ if (g->req->type == DQLITE_REQUEST_QUERY && type == DQLITE_REQUEST_INTERRUPT) { goto handle; } if (g->req->type == DQLITE_REQUEST_QUERY_SQL && type == DQLITE_REQUEST_INTERRUPT) { stmt = g->req->stmt; goto handle; } /* Receiving a request when one is ongoing on the same connection * is a hard error. The connection will be stopped due to the non-0 * return code in case asserts are off. */ assert(false); return SQLITE_BUSY; handle: req->type = type; req->schema = schema; req->cb = cb; req->buffer = buffer; req->db_id = 0; req->stmt_id = 0; req->sql = NULL; req->stmt = stmt; req->exec_count = 0; req->work = (pool_work_t){}; switch (type) { #define DISPATCH(LOWER, UPPER, _) \ case DQLITE_REQUEST_##UPPER: \ rc = handle_##LOWER(g, req); \ break; REQUEST__TYPES(DISPATCH); default: tracef("unrecognized request type %d", type); failure(req, DQLITE_PARSE, "unrecognized request type"); rc = 0; } return rc; } int gateway__resume(struct gateway *g, bool *finished) { if (g->req == NULL || (g->req->type != DQLITE_REQUEST_QUERY && g->req->type != DQLITE_REQUEST_QUERY_SQL)) { tracef("gateway resume - finished"); *finished = true; return 0; } tracef("gateway resume - not finished"); *finished = false; g->req->work = (pool_work_t){}; query_batch(g); return 0; } dqlite-1.16.7/src/gateway.h000066400000000000000000000103111465252713400154760ustar00rootroot00000000000000/** * Core dqlite server engine, calling out SQLite for serving client requests. */ #ifndef DQLITE_GATEWAY_H_ #define DQLITE_GATEWAY_H_ #include "../include/dqlite.h" #include "lib/buffer.h" #include "lib/serialize.h" #include "config.h" #include "id.h" #include "leader.h" #include "raft.h" #include "registry.h" #include "stmt.h" struct handle; /** * Handle requests from a single connected client and forward them to * SQLite. */ struct gateway { struct config *config; /* Configuration */ struct registry *registry; /* Register of existing databases */ struct raft *raft; /* Raft instance */ struct leader *leader; /* Leader connection to the database */ struct handle *req; /* Asynchronous request being handled */ struct exec exec; /* Low-level exec async request */ struct stmt__registry stmts; /* Registry of prepared statements */ struct barrier barrier; /* Barrier for query requests */ uint64_t protocol; /* Protocol format version */ uint64_t client_id; struct id_state random_state; /* For generating IDs */ }; void gateway__init(struct gateway *g, struct config *config, struct registry *registry, struct raft *raft, struct id_state seed); void gateway__close(struct gateway *g); /** * Closes the leader connection to the database, reason should contain a raft * error code. */ void gateway__leader_close(struct gateway *g, int reason); /** * Asynchronous request to handle a client command. 
* * We also use the handle as a place to save request-scoped data that we need * to access from a callback. */ typedef void (*handle_cb)(struct handle *req, int status, uint8_t type, uint8_t schema); struct handle { /* User data. */ void *data; /* Type code for this request. */ int type; /* Schema version for this request. */ int schema; /* Buffer where the response to this request will be written. */ struct buffer *buffer; /* Cursor for reading the request. */ struct cursor cursor; /* Database ID parsed from this request. * * This is used by handle_prepare. */ size_t db_id; /* ID of the statement associated with this request. * * This is used by handle_prepare. */ size_t stmt_id; /* SQL string associated with this request. * * This is used by handle_prepare, handle_query_sql, and handle_exec_sql * to save the provided SQL string across calls to leader__barrier and * leader__exec, since there's no prepared statement that can be saved * instead. In the case of handle_exec_sql, after preparing each * statement we update this field to point to the "tail" that has not * been prepared yet. */ const char *sql; /* Prepared statement that will be queried to process this request. * * This is used by handle_query and handle_query_sql. */ sqlite3_stmt *stmt; /* Number of times a statement parsed from this request has been * executed. * * This is used by handle_exec_sql, which parses zero or more statements * from the provided SQL string and executes them successively. Only if * at least one statement was executed should we fill the RESULT * response using sqlite3_last_insert_rowid and sqlite3_changes. */ unsigned exec_count; /* Callback that will be invoked at the end of request processing to * write the response. */ handle_cb cb; /* A link into thread pool's queues. */ pool_work_t work; /* Gateway the handle belongs to. */ struct gateway *gw; }; /** * Start handling a new client request. * * At most one request can be outstanding at any given time. This function will * return an error if user code calls it and there's already a request in * progress. * * The @type parameter holds the request type code (e.g. #REQUEST_LEADER), and * the @buffer parameter is a buffer for writing the response. */ int gateway__handle(struct gateway *g, struct handle *req, int type, int schema, struct buffer *buffer, handle_cb cb); /** * Resume execution of a query that was yielding a lot of rows and has been * interrupted in order to start sending a first batch of rows. The response * write buffer associated with the request must have been reset. */ int gateway__resume(struct gateway *g, bool *finished); #endif /* DQLITE_GATEWAY_H_ */ dqlite-1.16.7/src/id.c000066400000000000000000000027421465252713400144350ustar00rootroot00000000000000#include "id.h" #include /* The PRNG used for generating request IDs is xoshiro256**, developed by * David Blackman and Sebastiano Vigna and released into the public domain. * See . 
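*/

/* Usage sketch (illustrative, not part of this file): a server can hand each
 * new client connection a disjoint sequence of request IDs by copying a
 * shared seed and jumping it once per connection:
 *
 *	struct id_state conn_state = shared_seed;  // hypothetical seed
 *	idJump(&conn_state);                       // decorrelate sequences
 *	uint64_t id = idNext(&conn_state);         // draw IDs as needed
 *
 * This mirrors how each gateway keeps its own random_state. */

/*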
*/ static uint64_t rotl(uint64_t x, int k) { return (x << k) | (x >> (64 - k)); } uint64_t idNext(struct id_state *state) { uint64_t result = rotl(state->data[1] * 5, 7) * 9; uint64_t t = state->data[1] << 17; state->data[2] ^= state->data[0]; state->data[3] ^= state->data[1]; state->data[1] ^= state->data[2]; state->data[0] ^= state->data[3]; state->data[2] ^= t; state->data[3] = rotl(state->data[3], 45); return result; } void idJump(struct id_state *state) { static const uint64_t JUMP[] = {0x180ec6d33cfd0aba, 0xd5a61266f0c9392c, 0xa9582618e03fc9aa, 0x39abdc4529b1661c}; uint64_t s0 = 0; uint64_t s1 = 0; uint64_t s2 = 0; uint64_t s3 = 0; for (size_t i = 0; i < sizeof(JUMP) / sizeof(*JUMP); i++) { for (size_t b = 0; b < 64; b++) { if (JUMP[i] & UINT64_C(1) << b) { s0 ^= state->data[0]; s1 ^= state->data[1]; s2 ^= state->data[2]; s3 ^= state->data[3]; } idNext(state); } } state->data[0] = s0; state->data[1] = s1; state->data[2] = s2; state->data[3] = s3; } uint64_t idExtract(const uint8_t buf[16]) { uint64_t id; memcpy(&id, buf, sizeof(id)); return id; } void idSet(uint8_t buf[16], uint64_t id) { memset(buf, 0, 16); memcpy(buf, &id, sizeof(id)); buf[15] = (uint8_t)-1; } dqlite-1.16.7/src/id.h000066400000000000000000000020151465252713400144330ustar00rootroot00000000000000/** * Generate, set, and extract dqlite-generated request IDs. * * A fresh ID is generated for each config or exec client request that * arrives at a gateway. These IDs are passed down into raft via the * req_id field of RAFT__REQUEST, and are suitable for diagnostic use * only. */ #ifndef DQLITE_ID_H_ #define DQLITE_ID_H_ #include /** * State used to generate a request ID. */ struct id_state { uint64_t data[4]; }; /** * Generate a request ID, mutating the input state in the process. */ uint64_t idNext(struct id_state *state); /** * Cause the given state to yield a different sequence of IDs. * * This is used to ensure that the sequences of IDs generated for * distinct clients are (in practice) disjoint. */ void idJump(struct id_state *state); /** * Read a request ID from the req_id field of RAFT__REQUEST. */ uint64_t idExtract(const uint8_t buf[16]); /** * Write a request ID to the req_id field of RAFT__REQUEST. */ void idSet(uint8_t buf[16], uint64_t id); #endif /* DQLITE_ID_H_ */ dqlite-1.16.7/src/leader.c000066400000000000000000000303721465252713400152750ustar00rootroot00000000000000#include #include #include "../include/dqlite.h" #include "./lib/assert.h" #include "command.h" #include "conn.h" #include "gateway.h" #include "id.h" #include "leader.h" #include "lib/threadpool.h" #include "server.h" #include "tracing.h" #include "utils.h" #include "vfs.h" /* Called when a leader exec request terminates and the associated callback can * be invoked. */ static void leaderExecDone(struct exec *req) { tracef("leader exec done id:%" PRIu64, req->id); req->leader->exec = NULL; if (req->cb != NULL) { req->cb(req, req->status); } } /* Open a SQLite connection and set it to leader replication mode. 
*/ static int openConnection(const char *filename, const char *vfs, unsigned page_size, sqlite3 **conn) { tracef("open connection filename %s", filename); char pragma[255]; int flags = SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE; char *msg = NULL; int rc; rc = sqlite3_open_v2(filename, conn, flags, vfs); if (rc != SQLITE_OK) { tracef("open failed %d", rc); goto err; } /* Enable extended result codes */ rc = sqlite3_extended_result_codes(*conn, 1); if (rc != SQLITE_OK) { tracef("extended codes failed %d", rc); goto err; } /* The vfs, db, gateway, and leader code currently assumes that * each connection will operate on only one DB file/WAL file * pair. Make sure that the client can't use ATTACH DATABASE to * break this assumption. We apply the same limit in open_follower_conn * in db.c. * * Note, 0 instead of 1 -- apparently the "initial database" is not * counted when evaluating this limit. */ sqlite3_limit(*conn, SQLITE_LIMIT_ATTACHED, 0); /* Set the page size. */ sprintf(pragma, "PRAGMA page_size=%d", page_size); rc = sqlite3_exec(*conn, pragma, NULL, NULL, &msg); if (rc != SQLITE_OK) { tracef("page size set failed %d page size %u", rc, page_size); goto err; } /* Disable syncs. */ rc = sqlite3_exec(*conn, "PRAGMA synchronous=OFF", NULL, NULL, &msg); if (rc != SQLITE_OK) { tracef("sync off failed %d", rc); goto err; } /* Set WAL journaling. */ rc = sqlite3_exec(*conn, "PRAGMA journal_mode=WAL", NULL, NULL, &msg); if (rc != SQLITE_OK) { tracef("wal on failed %d", rc); goto err; } rc = sqlite3_exec(*conn, "PRAGMA wal_autocheckpoint=0", NULL, NULL, &msg); if (rc != SQLITE_OK) { tracef("wal autocheckpoint off failed %d", rc); goto err; } rc = sqlite3_db_config(*conn, SQLITE_DBCONFIG_NO_CKPT_ON_CLOSE, 1, NULL); if (rc != SQLITE_OK) { tracef("db config failed %d", rc); goto err; } /* TODO: make setting foreign keys optional. */ rc = sqlite3_exec(*conn, "PRAGMA foreign_keys=1", NULL, NULL, &msg); if (rc != SQLITE_OK) { tracef("enable foreign keys failed %d", rc); goto err; } return 0; err: if (*conn != NULL) { sqlite3_close(*conn); *conn = NULL; } if (msg != NULL) { sqlite3_free(msg); } return rc; } /* Whether we need to submit a barrier request because there is no transaction * in progress in the underlying database and the FSM is behind the last log * index. */ static bool needsBarrier(struct leader *l) { return l->db->tx_id == 0 && raft_last_applied(l->raft) < raft_last_index(l->raft); } int leader__init(struct leader *l, struct db *db, struct raft *raft) { tracef("leader init"); int rc; l->db = db; l->raft = raft; rc = openConnection(db->path, db->config->name, db->config->page_size, &l->conn); if (rc != 0) { tracef("open failed %d", rc); return rc; } l->exec = NULL; l->inflight = NULL; queue_insert_tail(&db->leaders, &l->queue); return 0; } void leader__close(struct leader *l) { tracef("leader close"); int rc; /* TODO: there shouldn't be any ongoing exec request. */ if (l->exec != NULL) { assert(l->inflight == NULL); l->exec->status = SQLITE_ERROR; leaderExecDone(l->exec); } rc = sqlite3_close(l->conn); assert(rc == 0); queue_remove(&l->queue); } /* A checkpoint command that fails to commit is not a huge issue. * The WAL will not be checkpointed this time around on these nodes, * a new checkpoint command will be issued once the WAL on the leader reaches * threshold size again. It's improbable that the WAL in this way could grow * without bound, it would mean that apply frames commands commit without * issues, while the checkpoint command would somehow always fail to commit. 
*/ static void leaderCheckpointApplyCb(struct raft_apply *req, int status, void *result) { (void)result; raft_free(req); if (status != 0) { tracef("checkpoint apply failed %d", status); } } /* Attempt to perform a checkpoint on nodes running a version of dqlite that * doesn't perform autonomous checkpoints. For recent nodes, the checkpoint * command will just be a no-op. * This function will run after the WAL might have been checkpointed during a * call to `apply_frames`. * */ static void leaderMaybeCheckpointLegacy(struct leader *l) { tracef("leader maybe checkpoint legacy"); struct sqlite3_file *wal; struct raft_buffer buf; struct command_checkpoint command; sqlite3_int64 size; int rv; /* Get the WAL file associated with this connection */ rv = sqlite3_file_control(l->conn, "main", SQLITE_FCNTL_JOURNAL_POINTER, &wal); assert(rv == SQLITE_OK); /* Should never fail */ rv = wal->pMethods->xFileSize(wal, &size); assert(rv == SQLITE_OK); /* Should never fail */ /* size of the WAL will be 0 if it has just been checkpointed on this * leader as a result of running apply_frames. */ if (size != 0) { return; } tracef("issue checkpoint command"); /* Attempt to perform a checkpoint across nodes that don't perform * autonomous snapshots. */ command.filename = l->db->filename; rv = command__encode(COMMAND_CHECKPOINT, &command, &buf); if (rv != 0) { tracef("encode failed %d", rv); return; } struct raft_apply *apply = raft_malloc(sizeof(*apply)); if (apply == NULL) { tracef("raft_malloc - no mem"); goto err_after_buf_alloc; } #ifdef USE_SYSTEM_RAFT rv = raft_apply(l->raft, apply, &buf, 1, leaderCheckpointApplyCb); #else rv = raft_apply(l->raft, apply, &buf, NULL, 1, leaderCheckpointApplyCb); #endif if (rv != 0) { tracef("raft_apply failed %d", rv); raft_free(apply); goto err_after_buf_alloc; } return; err_after_buf_alloc: raft_free(buf.base); } static void leaderApplyFramesCb(struct raft_apply *req, int status, void *result) { tracef("apply frames cb id:%" PRIu64, idExtract(req->req_id)); struct apply *apply = req->data; struct leader *l = apply->leader; if (l == NULL) { raft_free(apply); return; } (void)result; if (status != 0) { tracef("apply frames cb failed status %d", status); sqlite3_vfs *vfs = sqlite3_vfs_find(l->db->config->name); switch (status) { case RAFT_LEADERSHIPLOST: l->exec->status = SQLITE_IOERR_LEADERSHIP_LOST; break; case RAFT_NOSPACE: l->exec->status = SQLITE_IOERR_WRITE; break; case RAFT_SHUTDOWN: /* If we got here it means we have manually * fired the apply callback from * gateway__close(). In this case we don't * free() the apply object, since it will be * freed when the callback is fired again by * raft. * * TODO: we should instead make gateway__close() * itself asynchronous. 
*/ apply->leader = NULL; l->exec->status = SQLITE_ABORT; goto finish; break; default: l->exec->status = SQLITE_IOERR; break; } VfsAbort(vfs, l->db->path); } raft_free(apply); if (status == 0) { leaderMaybeCheckpointLegacy(l); } finish: l->inflight = NULL; l->db->tx_id = 0; leaderExecDone(l->exec); } static int leaderApplyFrames(struct exec *req, dqlite_vfs_frame *frames, unsigned n) { tracef("leader apply frames id:%" PRIu64, req->id); struct leader *l = req->leader; struct db *db = l->db; struct command_frames c; struct raft_buffer buf; struct apply *apply; int rv; c.filename = db->filename; c.tx_id = 0; c.truncate = 0; c.is_commit = 1; c.frames.n_pages = (uint32_t)n; c.frames.page_size = (uint16_t)db->config->page_size; c.frames.data = frames; apply = raft_malloc(sizeof *apply); if (apply == NULL) { tracef("malloc"); rv = DQLITE_NOMEM; goto err; } rv = command__encode(COMMAND_FRAMES, &c, &buf); if (rv != 0) { tracef("encode %d", rv); goto err_after_apply_alloc; } apply->leader = req->leader; apply->req.data = apply; apply->type = COMMAND_FRAMES; idSet(apply->req.req_id, req->id); #ifdef USE_SYSTEM_RAFT rv = raft_apply(l->raft, &apply->req, &buf, 1, leaderApplyFramesCb); #else /* TODO actual WAL slice goes here */ struct raft_entry_local_data local_data = {}; rv = raft_apply(l->raft, &apply->req, &buf, &local_data, 1, leaderApplyFramesCb); #endif if (rv != 0) { tracef("raft apply failed %d", rv); goto err_after_command_encode; } db->tx_id = 1; l->inflight = apply; return 0; err_after_command_encode: raft_free(buf.base); err_after_apply_alloc: raft_free(apply); err: assert(rv != 0); return rv; } static void leaderExecV2(struct exec *req, enum pool_half half) { tracef("leader exec v2 id:%" PRIu64, req->id); struct leader *l = req->leader; struct db *db = l->db; sqlite3_vfs *vfs = sqlite3_vfs_find(db->config->name); dqlite_vfs_frame *frames; uint64_t size; unsigned n; unsigned i; int rv; if (half == POOL_TOP_HALF) { req->status = sqlite3_step(req->stmt); return; } /* else POOL_BOTTOM_HALF => */ rv = VfsPoll(vfs, db->path, &frames, &n); if (rv != 0 || n == 0) { tracef("vfs poll"); goto finish; } /* Check if the new frames would create an overfull database */ size = VfsDatabaseSize(vfs, db->path, n, db->config->page_size); if (size > VfsDatabaseSizeLimit(vfs)) { rv = SQLITE_FULL; goto abort; } rv = leaderApplyFrames(req, frames, n); if (rv != 0) { goto abort; } for (i = 0; i < n; i++) { sqlite3_free(frames[i].data); } sqlite3_free(frames); return; abort: for (i = 0; i < n; i++) { sqlite3_free(frames[i].data); } sqlite3_free(frames); VfsAbort(vfs, l->db->path); finish: if (rv != 0) { tracef("exec v2 failed %d", rv); l->exec->status = rv; } leaderExecDone(l->exec); } #ifdef DQLITE_NEXT static void exec_top(pool_work_t *w) { struct exec *req = CONTAINER_OF(w, struct exec, work); leaderExecV2(req, POOL_TOP_HALF); } static void exec_bottom(pool_work_t *w) { struct exec *req = CONTAINER_OF(w, struct exec, work); leaderExecV2(req, POOL_BOTTOM_HALF); } #endif static void execBarrierCb(struct barrier *barrier, int status) { tracef("exec barrier cb status %d", status); struct exec *req = barrier->data; struct leader *l = req->leader; if (status != 0) { l->exec->status = status; leaderExecDone(l->exec); return; } #ifdef DQLITE_NEXT struct dqlite_node *node = l->raft->data; pool_t *pool = !!(pool_ut_fallback()->flags & POOL_FOR_UT) ? 
pool_ut_fallback() : &node->pool; pool_queue_work(pool, &req->work, l->db->cookie, WT_UNORD, exec_top, exec_bottom); #else leaderExecV2(req, POOL_TOP_HALF); leaderExecV2(req, POOL_BOTTOM_HALF); #endif } int leader__exec(struct leader *l, struct exec *req, sqlite3_stmt *stmt, uint64_t id, exec_cb cb) { tracef("leader exec id:%" PRIu64, id); int rv; if (l->exec != NULL) { tracef("busy"); return SQLITE_BUSY; } l->exec = req; req->leader = l; req->stmt = stmt; req->id = id; req->cb = cb; req->barrier.data = req; req->barrier.cb = NULL; req->work = (pool_work_t){}; rv = leader__barrier(l, &req->barrier, execBarrierCb); if (rv != 0) { l->exec = NULL; return rv; } return 0; } static void raftBarrierCb(struct raft_barrier *req, int status) { tracef("raft barrier cb status %d", status); struct barrier *barrier = req->data; int rv = 0; if (status != 0) { if (status == RAFT_LEADERSHIPLOST) { rv = SQLITE_IOERR_LEADERSHIP_LOST; } else { rv = SQLITE_ERROR; } } barrier_cb cb = barrier->cb; if (cb == NULL) { tracef("barrier cb already fired"); return; } barrier->cb = NULL; cb(barrier, rv); } int leader__barrier(struct leader *l, struct barrier *barrier, barrier_cb cb) { tracef("leader barrier"); int rv; if (!needsBarrier(l)) { tracef("not needed"); cb(barrier, 0); return 0; } barrier->cb = cb; barrier->leader = l; barrier->req.data = barrier; rv = raft_barrier(l->raft, &barrier->req, raftBarrierCb); if (rv != 0) { tracef("raft barrier failed %d", rv); barrier->req.data = NULL; barrier->leader = NULL; barrier->cb = NULL; return rv; } return 0; } dqlite-1.16.7/src/leader.h000066400000000000000000000063641465252713400153060ustar00rootroot00000000000000/** * Track the state of a leader connection and execute statements asynchronously. */ #ifndef LEADER_H_ #define LEADER_H_ #include #include #include "./lib/queue.h" #include "db.h" #include "lib/threadpool.h" #include "raft.h" #define SQLITE_IOERR_NOT_LEADER (SQLITE_IOERR | (40 << 8)) #define SQLITE_IOERR_LEADERSHIP_LOST (SQLITE_IOERR | (41 << 8)) struct exec; struct barrier; struct leader; typedef void (*exec_cb)(struct exec *req, int status); typedef void (*barrier_cb)(struct barrier *req, int status); /* Wrapper around raft_apply, saving context information. */ struct apply { struct raft_apply req; /* Raft apply request */ int status; /* Raft apply result */ struct leader *leader; /* Leader connection that triggered the hook */ int type; /* Command type */ union { /* Command-specific data */ struct { bool is_commit; } frames; }; }; struct leader { struct db *db; /* Database of this connection. */ sqlite3 *conn; /* Underlying SQLite connection. */ struct raft *raft; /* Raft instance. */ struct exec *exec; /* Exec request in progress, if any. */ queue queue; /* Prev/next leader, used by struct db. */ struct apply *inflight; /* TODO: make leader__close async */ }; struct barrier { void *data; struct leader *leader; struct raft_barrier req; barrier_cb cb; }; /** * Asynchronous request to execute a statement. */ struct exec { void *data; struct leader *leader; struct barrier barrier; sqlite3_stmt *stmt; uint64_t id; int status; queue queue; exec_cb cb; pool_work_t work; }; /** * Initialize a new leader connection. * * This function will start the leader loop coroutine and pause it immediately, * transferring control back to the main coroutine and then opening a new leader * connection against the given database. */ int leader__init(struct leader *l, struct db *db, struct raft *raft); void leader__close(struct leader *l); /** * Submit a request to step a SQLite statement. 
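 *
 * (Historical note: the coroutine-based description below predates the
 * current implementation, in which leaderExecV2() runs sqlite3_step() as
 * the "top half" of a thread-pool work item when DQLITE_NEXT is enabled,
 * and invokes both halves inline otherwise.)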
* * The request will be dispatched to the leader loop coroutine, which will be * resumed and will invoke sqlite_step(). If the statement triggers the * replication hooks and one or more new Raft log entries need to be appended, * then the loop coroutine will be paused and control will be transferred back * to the main coroutine. In this state the leader loop coroutine call stack * will be "blocked" on the xFrames() replication hook call triggered by the top * sqlite_step() call. The leader loop coroutine will be resumed once the Raft * append request completes (either successfully or not) and at that point the * stack will rewind back to the @sqlite_step() call, returning to the leader * loop which will then have completed the request and transfer control back to * the main coroutine, pausing until the next request. */ int leader__exec(struct leader *l, struct exec *req, sqlite3_stmt *stmt, uint64_t id, exec_cb cb); /** * Submit a raft barrier request if there is no transaction in progress in the * underlying database and the FSM is behind the last log index. * * Otherwise, just invoke the given @cb immediately. */ int leader__barrier(struct leader *l, struct barrier *barrier, barrier_cb cb); #endif /* LEADER_H_*/ dqlite-1.16.7/src/lib/000077500000000000000000000000001465252713400144365ustar00rootroot00000000000000dqlite-1.16.7/src/lib/addr.c000066400000000000000000000051331465252713400155160ustar00rootroot00000000000000#include "addr.h" #include #include #include #include #include #include #include "../../include/dqlite.h" int AddrParse(const char *input, struct sockaddr *addr, socklen_t *addr_len, const char *service, int flags) { int rv; char *node = NULL; size_t input_len = strlen(input); char c = input[0]; struct sockaddr_un *addr_un; const char *name, *addr_start, *close_bracket, *colon; size_t name_len; struct addrinfo hints, *res; if (c == '@') { /* Unix domain address. 
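 * E.g. "@my-node" (a hypothetical name) maps to the abstract socket name
 * "\0my-node", while a bare "@" requests an autogenerated name.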
* FIXME the use of the "abstract namespace" here is * Linux-specific */ if (!(flags & DQLITE_ADDR_PARSE_UNIX)) { return DQLITE_MISUSE; } addr_un = (struct sockaddr_un *)addr; if (*addr_len < sizeof(*addr_un)) { return DQLITE_ERROR; } name = input + 1; name_len = input_len - 1; if (name_len == 0) { /* Autogenerated abstract socket name */ addr_un->sun_family = AF_UNIX; *addr_len = sizeof(addr_un->sun_family); return 0; } /* Leading null byte, no trailing null byte */ if (name_len + 1 > sizeof(addr_un->sun_path)) { return DQLITE_ERROR; } memset(addr_un->sun_path, 0, sizeof(addr_un->sun_path)); memcpy(addr_un->sun_path + 1, name, name_len); addr_un->sun_family = AF_UNIX; *addr_len = (socklen_t)offsetof(struct sockaddr_un, sun_path) + (socklen_t)name_len + 1; return 0; } else if (c == '[') { /* IPv6 address with port */ addr_start = input + 1; close_bracket = memchr(input, ']', input_len); if (!close_bracket) { return DQLITE_ERROR; } colon = close_bracket + 1; if (*colon != ':') { return DQLITE_ERROR; } service = colon + 1; node = strndup(addr_start, (size_t)(close_bracket - addr_start)); } else if (memchr(input, '.', input_len)) { /* IPv4 address */ colon = memchr(input, ':', input_len); if (colon) { service = colon + 1; node = strndup(input, (size_t)(colon - input)); } else { node = strdup(input); } } else { /* IPv6 address without port */ node = strdup(input); } if (!node) { return DQLITE_NOMEM; } memset(&hints, 0, sizeof(hints)); hints.ai_family = AF_UNSPEC; hints.ai_socktype = SOCK_STREAM; hints.ai_flags = AI_NUMERICHOST | AI_NUMERICSERV; rv = getaddrinfo(node, service, &hints, &res); if (rv != 0) { rv = DQLITE_ERROR; goto err_after_strdup; } if (res->ai_addrlen > *addr_len) { rv = DQLITE_ERROR; goto err_after_getaddrinfo; } memcpy(addr, res->ai_addr, res->ai_addrlen); *addr_len = res->ai_addrlen; err_after_getaddrinfo: freeaddrinfo(res); err_after_strdup: free(node); return rv; } dqlite-1.16.7/src/lib/addr.h000066400000000000000000000017001465252713400155170ustar00rootroot00000000000000#ifndef ADDR_H_ #define ADDR_H_ #include enum { /* Parse Unix socket addresses in @ notation */ DQLITE_ADDR_PARSE_UNIX = 1 << 0 }; /** Parse a socket address from the string @input. * * On success, the resulting address is placed in @addr, and its size is placed * in @addr_len. If @addr is not large enough (based on the initial value of * @addr_len) to hold the result, DQLITE_ERROR is returned. * * @service should be a string representing a port number, e.g. "8080". * * @flags customizes the behavior of the function. Currently the only flag is * DQLITE_ADDR_PARSE_UNIX: when this is ORed in @flags, AddrParse will also * parse Unix socket addresses in the form `@NAME`, where NAME may be empty. * This creates a socket address in the (Linux-specific) "abstract namespace". */ int AddrParse(const char *input, struct sockaddr *addr, socklen_t *addr_len, const char *service, int flags); #endif dqlite-1.16.7/src/lib/assert.h000066400000000000000000000017151465252713400161140ustar00rootroot00000000000000/** * Define the assert() macro, either as the standard one or the test one. 
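 *
 * Three variants are selected at compile time:
 * - DQLITE_TEST: assertions are routed through munit_assert(), so that
 *   failures are reported by the test harness;
 * - DQLITE_ASSERT_WITH_BACKTRACE: a stack trace is printed with
 *   libbacktrace before __assert_fail() is called;
 * - otherwise: the standard assert() macro is used.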
*/ #ifndef LIB_ASSERT_H_ #define LIB_ASSERT_H_ #if defined(DQLITE_TEST) #include "../../test/lib/munit.h" #define assert(expr) munit_assert(expr) #elif defined(DQLITE_ASSERT_WITH_BACKTRACE) #include /* for __assert_fail */ #include #include #undef assert #define assert(x) \ do { \ struct backtrace_state *state_; \ if (!(x)) { \ state_ = backtrace_create_state(NULL, 0, NULL, NULL); \ backtrace_print(state_, 0, stderr); \ __assert_fail(#x, __FILE__, __LINE__, __func__); \ } \ } while (0) #else #include #endif #endif /* LIB_ASSERT_H_ */ dqlite-1.16.7/src/lib/buffer.c000066400000000000000000000026311465252713400160550ustar00rootroot00000000000000#include #include #include #include "buffer.h" #include "../../include/dqlite.h" /* How large is the buffer currently */ #define SIZE(B) (B->n_pages * B->page_size) /* How many spare bytes the buffer currently has */ #define CAP(B) (SIZE(B) - B->offset) int buffer__init(struct buffer *b) { b->page_size = (unsigned)sysconf(_SC_PAGESIZE); b->n_pages = 1; b->data = malloc(SIZE(b)); if (b->data == NULL) { return DQLITE_NOMEM; } b->offset = 0; return 0; } void buffer__close(struct buffer *b) { free(b->data); } /* Ensure that the buffer has at least @size spare bytes */ static inline bool ensure(struct buffer *b, size_t size) { void *data; uint32_t n_pages = b->n_pages; /* Double the buffer until we have enough capacity */ while (size > CAP(b)) { b->n_pages *= 2; } /* CAP(b) was insufficient */ if (b->n_pages > n_pages) { data = realloc(b->data, SIZE(b)); if (data == NULL) { b->n_pages = n_pages; return false; } b->data = data; } return true; } void *buffer__advance(struct buffer *b, size_t size) { void *cursor; if (!ensure(b, size)) { return NULL; } cursor = buffer__cursor(b, b->offset); b->offset += size; return cursor; } size_t buffer__offset(struct buffer *b) { return b->offset; } void *buffer__cursor(struct buffer *b, size_t offset) { return b->data + offset; } void buffer__reset(struct buffer *b) { b->offset = 0; } dqlite-1.16.7/src/lib/buffer.h000066400000000000000000000030231465252713400160600ustar00rootroot00000000000000/** * A dynamic buffer which can grow as needed when writing to it. * * The buffer size is always a multiple of the OS virtual memory page size, so * resizing the buffer *should* not incur memory copying. * * See https://stackoverflow.com/questions/16765389 * * TODO: consider using mremap. */ #ifndef LIB_BUFFER_H_ #define LIB_BUFFER_H_ #include #include "../../include/dqlite.h" struct buffer { void *data; /* Allocated buffer */ unsigned page_size; /* Size of an OS page */ unsigned n_pages; /* Number of pages allocated */ size_t offset; /* Next byte to write in the buffer */ }; /** * Initialize the buffer. It will initially have 1 memory page. */ DQLITE_VISIBLE_TO_TESTS int buffer__init(struct buffer *b); /** * Release the memory of the buffer. */ DQLITE_VISIBLE_TO_TESTS void buffer__close(struct buffer *b); /** * Return a write cursor pointing to the next byte to write, ensuring that the * buffer has at least @size spare bytes. * * Return #NULL in case of out-of-memory errors. */ DQLITE_VISIBLE_TO_TESTS void *buffer__advance(struct buffer *b, size_t size); /** * Return the offset of the next byte to write. */ DQLITE_VISIBLE_TO_TESTS size_t buffer__offset(struct buffer *b); /** * Return a write cursor pointing to the @offset'th byte of the buffer. */ DQLITE_VISIBLE_TO_TESTS void *buffer__cursor(struct buffer *b, size_t offset); /** * Reset the write offset of the buffer. 
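 *
 * A typical write cycle looks like this (a sketch; `n` and `src` are
 * hypothetical):
 *
 *   struct buffer b;
 *   buffer__init(&b);
 *   void *p = buffer__advance(&b, n);  // reserve n bytes, growing b if needed
 *   if (p != NULL) {
 *           memcpy(p, src, n);         // fill the reserved region
 *   }
 *   // ... flush buffer__offset(&b) bytes from buffer__cursor(&b, 0) ...
 *   buffer__reset(&b);                 // start over, keeping the memory
 *   buffer__close(&b);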
*/ DQLITE_VISIBLE_TO_TESTS void buffer__reset(struct buffer *b); #endif /* LIB_BUFFER_H_ */ dqlite-1.16.7/src/lib/byte.h000066400000000000000000000061301465252713400155520ustar00rootroot00000000000000#ifndef LIB_BYTE_H_ #define LIB_BYTE_H_ #include #include #include #if defined(__cplusplus) #define DQLITE_INLINE inline #else #define DQLITE_INLINE static inline #endif #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) #define DQLITE_LITTLE_ENDIAN #elif defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) #define DQLITE_BIG_ENDIAN #endif #if defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 8 #define DQLITE_HAVE_BSWAP #endif /* Flip a 16-bit number to little-endian byte order */ DQLITE_INLINE uint16_t ByteFlipLe16(uint16_t v) { #if defined(DQLITE_LITTLE_ENDIAN) return v; #elif defined(DQLITE_BIG_ENDIAN) && defined(DQLITE_HAVE_BSWAP) return __builtin_bswap16(v); #else union { uint16_t u; uint8_t v[2]; } s; s.v[0] = (uint8_t)v; s.v[1] = (uint8_t)(v >> 8); return s.u; #endif } /* Flip a 32-bit number to little-endian byte order */ DQLITE_INLINE uint32_t ByteFlipLe32(uint32_t v) { #if defined(DQLITE_LITTLE_ENDIAN) return v; #elif defined(DQLITE_BIG_ENDIAN) && defined(DQLITE_HAVE_BSWAP) return __builtin_bswap32(v); #else union { uint32_t u; uint8_t v[4]; } s; s.v[0] = (uint8_t)v; s.v[1] = (uint8_t)(v >> 8); s.v[2] = (uint8_t)(v >> 16); s.v[3] = (uint8_t)(v >> 24); return s.u; #endif } /* Flip a 64-bit number to little-endian byte order */ DQLITE_INLINE uint64_t ByteFlipLe64(uint64_t v) { #if defined(DQLITE_LITTLE_ENDIAN) return v; #elif defined(DQLITE_BIG_ENDIAN) && defined(DQLITE_HAVE_BSWAP) return __builtin_bswap64(v); #else union { uint64_t u; uint8_t v[8]; } s; s.v[0] = (uint8_t)v; s.v[1] = (uint8_t)(v >> 8); s.v[2] = (uint8_t)(v >> 16); s.v[3] = (uint8_t)(v >> 24); s.v[4] = (uint8_t)(v >> 32); s.v[5] = (uint8_t)(v >> 40); s.v[6] = (uint8_t)(v >> 48); s.v[7] = (uint8_t)(v >> 56); return s.u; #endif } /* -Wconversion before GCC 10 is overly sensitive. */ #if defined(__GNUC__) && __GNUC__ < 10 #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wconversion" #endif DQLITE_INLINE uint16_t ByteGetBe16(const uint8_t *buf) { uint16_t x = buf[0]; uint16_t y = buf[1]; x <<= 8; return x | y; } DQLITE_INLINE uint32_t ByteGetBe32(const uint8_t *buf) { uint32_t w = buf[0]; uint32_t x = buf[1]; uint32_t y = buf[2]; uint32_t z = buf[3]; w <<= 24; x <<= 16; y <<= 8; return w | x | y | z; } DQLITE_INLINE uint32_t ByteGetLe32(const uint8_t *buf) { uint32_t w = buf[0]; uint32_t x = buf[1]; uint32_t y = buf[2]; uint32_t z = buf[3]; z <<= 24; y <<= 16; x <<= 8; return w | x | y | z; } DQLITE_INLINE void BytePutBe32(uint32_t v, uint8_t *buf) { buf[0] = (uint8_t)(v >> 24); buf[1] = (uint8_t)(v >> 16); buf[2] = (uint8_t)(v >> 8); buf[3] = (uint8_t)v; } /** * Add padding to size if it's not a multiple of 8. E.g. if 11 is passed, 16 is * returned. 
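 * Likewise BytePad64(8) == 8 and BytePad64(0) == 0: the result is the
 * smallest multiple of 8 that is greater than or equal to @size.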
*/ DQLITE_INLINE size_t BytePad64(size_t size) { size_t rest = size % sizeof(uint64_t); if (rest != 0) { size += sizeof(uint64_t) - rest; } return size; } #define ARRAY_SIZE(a) ((sizeof(a)) / (sizeof(a)[0])) #if defined(__GNUC__) && __GNUC__ < 10 #pragma GCC diagnostic pop #endif #endif /* LIB_BYTE_H_ */ dqlite-1.16.7/src/lib/fs.c000066400000000000000000000015451465252713400152170ustar00rootroot00000000000000#include #include #include #include #include #include "../tracing.h" #include "fs.h" int FsEnsureDir(const char *path) { int rv; struct stat st = {0}; rv = stat(path, &st); if (rv == 0) { if (!S_ISDIR(st.st_mode)) { tracef("%s is not a directory", path); return -1; } } /* Directory does not exist */ if (rv == -1) { return mkdir(path, 0755); } return 0; } static int fsRemoveDirFilesNftwFn(const char *path, const struct stat *sb, int type, struct FTW *ftwb) { int rv; (void)sb; (void)type; (void)ftwb; rv = 0; /* Don't remove directory */ if (S_ISREG(sb->st_mode)) { rv = remove(path); } return rv; } int FsRemoveDirFiles(const char *path) { int rv; rv = nftw(path, fsRemoveDirFilesNftwFn, 10, FTW_DEPTH | FTW_MOUNT | FTW_PHYS); return rv; } dqlite-1.16.7/src/lib/fs.h000066400000000000000000000003741465252713400152230ustar00rootroot00000000000000#ifndef DQLITE_LIB_FS_H #define DQLITE_LIB_FS_H /* Create a directory if it does not already exist. */ int FsEnsureDir(const char *path); /* Removes all files from a directory. */ int FsRemoveDirFiles(const char *path); #endif /* DQLITE_LIB_FS_H */ dqlite-1.16.7/src/lib/queue.h000066400000000000000000000032351465252713400157360ustar00rootroot00000000000000#ifndef LIB_QUEUE_H_ #define LIB_QUEUE_H_ #include /* offsetof */ struct queue { struct queue *next; struct queue *prev; }; typedef struct queue queue; #define QUEUE_DATA(e, type, field) \ ((type *)((void *)((char *)(e)-offsetof(type, field)))) #define QUEUE_FOREACH(q, h) for ((q) = (h)->next; (q) != (h); (q) = (q)->next) static inline void queue_init(struct queue *q) { q->next = q; q->prev = q; } static inline int queue_empty(const struct queue *q) { return q == q->next; } static inline struct queue *queue_head(const struct queue *q) { return q->next; } static inline struct queue *queue_next(const struct queue *q) { return q->next; } static inline struct queue *queue_tail(const struct queue *q) { return q->prev; } static inline void queue_add(struct queue *h, struct queue *n) { h->prev->next = n->next; n->next->prev = h->prev; h->prev = n->prev; h->prev->next = h; } static inline void queue_split(struct queue *h, struct queue *q, struct queue *n) { n->prev = h->prev; n->prev->next = n; n->next = q; h->prev = q->prev; h->prev->next = h; q->prev = n; } static inline void queue_move(struct queue *h, struct queue *n) { if (queue_empty(h)) queue_init(n); else queue_split(h, h->next, n); } static inline void queue_insert_head(struct queue *h, struct queue *q) { q->next = h->next; q->prev = h; q->next->prev = q; h->next = q; } static inline void queue_insert_tail(struct queue *h, struct queue *q) { q->next = h; q->prev = h->prev; q->prev->next = q; h->prev = q; } static inline void queue_remove(struct queue *q) { q->prev->next = q->next; q->next->prev = q->prev; } #endif /* LIB_QUEUE_H_*/ dqlite-1.16.7/src/lib/registry.h000066400000000000000000000342441465252713400164660ustar00rootroot00000000000000#ifndef LIB_REGISTRY_H_ #define LIB_REGISTRY_H_ #include #include #include #include #include "../../include/dqlite.h" #include "assert.h" #define DQLITE_NOTFOUND 1002 /** * Define a type-safe registry able to 
allocate and look up items of a given * type. * * The item TYPE is required to implement three methods: TYPE##_init, * TYPE##_close and TYPE##_hash. */ #define REGISTRY(NAME, TYPE) \ \ struct NAME \ { \ struct TYPE **buf; /* Array of registry item slots */ \ size_t len; /* Index of the highest used slot */ \ size_t cap; /* Total number of slots */ \ }; \ \ /* Initialize the registry. */ \ void NAME##_init(struct NAME *r); \ \ /* Close the registry. */ \ void NAME##_close(struct NAME *r); \ \ /* Add an item to the registry. \ * \ * Return a pointer to a newly allocated and initialized item. \ * The "id" field of the item will be set to a unique value \ * identifying the item in the registry. */ \ int NAME##_add(struct NAME *r, struct TYPE **item); \ \ /* Given its ID, retrieve an item previously added to the \ * registry. */ \ struct TYPE *NAME##_get(struct NAME *r, size_t id); \ \ /* Get the index of the first item matching the given hash key. Return \ * 0 on success and DQLITE_NOTFOUND otherwise. */ \ int NAME##_idx(struct NAME *r, const char *key, size_t *i); \ \ /* Delete a previously added item. */ \ int NAME##_del(struct NAME *r, struct TYPE *item) /** * Define the methods of a registry */ #define REGISTRY_METHODS(NAME, TYPE) \ void NAME##_init(struct NAME *r) \ { \ assert(r != NULL); \ \ r->buf = NULL; \ r->len = 0; \ r->cap = 0; \ } \ \ void NAME##_close(struct NAME *r) \ { \ size_t i; \ struct TYPE *item; \ \ assert(r != NULL); \ \ /* Loop through all items currently in the registry, \ * and close them. */ \ for (i = 0; i < r->len; i++) { \ item = *(r->buf + i); \ /* Some slots may have been deleted, so we need \ * to check if the slot is actually used. */ \ if (item != NULL) { \ TYPE##_close(item); \ sqlite3_free(item); \ } \ } \ \ r->len = 0; \ r->cap = 0; \ if (r->buf != NULL) { \ sqlite3_free(r->buf); \ r->buf = NULL; \ } \ } \ \ int NAME##_add(struct NAME *r, struct TYPE **item) \ { \ struct TYPE **buf; \ size_t cap; \ size_t i; \ \ assert(r != NULL); \ assert(item != NULL); \ \ /* Check if there is an unallocated slot. */ \ for (i = 0; i < r->len; i++) { \ if (*(r->buf + i) == NULL) { \ goto ok_slot; \ } \ } \ \ /* There are no unallocated slots. */ \ assert(i == r->len); \ \ /* If we are full, then double the capacity. */ \ if (r->len + 1 > r->cap) { \ cap = (r->cap == 0) ? 
1 : r->cap * 2; \ buf = sqlite3_realloc(r->buf, \ (int)(cap * sizeof(*r->buf))); \ if (buf == NULL) { \ return DQLITE_NOMEM; \ } \ r->buf = buf; \ r->cap = cap; \ } \ r->len++; \ \ ok_slot: \ assert(i < r->len); \ \ /* Allocate and initialize the new item */ \ *item = sqlite3_malloc(sizeof **item); \ if (*item == NULL) \ return DQLITE_NOMEM; \ \ (*item)->id = i; \ \ TYPE##_init(*item); \ \ /* Save the item in its registry slot */ \ *(r->buf + i) = *item; \ \ return 0; \ } \ \ struct TYPE *NAME##_get(struct NAME *r, size_t id) \ { \ struct TYPE *item; \ size_t i = id; \ \ assert(r != NULL); \ \ if (i >= r->len) { \ return NULL; \ } \ \ item = *(r->buf + i); \ \ assert(item->id == id); \ \ return item; \ } \ \ int NAME##_idx(struct NAME *r, const char *key, size_t *i) \ { \ struct TYPE *item; \ \ assert(r != NULL); \ assert(key != NULL); \ assert(i != NULL); \ \ for (*i = 0; *i < r->len; (*i)++) { \ const char *hash; \ \ item = *(r->buf + *i); \ \ if (item == NULL) { \ continue; \ } \ \ hash = TYPE##_hash(item); \ \ if (hash != NULL && strcmp(hash, key) == 0) { \ return 0; \ } \ } \ \ return DQLITE_NOTFOUND; \ } \ \ int NAME##_del(struct NAME *r, struct TYPE *item) \ { \ struct TYPE **buf; \ size_t cap; \ size_t i = item->id; \ \ assert(r != NULL); \ \ if (i >= r->len) { \ return DQLITE_NOTFOUND; \ } \ \ /* Check that the item address actually matches the one \ * we have in the registry */ \ if (*(r->buf + i) != item) { \ return DQLITE_NOTFOUND; \ } \ \ TYPE##_close(item); \ sqlite3_free(item); \ \ *(r->buf + i) = NULL; \ \ /* If this was the last item in the registry buffer, \ * decrease the length. */ \ if (i == r->len - 1) { \ r->len--; \ } \ \ /* If the new length is less than half of the capacity, \ * try to shrink the registry. */ \ if (r->len < (r->cap / 2)) { \ cap = r->cap / 2; \ buf = sqlite3_realloc(r->buf, \ (int)(cap * sizeof *r->buf)); \ if (buf != NULL) { \ r->buf = buf; \ r->cap = cap; \ } \ } \ \ return 0; \ } #endif /* LIB_REGISTRY_H_ */ dqlite-1.16.7/src/lib/serialize.h000066400000000000000000000201731465252713400166010ustar00rootroot00000000000000#ifndef LIB_SERIALIZE_H_ #define LIB_SERIALIZE_H_ #include #include #include #include "../../include/dqlite.h" #include "assert.h" #include "byte.h" #define DQLITE_PARSE 1005 /** * The size in bytes of a single serialized word. */ #define SERIALIZE__WORD_SIZE 8 /* We rely on the size of double being 64 bits, since that's what is sent over * the wire. * * See https://stackoverflow.com/questions/752309/ensuring-c-doubles-are-64-bits */ #ifndef __STDC_IEC_559__ #if __SIZEOF_DOUBLE__ != 8 #error "Requires IEEE 754 floating point!" #endif #endif #ifdef static_assert static_assert(sizeof(double) == sizeof(uint64_t), "Size of 'double' is not 64 bits"); #endif /** * Basic type aliases used by macro-based processing. */ typedef const char *text_t; typedef double float_t; typedef uv_buf_t blob_t; /** * Cursor to progressively read a buffer. */ struct cursor { const char *p; /* Next byte to read */ size_t cap; /* Number of bytes left in the buffer */ }; /** * Define a serializable struct. * * NAME: Name of the structure which will be defined. * FIELDS: List of X-based macros defining the fields in the schema, in the form * of X(KIND, NAME, ##__VA_ARGS__). E.g. X(uint64, id, ##__VA_ARGS__). * * A new struct called NAME will be defined, along with sizeof, encode and * decode functions. 
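 *
 * For example, a hypothetical two-field schema (a sketch; the MESSAGE
 * schema in message.h follows the same pattern):
 *
 *   #define PERSON(X, ...)               \
 *           X(uint64, id, ##__VA_ARGS__) \
 *           X(text, name, ##__VA_ARGS__)
 *
 *   SERIALIZE__DEFINE(person, PERSON);    // in the header
 *   SERIALIZE__IMPLEMENT(person, PERSON); // in the .c file
 *
 * This expands to struct person { uint64_t id; text_t name; } plus
 * person__sizeof(), person__encode() and person__decode().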
*/ #define SERIALIZE__DEFINE(NAME, FIELDS) \ SERIALIZE__DEFINE_STRUCT(NAME, FIELDS); \ SERIALIZE__DEFINE_METHODS(NAME, FIELDS) #define SERIALIZE__DEFINE_STRUCT(NAME, FIELDS) \ struct NAME \ { \ FIELDS(SERIALIZE__DEFINE_FIELD) \ } #define SERIALIZE__DEFINE_METHODS(NAME, FIELDS) \ size_t NAME##__sizeof(const struct NAME *p); \ void NAME##__encode(const struct NAME *p, char **cursor); \ int NAME##__decode(struct cursor *cursor, struct NAME *p) /* Define a single field in serializable struct. * * KIND: Type code (e.g. uint64, text, etc). * MEMBER: Field name. */ #define SERIALIZE__DEFINE_FIELD(KIND, MEMBER) KIND##_t MEMBER; /** * Implement the sizeof, encode and decode function of a serializable struct. */ #define SERIALIZE__IMPLEMENT(NAME, FIELDS) \ size_t NAME##__sizeof(const struct NAME *p) \ { \ size_t size = 0; \ FIELDS(SERIALIZE__SIZEOF_FIELD, p); \ return size; \ } \ void NAME##__encode(const struct NAME *p, char **cursor) \ { \ FIELDS(SERIALIZE__ENCODE_FIELD, p, cursor); \ } \ int NAME##__decode(struct cursor *cursor, struct NAME *p) \ { \ int rc; \ FIELDS(SERIALIZE__DECODE_FIELD, p, cursor); \ return 0; \ } #define SERIALIZE__SIZEOF_FIELD(KIND, MEMBER, P) \ size += KIND##__sizeof(&((P)->MEMBER)); #define SERIALIZE__ENCODE_FIELD(KIND, MEMBER, P, CURSOR) \ KIND##__encode(&((P)->MEMBER), CURSOR); #define SERIALIZE__DECODE_FIELD(KIND, MEMBER, P, CURSOR) \ rc = KIND##__decode(CURSOR, &((P)->MEMBER)); \ if (rc != 0) { \ return rc; \ } DQLITE_INLINE size_t uint8__sizeof(const uint8_t *value) { (void)value; return sizeof(uint8_t); } DQLITE_INLINE size_t uint16__sizeof(const uint16_t *value) { (void)value; return sizeof(uint16_t); } DQLITE_INLINE size_t uint32__sizeof(const uint32_t *value) { (void)value; return sizeof(uint32_t); } DQLITE_INLINE size_t uint64__sizeof(const uint64_t *value) { (void)value; return sizeof(uint64_t); } DQLITE_INLINE size_t int64__sizeof(const int64_t *value) { (void)value; return sizeof(int64_t); } DQLITE_INLINE size_t float__sizeof(const float_t *value) { (void)value; return sizeof(double); } DQLITE_INLINE size_t text__sizeof(const text_t *value) { return BytePad64(strlen(*value) + 1); } DQLITE_INLINE size_t blob__sizeof(const blob_t *value) { /* length + data */ return sizeof(uint64_t) + BytePad64(value->len); } DQLITE_INLINE void uint8__encode(const uint8_t *value, char **cursor) { *(uint8_t *)(*cursor) = *value; *cursor += sizeof(uint8_t); } DQLITE_INLINE void uint16__encode(const uint16_t *value, char **cursor) { uint16_t x = ByteFlipLe16(*value); memcpy(*cursor, &x, sizeof(uint16_t)); *cursor += sizeof(uint16_t); } DQLITE_INLINE void uint32__encode(const uint32_t *value, char **cursor) { uint32_t x = ByteFlipLe32(*value); memcpy(*cursor, &x, sizeof(uint32_t)); *cursor += sizeof(uint32_t); } DQLITE_INLINE void uint64__encode(const uint64_t *value, char **cursor) { uint64_t x = ByteFlipLe64(*value); memcpy(*cursor, &x, sizeof(uint64_t)); *cursor += sizeof(uint64_t); } DQLITE_INLINE void int64__encode(const int64_t *value, char **cursor) { int64_t x = (int64_t)ByteFlipLe64((uint64_t)*value); memcpy(*cursor, &x, sizeof(int64_t)); *cursor += sizeof(int64_t); } DQLITE_INLINE void float__encode(const float_t *value, char **cursor) { uint64_t x = ByteFlipLe64(*(uint64_t *)value); memcpy(*cursor, &x, sizeof(uint64_t)); *cursor += sizeof(uint64_t); } DQLITE_INLINE void text__encode(const text_t *value, char **cursor) { size_t len = BytePad64(strlen(*value) + 1); memset(*cursor, 0, len); strcpy(*cursor, *value); *cursor += len; } DQLITE_INLINE void blob__encode(const 
blob_t *value, char **cursor) { size_t len = BytePad64(value->len); uint64_t value_len = value->len; uint64__encode(&value_len, cursor); memcpy(*cursor, value->base, value->len); *cursor += len; } DQLITE_INLINE int uint8__decode(struct cursor *cursor, uint8_t *value) { size_t n = sizeof(uint8_t); if (n > cursor->cap) { return DQLITE_PARSE; } *value = *(uint8_t *)cursor->p; cursor->p += n; cursor->cap -= n; return 0; } DQLITE_INLINE int uint16__decode(struct cursor *cursor, uint16_t *value) { size_t n = sizeof(uint16_t); if (n > cursor->cap) { return DQLITE_PARSE; } memcpy(value, cursor->p, sizeof(*value)); *value = ByteFlipLe16(*value); cursor->p += n; cursor->cap -= n; return 0; } DQLITE_INLINE int uint32__decode(struct cursor *cursor, uint32_t *value) { size_t n = sizeof(uint32_t); if (n > cursor->cap) { return DQLITE_PARSE; } memcpy(value, cursor->p, sizeof(*value)); *value = ByteFlipLe32(*value); cursor->p += n; cursor->cap -= n; return 0; } DQLITE_INLINE int uint64__decode(struct cursor *cursor, uint64_t *value) { size_t n = sizeof(uint64_t); if (n > cursor->cap) { return DQLITE_PARSE; } memcpy(value, cursor->p, sizeof(*value)); *value = ByteFlipLe64(*value); cursor->p += n; cursor->cap -= n; return 0; } DQLITE_INLINE int int64__decode(struct cursor *cursor, int64_t *value) { size_t n = sizeof(int64_t); if (n > cursor->cap) { return DQLITE_PARSE; } memcpy(value, cursor->p, sizeof(*value)); *value = (int64_t)ByteFlipLe64((uint64_t)*value); cursor->p += n; cursor->cap -= n; return 0; } DQLITE_INLINE int float__decode(struct cursor *cursor, float_t *value) { size_t n = sizeof(double); if (n > cursor->cap) { return DQLITE_PARSE; } uint64_t x; memcpy(&x, cursor->p, sizeof(x)); *(uint64_t *)value = ByteFlipLe64(x); cursor->p += n; cursor->cap -= n; return 0; } DQLITE_INLINE int text__decode(struct cursor *cursor, text_t *value) { /* Find the terminating null byte of the next string, if any. 
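 * strnlen() returns cursor->cap when no null terminator occurs within the
 * remaining bytes; that case is rejected as DQLITE_PARSE below.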
*/ size_t len = strnlen(cursor->p, cursor->cap); size_t n; if (len == cursor->cap) { return DQLITE_PARSE; } *value = cursor->p; n = BytePad64(strlen(*value) + 1); cursor->p += n; cursor->cap -= n; return 0; } DQLITE_INLINE int blob__decode(struct cursor *cursor, blob_t *value) { uint64_t len; size_t n; int rv; rv = uint64__decode(cursor, &len); if (rv != 0) { return rv; } n = BytePad64((size_t)len); if (n > cursor->cap) { return DQLITE_PARSE; } value->base = (char *)cursor->p; value->len = (size_t)len; cursor->p += n; cursor->cap -= n; return 0; } #endif /* LIB_SERIALIZE_H_ */ dqlite-1.16.7/src/lib/sm.c000066400000000000000000000043131465252713400152220ustar00rootroot00000000000000#include "sm.h" #include #include #include /* NULL */ #include /* fprintf */ #include #include #include "../tracing.h" #include "../utils.h" static bool sm_is_locked(const struct sm *m) { return ERGO(m->is_locked, m->is_locked(m)); } int sm_state(const struct sm *m) { PRE(sm_is_locked(m)); return m->state; } static inline void sm_obs(const struct sm *m) { tracef("%s pid: %d sm_id: %" PRIu64 " %s |\n", m->name, m->pid, m->id, m->conf[sm_state(m)].name); } void sm_relate(const struct sm *from, const struct sm *to) { tracef("%s-to-%s opid: %d dpid: %d id: %" PRIu64 " id: %" PRIu64 " |\n", from->name, to->name, from->pid, to->pid, from->id, to->id); } void sm_init(struct sm *m, bool (*invariant)(const struct sm *, int), bool (*is_locked)(const struct sm *), const struct sm_conf *conf, const char *name, int state) { static atomic_uint_least64_t id = 0; PRE(conf[state].flags & SM_INITIAL); m->conf = conf; m->state = state; m->invariant = invariant; m->is_locked = is_locked; m->id = ++id; m->pid = getpid(); snprintf(m->name, SM_MAX_NAME_LENGTH, "%s", name); sm_obs(m); POST(m->invariant != NULL && m->invariant(m, SM_PREV_NONE)); } void sm_fini(struct sm *m) { PRE(m->invariant != NULL && m->invariant(m, SM_PREV_NONE)); PRE(m->conf[sm_state(m)].flags & SM_FINAL); } void sm_move(struct sm *m, int next_state) { int prev = sm_state(m); PRE(sm_is_locked(m)); PRE(m->conf[sm_state(m)].allowed & BITS(next_state)); m->state = next_state; sm_obs(m); POST(m->invariant != NULL && m->invariant(m, prev)); } void sm_fail(struct sm *m, int fail_state, int rc) { int prev = sm_state(m); PRE(sm_is_locked(m)); PRE(rc != 0 && m->rc == 0); PRE(m->conf[fail_state].flags & SM_FAILURE); PRE(m->conf[sm_state(m)].allowed & BITS(fail_state)); m->rc = rc; m->state = fail_state; POST(m->invariant != NULL && m->invariant(m, prev)); } static __attribute__((noinline)) bool check_failed(const char *f, int n, const char *s) { tracef("%s:%d check failed: %s\n", f, n, s); return false; } bool sm_check(bool b, const char *f, int n, const char *s) { if (!b) { return check_failed(f, n, s); } return true; } dqlite-1.16.7/src/lib/sm.h000066400000000000000000000024101465252713400152230ustar00rootroot00000000000000#ifndef __LIB_SM__ #define __LIB_SM__ #include #include #include #define BITS(state) (1ULL << (state)) #define CHECK(cond) sm_check((cond), __FILE__, __LINE__, #cond) #define SM_MAX_NAME_LENGTH 50 enum { SM_PREV_NONE = -1, /* sizeof(sm_conf::allowed * 8) */ SM_STATES_MAX = 64, /* flags */ SM_INITIAL = 1U << 0, SM_FAILURE = 1U << 1, SM_FINAL = 1U << 2, }; struct sm_conf { uint32_t flags; uint64_t allowed; const char *name; }; struct sm { int rc; int state; char name[SM_MAX_NAME_LENGTH]; uint64_t id; pid_t pid; bool (*is_locked)(const struct sm *); bool (*invariant)(const struct sm *, int); const struct sm_conf *conf; }; void sm_init(struct sm *m, bool 
(*invariant)(const struct sm *, int), /* optional, set NULL if not used */ bool (*is_locked)(const struct sm *), const struct sm_conf *conf, const char *name, int state); void sm_fini(struct sm *m); void sm_move(struct sm *m, int next_state); void sm_fail(struct sm *m, int fail_state, int rc); int sm_state(const struct sm *m); bool sm_check(bool b, const char *f, int n, const char *s); /* Relates one state machine to another for observability. */ void sm_relate(const struct sm *from, const struct sm *to); #endif /* __LIB_SM__ */ dqlite-1.16.7/src/lib/threadpool.c000066400000000000000000000313231465252713400167450ustar00rootroot00000000000000#include "threadpool.h" #include #include #include #include #include #include "../../src/lib/queue.h" #include "../../src/lib/sm.h" #include "../../src/utils.h" #include "../tracing.h" /** * Planner thread state machine. * * signal() && * empty(o) && signal() && exiting * empty(u) && +-----> NOTHING ----------------> EXITED * !exiting +------- ^ | * | | * empty(o) && | | signal() * empty(u) | | !empty(o) || !empty(u) * | | * | | * | V * !empty(o) && +-----> DRAINING * !empty(u) && +------- ^ | * type(head(o)) != BAR | | * | | type(head(o)) == BAR * ord_in_flight == 0 | | * | V * BARRIER --------+ signal() * ^ | <-------+ ord_in_flight == 0 * | | * empty(u) | | !empty(u) * | V * DRAINING_UNORD */ enum planner_states { PS_NOTHING, PS_DRAINING, PS_BARRIER, PS_DRAINING_UNORD, PS_EXITED, PS_NR, }; static const struct sm_conf planner_states[PS_NR] = { [PS_NOTHING] = { .flags = SM_INITIAL, .name = "nothing", .allowed = BITS(PS_DRAINING) | BITS(PS_EXITED), }, [PS_DRAINING] = { .name = "draining", .allowed = BITS(PS_DRAINING) | BITS(PS_NOTHING) | BITS(PS_BARRIER), }, [PS_BARRIER] = { .name = "barrier", .allowed = BITS(PS_DRAINING_UNORD) | BITS(PS_DRAINING) | BITS(PS_BARRIER), }, [PS_DRAINING_UNORD] = { .name = "draining-unord", .allowed = BITS(PS_BARRIER) }, [PS_EXITED] = { .flags = SM_FINAL, .name = "exited", .allowed = 0, }, }; enum { THREADPOOL_SIZE_MAX = 1024, }; typedef struct pool_thread pool_thread_t; typedef struct pool_impl pool_impl_t; struct targs { pool_impl_t *pi; uv_sem_t *sem; uint32_t idx; /* Thread's index */ }; /* Worker thread of the pool */ struct pool_thread { queue inq; /* Thread's input queue */ uv_cond_t cond; /* Signalled when work item appears in @inq */ uv_thread_t thread; /* Pool's worker thread */ struct targs arg; }; /* clang-format off */ struct pool_impl { uv_mutex_t mutex; /* Input queue, planner_sm, worker and planner threads lock */ uint32_t threads_nr; pool_thread_t *threads; queue outq; /* Output queue used by libuv part */ uv_mutex_t outq_mutex; /* Output queue lock */ uv_async_t outq_async; /* Signalled when output queue is not empty and libuv loop has to process items from it */ uint64_t active_ws; /* Number of all work items in flight, accessed from the main thread only */ queue ordered; /* Queue of WT_ORD{N} items */ queue unordered; /* Queue of WT_UNORD items */ struct sm planner_sm; /* State machine of the scheduler */ uv_cond_t planner_cond; uv_thread_t planner_thread; /* Scheduler's thread */ uint32_t ord_in_flight; /* Number of WT_ORD{N} in flight */ bool exiting; /* True when the pool is being stopped */ enum pool_work_type /* Type of the previous work item, */ ord_prev; /* used in WT_ORD{N} ivariants */ uint32_t qos; /* QoS token */ uint32_t qos_prio; /* QoS prio */ }; /* clang-format on */ static inline bool pool_is_inited(const pool_t *pool) { return pool->pi != NULL; } static inline bool has_active_ws(pool_t 
*pool) { return pool->pi->active_ws > 0; } static inline void w_register(pool_t *pool, pool_work_t *w) { if (w->type != WT_BAR) { pool->pi->active_ws++; } } static inline void w_unregister(pool_t *pool, pool_work_t *w) { (void)w; PRE(has_active_ws(pool)); pool->pi->active_ws--; } static bool empty(const queue *q) { return queue_empty(q); } static queue *head(const queue *q) { return queue_head(q); } static void push(queue *to, queue *what) { queue_insert_tail(to, what); } static queue *pop(queue *from) { queue *q = queue_head(from); PRE(q != NULL); queue_remove(q); queue_init(q); return q; } static queue *qos_pop(pool_impl_t *pi, queue *first, queue *second) { PRE(!empty(first) || !empty(second)); if (empty(first)) { return pop(second); } else if (empty(second)) { return pop(first); } return pop(pi->qos++ % pi->qos_prio ? first : second); } static pool_work_t *q_to_w(const queue *q) { return QUEUE_DATA(q, pool_work_t, link); } static enum pool_work_type q_type(const queue *q) { return q_to_w(q)->type; } static uint32_t q_tid(const queue *q) { return q_to_w(q)->thread_id; } static bool planner_invariant(const struct sm *m, int prev_state) { pool_impl_t *pi = CONTAINER_OF(m, pool_impl_t, planner_sm); queue *o = &pi->ordered; queue *u = &pi->unordered; /* clang-format off */ return ERGO(sm_state(m) == PS_NOTHING, empty(o) && empty(u)) && ERGO(sm_state(m) == PS_DRAINING, ERGO(prev_state == PS_BARRIER, pi->ord_in_flight == 0 && empty(u)) && ERGO(prev_state == PS_NOTHING, !empty(u) || !empty(o))) && ERGO(sm_state(m) == PS_EXITED, pi->exiting && empty(o) && empty(u)) && ERGO(sm_state(m) == PS_BARRIER, ERGO(prev_state == PS_DRAINING, q_type(head(o)) == WT_BAR) && ERGO(prev_state == PS_DRAINING_UNORD, empty(u))) && ERGO(sm_state(m) == PS_DRAINING_UNORD, !empty(u)); /* clang-format on */ } static void planner(void *arg) { struct targs *ta = arg; uv_sem_t *sem = ta->sem; pool_impl_t *pi = ta->pi; uv_mutex_t *mutex = &pi->mutex; pool_thread_t *ts = pi->threads; struct sm *planner_sm = &pi->planner_sm; queue *o = &pi->ordered; queue *u = &pi->unordered; queue *q; sm_init(planner_sm, planner_invariant, NULL, planner_states, "ps", PS_NOTHING); uv_sem_post(sem); uv_mutex_lock(mutex); for (;;) { switch (sm_state(planner_sm)) { case PS_NOTHING: while (empty(o) && empty(u) && !pi->exiting) { uv_cond_wait(&pi->planner_cond, mutex); } sm_move(planner_sm, pi->exiting && empty(o) && empty(u) ? 
PS_EXITED : PS_DRAINING); break; case PS_DRAINING: while (!(empty(o) && empty(u))) { sm_move(planner_sm, PS_DRAINING); if (!empty(o) && q_type(head(o)) == WT_BAR) { sm_move(planner_sm, PS_BARRIER); goto ps_barrier; } q = qos_pop(pi, o, u); push(&ts[q_tid(q)].inq, q); uv_cond_signal(&ts[q_tid(q)].cond); if (q_type(q) >= WT_ORD1) { pi->ord_in_flight++; } } sm_move(planner_sm, PS_NOTHING); ps_barrier: break; case PS_BARRIER: if (!empty(u)) { sm_move(planner_sm, PS_DRAINING_UNORD); break; } if (pi->ord_in_flight == 0) { q = pop(o); PRE(q_to_w(q)->type == WT_BAR); free(q_to_w(q)); sm_move(planner_sm, PS_DRAINING); break; } uv_cond_wait(&pi->planner_cond, mutex); sm_move(planner_sm, PS_BARRIER); break; case PS_DRAINING_UNORD: while (!empty(u)) { q = pop(u); push(&ts[q_tid(q)].inq, q); uv_cond_signal(&ts[q_tid(q)].cond); } sm_move(planner_sm, PS_BARRIER); break; case PS_EXITED: sm_fini(planner_sm); uv_mutex_unlock(mutex); return; default: POST(false && "Impossible!"); } } } static void queue_work(pool_work_t *w) { w->work_cb(w); } static void queue_done(pool_work_t *w) { w_unregister(w->pool, w); if (w->after_work_cb != NULL) { w->after_work_cb(w); } } static void worker(void *arg) { struct targs *ta = arg; pool_impl_t *pi = ta->pi; uv_mutex_t *mutex = &pi->mutex; pool_thread_t *ts = pi->threads; enum pool_work_type wtype; pool_work_t *w; queue *q; uv_sem_post(ta->sem); uv_mutex_lock(mutex); for (;;) { while (empty(&ts[ta->idx].inq)) { if (pi->exiting) { uv_mutex_unlock(mutex); return; } uv_cond_wait(&ts[ta->idx].cond, mutex); } q = pop(&ts[ta->idx].inq); uv_mutex_unlock(mutex); w = q_to_w(q); wtype = w->type; queue_work(w); uv_mutex_lock(&pi->outq_mutex); push(&pi->outq, &w->link); uv_async_send(&pi->outq_async); uv_mutex_unlock(&pi->outq_mutex); uv_mutex_lock(mutex); if (wtype > WT_BAR) { assert(pi->ord_in_flight > 0); if (--pi->ord_in_flight == 0) { uv_cond_signal(&pi->planner_cond); } } } } static void pool_cleanup(pool_t *pool) { pool_impl_t *pi = pool->pi; pool_thread_t *ts = pi->threads; uint32_t i; if (pi->threads_nr == 0) { return; } uv_cond_signal(&pi->planner_cond); if (uv_thread_join(&pi->planner_thread)) { abort(); } uv_cond_destroy(&pi->planner_cond); POST(empty(&pi->ordered) && empty(&pi->unordered)); for (i = 0; i < pi->threads_nr; i++) { uv_cond_signal(&ts[i].cond); if (uv_thread_join(&ts[i].thread)) { abort(); } POST(empty(&ts[i].inq)); uv_cond_destroy(&ts[i].cond); } free(pi->threads); uv_mutex_destroy(&pi->mutex); pi->threads_nr = 0; } static void pool_threads_init(pool_t *pool) { uint32_t i; uv_sem_t sem; pool_impl_t *pi = pool->pi; pool_thread_t *ts; struct targs pa = { .sem = &sem, .pi = pi, }; uv_thread_options_t config = { .flags = UV_THREAD_HAS_STACK_SIZE, .stack_size = 8u << 20, }; if (uv_mutex_init(&pi->mutex)) { abort(); } if (uv_sem_init(&sem, 0)) { abort(); } pi->threads = calloc(pi->threads_nr, sizeof(pi->threads[0])); if (pi->threads == NULL) { abort(); } for (i = 0, ts = pi->threads; i < pi->threads_nr; i++) { ts[i].arg = (struct targs){ .pi = pi, .sem = &sem, .idx = i, }; queue_init(&ts[i].inq); if (uv_cond_init(&ts[i].cond)) { abort(); } if (uv_thread_create_ex(&ts[i].thread, &config, worker, &ts[i].arg)) { abort(); } } if (uv_cond_init(&pi->planner_cond)) { abort(); } if (uv_thread_create_ex(&pi->planner_thread, &config, planner, &pa)) { abort(); } for (i = 0; i < pi->threads_nr + 1 /* +planner */; i++) { uv_sem_wait(&sem); } uv_sem_destroy(&sem); } static void pool_work_submit(pool_t *pool, pool_work_t *w) { pool_impl_t *pi = pool->pi; queue *o = 
&pi->ordered; queue *u = &pi->unordered; if (w->type > WT_UNORD) { /* Make sure that elements in the ordered queue come in order. */ PRE(ERGO(pi->ord_prev != WT_BAR && w->type != WT_BAR, pi->ord_prev == w->type)); pi->ord_prev = w->type; } uv_mutex_lock(&pi->mutex); POST(!pi->exiting); push(w->type == WT_UNORD ? u : o, &w->link); uv_cond_signal(&pi->planner_cond); uv_mutex_unlock(&pi->mutex); } void work_done(uv_async_t *handle) { queue q = {}; pool_impl_t *pi = CONTAINER_OF(handle, pool_impl_t, outq_async); uv_mutex_lock(&pi->outq_mutex); queue_move(&pi->outq, &q); uv_mutex_unlock(&pi->outq_mutex); while (!empty(&q)) { queue_done(q_to_w(pop(&q))); } } void pool_queue_work(pool_t *pool, pool_work_t *w, uint32_t cookie, enum pool_work_type type, void (*work_cb)(pool_work_t *w), void (*after_work_cb)(pool_work_t *w)) { PRE(memcmp(w, &(pool_work_t){}, sizeof *w) == 0); PRE(work_cb != NULL && type < WT_NR); if (!!(pool->flags & POOL_FOR_UT_NOT_ASYNC)) { work_cb(w); after_work_cb(w); return; } PRE(pool_is_inited(pool)); *w = (pool_work_t){ .pool = pool, .type = type, .thread_id = cookie % pool->pi->threads_nr, .work_cb = work_cb, .after_work_cb = after_work_cb, }; w_register(pool, w); pool_work_submit(pool, w); } int pool_init(pool_t *pool, uv_loop_t *loop, uint32_t threads_nr, uint32_t qos_prio) { int rc; pool_impl_t *pi = pool->pi; PRE(threads_nr <= THREADPOOL_SIZE_MAX); pool->flags = 0x0; pi = pool->pi = calloc(1, sizeof(*pool->pi)); if (pi == NULL) { return UV_ENOMEM; } *pi = (pool_impl_t){ .qos = 0, .qos_prio = qos_prio, .exiting = false, .ord_prev = WT_BAR, .threads_nr = threads_nr, .ord_in_flight = 0, }; queue_init(&pi->outq); queue_init(&pi->ordered); queue_init(&pi->unordered); rc = uv_mutex_init(&pi->outq_mutex); if (rc != 0) { free(pi); return rc; } rc = uv_async_init(loop, &pi->outq_async, work_done); if (rc != 0) { uv_mutex_destroy(&pi->outq_mutex); free(pi); return rc; } pool_threads_init(pool); return 0; } void pool_fini(pool_t *pool) { pool_impl_t *pi = pool->pi; pool_cleanup(pool); uv_mutex_lock(&pi->outq_mutex); POST(!!(pool->flags & POOL_FOR_UT_NON_CLEAN_FINI) || (empty(&pi->outq) && !has_active_ws(pool))); uv_mutex_unlock(&pi->outq_mutex); uv_mutex_destroy(&pi->outq_mutex); free(pi); } void pool_close(pool_t *pool) { pool_impl_t *pi = pool->pi; uv_close((uv_handle_t *)&pi->outq_async, NULL); uv_mutex_lock(&pi->mutex); pi->exiting = true; uv_mutex_unlock(&pi->mutex); } pool_t *pool_ut_fallback(void) { static pool_t pool; return &pool; } dqlite-1.16.7/src/lib/threadpool.h000066400000000000000000000066171465252713400167620ustar00rootroot00000000000000#ifndef __THREAD_POOL__ #define __THREAD_POOL__ #include #include "queue.h" /** Thread pool - Use-cases: - Move sqlite3-, IO- related blocking operations from libuv loop's thread to pool's threads in order to unblock serving incoming dqlite requests during sqlite3 IO. Multiple sqlite3_step()-s can be in flight and executed concurrently, while thread's loop is not IO blocked. - Introduced pool's work item thread affinity to serve sqlite3- related items of each database in a "dedicated" thread which allows not to make any assumption on sqlite3 threading model. @see https://www.sqlite.org/threadsafe.html - The pool supports servicing of the following types of work items: - WT_UNORD - items, which can be processed by the pool in any order, concurrency assumptions of this type of work are guaranteed by other layers of the application. Read and write transactions executed by sqlite3_step() are good examples for such work item type. 
- WT_ORD_N - items, which can NOT be processed by the pool in any order. The pool's logic shall guarantee that servicing all WT_ORD_{N}s happens before WT_ORD_{N + 1}s. WT_ORD_{N} and WT_ORD_{N + 1} operations can't be put into the pool interleaved. Sqlite3 checkpoints are an example of WT_ORD_{N} and InstallSnapshot(CP(), MV()) is an example of WT_ORD_{N + 1}. - WT_BAR - special-purpose barrier item. Delimits WT_ORD_{N}s from WT_ORD_{N + 1}s. - The pool supports servicing of work items with given quality of service (QoS) considerations. For example, the priority of serving read/write sqlite3 transactions (WT_UNORD) can be set higher than snapshot installation (WT_ORD{N}). */ struct pool_impl; typedef struct pool_s pool_t; typedef struct pool_work_s pool_work_t; enum pool_work_type { WT_UNORD, WT_BAR, WT_ORD1, WT_ORD2, WT_NR, }; struct pool_work_s { queue link; /* Link into ordered, unordered and outq */ uint32_t thread_id; /* Identifier of the thread the item is affined to */ pool_t *pool; /* The pool the item is associated with */ enum pool_work_type type; int rc; /* Return code used to deliver the result of a pool work * operation to the uv_loop's thread. */ void (*work_cb)(pool_work_t *w); void (*after_work_cb)(pool_work_t *w); }; struct pool_s { struct pool_impl *pi; int flags; }; enum { POOL_QOS_PRIO_FAIR = 2, }; enum pool_half { POOL_TOP_HALF = 0x109, POOL_BOTTOM_HALF = 0xb01103, }; enum { /** * Setting POOL_FOR_UT_NON_CLEAN_FINI relaxes the pool's invariants * during finalization, in order to pass a few tests checking failures * with non-clean unit-test termination. */ POOL_FOR_UT_NON_CLEAN_FINI = 1u << 0, /** * Set this flag if there's no event loop in unit test. Top- and * bottom- halves will be called in the current thread. */ POOL_FOR_UT_NOT_ASYNC = 1u << 1, /** * Set if the pool runs in the context of unit test. */ POOL_FOR_UT = 1u << 2, }; int pool_init(pool_t *pool, uv_loop_t *loop, uint32_t threads_nr, uint32_t qos_prio); void pool_fini(pool_t *pool); void pool_close(pool_t *pool); void pool_queue_work(pool_t *pool, pool_work_t *w, uint32_t cookie, enum pool_work_type type, void (*work_cb)(pool_work_t *w), void (*after_work_cb)(pool_work_t *w)); pool_t *pool_ut_fallback(void); #endif /* __THREAD_POOL__ */ dqlite-1.16.7/src/lib/transport.c000066400000000000000000000070071465252713400166420ustar00rootroot00000000000000#include "../raft.h" #include "../../include/dqlite.h" #include "assert.h" #include "transport.h" /* Called to allocate a buffer for the next stream read. */ static void alloc_cb(uv_handle_t *stream, size_t suggested_size, uv_buf_t *buf) { struct transport *t; (void)suggested_size; t = stream->data; assert(t->read.base != NULL); assert(t->read.len > 0); *buf = t->read; } /* Invoke the read callback. */ static void read_done(struct transport *t, ssize_t status) { transport_read_cb cb; int rv; rv = uv_read_stop(t->stream); assert(rv == 0); cb = t->read_cb; assert(cb != NULL); t->read_cb = NULL; t->read.base = NULL; t->read.len = 0; cb(t, (int)status); } static void read_cb(uv_stream_t *stream, ssize_t nread, const uv_buf_t *buf) { struct transport *t; (void)buf; t = stream->data; if (nread > 0) { size_t n = (size_t)nread; /* We shouldn't have read more data than the pending amount. */ assert(n <= t->read.len); /* Advance the read window */ t->read.base += n; t->read.len -= n; /* If there's more data to read in order to fill the current * read buffer, just return, we'll be invoked again. */ if (t->read.len > 0) { return; } /* Read completed, invoke the callback. 
*/ read_done(t, 0); return; } assert(nread <= 0); if (nread == 0) { /* Empty read */ return; } assert(nread < 0); /* Failure. */ read_done(t, nread); } int transport__stream(struct uv_loop_s *loop, int fd, struct uv_stream_s **stream) { struct uv_pipe_s *pipe; struct uv_tcp_s *tcp; int rv; switch (uv_guess_handle(fd)) { case UV_TCP: tcp = raft_malloc(sizeof *tcp); if (tcp == NULL) { return DQLITE_NOMEM; } rv = uv_tcp_init(loop, tcp); assert(rv == 0); rv = uv_tcp_open(tcp, fd); if (rv != 0) { raft_free(tcp); return TRANSPORT__BADSOCKET; } *stream = (struct uv_stream_s *)tcp; break; case UV_NAMED_PIPE: pipe = raft_malloc(sizeof *pipe); if (pipe == NULL) { return DQLITE_NOMEM; } rv = uv_pipe_init(loop, pipe, 0); assert(rv == 0); rv = uv_pipe_open(pipe, fd); if (rv != 0) { raft_free(pipe); return TRANSPORT__BADSOCKET; } *stream = (struct uv_stream_s *)pipe; break; default: return TRANSPORT__BADSOCKET; }; return 0; } int transport__init(struct transport *t, struct uv_stream_s *stream) { t->stream = stream; t->stream->data = t; t->read.base = NULL; t->read.len = 0; t->write.data = t; t->read_cb = NULL; t->write_cb = NULL; t->close_cb = NULL; return 0; } static void close_cb(uv_handle_t *handle) { struct transport *t = handle->data; raft_free(t->stream); if (t->close_cb != NULL) { t->close_cb(t); } } void transport__close(struct transport *t, transport_close_cb cb) { assert(t->close_cb == NULL); t->close_cb = cb; uv_close((uv_handle_t *)t->stream, close_cb); } int transport__read(struct transport *t, uv_buf_t *buf, transport_read_cb cb) { int rv; assert(t->read.base == NULL); assert(t->read.len == 0); t->read = *buf; t->read_cb = cb; rv = uv_read_start(t->stream, alloc_cb, read_cb); if (rv != 0) { return DQLITE_ERROR; } return 0; } static void write_cb(uv_write_t *req, int status) { struct transport *t = req->data; transport_write_cb cb = t->write_cb; assert(cb != NULL); t->write_cb = NULL; cb(t, status); } int transport__write(struct transport *t, uv_buf_t *buf, transport_write_cb cb) { int rv; assert(t->write_cb == NULL); t->write_cb = cb; rv = uv_write(&t->write, t->stream, buf, 1, write_cb); if (rv != 0) { return rv; } return 0; } dqlite-1.16.7/src/lib/transport.h000066400000000000000000000032231465252713400166430ustar00rootroot00000000000000/** * Asynchronously read and write buffer from and to the network. */ #ifndef LIB_TRANSPORT_H_ #define LIB_TRANSPORT_H_ #include #define TRANSPORT__BADSOCKET 1000 /** * Callbacks. */ struct transport; typedef void (*transport_read_cb)(struct transport *t, int status); typedef void (*transport_write_cb)(struct transport *t, int status); typedef void (*transport_close_cb)(struct transport *t); /** * Light wrapper around a libuv stream handle, providing a more convenient way * to read a certain amount of bytes. */ struct transport { void *data; /* User defined */ struct uv_stream_s *stream; /* Data stream */ uv_buf_t read; /* Read buffer */ uv_write_t write; /* Write request */ transport_read_cb read_cb; /* Read callback */ transport_write_cb write_cb; /* Write callback */ transport_close_cb close_cb; /* Close callback */ }; /** * Initialize a transport of the appropriate type (TCP or PIPE) attached to the * given file descriptor. */ int transport__init(struct transport *t, struct uv_stream_s *stream); /** * Start closing by the transport. */ void transport__close(struct transport *t, transport_close_cb cb); /** * Read from the transport file descriptor until the given buffer is full. 
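 *
 * The callback fires exactly once: with status 0 once @buf has been
 * completely filled, or with a negative libuv status code on error. Only
 * one read may be in flight at a time. A sketch of the intended use
 * (`header`, `conn` and `readHeaderCb` are hypothetical):
 *
 *   uv_buf_t buf = { .base = (char *)&header, .len = sizeof header };
 *   rv = transport__read(&conn->transport, &buf, readHeaderCb);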
*/ int transport__read(struct transport *t, uv_buf_t *buf, transport_read_cb cb); /** * Write the given buffer to the transport. */ int transport__write(struct transport *t, uv_buf_t *buf, transport_write_cb cb); /* Create a UV stream object from the given fd. */ int transport__stream(struct uv_loop_s *loop, int fd, struct uv_stream_s **stream); #endif /* LIB_TRANSPORT_H_ */ dqlite-1.16.7/src/logger.c000066400000000000000000000016211465252713400153130ustar00rootroot00000000000000#include <stdio.h> #include <string.h> #include "logger.h" #define EMIT_BUF_LEN 1024 void loggerDefaultEmit(void *data, int level, const char *fmt, va_list args) { char buf[EMIT_BUF_LEN]; char *cursor = buf; size_t n; (void)data; /* First, render the logging level. */ switch (level) { case DQLITE_DEBUG: sprintf(cursor, "[DEBUG]: "); break; case DQLITE_INFO: sprintf(cursor, "[INFO ]: "); break; case DQLITE_WARN: sprintf(cursor, "[WARN ]: "); break; case DQLITE_LOG_ERROR: sprintf(cursor, "[ERROR]: "); break; default: sprintf(cursor, "[ ]: "); break; }; cursor = buf + strlen(buf); /* Then render the message, possibly truncating it. */ n = EMIT_BUF_LEN - strlen(buf) - 1; #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wformat-nonliteral" vsnprintf(cursor, n, fmt, args); #pragma GCC diagnostic pop fprintf(stderr, "%s\n", buf); } dqlite-1.16.7/src/logger.h000066400000000000000000000015171465252713400153240ustar00rootroot00000000000000#ifndef LOGGER_H_ #define LOGGER_H_ #include "raft.h" #include "../include/dqlite.h" /* Log levels */ enum { DQLITE_DEBUG = 0, DQLITE_INFO, DQLITE_WARN, DQLITE_LOG_ERROR }; /* Function to emit log messages. */ typedef void (*dqlite_emit)(void *data, int level, const char *fmt, va_list args); struct logger { void *data; dqlite_emit emit; }; /* Default implementation of dqlite_emit, using stderr. */ void loggerDefaultEmit(void *data, int level, const char *fmt, va_list args); /* Emit a log message with a certain level. */ /* #define debugf(L, FORMAT, ...) \ */ /* logger__emit(L, DQLITE_DEBUG, FORMAT, ##__VA_ARGS__) */ #define debugf(C, FORMAT, ...) \ C->gateway.raft->io->emit(C->gateway.raft->io, RAFT_DEBUG, FORMAT, \ ##__VA_ARGS__) #endif /* LOGGER_H_ */ dqlite-1.16.7/src/message.c000066400000000000000000000000761465252713400154630ustar00rootroot00000000000000#include "message.h" SERIALIZE__IMPLEMENT(message, MESSAGE); dqlite-1.16.7/src/message.h000066400000000000000000000005661465252713400154720ustar00rootroot00000000000000#ifndef MESSAGE_H_ #define MESSAGE_H_ #include "lib/serialize.h" /** * Metadata about an incoming or outgoing RPC message. */ #define MESSAGE(X, ...) \ X(uint32, words, ##__VA_ARGS__) \ X(uint8, type, ##__VA_ARGS__) \ X(uint8, schema, ##__VA_ARGS__) \ X(uint16, extra, ##__VA_ARGS__) SERIALIZE__DEFINE(message, MESSAGE); #endif /* MESSAGE_H_ */ dqlite-1.16.7/src/metrics.c000066400000000000000000000002701465252713400155010ustar00rootroot00000000000000#include <stdint.h> #include "./lib/assert.h" #include "metrics.h" void dqlite__metrics_init(struct dqlite__metrics *m) { assert(m != NULL); m->requests = 0; m->duration = 0; } dqlite-1.16.7/src/metrics.h000066400000000000000000000010031465252713400155030ustar00rootroot00000000000000/****************************************************************************** * * Collect various performance metrics. * *****************************************************************************/ #ifndef DQLITE_METRICS_H #define DQLITE_METRICS_H #include <stdint.h> struct dqlite__metrics { uint64_t requests; /* Total number of requests served.
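 Together with
 * @duration below, this allows deriving the mean service time
 * per request as @duration / @requests.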
*/ uint64_t duration; /* Total time spent serving requests. */ }; void dqlite__metrics_init(struct dqlite__metrics *m); #endif /* DQLITE_METRICS_H */ dqlite-1.16.7/src/protocol.h000066400000000000000000000050351465252713400157050ustar00rootroot00000000000000#ifndef DQLITE_PROTOCOL_H_ #define DQLITE_PROTOCOL_H_ /* Special datatypes */ #define DQLITE_UNIXTIME 9 #define DQLITE_ISO8601 10 #define DQLITE_BOOLEAN 11 #define DQLITE_PROTO 1001 /* Protocol error */ /* Role codes */ enum { DQLITE_VOTER, DQLITE_STANDBY, DQLITE_SPARE }; /* Current protocol version */ #define DQLITE_PROTOCOL_VERSION 1 /* Legacy pre-1.0 version. */ #define DQLITE_PROTOCOL_VERSION_LEGACY 0x86104dd760433fe5 /* Special value indicating that a batch of rows is over, but there are more. */ #define DQLITE_RESPONSE_ROWS_PART 0xeeeeeeeeeeeeeeee /* Special value indicating that the result set is complete. */ #define DQLITE_RESPONSE_ROWS_DONE 0xffffffffffffffff /* Request types */ enum { DQLITE_REQUEST_LEADER, DQLITE_REQUEST_CLIENT, DQLITE_REQUEST_HEARTBEAT, DQLITE_REQUEST_OPEN, DQLITE_REQUEST_PREPARE, DQLITE_REQUEST_EXEC, DQLITE_REQUEST_QUERY, DQLITE_REQUEST_FINALIZE, DQLITE_REQUEST_EXEC_SQL, DQLITE_REQUEST_QUERY_SQL, DQLITE_REQUEST_INTERRUPT, DQLITE_REQUEST_CONNECT, DQLITE_REQUEST_ADD, /* The PROMOTE and ASSIGN requests share a type tag. We expose it under * two names here to facilitate the macro shenanigans in request.h. */ DQLITE_REQUEST_PROMOTE_OR_ASSIGN, DQLITE_REQUEST_ASSIGN = DQLITE_REQUEST_PROMOTE_OR_ASSIGN, DQLITE_REQUEST_REMOVE, DQLITE_REQUEST_DUMP, DQLITE_REQUEST_CLUSTER, DQLITE_REQUEST_TRANSFER, DQLITE_REQUEST_DESCRIBE, DQLITE_REQUEST_WEIGHT }; #define DQLITE_REQUEST_CLUSTER_FORMAT_V0 0 /* ID and address */ #define DQLITE_REQUEST_CLUSTER_FORMAT_V1 1 /* ID, address and role */ #define DQLITE_REQUEST_DESCRIBE_FORMAT_V0 0 /* Failure domain and weight */ /* These apply to REQUEST_EXEC, REQUEST_EXEC_SQL, REQUEST_QUERY, and * REQUEST_QUERY_SQL. */ #define DQLITE_REQUEST_PARAMS_SCHEMA_V0 0 /* One-byte params count */ #define DQLITE_REQUEST_PARAMS_SCHEMA_V1 1 /* Four-byte params count */ /* These apply to REQUEST_PREPARE and RESPONSE_STMT. */ /* At most one statement in request, no tail offset in response */ #define DQLITE_PREPARE_STMT_SCHEMA_V0 0 /* Any number of statements in request, tail offset in response */ #define DQLITE_PREPARE_STMT_SCHEMA_V1 1 /* Response types */ enum { DQLITE_RESPONSE_FAILURE, DQLITE_RESPONSE_SERVER, DQLITE_RESPONSE_SERVER_LEGACY = DQLITE_RESPONSE_SERVER, DQLITE_RESPONSE_WELCOME, DQLITE_RESPONSE_SERVERS, DQLITE_RESPONSE_DB, DQLITE_RESPONSE_STMT, DQLITE_RESPONSE_STMT_WITH_OFFSET = DQLITE_RESPONSE_STMT, DQLITE_RESPONSE_RESULT, DQLITE_RESPONSE_ROWS, DQLITE_RESPONSE_EMPTY, DQLITE_RESPONSE_FILES, DQLITE_RESPONSE_METADATA }; #endif /* DQLITE_PROTOCOL_H_ */ dqlite-1.16.7/src/query.c000066400000000000000000000064541465252713400152100ustar00rootroot00000000000000#include "query.h" #include "tuple.h" /* Return the type code of the i'th column value. * * TODO: find a better way to handle time types.
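 *
 * The mapping is driven by the declared column type: DATETIME, DATE and
 * TIMESTAMP columns yield DQLITE_UNIXTIME when the stored value is an
 * integer and DQLITE_ISO8601 when it is text or NULL, while BOOLEAN
 * columns yield DQLITE_BOOLEAN.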
*/ static int value_type(sqlite3_stmt *stmt, int i) { int type = sqlite3_column_type(stmt, i); const char *column_type_name = sqlite3_column_decltype(stmt, i); if (column_type_name != NULL) { if ((strcasecmp(column_type_name, "DATETIME") == 0) || (strcasecmp(column_type_name, "DATE") == 0) || (strcasecmp(column_type_name, "TIMESTAMP") == 0)) { if (type == SQLITE_INTEGER) { type = DQLITE_UNIXTIME; } else { assert(type == SQLITE_TEXT || type == SQLITE_NULL); type = DQLITE_ISO8601; } } else if (strcasecmp(column_type_name, "BOOLEAN") == 0) { assert(type == SQLITE_INTEGER || type == SQLITE_NULL); type = DQLITE_BOOLEAN; } } assert(type < 16); return type; } /* Append a single row to the message. */ static int encode_row(sqlite3_stmt *stmt, struct buffer *buffer, int n) { struct tuple_encoder encoder; int rc; int i; rc = tuple_encoder__init(&encoder, (unsigned)n, TUPLE__ROW, buffer); if (rc != 0) { return SQLITE_ERROR; } /* Encode the row values */ for (i = 0; i < n; i++) { /* Figure the type */ struct value value; value.type = value_type(stmt, i); switch (value.type) { case SQLITE_INTEGER: value.integer = sqlite3_column_int64(stmt, i); break; case SQLITE_FLOAT: value.float_ = sqlite3_column_double(stmt, i); break; case SQLITE_BLOB: value.blob.base = (char *)sqlite3_column_blob(stmt, i); value.blob.len = (size_t)sqlite3_column_bytes(stmt, i); break; case SQLITE_NULL: /* TODO: allow null to be encoded with 0 bytes */ value.null = 0; break; case SQLITE_TEXT: value.text = (text_t)sqlite3_column_text(stmt, i); break; case DQLITE_UNIXTIME: value.integer = sqlite3_column_int64(stmt, i); break; case DQLITE_ISO8601: value.text = (text_t)sqlite3_column_text(stmt, i); if (value.text == NULL) { value.text = ""; } break; case DQLITE_BOOLEAN: value.integer = sqlite3_column_int64(stmt, i); break; default: return SQLITE_ERROR; } rc = tuple_encoder__next(&encoder, &value); if (rc != 0) { return rc; } } return SQLITE_OK; } int query__batch(sqlite3_stmt *stmt, struct buffer *buffer) { int n; /* Column count */ int i; uint64_t n64; char *cursor; int rc; n = sqlite3_column_count(stmt); if (n <= 0) { return SQLITE_ERROR; } n64 = (uint64_t)n; /* Insert the column count */ cursor = buffer__advance(buffer, sizeof(uint64_t)); assert(cursor != NULL); uint64__encode(&n64, &cursor); /* Insert the column names */ for (i = 0; i < n; i++) { const char *name = sqlite3_column_name(stmt, i); cursor = buffer__advance(buffer, text__sizeof(&name)); if (cursor == NULL) { return SQLITE_NOMEM; } text__encode(&name, &cursor); } /* Insert the rows. */ do { if (buffer__offset(buffer) >= buffer->page_size) { /* If we have already filled a memory page, let's break * for now; we'll send more rows in a separate * response. */ rc = SQLITE_ROW; break; } rc = sqlite3_step(stmt); if (rc != SQLITE_ROW) { break; } rc = encode_row(stmt, buffer, n); if (rc != SQLITE_OK) { break; } } while (1); return rc; } dqlite-1.16.7/src/query.h000066400000000000000000000007051465252713400152100ustar00rootroot00000000000000/** * Step through a query progressively encoding the row tuples. */ #ifndef QUERY_H_ #define QUERY_H_ #include <sqlite3.h> #include "lib/buffer.h" #include "lib/serialize.h" /** * Step through the given query statement progressively encoding the yielded row * tuples, either until #SQLITE_DONE is returned or a full page of the given * buffer is filled.
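 *
 * In the first case #SQLITE_DONE is returned; in the second #SQLITE_ROW is
 * returned, signalling that more batches must be requested. Each batch
 * starts with the column count and column names, followed by the encoded
 * row tuples.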
*/ int query__batch(sqlite3_stmt *stmt, struct buffer *buffer); #endif /* QUERY_H_*/ dqlite-1.16.7/src/raft.h000066400000000000000000002063061465252713400150040ustar00rootroot00000000000000#if defined(USE_SYSTEM_RAFT) #include #include #include #elif !defined(RAFT_H) #define RAFT_H #include #include #include #include #include #include #include "lib/sm.h" #include "lib/queue.h" #ifndef RAFT_API #define RAFT_API __attribute__((visibility("default"))) #endif #ifndef DQLITE_VISIBLE_TO_TESTS #define DQLITE_VISIBLE_TO_TESTS __attribute__((visibility("default"))) #endif /** * Version. */ #define RAFT_VERSION_MAJOR 0 #define RAFT_VERSION_MINOR 18 #define RAFT_VERSION_RELEASE 0 #define RAFT_VERSION_NUMBER \ (RAFT_VERSION_MAJOR * 100 * 100 + RAFT_VERSION_MINOR * 100 + \ RAFT_VERSION_RELEASE) int raft_version_number(void); /** * Error codes. */ enum { RAFT_NOMEM = 1, /* Out of memory */ RAFT_BADID, /* Server ID is not valid */ RAFT_DUPLICATEID, /* Server ID already in use */ RAFT_DUPLICATEADDRESS, /* Server address already in use */ RAFT_BADROLE, /* Server role is not valid */ RAFT_MALFORMED, RAFT_NOTLEADER, RAFT_LEADERSHIPLOST, RAFT_SHUTDOWN, RAFT_CANTBOOTSTRAP, RAFT_CANTCHANGE, RAFT_CORRUPT, RAFT_CANCELED, RAFT_NAMETOOLONG, RAFT_TOOBIG, RAFT_NOCONNECTION, RAFT_BUSY, RAFT_IOERR, /* File system or storage error */ RAFT_NOTFOUND, /* Resource not found */ RAFT_INVALID, /* Invalid parameter */ RAFT_UNAUTHORIZED, /* No access to a resource */ RAFT_NOSPACE, /* Not enough space on disk */ RAFT_TOOMANY /* Some system or raft limit was hit */ }; /** * Size of human-readable error message buffers. */ #define RAFT_ERRMSG_BUF_SIZE 256 /** * Return the error message describing the given error code. */ RAFT_API const char *raft_strerror(int errnum); typedef unsigned long long raft_id; /** * Hold the value of a raft term. Guaranteed to be at least 64-bit long. */ typedef unsigned long long raft_term; /** * Hold the value of a raft entry index. Guaranteed to be at least 64-bit long. */ typedef unsigned long long raft_index; /** * Hold a time value expressed in milliseconds since the epoch. */ typedef unsigned long long raft_time; /** * Hold the features a raft node is capable of. */ typedef uint64_t raft_flags; /** * A data buffer. */ struct raft_buffer { void *base; /* Pointer to the buffer data. */ size_t len; /* Length of the buffer. */ }; /** * Server role codes. */ enum { RAFT_STANDBY, /* Replicate log, does not participate in quorum. */ RAFT_VOTER, /* Replicate log, does participate in quorum. */ RAFT_SPARE /* Does not replicate log, or participate in quorum. */ }; /** * Hold information about a single server in the cluster configuration. * WARNING: This struct is encoded/decoded, be careful when adapting it. */ struct raft_server { raft_id id; /* Server ID, must be greater than zero. */ char *address; /* Server address. User defined. */ int role; /* Server role. */ }; /** * Hold information about all servers currently part of the cluster. * WARNING: This struct is encoded/decoded, be careful when adapting it. */ struct raft_configuration { struct raft_server *servers; /* Array of servers member of the cluster. */ unsigned n; /* Number of servers in the array. */ }; /** * Initialize an empty raft configuration. */ RAFT_API void raft_configuration_init(struct raft_configuration *c); /** * Release all memory used by the given configuration object. */ RAFT_API void raft_configuration_close(struct raft_configuration *c); /** * Add a server to a raft configuration. 
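 *
 * For example, a single-voter bootstrap configuration could be built like
 * this (illustrative sketch, not part of the original header):
 *
 *   struct raft_configuration conf;
 *   raft_configuration_init(&conf);
 *   rv = raft_configuration_add(&conf, 1, "127.0.0.1:9001", RAFT_VOTER);
 *
 * with raft_configuration_close(&conf) releasing it when no longer needed.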
* * The @id must be greater than zero and @address point to a valid string. * * The @role must be either #RAFT_VOTER, #RAFT_STANDBY, #RAFT_SPARE. * * If @id or @address are already in use by another server in the configuration, * an error is returned. * * The @address string will be copied and can be released after this function * returns. */ RAFT_API int raft_configuration_add(struct raft_configuration *c, raft_id id, const char *address, int role); /** * Encode the given configuration object. * * The memory of the returned buffer is allocated using raft_malloc(), and * client code is responsible for releasing it when no longer needed. */ RAFT_API int raft_configuration_encode(const struct raft_configuration *c, struct raft_buffer *buf); /** * Hash function which outputs a 64-bit value based on a text and a number. * * This can be used to generate a unique ID for a new server being added, for * example based on its address and on the current time in milliseconds since * the Epoch. * * It's internally implemented as a SHA1 where only the last 8 bytes of the hash * value are kept. */ RAFT_API unsigned long long raft_digest(const char *text, unsigned long long n); /** * Log entry types. */ enum { RAFT_COMMAND = 1, /* Command for the application FSM. */ RAFT_BARRIER, /* Wait for all previous commands to be applied. */ RAFT_CHANGE /* Raft configuration change. */ }; /** * A small fixed-size inline buffer that stores extra data for a raft_entry * that is different for each node in the cluster. * * A leader initializes the local data for an entry before passing it into * raft_apply. This local data is stored in the volatile raft log and also * in the persistent raft log on the leader. AppendEntries messages sent by * the leader never contain the local data for entries. * * When a follower accepts an AppendEntries request, it invokes a callback * provided by the FSM to fill out the local data for each new entry before * appending the entries to its log (volatile and persistent). This local * data doesn't have to be the same as the local data that the leader computed. * * When starting up, a raft node reads the local data for each entry for its * persistent log as part of populating the volatile log. */ struct raft_entry_local_data { /* Must be the only member of this struct. */ uint8_t buf[16]; }; /** * A single entry in the raft log. * * An entry that originated from this raft instance while it was the leader * (typically via client calls to raft_apply()) should normally have a @buf * attribute referencing directly the memory that was originally allocated by * the client itself to contain the entry data, and the @batch attribute set to * #NULL. * * An entry that was received from the network as part of an AppendEntries RPC * or that was loaded from disk at startup should normally have a @batch * attribute that points to a contiguous chunk of memory that contains the data * of the entry itself plus possibly the data for other entries that were * received or loaded with it at the same time. In this case the @buf pointer * will be equal to the @batch pointer plus an offset, that locates the position * of the entry's data within the batch. * * When the @batch attribute is not #NULL the raft library will take care of * releasing that memory only once there are no more references to the * associated entries. * * This arrangement makes it possible to minimize the amount of memory-copying * when performing I/O. 
* The @is_local field is set to `true` by a leader that appends an entry to its * volatile log. It is set to `false` by a follower that copies an entry received * via AppendEntries to its volatile log. It is not represented in the AppendEntries * message or in the persistent log. This field can be used by the FSM's `apply` * callback to handle a COMMAND entry differently depending on whether it * originated locally. * * Note: The @local_data and @is_local fields do not exist when we use an external * libraft, because the last separate release of libraft predates their addition. * The ifdef at the very top of this file ensures that we use the system raft headers * when we build against an external libraft, so there will be no ABI mismatch as * a result of incompatible struct layouts. */ struct raft_entry { raft_term term; /* Term in which the entry was created. */ unsigned short type; /* Type (FSM command, barrier, config change). */ bool is_local; /* Placed here so it goes in the padding after @type. */ struct raft_buffer buf; /* Entry data. */ struct raft_entry_local_data local_data; void *batch; /* Batch that buf's memory points to, if any. */ }; /** * Hold the arguments of a RequestVote RPC. * * The RequestVote RPC is invoked by candidates to gather votes. */ struct raft_request_vote { int version; raft_term term; /* Candidate's term. */ raft_id candidate_id; /* ID of the server requesting the vote. */ raft_index last_log_index; /* Index of candidate's last log entry. */ raft_index last_log_term; /* Term of log entry at last_log_index. */ bool disrupt_leader; /* True if current leader should be discarded. */ bool pre_vote; /* True if this is a pre-vote request. */ }; #define RAFT_REQUEST_VOTE_VERSION 2 /** * Hold the result of a RequestVote RPC. */ struct raft_request_vote_result { int version; raft_term term; /* Receiver's current term (candidate updates itself). */ bool vote_granted; /* True means candidate received vote. */ bool pre_vote; /* The response to a pre-vote RequestVote or not. */ }; #define RAFT_REQUEST_VOTE_RESULT_VERSION 2 /** * Hold the arguments of an AppendEntries RPC. * * The AppendEntries RPC is invoked by the leader to replicate log entries. It's * also used as heartbeat (figure 3.1). */ struct raft_append_entries { int version; raft_term term; /* Leader's term. */ raft_index prev_log_index; /* Index of log entry preceding new ones. */ raft_term prev_log_term; /* Term of entry at prev_log_index. */ raft_index leader_commit; /* Leader's commit index. */ struct raft_entry *entries; /* Log entries to append. */ unsigned n_entries; /* Size of the log entries array. */ }; #define RAFT_APPEND_ENTRIES_VERSION 0 /** * Hold the result of an AppendEntries RPC (figure 3.1). */ struct raft_append_entries_result { int version; raft_term term; /* Receiver's current_term. */ raft_index rejected; /* If non-zero, the index that was rejected. */ raft_index last_log_index; /* Receiver's last log entry index, as hint. */ raft_flags features; /* Feature flags. */ }; #define RAFT_APPEND_ENTRIES_RESULT_VERSION 1 typedef uint32_t checksum_t; typedef uint32_t pageno_t; struct page_checksum { pageno_t page_no; checksum_t checksum; }; /* page range [from, to], with to included */ struct page_from_to { pageno_t from; pageno_t to; }; enum raft_result { RAFT_RESULT_OK = 0, RAFT_RESULT_UNEXPECTED = 1, RAFT_RESULT_DONE = 2, }; /** * Hold the arguments of an InstallSnapshot RPC (figure 5.3). */ struct raft_install_snapshot { int version; raft_term term; /* Leader's term.
*/ raft_index last_index; /* Index of last entry in the snapshot. */ raft_term last_term; /* Term of last_index. */ struct raft_configuration conf; /* Config as of last_index. */ raft_index conf_index; /* Commit index of conf. */ struct raft_buffer data; /* Raw snapshot data. */ enum raft_result result; }; #define RAFT_INSTALL_SNAPSHOT_VERSION 0 struct raft_install_snapshot_result { int version; enum raft_result result; }; #define RAFT_INSTALL_SNAPSHOT_RESULT_VERSION 0 struct raft_signature { int version; const char *db; struct page_from_to page_from_to; pageno_t cs_page_no; enum raft_result result; }; #define RAFT_SIGNATURE_VERSION 0 struct raft_signature_result { int version; const char *db; struct page_checksum *cs; unsigned int cs_nr; pageno_t cs_page_no; enum raft_result result; }; #define RAFT_SIGNATURE_RESULT_VERSION 0 struct raft_install_snapshot_mv { int version; const char *db; struct page_from_to *mv; unsigned int mv_nr; enum raft_result result; }; #define RAFT_INSTALL_SNAPSHOT_MV_VERSION 0 struct raft_install_snapshot_mv_result { int version; const char *db; pageno_t last_known_page_no; /* used for retries and message losses */ enum raft_result result; }; #define RAFT_INSTALL_SNAPSHOT_MV_RESULT_VERSION 0 struct raft_install_snapshot_cp { int version; const char *db; pageno_t page_no; struct raft_buffer page_data; enum raft_result result; }; #define RAFT_INSTALL_SNAPSHOT_CP_VERSION 0 struct raft_install_snapshot_cp_result { int version; pageno_t last_known_page_no; /* used for retries and message losses */ enum raft_result result; }; #define RAFT_INSTALL_SNAPSHOT_CP_RESULT_VERSION 0 /** * Hold the arguments of a TimeoutNow RPC. * * The TimeoutNow RPC is invoked by leaders to transfer leadership to a * follower. */ struct raft_timeout_now { int version; raft_term term; /* Leader's term. */ raft_index last_log_index; /* Index of leader's last log entry. */ raft_index last_log_term; /* Term of log entry at last_log_index. */ }; #define RAFT_TIMEOUT_NOW_VERSION 0 /** * Type codes for RPC messages. */ enum { RAFT_IO_APPEND_ENTRIES = 1, RAFT_IO_APPEND_ENTRIES_RESULT, RAFT_IO_REQUEST_VOTE, RAFT_IO_REQUEST_VOTE_RESULT, RAFT_IO_INSTALL_SNAPSHOT, RAFT_IO_TIMEOUT_NOW, RAFT_IO_SIGNATURE, RAFT_IO_SIGNATURE_RESULT, RAFT_IO_INSTALL_SNAPSHOT_RESULT, RAFT_IO_INSTALL_SNAPSHOT_MV, RAFT_IO_INSTALL_SNAPSHOT_MV_RESULT, RAFT_IO_INSTALL_SNAPSHOT_CP, RAFT_IO_INSTALL_SNAPSHOT_CP_RESULT, }; /** * A single RPC message that can be sent or received over the network. * * The RPC message types all have a `version` field. * In the libuv io implementation, `version` is filled out during decoding * and is based on the size of the message on the wire, see e.g. * `sizeofRequestVoteV1`. The version number in the RAFT_MESSAGE_XXX_VERSION * macro needs to be bumped every time the message is updated. * * Notes when adding a new message type to raft: * raft_io implementations compiled against old versions of raft don't know the * new message type and possibly have not allocated enough space for it. When * such an application receives a new message over the wire, the raft_io * implementation will err out or drop the message, because it doesn't know how * to decode it based on its type. * raft_io implementations compiled against versions of raft that know the new * message type but at runtime are linked against an older raft lib, will pass * the message to raft, where raft will drop it. 
* When raft receives a message and accesses a field of a new message type, * the raft_io implementation must have known about the new message type, * so it was compiled against a modern enough version of raft, and memory * accesses should be safe. * * Sending a new message type with a raft_io implementation that doesn't know * the type is safe, the implementation should drop the message based on its * type and will not try to access fields it doesn't know the existence of. */ struct raft_message { unsigned short type; /* RPC type code. */ raft_id server_id; /* ID of sending or destination server. */ const char *server_address; /* Address of sending or destination server. */ union { /* Type-specific data */ struct raft_request_vote request_vote; struct raft_request_vote_result request_vote_result; struct raft_append_entries append_entries; struct raft_append_entries_result append_entries_result; struct raft_install_snapshot install_snapshot; struct raft_install_snapshot_result install_snapshot_result; struct raft_signature signature; struct raft_signature_result signature_result; struct raft_install_snapshot_cp install_snapshot_cp; struct raft_install_snapshot_cp_result install_snapshot_cp_result; struct raft_install_snapshot_mv install_snapshot_mv; struct raft_install_snapshot_mv_result install_snapshot_mv_result; struct raft_timeout_now timeout_now; }; }; /** * Hold the details of a snapshot. * The user-provided raft_buffer structs should provide the user with enough * flexibility to adapt/evolve snapshot formats. * If this struct would NEED to be adapted in the future, raft can always move * to a new struct with a new name and a new raft_io version. */ struct raft_snapshot { /* Index and term of last entry included in the snapshot. */ raft_index index; raft_term term; /* Last committed configuration included in the snapshot, along with the * index it was committed at. */ struct raft_configuration configuration; raft_index configuration_index; /* Content of the snapshot. When a snapshot is taken, the user FSM can * fill the bufs array with more than one buffer. When a snapshot is * restored, there will always be a single buffer. */ struct raft_buffer *bufs; unsigned n_bufs; }; /** * Asynchronous request to send an RPC message. */ struct raft_io_send; typedef void (*raft_io_send_cb)(struct raft_io_send *req, int status); struct raft_io_send { void *data; /* User data */ raft_io_send_cb cb; /* Request callback */ }; /** * Asynchronous request to store new log entries. */ struct raft_io_append; typedef void (*raft_io_append_cb)(struct raft_io_append *req, int status); struct raft_io_append { void *data; /* User data */ raft_io_append_cb cb; /* Request callback */ }; /** * Asynchronous request to store a new snapshot. */ struct raft_io_snapshot_put; typedef void (*raft_io_snapshot_put_cb)(struct raft_io_snapshot_put *req, int status); struct raft_io_snapshot_put { void *data; /* User data */ raft_io_snapshot_put_cb cb; /* Request callback */ }; /** * Asynchronous request to load the most recent snapshot available. */ struct raft_io_snapshot_get; typedef void (*raft_io_snapshot_get_cb)(struct raft_io_snapshot_get *req, struct raft_snapshot *snapshot, int status); struct raft_io_snapshot_get { void *data; /* User data */ raft_io_snapshot_get_cb cb; /* Request callback */ }; /** * Asynchronous work request. 
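 The @work function runs off the main
 * loop (it is allowed to block); its integer return value is then delivered
 * back on the loop thread as the @status argument of @cb.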
*/ struct raft_io_async_work; typedef int (*raft_io_async_work_fn)(struct raft_io_async_work *req); typedef void (*raft_io_async_work_cb)(struct raft_io_async_work *req, int status); struct raft_io_async_work { void *data; /* User data */ raft_io_async_work_fn work; /* Function to run async from the main loop */ raft_io_async_work_cb cb; /* Request callback */ }; /** * Customizable tracer, for debugging purposes. */ struct raft_tracer { /** * Implementation-defined state object. */ void *impl; /** * Whether this tracer should emit messages. */ bool enabled; /** * Trace level. */ unsigned level; /** * Emit the given trace message, possibly decorating it with the * provided metadata. */ void (*emit)(struct raft_tracer *t, const char *file, unsigned int line, const char *func, unsigned int level, const char *message); }; struct raft_io; /* Forward declaration. */ /** * Callback invoked by the I/O implementation at regular intervals. */ typedef void (*raft_io_tick_cb)(struct raft_io *io); /** * Callback invoked by the I/O implementation when an RPC message is received. */ typedef void (*raft_io_recv_cb)(struct raft_io *io, struct raft_message *msg); typedef void (*raft_io_close_cb)(struct raft_io *io); /** * version field MUST be filled out by user. * When moving to a new version, the user MUST implement the newly added * methods. */ struct raft_io { int version; /* 1 or 2 */ void *data; void *impl; char errmsg[RAFT_ERRMSG_BUF_SIZE]; int (*init)(struct raft_io *io, raft_id id, const char *address); void (*close)(struct raft_io *io, raft_io_close_cb cb); int (*load)(struct raft_io *io, raft_term *term, raft_id *voted_for, struct raft_snapshot **snapshot, raft_index *start_index, struct raft_entry *entries[], size_t *n_entries); int (*start)(struct raft_io *io, unsigned msecs, raft_io_tick_cb tick, raft_io_recv_cb recv); int (*bootstrap)(struct raft_io *io, const struct raft_configuration *conf); int (*recover)(struct raft_io *io, const struct raft_configuration *conf); int (*set_term)(struct raft_io *io, raft_term term); int (*set_vote)(struct raft_io *io, raft_id server_id); int (*send)(struct raft_io *io, struct raft_io_send *req, const struct raft_message *message, raft_io_send_cb cb); int (*append)(struct raft_io *io, struct raft_io_append *req, const struct raft_entry entries[], unsigned n, raft_io_append_cb cb); int (*truncate)(struct raft_io *io, raft_index index); int (*snapshot_put)(struct raft_io *io, unsigned trailing, struct raft_io_snapshot_put *req, const struct raft_snapshot *snapshot, raft_io_snapshot_put_cb cb); int (*snapshot_get)(struct raft_io *io, struct raft_io_snapshot_get *req, raft_io_snapshot_get_cb cb); raft_time (*time)(struct raft_io *io); int (*random)(struct raft_io *io, int min, int max); /* Field(s) below added since version 2. */ int (*async_work)(struct raft_io *io, struct raft_io_async_work *req, raft_io_async_work_cb cb); }; /** * version field MUST be filled out by user. * When moving to a new version, the user MUST initialize the new methods, * either with an implementation or with NULL. * * version 2: * introduces `snapshot_finalize`, when this method is not NULL, it will * always run after a successful call to `snapshot`, whether the snapshot has * been successfully written to disk or not. If it is set, raft will * assume no ownership of any of the `raft_buffer`s and the responsibility to * clean up lies with the user of raft. * `snapshot_finalize` can be used to e.g. release a lock that was taken during * a call to `snapshot`. 
Until `snapshot_finalize` is called, raft can access * the data contained in the `raft_buffer`s. * * version 3: * Adds support for async snapshots through the `snapshot_async` function. * When this method is provided, raft will call `snapshot` in the main loop, * and when successful, will call `snapshot_async` using the `io->async_work` * method, so blocking I/O calls are allowed in the implementation. After the * `snapshot_async` completes, `snapshot_finalize` will be called in the main * loop, independent of the return value of `snapshot_async`. * An implementation that does not use asynchronous snapshots MUST set * `snapshot_async` to NULL. * All memory allocated by the snapshot routines MUST be freed by the snapshot * routines themselves. */ struct raft_fsm { int version; /* 1, 2 or 3 */ void *data; int (*apply)(struct raft_fsm *fsm, const struct raft_buffer *buf, void **result); int (*snapshot)(struct raft_fsm *fsm, struct raft_buffer *bufs[], unsigned *n_bufs); int (*restore)(struct raft_fsm *fsm, struct raft_buffer *buf); /* Fields below added since version 2. */ int (*snapshot_finalize)(struct raft_fsm *fsm, struct raft_buffer *bufs[], unsigned *n_bufs); /* Fields below added since version 3. */ int (*snapshot_async)(struct raft_fsm *fsm, struct raft_buffer *bufs[], unsigned *n_bufs); }; struct raft; /* Forward declaration. */ /** * State codes. */ enum { RAFT_UNAVAILABLE, RAFT_FOLLOWER, RAFT_CANDIDATE, RAFT_LEADER }; /** * State callback to invoke if raft's state changes. */ typedef void (*raft_state_cb)(struct raft *raft, unsigned short old_state, unsigned short new_state); struct raft_progress; /** * Close callback. * * It's safe to release the memory of a raft instance only after this callback * has fired. */ typedef void (*raft_close_cb)(struct raft *raft); struct raft_change; /* Forward declaration */ struct raft_transfer; /* Forward declaration */ struct raft_log; /** * Hold and drive the state of a single raft server in a cluster. * When replacing reserved fields in the middle of this struct, you MUST use a * type with the same size and alignment requirements as the original type. */ struct raft { void *data; /* Custom user data. */ struct raft_tracer *tracer; /* Tracer implementation. */ struct raft_io *io; /* Disk and network I/O implementation. */ struct raft_fsm *fsm; /* User-defined FSM to apply commands to. */ raft_id id; /* Server ID of this raft instance. */ char *address; /* Server address of this raft instance. */ /* * Cache of the server's persistent state, updated on stable storage * before responding to RPCs (Figure 3.1). */ raft_term current_term; /* Latest term server has seen. */ raft_id voted_for; /* Candidate that received vote in current term. */ struct raft_log *log; /* Log entries. */ /* * Current membership configuration (Chapter 4). * * At any given moment the current configuration can be committed or * uncommitted. * * If a server is voting, the log entry with index 1 must always contain * the first committed configuration. * * At all times #configuration_committed_index is either zero or is the * index of the most recent log entry of type #RAFT_CHANGE that we know * to be committed. That means #configuration_committed_index is always * equal or lower than #commit_index. * * At all times #configuration_uncommitted_index is either zero or is * the index of an uncommitted log entry of type #RAFT_CHANGE. There can * be at most one uncommitted entry of type #RAFT_CHANGE because we * allow only one configuration change at a time. 
* * At all times #configuration_last_snapshot is a copy of the * configuration contained in the most recent snapshot, if any. * * The possible scenarios are: * * 1. #configuration_committed_index and * #configuration_uncommitted_index are both zero. This should only * happen when a brand new server starts joining a cluster and is * waiting to receive log entries from the current leader. In this case * #configuration and #configuration_last_snapshot must be empty and * have no servers. * * 2. #configuration_committed_index is non-zero and * #configuration_uncommitted_index is zero. This means that * #configuration is committed and there is no pending configuration * change. The content of #configuration must match the one of the * log entry at #configuration_committed_index. * * 3. #configuration_committed_index and * #configuration_uncommitted_index are both non-zero, with the latter * being greater than the former. This means that #configuration is * uncommitted and represents a pending configuration change. The * content of #configuration must match the one of the log entry at * #configuration_uncommitted_index. * * When a snapshot is taken, a copy of the most recent configuration * known to be committed (i.e. the configuration contained in the log * entry at #configuration_committed_index) is saved in * #configuration_last_snapshot, so it can be easily retrieved in case * the log gets truncated because of compaction and does not contain the * entry at #configuration_committed_index anymore. Likewise, if a * snapshot is restored its associated configuration is saved in * #configuration_last_snapshot. */ struct raft_configuration configuration; struct raft_configuration configuration_last_snapshot; raft_index configuration_committed_index; raft_index configuration_uncommitted_index; /* * Election timeout in milliseconds (default 1000). * * From 3.4: * * Raft uses a heartbeat mechanism to trigger leader election. When * servers start up, they begin as followers. A server remains in * follower state as long as it receives valid RPCs from a leader or * candidate. Leaders send periodic heartbeats (AppendEntries RPCs * that carry no log entries) to all followers in order to maintain * their authority. If a follower receives no communication over a * period of time called the election timeout, then it assumes there is * no viable leader and begins an election to choose a new leader. * * This is the baseline value and will be randomized between 1x and 2x. * * See raft_change_election_timeout() to customize the value of this * attribute. */ unsigned election_timeout; /* * Heartbeat timeout in milliseconds (default 100). This is relevant * only when the raft instance is in leader state: empty * AppendEntries RPCs will be sent if this number of milliseconds * elapses without any user-triggered AppendEntries RPCs being sent. * * From Figure 3.1: * * [Leaders] Send empty AppendEntries RPC during idle periods to * prevent election timeouts. */ unsigned heartbeat_timeout; /* * When the leader sends an InstallSnapshot RPC to a follower it will * consider the RPC as failed after this timeout and retry. */ unsigned install_snapshot_timeout; /* * The fields below hold the part of the server's volatile state which * is always applicable regardless of whether the server is * follower, candidate or leader (Figure 3.1). This state is rebuilt * automatically after a server restart.
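 In standard Raft
 * terms #commit_index and #last_applied correspond to commitIndex and
 * lastApplied from Figure 3.1, so #last_applied never exceeds
 * #commit_index.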
*/ raft_index commit_index; /* Highest log entry known to be committed */ raft_index last_applied; /* Highest log entry applied to the FSM */ raft_index last_stored; /* Highest log entry persisted on disk */ /* * Current server state of this raft instance, along with a union * defining state-specific values. */ unsigned short state; union { struct /* Follower */ { unsigned randomized_election_timeout; /* Timer expiration. */ struct /* Current leader info. */ { raft_id id; char *address; } current_leader; uint64_t append_in_flight_count; uint64_t reserved[7]; /* Future use */ } follower_state; struct { unsigned randomized_election_timeout; /* Timer expiration. */ bool *votes; /* Vote results. */ bool disrupt_leader; /* For leadership transfer */ bool in_pre_vote; /* True in pre-vote phase. */ uint64_t reserved[8]; /* Future use */ } candidate_state; struct { struct raft_progress *progress; /* Per-server replication state. */ struct raft_change *change; /* Pending membership change. */ raft_id promotee_id; /* ID of server being promoted. */ unsigned short round_number; /* Current sync round. */ raft_index round_index; /* Target of the current round. */ raft_time round_start; /* Start of current round. */ queue requests; /* Outstanding client requests. */ uint32_t voter_contacts; /* Current number of voting nodes we are in contact with */ uint32_t reserved2; /* Future use */ uint64_t reserved[7]; /* Future use */ } leader_state; }; /* Election timer start. * * This timer has different purposes depending on the state. Followers * convert to candidate after the randomized election timeout has * elapsed without leader contact. Candidates start a new election after * the randomized election timeout has elapsed without a winner. Leaders * step down after the election timeout has elapsed without contacting a * majority of voting servers. */ raft_time election_timer_start; /* In-progress leadership transfer request, if any. */ struct raft_transfer *transfer; /* * Information about the last snapshot that was taken (if any). */ struct { unsigned threshold; /* N. of entries before snapshot */ unsigned trailing; /* N. of trailing entries to retain */ struct raft_snapshot pending; /* In progress snapshot */ struct raft_io_snapshot_put put; /* Store snapshot request */ uint64_t reserved[8]; /* Future use */ } snapshot; /* * Callback to invoke once a close request has completed. */ raft_close_cb close_cb; /* * Human-readable message providing diagnostic information about the * last error that occurred. */ char errmsg[RAFT_ERRMSG_BUF_SIZE]; /* Whether to use pre-vote to avoid disconnected servers disrupting the * current leader, as described in 4.2.3 and 9.6. */ bool pre_vote; /* Limit how long to wait for a stand-by to catch up with the log when * it's being promoted to voter. */ unsigned max_catch_up_rounds; unsigned max_catch_up_round_duration; /* uint64_t because we used a reserved field. In reality this is a pointer * to a `struct raft_callbacks` that can be used to store e.g. various * user-supplied callbacks. */ uint64_t callbacks; /* Future extensions */ uint64_t reserved[31]; }; RAFT_API int raft_init(struct raft *r, struct raft_io *io, struct raft_fsm *fsm, raft_id id, const char *address); RAFT_API void raft_close(struct raft *r, raft_close_cb cb); /** * This function MUST be called after raft_init and before raft_start. * @cb will be called every time the raft state changes.
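 *
 * A minimal callback sketch (hypothetical, not part of the original
 * header):
 *
 *   static void on_state(struct raft *r, unsigned short old_state,
 *                        unsigned short new_state)
 *   {
 *       (void)r;
 *       (void)old_state;
 *       if (new_state == RAFT_LEADER) {
 *           // e.g. react to this node becoming leader
 *       }
 *   }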
*/ RAFT_API void raft_register_state_cb(struct raft *r, raft_state_cb cb); /** * Bootstrap this raft instance using the given configuration. The instance must * not have been started yet and must be completely pristine, otherwise * #RAFT_CANTBOOTSTRAP will be returned. */ RAFT_API int raft_bootstrap(struct raft *r, const struct raft_configuration *conf); /** * Force a new configuration in order to recover from a loss of quorum where the * current configuration cannot be restored, such as when a majority of servers * die at the same time. * * This works by appending the new configuration directly to the log stored on * disk. * * In order for this operation to be safe you must follow these steps: * * 1. Make sure that no servers in the cluster are running, either because they * died or because you manually stopped them. * * 2. Run @raft_recover exactly one time, on the non-dead server which has * the highest term and the longest log. * * 3. Copy the data directory of the server you ran @raft_recover on to all * other non-dead servers in the cluster, replacing their current data * directory. * * 4. Restart all servers. */ RAFT_API int raft_recover(struct raft *r, const struct raft_configuration *conf); RAFT_API int raft_start(struct raft *r); /** * Set the election timeout. * * Every raft instance is initialized with a default election timeout of 1000 * milliseconds. If you wish to tweak it, call this function before starting * your event loop. * * From Chapter 9: * * We recommend a range that is 10-20 times the one-way network latency, which * keeps split vote rates under 40% in all cases for reasonably sized * clusters, and typically results in much lower rates. * * Note that the current random election timer will be reset and a new one * will be generated. */ RAFT_API void raft_set_election_timeout(struct raft *r, unsigned msecs); /** * Set the heartbeat timeout. */ RAFT_API void raft_set_heartbeat_timeout(struct raft *r, unsigned msecs); /** * Set the snapshot install timeout. */ RAFT_API void raft_set_install_snapshot_timeout(struct raft *r, unsigned msecs); /** * Number of outstanding log entries before starting a new snapshot. The default * is 1024. */ RAFT_API void raft_set_snapshot_threshold(struct raft *r, unsigned n); /** * Enable or disable pre-vote support. Pre-vote is turned off by default. */ RAFT_API void raft_set_pre_vote(struct raft *r, bool enabled); /** * Number of outstanding log entries to keep in the log after a snapshot has * been taken. This avoids sending snapshots when a follower is behind by just a * few entries. The default is 128. */ RAFT_API void raft_set_snapshot_trailing(struct raft *r, unsigned n); /** * Set the maximum number of catch-up rounds to try when replicating entries * to a stand-by server that is being promoted to voter, before giving up and * failing the configuration change. The default is 10. */ RAFT_API void raft_set_max_catch_up_rounds(struct raft *r, unsigned n); /** * Set the maximum duration of a catch-up round when replicating entries to a * stand-by server that is being promoted to voter. The default is 5 seconds. */ RAFT_API void raft_set_max_catch_up_round_duration(struct raft *r, unsigned msecs); /** * Return a human-readable description of the last error that occurred. */ RAFT_API const char *raft_errmsg(struct raft *r); /** * Return the code of the current raft state (follower/candidate/leader).
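 The returned value is one of
 * #RAFT_UNAVAILABLE, #RAFT_FOLLOWER, #RAFT_CANDIDATE or #RAFT_LEADER.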
*/ RAFT_API int raft_state(struct raft *r); /** * Return the code of the current raft role (spare/standby/voter), * or -1 if this server is not in the current configuration. */ RAFT_API int raft_role(struct raft *r); /** * Return the ID and address of the current known leader, if any. */ RAFT_API void raft_leader(struct raft *r, raft_id *id, const char **address); /** * Return the index of the last entry that was appended to the local log. */ RAFT_API raft_index raft_last_index(struct raft *r); /** * Return the index of the last entry that was applied to the local FSM. */ RAFT_API raft_index raft_last_applied(struct raft *r); /** * Return the number of voting servers that the leader has recently been in * contact with. This can be used to help determine whether the cluster may be * in a degraded/at risk state. * * Returns valid values >= 1, because a leader is always in contact with * itself. * Returns -1 if called on a follower. * * Note that the value returned may be out of date, and so should not be relied * upon for absolute correctness. */ RAFT_API int raft_voter_contacts(struct raft *r); /** * Common fields across client request types. * `req_id`, `client_id` and `unique_id` are currently unused. * `reserved` fields should be replaced by new members with the same size * and alignment requirements as `uint64_t`. */ #define RAFT__REQUEST \ void *data; \ int type; \ raft_index index; \ queue queue; \ uint8_t req_id[16]; \ uint8_t client_id[16]; \ uint8_t unique_id[16]; \ uint64_t reserved[4] /** * Asynchronous request to append a new command entry to the log and apply it to * the FSM when a quorum is reached. */ struct raft_apply; typedef void (*raft_apply_cb)(struct raft_apply *req, int status, void *result); struct raft_apply { RAFT__REQUEST; raft_apply_cb cb; }; /** * Propose to append commands to the log and apply them to the FSM once * committed. * * If this server is the leader, it will create @n new log entries of type * #RAFT_COMMAND using the given buffers as their payloads, append them to its * own log and attempt to replicate them on other servers by sending * AppendEntries RPCs. * * The memory pointed at by the @base attribute of each #raft_buffer in the * given array must have been allocated with raft_malloc() or a compatible * allocator. If this function returns 0, the ownership of this memory is * implicitly transferred to the raft library, which will take care of releasing * it when appropriate. Any further client access to such memory leads to * undefined behavior. * * The ownership of the memory of the @bufs array itself is not transferred to * the raft library, and, if allocated dynamically, must be deallocated by the * caller. * * If the command was successfully applied, r->last_applied will be equal to * the log entry index of the applied command when the cb is invoked. */ RAFT_API int raft_apply(struct raft *r, struct raft_apply *req, const struct raft_buffer bufs[], const struct raft_entry_local_data local_data[], const unsigned n, raft_apply_cb cb); /** * Asynchronous request to append a barrier entry. */ struct raft_barrier; typedef void (*raft_barrier_cb)(struct raft_barrier *req, int status); struct raft_barrier { RAFT__REQUEST; raft_barrier_cb cb; }; /** * Propose to append a log entry of type #RAFT_BARRIER. * * This can be used to ensure that there are no unapplied commands. */ RAFT_API int raft_barrier(struct raft *r, struct raft_barrier *req, raft_barrier_cb cb); /** * Asynchronous request to change the raft configuration. 
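 Only one change may be in flight
 * at any given time, matching the invariant that there can be at most one
 * uncommitted log entry of type #RAFT_CHANGE.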
*/ typedef void (*raft_change_cb)(struct raft_change *req, int status); struct raft_change { RAFT__REQUEST; raft_change_cb cb; }; /** * Add a new server to the cluster configuration. Its initial role will be * #RAFT_SPARE. */ RAFT_API int raft_add(struct raft *r, struct raft_change *req, raft_id id, const char *address, raft_change_cb cb); /** * Assign a new role to the given server. * * If the server already has the given role, or if the given role is unknown, * #RAFT_BADROLE is returned. */ RAFT_API int raft_assign(struct raft *r, struct raft_change *req, raft_id id, int role, raft_change_cb cb); /** * Remove the given server from the cluster configuration. */ RAFT_API int raft_remove(struct raft *r, struct raft_change *req, raft_id id, raft_change_cb cb); /** * Asynchronous request to transfer leadership. */ typedef void (*raft_transfer_cb)(struct raft_transfer *req); struct raft_transfer { RAFT__REQUEST; raft_id id; /* ID of target server. */ raft_time start; /* Start of leadership transfer. */ struct raft_io_send send; /* For sending TimeoutNow */ raft_transfer_cb cb; /* User callback */ }; /** * Transfer leadership to the server with the given ID. * * If the target server is not part of the configuration, or it's the leader * itself, or it's not a #RAFT_VOTER, then #RAFT_BADID is returned. * * The special value #0 means to automatically select a voting follower to * transfer leadership to. If there are no voting followers, return * #RAFT_NOTFOUND. * * When this server detects that the target server has become the leader, or * when @election_timeout milliseconds have elapsed, the given callback will be * invoked. * * After the callback fires, clients can check whether the operation was * successful or not by calling @raft_leader() and checking if it returns the * target server. */ RAFT_API int raft_transfer(struct raft *r, struct raft_transfer *req, raft_id id, raft_transfer_cb cb); /** * User-definable dynamic memory allocation functions. * * The @data field will be passed as first argument to all functions. */ struct raft_heap { void *data; /* User data */ void *(*malloc)(void *data, size_t size); void (*free)(void *data, void *ptr); void *(*calloc)(void *data, size_t nmemb, size_t size); void *(*realloc)(void *data, void *ptr, size_t size); void *(*aligned_alloc)(void *data, size_t alignment, size_t size); void (*aligned_free)(void *data, size_t alignment, void *ptr); }; DQLITE_VISIBLE_TO_TESTS void *raft_malloc(size_t size); DQLITE_VISIBLE_TO_TESTS void raft_free(void *ptr); DQLITE_VISIBLE_TO_TESTS void *raft_calloc(size_t nmemb, size_t size); DQLITE_VISIBLE_TO_TESTS void *raft_realloc(void *ptr, size_t size); DQLITE_VISIBLE_TO_TESTS void *raft_aligned_alloc(size_t alignment, size_t size); DQLITE_VISIBLE_TO_TESTS void raft_aligned_free(size_t alignment, void *ptr); /** * Use a custom dynamic memory allocator. */ DQLITE_VISIBLE_TO_TESTS void raft_heap_set(struct raft_heap *heap); /** * Use the default dynamic memory allocator (from the stdlib). This clears any * custom allocator specified with @raft_heap_set. */ DQLITE_VISIBLE_TO_TESTS void raft_heap_set_default(void); /** * Return a reference to the current dynamic memory allocator. * * This is intended for use by applications that want to temporarily replace * and then restore the original allocator, or that want to defer to the * original allocator in some circumstances.
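 For
 * example, a test could swap in a custom allocator and later restore the
 * saved one (illustrative sketch; `test_heap` is hypothetical):
 *
 *   const struct raft_heap *saved = raft_heap_get();
 *   raft_heap_set(&test_heap);
 *   // ... exercise code paths under the test allocator ...
 *   raft_heap_set((struct raft_heap *)saved);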
* * The behavior of attempting to mutate the default allocator through the * pointer returned by this function, including attempting to deallocate * the backing memory, is undefined. */ DQLITE_VISIBLE_TO_TESTS const struct raft_heap *raft_heap_get(void); #undef RAFT__REQUEST struct raft_uv_transport; /** * Configure the given @raft_io instance to use a libuv-based I/O * implementation. * * The @dir path will be copied, and its memory can possibly be released once * this function returns. * * Return #RAFT_NAMETOOLONG if @dir exceeds the size of the internal buffer * that should hold it * * Return #RAFT_NOTFOUND if @dir does not exist. * * Return #RAFT_INVALID if @dir exists but it's not a directory. * * The implementation of metadata and log persistency is virtually the same as * the one found in LogCabin [0]. * * The disk files consist of metadata files, closed segments, and open * segments. Metadata files are used to track Raft metadata, such as the * server's current term, vote, and log's start index. Segments contain * contiguous entries that are part of the log. Closed segments are never * written to again (but may be renamed and truncated if a suffix of the log is * truncated). Open segments are where newly appended entries go. Once an open * segment reaches the maximum allowed size, it is closed and a new one is used. * * Metadata files are named "metadata1" and "metadata2". The code alternates * between these so that there is always at least one readable metadata file. * On boot, the readable metadata file with the higher version number is used. * * The format of a metadata file is: * * [8 bytes] Format (currently 1). * [8 bytes] Incremental version number. * [8 bytes] Current term. * [8 bytes] ID of server we voted for. * * Closed segments are named by the format string "%lu-%lu" with their * start and end indexes, both inclusive. Closed segments always contain at * least one entry; the end index is always at least as large as the start * index. Closed segment files may occasionally include data past their * filename's end index (these are ignored but a warning is logged). This can * happen if the suffix of the segment is truncated and a crash occurs at an * inopportune time (the segment file is first renamed, then truncated, and a * crash occurs in between). * * Open segments are named by the format string "open-%lu" with a unique * number. These should not exist when the server shuts down cleanly, but they * exist while the server is running and may be left around during a crash. * Open segments either contain entries which come after the last closed * segment or are full of zeros. When the server crashes while appending to an * open segment, the end of that file may be corrupt. We can't distinguish * between a corrupt file and a partially written entry. The code assumes it's * a partially written entry, logs a warning, and ignores it. * * Truncating a suffix of the log will remove all entries that are no longer * part of the log. Truncating a prefix of the log will only remove complete * segments that are before the new log start index. For example, if a * segment has entries 10 through 20 and the prefix of the log is truncated to * start at entry 15, that entire segment will be retained. * * Each segment file starts with a segment header, which currently contains * just an 8-byte version number for the format of that segment. The current * format (version 1) is just a concatenation of serialized entry batches. 
* * Each batch has the following format: * * [4 bytes] CRC32 checksum of the batch header, little endian. * [4 bytes] CRC32 checksum of the batch data, little endian. * [ ... ] Batch (as described in @raft_decode_entries_batch). * * [0] https://github.com/logcabin/logcabin/blob/master/Storage/SegmentedLog.h */ RAFT_API int raft_uv_init(struct raft_io *io, struct uv_loop_s *loop, const char *dir, struct raft_uv_transport *transport); /** * Release any memory allocated internally. */ RAFT_API void raft_uv_close(struct raft_io *io); /** * Set the block size that will be used for direct I/O. * * The default is to automatically detect the appropriate block size. */ RAFT_API void raft_uv_set_block_size(struct raft_io *io, size_t size); /** * Set the maximum initial size of newly created open segments. * * If the given size is not a multiple of the block size, the actual size will * be reduced to the closest multiple. * * The default is 8 megabytes. */ RAFT_API void raft_uv_set_segment_size(struct raft_io *io, size_t size); /** * Turn snapshot compression on or off. * Returns non-0 on failure; this can happen e.g. when compression is requested * but no suitable compression library is found. * * By default snapshots are compressed if the appropriate libraries are found. */ RAFT_API int raft_uv_set_snapshot_compression(struct raft_io *io, bool compressed); /** * Set how many milliseconds to wait between subsequent retries when * establishing a connection with another server. The default is 1000 * milliseconds. */ RAFT_API void raft_uv_set_connect_retry_delay(struct raft_io *io, unsigned msecs); /** * Emit low-level debug messages using the given tracer. */ RAFT_API void raft_uv_set_tracer(struct raft_io *io, struct raft_tracer *tracer); /** * Enable or disable auto-recovery on startup. Default enabled. */ RAFT_API void raft_uv_set_auto_recovery(struct raft_io *io, bool flag); /** * Callback invoked by the transport implementation when a new incoming * connection has been established. * * No references to @address must be kept after this function returns. * * Ownership of @stream is transferred to user code, which is responsible for * uv_close()'ing it and then releasing its memory. */ typedef void (*raft_uv_accept_cb)(struct raft_uv_transport *t, raft_id id, const char *address, struct uv_stream_s *stream); /** * Callback invoked by the transport implementation after a connect request has * completed. If status is #0, then @stream will point to a valid handle, which * user code is then responsible for uv_close()'ing and then releasing. */ struct raft_uv_connect; typedef void (*raft_uv_connect_cb)(struct raft_uv_connect *req, struct uv_stream_s *stream, int status); /** * Handle to a connect request. */ struct raft_uv_connect { void *data; /* User data */ raft_uv_connect_cb cb; /* Callback */ }; /** * Callback invoked by the transport implementation after a close request is * completed. */ typedef void (*raft_uv_transport_close_cb)(struct raft_uv_transport *t); /** * Interface to establish outgoing connections to other Raft servers and to * accept incoming connections from them. */ struct raft_uv_transport { /** * Keep track of struct version, MUST be filled out by user. * When moving to a new version, the user MUST implement the newly added * methods. * Latest version is 1. */ int version; /** * User defined data. */ void *data; /** * Implementation-defined state. */ void *impl; /** * Human-readable message providing diagnostic information about the * last error that occurred.
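 Implementations are expected to fill
 * this buffer before failing any of the methods below.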
*/ char errmsg[RAFT_ERRMSG_BUF_SIZE]; /** * Initialize the transport with the given server's identity. */ int (*init)(struct raft_uv_transport *t, raft_id id, const char *address); /** * Start listening for incoming connections. * * Once a new connection is accepted, the @cb callback passed in the * initializer must be invoked with the relevant details of the * connecting Raft server. */ int (*listen)(struct raft_uv_transport *t, raft_uv_accept_cb cb); /** * Connect to the server with the given ID and address. * * The @cb callback must be invoked when the connection has been * established or the connection attempt has failed. The memory pointed * by @req can be released only after @cb has fired. */ int (*connect)(struct raft_uv_transport *t, struct raft_uv_connect *req, raft_id id, const char *address, raft_uv_connect_cb cb); /** * Close the transport. * * The implementation must: * * - Stop accepting incoming connections. The @cb callback passed to * @listen must not be invoked anymore. * * - Cancel all pending @connect requests. * * - Invoke the @cb callback passed to this method once it's safe to * release the memory of the transport object. */ void (*close)(struct raft_uv_transport *t, raft_uv_transport_close_cb cb); }; /** * Init a transport interface that uses TCP sockets. */ RAFT_API int raft_uv_tcp_init(struct raft_uv_transport *t, struct uv_loop_s *loop); /** * Release any memory allocated internally. */ RAFT_API void raft_uv_tcp_close(struct raft_uv_transport *t); /** * Set the IP address and port that the listening socket will bind to. * * By default the socket will bind to the address provided in * raft_init(), which may be inconvenient if running your application in a * container, for example. * * The @address argument must be an IPv4 dotted quad IP address and port, e.g. * "0.0.0.0:8080". If you do not provide a port, the default of 8080 will be * used. The port given here *must* match the port given to raft_init(). * * Must be called before raft_init(). */ RAFT_API int raft_uv_tcp_set_bind_address(struct raft_uv_transport *t, const char *address); /** * Raft cluster test fixture, using an in-memory @raft_io implementation. This * is meant to be used in unit tests. */ #define RAFT_FIXTURE_MAX_SERVERS 8 /** * Fixture step event types. */ enum { RAFT_FIXTURE_TICK = 1, /* The tick callback has been invoked */ RAFT_FIXTURE_NETWORK, /* A network request has been sent or received */ RAFT_FIXTURE_DISK, /* An I/O request has been submitted */ RAFT_FIXTURE_WORK /* A large, CPU and/or memory intensive task */ }; /** * State of a single server in a cluster fixture. */ struct raft_fixture_server; /** * Information about a test cluster event triggered by the fixture. */ struct raft_fixture_event; /** * Returns the type of the event. */ int raft_fixture_event_type(struct raft_fixture_event *event); /** * Returns the server index of the event. */ unsigned raft_fixture_event_server_index(struct raft_fixture_event *event); /** * Event callback. See raft_fixture_hook(). */ struct raft_fixture; typedef void (*raft_fixture_event_cb)(struct raft_fixture *f, struct raft_fixture_event *event); /** * Test implementation of a cluster of @n servers, each having a user-provided * FSM. * * The cluster can simulate network latency and time elapsed on individual * servers. * * Servers can be alive or dead. Network messages sent to dead servers are * dropped. Dead servers do not have their @raft_io_tick_cb callback invoked. * * Any two servers can be connected or disconnected. 
Network messages sent * between disconnected servers are dropped. */ struct raft_fixture { raft_time time; /* Global time, common to all servers. */ unsigned n; /* Number of servers. */ raft_id leader_id; /* ID of current leader, or 0 if none. */ struct raft_log *log; /* Copy of current leader's log. */ raft_index commit_index; /* Current commit index on leader. */ struct raft_fixture_event *event; /* Last event that occurred. */ raft_fixture_event_cb hook; /* Event callback. */ struct raft_fixture_server *servers[RAFT_FIXTURE_MAX_SERVERS]; uint64_t reserved[16]; /* For future expansion of struct. */ }; /** * Initialize a raft cluster fixture. Servers can be added by using * `raft_fixture_grow`. */ RAFT_API int raft_fixture_init(struct raft_fixture *f); /** * Release all memory used by the fixture. */ RAFT_API void raft_fixture_close(struct raft_fixture *f); /** * Convenience to generate a configuration object containing all servers in the * cluster. The first @n_voting servers will be voting ones. */ RAFT_API int raft_fixture_configuration(struct raft_fixture *f, unsigned n_voting, struct raft_configuration *conf); /** * Convenience to bootstrap all servers in the cluster using the given * configuration. */ RAFT_API int raft_fixture_bootstrap(struct raft_fixture *f, struct raft_configuration *conf); /** * Convenience to start all servers in the fixture. */ RAFT_API int raft_fixture_start(struct raft_fixture *f); /** * Return the number of servers in the fixture. */ RAFT_API unsigned raft_fixture_n(struct raft_fixture *f); /** * Return the current cluster global time. All raft instances see the same time. */ RAFT_API raft_time raft_fixture_time(struct raft_fixture *f); /** * Return the raft instance associated with the @i'th server of the fixture. */ RAFT_API struct raft *raft_fixture_get(struct raft_fixture *f, unsigned i); /** * Return @true if the @i'th server hasn't been killed. */ RAFT_API bool raft_fixture_alive(struct raft_fixture *f, unsigned i); /** * Return the index of the current leader, or the current number of servers if * there's no leader. */ RAFT_API unsigned raft_fixture_leader_index(struct raft_fixture *f); /** * Return the ID of the server the @i'th server has voted for, or zero if it * hasn't voted for anyone. */ RAFT_API raft_id raft_fixture_voted_for(struct raft_fixture *f, unsigned i); /** * Drive the cluster so the @i'th server starts an election but doesn't * necessarily win it. * * This is achieved by bumping the randomized election timeout of all other * servers to a very high value, so that only the timeout of the @i'th server * expires. * * There must currently be no leader and no candidate and the given server must * be a voting one. Also, the @i'th server must be connected to a majority of * voting servers. */ RAFT_API void raft_fixture_start_elect(struct raft_fixture *f, unsigned i); /** * Calls raft_fixture_start_elect(), then waits for and asserts that the @i'th * server has become the leader. */ RAFT_API void raft_fixture_elect(struct raft_fixture *f, unsigned i); /** * Drive the cluster so the current leader gets deposed. * * This is achieved by dropping all AppendEntries result messages sent by * followers to the leader, until the leader decides to step down because it has * lost connectivity to a majority of followers. */ RAFT_API void raft_fixture_depose(struct raft_fixture *f); /** * Step through the cluster state, advancing the time to the minimum value * needed for it to make progress (i.e. for a message to be delivered, for an * I/O operation to complete or for a single time tick to occur).
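 *
 * A test would typically drive the cluster with a loop along these lines
 * (sketch; `f` is an initialized and started fixture, and the condition
 * uses the raft_fixture_leader_index() convention described above):
 *
 *   while (raft_fixture_leader_index(&f) == raft_fixture_n(&f)) {
 *           raft_fixture_step(&f);
 *   }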
 * * In particular, the following happens: * * 1. If there are pending #raft_io_send requests that have been submitted * using #raft_io->send() and not yet sent, the oldest one is picked and the * relevant callback fired. This simulates completion of a socket write, * which means that the send request has been completed. The receiver does * not immediately receive the message, as the message is propagating * through the network. However, any memory associated with the #raft_io_send * request can be released (e.g. log entries). The in-memory I/O * implementation assigns a latency to each RPC message, which will get * delivered to the receiver only after that amount of time elapses. If the * sender and the receiver are currently disconnected, the RPC message is * simply dropped. If a callback was fired, jump directly to 3. and skip 2. * * 2. All pending #raft_io_append disk writes across all servers that have been * submitted using #raft_io->append() but not yet completed are scanned, and * the one with the lowest completion time is picked. All in-flight network * messages waiting to be delivered are scanned and the one with the lowest * delivery time is picked. All servers are scanned, and the one with the * lowest tick expiration time is picked. The three times are compared and * the lowest one is picked. If a #raft_io_append disk write has completed, * the relevant callback will be invoked; if there's a network message to be * delivered, the receiver's @raft_io_recv_cb callback gets fired; if a tick * timer has expired, the relevant #raft_io->tick() callback will be * invoked. Only one event will be fired. If there is more than one event to * fire, one of them is picked according to the following rules: events for * servers with lower index are fired first, tick events take precedence over * disk events, and disk events take precedence over network events. * * 3. The current cluster leader is detected (if any). When detecting the leader * the Election Safety property is checked: no two servers can be in leader * state for the same term. The server in leader state with the highest term is * considered the current cluster leader, as long as it's "stable", i.e. it * has been acknowledged by all servers connected to it, and those servers * form a majority (this means that no further leader change can happen, * unless the network gets disrupted). If there is a stable leader and it has * not changed with respect to the previous call to @raft_fixture_step(), * then the Leader Append-Only property is checked, by comparing its log with * a copy of it that was taken during the previous iteration. * * 4. If there is a stable leader, its current log is copied, in order to be * able to check the Leader Append-Only property at the next call. * * 5. If there is a stable leader, its commit index gets copied. * * The function returns information about which particular event occurred * (either in step 1 or 2). */ RAFT_API struct raft_fixture_event *raft_fixture_step(struct raft_fixture *f); /** * Call raft_fixture_step() exactly @n times, and return the last event fired. */ RAFT_API struct raft_fixture_event *raft_fixture_step_n(struct raft_fixture *f, unsigned n); /** * Step the cluster until the given @stop function returns #true, or @max_msecs * have elapsed. * * Return #true if the @stop function has returned #true within @max_msecs.
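 *
 * For example, a @stop function that waits for server 0 to reach a given
 * term could look like this (a minimal sketch; the target term is passed
 * through @arg, and `current_term` is read from the public struct raft):
 *
 *   static bool server_0_has_term(struct raft_fixture *f, void *arg)
 *   {
 *           raft_term *term = arg;
 *           return raft_fixture_get(f, 0)->current_term >= *term;
 *   }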
*/ RAFT_API bool raft_fixture_step_until(struct raft_fixture *f, bool (*stop)(struct raft_fixture *f, void *arg), void *arg, unsigned max_msecs); /** * Step the cluster until @msecs have elapsed. */ RAFT_API void raft_fixture_step_until_elapsed(struct raft_fixture *f, unsigned msecs); /** * Step the cluster until a leader is elected, or @max_msecs have elapsed. */ RAFT_API bool raft_fixture_step_until_has_leader(struct raft_fixture *f, unsigned max_msecs); /** * Step the cluster until the current leader gets deposed, or @max_msecs have * elapsed. */ RAFT_API bool raft_fixture_step_until_has_no_leader(struct raft_fixture *f, unsigned max_msecs); /** * Step the cluster until the @i'th server has applied the entry at the given * index, or @max_msecs have elapsed. If @i equals the number of servers, then * step until all servers have applied the given entry. */ RAFT_API bool raft_fixture_step_until_applied(struct raft_fixture *f, unsigned i, raft_index index, unsigned max_msecs); /** * Step the cluster until the state of the @i'th server matches the given one, * or @max_msecs have elapsed. */ RAFT_API bool raft_fixture_step_until_state_is(struct raft_fixture *f, unsigned i, int state, unsigned max_msecs); /** * Step the cluster until the term of the @i'th server matches the given one, * or @max_msecs have elapsed. */ RAFT_API bool raft_fixture_step_until_term_is(struct raft_fixture *f, unsigned i, raft_term term, unsigned max_msecs); /** * Step the cluster until the @i'th server has voted for the @j'th one, or * @max_msecs have elapsed. */ RAFT_API bool raft_fixture_step_until_voted_for(struct raft_fixture *f, unsigned i, unsigned j, unsigned max_msecs); /** * Step the cluster until all pending network messages from the @i'th server to * the @j'th server have been delivered, or @max_msecs have elapsed. */ RAFT_API bool raft_fixture_step_until_delivered(struct raft_fixture *f, unsigned i, unsigned j, unsigned max_msecs); /** * Set a function to be called after every time a fixture event occurs as * consequence of a step. */ RAFT_API void raft_fixture_hook(struct raft_fixture *f, raft_fixture_event_cb hook); /** * Disconnect the @i'th and the @j'th servers, so attempts to send a message * from @i to @j will fail with #RAFT_NOCONNECTION. */ RAFT_API void raft_fixture_disconnect(struct raft_fixture *f, unsigned i, unsigned j); /** * Reconnect the @i'th and the @j'th servers, so attempts to send a message * from @i to @j will succeed again. */ RAFT_API void raft_fixture_reconnect(struct raft_fixture *f, unsigned i, unsigned j); /** * Saturate the connection between the @i'th and the @j'th servers, so messages * sent by @i to @j will be silently dropped. */ RAFT_API void raft_fixture_saturate(struct raft_fixture *f, unsigned i, unsigned j); /** * Return true if the connection from the @i'th to the @j'th server has been set * as saturated. */ RAFT_API bool raft_fixture_saturated(struct raft_fixture *f, unsigned i, unsigned j); /** * Desaturate the connection between the @i'th and the @j'th servers, so * messages sent by @i to @j will start being delivered again. */ RAFT_API void raft_fixture_desaturate(struct raft_fixture *f, unsigned i, unsigned j); /** * Kill the server with the given index. The server won't receive any message * and its tick callback won't be invoked. */ RAFT_API void raft_fixture_kill(struct raft_fixture *f, unsigned i); /** * Revive a killed server with the given index. 
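 *
 * Together with raft_fixture_kill(), this can be used to simulate a crash
 * followed by a restart, e.g. (sketch):
 *
 *   raft_fixture_kill(&f, 2);
 *   raft_fixture_step_until_elapsed(&f, 5000);
 *   raft_fixture_revive(&f, 2);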
 */ RAFT_API void raft_fixture_revive(struct raft_fixture *f, unsigned i); /** * Add a new empty server to the cluster and connect it to all others. */ RAFT_API int raft_fixture_grow(struct raft_fixture *f, struct raft_fsm *fsm); /** * Set the value that will be returned to the @i'th raft instance when it asks * the underlying #raft_io implementation for a randomized election timeout * value. The default value is 1000 + @i * 100, meaning that the election timer * of server 0 will expire first. */ RAFT_API void raft_fixture_set_randomized_election_timeout( struct raft_fixture *f, unsigned i, unsigned msecs); /** * Set the network latency in milliseconds. Each RPC message sent by the @i'th * server from now on will take @msecs milliseconds to be delivered. The default * value is 15. */ RAFT_API void raft_fixture_set_network_latency(struct raft_fixture *f, unsigned i, unsigned msecs); /** * Set the disk I/O latency in milliseconds. Each append request will take this * many milliseconds to complete. The default value is 10. */ RAFT_API void raft_fixture_set_disk_latency(struct raft_fixture *f, unsigned i, unsigned msecs); /** * Set the send latency in milliseconds. Each message send will take this many * milliseconds before the send callback is invoked. */ RAFT_API void raft_fixture_set_send_latency(struct raft_fixture *f, unsigned i, unsigned j, unsigned msecs); /** * Set the persisted term of the @i'th server. */ RAFT_API void raft_fixture_set_term(struct raft_fixture *f, unsigned i, raft_term term); /** * Set the most recent persisted snapshot on the @i'th server. */ RAFT_API void raft_fixture_set_snapshot(struct raft_fixture *f, unsigned i, struct raft_snapshot *snapshot); /** * Add an entry to the persisted entries of the @i'th server. */ RAFT_API void raft_fixture_add_entry(struct raft_fixture *f, unsigned i, struct raft_entry *entry); RAFT_API void raft_fixture_append_fault(struct raft_fixture *f, unsigned i, int delay); RAFT_API void raft_fixture_vote_fault(struct raft_fixture *f, unsigned i, int delay); RAFT_API void raft_fixture_term_fault(struct raft_fixture *f, unsigned i, int delay); RAFT_API void raft_fixture_send_fault(struct raft_fixture *f, unsigned i, int delay); /** * Return the number of messages of the given type that the @i'th server has * successfully sent so far. */ RAFT_API unsigned raft_fixture_n_send(struct raft_fixture *f, unsigned i, int type); /** * Return the number of messages of the given type that the @i'th server has * received so far. */ RAFT_API unsigned raft_fixture_n_recv(struct raft_fixture *f, unsigned i, int type); /** * Force the @i'th server into the UNAVAILABLE state. */ RAFT_API void raft_fixture_make_unavailable(struct raft_fixture *f, unsigned i); #endif /* RAFT_H */ dqlite-1.16.7/src/raft/000077500000000000000000000000001465252713400146245ustar00rootroot00000000000000dqlite-1.16.7/src/raft/array.h000066400000000000000000000016541465252713400161210ustar00rootroot00000000000000/* Macros to manipulate contiguous arrays. */ #ifndef ARRAY_H_ #define ARRAY_H_ #include "../raft.h" /* Append item I of type T to array A which currently has N items. * * A and N must both be pointers. Set RV to -1 in case of failure.
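 *
 * A minimal usage sketch (growing an array of integers by one item):
 *
 *   int *values = NULL;
 *   unsigned n = 0;
 *   int rv;
 *   ARRAY__APPEND(int, 42, &values, &n, rv);
 *   if (rv == -1) {
 *           // out of memory, values is unchanged
 *   }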
*/ #define ARRAY__APPEND(T, I, A, N, RV) \ { \ T *tmp_array; \ tmp_array = raft_realloc(*A, (*N + 1) * sizeof **A); \ if (tmp_array != NULL) { \ (*N)++; \ *A = tmp_array; \ (*A)[(*N) - 1] = I; \ RV = 0; \ } else { \ RV = -1; \ } \ } #endif /* ARRAY_H_ */ dqlite-1.16.7/src/raft/assert.h000066400000000000000000000026621465252713400163040ustar00rootroot00000000000000/* Define the assert() macro, either as the standard one or the test one. */ #ifndef ASSERT_H_ #define ASSERT_H_ #if defined(RAFT_TEST) extern void munit_errorf_ex(const char *filename, int line, const char *format, ...); #define assert(expr) \ do { \ if (!expr) { \ munit_errorf_ex(__FILE__, __LINE__, \ "assertion failed: ", #expr); \ } \ } while (0) #elif defined(NDEBUG) #define assert(x) \ do { \ (void)sizeof(x); \ } while (0) #elif defined(RAFT_ASSERT_WITH_BACKTRACE) #include /* for __assert_fail */ #include #include #undef assert #define assert(x) \ do { \ struct backtrace_state *state_; \ if (!(x)) { \ state_ = backtrace_create_state(NULL, 0, NULL, NULL); \ backtrace_print(state_, 0, stderr); \ __assert_fail(#x, __FILE__, __LINE__, __func__); \ } \ } while (0) #else #include #endif #endif /* ASSERT_H_ */ dqlite-1.16.7/src/raft/byte.c000066400000000000000000000304741465252713400157430ustar00rootroot00000000000000#include "byte.h" /* Taken from https://github.com/gcc-mirror/gcc/blob/master/libiberty/crc32.c */ static const unsigned byteCrcTable[] = { 0x00000000, 0x04c11db7, 0x09823b6e, 0x0d4326d9, 0x130476dc, 0x17c56b6b, 0x1a864db2, 0x1e475005, 0x2608edb8, 0x22c9f00f, 0x2f8ad6d6, 0x2b4bcb61, 0x350c9b64, 0x31cd86d3, 0x3c8ea00a, 0x384fbdbd, 0x4c11db70, 0x48d0c6c7, 0x4593e01e, 0x4152fda9, 0x5f15adac, 0x5bd4b01b, 0x569796c2, 0x52568b75, 0x6a1936c8, 0x6ed82b7f, 0x639b0da6, 0x675a1011, 0x791d4014, 0x7ddc5da3, 0x709f7b7a, 0x745e66cd, 0x9823b6e0, 0x9ce2ab57, 0x91a18d8e, 0x95609039, 0x8b27c03c, 0x8fe6dd8b, 0x82a5fb52, 0x8664e6e5, 0xbe2b5b58, 0xbaea46ef, 0xb7a96036, 0xb3687d81, 0xad2f2d84, 0xa9ee3033, 0xa4ad16ea, 0xa06c0b5d, 0xd4326d90, 0xd0f37027, 0xddb056fe, 0xd9714b49, 0xc7361b4c, 0xc3f706fb, 0xceb42022, 0xca753d95, 0xf23a8028, 0xf6fb9d9f, 0xfbb8bb46, 0xff79a6f1, 0xe13ef6f4, 0xe5ffeb43, 0xe8bccd9a, 0xec7dd02d, 0x34867077, 0x30476dc0, 0x3d044b19, 0x39c556ae, 0x278206ab, 0x23431b1c, 0x2e003dc5, 0x2ac12072, 0x128e9dcf, 0x164f8078, 0x1b0ca6a1, 0x1fcdbb16, 0x018aeb13, 0x054bf6a4, 0x0808d07d, 0x0cc9cdca, 0x7897ab07, 0x7c56b6b0, 0x71159069, 0x75d48dde, 0x6b93dddb, 0x6f52c06c, 0x6211e6b5, 0x66d0fb02, 0x5e9f46bf, 0x5a5e5b08, 0x571d7dd1, 0x53dc6066, 0x4d9b3063, 0x495a2dd4, 0x44190b0d, 0x40d816ba, 0xaca5c697, 0xa864db20, 0xa527fdf9, 0xa1e6e04e, 0xbfa1b04b, 0xbb60adfc, 0xb6238b25, 0xb2e29692, 0x8aad2b2f, 0x8e6c3698, 0x832f1041, 0x87ee0df6, 0x99a95df3, 0x9d684044, 0x902b669d, 0x94ea7b2a, 0xe0b41de7, 0xe4750050, 0xe9362689, 0xedf73b3e, 0xf3b06b3b, 0xf771768c, 0xfa325055, 0xfef34de2, 0xc6bcf05f, 0xc27dede8, 0xcf3ecb31, 0xcbffd686, 0xd5b88683, 0xd1799b34, 0xdc3abded, 0xd8fba05a, 0x690ce0ee, 0x6dcdfd59, 0x608edb80, 0x644fc637, 0x7a089632, 0x7ec98b85, 0x738aad5c, 0x774bb0eb, 0x4f040d56, 0x4bc510e1, 0x46863638, 0x42472b8f, 0x5c007b8a, 0x58c1663d, 0x558240e4, 0x51435d53, 0x251d3b9e, 0x21dc2629, 0x2c9f00f0, 0x285e1d47, 0x36194d42, 0x32d850f5, 0x3f9b762c, 0x3b5a6b9b, 0x0315d626, 0x07d4cb91, 0x0a97ed48, 0x0e56f0ff, 0x1011a0fa, 0x14d0bd4d, 0x19939b94, 0x1d528623, 0xf12f560e, 0xf5ee4bb9, 0xf8ad6d60, 0xfc6c70d7, 0xe22b20d2, 0xe6ea3d65, 0xeba91bbc, 0xef68060b, 0xd727bbb6, 0xd3e6a601, 0xdea580d8, 0xda649d6f, 0xc423cd6a, 0xc0e2d0dd, 0xcda1f604, 0xc960ebb3, 
0xbd3e8d7e, 0xb9ff90c9, 0xb4bcb610, 0xb07daba7, 0xae3afba2, 0xaafbe615, 0xa7b8c0cc, 0xa379dd7b, 0x9b3660c6, 0x9ff77d71, 0x92b45ba8, 0x9675461f, 0x8832161a, 0x8cf30bad, 0x81b02d74, 0x857130c3, 0x5d8a9099, 0x594b8d2e, 0x5408abf7, 0x50c9b640, 0x4e8ee645, 0x4a4ffbf2, 0x470cdd2b, 0x43cdc09c, 0x7b827d21, 0x7f436096, 0x7200464f, 0x76c15bf8, 0x68860bfd, 0x6c47164a, 0x61043093, 0x65c52d24, 0x119b4be9, 0x155a565e, 0x18197087, 0x1cd86d30, 0x029f3d35, 0x065e2082, 0x0b1d065b, 0x0fdc1bec, 0x3793a651, 0x3352bbe6, 0x3e119d3f, 0x3ad08088, 0x2497d08d, 0x2056cd3a, 0x2d15ebe3, 0x29d4f654, 0xc5a92679, 0xc1683bce, 0xcc2b1d17, 0xc8ea00a0, 0xd6ad50a5, 0xd26c4d12, 0xdf2f6bcb, 0xdbee767c, 0xe3a1cbc1, 0xe760d676, 0xea23f0af, 0xeee2ed18, 0xf0a5bd1d, 0xf464a0aa, 0xf9278673, 0xfde69bc4, 0x89b8fd09, 0x8d79e0be, 0x803ac667, 0x84fbdbd0, 0x9abc8bd5, 0x9e7d9662, 0x933eb0bb, 0x97ffad0c, 0xafb010b1, 0xab710d06, 0xa6322bdf, 0xa2f33668, 0xbcb4666d, 0xb8757bda, 0xb5365d03, 0xb1f740b4}; unsigned byteCrc32(const void *buf, const size_t size, const unsigned init) { unsigned crc = init; uint8_t *cursor = (uint8_t *)buf; size_t count = size; while (count--) { crc = (crc << 8) ^ byteCrcTable[((crc >> 24) ^ *cursor) & 255]; cursor++; } return crc; } /* ================ sha1.c ================ */ /* SHA-1 in C By Steve Reid 100% Public Domain Test Vectors (from FIPS PUB 180-1) "abc" A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq" 84983E44 1C3BD26E BAAE4AA1 F95129E5 E54670F1 A million repetitions of "a" 34AA973C D4C4DAA4 F61EEB2B DBAD2731 6534016F */ /* #define LITTLE_ENDIAN * This should be #define'd already, if true. */ /* #define SHA1HANDSOFF * Copies data before messing with it. */ #define SHA1HANDSOFF #include #include #include /* for u_int*_t */ #if defined(__sun) #include "solarisfixes.h" #endif #ifndef BYTE_ORDER #if (BSD >= 199103) #include #else #if defined(linux) || defined(__linux__) #include #else #define LITTLE_ENDIAN 1234 /* least-significant byte first (vax, pc) */ #define BIG_ENDIAN 4321 /* most-significant byte first (IBM, net) */ #define PDP_ENDIAN 3412 /* LSB first in word, MSW first in long (pdp)*/ #if defined(vax) || defined(ns32000) || defined(sun386) || \ defined(__i386__) || defined(MIPSEL) || defined(_MIPSEL) || \ defined(BIT_ZERO_ON_RIGHT) || defined(__alpha__) || defined(__alpha) #define BYTE_ORDER LITTLE_ENDIAN #endif #if defined(sel) || defined(pyr) || defined(mc68000) || defined(sparc) || \ defined(is68k) || defined(tahoe) || defined(ibm032) || defined(ibm370) || \ defined(MIPSEB) || defined(_MIPSEB) || defined(_IBMR2) || defined(DGUX) || \ defined(apollo) || defined(__convex__) || defined(_CRAY) || \ defined(__hppa) || defined(__hp9000) || defined(__hp9000s300) || \ defined(__hp9000s700) || defined(BIT_ZERO_ON_LEFT) || defined(m68k) || \ defined(__sparc) #define BYTE_ORDER BIG_ENDIAN #endif #endif /* linux */ #endif /* BSD */ #endif /* BYTE_ORDER */ #if defined(__BYTE_ORDER) && !defined(BYTE_ORDER) #if (__BYTE_ORDER == __LITTLE_ENDIAN) #define BYTE_ORDER LITTLE_ENDIAN #else #define BYTE_ORDER BIG_ENDIAN #endif #endif #if !defined(BYTE_ORDER) || \ (BYTE_ORDER != BIG_ENDIAN && BYTE_ORDER != LITTLE_ENDIAN && \ BYTE_ORDER != PDP_ENDIAN) /* you must determine what the correct bit order is for * your compiler - the next line is an intentional error * which will force your compiles to bomb until you fix * the above macros. 
*/ #error "Undefined or invalid BYTE_ORDER" #endif #define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits)))) /* blk0() and blk() perform the initial expand. */ /* I got the idea of expanding during the round function from SSLeay */ #if BYTE_ORDER == LITTLE_ENDIAN #define blk0(i) \ (block->l[i] = (rol(block->l[i], 24) & 0xFF00FF00) | \ (rol(block->l[i], 8) & 0x00FF00FF)) #elif BYTE_ORDER == BIG_ENDIAN #define blk0(i) block->l[i] #else #error "Endianness not defined!" #endif #define blk(i) \ (block->l[i & 15] = \ rol(block->l[(i + 13) & 15] ^ block->l[(i + 8) & 15] ^ \ block->l[(i + 2) & 15] ^ block->l[i & 15], \ 1)) /* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */ #define R0(v, w, x, y, z, i) \ z += ((w & (x ^ y)) ^ y) + blk0(i) + 0x5A827999 + rol(v, 5); \ w = rol(w, 30); #define R1(v, w, x, y, z, i) \ z += ((w & (x ^ y)) ^ y) + blk(i) + 0x5A827999 + rol(v, 5); \ w = rol(w, 30); #define R2(v, w, x, y, z, i) \ z += (w ^ x ^ y) + blk(i) + 0x6ED9EBA1 + rol(v, 5); \ w = rol(w, 30); #define R3(v, w, x, y, z, i) \ z += (((w | x) & y) | (w & x)) + blk(i) + 0x8F1BBCDC + rol(v, 5); \ w = rol(w, 30); #define R4(v, w, x, y, z, i) \ z += (w ^ x ^ y) + blk(i) + 0xCA62C1D6 + rol(v, 5); \ w = rol(w, 30); static void byteSha1Transform(uint32_t state[5], const uint8_t buffer[64]) { uint32_t a, b, c, d, e; typedef union { uint8_t c[64]; uint32_t l[16]; } CHAR64LONG16; #ifdef SHA1HANDSOFF CHAR64LONG16 block[1]; /* use array to appear as a pointer */ memcpy(block, buffer, 64); #else /* The following had better never be used because it causes the * pointer-to-const buffer to be cast into a pointer to non-const. * And the result is written through. I threw a "const" in, hoping * this will cause a diagnostic. */ CHAR64LONG16 *block = (const CHAR64LONG16 *)buffer; #endif /* Copy context->state[] to working vars */ a = state[0]; b = state[1]; c = state[2]; d = state[3]; e = state[4]; /* 4 rounds of 20 operations each. Loop unrolled. 
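 * R0 and R1 use the "choice" function with constant 0x5A827999, R2 the
 * parity function with 0x6ED9EBA1, R3 the "majority" function with
 * 0x8F1BBCDC, and R4 the parity function again with 0xCA62C1D6, matching
 * the four SHA-1 round types defined by the macros above.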
*/ R0(a, b, c, d, e, 0); R0(e, a, b, c, d, 1); R0(d, e, a, b, c, 2); R0(c, d, e, a, b, 3); R0(b, c, d, e, a, 4); R0(a, b, c, d, e, 5); R0(e, a, b, c, d, 6); R0(d, e, a, b, c, 7); R0(c, d, e, a, b, 8); R0(b, c, d, e, a, 9); R0(a, b, c, d, e, 10); R0(e, a, b, c, d, 11); R0(d, e, a, b, c, 12); R0(c, d, e, a, b, 13); R0(b, c, d, e, a, 14); R0(a, b, c, d, e, 15); R1(e, a, b, c, d, 16); R1(d, e, a, b, c, 17); R1(c, d, e, a, b, 18); R1(b, c, d, e, a, 19); R2(a, b, c, d, e, 20); R2(e, a, b, c, d, 21); R2(d, e, a, b, c, 22); R2(c, d, e, a, b, 23); R2(b, c, d, e, a, 24); R2(a, b, c, d, e, 25); R2(e, a, b, c, d, 26); R2(d, e, a, b, c, 27); R2(c, d, e, a, b, 28); R2(b, c, d, e, a, 29); R2(a, b, c, d, e, 30); R2(e, a, b, c, d, 31); R2(d, e, a, b, c, 32); R2(c, d, e, a, b, 33); R2(b, c, d, e, a, 34); R2(a, b, c, d, e, 35); R2(e, a, b, c, d, 36); R2(d, e, a, b, c, 37); R2(c, d, e, a, b, 38); R2(b, c, d, e, a, 39); R3(a, b, c, d, e, 40); R3(e, a, b, c, d, 41); R3(d, e, a, b, c, 42); R3(c, d, e, a, b, 43); R3(b, c, d, e, a, 44); R3(a, b, c, d, e, 45); R3(e, a, b, c, d, 46); R3(d, e, a, b, c, 47); R3(c, d, e, a, b, 48); R3(b, c, d, e, a, 49); R3(a, b, c, d, e, 50); R3(e, a, b, c, d, 51); R3(d, e, a, b, c, 52); R3(c, d, e, a, b, 53); R3(b, c, d, e, a, 54); R3(a, b, c, d, e, 55); R3(e, a, b, c, d, 56); R3(d, e, a, b, c, 57); R3(c, d, e, a, b, 58); R3(b, c, d, e, a, 59); R4(a, b, c, d, e, 60); R4(e, a, b, c, d, 61); R4(d, e, a, b, c, 62); R4(c, d, e, a, b, 63); R4(b, c, d, e, a, 64); R4(a, b, c, d, e, 65); R4(e, a, b, c, d, 66); R4(d, e, a, b, c, 67); R4(c, d, e, a, b, 68); R4(b, c, d, e, a, 69); R4(a, b, c, d, e, 70); R4(e, a, b, c, d, 71); R4(d, e, a, b, c, 72); R4(c, d, e, a, b, 73); R4(b, c, d, e, a, 74); R4(a, b, c, d, e, 75); R4(e, a, b, c, d, 76); R4(d, e, a, b, c, 77); R4(c, d, e, a, b, 78); R4(b, c, d, e, a, 79); /* Add the working vars back into context.state[] */ state[0] += a; state[1] += b; state[2] += c; state[3] += d; state[4] += e; /* Wipe variables */ a = b = c = d = e = 0; #ifdef SHA1HANDSOFF memset(block, '\0', sizeof(block)); #endif } void byteSha1Init(struct byteSha1 *s) { /* SHA1 initialization constants */ s->state[0] = 0x67452301; s->state[1] = 0xEFCDAB89; s->state[2] = 0x98BADCFE; s->state[3] = 0x10325476; s->state[4] = 0xC3D2E1F0; s->count[0] = s->count[1] = 0; } /* Run your data through this. */ void __attribute__((noinline)) byteSha1Update(struct byteSha1 *s, const uint8_t *data, uint32_t len) { uint32_t i; uint32_t j; j = s->count[0]; if ((s->count[0] += len << 3) < j) s->count[1]++; s->count[1] += (len >> 29); j = (j >> 3) & 63; if ((j + len) > 63) { memcpy(&s->buffer[j], data, (i = 64 - j)); byteSha1Transform(s->state, s->buffer); for (; i + 63 < len; i += 64) { byteSha1Transform(s->state, &data[i]); } j = 0; } else i = 0; memcpy(&s->buffer[j], &data[i], len - i); } /* Add padding and return the message digest. */ void byteSha1Digest(struct byteSha1 *s, uint8_t value[20]) { unsigned i; uint8_t finalcount[8]; uint8_t c; #if 0 /* untested "improvement" by DHR */ /* Convert context->count to a sequence of bytes * in finalcount. Second element first, but * big-endian order within element. * But we do it all backwards. */ uint8_t *fcp = &finalcount[8]; for (i = 0; i < 2; i++) { u_int32_t t = context->count[i]; int j; for (j = 0; j < 4; t >>= 8, j++) *--fcp = (uint8_t) t } #else for (i = 0; i < 8; i++) { finalcount[i] = (uint8_t)((s->count[(i >= 4 ? 
0 : 1)] >> ((3 - (i & 3)) * 8)) & 255); /* Endian independent */ } #endif c = 0200; byteSha1Update(s, &c, 1); while ((s->count[0] & 504) != 448) { c = 0000; byteSha1Update(s, &c, 1); } byteSha1Update(s, finalcount, 8); /* Should cause a SHA1Transform() */ for (i = 0; i < 20; i++) { value[i] = (uint8_t)((s->state[i >> 2] >> ((3 - (i & 3)) * 8)) & 255); } /* Wipe variables */ memset(s, '\0', sizeof(*s)); memset(&finalcount, '\0', sizeof(finalcount)); } /* ================ end of sha1.c ================ */ dqlite-1.16.7/src/raft/byte.h000066400000000000000000000074051465252713400157460ustar00rootroot00000000000000/* Byte-level utilities. */ #ifndef BYTE_H_ #define BYTE_H_ #include #include #include #if defined(__cplusplus) #define BYTE__INLINE inline #else #if defined(__clang__) #define BYTE__INLINE static inline __attribute__((unused)) #else #define BYTE__INLINE static inline #endif #endif /* Compile-time endianess detection (best effort). */ #if (defined(__BYTE_ORDER) && (__BYTE_ORDER == __LITTLE_ENDIAN)) || \ (defined(__ARMEL__) && (__ARMEL__ == 1)) #define BYTE__LITTLE_ENDIAN #elif defined(__BYTE_ORDER) && (__BYTE_ORDER == __BIG_ENDIAN) && \ defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 8 #define RAFT__BIG_ENDIAN #endif /* Flip a 32-bit number to network byte order (little endian) */ BYTE__INLINE uint32_t byteFlip32(uint32_t v) { #if defined(BYTE__LITTLE_ENDIAN) return v; #elif defined(RAFT__BIG_ENDIAN) return __builtin_bswap32(v); #else /* Unknown endianess */ union { uint32_t u; uint8_t v[4]; } s; s.v[0] = (uint8_t)v; s.v[1] = (uint8_t)(v >> 8); s.v[2] = (uint8_t)(v >> 16); s.v[3] = (uint8_t)(v >> 24); return s.u; #endif } /* Flip a 64-bit number to network byte order (little endian) */ BYTE__INLINE uint64_t byteFlip64(uint64_t v) { #if defined(BYTE__LITTLE_ENDIAN) return v; #elif defined(RAFT__BIG_ENDIAN) return __builtin_bswap64(v); #else union { uint64_t u; uint8_t v[8]; } s; s.v[0] = (uint8_t)v; s.v[1] = (uint8_t)(v >> 8); s.v[2] = (uint8_t)(v >> 16); s.v[3] = (uint8_t)(v >> 24); s.v[4] = (uint8_t)(v >> 32); s.v[5] = (uint8_t)(v >> 40); s.v[6] = (uint8_t)(v >> 48); s.v[7] = (uint8_t)(v >> 56); return s.u; #endif } BYTE__INLINE void bytePut8(void **cursor, uint8_t value) { uint8_t **p = (uint8_t **)cursor; **p = value; *p += 1; } BYTE__INLINE void bytePut32(void **cursor, uint32_t value) { unsigned i; uint32_t flipped = byteFlip32(value); for (i = 0; i < sizeof(uint32_t); i++) { bytePut8(cursor, ((uint8_t *)(&flipped))[i]); } } BYTE__INLINE void bytePut64(void **cursor, uint64_t value) { unsigned i; uint64_t flipped = byteFlip64(value); for (i = 0; i < sizeof(uint64_t); i++) { bytePut8(cursor, ((uint8_t *)(&flipped))[i]); } } BYTE__INLINE void bytePutString(void **cursor, const char *value) { char **p = (char **)cursor; strcpy(*p, value); *p += strlen(value) + 1; } BYTE__INLINE uint8_t byteGet8(const void **cursor) { const uint8_t **p = (const uint8_t **)cursor; uint8_t value = **p; *p += 1; return value; } BYTE__INLINE uint32_t byteGet32(const void **cursor) { uint32_t value = 0; unsigned i; for (i = 0; i < sizeof(uint32_t); i++) { ((uint8_t *)(&value))[i] = byteGet8(cursor); } return byteFlip32(value); } BYTE__INLINE uint64_t byteGet64(const void **cursor) { uint64_t value = 0; unsigned i; for (i = 0; i < sizeof(uint64_t); i++) { ((uint8_t *)(&value))[i] = byteGet8(cursor); } return byteFlip64(value); } BYTE__INLINE const char *byteGetString(const void **cursor, size_t max_len) { const char **p = (const char **)cursor; const char *value = *p; size_t len = 0; while (len < 
max_len) { if (*(*p + len) == 0) { break; } len++; } if (len == max_len) { return NULL; } *p += len + 1; return value; } /* Add padding to size if it's not a multiple of 8. */ BYTE__INLINE size_t bytePad64(size_t size) { size_t rest = size % sizeof(uint64_t); if (rest != 0) { size += sizeof(uint64_t) - rest; } return size; } /* Calculate the CRC32 checksum of the given data buffer. */ unsigned byteCrc32(const void *buf, size_t size, unsigned init); struct byteSha1 { uint32_t state[5]; uint32_t count[2]; uint8_t buffer[64]; uint8_t value[20]; }; void byteSha1Init(struct byteSha1 *s); void byteSha1Update(struct byteSha1 *s, const uint8_t *data, uint32_t len); void byteSha1Digest(struct byteSha1 *s, uint8_t value[20]); #endif /* BYTE_H_ */ dqlite-1.16.7/src/raft/callbacks.c000066400000000000000000000007371465252713400167160ustar00rootroot00000000000000#include "callbacks.h" #include "heap.h" int raftInitCallbacks(struct raft *r) { r->callbacks = 0; struct raft_callbacks *cbs = RaftHeapCalloc(1, sizeof(*cbs)); if (cbs == NULL) { return RAFT_NOMEM; } r->callbacks = (uint64_t)(uintptr_t)cbs; return 0; } void raftDestroyCallbacks(struct raft *r) { RaftHeapFree((void *)(uintptr_t)r->callbacks); r->callbacks = 0; } struct raft_callbacks *raftGetCallbacks(struct raft *r) { return (void *)(uintptr_t)r->callbacks; } dqlite-1.16.7/src/raft/callbacks.h000066400000000000000000000004111465252713400167100ustar00rootroot00000000000000#ifndef CALLBACKS_H_ #define CALLBACKS_H_ #include "../raft.h" struct raft_callbacks { raft_state_cb state_cb; }; int raftInitCallbacks(struct raft *r); void raftDestroyCallbacks(struct raft *r); struct raft_callbacks *raftGetCallbacks(struct raft *r); #endif dqlite-1.16.7/src/raft/client.c000066400000000000000000000227061465252713400162550ustar00rootroot00000000000000#include "../raft.h" #include "../tracing.h" #include "assert.h" #include "configuration.h" #include "err.h" #include "lifecycle.h" #include "log.h" #include "membership.h" #include "progress.h" #include "../lib/queue.h" #include "replication.h" #include "request.h" int raft_apply(struct raft *r, struct raft_apply *req, const struct raft_buffer bufs[], const struct raft_entry_local_data local_data[], const unsigned n, raft_apply_cb cb) { raft_index index; int rv; tracef("raft_apply n %d", n); assert(r != NULL); assert(bufs != NULL); assert(n > 0); if (r->state != RAFT_LEADER || r->transfer != NULL) { rv = RAFT_NOTLEADER; ErrMsgFromCode(r->errmsg, rv); tracef("raft_apply not leader"); goto err; } /* Index of the first entry being appended. */ index = logLastIndex(r->log) + 1; tracef("%u commands starting at %lld", n, index); req->type = RAFT_COMMAND; req->index = index; req->cb = cb; /* Append the new entries to the log. */ rv = logAppendCommands(r->log, r->current_term, bufs, local_data, n); if (rv != 0) { goto err; } lifecycleRequestStart(r, (struct request *)req); rv = replicationTrigger(r, index); if (rv != 0) { goto err_after_log_append; } return 0; err_after_log_append: logDiscard(r->log, index); queue_remove(&req->queue); err: assert(rv != 0); return rv; } int raft_barrier(struct raft *r, struct raft_barrier *req, raft_barrier_cb cb) { raft_index index; struct raft_buffer buf; int rv; if (r->state != RAFT_LEADER || r->transfer != NULL) { rv = RAFT_NOTLEADER; goto err; } /* TODO: use a completely empty buffer */ buf.len = 8; buf.base = raft_malloc(buf.len); if (buf.base == NULL) { rv = RAFT_NOMEM; goto err; } /* Index of the barrier entry being appended. 
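 * The barrier entry carries no application data (just the placeholder
 * buffer allocated above, see the TODO); it is replicated like a regular
 * entry and @cb fires once the entry is eventually applied.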
*/ index = logLastIndex(r->log) + 1; tracef("barrier starting at %lld", index); req->type = RAFT_BARRIER; req->index = index; req->cb = cb; rv = logAppend(r->log, r->current_term, RAFT_BARRIER, buf, (struct raft_entry_local_data){}, true, NULL); if (rv != 0) { goto err_after_buf_alloc; } lifecycleRequestStart(r, (struct request *)req); rv = replicationTrigger(r, index); if (rv != 0) { goto err_after_log_append; } return 0; err_after_log_append: logDiscard(r->log, index); queue_remove(&req->queue); err_after_buf_alloc: raft_free(buf.base); err: return rv; } static int clientChangeConfiguration( struct raft *r, struct raft_change *req, const struct raft_configuration *configuration) { raft_index index; raft_term term = r->current_term; int rv; (void)req; /* Index of the entry being appended. */ index = logLastIndex(r->log) + 1; /* Encode the new configuration and append it to the log. */ rv = logAppendConfiguration(r->log, term, configuration); if (rv != 0) { goto err; } if (configuration->n != r->configuration.n) { rv = progressRebuildArray(r, configuration); if (rv != 0) { goto err; } } /* Update the current configuration if we've created a new object. */ if (configuration != &r->configuration) { raft_configuration_close(&r->configuration); r->configuration = *configuration; } /* Start writing the new log entry to disk and send it to the followers. */ rv = replicationTrigger(r, index); if (rv != 0) { /* TODO: restore the old next/match indexes and configuration. */ goto err_after_log_append; } r->configuration_uncommitted_index = index; return 0; err_after_log_append: logTruncate(r->log, index); err: assert(rv != 0); return rv; } int raft_add(struct raft *r, struct raft_change *req, raft_id id, const char *address, raft_change_cb cb) { struct raft_configuration configuration; int rv; rv = membershipCanChangeConfiguration(r); if (rv != 0) { return rv; } tracef("add server: id %llu, address %s", id, address); /* Make a copy of the current configuration, and add the new server to * it. */ rv = configurationCopy(&r->configuration, &configuration); if (rv != 0) { goto err; } rv = raft_configuration_add(&configuration, id, address, RAFT_SPARE); if (rv != 0) { goto err_after_configuration_copy; } req->cb = cb; rv = clientChangeConfiguration(r, req, &configuration); if (rv != 0) { goto err_after_configuration_copy; } assert(r->leader_state.change == NULL); r->leader_state.change = req; return 0; err_after_configuration_copy: raft_configuration_close(&configuration); err: assert(rv != 0); return rv; } int raft_assign(struct raft *r, struct raft_change *req, raft_id id, int role, raft_change_cb cb) { const struct raft_server *server; unsigned server_index; raft_index last_index; int rv; tracef("raft_assign to id:%llu the role:%d", id, role); if (role != RAFT_STANDBY && role != RAFT_VOTER && role != RAFT_SPARE) { rv = RAFT_BADROLE; ErrMsgFromCode(r->errmsg, rv); return rv; } rv = membershipCanChangeConfiguration(r); if (rv != 0) { return rv; } server = configurationGet(&r->configuration, id); if (server == NULL) { rv = RAFT_NOTFOUND; ErrMsgPrintf(r->errmsg, "no server has ID %llu", id); goto err; } /* Check if we have already the desired role. 
*/ if (server->role == role) { const char *name; rv = RAFT_BADROLE; switch (role) { case RAFT_VOTER: name = "voter"; break; case RAFT_STANDBY: name = "stand-by"; break; case RAFT_SPARE: name = "spare"; break; default: name = NULL; assert(0); break; } ErrMsgPrintf(r->errmsg, "server is already %s", name); goto err; } server_index = configurationIndexOf(&r->configuration, id); assert(server_index < r->configuration.n); last_index = logLastIndex(r->log); req->cb = cb; assert(r->leader_state.change == NULL); r->leader_state.change = req; /* If we are not promoting to the voter role or if the log of this * server is already up-to-date, we can submit the configuration change * immediately. */ if (role != RAFT_VOTER || progressMatchIndex(r, server_index) == last_index) { int old_role = r->configuration.servers[server_index].role; r->configuration.servers[server_index].role = role; rv = clientChangeConfiguration(r, req, &r->configuration); if (rv != 0) { tracef("clientChangeConfiguration failed %d", rv); r->configuration.servers[server_index].role = old_role; return rv; } return 0; } r->leader_state.promotee_id = server->id; /* Initialize the first catch-up round. */ r->leader_state.round_number = 1; r->leader_state.round_index = last_index; r->leader_state.round_start = r->io->time(r->io); /* Immediately initiate an AppendEntries request. */ rv = replicationProgress(r, server_index); if (rv != 0 && rv != RAFT_NOCONNECTION) { /* This error is not fatal. */ tracef("failed to send append entries to server %llu: %s (%d)", server->id, raft_strerror(rv), rv); } return 0; err: assert(rv != 0); return rv; } int raft_remove(struct raft *r, struct raft_change *req, raft_id id, raft_change_cb cb) { const struct raft_server *server; struct raft_configuration configuration; int rv; rv = membershipCanChangeConfiguration(r); if (rv != 0) { return rv; } server = configurationGet(&r->configuration, id); if (server == NULL) { rv = RAFT_BADID; goto err; } tracef("remove server: id %llu", id); /* Make a copy of the current configuration, and remove the given server * from it. */ rv = configurationCopy(&r->configuration, &configuration); if (rv != 0) { goto err; } rv = configurationRemove(&configuration, id); if (rv != 0) { goto err_after_configuration_copy; } req->cb = cb; rv = clientChangeConfiguration(r, req, &configuration); if (rv != 0) { goto err_after_configuration_copy; } assert(r->leader_state.change == NULL); r->leader_state.change = req; return 0; err_after_configuration_copy: raft_configuration_close(&configuration); err: assert(rv != 0); return rv; } /* Find a suitable voting follower. 
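 * Voters other than this server are scanned in configuration order: the
 * first one whose progress is up-to-date is returned immediately,
 * otherwise the last voter encountered is used as a fallback, and 0 is
 * returned if there is no other voter at all.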
*/ static raft_id clientSelectTransferee(struct raft *r) { const struct raft_server *transferee = NULL; unsigned i; for (i = 0; i < r->configuration.n; i++) { const struct raft_server *server = &r->configuration.servers[i]; if (server->id == r->id || server->role != RAFT_VOTER) { continue; } transferee = server; if (progressIsUpToDate(r, i)) { break; } } if (transferee != NULL) { return transferee->id; } return 0; } int raft_transfer(struct raft *r, struct raft_transfer *req, raft_id id, raft_transfer_cb cb) { const struct raft_server *server; unsigned i; int rv; tracef("transfer to %llu", id); if (r->state != RAFT_LEADER || r->transfer != NULL) { tracef("transfer error - state:%d", r->state); rv = RAFT_NOTLEADER; ErrMsgFromCode(r->errmsg, rv); goto err; } if (id == 0) { id = clientSelectTransferee(r); if (id == 0) { rv = RAFT_NOTFOUND; ErrMsgPrintf(r->errmsg, "there's no other voting server"); goto err; } } server = configurationGet(&r->configuration, id); if (server == NULL || server->id == r->id || server->role != RAFT_VOTER) { rv = RAFT_BADID; ErrMsgFromCode(r->errmsg, rv); goto err; } /* If this follower is up-to-date, we can send it the TimeoutNow message * right away. */ i = configurationIndexOf(&r->configuration, server->id); assert(i < r->configuration.n); membershipLeadershipTransferInit(r, req, id, cb); if (progressPersistedIsUpToDate(r, i)) { rv = membershipLeadershipTransferStart(r); if (rv != 0) { r->transfer = NULL; goto err; } } return 0; err: assert(rv != 0); return rv; } #undef tracef dqlite-1.16.7/src/raft/compress.c000066400000000000000000000167041465252713400166330ustar00rootroot00000000000000#include "compress.h" #ifdef LZ4_AVAILABLE #include #endif #include #include #include "assert.h" #include "byte.h" #include "err.h" #define min(a, b) ((a) < (b) ? (a) : (b)) #define max(a, b) ((a) > (b) ? (a) : (b)) #define MEGABYTE 1048576 int Compress(struct raft_buffer bufs[], unsigned n_bufs, struct raft_buffer *compressed, char *errmsg) { #ifndef LZ4_AVAILABLE (void)bufs; (void)n_bufs; (void)compressed; ErrMsgPrintf(errmsg, "LZ4 not available"); return RAFT_INVALID; #else assert(bufs != NULL); assert(n_bufs > 0); assert(compressed != NULL); assert(errmsg != NULL); int rv = RAFT_IOERR; size_t src_size = 0; size_t dst_size = 0; size_t src_offset = 0; size_t dst_offset = 0; size_t dst_size_needed = 0; /* Store minimal dst_size */ size_t ret = 0; /* Return value of LZ4F_XXX functions */ compressed->base = NULL; compressed->len = 0; /* Determine total uncompressed size */ for (unsigned i = 0; i < n_bufs; ++i) { src_size += bufs[i].len; } /* Work around a bug in liblz4 on bionic, in practice raft should only * Compress non-0 length buffers, so this should be fine. 
 * https://github.com/lz4/lz4/issues/157 * */ if (src_size == 0) { ErrMsgPrintf(errmsg, "total size must be larger than 0"); rv = RAFT_INVALID; goto err; } /* Set LZ4 preferences */ LZ4F_preferences_t lz4_pref; memset(&lz4_pref, 0, sizeof(lz4_pref)); /* Detect data corruption when decompressing */ lz4_pref.frameInfo.contentChecksumFlag = 1; /* For allocating a suitable buffer when decompressing */ lz4_pref.frameInfo.contentSize = src_size; /* Context to track compression progress */ LZ4F_compressionContext_t ctx; ret = LZ4F_createCompressionContext(&ctx, LZ4F_VERSION); if (LZ4F_isError(ret)) { ErrMsgPrintf(errmsg, "LZ4F_createCompressionContext %s", LZ4F_getErrorName(ret)); rv = RAFT_NOMEM; goto err; } /* Guesstimate of the eventual compressed size, mainly to avoid allocating a * huge buffer, as `LZ4F_compressBound` calculates the worst case scenario. */ dst_size = LZ4F_compressBound( max(MEGABYTE, (size_t)lz4_pref.frameInfo.contentSize / 10), &lz4_pref); dst_size += LZ4F_HEADER_SIZE_MAX_RAFT; compressed->base = raft_malloc(dst_size); if (compressed->base == NULL) { rv = RAFT_NOMEM; goto err_after_ctx_alloc; } /* LZ4F_compressBegin returns the size of the lz4 header; data should be * written after the header */ dst_offset = LZ4F_compressBegin(ctx, compressed->base, dst_size, &lz4_pref); if (LZ4F_isError(dst_offset)) { ErrMsgPrintf(errmsg, "LZ4F_compressBegin %s", LZ4F_getErrorName(dst_offset)); rv = RAFT_IOERR; goto err_after_buff_alloc; } /* Compress all buffers */ for (unsigned i = 0; i < n_bufs; ++i) { src_offset = 0; while (src_offset < bufs[i].len) { /* Compress in chunks of maximum 1MB and check if there * is enough room in the dst buffer; if not, realloc */ src_size = min(bufs[i].len - src_offset, (size_t)MEGABYTE); dst_size_needed = LZ4F_compressBound(src_size, &lz4_pref); if (dst_size - dst_offset < dst_size_needed) { dst_size += max(dst_size_needed, (size_t)lz4_pref.frameInfo.contentSize / 10); compressed->base = raft_realloc(compressed->base, dst_size); if (compressed->base == NULL) { rv = RAFT_NOMEM; goto err_after_ctx_alloc; } } /* There is guaranteed enough room in `dst` to perform * the compression */ ret = LZ4F_compressUpdate( ctx, (char *)compressed->base + dst_offset, dst_size - dst_offset, (char *)bufs[i].base + src_offset, src_size, NULL); if (LZ4F_isError(ret)) { ErrMsgPrintf(errmsg, "LZ4F_compressUpdate %s", LZ4F_getErrorName(ret)); rv = RAFT_IOERR; goto err_after_buff_alloc; } dst_offset += ret; src_offset += src_size; } } /* Make sure LZ4F_compressEnd has enough room to succeed */ dst_size_needed = LZ4F_compressBound(0, &lz4_pref); if ((dst_size - dst_offset) < dst_size_needed) { dst_size += dst_size_needed; compressed->base = raft_realloc(compressed->base, dst_size); if (compressed->base == NULL) { rv = RAFT_NOMEM; goto err_after_ctx_alloc; } } /* Finalize compression */ ret = LZ4F_compressEnd(ctx, (char *)compressed->base + dst_offset, dst_size - dst_offset, NULL); if (LZ4F_isError(ret)) { ErrMsgPrintf(errmsg, "LZ4F_compressEnd %s", LZ4F_getErrorName(ret)); rv = RAFT_IOERR; goto err_after_buff_alloc; } dst_offset += ret; compressed->len = dst_offset; LZ4F_freeCompressionContext(ctx); return 0; err_after_buff_alloc: raft_free(compressed->base); compressed->base = NULL; err_after_ctx_alloc: LZ4F_freeCompressionContext(ctx); err: return rv; #endif /* LZ4_AVAILABLE */ } int Decompress(struct raft_buffer buf, struct raft_buffer *decompressed, char *errmsg) { #ifndef LZ4_AVAILABLE (void)buf; (void)decompressed; ErrMsgPrintf(errmsg, "LZ4 not available"); return RAFT_INVALID; #else
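	/* Decompression happens in two phases: the LZ4 frame header is parsed
	 * first to learn the uncompressed content size, which allows
	 * allocating the output buffer in one go, and then LZ4F_decompress()
	 * is called in a loop until the whole frame has been consumed. */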
assert(decompressed != NULL); int rv = RAFT_IOERR; size_t src_offset = 0; size_t dst_offset = 0; size_t src_size = 0; size_t dst_size = 0; size_t ret = 0; LZ4F_decompressionContext_t ctx; if (LZ4F_isError(LZ4F_createDecompressionContext(&ctx, LZ4F_VERSION))) { ErrMsgPrintf(errmsg, "LZ4F_createDecompressionContext"); rv = RAFT_NOMEM; goto err; } src_size = buf.len; LZ4F_frameInfo_t frameInfo = {0}; /* `src_size` will contain the size of the LZ4 Frame Header after the * call, decompression must resume at that offset. */ ret = LZ4F_getFrameInfo(ctx, &frameInfo, buf.base, &src_size); if (LZ4F_isError(ret)) { ErrMsgPrintf(errmsg, "LZ4F_getFrameInfo %s", LZ4F_getErrorName(ret)); rv = RAFT_IOERR; goto err_after_ctx_alloc; } src_offset = src_size; decompressed->base = raft_malloc((size_t)frameInfo.contentSize); decompressed->len = (size_t)frameInfo.contentSize; if (decompressed->base == NULL) { rv = RAFT_NOMEM; goto err_after_ctx_alloc; } ret = 1; while (ret != 0) { src_size = buf.len - src_offset; /* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! * The next line works around a bug in an older lz4 lib where * the `size_t` dst_size parameter would overflow an `int`. * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! */ dst_size = min(decompressed->len - dst_offset, (size_t)INT_MAX); /* `dst_size` will contain the number of bytes written to * decompressed->base, while `src_size` will contain the number * of bytes consumed from buf.base */ ret = LZ4F_decompress( ctx, (char *)decompressed->base + dst_offset, &dst_size, (char *)buf.base + src_offset, &src_size, NULL); if (LZ4F_isError(ret)) { ErrMsgPrintf(errmsg, "LZ4F_decompress %s", LZ4F_getErrorName(ret)); rv = RAFT_IOERR; goto err_after_buff_alloc; } src_offset += src_size; dst_offset += dst_size; } if (LZ4F_freeDecompressionContext(ctx) != 0) { raft_free(decompressed->base); decompressed->base = NULL; return RAFT_IOERR; } return 0; err_after_buff_alloc: raft_free(decompressed->base); decompressed->base = NULL; err_after_ctx_alloc: LZ4F_freeDecompressionContext(ctx); err: return rv; #endif /* LZ4_AVAILABLE */ } bool IsCompressed(const void *data, size_t sz) { if (data == NULL || sz < 4) { return false; } const void *cursor = data; #ifdef LZ4F_MAGICNUMBER #define RAFT_LZ4F_MAGICNUMBER LZ4F_MAGICNUMBER #else #define RAFT_LZ4F_MAGICNUMBER 0x184D2204U #endif return byteGet32(&cursor) == RAFT_LZ4F_MAGICNUMBER; } dqlite-1.16.7/src/raft/compress.h000066400000000000000000000016111465252713400166270ustar00rootroot00000000000000#ifndef COMPRESS_H_ #define COMPRESS_H_ #include "../raft.h" #ifdef LZ4F_HEADER_SIZE_MAX #define LZ4F_HEADER_SIZE_MAX_RAFT LZ4F_HEADER_SIZE_MAX #else #define LZ4F_HEADER_SIZE_MAX_RAFT 19UL #endif /* * Compresses the content of `bufs` into a newly allocated buffer that is * returned to the caller through `compressed`. Returns a non-0 value upon * failure. */ int Compress(struct raft_buffer bufs[], unsigned n_bufs, struct raft_buffer *compressed, char *errmsg); /* * Decompresses the content of `buf` into a newly allocated buffer that is * returned to the caller through `decompressed`. Returns a non-0 value upon * failure. */ int Decompress(struct raft_buffer buf, struct raft_buffer *decompressed, char *errmsg); /* Returns `true` if `data` is compressed, `false` otherwise. 
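 *
 * The check compares the first four bytes of `data` against the LZ4 frame
 * magic number. A typical call site might look like this (sketch; `buf`
 * holds a snapshot payload and `errmsg` is a suitably sized error buffer):
 *
 *   struct raft_buffer decompressed;
 *   if (IsCompressed(buf.base, buf.len)) {
 *           rv = Decompress(buf, &decompressed, errmsg);
 *   }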
*/ bool IsCompressed(const void *data, size_t sz); #endif /* COMPRESS_H_ */ dqlite-1.16.7/src/raft/configuration.c000066400000000000000000000174011465252713400176420ustar00rootroot00000000000000#include "configuration.h" #include "../tracing.h" #include "assert.h" #include "byte.h" /* Current encoding format version. */ #define ENCODING_FORMAT 1 void configurationInit(struct raft_configuration *c) { c->servers = NULL; c->n = 0; } void configurationClose(struct raft_configuration *c) { size_t i; assert(c != NULL); assert(c->n == 0 || c->servers != NULL); for (i = 0; i < c->n; i++) { raft_free(c->servers[i].address); } if (c->servers != NULL) { raft_free(c->servers); } } unsigned configurationIndexOf(const struct raft_configuration *c, const raft_id id) { unsigned i; assert(c != NULL); for (i = 0; i < c->n; i++) { if (c->servers[i].id == id) { return i; } } return c->n; } unsigned configurationIndexOfVoter(const struct raft_configuration *c, const raft_id id) { unsigned i; unsigned j = 0; assert(c != NULL); assert(id > 0); for (i = 0; i < c->n; i++) { if (c->servers[i].id == id) { if (c->servers[i].role == RAFT_VOTER) { return j; } return c->n; } if (c->servers[i].role == RAFT_VOTER) { j++; } } return c->n; } const struct raft_server *configurationGet(const struct raft_configuration *c, const raft_id id) { size_t i; assert(c != NULL); assert(id > 0); /* Grab the index of the server with the given ID */ i = configurationIndexOf(c, id); if (i == c->n) { /* No server with matching ID. */ return NULL; } assert(i < c->n); return &c->servers[i]; } unsigned configurationVoterCount(const struct raft_configuration *c) { unsigned i; unsigned n = 0; assert(c != NULL); for (i = 0; i < c->n; i++) { if (c->servers[i].role == RAFT_VOTER) { n++; } } return n; } int configurationCopy(const struct raft_configuration *src, struct raft_configuration *dst) { size_t i; int rv; configurationInit(dst); for (i = 0; i < src->n; i++) { struct raft_server *server = &src->servers[i]; rv = configurationAdd(dst, server->id, server->address, server->role); if (rv != 0) { goto err; } } return 0; err: configurationClose(dst); assert(rv == RAFT_NOMEM); return rv; } int configurationAdd(struct raft_configuration *c, raft_id id, const char *address, int role) { struct raft_server *servers; struct raft_server *server; char *address_copy; size_t i; int rv; assert(c != NULL); assert(id != 0); if (role != RAFT_STANDBY && role != RAFT_VOTER && role != RAFT_SPARE) { rv = RAFT_BADROLE; goto err; } /* Check that neither the given id or address is already in use */ for (i = 0; i < c->n; i++) { server = &c->servers[i]; if (server->id == id) { rv = RAFT_DUPLICATEID; goto err; } if (strcmp(server->address, address) == 0) { rv = RAFT_DUPLICATEADDRESS; goto err; } } /* Make a copy of the given address */ address_copy = raft_malloc(strlen(address) + 1); if (address_copy == NULL) { rv = RAFT_NOMEM; goto err; } strcpy(address_copy, address); /* Grow the servers array.. */ servers = raft_realloc(c->servers, (c->n + 1) * sizeof *server); if (servers == NULL) { rv = RAFT_NOMEM; goto err_after_address_copy; } c->servers = servers; /* Fill the newly allocated slot (the last one) with the given details. 
*/ server = &servers[c->n]; server->id = id; server->address = address_copy; server->role = role; c->n++; return 0; err_after_address_copy: raft_free(address_copy); err: assert(rv == RAFT_BADROLE || rv == RAFT_DUPLICATEID || rv == RAFT_DUPLICATEADDRESS || rv == RAFT_NOMEM); return rv; } int configurationRemove(struct raft_configuration *c, const raft_id id) { unsigned i; unsigned j; struct raft_server *servers; int rv; assert(c != NULL); i = configurationIndexOf(c, id); if (i == c->n) { rv = RAFT_BADID; goto err; } assert(i < c->n); /* If this is the last server in the configuration, reset everything. */ if (c->n - 1 == 0) { assert(i == 0); servers = NULL; goto out; } /* Create a new servers array. */ servers = raft_calloc(c->n - 1, sizeof *servers); if (servers == NULL) { rv = RAFT_NOMEM; goto err; } /* Copy the first part of the servers array into a new array, excluding * the i'th server. */ for (j = 0; j < i; j++) { servers[j] = c->servers[j]; } /* Copy the second part of the servers array into a new array. */ for (j = i + 1; j < c->n; j++) { servers[j - 1] = c->servers[j]; } out: /* Release the address of the server that was deleted. */ raft_free(c->servers[i].address); /* Release the old servers array */ raft_free(c->servers); c->servers = servers; c->n--; return 0; err: assert(rv == RAFT_BADID || rv == RAFT_NOMEM); return rv; } size_t configurationEncodedSize(const struct raft_configuration *c) { size_t n = 0; unsigned i; /* We need one byte for the encoding format version */ n++; /* Then 8 bytes for number of servers. */ n += sizeof(uint64_t); /* Then some space for each server. */ for (i = 0; i < c->n; i++) { struct raft_server *server = &c->servers[i]; assert(server->address != NULL); n += sizeof(uint64_t); /* Server ID */ n += strlen(server->address) + 1; /* Address */ n++; /* Voting flag */ }; return bytePad64(n); } void configurationEncodeToBuf(const struct raft_configuration *c, void *buf) { void *cursor = buf; unsigned i; /* Encoding format version */ bytePut8(&cursor, ENCODING_FORMAT); /* Number of servers. */ bytePut64(&cursor, c->n); for (i = 0; i < c->n; i++) { struct raft_server *server = &c->servers[i]; assert(server->address != NULL); bytePut64(&cursor, server->id); bytePutString(&cursor, server->address); assert(server->role < 255); bytePut8(&cursor, (uint8_t)server->role); }; } int configurationEncode(const struct raft_configuration *c, struct raft_buffer *buf) { int rv; assert(c != NULL); assert(buf != NULL); /* The configuration can't be empty. */ assert(c->n > 0); buf->len = configurationEncodedSize(c); buf->base = raft_malloc(buf->len); if (buf->base == NULL) { rv = RAFT_NOMEM; goto err; } configurationEncodeToBuf(c, buf->base); return 0; err: assert(rv == RAFT_NOMEM); return rv; } int configurationDecode(const struct raft_buffer *buf, struct raft_configuration *c) { const void *cursor; size_t i; size_t n; int rv; assert(c != NULL); assert(buf != NULL); /* TODO: use 'if' instead of assert for checking buffer boundaries */ assert(buf->len > 0); configurationInit(c); cursor = buf->base; /* Check the encoding format version */ if (byteGet8(&cursor) != ENCODING_FORMAT) { rv = RAFT_MALFORMED; goto err; } /* Read the number of servers. */ n = (size_t)byteGet64(&cursor); /* Decode the individual servers. */ for (i = 0; i < n; i++) { raft_id id; const char *address; int role; /* Server ID. */ id = byteGet64(&cursor); /* Server Address. 
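 * Decoded in place from the buffer: byteGetString() returns NULL when the
 * string is not NUL-terminated within the remaining bytes, which is
 * treated as a malformed payload below.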
*/ address = byteGetString( &cursor, buf->len - (size_t)((uint8_t *)cursor - (uint8_t *)buf->base)); if (address == NULL) { rv = RAFT_MALFORMED; goto err; } /* Role code. */ role = byteGet8(&cursor); rv = configurationAdd(c, id, address, role); if (rv != 0) { /* Only valid configurations should ever be encoded, * so in case configurationAdd() fails because of * invalid data we return RAFT_MALFORMED. */ if (rv != RAFT_NOMEM) { rv = RAFT_MALFORMED; } goto err; } } return 0; err: assert(rv == RAFT_MALFORMED || rv == RAFT_NOMEM); configurationClose(c); return rv; } void configurationTrace(const struct raft *r, struct raft_configuration *c, const char *msg) { (void)r; tracef("%s", msg); tracef("=== CONFIG START ==="); unsigned i; struct raft_server *s; for (i = 0; i < c->n; i++) { s = &c->servers[i]; tracef("id:%llu address:%s role:%d", s->id, s->address, s->role); } tracef("=== CONFIG END ==="); } #undef tracef dqlite-1.16.7/src/raft/configuration.h000066400000000000000000000101211465252713400176370ustar00rootroot00000000000000/* Modify and inspect @raft_configuration objects. */ #ifndef CONFIGURATION_H_ #define CONFIGURATION_H_ #include "../raft.h" /* Initialize an empty configuration. */ void configurationInit(struct raft_configuration *c); /* Release all memory used by the given configuration. */ void configurationClose(struct raft_configuration *c); /* Add a server to the given configuration. * * The given @address is copied and no reference to it is kept. In case of * error, @c is left unchanged. * * Errors: * * RAFT_DUPLICATEID * @c already has a server with the given id. * * RAFT_DUPLICATEADDRESS * @c already has a server with the given @address. * * RAFT_BADROLE * @role is not one of RAFT_STANDBY, RAFT_VOTER or RAFT_SPARE. * * RAFT_NOMEM * A copy of @address could not be made or the @c->servers could not * be extended. */ int configurationAdd(struct raft_configuration *c, raft_id id, const char *address, int role); /* Return the number of servers with the RAFT_VOTER role. */ unsigned configurationVoterCount(const struct raft_configuration *c); /* Return the index of the server with the given ID (relative to the c->servers * array). If there's no server with the given ID, return the number of * servers. */ unsigned configurationIndexOf(const struct raft_configuration *c, raft_id id); /* Return the index of the RAFT_VOTER server with the given ID (relative to the * sub array of c->servers that has only voting servers). If there's no server * with the given ID, or if it's not flagged as voting, return the number of * servers. */ unsigned configurationIndexOfVoter(const struct raft_configuration *c, raft_id id); /* Get the server with the given ID, or #NULL if no matching server is found. */ const struct raft_server *configurationGet(const struct raft_configuration *c, raft_id id); /* Remove a server from a raft configuration. The given ID must match the one of * an existing server in the configuration. * * In case of error @c is left unchanged. * * Errors: * * RAFT_BADID * @c does not contain any server with the given @id * * RAFT_NOMEM * Memory to hold the new set of servers could not be allocated. */ int configurationRemove(struct raft_configuration *c, raft_id id); /* Deep copy @src to @dst. * * The configuration @src is assumed to be valid (i.e. each of its servers has a * valid ID, address and role). * * The @dst configuration object must be uninitialized or empty. * * In case of error, both @src and @dst are left unchanged.
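 *
 * A minimal usage sketch (editor's illustration, not from the original
 * source; assumes a struct raft *r is in scope):
 *
 *   struct raft_configuration copy;
 *   int rv = configurationCopy(&r->configuration, &copy);
 *   if (rv != 0) {
 *       return rv;
 *   }
 *   configurationClose(&copy);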
* * Errors: * * RAFT_NOMEM * Memory to copy all the servers could not be allocated. */ int configurationCopy(const struct raft_configuration *src, struct raft_configuration *dst); /* Number of bytes needed to encode the given configuration object. */ size_t configurationEncodedSize(const struct raft_configuration *c); /* Encode the given configuration object to the given pre-allocated buffer, * which is assumed to be at least configurationEncodedSize(c) bytes. */ void configurationEncodeToBuf(const struct raft_configuration *c, void *buf); /* Encode the given configuration object. The memory of the returned buffer is * allocated using raft_malloc(), and client code is responsible for releasing * it when no longer needed. * * Errors: * * RAFT_NOMEM * Memory for the encoded buffer could not be allocated. */ int configurationEncode(const struct raft_configuration *c, struct raft_buffer *buf); /* Populate a configuration object by decoding the given serialized payload. * * The @c configuration object must be uninitialized or empty. * * In case of error, @c will be left empty. * * Errors: * * RAFT_MALFORMED * The given buffer does not contain a valid encoded configuration. * * RAFT_NOMEM * Memory to populate the given configuration could not be allocated. */ int configurationDecode(const struct raft_buffer *buf, struct raft_configuration *c); /* Output the configuration to the raft tracer */ void configurationTrace(const struct raft *r, struct raft_configuration *c, const char *msg); #endif /* CONFIGURATION_H_ */ dqlite-1.16.7/src/raft/convert.c000066400000000000000000000162671465252713400164640ustar00rootroot00000000000000#include "convert.h" #include "../raft.h" #include "../tracing.h" #include "assert.h" #include "callbacks.h" #include "configuration.h" #include "election.h" #include "log.h" #include "membership.h" #include "progress.h" #include "../lib/queue.h" #include "replication.h" #include "request.h" /* Convenience for setting a new state value and asserting that the transition * is valid. */ static void convertSetState(struct raft *r, unsigned short new_state) { /* Check that the transition is legal, see Figure 3.3. Note that with * respect to the paper we have an additional "unavailable" state, which * is the initial or final state. */ unsigned short old_state = r->state; tracef("old_state:%u new_state:%u", old_state, new_state); assert((r->state == RAFT_UNAVAILABLE && new_state == RAFT_FOLLOWER) || (r->state == RAFT_FOLLOWER && new_state == RAFT_CANDIDATE) || (r->state == RAFT_CANDIDATE && new_state == RAFT_FOLLOWER) || (r->state == RAFT_CANDIDATE && new_state == RAFT_LEADER) || (r->state == RAFT_LEADER && new_state == RAFT_FOLLOWER) || (r->state == RAFT_FOLLOWER && new_state == RAFT_UNAVAILABLE) || (r->state == RAFT_CANDIDATE && new_state == RAFT_UNAVAILABLE) || (r->state == RAFT_LEADER && new_state == RAFT_UNAVAILABLE)); r->state = new_state; if (r->state == RAFT_LEADER) { r->leader_state.voter_contacts = 1; } struct raft_callbacks *cbs = raftGetCallbacks(r); if (cbs != NULL && cbs->state_cb != NULL) { cbs->state_cb(r, old_state, new_state); } } /* Clear follower state. */ static void convertClearFollower(struct raft *r) { tracef("clear follower state"); r->follower_state.current_leader.id = 0; if (r->follower_state.current_leader.address != NULL) { raft_free(r->follower_state.current_leader.address); } r->follower_state.current_leader.address = NULL; } /* Clear candidate state. 
*/ static void convertClearCandidate(struct raft *r) { tracef("clear candidate state"); if (r->candidate_state.votes != NULL) { raft_free(r->candidate_state.votes); r->candidate_state.votes = NULL; } } static void convertFailApply(struct raft_apply *req) { if (req != NULL && req->cb != NULL) { req->cb(req, RAFT_LEADERSHIPLOST, NULL); } } static void convertFailBarrier(struct raft_barrier *req) { if (req != NULL && req->cb != NULL) { req->cb(req, RAFT_LEADERSHIPLOST); } } static void convertFailChange(struct raft_change *req) { if (req != NULL && req->cb != NULL) { req->cb(req, RAFT_LEADERSHIPLOST); } } /* Clear leader state. */ static void convertClearLeader(struct raft *r) { tracef("clear leader state"); if (r->leader_state.progress != NULL) { raft_free(r->leader_state.progress); r->leader_state.progress = NULL; } /* Fail all outstanding requests */ while (!queue_empty(&r->leader_state.requests)) { struct request *req; queue *head; head = queue_head(&r->leader_state.requests); queue_remove(head); req = QUEUE_DATA(head, struct request, queue); assert(req->type == RAFT_COMMAND || req->type == RAFT_BARRIER); switch (req->type) { case RAFT_COMMAND: convertFailApply((struct raft_apply *)req); break; case RAFT_BARRIER: convertFailBarrier((struct raft_barrier *)req); break; }; } /* Fail any promote request that is still outstanding because the server * is still catching up and no entry was submitted. */ if (r->leader_state.change != NULL) { convertFailChange(r->leader_state.change); r->leader_state.change = NULL; } } /* Clear the current state */ static void convertClear(struct raft *r) { assert(r->state == RAFT_UNAVAILABLE || r->state == RAFT_FOLLOWER || r->state == RAFT_CANDIDATE || r->state == RAFT_LEADER); switch (r->state) { case RAFT_FOLLOWER: convertClearFollower(r); break; case RAFT_CANDIDATE: convertClearCandidate(r); break; case RAFT_LEADER: convertClearLeader(r); break; } } void convertToFollower(struct raft *r) { convertClear(r); convertSetState(r, RAFT_FOLLOWER); /* Reset election timer. */ electionResetTimer(r); r->follower_state.current_leader.id = 0; r->follower_state.current_leader.address = NULL; r->follower_state.append_in_flight_count = 0; } int convertToCandidate(struct raft *r, bool disrupt_leader) { const struct raft_server *server; size_t n_voters = configurationVoterCount(&r->configuration); int rv; (void)server; /* Only used for assertions. */ convertClear(r); convertSetState(r, RAFT_CANDIDATE); /* Allocate the votes array. */ r->candidate_state.votes = raft_malloc(n_voters * sizeof(bool)); if (r->candidate_state.votes == NULL) { return RAFT_NOMEM; } r->candidate_state.disrupt_leader = disrupt_leader; r->candidate_state.in_pre_vote = disrupt_leader ? false : r->pre_vote; /* Fast-forward to leader if we're the only voting server in the * configuration. 
*/ server = configurationGet(&r->configuration, r->id); assert(server != NULL); assert(server->role == RAFT_VOTER); if (n_voters == 1) { tracef("self elect and convert to leader"); return convertToLeader(r); } /* Start a new election round */ rv = electionStart(r); if (rv != 0) { r->state = RAFT_FOLLOWER; raft_free(r->candidate_state.votes); return rv; } return 0; } void convertInitialBarrierCb(struct raft_barrier *req, int status) { (void)status; raft_free(req); } int convertToLeader(struct raft *r) { int rv; tracef("become leader for term %llu", r->current_term); convertClear(r); convertSetState(r, RAFT_LEADER); /* Reset timers */ r->election_timer_start = r->io->time(r->io); /* Reset apply requests queue */ queue_init(&r->leader_state.requests); /* Allocate and initialize the progress array. */ rv = progressBuildArray(r); if (rv != 0) { return rv; } r->leader_state.change = NULL; /* Reset promotion state. */ r->leader_state.promotee_id = 0; r->leader_state.round_number = 0; r->leader_state.round_index = 0; r->leader_state.round_start = 0; /* By definition, all entries until the last_stored entry will be * committed if we are the only voter around. */ size_t n_voters = configurationVoterCount(&r->configuration); if (n_voters == 1 && (r->last_stored > r->commit_index)) { tracef("apply log entries after self election %llu %llu", r->last_stored, r->commit_index); r->commit_index = r->last_stored; rv = replicationApply(r); } else if (n_voters > 1) { /* Raft Dissertation, paragraph 6.4: * The Leader Completeness Property guarantees that a leader has * all committed entries, but at the start of its term, it may * not know which those are. To find out, it needs to commit an * entry from its term. Raft handles this by having each leader * commit a blank no-op entry into the log at the start of its * term. */ struct raft_barrier *req = raft_malloc(sizeof(*req)); if (req == NULL) { return RAFT_NOMEM; } rv = raft_barrier(r, req, convertInitialBarrierCb); if (rv != 0) { tracef( "failed to send no-op barrier entry after leader " "conversion: " "%d", rv); } } return rv; } void convertToUnavailable(struct raft *r) { /* Abort any pending leadership transfer request. */ if (r->transfer != NULL) { membershipLeadershipTransferClose(r); } convertClear(r); convertSetState(r, RAFT_UNAVAILABLE); } #undef tracef dqlite-1.16.7/src/raft/convert.h000066400000000000000000000030621465252713400164560ustar00rootroot00000000000000/* Convert from one state to another. */ #ifndef CONVERT_H_ #define CONVERT_H_ #include "../raft.h" /* Convert from unavailable, or candidate or leader to follower. * * From Figure 3.1: * * If election timeout elapses without receiving AppendEntries RPC from * current leader or granting vote to candidate: convert to candidate. * * The above implies that we need to reset the election timer when converting to * follower. */ void convertToFollower(struct raft *r); /* Convert from follower to candidate, starting a new election. * * From Figure 3.1: * * On conversion to candidate, start election * * If the disrupt_leader flag is true, the server will set the disrupt leader * flag of the RequestVote messages it sends. */ int convertToCandidate(struct raft *r, bool disrupt_leader); /* Convert from candidate to leader. * * From Figure 3.1: * * Upon election: send initial empty AppendEntries RPC (heartbeat) to each * server. * * From Section 3.4: * * Once a candidate wins an election, it becomes leader. 
 It then sends * heartbeat messages to all of the other servers to establish its authority * and prevent new elections. * * From Section 3.3: * * The leader maintains a nextIndex for each follower, which is the index * of the next log entry the leader will send to that follower. When a * leader first comes to power, it initializes all nextIndex values to the * index just after the last one in its log. */ int convertToLeader(struct raft *r); void convertToUnavailable(struct raft *r); #endif /* CONVERT_H_ */ dqlite-1.16.7/src/raft/election.c000066400000000000000000000222101465252713400165700ustar00rootroot00000000000000#include "election.h" #include "../tracing.h" #include "assert.h" #include "configuration.h" #include "heap.h" #include "log.h" /* Common fields between follower and candidate state. * * The follower_state and candidate_state structs in raft.h must be kept * consistent with this definition. */ struct followerOrCandidateState { unsigned randomized_election_timeout; }; /* Return a pointer to either the follower or candidate state. */ struct followerOrCandidateState *getFollowerOrCandidateState(struct raft *r) { struct followerOrCandidateState *state; assert(r->state == RAFT_FOLLOWER || r->state == RAFT_CANDIDATE); if (r->state == RAFT_FOLLOWER) { state = (struct followerOrCandidateState *)&r->follower_state; } else { state = (struct followerOrCandidateState *)&r->candidate_state; } return state; } void electionResetTimer(struct raft *r) { struct followerOrCandidateState *state = getFollowerOrCandidateState(r); unsigned timeout = (unsigned)r->io->random( r->io, (int)r->election_timeout, 2 * (int)r->election_timeout); assert(timeout >= r->election_timeout); assert(timeout <= r->election_timeout * 2); state->randomized_election_timeout = timeout; r->election_timer_start = r->io->time(r->io); } bool electionTimerExpired(struct raft *r) { struct followerOrCandidateState *state = getFollowerOrCandidateState(r); raft_time now = r->io->time(r->io); return now - r->election_timer_start >= state->randomized_election_timeout; } static void sendRequestVoteCb(struct raft_io_send *send, int status) { (void)status; RaftHeapFree(send); } /* Send a RequestVote RPC to the given server. */ static int electionSend(struct raft *r, const struct raft_server *server) { struct raft_message message; struct raft_io_send *send; raft_term term; int rv; assert(server->id != r->id); assert(server->id != 0); /* If we are in the pre-vote phase, we indicate our future term in the * request. */ term = r->current_term; if (r->candidate_state.in_pre_vote) { term++; } /* Fill the RequestVote message. * * Note that we set last_log_index and last_log_term to the index and * term of the last persisted entry, not to the last entry in our * in-memory log cache, because we must advertise only log entries that * can't be lost at restart. * * Also note that, for a similar reason, we apply pending configuration * changes only once they are persisted. When running an election we * then use only persisted information, which is safe (while using * unpersisted information for the log and persisted information for the * configuration or vice versa would lead to inconsistencies and * violations of Raft invariants).
*/ message.type = RAFT_IO_REQUEST_VOTE; message.request_vote.term = term; message.request_vote.candidate_id = r->id; message.request_vote.last_log_index = r->last_stored; message.request_vote.last_log_term = logTermOf(r->log, r->last_stored); message.request_vote.disrupt_leader = r->candidate_state.disrupt_leader; message.request_vote.pre_vote = r->candidate_state.in_pre_vote; message.server_id = server->id; message.server_address = server->address; send = RaftHeapMalloc(sizeof *send); if (send == NULL) { return RAFT_NOMEM; } send->data = r; rv = r->io->send(r->io, send, &message, sendRequestVoteCb); if (rv != 0) { RaftHeapFree(send); return rv; } return 0; } int electionStart(struct raft *r) { raft_term term; size_t n_voters; size_t voting_index; size_t i; int rv; assert(r->state == RAFT_CANDIDATE); n_voters = configurationVoterCount(&r->configuration); voting_index = configurationIndexOfVoter(&r->configuration, r->id); /* This function should not be invoked if we are not a voting server, * hence voting_index must be lower than the number of servers in the * configuration (meaning that we are a voting server). */ assert(voting_index < r->configuration.n); /* Coherence check that configurationVoterCount and * configurationIndexOfVoter have returned something that makes sense. */ assert(n_voters <= r->configuration.n); assert(voting_index < n_voters); /* During pre-vote we don't increment our term, or reset our vote. * Resetting our vote could lead to double-voting if we were to receive * a RequestVote RPC during our Candidate state while we already voted * for a server during the term. */ if (!r->candidate_state.in_pre_vote) { /* Increment current term */ term = r->current_term + 1; rv = r->io->set_term(r->io, term); if (rv != 0) { tracef("set_term failed %d", rv); goto err; } tracef("beginning of term %llu", term); /* Vote for self */ rv = r->io->set_vote(r->io, r->id); if (rv != 0) { tracef("set_vote self failed %d", rv); goto err; } /* Update our cache too. */ r->current_term = term; r->voted_for = r->id; } /* Reset election timer. */ electionResetTimer(r); assert(r->candidate_state.votes != NULL); /* Initialize the votes array and send vote requests. */ for (i = 0; i < n_voters; i++) { if (i == voting_index) { r->candidate_state.votes[i] = true; /* We vote for ourselves */ } else { r->candidate_state.votes[i] = false; } } for (i = 0; i < r->configuration.n; i++) { const struct raft_server *server = &r->configuration.servers[i]; if (server->id == r->id || server->role != RAFT_VOTER) { continue; } rv = electionSend(r, server); if (rv != 0) { /* This is not a critical failure, let's just log it. 
*/ tracef("failed to send vote request to server %llu: %s", server->id, raft_strerror(rv)); } } return 0; err: assert(rv != 0); return rv; } int electionVote(struct raft *r, const struct raft_request_vote *args, bool *granted) { const struct raft_server *local_server; raft_index local_last_index; raft_term local_last_term; bool is_transferee; /* Requester is the target of a leadership transfer */ int rv; assert(r != NULL); assert(args != NULL); assert(granted != NULL); local_server = configurationGet(&r->configuration, r->id); *granted = false; if (local_server == NULL || local_server->role != RAFT_VOTER) { tracef("local server is not voting -> not granting vote"); return 0; } is_transferee = r->transfer != NULL && r->transfer->id == args->candidate_id; if (!args->pre_vote && r->voted_for != 0 && r->voted_for != args->candidate_id && !is_transferee) { tracef("local server already voted -> not granting vote"); return 0; } /* Raft Dissertation 9.6: * > In the Pre-Vote algorithm, a candidate * > only increments its term if it first learns from a majority of the * > cluster that they would be willing * > to grant the candidate their votes (if the candidate's log is * > sufficiently up-to-date, and the voters * > have not received heartbeats from a valid leader for at least a * baseline > election timeout) Arriving here means that in a pre-vote * phase, we will cast our vote if the candidate's log is sufficiently * up-to-date, no matter what the candidate's term is. We have already * checked if we currently have a leader upon reception of the * RequestVote RPC, meaning the 2 conditions will be satisfied if the * candidate's log is up-to-date. * */ local_last_index = logLastIndex(r->log); /* Our log is definitely not more up-to-date if it's empty! */ if (local_last_index == 0) { tracef("local log is empty -> granting vote"); goto grant_vote; } local_last_term = logLastTerm(r->log); if (args->last_log_term < local_last_term) { /* The requesting server has last entry's log term lower than * ours. */ tracef( "local last entry %llu has term %llu higher than %llu -> " "not " "granting", local_last_index, local_last_term, args->last_log_term); return 0; } if (args->last_log_term > local_last_term) { /* The requesting server has a more up-to-date log. */ tracef( "remote last entry %llu has term %llu higher than %llu -> " "granting vote", args->last_log_index, args->last_log_term, local_last_term); goto grant_vote; } /* The term of the last log entry is the same, so let's compare the * length of the log. */ assert(args->last_log_term == local_last_term); if (local_last_index <= args->last_log_index) { /* Our log is shorter or equal to the one of the requester. */ tracef( "remote log equal or longer than local -> granting vote"); goto grant_vote; } tracef("remote log shorter than local -> not granting vote"); return 0; grant_vote: if (!args->pre_vote) { rv = r->io->set_vote(r->io, args->candidate_id); if (rv != 0) { tracef("set_vote failed %d", rv); return rv; } r->voted_for = args->candidate_id; /* Reset the election timer. 
*/ r->election_timer_start = r->io->time(r->io); } tracef("vote granted to %llu", args->candidate_id); *granted = true; return 0; } bool electionTally(struct raft *r, size_t voter_index) { size_t n_voters = configurationVoterCount(&r->configuration); size_t votes = 0; size_t i; size_t half = n_voters / 2; assert(r->state == RAFT_CANDIDATE); assert(r->candidate_state.votes != NULL); r->candidate_state.votes[voter_index] = true; for (i = 0; i < n_voters; i++) { if (r->candidate_state.votes[i]) { votes++; } } return votes >= half + 1; } #undef tracef dqlite-1.16.7/src/raft/election.h000066400000000000000000000055741465252713400166020ustar00rootroot00000000000000/* Election-related logic and helpers. */ #ifndef ELECTION_H_ #define ELECTION_H_ #include "../raft.h" /* Reset the election_timer clock and set randomized_election_timeout to a * random value between election_timeout and 2 * election_timeout. * * From Section 3.4: * * Raft uses randomized election timeouts to ensure that split votes are rare * and that they are resolved quickly. To prevent split votes in the first * place, election timeouts are chosen randomly from a fixed interval (e.g., * 150-300 ms). This spreads out the servers so that in most cases only a * single server will time out. * * From Section 9.4: * * We used AvailSim to approximate a WAN spanning the continental US. Each * message was assigned a latency chosen randomly from the uniform range of * 30-40 ms, and the servers' election timeout range was set accordingly to * 300-600 ms (about 10-20 times the one-way network latency). When only one * of the five servers has failed, the average election completes within about * 475 ms, and 99.9% of elections complete within 1.5 s. Even when two of the * five servers have failed, the average election takes about 650 ms (about 20 * times the one-way network latency), and 99.9% of elections complete in 3 * s. We believe these election times are more than adequate for most WAN * deployments. * * Must be called in follower or candidate state. */ void electionResetTimer(struct raft *r); /* Return true if the election timer has expired. * * Must be called in follower or candidate state. */ bool electionTimerExpired(struct raft *r); /* Start a new election round. * * From Figure 3.1: * * [Rules for Servers] Candidates: On conversion to candidates, start * election: * * - Increment current term * - Vote for self * - Reset election timer * - Send RequestVote RPCs to all other servers * * From Section 3.4: * * To begin an election, a follower increments its current term and * transitions to candidate state. It then votes for itself and issues * RequestVote RPCs in parallel to each of the other servers in the * cluster. */ int electionStart(struct raft *r); /* Decide whether our vote should be granted to the requesting server and update * our state accordingly. * * From Figure 3.1: * * RequestVote RPC: Receiver Implementation: * * - If votedFor is null or candidateId, and candidate's log is at least as * up-to-date as receiver's log, grant vote. * * The outcome of the decision is stored through the @granted pointer. */ int electionVote(struct raft *r, const struct raft_request_vote *args, bool *granted); /* Update the votes array by adding the vote from the server at the given * index. Return true if with this vote the server has reached the majority of * votes and won the election.
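 *
 * Worked example (editor's illustration): with n_voters = 5 the majority
 * threshold is n_voters / 2 + 1 = 3, so this returns true as soon as the
 * votes array contains three true entries.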
*/ bool electionTally(struct raft *r, size_t voter_index); #endif /* ELECTION_H_ */ dqlite-1.16.7/src/raft/entry.c000066400000000000000000000031701465252713400161320ustar00rootroot00000000000000#include <stdint.h> #include <string.h> #include "assert.h" #include "entry.h" void entryBatchesDestroy(struct raft_entry *entries, const size_t n) { void *batch = NULL; size_t i; if (entries == NULL) { assert(n == 0); return; } assert(n > 0); for (i = 0; i < n; i++) { assert(entries[i].batch != NULL); if (entries[i].batch != batch) { batch = entries[i].batch; raft_free(batch); } } raft_free(entries); } int entryCopy(const struct raft_entry *src, struct raft_entry *dst) { dst->term = src->term; dst->type = src->type; dst->buf.len = src->buf.len; dst->buf.base = raft_malloc(dst->buf.len); if (dst->buf.len > 0 && dst->buf.base == NULL) { return RAFT_NOMEM; } memcpy(dst->buf.base, src->buf.base, dst->buf.len); dst->batch = NULL; return 0; } int entryBatchCopy(const struct raft_entry *src, struct raft_entry **dst, const size_t n) { size_t size = 0; void *batch; uint8_t *cursor; unsigned i; if (n == 0) { *dst = NULL; return 0; } /* Calculate the total size of the entries content and allocate the * batch. */ for (i = 0; i < n; i++) { size += src[i].buf.len; } batch = raft_malloc(size); if (batch == NULL) { return RAFT_NOMEM; } /* Copy the entries. */ *dst = raft_malloc(n * sizeof **dst); if (*dst == NULL) { raft_free(batch); return RAFT_NOMEM; } cursor = batch; for (i = 0; i < n; i++) { (*dst)[i].term = src[i].term; (*dst)[i].type = src[i].type; (*dst)[i].buf.base = cursor; (*dst)[i].buf.len = src[i].buf.len; (*dst)[i].batch = batch; memcpy((*dst)[i].buf.base, src[i].buf.base, src[i].buf.len); cursor += src[i].buf.len; } return 0; } dqlite-1.16.7/src/raft/entry.h000066400000000000000000000011531465252713400161360ustar00rootroot00000000000000#ifndef ENTRY_H_ #define ENTRY_H_ #include "../raft.h" /* Release all memory associated with the given entries, including the array * itself. The entries are supposed to belong to one or more batches. */ void entryBatchesDestroy(struct raft_entry *entries, size_t n); /* Create a copy of a log entry, including its data. */ int entryCopy(const struct raft_entry *src, struct raft_entry *dst); /* Create a single batch of entries containing a copy of the given entries, * including their data. */ int entryBatchCopy(const struct raft_entry *src, struct raft_entry **dst, size_t n); #endif /* ENTRY_H_ */ dqlite-1.16.7/src/raft/err.c000066400000000000000000000032351465252713400155630ustar00rootroot00000000000000#include "err.h" #include <string.h> #include "../raft.h" #include "assert.h" #define WRAP_SEP ": " #define WRAP_SEP_LEN ((size_t)strlen(WRAP_SEP)) void errMsgWrap(char *e, const char *format) { size_t n = RAFT_ERRMSG_BUF_SIZE; size_t prefix_n; size_t prefix_and_sep_n; size_t trail_n; size_t i; /* Calculate the length of the prefix. */ prefix_n = strlen(format); /* If there isn't enough space for the ": " separator and at least one * character of the wrapped error message, then just print the prefix. */ if (prefix_n >= n - (WRAP_SEP_LEN + 1)) { /* We explicitly allow truncation here + silence clang about unknown * warning-group "-Wformat-truncation" */ #ifdef __GNUC__ #ifndef __clang__ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wformat-truncation" #endif #endif ErrMsgPrintf(e, "%s", format); #ifdef __GNUC__ #ifndef __clang__ #pragma GCC diagnostic pop #endif #endif return; } /* Right-shift the wrapped message, to make room for the prefix.
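 * For example (editor's note): if e currently holds "boom" and format is
 * "load", the buffer ends up containing "load: boom".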
*/ prefix_and_sep_n = prefix_n + WRAP_SEP_LEN; trail_n = strnlen(e, n - prefix_and_sep_n - 1); memmove(e + prefix_and_sep_n, e, trail_n); e[prefix_and_sep_n + trail_n] = 0; /* Print the prefix. */ ErrMsgPrintf(e, "%s", format); /* Print the separator. * * Avoid using strncpy(e->msg + prefix_n, WRAP_SEP, WRAP_SEP_LEN) since * it generates a warning. */ for (i = 0; i < WRAP_SEP_LEN; i++) { e[prefix_n + i] = WRAP_SEP[i]; } } #define ERR_CODE_TO_STRING_CASE(CODE, MSG) \ case CODE: \ return MSG; const char *errCodeToString(int code) { switch (code) { ERR_CODE_TO_STRING_MAP(ERR_CODE_TO_STRING_CASE); default: return "unknown error"; } } dqlite-1.16.7/src/raft/err.h000066400000000000000000000057631465252713400155760ustar00rootroot00000000000000/* Utilities around error handling. */ #ifndef ERROR_H_ #define ERROR_H_ #include <stdio.h> #include <string.h> #define ERR_CODE_TO_STRING_MAP(X) \ X(RAFT_NOMEM, "out of memory") \ X(RAFT_BADID, "server ID is not valid") \ X(RAFT_DUPLICATEID, "server ID already in use") \ X(RAFT_DUPLICATEADDRESS, "server address already in use") \ X(RAFT_BADROLE, "server role is not valid") \ X(RAFT_MALFORMED, "encoded data is malformed") \ X(RAFT_NOTLEADER, "server is not the leader") \ X(RAFT_LEADERSHIPLOST, "server has lost leadership") \ X(RAFT_SHUTDOWN, "server is shutting down") \ X(RAFT_CANTBOOTSTRAP, "bootstrap only works on new clusters") \ X(RAFT_CANTCHANGE, "a configuration change is already in progress") \ X(RAFT_CORRUPT, "persisted data is corrupted") \ X(RAFT_CANCELED, "operation canceled") \ X(RAFT_NAMETOOLONG, "resource name too long") \ X(RAFT_TOOBIG, "data is too big") \ X(RAFT_NOCONNECTION, "no connection to remote server available") \ X(RAFT_BUSY, "operation can't be performed at this time") \ X(RAFT_IOERR, "I/O error") \ X(RAFT_NOTFOUND, "Resource not found") \ X(RAFT_INVALID, "Invalid parameter") \ X(RAFT_UNAUTHORIZED, "No access to resource") \ X(RAFT_NOSPACE, "Not enough disk space") \ X(RAFT_TOOMANY, "System or raft limit met or exceeded") /* Format an error message. */ #define ErrMsgPrintf(ERRMSG, ...) \ snprintf(ERRMSG, RAFT_ERRMSG_BUF_SIZE, __VA_ARGS__) /* Wrap the given error message with an additional prefix message. */ #define ErrMsgWrapf(ERRMSG, ...) \ do { \ char _errmsg[RAFT_ERRMSG_BUF_SIZE]; \ ErrMsgPrintf(_errmsg, __VA_ARGS__); \ errMsgWrap(ERRMSG, _errmsg); \ } while (0) void errMsgWrap(char *e, const char *format); /* Transfer an error message from one object to another, wrapping it. */ #define ErrMsgTransfer(ERRMSG1, ERRMSG2, FORMAT) \ memcpy(ERRMSG2, ERRMSG1, RAFT_ERRMSG_BUF_SIZE); \ ErrMsgWrapf(ERRMSG2, FORMAT) #define ErrMsgTransferf(ERRMSG1, ERRMSG2, FORMAT, ...) \ memcpy(ERRMSG2, ERRMSG1, RAFT_ERRMSG_BUF_SIZE); \ ErrMsgWrapf(ERRMSG2, FORMAT, __VA_ARGS__) /* Use the static error message for the error with the given code. */ #define ErrMsgFromCode(ERRMSG, CODE) \ ErrMsgPrintf(ERRMSG, "%s", errCodeToString(CODE)) /* Format the out of memory error message. */ #define ErrMsgOom(ERRMSG) ErrMsgFromCode(ERRMSG, RAFT_NOMEM) /* Convert a numeric raft error code to a human-readable error message.
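 *
 * For example (editor's note): errCodeToString(RAFT_NOMEM) returns
 * "out of memory", and any code missing from ERR_CODE_TO_STRING_MAP maps
 * to "unknown error".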
*/ const char *errCodeToString(int code); #endif /* ERROR_H_ */ dqlite-1.16.7/src/raft/fixture.c000066400000000000000000001344521465252713400164670ustar00rootroot00000000000000#include <stdint.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include "../raft.h" #include "../tracing.h" #include "assert.h" #include "configuration.h" #include "convert.h" #include "entry.h" #include "log.h" #include "../lib/queue.h" #include "snapshot.h" /* Defaults */ #define HEARTBEAT_TIMEOUT 100 #define INSTALL_SNAPSHOT_TIMEOUT 30000 #define ELECTION_TIMEOUT 1000 #define NETWORK_LATENCY 15 #define DISK_LATENCY 10 #define WORK_DURATION 200 #define SEND_LATENCY 0 /* To keep in sync with raft.h */ #define N_MESSAGE_TYPES 6 /* Maximum number of peer stub instances connected to a certain stub * instance. This should be enough for testing purposes. */ #define MAX_PEERS 8 struct raft_fixture_server { bool alive; /* If false, the server is down. */ raft_id id; /* Server ID. */ char address[16]; /* Server address (stringified ID). */ struct raft_tracer tracer; /* Tracer. */ struct raft_io io; /* In-memory raft_io implementation. */ struct raft raft; /* Raft instance. */ }; struct raft_fixture_event { unsigned server_index; /* Index of the server the event occurred on. */ int type; /* Type of the event. */ }; RAFT_API int raft_fixture_event_type(struct raft_fixture_event *event) { assert(event != NULL); return event->type; } RAFT_API unsigned raft_fixture_event_server_index( struct raft_fixture_event *event) { assert(event != NULL); return event->server_index; } /* Fields common across all request types. */ #define REQUEST \ int type; /* Request code type. */ \ raft_time completion_time; /* When the request should be fulfilled. */ \ queue queue /* Link the I/O pending requests queue. */ /* Request type codes. */ enum { APPEND = 1, SEND, TRANSMIT, SNAPSHOT_PUT, SNAPSHOT_GET, ASYNC_WORK }; /* Abstract base type for an asynchronous request submitted to the stub I/O * implementation. */ struct ioRequest { REQUEST; }; /* Pending request to append entries to the log. */ struct append { REQUEST; struct raft_io_append *req; const struct raft_entry *entries; unsigned n; unsigned start; /* Request timestamp. */ }; /* Pending request to send a message. */ struct send { REQUEST; struct raft_io_send *req; struct raft_message message; }; /* Pending request to store a snapshot. */ struct snapshot_put { REQUEST; unsigned trailing; struct raft_io_snapshot_put *req; const struct raft_snapshot *snapshot; }; /* Pending request to perform general work. */ struct async_work { REQUEST; struct raft_io_async_work *req; }; /* Pending request to load a snapshot. */ struct snapshot_get { REQUEST; struct raft_io_snapshot_get *req; }; /* Message that has been written to the network and is waiting to be delivered * (or discarded). */ struct transmit { REQUEST; struct raft_message message; /* Message to deliver */ int timer; /* Deliver after this number of msecs. */ }; /* Information about a peer server. */ struct peer { struct io *io; /* The peer's I/O backend. */ bool connected; /* Whether a connection is established. */ bool saturated; /* Whether the established connection is saturated. */ unsigned send_latency; }; /* Stub I/O implementation implementing all operations in-memory. */ struct io { struct raft_io *io; /* I/O object we're implementing. */ unsigned index; /* Fixture server index. */ raft_time *time; /* Global cluster time. */ raft_time next_tick; /* Time the next tick should occur.
*/ /* Term and vote */ raft_term term; raft_id voted_for; /* Log */ struct raft_snapshot *snapshot; /* Latest snapshot */ struct raft_entry *entries; /* Array of persisted entries */ size_t n; /* Size of the persisted entries array */ /* Parameters passed via raft_io->init and raft_io->start */ raft_id id; const char *address; unsigned tick_interval; raft_io_tick_cb tick_cb; raft_io_recv_cb recv_cb; /* Queue of pending asynchronous requests, whose callbacks still haven't * been fired. */ queue requests; /* Peers connected to us. */ struct peer peers[MAX_PEERS]; unsigned n_peers; unsigned randomized_election_timeout; /* Value returned by io->random() */ unsigned network_latency; /* Milliseconds to deliver RPCs */ unsigned disk_latency; /* Milliseconds to perform disk I/O */ unsigned work_duration; /* Milliseconds to run async work */ int append_fault_countdown; int vote_fault_countdown; int term_fault_countdown; int send_fault_countdown; /* If flag i is true, messages of type i will be silently dropped. */ bool drop[N_MESSAGE_TYPES]; /* Counters of events that happened so far. */ unsigned n_send[N_MESSAGE_TYPES]; unsigned n_recv[N_MESSAGE_TYPES]; unsigned n_append; }; static bool faultTick(int *countdown) { bool trigger = *countdown == 0; if (*countdown >= 0) { *countdown -= 1; } return trigger; } static int ioMethodInit(struct raft_io *raft_io, raft_id id, const char *address) { struct io *io = raft_io->impl; io->id = id; io->address = address; return 0; } static int ioMethodStart(struct raft_io *raft_io, unsigned msecs, raft_io_tick_cb tick_cb, raft_io_recv_cb recv_cb) { struct io *io = raft_io->impl; io->tick_interval = msecs; io->tick_cb = tick_cb; io->recv_cb = recv_cb; io->next_tick = *io->time + io->tick_interval; return 0; } /* Flush an append entries request, appending its entries to the local in-memory * log. */ static void ioFlushAppend(struct io *s, struct append *append) { struct raft_entry *entries; unsigned i; int status = 0; /* Simulates a disk write failure. */ if (faultTick(&s->append_fault_countdown)) { status = RAFT_IOERR; goto done; } /* Allocate an array for the old entries plus the new ones. */ entries = raft_realloc(s->entries, (s->n + append->n) * sizeof *s->entries); assert(entries != NULL); /* Copy new entries into the new array. */ for (i = 0; i < append->n; i++) { const struct raft_entry *src = &append->entries[i]; struct raft_entry *dst = &entries[s->n + i]; int rv = entryCopy(src, dst); assert(rv == 0); } s->entries = entries; s->n += append->n; done: if (append->req->cb != NULL) { append->req->cb(append->req, status); } raft_free(append); } /* Flush a snapshot put request, copying the snapshot data. */ static void ioFlushSnapshotPut(struct io *s, struct snapshot_put *r) { int rv; if (s->snapshot == NULL) { s->snapshot = raft_malloc(sizeof *s->snapshot); assert(s->snapshot != NULL); } else { snapshotClose(s->snapshot); } rv = snapshotCopy(r->snapshot, s->snapshot); assert(rv == 0); if (r->trailing == 0) { rv = s->io->truncate(s->io, 1); assert(rv == 0); } if (r->req->cb != NULL) { r->req->cb(r->req, 0); } raft_free(r); } /* Flush a snapshot get request, returning to the client a copy of the local * snapshot (if any).
*/ static void ioFlushSnapshotGet(struct io *s, struct snapshot_get *r) { struct raft_snapshot *snapshot; int rv; snapshot = raft_malloc(sizeof *snapshot); assert(snapshot != NULL); rv = snapshotCopy(s->snapshot, snapshot); assert(rv == 0); r->req->cb(r->req, snapshot, 0); raft_free(r); } /* Flush an async work request */ static void ioFlushAsyncWork(struct io *s, struct async_work *r) { (void)s; int rv; rv = r->req->work(r->req); r->req->cb(r->req, rv); raft_free(r); } /* Search for the peer with the given ID. */ static struct peer *ioGetPeer(struct io *io, raft_id id) { unsigned i; for (i = 0; i < io->n_peers; i++) { struct peer *peer = &io->peers[i]; if (peer->io->id == id) { return peer; } } return NULL; } /* Copy the dynamically allocated memory of an AppendEntries message. */ static void copyAppendEntries(const struct raft_append_entries *src, struct raft_append_entries *dst) { int rv; rv = entryBatchCopy(src->entries, &dst->entries, src->n_entries); assert(rv == 0); dst->n_entries = src->n_entries; } /* Copy the dynamically allocated memory of an InstallSnapshot message. */ static void copyInstallSnapshot(const struct raft_install_snapshot *src, struct raft_install_snapshot *dst) { int rv; rv = configurationCopy(&src->conf, &dst->conf); assert(rv == 0); dst->data.base = raft_malloc(dst->data.len); assert(dst->data.base != NULL); memcpy(dst->data.base, src->data.base, src->data.len); } /* Flush a raft_io_send request, copying the message content into a new struct * transmit object and invoking the user callback. */ static void ioFlushSend(struct io *io, struct send *send) { struct peer *peer; struct transmit *transmit; struct raft_message *src; struct raft_message *dst; int status; /* If the peer doesn't exist or was disconnected, fail the request. */ peer = ioGetPeer(io, send->message.server_id); if (peer == NULL || !peer->connected) { status = RAFT_NOCONNECTION; goto out; } transmit = raft_calloc(1, sizeof *transmit); assert(transmit != NULL); transmit->type = TRANSMIT; transmit->completion_time = *io->time + io->network_latency; src = &send->message; dst = &transmit->message; queue_insert_tail(&io->requests, &transmit->queue); *dst = *src; switch (dst->type) { case RAFT_IO_APPEND_ENTRIES: /* Make a copy of the entries being sent */ copyAppendEntries(&src->append_entries, &dst->append_entries); break; case RAFT_IO_INSTALL_SNAPSHOT: copyInstallSnapshot(&src->install_snapshot, &dst->install_snapshot); break; } io->n_send[send->message.type]++; status = 0; out: if (send->req->cb != NULL) { send->req->cb(send->req, status); } raft_free(send); } /* Release the memory used by the given message transmit object. */ static void ioDestroyTransmit(struct transmit *transmit) { struct raft_message *message; message = &transmit->message; switch (message->type) { case RAFT_IO_APPEND_ENTRIES: if (message->append_entries.entries != NULL) { raft_free( message->append_entries.entries[0].batch); raft_free(message->append_entries.entries); } break; case RAFT_IO_INSTALL_SNAPSHOT: raft_configuration_close( &message->install_snapshot.conf); raft_free(message->install_snapshot.data.base); break; } raft_free(transmit); } /* Flush all requests in the queue. 
*/ static void ioFlushAll(struct io *io) { while (!queue_empty(&io->requests)) { queue *head; struct ioRequest *r; head = queue_head(&io->requests); queue_remove(head); r = QUEUE_DATA(head, struct ioRequest, queue); switch (r->type) { case APPEND: ioFlushAppend(io, (struct append *)r); break; case SEND: ioFlushSend(io, (struct send *)r); break; case TRANSMIT: ioDestroyTransmit((struct transmit *)r); break; case SNAPSHOT_PUT: ioFlushSnapshotPut(io, (struct snapshot_put *)r); break; case SNAPSHOT_GET: ioFlushSnapshotGet(io, (struct snapshot_get *)r); break; case ASYNC_WORK: ioFlushAsyncWork(io, (struct async_work *)r); break; default: assert(0); } } } static void ioMethodClose(struct raft_io *raft_io, raft_io_close_cb cb) { if (cb != NULL) { cb(raft_io); } } static int ioMethodLoad(struct raft_io *io, raft_term *term, raft_id *voted_for, struct raft_snapshot **snapshot, raft_index *start_index, struct raft_entry **entries, size_t *n_entries) { struct io *s; int rv; s = io->impl; *term = s->term; *voted_for = s->voted_for; *start_index = 1; *n_entries = s->n; /* Make a copy of the persisted entries, storing their data into a * single batch. */ rv = entryBatchCopy(s->entries, entries, s->n); assert(rv == 0); if (s->snapshot != NULL) { *snapshot = raft_malloc(sizeof **snapshot); assert(*snapshot != NULL); rv = snapshotCopy(s->snapshot, *snapshot); assert(rv == 0); *start_index = (*snapshot)->index + 1; } else { *snapshot = NULL; } return 0; } static int ioMethodBootstrap(struct raft_io *raft_io, const struct raft_configuration *conf) { struct io *io = raft_io->impl; struct raft_buffer buf; struct raft_entry *entries; int rv; if (io->term != 0) { return RAFT_CANTBOOTSTRAP; } assert(io->voted_for == 0); assert(io->snapshot == NULL); assert(io->entries == NULL); assert(io->n == 0); /* Encode the given configuration. 
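 * (Editor's note, per configurationEncodeToBuf() in configuration.c: the
 * encoding is one format-version byte, an 8-byte server count, then for
 * each server an 8-byte ID, a NUL-terminated address string and one role
 * byte, with the total padded to a multiple of 8 bytes.)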
*/ rv = configurationEncode(conf, &buf); if (rv != 0) { return rv; } entries = raft_calloc(1, sizeof *io->entries); if (entries == NULL) { return RAFT_NOMEM; } entries[0].term = 1; entries[0].type = RAFT_CHANGE; entries[0].buf = buf; io->term = 1; io->voted_for = 0; io->snapshot = NULL; io->entries = entries; io->n = 1; return 0; } static int ioMethodRecover(struct raft_io *io, const struct raft_configuration *conf) { /* TODO: implement this API */ (void)io; (void)conf; return RAFT_IOERR; } static int ioMethodSetTerm(struct raft_io *raft_io, const raft_term term) { struct io *io = raft_io->impl; if (faultTick(&io->term_fault_countdown)) { return RAFT_IOERR; } io->term = term; io->voted_for = 0; return 0; } static int ioMethodSetVote(struct raft_io *raft_io, const raft_id server_id) { struct io *io = raft_io->impl; if (faultTick(&io->vote_fault_countdown)) { return RAFT_IOERR; } io->voted_for = server_id; return 0; } static int ioMethodAppend(struct raft_io *raft_io, struct raft_io_append *req, const struct raft_entry entries[], unsigned n, raft_io_append_cb cb) { struct io *io = raft_io->impl; struct append *r; r = raft_malloc(sizeof *r); assert(r != NULL); r->type = APPEND; r->completion_time = *io->time + io->disk_latency; r->req = req; r->entries = entries; r->n = n; req->cb = cb; queue_insert_tail(&io->requests, &r->queue); return 0; } static int ioMethodTruncate(struct raft_io *raft_io, raft_index index) { struct io *io = raft_io->impl; size_t n; n = (size_t)(index - 1); /* Number of entries left after truncation */ if (n > 0) { struct raft_entry *entries; /* Create a new array of entries holding the non-truncated * entries */ entries = raft_malloc(n * sizeof *entries); if (entries == NULL) { return RAFT_NOMEM; } memcpy(entries, io->entries, n * sizeof *io->entries); /* Release any truncated entry */ if (io->entries != NULL) { size_t i; for (i = n; i < io->n; i++) { raft_free(io->entries[i].buf.base); } raft_free(io->entries); } io->entries = entries; } else { /* Release everything we have */ if (io->entries != NULL) { size_t i; for (i = 0; i < io->n; i++) { raft_free(io->entries[i].buf.base); } raft_free(io->entries); io->entries = NULL; } } io->n = n; return 0; } static int ioMethodSnapshotPut(struct raft_io *raft_io, unsigned trailing, struct raft_io_snapshot_put *req, const struct raft_snapshot *snapshot, raft_io_snapshot_put_cb cb) { struct io *io = raft_io->impl; struct snapshot_put *r; r = raft_malloc(sizeof *r); assert(r != NULL); r->type = SNAPSHOT_PUT; r->req = req; r->req->cb = cb; r->snapshot = snapshot; r->completion_time = *io->time + io->disk_latency; r->trailing = trailing; queue_insert_tail(&io->requests, &r->queue); return 0; } static int ioMethodAsyncWork(struct raft_io *raft_io, struct raft_io_async_work *req, raft_io_async_work_cb cb) { struct io *io = raft_io->impl; struct async_work *r; r = raft_malloc(sizeof *r); assert(r != NULL); r->type = ASYNC_WORK; r->req = req; r->req->cb = cb; r->completion_time = *io->time + io->work_duration; queue_insert_tail(&io->requests, &r->queue); return 0; } static int ioMethodSnapshotGet(struct raft_io *raft_io, struct raft_io_snapshot_get *req, raft_io_snapshot_get_cb cb) { struct io *io = raft_io->impl; struct snapshot_get *r; r = raft_malloc(sizeof *r); assert(r != NULL); r->type = SNAPSHOT_GET; r->req = req; r->req->cb = cb; r->completion_time = *io->time + io->disk_latency; queue_insert_tail(&io->requests, &r->queue); return 0; } static raft_time ioMethodTime(struct raft_io *raft_io) { struct io *io = raft_io->impl; return 
*io->time; } static int ioMethodRandom(struct raft_io *raft_io, int min, int max) { struct io *io = raft_io->impl; int t = (int)io->randomized_election_timeout; if (t < min) { return min; } else if (t > max) { return max; } else { return t; } } /* Queue up a request which will be processed later, when its completion * time is reached. */ static int ioMethodSend(struct raft_io *raft_io, struct raft_io_send *req, const struct raft_message *message, raft_io_send_cb cb) { struct io *io = raft_io->impl; struct send *r; struct peer *peer; if (faultTick(&io->send_fault_countdown)) { return RAFT_IOERR; } r = raft_malloc(sizeof *r); assert(r != NULL); r->type = SEND; r->req = req; r->message = *message; r->req->cb = cb; peer = ioGetPeer(io, message->server_id); r->completion_time = *io->time + peer->send_latency; queue_insert_tail(&io->requests, &r->queue); return 0; } static void ioReceive(struct io *io, struct raft_message *message) { io->recv_cb(io->io, message); io->n_recv[message->type]++; } static void ioDeliverTransmit(struct io *io, struct transmit *transmit) { struct raft_message *message = &transmit->message; struct peer *peer; /* Destination peer */ /* If this message type is in the drop list, let's discard it */ if (io->drop[message->type - 1]) { ioDestroyTransmit(transmit); return; } peer = ioGetPeer(io, message->server_id); /* If we don't have any peer with this ID, or it's disconnected, or the * connection is saturated, let's drop the message */ if (peer == NULL || !peer->connected || peer->saturated) { ioDestroyTransmit(transmit); return; } /* Update the message object with our details. */ message->server_id = io->id; message->server_address = io->address; ioReceive(peer->io, message); raft_free(transmit); } /* Connect @raft_io to @other, enabling delivery of messages sent from @raft_io * to @other. */ static void ioConnect(struct raft_io *raft_io, struct raft_io *other) { struct io *io = raft_io->impl; struct io *io_other = other->impl; assert(io->n_peers < MAX_PEERS); io->peers[io->n_peers].io = io_other; io->peers[io->n_peers].connected = true; io->peers[io->n_peers].saturated = false; io->peers[io->n_peers].send_latency = SEND_LATENCY; io->n_peers++; } /* Return whether the connection with the given peer is saturated. */ static bool ioSaturated(struct raft_io *raft_io, struct raft_io *other) { struct io *io = raft_io->impl; struct io *io_other = other->impl; struct peer *peer; peer = ioGetPeer(io, io_other->id); return peer != NULL && peer->saturated; } /* Disconnect @raft_io and @other, causing calls to @raft_io->send() to fail * asynchronously when sending messages to @other. */ static void ioDisconnect(struct raft_io *raft_io, struct raft_io *other) { struct io *io = raft_io->impl; struct io *io_other = other->impl; struct peer *peer; peer = ioGetPeer(io, io_other->id); assert(peer != NULL); peer->connected = false; } /* Reconnect @raft_io and @other. */ static void ioReconnect(struct raft_io *raft_io, struct raft_io *other) { struct io *io = raft_io->impl; struct io *io_other = other->impl; struct peer *peer; peer = ioGetPeer(io, io_other->id); assert(peer != NULL); peer->connected = true; } /* Saturate the connection from @io to @other, causing messages sent from @io to * @other to be dropped.
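 *
 * In tests this is normally reached through the public fixture API. A
 * sketch (editor's illustration, assuming the raft_fixture_saturate()
 * helper declared in raft.h):
 *
 *   raft_fixture_saturate(&f, 0, 1);
 *   assert(raft_fixture_saturated(&f, 0, 1));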
*/ static void ioSaturate(struct raft_io *io, struct raft_io *other) { struct io *s; struct io *s_other; struct peer *peer; s = io->impl; s_other = other->impl; peer = ioGetPeer(s, s_other->id); assert(peer != NULL && peer->connected); peer->saturated = true; } /* Desaturate the connection from @raft_io to @other, re-enabling delivery of * messages sent from @raft_io to @other. */ static void ioDesaturate(struct raft_io *raft_io, struct raft_io *other) { struct io *io = raft_io->impl; struct io *io_other = other->impl; struct peer *peer; peer = ioGetPeer(io, io_other->id); assert(peer != NULL && peer->connected); peer->saturated = false; } /* Enable or disable silently dropping all outgoing messages of type @type. */ void ioDrop(struct io *io, int type, bool flag) { io->drop[type - 1] = flag; } static int ioInit(struct raft_io *raft_io, unsigned index, raft_time *time) { struct io *io; io = raft_malloc(sizeof *io); assert(io != NULL); io->io = raft_io; io->index = index; io->time = time; io->term = 0; io->voted_for = 0; io->snapshot = NULL; io->entries = NULL; io->n = 0; queue_init(&io->requests); io->n_peers = 0; io->randomized_election_timeout = ELECTION_TIMEOUT + index * 100; io->network_latency = NETWORK_LATENCY; io->disk_latency = DISK_LATENCY; io->work_duration = WORK_DURATION; io->append_fault_countdown = -1; io->vote_fault_countdown = -1; io->term_fault_countdown = -1; io->send_fault_countdown = -1; memset(io->drop, 0, sizeof io->drop); memset(io->n_send, 0, sizeof io->n_send); memset(io->n_recv, 0, sizeof io->n_recv); io->n_append = 0; raft_io->impl = io; raft_io->version = 2; raft_io->init = ioMethodInit; raft_io->close = ioMethodClose; raft_io->start = ioMethodStart; raft_io->load = ioMethodLoad; raft_io->bootstrap = ioMethodBootstrap; raft_io->recover = ioMethodRecover; raft_io->set_term = ioMethodSetTerm; raft_io->set_vote = ioMethodSetVote; raft_io->append = ioMethodAppend; raft_io->truncate = ioMethodTruncate; raft_io->send = ioMethodSend; raft_io->snapshot_put = ioMethodSnapshotPut; raft_io->async_work = ioMethodAsyncWork; raft_io->snapshot_get = ioMethodSnapshotGet; raft_io->time = ioMethodTime; raft_io->random = ioMethodRandom; return 0; } /* Release all memory held by the given stub I/O implementation. */ void ioClose(struct raft_io *raft_io) { struct io *io = raft_io->impl; size_t i; for (i = 0; i < io->n; i++) { struct raft_entry *entry = &io->entries[i]; raft_free(entry->buf.base); } if (io->entries != NULL) { raft_free(io->entries); } if (io->snapshot != NULL) { snapshotClose(io->snapshot); raft_free(io->snapshot); } raft_free(io); } /* Custom emit tracer function which includes the server ID.
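 *
 * A line traced by server 1 from election.c:123 would look roughly like
 * this (editor's illustration):
 *
 *   1:                     election.c:123 - vote granted to 2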
*/ static void emit(struct raft_tracer *t, const char *file, unsigned int line, const char *func, unsigned int level, const char *message) { unsigned id = *(unsigned *)t->impl; (void)func; (void)level; fprintf(stderr, "%d: %30s:%*d - %s\n", id, file, 3, line, message); } static int serverInit(struct raft_fixture *f, unsigned i, struct raft_fsm *fsm) { int rv; struct raft_fixture_server *s; s = raft_malloc(sizeof(*s)); if (s == NULL) { return RAFT_NOMEM; } f->servers[i] = s; s->alive = true; s->id = i + 1; sprintf(s->address, "%llu", s->id); rv = ioInit(&s->io, i, &f->time); if (rv != 0) { return rv; } rv = raft_init(&s->raft, &s->io, fsm, s->id, s->address); if (rv != 0) { return rv; } raft_set_election_timeout(&s->raft, ELECTION_TIMEOUT); raft_set_heartbeat_timeout(&s->raft, HEARTBEAT_TIMEOUT); raft_set_install_snapshot_timeout(&s->raft, INSTALL_SNAPSHOT_TIMEOUT); s->tracer.impl = (void *)&s->id; s->tracer.emit = emit; s->raft.tracer = NULL; return 0; } static void serverClose(struct raft_fixture_server *s) { raft_close(&s->raft, NULL); ioClose(&s->io); raft_free(s); } /* Connect the server with the given index to all others */ static void serverConnectToAll(struct raft_fixture *f, unsigned i) { unsigned j; for (j = 0; j < f->n; j++) { struct raft_io *io1 = &f->servers[i]->io; struct raft_io *io2 = &f->servers[j]->io; if (i == j) { continue; } ioConnect(io1, io2); } } int raft_fixture_init(struct raft_fixture *f) { f->time = 0; f->n = 0; f->log = logInit(); if (f->log == NULL) { return RAFT_NOMEM; } f->commit_index = 0; f->hook = NULL; f->event = raft_malloc(sizeof(*f->event)); if (f->event == NULL) { return RAFT_NOMEM; } return 0; } void raft_fixture_close(struct raft_fixture *f) { unsigned i; for (i = 0; i < f->n; i++) { struct io *io = f->servers[i]->io.impl; ioFlushAll(io); } for (i = 0; i < f->n; i++) { serverClose(f->servers[i]); } raft_free(f->event); logClose(f->log); } int raft_fixture_configuration(struct raft_fixture *f, unsigned n_voting, struct raft_configuration *configuration) { unsigned i; assert(f->n > 0); assert(n_voting > 0); assert(n_voting <= f->n); raft_configuration_init(configuration); for (i = 0; i < f->n; i++) { struct raft_fixture_server *s; int role = i < n_voting ? 
RAFT_VOTER : RAFT_STANDBY; int rv; s = f->servers[i]; rv = raft_configuration_add(configuration, s->id, s->address, role); if (rv != 0) { return rv; } } return 0; } int raft_fixture_bootstrap(struct raft_fixture *f, struct raft_configuration *configuration) { unsigned i; for (i = 0; i < f->n; i++) { struct raft *raft = raft_fixture_get(f, i); int rv; rv = raft_bootstrap(raft, configuration); if (rv != 0) { return rv; } } return 0; } int raft_fixture_start(struct raft_fixture *f) { unsigned i; int rv; for (i = 0; i < f->n; i++) { struct raft_fixture_server *s = f->servers[i]; rv = raft_start(&s->raft); if (rv != 0) { return rv; } } return 0; } unsigned raft_fixture_n(struct raft_fixture *f) { return f->n; } raft_time raft_fixture_time(struct raft_fixture *f) { return f->time; } struct raft *raft_fixture_get(struct raft_fixture *f, unsigned i) { assert(i < f->n); return &f->servers[i]->raft; } bool raft_fixture_alive(struct raft_fixture *f, unsigned i) { assert(i < f->n); return f->servers[i]->alive; } unsigned raft_fixture_leader_index(struct raft_fixture *f) { if (f->leader_id != 0) { return (unsigned)(f->leader_id - 1); } return f->n; } raft_id raft_fixture_voted_for(struct raft_fixture *f, unsigned i) { struct io *io = f->servers[i]->io.impl; return io->voted_for; } /* Update the leader and check for election safety. * * From figure 3.2: * * Election Safety -> At most one leader can be elected in a given * term. * * Return true if the current leader turns out to be different from the one at * the time this function was called. */ static bool updateLeaderAndCheckElectionSafety(struct raft_fixture *f) { raft_id leader_id = 0; unsigned leader_i = 0; raft_term leader_term = 0; unsigned i; bool changed; for (i = 0; i < f->n; i++) { struct raft *raft = raft_fixture_get(f, i); unsigned j; /* If the server is not alive or is not the leader, skip to the * next server. */ if (!raft_fixture_alive(f, i) || raft_state(raft) != RAFT_LEADER) { continue; } /* Check that no other server is leader for this term. */ for (j = 0; j < f->n; j++) { struct raft *other = raft_fixture_get(f, j); if (other->id == raft->id || other->state != RAFT_LEADER) { continue; } if (other->current_term == raft->current_term) { fprintf(stderr, "server %llu and %llu are both leaders " "in term %llu", raft->id, other->id, raft->current_term); abort(); } } if (raft->current_term > leader_term) { leader_id = raft->id; leader_i = i; leader_term = raft->current_term; } } /* Check that the leader is stable, in the sense that it has been * acknowledged by all alive servers connected to it, and those servers * together with the leader form a majority. */ if (leader_id != 0) { unsigned n_acks = 0; bool acked = true; unsigned n_quorum = 0; for (i = 0; i < f->n; i++) { struct raft *raft = raft_fixture_get(f, i); const struct raft_server *server = configurationGet(&raft->configuration, raft->id); /* If the server is not in the configuration or is idle, * then don't count it. */ if (server == NULL || server->role == RAFT_SPARE) { continue; } n_quorum++; /* If this server is itself the leader, or it's not * alive or it's not connected to the leader, then don't * count it in for stability. 
*/ if (i == leader_i || !raft_fixture_alive(f, i) || raft_fixture_saturated(f, leader_i, i)) { continue; } if (raft->current_term != leader_term) { acked = false; break; } if (raft->state != RAFT_FOLLOWER) { acked = false; break; } if (raft->follower_state.current_leader.id == 0) { acked = false; break; } if (raft->follower_state.current_leader.id != leader_id) { acked = false; break; } n_acks++; } if (!acked || n_acks < (n_quorum / 2)) { leader_id = 0; } } changed = leader_id != f->leader_id; f->leader_id = leader_id; return changed; }
/* Check for leader append-only. * * From figure 3.2: * * Leader Append-Only -> A leader never overwrites or deletes entries in its * own log; it only appends new entries. */ static void checkLeaderAppendOnly(struct raft_fixture *f) { struct raft *raft; raft_index index; raft_index last = logLastIndex(f->log); /* If the cached log is empty it means there was no leader before. */ if (last == 0) { return; } /* If there's no new leader, just return. */ if (f->leader_id == 0) { return; } raft = raft_fixture_get(f, (unsigned)f->leader_id - 1); last = logLastIndex(f->log); for (index = 1; index <= last; index++) { const struct raft_entry *entry1; const struct raft_entry *entry2; size_t i; entry1 = logGet(f->log, index); entry2 = logGet(raft->log, index); assert(entry1 != NULL); /* Check if the entry was snapshotted. */ if (entry2 == NULL) { assert(raft->log->snapshot.last_index >= index); continue; } /* Entry was not overwritten. */ assert(entry1->type == entry2->type); assert(entry1->term == entry2->term); for (i = 0; i < entry1->buf.len; i++) { assert(((uint8_t *)entry1->buf.base)[i] == ((uint8_t *)entry2->buf.base)[i]); } } }
/* Make a copy of the current leader log, in order to perform the Leader * Append-Only check at the next iteration. */ static void copyLeaderLog(struct raft_fixture *f) { struct raft *raft = raft_fixture_get(f, (unsigned)f->leader_id - 1); struct raft_entry *entries; unsigned n; size_t i; int rv; logClose(f->log); f->log = logInit(); if (f->log == NULL) { assert(false); return; } rv = logAcquire(raft->log, 1, &entries, &n); assert(rv == 0); for (i = 0; i < n; i++) { struct raft_entry *entry = &entries[i]; struct raft_buffer buf; buf.len = entry->buf.len; buf.base = raft_malloc(buf.len); assert(buf.base != NULL); memcpy(buf.base, entry->buf.base, buf.len); /* FIXME(cole) what to do here for is_local? */ rv = logAppend(f->log, entry->term, entry->type, buf, (struct raft_entry_local_data){}, false, NULL); assert(rv == 0); } logRelease(raft->log, 1, entries, n); }
/* Update the commit index to match the one from the current leader. */ static void updateCommitIndex(struct raft_fixture *f) { struct raft *raft = raft_fixture_get(f, (unsigned)f->leader_id - 1); if (raft->commit_index > f->commit_index) { f->commit_index = raft->commit_index; } }
/* Return the lowest tick time across all servers, along with the associated * server index */ static void getLowestTickTime(struct raft_fixture *f, raft_time *t, unsigned *i) { unsigned j; *t = (raft_time)-1 /* Maximum value */; for (j = 0; j < f->n; j++) { struct io *io = f->servers[j]->io.impl; if (io->next_tick < *t) { *t = io->next_tick; *i = j; } } }
/* Return the completion time of the request with the lowest completion time * across all servers, along with the associated server index.
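 *
 * Together with getLowestTickTime() this drives the fixture's
 * discrete-event loop: raft_fixture_step() jumps the simulated clock
 * straight to the earlier of the two candidate times. The selection rule,
 * as implemented below (repeated here only to illustrate the loop):
 *
 *   getLowestTickTime(f, &tick_time, &i);
 *   getLowestRequestCompletionTime(f, &completion_time, &j);
 *   if (tick_time < completion_time ||
 *       (tick_time == completion_time && i <= j)) {
 *           fireTick(f, i);                         // a clock tick is due first
 *   } else {
 *           completeRequest(f, j, completion_time); // an I/O completion is due
 *   }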
*/ static void getLowestRequestCompletionTime(struct raft_fixture *f, raft_time *t, unsigned *i) { unsigned j; *t = (raft_time)-1 /* Maximum value */; for (j = 0; j < f->n; j++) { struct io *io = f->servers[j]->io.impl; queue *head; QUEUE_FOREACH(head, &io->requests) { struct ioRequest *r = QUEUE_DATA(head, struct ioRequest, queue); if (r->completion_time < *t) { *t = r->completion_time; *i = j; } } } }
/* Fire the tick callback of the i'th server. */ static void fireTick(struct raft_fixture *f, unsigned i) { struct io *io = f->servers[i]->io.impl; f->time = io->next_tick; f->event->server_index = i; f->event->type = RAFT_FIXTURE_TICK; io->next_tick += io->tick_interval; if (f->servers[i]->alive) { io->tick_cb(io->io); } }
/* Complete the first request with completion time @t on the @i'th server. */ static void completeRequest(struct raft_fixture *f, unsigned i, raft_time t) { struct io *io = f->servers[i]->io.impl; queue *head; struct ioRequest *r = NULL; bool found = false; f->time = t; f->event->server_index = i; QUEUE_FOREACH(head, &io->requests) { r = QUEUE_DATA(head, struct ioRequest, queue); if (r->completion_time == t) { found = true; break; } } assert(found); queue_remove(head); switch (r->type) { case APPEND: ioFlushAppend(io, (struct append *)r); f->event->type = RAFT_FIXTURE_DISK; break; case SEND: ioFlushSend(io, (struct send *)r); f->event->type = RAFT_FIXTURE_NETWORK; break; case TRANSMIT: ioDeliverTransmit(io, (struct transmit *)r); f->event->type = RAFT_FIXTURE_NETWORK; break; case SNAPSHOT_PUT: ioFlushSnapshotPut(io, (struct snapshot_put *)r); f->event->type = RAFT_FIXTURE_DISK; break; case SNAPSHOT_GET: ioFlushSnapshotGet(io, (struct snapshot_get *)r); f->event->type = RAFT_FIXTURE_DISK; break; case ASYNC_WORK: ioFlushAsyncWork(io, (struct async_work *)r); f->event->type = RAFT_FIXTURE_WORK; break; default: assert(0); } }
struct raft_fixture_event *raft_fixture_step(struct raft_fixture *f) { raft_time tick_time; raft_time completion_time; unsigned i = f->n; unsigned j = f->n; getLowestTickTime(f, &tick_time, &i); getLowestRequestCompletionTime(f, &completion_time, &j); assert(i < f->n || j < f->n); if (tick_time < completion_time || (tick_time == completion_time && i <= j)) { fireTick(f, i); } else { completeRequest(f, j, completion_time); } /* If the leader has not changed check the Leader Append-Only * guarantee. */ if (!updateLeaderAndCheckElectionSafety(f)) { checkLeaderAppendOnly(f); } /* If we have a leader, update leader-related state. */ if (f->leader_id != 0) { copyLeaderLog(f); updateCommitIndex(f); } if (f->hook != NULL) { f->hook(f, f->event); } return f->event; }
struct raft_fixture_event *raft_fixture_step_n(struct raft_fixture *f, unsigned n) { unsigned i; assert(n > 0); for (i = 0; i < n - 1; i++) { raft_fixture_step(f); } return raft_fixture_step(f); }
bool raft_fixture_step_until(struct raft_fixture *f, bool (*stop)(struct raft_fixture *f, void *arg), void *arg, unsigned max_msecs) { raft_time start = f->time; while (!stop(f, arg) && (f->time - start) < max_msecs) { raft_fixture_step(f); } return f->time - start < max_msecs; }
/* A step function which always returns false, forcing raft_fixture_step_until * to advance time at each iteration.
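 *
 * Test code can plug arbitrary predicates into raft_fixture_step_until() in
 * the same way. A usage sketch (hasCommitIndex is hypothetical, not part of
 * this file):
 *
 *   static bool hasCommitIndex(struct raft_fixture *f, void *arg)
 *   {
 *           raft_index *index = arg;
 *           return f->commit_index >= *index;
 *   }
 *
 *   raft_index target = 3;
 *   bool reached = raft_fixture_step_until(f, hasCommitIndex, &target, 5000);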
*/ static bool spin(struct raft_fixture *f, void *arg) { (void)f; (void)arg; return false; }
void raft_fixture_step_until_elapsed(struct raft_fixture *f, unsigned msecs) { raft_fixture_step_until(f, spin, NULL, msecs); }
static bool hasLeader(struct raft_fixture *f, void *arg) { (void)arg; return f->leader_id != 0; }
bool raft_fixture_step_until_has_leader(struct raft_fixture *f, unsigned max_msecs) { return raft_fixture_step_until(f, hasLeader, NULL, max_msecs); }
static bool hasNoLeader(struct raft_fixture *f, void *arg) { (void)arg; return f->leader_id == 0; }
bool raft_fixture_step_until_has_no_leader(struct raft_fixture *f, unsigned max_msecs) { return raft_fixture_step_until(f, hasNoLeader, NULL, max_msecs); }
/* Enable/disable dropping outgoing messages of a certain type from all servers * except one. */ static void dropAllExcept(struct raft_fixture *f, int type, bool flag, unsigned i) { unsigned j; for (j = 0; j < f->n; j++) { struct raft_fixture_server *s = f->servers[j]; if (j == i) { continue; } ioDrop(s->io.impl, type, flag); } }
/* Set the randomized election timeout of the given server to the minimum value * compatible with its current state and timers. */ static void minimizeRandomizedElectionTimeout(struct raft_fixture *f, unsigned i) { struct raft *raft = &f->servers[i]->raft; raft_time now = raft->io->time(raft->io); unsigned timeout = raft->election_timeout; assert(raft->state == RAFT_FOLLOWER); /* If the minimum election timeout value would make the timer expire in * the past, cap it. */ if (now - raft->election_timer_start > timeout) { timeout = (unsigned)(now - raft->election_timer_start); } raft->follower_state.randomized_election_timeout = timeout; }
/* Set the randomized election timeout to the maximum value on all servers * except the given one. */ static void maximizeAllRandomizedElectionTimeoutsExcept(struct raft_fixture *f, unsigned i) { unsigned j; for (j = 0; j < f->n; j++) { struct raft *raft = &f->servers[j]->raft; unsigned timeout = raft->election_timeout * 2; if (j == i) { continue; } assert(raft->state == RAFT_FOLLOWER); raft->follower_state.randomized_election_timeout = timeout; } }
void raft_fixture_hook(struct raft_fixture *f, raft_fixture_event_cb hook) { f->hook = hook; }
void raft_fixture_start_elect(struct raft_fixture *f, unsigned i) { struct raft *raft = raft_fixture_get(f, i); unsigned j; /* Make sure there's currently no leader. */ assert(f->leader_id == 0); /* Make sure that the given server is voting. */ assert(configurationGet(&raft->configuration, raft->id)->role == RAFT_VOTER); /* Make sure all servers are currently followers. */ for (j = 0; j < f->n; j++) { assert(raft_state(&f->servers[j]->raft) == RAFT_FOLLOWER); } /* Pretend that the last randomized election timeout was set at the * maximum value on all servers except the one to be elected, which is * instead set to the minimum possible value compatible with its current * state. */ minimizeRandomizedElectionTimeout(f, i); maximizeAllRandomizedElectionTimeoutsExcept(f, i); }
void raft_fixture_elect(struct raft_fixture *f, unsigned i) { struct raft *raft = raft_fixture_get(f, i); raft_fixture_start_elect(f, i); raft_fixture_step_until_has_leader(f, ELECTION_TIMEOUT * 20); assert(f->leader_id == raft->id); }
void raft_fixture_depose(struct raft_fixture *f) { unsigned leader_i; /* Make sure there's a leader.
*/ assert(f->leader_id != 0); leader_i = (unsigned)f->leader_id - 1; assert(raft_state(&f->servers[leader_i]->raft) == RAFT_LEADER); /* Set a very large election timeout on all followers, to prevent them * from starting an election. */ maximizeAllRandomizedElectionTimeoutsExcept(f, leader_i); /* Prevent all servers from sending append entries results, so the * leader will eventually step down. */ dropAllExcept(f, RAFT_IO_APPEND_ENTRIES_RESULT, true, leader_i); raft_fixture_step_until_has_no_leader(f, ELECTION_TIMEOUT * 3); assert(f->leader_id == 0); dropAllExcept(f, RAFT_IO_APPEND_ENTRIES_RESULT, false, leader_i); } struct step_apply { unsigned i; raft_index index; }; static bool hasAppliedIndex(struct raft_fixture *f, void *arg) { struct step_apply *apply = (struct step_apply *)arg; struct raft *raft; unsigned n = 0; unsigned i; if (apply->i < f->n) { raft = raft_fixture_get(f, apply->i); return raft_last_applied(raft) >= apply->index; } for (i = 0; i < f->n; i++) { raft = raft_fixture_get(f, i); if (raft_last_applied(raft) >= apply->index) { n++; } } return n == f->n; } bool raft_fixture_step_until_applied(struct raft_fixture *f, unsigned i, raft_index index, unsigned max_msecs) { struct step_apply apply = {i, index}; return raft_fixture_step_until(f, hasAppliedIndex, &apply, max_msecs); } struct step_state { unsigned i; int state; }; static bool hasState(struct raft_fixture *f, void *arg) { struct step_state *target = (struct step_state *)arg; struct raft *raft; raft = raft_fixture_get(f, target->i); return raft_state(raft) == target->state; } bool raft_fixture_step_until_state_is(struct raft_fixture *f, unsigned i, int state, unsigned max_msecs) { struct step_state target = {i, state}; return raft_fixture_step_until(f, hasState, &target, max_msecs); } struct step_term { unsigned i; raft_term term; }; static bool hasTerm(struct raft_fixture *f, void *arg) { struct step_term *target = (struct step_term *)arg; struct raft *raft; raft = raft_fixture_get(f, target->i); return raft->current_term == target->term; } bool raft_fixture_step_until_term_is(struct raft_fixture *f, unsigned i, raft_term term, unsigned max_msecs) { struct step_term target = {i, term}; return raft_fixture_step_until(f, hasTerm, &target, max_msecs); } struct step_vote { unsigned i; unsigned j; }; static bool hasVotedFor(struct raft_fixture *f, void *arg) { struct step_vote *target = (struct step_vote *)arg; struct raft *raft; raft = raft_fixture_get(f, target->i); return raft->voted_for == target->j + 1; } bool raft_fixture_step_until_voted_for(struct raft_fixture *f, unsigned i, unsigned j, unsigned max_msecs) { struct step_vote target = {i, j}; return raft_fixture_step_until(f, hasVotedFor, &target, max_msecs); } struct step_deliver { unsigned i; unsigned j; }; static bool hasDelivered(struct raft_fixture *f, void *arg) { struct step_deliver *target = (struct step_deliver *)arg; struct raft *raft; struct io *io; struct raft_message *message; queue *head; raft = raft_fixture_get(f, target->i); io = raft->io->impl; QUEUE_FOREACH(head, &io->requests) { struct ioRequest *r; r = QUEUE_DATA(head, struct ioRequest, queue); message = NULL; switch (r->type) { case SEND: message = &((struct send *)r)->message; break; case TRANSMIT: message = &((struct transmit *)r)->message; break; } if (message != NULL && message->server_id == target->j + 1) { return false; } } return true; } bool raft_fixture_step_until_delivered(struct raft_fixture *f, unsigned i, unsigned j, unsigned max_msecs) { struct step_deliver target = {i, j}; return 
raft_fixture_step_until(f, hasDelivered, &target, max_msecs); } void raft_fixture_disconnect(struct raft_fixture *f, unsigned i, unsigned j) { struct raft_io *io1 = &f->servers[i]->io; struct raft_io *io2 = &f->servers[j]->io; ioDisconnect(io1, io2); } void raft_fixture_reconnect(struct raft_fixture *f, unsigned i, unsigned j) { struct raft_io *io1 = &f->servers[i]->io; struct raft_io *io2 = &f->servers[j]->io; ioReconnect(io1, io2); } void raft_fixture_saturate(struct raft_fixture *f, unsigned i, unsigned j) { struct raft_io *io1 = &f->servers[i]->io; struct raft_io *io2 = &f->servers[j]->io; ioSaturate(io1, io2); } static void disconnectFromAll(struct raft_fixture *f, unsigned i) { unsigned j; for (j = 0; j < f->n; j++) { if (j == i) { continue; } raft_fixture_saturate(f, i, j); raft_fixture_saturate(f, j, i); } } static void reconnectToAll(struct raft_fixture *f, unsigned i) { unsigned j; for (j = 0; j < f->n; j++) { if (j == i) { continue; } /* Don't reconnect to disconnected peers */ if (!f->servers[j]->alive) { continue; } raft_fixture_desaturate(f, i, j); raft_fixture_desaturate(f, j, i); } } bool raft_fixture_saturated(struct raft_fixture *f, unsigned i, unsigned j) { struct raft_io *io1 = &f->servers[i]->io; struct raft_io *io2 = &f->servers[j]->io; return ioSaturated(io1, io2); } void raft_fixture_desaturate(struct raft_fixture *f, unsigned i, unsigned j) { struct raft_io *io1 = &f->servers[i]->io; struct raft_io *io2 = &f->servers[j]->io; ioDesaturate(io1, io2); } void raft_fixture_kill(struct raft_fixture *f, unsigned i) { disconnectFromAll(f, i); f->servers[i]->alive = false; } void raft_fixture_revive(struct raft_fixture *f, unsigned i) { reconnectToAll(f, i); f->servers[i]->alive = true; } int raft_fixture_grow(struct raft_fixture *f, struct raft_fsm *fsm) { unsigned i; unsigned j; int rc; i = f->n; f->n++; rc = serverInit(f, i, fsm); if (rc != 0) { return rc; } serverConnectToAll(f, i); for (j = 0; j < f->n; j++) { struct raft_io *io1 = &f->servers[i]->io; struct raft_io *io2 = &f->servers[j]->io; ioConnect(io2, io1); } return 0; } void raft_fixture_set_randomized_election_timeout(struct raft_fixture *f, unsigned i, unsigned msecs) { struct io *io = f->servers[i]->io.impl; io->randomized_election_timeout = msecs; } void raft_fixture_set_network_latency(struct raft_fixture *f, unsigned i, unsigned msecs) { struct io *io = f->servers[i]->io.impl; io->network_latency = msecs; } void raft_fixture_set_disk_latency(struct raft_fixture *f, unsigned i, unsigned msecs) { struct io *io = f->servers[i]->io.impl; io->disk_latency = msecs; } void raft_fixture_set_send_latency(struct raft_fixture *f, unsigned i, unsigned j, unsigned msecs) { struct io *io = f->servers[i]->io.impl; struct peer *peer = ioGetPeer(io, f->servers[j]->id); peer->send_latency = msecs; } void raft_fixture_set_term(struct raft_fixture *f, unsigned i, raft_term term) { struct io *io = f->servers[i]->io.impl; io->term = term; } void raft_fixture_set_snapshot(struct raft_fixture *f, unsigned i, struct raft_snapshot *snapshot) { struct io *io = f->servers[i]->io.impl; io->snapshot = snapshot; } void raft_fixture_add_entry(struct raft_fixture *f, unsigned i, struct raft_entry *entry) { struct io *io = f->servers[i]->io.impl; struct raft_entry *entries; entries = raft_realloc(io->entries, (io->n + 1) * sizeof *entries); assert(entries != NULL); entries[io->n] = *entry; io->entries = entries; io->n++; } void raft_fixture_append_fault(struct raft_fixture *f, unsigned i, int delay) { struct io *io = 
f->servers[i]->io.impl; io->append_fault_countdown = delay; } void raft_fixture_vote_fault(struct raft_fixture *f, unsigned i, int delay) { struct io *io = f->servers[i]->io.impl; io->vote_fault_countdown = delay; } void raft_fixture_term_fault(struct raft_fixture *f, unsigned i, int delay) { struct io *io = f->servers[i]->io.impl; io->term_fault_countdown = delay; } void raft_fixture_send_fault(struct raft_fixture *f, unsigned i, int delay) { struct io *io = f->servers[i]->io.impl; io->send_fault_countdown = delay; } unsigned raft_fixture_n_send(struct raft_fixture *f, unsigned i, int type) { struct io *io = f->servers[i]->io.impl; return io->n_send[type]; } unsigned raft_fixture_n_recv(struct raft_fixture *f, unsigned i, int type) { struct io *io = f->servers[i]->io.impl; return io->n_recv[type]; } void raft_fixture_make_unavailable(struct raft_fixture *f, unsigned i) { struct raft *r = &f->servers[i]->raft; convertToUnavailable(r); } dqlite-1.16.7/src/raft/flags.c000066400000000000000000000004311465252713400160620ustar00rootroot00000000000000#include "flags.h" inline raft_flags flagsSet(raft_flags in, raft_flags flags) { return in | flags; } inline raft_flags flagsClear(raft_flags in, raft_flags flags) { return in & (~flags); } inline bool flagsIsSet(raft_flags in, raft_flags flag) { return (bool)(in & flag); } dqlite-1.16.7/src/raft/flags.h000066400000000000000000000011551465252713400160730ustar00rootroot00000000000000#ifndef FLAGS_H_ #define FLAGS_H_ #include "../raft.h" #define RAFT_DEFAULT_FEATURE_FLAGS (0) /* Adds the flags @flags to @in and returns the new flags. Multiple flags should * be combined using the `|` operator. */ raft_flags flagsSet(raft_flags in, raft_flags flags); /* Clears the flags @flags from @in and returns the new flags. Multiple flags * should be combined using the `|` operator. */ raft_flags flagsClear(raft_flags in, raft_flags flags); /* Returns `true` if the single flag @flag is set in @in, otherwise returns * `false`. 
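 *
 * Usage sketch for the three helpers (the two flag constants are invented
 * for the example):
 *
 *   #define FLAG_A ((raft_flags)1 << 0)
 *   #define FLAG_B ((raft_flags)1 << 1)
 *
 *   raft_flags flags = RAFT_DEFAULT_FEATURE_FLAGS;
 *   flags = flagsSet(flags, FLAG_A | FLAG_B); // set both bits
 *   flags = flagsClear(flags, FLAG_B);        // clear one of them
 *   // flagsIsSet(flags, FLAG_A) is now true, flagsIsSet(flags, FLAG_B) false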
*/ bool flagsIsSet(raft_flags in, raft_flags flag); #endif /* FLAGS_H */ dqlite-1.16.7/src/raft/heap.c000066400000000000000000000043211465252713400157060ustar00rootroot00000000000000#include "heap.h" #include <stdlib.h> #include "../raft.h"
static void *defaultMalloc(void *data, size_t size) { (void)data; return malloc(size); }
static void defaultFree(void *data, void *ptr) { (void)data; free(ptr); }
static void *defaultCalloc(void *data, size_t nmemb, size_t size) { (void)data; return calloc(nmemb, size); }
static void *defaultRealloc(void *data, void *ptr, size_t size) { (void)data; return realloc(ptr, size); }
static void *defaultAlignedAlloc(void *data, size_t alignment, size_t size) { (void)data; return aligned_alloc(alignment, size); }
static void defaultAlignedFree(void *data, size_t alignment, void *ptr) { (void)alignment; defaultFree(data, ptr); }
static struct raft_heap defaultHeap = { NULL, /* data */ defaultMalloc, /* malloc */ defaultFree, /* free */ defaultCalloc, /* calloc */ defaultRealloc, /* realloc */ defaultAlignedAlloc, /* aligned_alloc */ defaultAlignedFree /* aligned_free */ };
static struct raft_heap *currentHeap = &defaultHeap;
void *RaftHeapMalloc(size_t size) { return currentHeap->malloc(currentHeap->data, size); }
void RaftHeapFree(void *ptr) { if (ptr == NULL) { return; } currentHeap->free(currentHeap->data, ptr); }
void *RaftHeapCalloc(size_t nmemb, size_t size) { return currentHeap->calloc(currentHeap->data, nmemb, size); }
void *RaftHeapRealloc(void *ptr, size_t size) { return currentHeap->realloc(currentHeap->data, ptr, size); }
void *raft_malloc(size_t size) { return RaftHeapMalloc(size); }
void raft_free(void *ptr) { RaftHeapFree(ptr); }
void *raft_calloc(size_t nmemb, size_t size) { return RaftHeapCalloc(nmemb, size); }
void *raft_realloc(void *ptr, size_t size) { return RaftHeapRealloc(ptr, size); }
void *raft_aligned_alloc(size_t alignment, size_t size) { return currentHeap->aligned_alloc(currentHeap->data, alignment, size); }
void raft_aligned_free(size_t alignment, void *ptr) { currentHeap->aligned_free(currentHeap->data, alignment, ptr); }
void raft_heap_set(struct raft_heap *heap) { currentHeap = heap; }
void raft_heap_set_default(void) { currentHeap = &defaultHeap; }
const struct raft_heap *raft_heap_get(void) { return currentHeap; } dqlite-1.16.7/src/raft/heap.h000066400000000000000000000004131465252713400157100ustar00rootroot00000000000000/* Internal heap APIs.
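 *
 * The raft_malloc()/raft_free() family defined in heap.c dispatches through
 * a process-global struct raft_heap that can be swapped with raft_heap_set().
 * A minimal sketch of a custom heap that counts allocations (countingMalloc
 * and n_allocs are invented for the example; the remaining members would
 * wrap free/calloc/realloc/aligned_alloc/aligned_free analogously):
 *
 *   static size_t n_allocs = 0;
 *   static void *countingMalloc(void *data, size_t size)
 *   {
 *           (void)data;
 *           n_allocs++;
 *           return malloc(size);
 *   }
 *   static struct raft_heap counting = { NULL, countingMalloc, ... };
 *
 *   raft_heap_set(&counting);   // route raft_malloc() through the hooks
 *   raft_heap_set_default();    // restore the default heap when done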
*/ #ifndef HEAP_H_ #define HEAP_H_ #include <stddef.h> void *RaftHeapMalloc(size_t size); void *RaftHeapCalloc(size_t nmemb, size_t size); void *RaftHeapRealloc(void *ptr, size_t size); void RaftHeapFree(void *ptr); #endif /* HEAP_H_ */ dqlite-1.16.7/src/raft/lifecycle.c000066400000000000000000000013701465252713400167300ustar00rootroot00000000000000#include "lifecycle.h" #include "../tracing.h" #include "../lib/queue.h" #include <inttypes.h> #include <stdbool.h> #include <string.h>
static bool reqIdIsSet(const struct request *req) { return req->req_id[15] == (uint8_t)-1; }
static uint64_t extractReqId(const struct request *req) { uint64_t id; memcpy(&id, &req->req_id, sizeof(id)); return id; }
void lifecycleRequestStart(struct raft *r, struct request *req) { if (reqIdIsSet(req)) { tracef("request start id:%" PRIu64, extractReqId(req)); } queue_insert_tail(&r->leader_state.requests, &req->queue); }
void lifecycleRequestEnd(struct raft *r, struct request *req) { (void)r; if (reqIdIsSet(req)) { tracef("request end id:%" PRIu64, extractReqId(req)); } queue_remove(&req->queue); } dqlite-1.16.7/src/raft/lifecycle.h000066400000000000000000000003361465252713400167360ustar00rootroot00000000000000#ifndef LIFECYCLE_H_ #define LIFECYCLE_H_ #include "../raft.h" #include "request.h" void lifecycleRequestStart(struct raft *r, struct request *req); void lifecycleRequestEnd(struct raft *r, struct request *req); #endif dqlite-1.16.7/src/raft/log.c000066400000000000000000000555341465252713400155640ustar00rootroot00000000000000#include "log.h" #include <string.h> #include "../raft.h" #include "assert.h" #include "configuration.h"
/* Calculate the reference count hash table key for the given log entry index in * a hash table of the given size. * * The hash is simply the log entry index minus one modulo the size. This * minimizes conflicts in the most frequent case, where a new log entry is * simply appended to the log and can use the hash table bucket next to the * bucket for the entry with the previous index (possibly resizing the table if * its cap is reached). */ static size_t refsKey(const raft_index index, const size_t size) { assert(index > 0); assert(size > 0); return (size_t)((index - 1) % size); }
/* Try to insert a new reference count item for the given log entry index into * the given reference count hash table. * * A collision happens when the bucket associated with the hash key of the given * log entry index is already used to refcount log entries with a different * index. In that case the collision output parameter will be set to true and no * new reference count item is inserted into the hash table. * * If two log entries have the same index but different terms, the associated * bucket will be grown accordingly. */ static int refsTryInsert(struct raft_entry_ref *table, const size_t size, const raft_term term, const raft_index index, const unsigned short count, struct raft_buffer buf, void *batch, bool *collision) { struct raft_entry_ref *bucket; /* Bucket associated with this index. */ struct raft_entry_ref *next_slot; /* For traversing the bucket slots. */ struct raft_entry_ref *last_slot; /* To track the last traversed slot. */ struct raft_entry_ref *slot; /* Actual slot to use for this entry. */ size_t key; assert(table != NULL); assert(size > 0); assert(term > 0); assert(index > 0); assert(count > 0); assert(collision != NULL); /* Calculate the hash table key for the given index. */ key = refsKey(index, size); bucket = &table[key]; /* If a bucket is empty, then there's no collision and we can fill its * first slot.
*/ if (bucket->count == 0) { assert(bucket->next == NULL); slot = bucket; goto fill; } /* If the bucket is already used to refcount entries with a different * index, then we have a collision and we must abort here. */ if (bucket->index != index) { *collision = true; return 0; } /* If we get here it means that the bucket is in use to refcount one or * more entries with the same index as the given one, but different * terms. * * We must append a newly allocated slot to refcount the entry with this * term. * * So first let's find the last slot in the bucket. */ for (next_slot = bucket; next_slot != NULL; next_slot = next_slot->next) { /* All entries in a bucket must have the same index. */ assert(next_slot->index == index); /* It should never happen that two entries with the same index * and term get appended. So no existing slot in this bucket * must track an entry with the same term as the given one. */ assert(next_slot->term != term); last_slot = next_slot; } /* The last slot must have no next slot. */ assert(last_slot->next == NULL); slot = raft_malloc(sizeof *slot); if (slot == NULL) { return RAFT_NOMEM; } last_slot->next = slot; fill: slot->term = term; slot->index = index; slot->count = count; slot->buf = buf; slot->batch = batch; slot->next = NULL; *collision = false; return 0; } /* Move the slots of the given bucket into the given reference count hash * table. The key of the bucket to use in the given table will be re-calculated * according to the given size. */ static int refsMove(struct raft_entry_ref *bucket, struct raft_entry_ref *table, const size_t size) { struct raft_entry_ref *slot; struct raft_entry_ref *next_slot; assert(bucket != NULL); assert(table != NULL); assert(size > 0); /* Only non-empty buckets should be moved. */ assert(bucket->count > 0); /* For each slot in the bucket, insert the relevant entry in the given * table, then free it. */ next_slot = bucket; while (next_slot != NULL) { bool collision; int rv; slot = next_slot; /* Insert the reference count for this entry into the new table. */ rv = refsTryInsert(table, size, slot->term, slot->index, slot->count, slot->buf, slot->batch, &collision); next_slot = slot->next; /* Unless this is the very first slot in the bucket, we need to * free the slot. */ if (slot != bucket) { raft_free(slot); } if (rv != 0) { return rv; } /* The given hash table is assumed to be large enough to hold * all ref counts without any conflict. */ assert(!collision); }; return 0; } /* Grow the size of the reference count hash table. */ static int refsGrow(struct raft_log *l) { struct raft_entry_ref *table; /* New hash table. */ size_t size; /* Size of the new hash table. */ size_t i; assert(l != NULL); assert(l->refs_size > 0); size = l->refs_size * 2; /* Double the table size */ table = raft_calloc(size, sizeof *table); if (table == NULL) { return RAFT_NOMEM; } /* Populate the new hash table, inserting all entries existing in the * current hash table. Each bucket will have a different key in the new * hash table, since the size has changed. */ for (i = 0; i < l->refs_size; i++) { struct raft_entry_ref *bucket = &l->refs[i]; if (bucket->count > 0) { int rv = refsMove(bucket, table, size); if (rv != 0) { return rv; } } else { /* If the count is zero, we expect that the bucket is * unused. */ assert(bucket->next == NULL); } } raft_free(l->refs); l->refs = table; l->refs_size = size; return 0; } /* Initialize the reference count of the entry with the given index, setting it * to 1. 
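 *
 * (Worked example, numbers invented: with refs_size = 256, the entry at
 * index 1 lands in bucket 0 via refsKey(), index 2 in bucket 1, and so on;
 * index 257 would wrap around to bucket 0 again, at which point
 * refsTryInsert() reports a collision and refsGrow() doubles and re-keys
 * the table.)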
*/ static int refsInit(struct raft_log *l, const raft_term term, const raft_index index, struct raft_buffer buf, void *batch) { int i; assert(l != NULL); assert(term > 0); assert(index > 0); /* Initialize the hash map with a reasonable size */ if (l->refs == NULL) { l->refs_size = LOG__REFS_INITIAL_SIZE; l->refs = raft_calloc(l->refs_size, sizeof *l->refs); if (l->refs == NULL) { return RAFT_NOMEM; } } /* Check if the bucket associated with the given index is available * (i.e. there are no collisions), or grow the table and re-key it * otherwise. * * We limit the number of times we try to grow the table to 10, to avoid * eating up too much memory. In practice, there should never be a case * where this is not enough. */ for (i = 0; i < 10; i++) { bool collision; int rc; rc = refsTryInsert(l->refs, l->refs_size, term, index, 1, buf, batch, &collision); if (rc != 0) { return RAFT_NOMEM; } if (!collision) { return 0; } rc = refsGrow(l); if (rc != 0) { return rc; } }; return RAFT_NOMEM; }
/* Increment the refcount of the entry with the given term and index. */ static void refsIncr(struct raft_log *l, const raft_term term, const raft_index index) { size_t key; /* Hash table key for the given index. */ struct raft_entry_ref *slot; /* Slot for the given term/index */ assert(l != NULL); assert(term > 0); assert(index > 0); key = refsKey(index, l->refs_size); /* Lookup the slot associated with the given term/index, which must have * been previously inserted. */ slot = &l->refs[key]; while (1) { assert(slot != NULL); assert(slot->index == index); if (slot->term == term) { break; } slot = slot->next; } assert(slot != NULL); slot->count++; }
/* Decrement the refcount of the entry with the given index. Return a boolean * indicating whether the entry now has zero references. */ static bool refsDecr(struct raft_log *l, const raft_term term, const raft_index index) { size_t key; /* Hash table key for the given index. */ struct raft_entry_ref *slot; /* Slot for the given term/index */ struct raft_entry_ref *prev_slot; /* Slot preceding the one to decrement */ assert(l != NULL); assert(term > 0); assert(index > 0); key = refsKey(index, l->refs_size); prev_slot = NULL; /* Lookup the slot associated with the given term/index, keeping track * of its previous slot in the bucket list. */ slot = &l->refs[key]; while (1) { assert(slot != NULL); assert(slot->index == index); if (slot->term == term) { break; } prev_slot = slot; slot = slot->next; } slot->count--; if (slot->count > 0) { /* The entry is still referenced. */ return false; } /* If the refcount has dropped to zero, delete the slot. */ if (prev_slot != NULL) { /* This isn't the very first slot, simply unlink it from the * slot list. */ prev_slot->next = slot->next; raft_free(slot); } else if (slot->next != NULL) { /* This is the very first slot, and slot list is not empty. Copy * the second slot into the first one, then delete it. */ struct raft_entry_ref *second_slot = slot->next; *slot = *second_slot; raft_free(second_slot); } return true; }
struct raft_log *logInit(void) { struct raft_log *log; log = raft_malloc(sizeof(*log)); if (log == NULL) { return NULL; } log->entries = NULL; log->size = 0; log->front = log->back = 0; log->offset = 0; log->refs = NULL; log->refs_size = 0; log->snapshot.last_index = 0; log->snapshot.last_term = 0; return log; }
/* Return the index of the i'th entry in the log. */ static raft_index indexAt(struct raft_log *l, size_t i) { return l->offset + i + 1; }
/* Return the circular buffer position of the i'th entry in the log.
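 *
 * (Worked example, values invented: with size = 8, front = 6 and
 * offset = 10, the entry with log index 13 is the entry at i = 2 and lives
 * in slot (6 + 2) % 8 = 0, i.e. the circular buffer has wrapped around.)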
*/ static size_t positionAt(struct raft_log *l, size_t i) { return (l->front + i) % l->size; } /* Return the i'th entry in the log. */ static struct raft_entry *entryAt(struct raft_log *l, size_t i) { return &l->entries[positionAt(l, i)]; } void logClose(struct raft_log *l) { void *batch = NULL; /* Last batch that has been freed */ assert(l != NULL); if (l->entries != NULL) { size_t i; size_t n = logNumEntries(l); for (i = 0; i < n; i++) { struct raft_entry *entry = entryAt(l, i); raft_index index = indexAt(l, i); size_t key = refsKey(index, l->refs_size); struct raft_entry_ref *slot = &l->refs[key]; /* We require that there are no outstanding references * to active entries. */ assert(slot->count == 1); /* TODO: we should support the case where the bucket has * more than one slot. */ assert(slot->next == NULL); /* Release the memory used by the entry data (either * directly or via a batch). */ if (entry->batch == NULL) { if (entry->buf.base != NULL) { raft_free(entry->buf.base); } } else { if (entry->batch != batch) { /* This batch was not released yet, so * let's do it now. */ batch = entry->batch; raft_free(entry->batch); } } } raft_free(l->entries); } if (l->refs != NULL) { raft_free(l->refs); } raft_free(l); } void logStart(struct raft_log *l, raft_index snapshot_index, raft_term snapshot_term, raft_index start_index) { assert(logNumEntries(l) == 0); assert(start_index > 0); assert(start_index <= snapshot_index + 1); assert(snapshot_index == 0 || snapshot_term != 0); l->snapshot.last_index = snapshot_index; l->snapshot.last_term = snapshot_term; l->offset = start_index - 1; } /* Ensure that the entries array has enough free slots for adding a new entry. */ static int ensureCapacity(struct raft_log *l) { struct raft_entry *entries; /* New entries array */ size_t n; /* Current number of entries */ size_t size; /* Size of the new array */ size_t i; n = logNumEntries(l); if (n + 1 < l->size) { return 0; } /* Make the new size twice the current size plus one (for the new * entry). Over-allocating now avoids smaller allocations later. */ size = (l->size + 1) * 2; entries = raft_calloc(size, sizeof *entries); if (entries == NULL) { return RAFT_NOMEM; } /* Copy all active old entries to the beginning of the newly allocated * array. */ for (i = 0; i < n; i++) { memcpy(&entries[i], entryAt(l, i), sizeof *entries); } /* Release the old entries array. 
*/ if (l->entries != NULL) { raft_free(l->entries); } l->entries = entries; l->size = size; l->front = 0; l->back = n; return 0; }
int logReinstate(struct raft_log *l, raft_term term, unsigned short type, bool *reinstated) { raft_index index; size_t key; struct raft_entry_ref *bucket; struct raft_entry_ref *slot; struct raft_entry *entry; int rv; *reinstated = false; if (l->refs_size == 0) { return 0; } index = logLastIndex(l) + 1; key = refsKey(index, l->refs_size); bucket = &l->refs[key]; if (bucket->count == 0 || bucket->index != index) { return 0; } for (slot = bucket; slot != NULL; slot = slot->next) { if (slot->term == term) { rv = ensureCapacity(l); if (rv != 0) { return rv; } slot->count++; /* Fill the slot at the current back position and only * then advance back, mirroring logAppend. */ entry = &l->entries[l->back]; l->back++; l->back %= l->size; entry->term = term; entry->type = type; entry->buf = slot->buf; entry->batch = slot->batch; *reinstated = true; break; } } return 0; }
int logAppend(struct raft_log *l, const raft_term term, const unsigned short type, struct raft_buffer buf, struct raft_entry_local_data local_data, bool is_local, void *batch) { int rv; struct raft_entry *entry; raft_index index; assert(l != NULL); assert(term > 0); assert(type == RAFT_CHANGE || type == RAFT_BARRIER || type == RAFT_COMMAND); rv = ensureCapacity(l); if (rv != 0) { return rv; } index = logLastIndex(l) + 1; rv = refsInit(l, term, index, buf, batch); if (rv != 0) { return rv; } entry = &l->entries[l->back]; entry->term = term; entry->type = type; entry->buf = buf; entry->batch = batch; entry->local_data = local_data; entry->is_local = is_local; l->back += 1; l->back = l->back % l->size; return 0; }
int logAppendCommands(struct raft_log *l, const raft_term term, const struct raft_buffer bufs[], const struct raft_entry_local_data local_data[], const unsigned n) { unsigned i; int rv; assert(l != NULL); assert(term > 0); assert(bufs != NULL); assert(n > 0); for (i = 0; i < n; i++) { struct raft_entry_local_data loc = (local_data != NULL) ? local_data[i] : (struct raft_entry_local_data){}; rv = logAppend(l, term, RAFT_COMMAND, bufs[i], loc, true, NULL); if (rv != 0) { return rv; } } return 0; }
int logAppendConfiguration(struct raft_log *l, const raft_term term, const struct raft_configuration *configuration) { struct raft_buffer buf; int rv; assert(l != NULL); assert(term > 0); assert(configuration != NULL); /* Encode the configuration into a buffer. */ rv = configurationEncode(configuration, &buf); if (rv != 0) { goto err; } /* Append the new entry to the log. */ rv = logAppend(l, term, RAFT_CHANGE, buf, (struct raft_entry_local_data){}, true, NULL); if (rv != 0) { goto err_after_encode; } return 0; err_after_encode: raft_free(buf.base); err: assert(rv != 0); return rv; }
size_t logNumEntries(struct raft_log *l) { assert(l != NULL); /* The circular buffer is not wrapped. */ if (l->front <= l->back) { return l->back - l->front; } /* The circular buffer is wrapped. */ return l->size - l->front + l->back; }
raft_index logLastIndex(struct raft_log *l) { /* If there are no entries in the log, but there is a snapshot available, * check that its last index is consistent with the offset. */ if (logNumEntries(l) == 0 && l->snapshot.last_index != 0) { assert(l->offset <= l->snapshot.last_index); } return l->offset + logNumEntries(l); }
/* Return the position of the entry with the given index in the entries array. * * If no entry with the given index is in the log return the size of the entries * array.
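 *
 * (Continuing the worked example given for positionAt() above, with
 * size = 8, front = 6 and offset = 10: locateEntry(l, 13) computes
 * positionAt(l, (13 - 1) - 10) = (6 + 2) % 8 = 0, while any index outside
 * the live range yields l->size as the "not found" sentinel.)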
*/ static size_t locateEntry(struct raft_log *l, const raft_index index) { size_t n = logNumEntries(l); if (n == 0 || index < indexAt(l, 0) || index > indexAt(l, n - 1)) { return l->size; } /* Get the circular buffer position of the desired entry. Log indexes * start at 1, so we subtract one to get array indexes. We also need to * subtract any index offset this log might start at. */ return positionAt(l, (size_t)((index - 1) - l->offset)); } raft_term logTermOf(struct raft_log *l, const raft_index index) { size_t i; assert(index > 0); assert(l->offset <= l->snapshot.last_index); if ((index < l->offset + 1 && index != l->snapshot.last_index) || index > logLastIndex(l)) { return 0; } if (index == l->snapshot.last_index) { assert(l->snapshot.last_term != 0); /* Coherence check that if we still have the entry at * last_index, its term matches the one in the snapshot. */ i = locateEntry(l, index); if (i != l->size) { assert(l->entries[i].term == l->snapshot.last_term); } return l->snapshot.last_term; } i = locateEntry(l, index); assert(i < l->size); return l->entries[i].term; } raft_index logSnapshotIndex(struct raft_log *l) { return l->snapshot.last_index; } raft_term logLastTerm(struct raft_log *l) { raft_index last_index; last_index = logLastIndex(l); return last_index > 0 ? logTermOf(l, last_index) : 0; } const struct raft_entry *logGet(struct raft_log *l, const raft_index index) { size_t i; assert(l != NULL); /* Get the array index of the desired entry. */ i = locateEntry(l, index); if (i == l->size) { return NULL; } assert(i < l->size); return &l->entries[i]; } int logAcquire(struct raft_log *l, const raft_index index, struct raft_entry *entries[], unsigned *n) { size_t i; size_t j; assert(l != NULL); assert(index > 0); assert(entries != NULL); assert(n != NULL); /* Get the array index of the first entry to acquire. */ i = locateEntry(l, index); if (i == l->size) { *n = 0; *entries = NULL; return 0; } if (i < l->back) { /* The last entry does not wrap with respect to i, so the number * of entries is simply the length of the range [i...l->back). */ *n = (unsigned)(l->back - i); } else { /* The last entry wraps with respect to i, so the number of * entries is the sum of the lengths of the ranges [i...l->size) * and [0...l->back), which is l->size - i + l->back.*/ *n = (unsigned)(l->size - i + l->back); } assert(*n > 0); *entries = raft_calloc(*n, sizeof **entries); if (*entries == NULL) { return RAFT_NOMEM; } for (j = 0; j < *n; j++) { size_t k = (i + j) % l->size; struct raft_entry *entry = &(*entries)[j]; *entry = l->entries[k]; refsIncr(l, entry->term, index + j); } return 0; } /* Return true if the given batch is referenced by any entry currently in the * log. */ static bool isBatchReferenced(struct raft_log *l, const void *batch) { size_t i; /* Iterate through all live entries to see if there's one * belonging to the same batch. This is slightly inefficient but * this code path should be taken very rarely in practice. 
*/ for (i = 0; i < logNumEntries(l); i++) { struct raft_entry *entry = entryAt(l, i); if (entry->batch == batch) { return true; } } return false; }
void logRelease(struct raft_log *l, const raft_index index, struct raft_entry entries[], const unsigned n) { size_t i; void *batch = NULL; /* Last batch whose memory was freed */ assert(l != NULL); assert((entries == NULL && n == 0) || (entries != NULL && n > 0)); for (i = 0; i < n; i++) { struct raft_entry *entry = &entries[i]; bool unref; unref = refsDecr(l, entry->term, index + i); /* If there are no outstanding references to this entry, free * its payload if it's not part of a batch, or check if we can * free the batch itself. */ if (unref) { if (entries[i].batch == NULL) { if (entry->buf.base != NULL) { raft_free(entries[i].buf.base); } } else { if (entry->batch != batch) { if (!isBatchReferenced(l, entry->batch)) { batch = entry->batch; raft_free(batch); } } } } } if (entries != NULL) { raft_free(entries); } }
/* Clear the log if it became empty. */ static void clearIfEmpty(struct raft_log *l) { if (logNumEntries(l) > 0) { return; } raft_free(l->entries); l->entries = NULL; l->size = 0; l->front = 0; l->back = 0; }
/* Destroy an entry, possibly releasing the memory of its buffer. */ static void destroyEntry(struct raft_log *l, struct raft_entry *entry) { if (entry->batch == NULL) { if (entry->buf.base != NULL) { raft_free(entry->buf.base); } } else { if (!isBatchReferenced(l, entry->batch)) { raft_free(entry->batch); } } }
/* Core logic of @logTruncate and @logDiscard, removing all log entries from * @index onward. If @destroy is true, also destroy the removed entries. */ static void removeSuffix(struct raft_log *l, const raft_index index, bool destroy) { size_t i; size_t n; raft_index start = index; assert(l != NULL); assert(index > l->offset); assert(index <= logLastIndex(l)); /* Number of entries to delete */ n = (size_t)(logLastIndex(l) - start) + 1; for (i = 0; i < n; i++) { struct raft_entry *entry; bool unref; if (l->back == 0) { l->back = l->size - 1; } else { l->back--; } entry = &l->entries[l->back]; unref = refsDecr(l, entry->term, start + n - i - 1); if (unref && destroy) { destroyEntry(l, entry); } } clearIfEmpty(l); }
void logTruncate(struct raft_log *l, const raft_index index) { if (logNumEntries(l) == 0) { return; } removeSuffix(l, index, true); }
void logDiscard(struct raft_log *l, const raft_index index) { removeSuffix(l, index, false); }
/* Delete all entries up to the given index (included). */ static void removePrefix(struct raft_log *l, const raft_index index) { size_t i; size_t n; assert(l != NULL); assert(index > 0); assert(index <= logLastIndex(l)); /* Number of entries to delete */ n = (size_t)(index - indexAt(l, 0)) + 1; for (i = 0; i < n; i++) { struct raft_entry *entry; bool unref; entry = &l->entries[l->front]; if (l->front == l->size - 1) { l->front = 0; } else { l->front++; } l->offset++; unref = refsDecr(l, entry->term, l->offset); if (unref) { destroyEntry(l, entry); } } clearIfEmpty(l); }
void logSnapshot(struct raft_log *l, raft_index last_index, unsigned trailing) { raft_term last_term = logTermOf(l, last_index); /* We must have an entry at this index */ assert(last_term != 0); l->snapshot.last_index = last_index; l->snapshot.last_term = last_term; /* If we don't have at least @trailing entries preceding the given last * index, then there's nothing to remove and we're done.
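 *
 * (Worked example, numbers invented: logSnapshot(l, 100, 30) removes the
 * entries up to index 70 via removePrefix() and keeps 71..100 as the
 * trailing window; with trailing = 120 instead, 100 <= 120 holds and
 * nothing is removed.)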
*/ if (last_index <= trailing || locateEntry(l, last_index - trailing) == l->size) { return; } removePrefix(l, last_index - trailing); } void logRestore(struct raft_log *l, raft_index last_index, raft_term last_term) { size_t n = logNumEntries(l); assert(last_index > 0); assert(last_term > 0); if (n > 0) { logTruncate(l, logLastIndex(l) - n + 1); } l->snapshot.last_index = last_index; l->snapshot.last_term = last_term; l->offset = last_index; } dqlite-1.16.7/src/raft/log.h000066400000000000000000000160651465252713400155660ustar00rootroot00000000000000/* In-memory cache of the persistent raft log stored on disk. */ #ifndef RAFT_LOG_H_ #define RAFT_LOG_H_ #include "../raft.h" /* Initial size of the entry reference count hash table. */ #define LOG__REFS_INITIAL_SIZE 256 /** * Counter for outstanding references to a log entry. * * When an entry is first appended to the log, its refcount is set to one (the * log itself is the only one referencing the entry). Whenever an entry is * included in an I/O request (to write it to disk or to send it to other * servers) its refcount is increased by one. Whenever an entry gets deleted * from the log its refcount is decreased by one. Likewise, whenever an I/O * request is completed the refcount of the relevant entries is decreased by * one. When the refcount drops to zero the memory that its @buf attribute * points to gets released, or, if the @batch attribute is non-NULL, a check is * made to see if all other entries of the same batch also have a zero refcount, * and the memory that @batch points to gets released if that's the case. */ struct raft_entry_ref { raft_term term; /* Term of the entry being ref-counted. */ raft_index index; /* Index of the entry being ref-counted. */ unsigned short count; /* Number of references. */ /* The next two fields are copied from the corresponding fields of the * raft_entry pointed to by this reference. We store them here as well, * so that logReinstate can retrieve them when it finds a raft_entry_ref * with the same index and term as it was passed, and create a full * raft_entry using them. */ struct raft_buffer buf; void *batch; struct raft_entry_ref *next; /* Next item in the bucket (for collisions). */ }; /** * In-memory cache of the persistent raft log stored on disk. * * The raft log cache is implemented as a circular buffer of log entries, which * makes some frequent operations very efficient (e.g. deleting the first N * entries when snapshotting). */ struct raft_log { struct raft_entry *entries; /* Circular buffer of log entries. */ size_t size; /* Number of available slots in the buffer. */ size_t front, back; /* Indexes of used slots [front, back). */ raft_index offset; /* Index of first entry is offset+1. */ struct raft_entry_ref *refs; /* Log entries reference counts hash table. */ size_t refs_size; /* Size of the reference counts hash table. */ struct /* Information about last snapshot, or zero. */ { raft_index last_index; /* Snapshot replaces all entries up to here. */ raft_term last_term; /* Term of last index. */ } snapshot; }; /* Initialize an empty in-memory log of raft entries. */ struct raft_log *logInit(void); /* Release all memory used by the given log object. */ void logClose(struct raft_log *l); /* Called at startup when populating the log with entries loaded from disk. It * sets the starting state of the log. The start index must be lower or equal * than snapshot_index + 1. 
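 *
 * Typical startup sketch (values invented): after loading a snapshot whose
 * last included entry is (index 100, term 3), with the persisted entries
 * after it starting at index 101:
 *
 *   struct raft_log *l = logInit();
 *   logStart(l, 100, 3, 101);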
*/ void logStart(struct raft_log *l, raft_index snapshot_index, raft_term snapshot_term, raft_index start_index);
/* Get the number of entries the log currently contains. */ size_t logNumEntries(struct raft_log *l);
/* Get the index of the last entry in the log. Return #0 if the log is empty. */ raft_index logLastIndex(struct raft_log *l);
/* Get the term of the last entry in the log. Return #0 if the log is empty. */ raft_term logLastTerm(struct raft_log *l);
/* Get the term of the entry with the given index. Return #0 if @index is * greater than the last index of the log, or if it's lower than the oldest index * we know the term of (either because it's outstanding or because it's the last * entry in the most recent snapshot). */ raft_term logTermOf(struct raft_log *l, raft_index index);
/* Get the index of the last entry in the most recent snapshot. Return #0 if * there are no snapshots. */ raft_index logSnapshotIndex(struct raft_log *l);
/* Get the entry with the given index. The returned pointer remains valid only * as long as no API that might delete the entry with the given index is * invoked. Return #NULL if there is no such entry. */ const struct raft_entry *logGet(struct raft_log *l, const raft_index index);
/* Check whether the hash map is already tracking an entry with the given * @term and @index (that is not part of the "logical" log). If so, increment * the refcount of that entry and set @reinstated to true; otherwise, set * @reinstated to false. */ int logReinstate(struct raft_log *l, raft_term term, unsigned short type, bool *reinstated);
/* Append a new entry to the log. */ int logAppend(struct raft_log *l, raft_term term, unsigned short type, struct raft_buffer buf, struct raft_entry_local_data local_data, bool is_local, void *batch);
/* Convenience to append a series of #RAFT_COMMAND entries. */ int logAppendCommands(struct raft_log *l, const raft_term term, const struct raft_buffer bufs[], const struct raft_entry_local_data local_data[], const unsigned n);
/* Convenience to encode and append a single #RAFT_CHANGE entry. */ int logAppendConfiguration(struct raft_log *l, const raft_term term, const struct raft_configuration *configuration);
/* Acquire an array of entries from the given index onwards. The payload * memory referenced by the @buf attribute of the returned entries is guaranteed * to be valid until logRelease() is called. */ int logAcquire(struct raft_log *l, raft_index index, struct raft_entry *entries[], unsigned *n);
/* Release a previously acquired array of entries. */ void logRelease(struct raft_log *l, raft_index index, struct raft_entry entries[], unsigned n);
/* Delete all entries from the given index (included) onwards. If the log is * empty this is a no-op. If @index is lower than or equal to the index of the * first entry in the log, then the log will become empty. */ void logTruncate(struct raft_log *l, const raft_index index);
/* Discard all entries from the given index (included) onwards. This is exactly * the same as truncate, but the memory of the entries does not get * released. This is called as part of error handling, when reverting the effect * of previous logAppend calls. */ void logDiscard(struct raft_log *l, const raft_index index);
/* To be called when taking a new snapshot. The log must contain an entry at * last_index, which is the index of the last entry included in the * snapshot. The function will update the last snapshot information and delete * all entries up to last_index - trailing (included).
If the log contains no * entry at last_index - trailing, then no entry will be deleted. */ void logSnapshot(struct raft_log *l, raft_index last_index, unsigned trailing); /* To be called when installing a snapshot. * * The log can be in any state. All outstanding entries will be discarded, the * last index and last term of the most recent snapshot will be set to the given * values, and the offset adjusted accordingly. */ void logRestore(struct raft_log *l, raft_index last_index, raft_term last_term); #endif /* RAFT_LOG_H_ */ dqlite-1.16.7/src/raft/membership.c000066400000000000000000000166441465252713400171360ustar00rootroot00000000000000#include "membership.h" #include "../raft.h" #include "../tracing.h" #include "assert.h" #include "configuration.h" #include "err.h" #include "heap.h" #include "log.h" #include "progress.h" int membershipCanChangeConfiguration(struct raft *r) { int rv; if (r->state != RAFT_LEADER || r->transfer != NULL) { tracef("NOT LEADER"); rv = RAFT_NOTLEADER; goto err; } if (r->configuration_uncommitted_index != 0) { tracef("r->configuration_uncommitted_index %llu", r->configuration_uncommitted_index); rv = RAFT_CANTCHANGE; goto err; } if (r->leader_state.promotee_id != 0) { tracef("r->leader_state.promotee_id %llu", r->leader_state.promotee_id); rv = RAFT_CANTCHANGE; goto err; } /* In order to become leader at all we are supposed to have committed at * least the initial configuration at index 1. */ assert(r->configuration_committed_index > 0); /* The index of the last committed configuration can't be greater than * the last log index. */ assert(logLastIndex(r->log) >= r->configuration_committed_index); /* No catch-up round should be in progress. */ assert(r->leader_state.round_number == 0); assert(r->leader_state.round_index == 0); assert(r->leader_state.round_start == 0); return 0; err: assert(rv != 0); ErrMsgFromCode(r->errmsg, rv); return rv; } int membershipFetchLastCommittedConfiguration(struct raft *r, struct raft_configuration *conf) { const struct raft_entry *entry; int rv; /* Try to get the entry at r->configuration_committed_index from the * log. If the entry is not present in the log anymore because the log * was truncated after a snapshot, we can just use * configuration_last_snapshot, which we cached when we took or restored * the snapshot and is guaranteed to match the content that the entry at * r->configuration_committed_index had. */ entry = logGet(r->log, r->configuration_committed_index); if (entry != NULL) { rv = configurationDecode(&entry->buf, conf); } else { assert(r->configuration_last_snapshot.n > 0); rv = configurationCopy(&r->configuration_last_snapshot, conf); } if (rv != 0) { return rv; } return 0; } bool membershipUpdateCatchUpRound(struct raft *r) { unsigned server_index; raft_index match_index; raft_index last_index; raft_time now = r->io->time(r->io); raft_time round_duration; bool is_up_to_date; bool is_fast_enough; assert(r->state == RAFT_LEADER); assert(r->leader_state.promotee_id != 0); server_index = configurationIndexOf(&r->configuration, r->leader_state.promotee_id); assert(server_index < r->configuration.n); match_index = progressMatchIndex(r, server_index); /* If the server did not reach the target index for this round, it did * not catch up. 
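 *
 * (E.g., with round_index = 100, a promotee whose match_index is still
 * at 90 has not finished this catch-up round yet.)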
*/ if (match_index < r->leader_state.round_index) { tracef( "member (index: %u) not yet caught up match_index:%llu " "round_index:%llu", server_index, match_index, r->leader_state.round_index); return false; } last_index = logLastIndex(r->log); round_duration = now - r->leader_state.round_start; is_up_to_date = match_index == last_index; is_fast_enough = round_duration < r->election_timeout; tracef("member is_up_to_date:%d is_fast_enough:%d", is_up_to_date, is_fast_enough); /* If the server's log is fully up-to-date or the round that just * terminated was fast enough, then the server has caught up. */ if (is_up_to_date || is_fast_enough) { r->leader_state.round_number = 0; r->leader_state.round_index = 0; r->leader_state.round_start = 0; return true; } /* If we get here it means that this catch-up round is complete, but * there are more entries to replicate, or it was not fast enough. Let's * start a new round. */ r->leader_state.round_number++; r->leader_state.round_index = last_index; r->leader_state.round_start = now; return false; }
int membershipUncommittedChange(struct raft *r, const raft_index index, const struct raft_entry *entry) { struct raft_configuration configuration; int rv; char msg[128]; assert(r != NULL); assert(r->state == RAFT_FOLLOWER); assert(entry != NULL); assert(entry->type == RAFT_CHANGE); rv = configurationDecode(&entry->buf, &configuration); if (rv != 0) { tracef("failed to decode configuration at index:%llu", index); goto err; } /* ignore errors */ snprintf(msg, sizeof(msg), "uncommitted config change at index:%llu", index); configurationTrace(r, &configuration, msg); raft_configuration_close(&r->configuration); r->configuration = configuration; r->configuration_uncommitted_index = index; return 0; err: assert(rv != 0); return rv; }
int membershipRollback(struct raft *r) { int rv; assert(r != NULL); assert(r->state == RAFT_FOLLOWER); assert(r->configuration_uncommitted_index > 0); tracef("roll back membership"); /* Fetch the last committed configuration entry. */ assert(r->configuration_committed_index != 0); /* Replace the current configuration with the last committed one. */ configurationClose(&r->configuration); rv = membershipFetchLastCommittedConfiguration(r, &r->configuration); if (rv != 0) { return rv; } configurationTrace(r, &r->configuration, "roll back config"); r->configuration_uncommitted_index = 0; return 0; }
void membershipLeadershipTransferInit(struct raft *r, struct raft_transfer *req, raft_id id, raft_transfer_cb cb) { req->cb = cb; req->id = id; req->start = r->io->time(r->io); req->send.data = NULL; r->transfer = req; }
static void membershipLeadershipSendCb(struct raft_io_send *send, int status) { (void)status; RaftHeapFree(send); }
int membershipLeadershipTransferStart(struct raft *r) { const struct raft_server *server; struct raft_message message; struct raft_io_send *send; int rv; assert(r->transfer->send.data == NULL); server = configurationGet(&r->configuration, r->transfer->id); assert(server != NULL); if (server == NULL) { tracef("transferee server not found in configuration"); return -1; } /* Don't use the raft_io_send object embedded in struct raft_transfer, * since the two objects must have different lifetimes. For example * raft_io_send might live longer than raft_transfer, see #396. * * Ideally we should remove the embedded struct raft_io_send send field * from struct raft_transfer, and replace it with a raft_io_send *send * pointer, that we set to the raft_io_send object allocated in this * function.
This would break ABI compatibility though. */ send = RaftHeapMalloc(sizeof *send); if (send == NULL) { return RAFT_NOMEM; } message.type = RAFT_IO_TIMEOUT_NOW; message.server_id = server->id; message.server_address = server->address; message.timeout_now.term = r->current_term; message.timeout_now.last_log_index = logLastIndex(r->log); message.timeout_now.last_log_term = logLastTerm(r->log); /* Set the data attribute of the raft_io_send object embedded in * raft_transfer. This is needed because we historically used it as a * flag to indicate that a transfer request was sent. See the * replicationUpdate function. */ r->transfer->send.data = r; send->data = r; rv = r->io->send(r->io, send, &message, membershipLeadershipSendCb); if (rv != 0) { RaftHeapFree(send); ErrMsgTransferf(r->io->errmsg, r->errmsg, "send timeout now to %llu", server->id); return rv; } return 0; } void membershipLeadershipTransferClose(struct raft *r) { struct raft_transfer *req = r->transfer; raft_transfer_cb cb = req->cb; r->transfer = NULL; if (cb != NULL) { cb(req); } } dqlite-1.16.7/src/raft/membership.h000066400000000000000000000043501465252713400171320ustar00rootroot00000000000000/* Membership-related APIs. */ #ifndef MEMBERSHIP_H_ #define MEMBERSHIP_H_ #include "../raft.h" /* Helper returning an error if the configuration can't be changed, either * because this node is not the leader or because a configuration change is * already in progress. */ int membershipCanChangeConfiguration(struct raft *r); /* Populate the given configuration object with the most recent committed * configuration, the one contained in the entry at * r->configuration_committed_index. */ int membershipFetchLastCommittedConfiguration(struct raft *r, struct raft_configuration *conf); /* Update the information about the progress that the non-voting server * currently being promoted is making in catching with logs. * * Return false if the server being promoted did not yet catch-up with logs, and * true if it did. * * This function must be called only by leaders after a @raft_assign request * has been submitted. */ bool membershipUpdateCatchUpRound(struct raft *r); /* Update the local configuration replacing it with the content of the given * RAFT_CHANGE entry, which has just been received in as part of an * AppendEntries RPC request. The uncommitted configuration index will be * updated accordingly. * * It must be called only by followers. */ int membershipUncommittedChange(struct raft *r, const raft_index index, const struct raft_entry *entry); /* Rollback any promotion configuration change that was applied locally, but * failed to be committed. It must be called by followers after they receive an * AppendEntries RPC request that instructs them to evict the uncommitted entry * from their log. */ int membershipRollback(struct raft *r); /* Initialize the state of a leadership transfer request. */ void membershipLeadershipTransferInit(struct raft *r, struct raft_transfer *req, raft_id id, raft_transfer_cb cb); /* Start the leadership transfer by sending a TimeoutNow message to the target * server. */ int membershipLeadershipTransferStart(struct raft *r); /* Finish a leadership transfer (whether successful or not), resetting the * leadership transfer state and firing the user callback. 
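 *
 * A minimal sketch of the expected call sequence on the leader, using the
 * declarations above (error handling mostly elided for brevity):
 *
 *   membershipLeadershipTransferInit(r, req, id, cb);
 *   rv = membershipLeadershipTransferStart(r);
 *   if (rv != 0) {
 *           membershipLeadershipTransferClose(r);
 *   }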
*/ void membershipLeadershipTransferClose(struct raft *r); #endif /* MEMBERSHIP_H_ */ dqlite-1.16.7/src/raft/progress.c000066400000000000000000000207271465252713400166440ustar00rootroot00000000000000#include "progress.h" #include "../tracing.h" #include "assert.h" #include "configuration.h" #include "log.h" #ifndef max #define max(a, b) ((a) < (b) ? (b) : (a)) #endif #ifndef min #define min(a, b) ((a) < (b) ? (a) : (b)) #endif /* Initialize a single progress object. */ static void initProgress(struct raft_progress *p, raft_index last_index) { p->next_index = last_index + 1; p->match_index = 0; p->snapshot_index = 0; p->last_send = 0; p->snapshot_last_send = 0; p->recent_recv = false; p->state = PROGRESS__PROBE; p->features = 0; } int progressBuildArray(struct raft *r) { struct raft_progress *progress; unsigned i; raft_index last_index = logLastIndex(r->log); progress = raft_malloc(r->configuration.n * sizeof *progress); if (progress == NULL) { return RAFT_NOMEM; } for (i = 0; i < r->configuration.n; i++) { initProgress(&progress[i], last_index); if (r->configuration.servers[i].id == r->id) { progress[i].match_index = r->last_stored; } } r->leader_state.progress = progress; return 0; } int progressRebuildArray(struct raft *r, const struct raft_configuration *configuration) { raft_index last_index = logLastIndex(r->log); struct raft_progress *progress; unsigned i; unsigned j; raft_id id; progress = raft_malloc(configuration->n * sizeof *progress); if (progress == NULL) { return RAFT_NOMEM; } /* First copy the progress information for the servers that exists both * in the current and in the new configuration. */ for (i = 0; i < r->configuration.n; i++) { id = r->configuration.servers[i].id; j = configurationIndexOf(configuration, id); if (j == configuration->n) { /* This server is not present in the new configuration, * so we just skip it. */ continue; } progress[j] = r->leader_state.progress[i]; } /* Then reset the replication state for servers that are present in the * new configuration, but not in the current one. */ for (i = 0; i < configuration->n; i++) { id = configuration->servers[i].id; j = configurationIndexOf(&r->configuration, id); if (j < r->configuration.n) { /* This server is present both in the new and in the * current configuration, so we have already copied its * next/match index value in the loop above. */ continue; } assert(j == r->configuration.n); initProgress(&progress[i], last_index); } raft_free(r->leader_state.progress); r->leader_state.progress = progress; return 0; } bool progressIsUpToDate(struct raft *r, unsigned i) { struct raft_progress *p = &r->leader_state.progress[i]; raft_index last_index = logLastIndex(r->log); return p->next_index == last_index + 1; } bool progressPersistedIsUpToDate(struct raft *r, unsigned i) { struct raft_progress *p = &r->leader_state.progress[i]; raft_index last_index = logLastIndex(r->log); return p->match_index == last_index; } bool progressShouldReplicate(struct raft *r, unsigned i) { struct raft_progress *p = &r->leader_state.progress[i]; raft_time now = r->io->time(r->io); bool needs_heartbeat = now - p->last_send >= r->heartbeat_timeout; raft_index last_index = logLastIndex(r->log); bool result = false; /* We must be in a valid state. */ assert(p->state == PROGRESS__PROBE || p->state == PROGRESS__PIPELINE || p->state == PROGRESS__SNAPSHOT); /* The next index to send must be lower than the highest index in our * log. 
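 * (More precisely, it must be at most one past the last index:
 * next_index == last_index + 1 is the legitimate case of a fully
 * caught-up follower, which the assertion below allows.)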
*/ assert(p->next_index <= last_index + 1); switch (p->state) { case PROGRESS__SNAPSHOT: /* Snapshot timed out, move to PROBE */ if (now - p->snapshot_last_send >= r->install_snapshot_timeout) { tracef("snapshot timed out for index:%u", i); result = true; progressAbortSnapshot(r, i); } else { /* Enforce Leadership during follower Snapshot * installation */ result = needs_heartbeat; } break; case PROGRESS__PROBE: /* We send at most one message per heartbeat interval. */ result = needs_heartbeat; break; case PROGRESS__PIPELINE: /* In replication mode we send empty append entries * messages only if haven't sent anything in the last * heartbeat interval. */ result = !progressIsUpToDate(r, i) || needs_heartbeat; break; } return result; } raft_index progressNextIndex(struct raft *r, unsigned i) { return r->leader_state.progress[i].next_index; } raft_index progressMatchIndex(struct raft *r, unsigned i) { return r->leader_state.progress[i].match_index; } void progressUpdateLastSend(struct raft *r, unsigned i) { r->leader_state.progress[i].last_send = r->io->time(r->io); } void progressUpdateSnapshotLastSend(struct raft *r, unsigned i) { r->leader_state.progress[i].snapshot_last_send = r->io->time(r->io); } bool progressResetRecentRecv(struct raft *r, const unsigned i) { bool prev = r->leader_state.progress[i].recent_recv; r->leader_state.progress[i].recent_recv = false; return prev; } void progressMarkRecentRecv(struct raft *r, const unsigned i) { r->leader_state.progress[i].recent_recv = true; } inline void progressSetFeatures(struct raft *r, const unsigned i, raft_flags features) { r->leader_state.progress[i].features = features; } inline raft_flags progressGetFeatures(struct raft *r, const unsigned i) { return r->leader_state.progress[i].features; } bool progressGetRecentRecv(const struct raft *r, const unsigned i) { return r->leader_state.progress[i].recent_recv; } void progressToSnapshot(struct raft *r, unsigned i) { struct raft_progress *p = &r->leader_state.progress[i]; p->state = PROGRESS__SNAPSHOT; p->snapshot_index = logSnapshotIndex(r->log); } void progressAbortSnapshot(struct raft *r, const unsigned i) { struct raft_progress *p = &r->leader_state.progress[i]; p->snapshot_index = 0; p->state = PROGRESS__PROBE; } int progressState(struct raft *r, const unsigned i) { struct raft_progress *p = &r->leader_state.progress[i]; return p->state; } bool progressMaybeDecrement(struct raft *r, const unsigned i, raft_index rejected, raft_index last_index) { struct raft_progress *p = &r->leader_state.progress[i]; assert(p->state == PROGRESS__PROBE || p->state == PROGRESS__PIPELINE || p->state == PROGRESS__SNAPSHOT); if (p->state == PROGRESS__SNAPSHOT) { /* The rejection must be stale or spurious if the rejected index * does not match the last snapshot index. */ if (rejected != p->snapshot_index) { return false; } progressAbortSnapshot(r, i); return true; } if (p->state == PROGRESS__PIPELINE) { /* The rejection must be stale if the rejected index is smaller * than the matched one. */ if (rejected <= p->match_index) { tracef("match index is up to date -> ignore "); return false; } /* Directly decrease next to match + 1 */ p->next_index = min(rejected, p->match_index + 1); progressToProbe(r, i); return true; } /* The rejection must be stale or spurious if the rejected index does * not match the next index minus one. 
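 *
 * For example (illustrative numbers): in probe mode with next_index = 8,
 * the AppendEntries we sent carried prev_log_index = 7, so only a
 * rejection of 7 is current; a rejection of 5 can only come from an older
 * message and is ignored.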
*/ if (rejected != p->next_index - 1) { tracef( "rejected index %llu different from next index %lld -> " "ignore ", rejected, p->next_index); return false; } p->next_index = min(rejected, last_index + 1); p->next_index = max(p->next_index, 1); return true; } void progressOptimisticNextIndex(struct raft *r, unsigned i, raft_index next_index) { struct raft_progress *p = &r->leader_state.progress[i]; p->next_index = next_index; } bool progressMaybeUpdate(struct raft *r, unsigned i, raft_index last_index) { struct raft_progress *p = &r->leader_state.progress[i]; bool updated = false; if (p->match_index < last_index) { p->match_index = last_index; updated = true; } if (p->next_index < last_index + 1) { p->next_index = last_index + 1; } return updated; } void progressToProbe(struct raft *r, const unsigned i) { struct raft_progress *p = &r->leader_state.progress[i]; /* If the current state is snapshot, we know that the pending snapshot * has been sent to this peer successfully, so we probe from * snapshot_index + 1.*/ if (p->state == PROGRESS__SNAPSHOT) { assert(p->snapshot_index > 0); p->next_index = max(p->match_index + 1, p->snapshot_index); p->snapshot_index = 0; } else { p->next_index = p->match_index + 1; } p->state = PROGRESS__PROBE; } void progressToPipeline(struct raft *r, const unsigned i) { struct raft_progress *p = &r->leader_state.progress[i]; p->state = PROGRESS__PIPELINE; } bool progressSnapshotDone(struct raft *r, const unsigned i) { struct raft_progress *p = &r->leader_state.progress[i]; assert(p->state == PROGRESS__SNAPSHOT); return p->match_index >= p->snapshot_index; } #undef tracef dqlite-1.16.7/src/raft/progress.h000066400000000000000000000121101465252713400166340ustar00rootroot00000000000000/* Track replication progress on followers. */ #ifndef PROGRESS_H_ #define PROGRESS_H_ #include "../raft.h" /* Possible values for the state field of struct raft_progress. */ enum { PROGRESS__PROBE = 0, /* At most one AppendEntries per heartbeat interval */ PROGRESS__PIPELINE, /* Optimistically stream AppendEntries */ PROGRESS__SNAPSHOT /* Sending a snapshot */ }; /** * Used by leaders to keep track of replication progress for each server. */ struct raft_progress { unsigned short state; /* Probe, pipeline or snapshot. */ raft_index next_index; /* Next entry to send. */ raft_index match_index; /* Highest index reported as replicated. */ raft_index snapshot_index; /* Last index of most recent snapshot sent. */ raft_time last_send; /* Timestamp of last AppendEntries RPC. */ raft_time snapshot_last_send; /* Timestamp of last InstallSnaphot RPC. */ bool recent_recv; /* A msg was received within election timeout. */ raft_flags features; /* What the server is capable of. */ }; /* Create and initialize the array of progress objects used by the leader to * * track followers. The match index will be set to zero, and the next index to * the current last index plus 1. */ int progressBuildArray(struct raft *r); /* Re-build the progress array against a new configuration. * * Progress information for servers existing both in the new and in the current * configuration will remain unchanged. * * Progress information for servers existing only in the new configuration will * be initialized as in progressBuildArray().*/ int progressRebuildArray(struct raft *r, const struct raft_configuration *configuration); /* Whether the i'th server in the configuration has been sent all the log * entries. 
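 * (Equivalently, whether its next_index points one past our last log
 * index.)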
*/
bool progressIsUpToDate(struct raft *r, unsigned i);

/* Whether the persisted log of the i'th server in the configuration is
 * up-to-date with ours. */
bool progressPersistedIsUpToDate(struct raft *r, unsigned i);

/* Whether a new AppendEntries or InstallSnapshot message should be sent to the
 * i'th server at this time.
 *
 * See the docstring of replicationProgress() for details about how the
 * decision is taken. */
bool progressShouldReplicate(struct raft *r, unsigned i);

/* Return the index of the next entry that should be sent to the i'th server.
 */
raft_index progressNextIndex(struct raft *r, unsigned i);

/* Return the index of the most recent entry that the i'th server has reported
 * as replicated. */
raft_index progressMatchIndex(struct raft *r, unsigned i);

/* Update the last_send timestamp after an AppendEntries request has been
 * sent. */
void progressUpdateLastSend(struct raft *r, unsigned i);

/* Update the snapshot_last_send timestamp after an InstallSnapshot request has
 * been sent. */
void progressUpdateSnapshotLastSend(struct raft *r, unsigned i);

/* Reset to false the recent_recv flag of the server at the given index,
 * returning the previous value.
 *
 * To be called once every election_timeout milliseconds. */
bool progressResetRecentRecv(struct raft *r, unsigned i);

/* Set to true the recent_recv flag of the server at the given index.
 *
 * To be called whenever we receive an AppendEntries RPC result. */
void progressMarkRecentRecv(struct raft *r, unsigned i);

/* Return the value of the recent_recv flag. */
bool progressGetRecentRecv(const struct raft *r, unsigned i);

/* Convert the i'th server to snapshot mode. */
void progressToSnapshot(struct raft *r, unsigned i);

/* Convert to probe mode. */
void progressToProbe(struct raft *r, unsigned i);

/* Convert to pipeline mode. */
void progressToPipeline(struct raft *r, unsigned i);

/* Abort snapshot mode and switch back to probe.
 *
 * Called after sending the snapshot has failed or timed out. */
void progressAbortSnapshot(struct raft *r, unsigned i);

/* Return the progress mode code for the i'th server. */
int progressState(struct raft *r, unsigned i);

/* Optimistically update the next index of the given server.
 *
 * Called in pipeline mode after sending new entries. */
void progressOptimisticNextIndex(struct raft *r,
				 unsigned i,
				 raft_index next_index);

/* Return false if the given @last_index comes from an outdated message.
 * Otherwise update the progress and return true. To be called when receiving a
 * successful AppendEntries RPC response. */
bool progressMaybeUpdate(struct raft *r, unsigned i, raft_index last_index);

/* Return false if the given rejected index comes from an out of order
 * message. Otherwise decrease the progress next index to min(rejected,
 * last_index + 1) and return true. To be called when receiving an unsuccessful
 * AppendEntries RPC response. */
bool progressMaybeDecrement(struct raft *r,
			    unsigned i,
			    raft_index rejected,
			    raft_index last_index);

/* Return true if match_index is equal to or higher than the snapshot_index. */
bool progressSnapshotDone(struct raft *r, unsigned i);

/* Sets the feature flags of a node. */
void progressSetFeatures(struct raft *r, const unsigned i, raft_flags features);

/* Gets the feature flags of a node.
*/ raft_flags progressGetFeatures(struct raft *r, const unsigned i); #endif /* PROGRESS_H_ */ dqlite-1.16.7/src/raft/raft.c000066400000000000000000000150521465252713400157270ustar00rootroot00000000000000#include "../raft.h" #include #include #include "../tracing.h" #include "assert.h" #include "byte.h" #include "callbacks.h" #include "configuration.h" #include "convert.h" #include "election.h" #include "err.h" #include "flags.h" #include "heap.h" #include "log.h" #include "membership.h" #define DEFAULT_ELECTION_TIMEOUT 1000 /* One second */ #define DEFAULT_HEARTBEAT_TIMEOUT 100 /* One tenth of a second */ #define DEFAULT_INSTALL_SNAPSHOT_TIMEOUT 30000 /* 30 seconds */ #define DEFAULT_SNAPSHOT_THRESHOLD 1024 #define DEFAULT_SNAPSHOT_TRAILING 2048 /* Number of milliseconds after which a server promotion will be aborted if the * server hasn't caught up with the logs yet. */ #define DEFAULT_MAX_CATCH_UP_ROUNDS 10 #define DEFAULT_MAX_CATCH_UP_ROUND_DURATION (5 * 1000) int raft_version_number(void) { return RAFT_VERSION_NUMBER; } static int ioFsmVersionCheck(struct raft *r, struct raft_io *io, struct raft_fsm *fsm); int raft_init(struct raft *r, struct raft_io *io, struct raft_fsm *fsm, const raft_id id, const char *address) { int rv; assert(r != NULL); rv = ioFsmVersionCheck(r, io, fsm); if (rv != 0) { goto err; } r->io = io; r->io->data = r; r->fsm = fsm; r->tracer = NULL; r->id = id; /* Make a copy of the address */ r->address = RaftHeapMalloc(strlen(address) + 1); if (r->address == NULL) { rv = RAFT_NOMEM; goto err; } strcpy(r->address, address); r->current_term = 0; r->voted_for = 0; r->log = logInit(); if (r->log == NULL) { rv = RAFT_NOMEM; goto err_after_address_alloc; } raft_configuration_init(&r->configuration); raft_configuration_init(&r->configuration_last_snapshot); r->configuration_committed_index = 0; r->configuration_uncommitted_index = 0; r->election_timeout = DEFAULT_ELECTION_TIMEOUT; r->heartbeat_timeout = DEFAULT_HEARTBEAT_TIMEOUT; r->install_snapshot_timeout = DEFAULT_INSTALL_SNAPSHOT_TIMEOUT; r->commit_index = 0; r->last_applied = 0; r->last_stored = 0; r->state = RAFT_UNAVAILABLE; r->leader_state.voter_contacts = 0; rv = raftInitCallbacks(r); if (rv != 0) { goto err_after_address_alloc; } r->transfer = NULL; r->snapshot.pending.term = 0; r->snapshot.threshold = DEFAULT_SNAPSHOT_THRESHOLD; r->snapshot.trailing = DEFAULT_SNAPSHOT_TRAILING; r->snapshot.put.data = NULL; r->close_cb = NULL; memset(r->errmsg, 0, sizeof r->errmsg); r->pre_vote = false; r->max_catch_up_rounds = DEFAULT_MAX_CATCH_UP_ROUNDS; r->max_catch_up_round_duration = DEFAULT_MAX_CATCH_UP_ROUND_DURATION; rv = r->io->init(r->io, r->id, r->address); if (rv != 0) { ErrMsgTransfer(r->io->errmsg, r->errmsg, "io"); goto err_after_callbacks_alloc; } return 0; err_after_callbacks_alloc: raftDestroyCallbacks(r); err_after_address_alloc: RaftHeapFree(r->address); err: assert(rv != 0); return rv; } static void ioCloseCb(struct raft_io *io) { struct raft *r = io->data; tracef("io close cb"); raftDestroyCallbacks(r); raft_free(r->address); logClose(r->log); raft_configuration_close(&r->configuration); raft_configuration_close(&r->configuration_last_snapshot); if (r->close_cb != NULL) { r->close_cb(r); } } void raft_close(struct raft *r, void (*cb)(struct raft *r)) { assert(r->close_cb == NULL); if (r->state != RAFT_UNAVAILABLE) { convertToUnavailable(r); } r->close_cb = cb; r->io->close(r->io, ioCloseCb); } void raft_register_state_cb(struct raft *r, raft_state_cb cb) { struct raft_callbacks *cbs = raftGetCallbacks(r); 
assert(cbs != NULL); cbs->state_cb = cb; } void raft_set_election_timeout(struct raft *r, const unsigned msecs) { r->election_timeout = msecs; } void raft_set_heartbeat_timeout(struct raft *r, const unsigned msecs) { r->heartbeat_timeout = msecs; } void raft_set_install_snapshot_timeout(struct raft *r, const unsigned msecs) { r->install_snapshot_timeout = msecs; } void raft_set_snapshot_threshold(struct raft *r, unsigned n) { r->snapshot.threshold = n; } void raft_set_snapshot_trailing(struct raft *r, unsigned n) { r->snapshot.trailing = n; } void raft_set_max_catch_up_rounds(struct raft *r, unsigned n) { r->max_catch_up_rounds = n; } void raft_set_max_catch_up_round_duration(struct raft *r, unsigned msecs) { r->max_catch_up_round_duration = msecs; } void raft_set_pre_vote(struct raft *r, bool enabled) { r->pre_vote = enabled; } const char *raft_errmsg(struct raft *r) { return r->errmsg; } int raft_voter_contacts(struct raft *r) { int ret; if (r->state == RAFT_LEADER) { ret = (int)r->leader_state.voter_contacts; } else { ret = -1; } return ret; } int raft_bootstrap(struct raft *r, const struct raft_configuration *conf) { int rv; if (r->state != RAFT_UNAVAILABLE) { return RAFT_BUSY; } rv = r->io->bootstrap(r->io, conf); if (rv != 0) { return rv; } return 0; } int raft_recover(struct raft *r, const struct raft_configuration *conf) { int rv; if (r->state != RAFT_UNAVAILABLE) { return RAFT_BUSY; } rv = r->io->recover(r->io, conf); if (rv != 0) { return rv; } return 0; } const char *raft_strerror(int errnum) { return errCodeToString(errnum); } void raft_configuration_init(struct raft_configuration *c) { configurationInit(c); } void raft_configuration_close(struct raft_configuration *c) { configurationClose(c); } int raft_configuration_add(struct raft_configuration *c, const raft_id id, const char *address, const int role) { return configurationAdd(c, id, address, role); } int raft_configuration_encode(const struct raft_configuration *c, struct raft_buffer *buf) { return configurationEncode(c, buf); } unsigned long long raft_digest(const char *text, unsigned long long n) { struct byteSha1 sha1; uint8_t value[20]; uint64_t n64 = byteFlip64((uint64_t)n); uint64_t digest; byteSha1Init(&sha1); byteSha1Update(&sha1, (const uint8_t *)text, (uint32_t)strlen(text)); byteSha1Update(&sha1, (const uint8_t *)&n64, (uint32_t)(sizeof n64)); byteSha1Digest(&sha1, value); memcpy(&digest, value + (sizeof value - sizeof digest), sizeof digest); return byteFlip64(digest); } static int ioFsmVersionCheck(struct raft *r, struct raft_io *io, struct raft_fsm *fsm) { if (io->version == 0) { ErrMsgPrintf(r->errmsg, "io->version must be set"); return -1; } if (fsm->version == 0) { ErrMsgPrintf(r->errmsg, "fsm->version must be set"); return -1; } if ((fsm->version > 2 && fsm->snapshot_async != NULL) && ((io->version < 2) || (io->async_work == NULL))) { ErrMsgPrintf(r->errmsg, "async snapshot requires io->version > 1 and " "async_work method."); return -1; } return 0; } dqlite-1.16.7/src/raft/recv.c000066400000000000000000000125731465252713400157370ustar00rootroot00000000000000#include "recv.h" #include "../tracing.h" #include "assert.h" #include "convert.h" #include "entry.h" #include "heap.h" #include "log.h" #include "membership.h" #include "recv_append_entries.h" #include "recv_append_entries_result.h" #include "recv_install_snapshot.h" #include "recv_request_vote.h" #include "recv_request_vote_result.h" #include "recv_timeout_now.h" #include "string.h" /* Dispatch a single RPC message to the appropriate handler. 
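 *
 * Ownership of any payload attached to the message passes to the handler:
 * note how the entries batch of an AppendEntries is destroyed below when
 * its handler fails, and how the snapshot payload is freed when the
 * install is rejected with RAFT_BUSY.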
*/ static int recvMessage(struct raft *r, struct raft_message *message) { int rv = 0; switch (message->type) { case RAFT_IO_APPEND_ENTRIES: rv = recvAppendEntries(r, message->server_id, message->server_address, &message->append_entries); if (rv != 0) { entryBatchesDestroy( message->append_entries.entries, message->append_entries.n_entries); } break; case RAFT_IO_APPEND_ENTRIES_RESULT: rv = recvAppendEntriesResult( r, message->server_id, message->server_address, &message->append_entries_result); break; case RAFT_IO_REQUEST_VOTE: rv = recvRequestVote(r, message->server_id, message->server_address, &message->request_vote); break; case RAFT_IO_REQUEST_VOTE_RESULT: rv = recvRequestVoteResult( r, message->server_id, message->server_address, &message->request_vote_result); break; case RAFT_IO_INSTALL_SNAPSHOT: rv = recvInstallSnapshot(r, message->server_id, message->server_address, &message->install_snapshot); /* Already installing a snapshot, wait for it and ignore * this one */ if (rv == RAFT_BUSY) { raft_free(message->install_snapshot.data.base); raft_configuration_close( &message->install_snapshot.conf); rv = 0; } break; case RAFT_IO_TIMEOUT_NOW: rv = recvTimeoutNow(r, message->server_id, message->server_address, &message->timeout_now); break; default: tracef("received unknown message type (%d)", message->type); /* Drop message */ return 0; }; if (rv != 0 && rv != RAFT_NOCONNECTION) { tracef("recv: %d: %s", message->type, raft_strerror(rv)); return rv; } /* If there's a leadership transfer in progress, check if it has * completed. */ if (r->transfer != NULL) { if (r->follower_state.current_leader.id == r->transfer->id) { membershipLeadershipTransferClose(r); } } return 0; } void recvCb(struct raft_io *io, struct raft_message *message) { struct raft *r = io->data; int rv; if (r->state == RAFT_UNAVAILABLE) { switch (message->type) { case RAFT_IO_APPEND_ENTRIES: entryBatchesDestroy( message->append_entries.entries, message->append_entries.n_entries); break; case RAFT_IO_INSTALL_SNAPSHOT: raft_configuration_close( &message->install_snapshot.conf); raft_free(message->install_snapshot.data.base); break; } return; } rv = recvMessage(r, message); if (rv != 0) { convertToUnavailable(r); } } int recvBumpCurrentTerm(struct raft *r, raft_term term) { int rv; char msg[128]; assert(r != NULL); assert(term > r->current_term); sprintf(msg, "remote term %lld is higher than %lld -> bump local term", term, r->current_term); if (r->state != RAFT_FOLLOWER) { strcat(msg, " and step down"); } tracef("%s", msg); /* Save the new term to persistent store, resetting the vote. */ rv = r->io->set_term(r->io, term); if (rv != 0) { return rv; } /* Update our cache too. */ r->current_term = term; r->voted_for = 0; if (r->state != RAFT_FOLLOWER) { /* Also convert to follower. */ convertToFollower(r); } return 0; } void recvCheckMatchingTerms(struct raft *r, raft_term term, int *match) { if (term < r->current_term) { *match = -1; } else if (term > r->current_term) { *match = 1; } else { *match = 0; } } int recvEnsureMatchingTerms(struct raft *r, raft_term term, int *match) { int rv; assert(r != NULL); assert(match != NULL); recvCheckMatchingTerms(r, term, match); if (*match == -1) { tracef("old term - current_term:%llu other_term:%llu", r->current_term, term); return 0; } /* From Figure 3.1: * * Rules for Servers: All Servers: If RPC request or response contains * term T > currentTerm: set currentTerm = T, convert to follower. 
* * From state diagram in Figure 3.3: * * [leader]: discovers server with higher term -> [follower] * * From Section 3.3: * * If a candidate or leader discovers that its term is out of date, it * immediately reverts to follower state. */ if (*match == 1) { rv = recvBumpCurrentTerm(r, term); if (rv != 0) { tracef("recvBumpCurrentTerm failed %d", rv); return rv; } } return 0; } int recvUpdateLeader(struct raft *r, const raft_id id, const char *address) { assert(r->state == RAFT_FOLLOWER); r->follower_state.current_leader.id = id; /* If the address of the current leader is the same as the given one, * we're done. */ if (r->follower_state.current_leader.address != NULL && strcmp(address, r->follower_state.current_leader.address) == 0) { return 0; } if (r->follower_state.current_leader.address != NULL) { RaftHeapFree(r->follower_state.current_leader.address); } r->follower_state.current_leader.address = RaftHeapMalloc(strlen(address) + 1); if (r->follower_state.current_leader.address == NULL) { return RAFT_NOMEM; } strcpy(r->follower_state.current_leader.address, address); return 0; } #undef tracef dqlite-1.16.7/src/raft/recv.h000066400000000000000000000033331465252713400157360ustar00rootroot00000000000000/* Receive an RPC message. */ #ifndef RECV_H_ #define RECV_H_ #include "../raft.h" /* Callback to be passed to the raft_io implementation. It will be invoked upon * receiving an RPC message. */ void recvCb(struct raft_io *io, struct raft_message *message); /* Compare a request's term with the server's current term. * * The match output parameter will be set to 0 if the local term matches the * request's term, to -1 if the request's term is lower, and to 1 if the * request's term is higher. */ void recvCheckMatchingTerms(struct raft *r, raft_term term, int *match); /* Bump the current term and possibly step down from candidate or leader * state. */ int recvBumpCurrentTerm(struct raft *r, raft_term term); /* Common logic for RPC handlers, comparing the request's term with the server's * current term and possibly deciding to reject the request or step down from * candidate or leader. * * From Section 3.3: * * If a candidate or leader discovers that its term is out of date, it * immediately reverts to follower state. If a server receives a request with * a stale term number, it rejects the request. * * The match output parameter will be set to 0 if the local term matches the * request's term, to -1 if the request's term is lower, and to 1 if the * request's term was higher but we have successfully bumped the local one to * match it (and stepped down to follower in that case, if we were not * follower already). */ int recvEnsureMatchingTerms(struct raft *r, raft_term term, int *match); /* If different from the current one, update information about the current * leader. Must be called only by followers. 
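 *
 * The address string is copied into memory owned by struct raft, so the
 * caller keeps ownership of the buffer it passes in; RAFT_NOMEM is
 * returned if allocating the copy fails.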
*/ int recvUpdateLeader(struct raft *r, raft_id id, const char *address); #endif /* RECV_H_ */ dqlite-1.16.7/src/raft/recv_append_entries.c000066400000000000000000000110651465252713400210120ustar00rootroot00000000000000#include "recv_append_entries.h" #include "../tracing.h" #include "assert.h" #include "convert.h" #include "entry.h" #include "flags.h" #include "heap.h" #include "log.h" #include "recv.h" #include "replication.h" static void recvSendAppendEntriesResultCb(struct raft_io_send *req, int status) { (void)status; RaftHeapFree(req); } int recvAppendEntries(struct raft *r, raft_id id, const char *address, const struct raft_append_entries *args) { struct raft_io_send *req; struct raft_message message; struct raft_append_entries_result *result = &message.append_entries_result; int match; bool async; int rv; assert(r != NULL); assert(id > 0); assert(args != NULL); assert(address != NULL); tracef( "self:%llu from:%llu@%s leader_commit:%llu n_entries:%d " "prev_log_index:%llu prev_log_term:%llu, term:%llu", r->id, id, address, args->leader_commit, args->n_entries, args->prev_log_index, args->prev_log_term, args->term); result->rejected = args->prev_log_index; result->last_log_index = logLastIndex(r->log); result->version = RAFT_APPEND_ENTRIES_RESULT_VERSION; result->features = RAFT_DEFAULT_FEATURE_FLAGS; rv = recvEnsureMatchingTerms(r, args->term, &match); if (rv != 0) { return rv; } /* From Figure 3.1: * * AppendEntries RPC: Receiver implementation: Reply false if term < * currentTerm. */ if (match < 0) { tracef("local term is higher -> reject "); goto reply; } /* If we get here it means that the term in the request matches our * current term or it was higher and we have possibly stepped down, * because we discovered the current leader: * * From Figure 3.1: * * Rules for Servers: Candidates: if AppendEntries RPC is received * from new leader: convert to follower. * * From Section 3.4: * * While waiting for votes, a candidate may receive an AppendEntries * RPC from another server claiming to be leader. If the leader's term * (included in its RPC) is at least as large as the candidate's * current term, then the candidate recognizes the leader as legitimate * and returns to follower state. If the term in the RPC is smaller than * the candidate's current term, then the candidate rejects the RPC and * continues in candidate state. * * From state diagram in Figure 3.3: * * [candidate]: discovers current leader -> [follower] * * Note that it should not be possible for us to be in leader state, * because the leader that is sending us the request should have either * a lower term (and in that case we reject the request above), or a * higher term (and in that case we step down). It can't have the same * term because at most one leader can be elected at any given term. */ assert(r->state == RAFT_FOLLOWER || r->state == RAFT_CANDIDATE); assert(r->current_term == args->term); if (r->state == RAFT_CANDIDATE) { /* The current term and the peer one must match, otherwise we * would have either rejected the request or stepped down to * followers. */ assert(match == 0); tracef("discovered leader -> step down "); convertToFollower(r); } assert(r->state == RAFT_FOLLOWER); /* Update current leader because the term in this AppendEntries RPC is * up to date. */ rv = recvUpdateLeader(r, id, address); if (rv != 0) { return rv; } /* Reset the election timer. */ r->election_timer_start = r->io->time(r->io); /* If we are installing a snapshot, ignore these entries. TODO: we * should do something smarter, e.g. 
buffering the entries in the I/O * backend, which should be in charge of serializing everything. */ if (replicationInstallSnapshotBusy(r) && args->n_entries > 0) { tracef("ignoring AppendEntries RPC during snapshot install"); entryBatchesDestroy(args->entries, args->n_entries); return 0; } rv = replicationAppend(r, args, &result->rejected, &async); if (rv != 0) { return rv; } if (async) { return 0; } /* Echo back to the leader the point that we reached. */ result->last_log_index = r->last_stored; reply: result->term = r->current_term; /* Free the entries batch, if any. */ if (args->n_entries > 0 && args->entries[0].batch != NULL) { raft_free(args->entries[0].batch); } if (args->entries != NULL) { raft_free(args->entries); } message.type = RAFT_IO_APPEND_ENTRIES_RESULT; message.server_id = id; message.server_address = address; req = RaftHeapMalloc(sizeof *req); if (req == NULL) { return RAFT_NOMEM; } req->data = r; rv = r->io->send(r->io, req, &message, recvSendAppendEntriesResultCb); if (rv != 0) { raft_free(req); return rv; } return 0; } #undef tracef dqlite-1.16.7/src/raft/recv_append_entries.h000066400000000000000000000005451465252713400210200ustar00rootroot00000000000000/* Receive an AppendEntries message. */ #ifndef RECV_APPEND_ENTRIES_H_ #define RECV_APPEND_ENTRIES_H_ #include "../raft.h" /* Process an AppendEntries RPC from the given server. */ int recvAppendEntries(struct raft *r, raft_id id, const char *address, const struct raft_append_entries *args); #endif /* RECV_APPEND_ENTRIES_H_ */ dqlite-1.16.7/src/raft/recv_append_entries_result.c000066400000000000000000000031121465252713400224020ustar00rootroot00000000000000#include "recv_append_entries_result.h" #include "../tracing.h" #include "assert.h" #include "configuration.h" #include "recv.h" #include "replication.h" int recvAppendEntriesResult(struct raft *r, const raft_id id, const char *address, const struct raft_append_entries_result *result) { int match; const struct raft_server *server; int rv; assert(r != NULL); assert(id > 0); assert(address != NULL); assert(result != NULL); tracef( "self:%llu from:%llu@%s last_log_index:%llu rejected:%llu " "term:%llu", r->id, id, address, result->last_log_index, result->rejected, result->term); if (r->state != RAFT_LEADER) { tracef("local server is not leader -> ignore"); return 0; } rv = recvEnsureMatchingTerms(r, result->term, &match); if (rv != 0) { return rv; } if (match < 0) { tracef("local term is higher -> ignore "); return 0; } /* If we have stepped down, abort here. * * From Figure 3.1: * * [Rules for Servers] All Servers: If RPC request or response * contains term T > currentTerm: set currentTerm = T, convert to * follower. */ if (match > 0) { assert(r->state == RAFT_FOLLOWER); return 0; } assert(result->term == r->current_term); /* Ignore responses from servers that have been removed */ server = configurationGet(&r->configuration, id); if (server == NULL) { tracef("unknown server -> ignore"); return 0; } /* Update the progress of this server, possibly sending further entries. */ rv = replicationUpdate(r, server, result); if (rv != 0) { return rv; } return 0; } #undef tracef dqlite-1.16.7/src/raft/recv_append_entries_result.h000066400000000000000000000006241465252713400224140ustar00rootroot00000000000000/* Receive an AppendEntries result message. */ #ifndef RECV_APPEND_ENTRIES_RESULT_H_ #define RECV_APPEND_ENTRIES_RESULT_H_ #include "../raft.h" /* Process an AppendEntries RPC result from the given server. 
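 *
 * Results are ignored when this server is no longer the leader, when the
 * result carries a stale term, or when the sender has been removed from
 * the current configuration.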
*/
int recvAppendEntriesResult(struct raft *r,
			    raft_id id,
			    const char *address,
			    const struct raft_append_entries_result *result);

#endif /* RECV_APPEND_ENTRIES_RESULT_H_ */
dqlite-1.16.7/src/raft/recv_install_snapshot.c000066400000000000000000000637101465252713400214030ustar00rootroot00000000000000#include
#include
#include

#include "../tracing.h"
#include "assert.h"
#include "convert.h"
#include "flags.h"
#include "log.h"
#include "recv.h"
#include "replication.h"

#include "../lib/sm.h"
#include "../raft.h"
#include "../raft/recv_install_snapshot.h"
#include "../utils.h"

/**
 * =Overview
 *
 * This detailed level design is based on PL018 and describes
 * significant implementation details of the data structures and RPCs
 * introduced in it; it provides a model of operation and failure handling
 * based on the Leader's and Follower's states.
 *
 * =Data structures
 *
 * Among other structures we need to introduce a (persistent) container
 * `HT` to efficiently store and map checksums to their page numbers on both
 * the leader's and follower's side. HT is implemented on top of an sqlite3
 * database with unix VFS. Every database corresponds to a raft-related
 * database and maintains the following schema:
 *
 * CREATE TABLE "map" ("checksum" INTEGER NOT NULL, "pageno" INTEGER NOT NULL UNIQUE)
 * CREATE INDEX map_idx on map(checksum);
 *
 * Each database stores a mapping from checksum to page number. This
 * provides an efficient way to insert and look up records
 * corresponding to the checksums and page numbers.
 */

/**
 * =Operation
 *
 * 0. The Leader creates one state machine per Follower to keep track of their
 * states and moves it to the F_ONLINE state. The Follower creates a state
 * machine to keep track of its own state and moves it to the NORMAL state.
 *
 * 1. The Leader learns the Follower's follower.lastLogIndex while receiving
 * replies to AppendEntries() RPCs, fails to find follower.lastLogIndex in its
 * RAFT log or tries and fails to construct an AppendEntries() message because
 * the WAL that contained some necessary frames has been rotated out, and
 * understands that the snapshot installation procedure is required.
 *
 * The Leader calls leader_tick() passing the struct raft_message as a
 * parameter, which moves the state machine from F_ONLINE to the F_NEEDS_SNAP
 * state.
 *
 * 2. The Leader triggers the creation of its HT and initiates the snapshot
 * installation by sending an InstallSnapshot() message as soon as the HT is
 * created.
 *
 * 3. Upon receiving this message on the Follower's side, the Follower calls
 * follower_tick() passing the struct raft_message as a parameter, which
 * triggers the creation of the HT on the follower side. Once the HT is created
 * the follower moves to SIGS_CALC_STARTED and triggers a background job to
 * calculate the checksums of its pages and insert them into the HT.
 *
 * 4. The Leader probes the follower by sending Signature(calculated?) messages
 * and the Follower replies with either SignatureResult(calculated=false) if it
 * is still calculating the checksums or SignatureResult(calculated=true) if it
 * has finished. Once the calculation finishes, the Follower moves into
 * SIG_RECEIVING and the Leader moves into REQ_SIG_LOOP.
 *
 * 5. The Leader sends Signature() messages to the Follower containing the page
 * range for which we want to get the checksums.
 *
 * The Follower sends the requested checksums in a SignatureResult() message
 * back to the Leader and the leader puts the incoming payloads of the
 * Signature() messages into the HT.
 *
 * 6.
When the follower sends the checksum of its highest numbered page to the * Leader, it sends the SignatureResult() message using the done=true flag, * upon receiving it the Leader moves into READ_PAGES_LOOP state and the * Follower moves into CHUNK_RECEIVING. * * 7. In READ_PAGES_LOOP state, the Leader starts iterating over * the local persistent state, and calculates the checksum for each page the * state has. Then, it tries to find the checksum it calculated in HT. Based on * the result of this calculation, the Leader sends CP() or MV() to the * Follower. * * The Follower receives the message and persists the page using a background * job. Once the background job is finished, the Follower replies with * CPResult() or MVResult(). * * 8. When the iteration has finished the Leader sends * InstallShapshot(..., done=true) message to the Follower. It moves the * Follower back to NORMAL state and the state machine corresponding to the * Follower on the Leader is moved to SNAPSHOT_DONE state. * * 9. The Leader sends AppendEntries() RPC to the Follower and restarts the * algorithm from (1). The Leader's state machine is being moved to * FOLLOWER_ONLINE state. * * =Failure model * * ==Unavailability of the Leader and Follower. * * To handle use-cases when any party of the communication becomes * unavailable for a while without crash the following assumtions are * made: * * - Signature() or InstallSnapshot(MV/CP) messages are idempotent and * can be applied to the persistent state many times resulting the * same transition. * * - Each message with data chuncks has an information about the * "chunk index". Chunk indexes come in monotonically increasing * order. * * - Each reply message acknowledges that the data received (or * ignored) by sending `result` field back to the counter part along * with last known chunk index as a confirmation that the receiver * "knows everything up to the given chunck index". * * - If a party notices that last known chunk index sent back to it * doesn't match it's own, the communication get's restarted from * the lowest known index. * * If a reply is not received the Leader will eventually timeout and retry * sending the same message. * * ==Crashes of the Leader and Follower. * * Crashes of the Leader are handled by Raft when a new leader is elected * and the snapshot process is restarted. * * If the Follower receives an message which is not expected in the Follower's * current state, the Follower will reply using the message's result RPC * setting the unexpected=true flag. This response suggests the Leader to * restart the snapshot installation procedure. * * In particular, if the follower crashes it will restart its state machine to * the NORMAL state and reply using the unexpected=true flag for any messages * not expected in the NORMAL state, suggesting the Leader to restart the * procedure. * * =State model * * Definitions: * * Rf -- raft index sent in AppendEntriesResult() from Follower to Leader * Tf -- Follower's term sent in AppendEntriesResult() from Follower to Leader * * Tl -- Leader's term * Rl -- raft index of the Leader * * Leader's state machine: * * +-----------------------------+ * | | AppendEntriesResult() received * | *Result(unexpected=true) | raft_log.find(Rf) == "FOUND" * | received V +------------+ * | +-------------> F_ONLINE <------------+ * | | | * | | | AppendEntriesResult() received * | | | Rf << Rl && raft_log.find(Rf) == "ENOENTRY" * | | V Trigger background job. 
* | +--------------- HT_WAIT * | | V HT creation finished, * | +------------- F_NEEDS_SNAP* * | | | InstallSnapshot() sent, * | | V InstallSnapshotResult() received. * | +----------- CHECK_F_HAS_SIGS* <-----------------------+ SignatureResult() had * | | | Signature(calculated?) sent, | calculated=false and * | | V SignatureResult() received. | timeout reached. * | +------------- WAIT_SIGS -----------------------------+ * | | V SignatureResult() had calculated=true. * | +------------- REQ_SIG_LOOP* <-------------------------+ * | | | Signature() sent, | Signature persisted in HT, * | | V SignatureResult() received. | there are some pending * | +------------- RECV_SIG_PART | signatures. * | | V Background job triggered. | * | +---------- PRESISTED_SIG_PART ------------------------+ * | | | Signature persisted in HT, * | | V all signatures have been persisted. * | +----------- READ_PAGES_LOOP <-------------------------+ * | | V Background job triggered. | There are pending pages to * | +-------------- PAGE_READ* | be sent. * | | | Page read from disk, | * | | V CP()/MV() sent. | * | +-------------- PAGE_SENT -----------------------------+ * | | V All pages sent and acked. * | +-------------- SNAP_DONE * | | | InstallSnapshot(done=true) sent, * | | V and reply received. * | +---------------- FINAL * | | * +-----------------------------+ * * Note all states marked with (*) have an extra transition not represented in * the diagram above. When the leader sends a message there is always a timeout * sheduled. If the reply is not received and the timeout expires, we will stay * in the same state and re-send the message. * * Follower's state machine: * * +------+ (%) * +-------------------> NORMAL <----+ * | +-----------> | * | | | InstallSnapshot() received. * | | V * | +--------- HT_CREATE * | | V Trigger background job. * | +---------- HT_WAIT * | | | Background job finishes, * | | | InstallSnapshotResult() sent. * | | V * | +------ SIGS_CALC_STARTED * | | V Trigger background job. * | +------ SIGS_CALC_LOOP <--------------------------+ * | | V Signature(calculated?) received. | SignatureResult(calculated=false) sent. * | +--- SIGS_CALC_MSG_RECEIVED ----------------------+ * | | | Signatures for all db pages have been calculated. * | | V SignatureResult(calculated=true) sent. * | | SIGS_CALC_DONE * | | V * | +------- SIG_RECEIVING <--------------------------+ * | | V Signature() received. | * | +------- SIG_PROCESSED | * | | V Background job triggered. | Signature() had done=false, * | +--------- SIG_READ | SignatureResult() sent. * | | V Checksum is read from HT. | * | +--------- SIG_REPLIED ---------------------------+ * | | | Signature() had done=true, * | | V SignatureResult() sent. * | +------- CHUNK_RECEIVING <------------------------+ * | | V CP()/MV() received. | * | +------- CHUNK_PROCESSED | * | | V Background job triggered. | * | +------- CHUNK_APPLIED | * | | V Chunk has been written to disk. | * | +------- CHUNK_REPLIED ---------------------------+ * | (@ || %) | CP()/MV() had done=true. * | V CPResult()/MVResult() sent. * | FINAL * | | * +-----------------------+ * * (@) -- AppendEntries() received && Tf < Tl * (%) -- Signature()/CP()/MV() received and in the current state receving a * message of such type is unexpected. *Result(unexpected=true) sent. */ /* TODO this uses several GNU extensions, do we use it? 
#define RC(rc) ({ \ typeof(rc) __rc = (rc); \ printf("< rc=%d\n", __rc); \ __rc; \ }) */ enum rpc_state { RPC_INIT, RPC_FILLED, RPC_SENT, RPC_TIMEDOUT, RPC_REPLIED, RPC_ERROR, RPC_END, RPC_NR, }; /* clang-format off */ static const struct sm_conf rpc_sm_conf[RPC_NR] = { [RPC_INIT] = { .flags = SM_INITIAL | SM_FINAL, .name = "init", .allowed = BITS(RPC_FILLED) | BITS(RPC_ERROR), }, [RPC_FILLED] = { .name = "filled", .allowed = BITS(RPC_SENT) | BITS(RPC_ERROR), }, [RPC_SENT] = { .name = "sent", .allowed = BITS(RPC_TIMEDOUT) | BITS(RPC_REPLIED) | BITS(RPC_ERROR) | BITS(RPC_END), }, [RPC_TIMEDOUT] = { .name = "timedout", .allowed = BITS(RPC_INIT), }, [RPC_REPLIED] = { .name = "replied", .allowed = BITS(RPC_INIT) | BITS(RPC_END), }, [RPC_ERROR] = { .name = "error", .allowed = BITS(RPC_INIT), .flags = SM_FINAL, }, [RPC_END] = { .name = "end", .flags = SM_FINAL, }, }; /* clang-format on */ enum work_state { WORK_INIT, WORK_DONE, WORK_ERROR, WORK_NR, }; static const struct sm_conf work_sm_conf[WORK_NR] = { [WORK_INIT] = { .flags = SM_INITIAL | SM_FINAL, .name = "w_init", .allowed = BITS(WORK_DONE) | BITS(WORK_ERROR), }, [WORK_DONE] = { .flags = SM_FINAL, .name = "w_done", }, [WORK_ERROR] = { .flags = SM_FINAL, .name = "w_error", }, }; enum to_state { TO_INIT, TO_STARTED, TO_EXPIRED, TO_CANCELED, TO_NR, }; /* clang-format off */ static const struct sm_conf to_sm_conf[TO_NR] = { [TO_INIT] = { .flags = SM_INITIAL | SM_FINAL, .name = "init", .allowed = BITS(TO_STARTED), }, [TO_STARTED] = { .flags = SM_FINAL, .name = "started", .allowed = BITS(TO_EXPIRED) | BITS(TO_CANCELED), }, [TO_EXPIRED] = { .flags = SM_FINAL, .name = "expired", }, [TO_CANCELED] = { .flags = SM_FINAL, .name = "canceled", }, }; /* clang-format on */ #define M_MSG_SENT ((const struct raft_message *) 3) #define M_TIMEOUT ((const struct raft_message *) 2) #define M_WORK_DONE ((const struct raft_message *) 1) static bool is_main_thread(void) { // TODO: thread local storage. 
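	/* A possible shape for this check, sketched under the assumption
	 * that the event-loop thread records its identity at startup
	 * (loop_thread_id is a hypothetical pthread_t initialized there):
	 *
	 *   return pthread_equal(pthread_self(), loop_thread_id);
	 */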
return true; } static bool work_sm_invariant(const struct sm *sm, int prev_state) { (void)sm; (void)prev_state; return true; } bool leader_sm_invariant(const struct sm *sm, int prev_state) { (void)sm; (void)prev_state; return true; } bool follower_sm_invariant(const struct sm *sm, int prev_state) { (void)sm; (void)prev_state; return true; } static bool rpc_sm_invariant(const struct sm *sm, int prev_state) { (void)sm; (void)prev_state; return true; } static bool to_sm_invariant(const struct sm *sm, int prev_state) { (void)sm; (void)prev_state; return true; } static void leader_work_done(struct work *w) { struct leader *leader = CONTAINER_OF(w, struct leader, work); sm_move(&w->sm, WORK_DONE); leader_tick(leader, M_WORK_DONE); } static void follower_work_done(struct work *w) { struct follower *follower = CONTAINER_OF(w, struct follower, work); sm_move(&w->sm, WORK_DONE); follower_tick(follower, M_WORK_DONE); } static void rpc_to_cb(uv_timer_t *handle) { struct timeout *to = CONTAINER_OF(handle, struct timeout, handle); struct rpc *rpc = CONTAINER_OF(to, struct rpc, timeout); struct leader *leader = CONTAINER_OF(rpc, struct leader, rpc); sm_move(&to->sm, TO_EXPIRED); sm_move(&rpc->sm, RPC_TIMEDOUT); leader_tick(leader, M_TIMEOUT); } static void leader_to_cb(uv_timer_t *handle) { struct timeout *to = CONTAINER_OF(handle, struct timeout, handle); struct leader *leader = CONTAINER_OF(to, struct leader, timeout); sm_move(&to->sm, TO_EXPIRED); leader_tick(leader, M_TIMEOUT); } static void leader_to_start(struct leader *leader, struct timeout *to, unsigned delay, to_cb_op to_cb) { leader->ops->to_init(to); sm_init(&to->sm, to_sm_invariant, NULL, to_sm_conf, "to", TO_INIT); leader->ops->to_start(to, delay, to_cb); sm_relate(&leader->sm, &to->sm); sm_move(&to->sm, TO_STARTED); } static void leader_to_cancel(struct leader *leader, struct timeout *to) { leader->ops->to_stop(to); sm_move(&to->sm, TO_CANCELED); } static void leader_sent_cb(struct sender *s, int rc) { struct rpc *rpc = CONTAINER_OF(s, struct rpc, sender); struct leader *leader = CONTAINER_OF(rpc, struct leader, rpc); if (UNLIKELY(rc != 0)) { sm_move(&rpc->sm, RPC_ERROR); return; } leader_tick(leader, M_MSG_SENT); } static void follower_sent_cb(struct sender *s, int rc) { struct rpc *rpc = CONTAINER_OF(s, struct rpc, sender); struct follower *follower = CONTAINER_OF(rpc, struct follower, rpc); if (UNLIKELY(rc != 0)) { sm_move(&rpc->sm, RPC_ERROR); return; } follower_tick(follower, M_MSG_SENT); } static bool is_a_trigger_leader(const struct leader *leader, const struct raft_message *incoming) { (void)leader; (void)incoming; return true; } static bool is_a_trigger_follower(const struct follower *follower, const struct raft_message *incoming) { switch (sm_state(&follower->sm)) { case FS_SIGS_CALC_LOOP: return incoming != M_WORK_DONE; case FS_SIG_PROCESSED: case FS_CHUNCK_PROCESSED: return incoming == M_WORK_DONE; } return true; } static bool is_a_duplicate(const void *state, const struct raft_message *incoming) { (void)state; (void)incoming; return false; } static void work_init(struct work *w) { sm_init(&w->sm, work_sm_invariant, NULL, work_sm_conf, "work", WORK_INIT); } static void rpc_init(struct rpc *rpc) { sm_init(&rpc->sm, rpc_sm_invariant, NULL, rpc_sm_conf, "rpc", RPC_INIT); } static void rpc_fini(struct rpc *rpc) { sm_move(&rpc->sm, RPC_END); } static void work_fill_leader(struct leader *leader) { leader->work_cb = leader->ops->ht_create; work_init(&leader->work); sm_relate(&leader->sm, &leader->work.sm); } static void 
work_fill_follower(struct follower *follower) { switch (sm_state(&follower->sm)) { case FS_HT_CREATE: follower->work_cb = follower->ops->ht_create; break; case FS_SIGS_CALC_STARTED: follower->work_cb = follower->ops->fill_ht; break; case FS_SIG_RECEIVING: follower->work_cb = follower->ops->read_sig; break; case FS_CHUNCK_RECEIVING: follower->work_cb = follower->ops->write_chunk; break; } work_init(&follower->work); sm_relate(&follower->sm, &follower->work.sm); } static void rpc_fill_leader(struct leader *leader) { rpc_init(&leader->rpc); sm_relate(&leader->sm, &leader->rpc.sm); sm_move(&leader->rpc.sm, RPC_FILLED); } static void rpc_fill_follower(struct follower *follower) { rpc_init(&follower->rpc); sm_relate(&follower->sm, &follower->rpc.sm); sm_move(&follower->rpc.sm, RPC_FILLED); } static int rpc_send(struct rpc *rpc, sender_send_op op, sender_cb_op sent_cb) { int rc = op(&rpc->sender, &rpc->message, sent_cb); return rc; } static void follower_rpc_tick(struct rpc *rpc) { switch(sm_state(&rpc->sm)) { case RPC_INIT: break; case RPC_FILLED: sm_move(&rpc->sm, RPC_SENT); break; case RPC_SENT: case RPC_TIMEDOUT: case RPC_REPLIED: case RPC_ERROR: case RPC_END: default: break; } } static void leader_rpc_tick(struct rpc *rpc) { switch(sm_state(&rpc->sm)) { case RPC_INIT: break; case RPC_FILLED: sm_move(&rpc->sm, RPC_SENT); break; case RPC_SENT: sm_move(&rpc->sm, RPC_REPLIED); break; case RPC_TIMEDOUT: case RPC_REPLIED: case RPC_ERROR: case RPC_END: default: break; } } static void leader_reset(struct leader *leader) { (void)leader; } static bool is_an_unexpected_trigger(const struct leader *leader, const struct raft_message *msg) { (void)leader; if (msg == M_MSG_SENT || msg == M_TIMEOUT || msg == M_WORK_DONE) { return false; } enum raft_result res = RAFT_RESULT_UNEXPECTED; switch (msg->type) { case RAFT_IO_APPEND_ENTRIES: res = RAFT_RESULT_OK; break; case RAFT_IO_INSTALL_SNAPSHOT: res = msg->install_snapshot.result; break; case RAFT_IO_INSTALL_SNAPSHOT_RESULT: res = msg->install_snapshot_result.result; break; case RAFT_IO_INSTALL_SNAPSHOT_CP: res = msg->install_snapshot_cp.result; break; case RAFT_IO_INSTALL_SNAPSHOT_CP_RESULT: res = msg->install_snapshot_cp_result.result; break; case RAFT_IO_INSTALL_SNAPSHOT_MV: res = msg->install_snapshot_mv.result; break; case RAFT_IO_INSTALL_SNAPSHOT_MV_RESULT: res = msg->install_snapshot_mv_result.result; break; case RAFT_IO_SIGNATURE: res = msg->signature.result; break; case RAFT_IO_SIGNATURE_RESULT: res = msg->signature_result.result; break; } return res == RAFT_RESULT_UNEXPECTED; } static int follower_next_state(struct sm *sm) { struct follower *follower = CONTAINER_OF(sm, struct follower, sm); switch (sm_state(sm)) { case FS_SIGS_CALC_LOOP: return follower->sigs_calculated ? FS_SIGS_CALC_DONE : FS_SIGS_CALC_MSG_RECEIVED; case FS_SIGS_CALC_MSG_RECEIVED: return FS_SIGS_CALC_LOOP; case FS_SIG_REPLIED: return FS_CHUNCK_RECEIVING; case FS_FINAL: return FS_NORMAL; } return sm_state(sm) + 1; } static int leader_next_state(struct sm *sm) { struct leader *leader = CONTAINER_OF(sm, struct leader, sm); switch (sm_state(sm)) { case LS_WAIT_SIGS: return sm_state(sm) + (leader->sigs_calculated ? 
+1 : -1); case LS_FINAL: return LS_F_ONLINE; } return sm_state(sm) + 1; } __attribute__((unused)) void leader_tick(struct leader *leader, const struct raft_message *incoming) { (void)leader_sm_conf; (void)leader_sm_invariant; int rc; struct sm *sm = &leader->sm; const struct leader_ops *ops = leader->ops; PRE(is_main_thread()); if (!is_a_trigger_leader(leader, incoming) || is_a_duplicate(leader, incoming)) return; if (is_an_unexpected_trigger(leader, incoming)) { leader_reset(leader); return; } again: switch(sm_state(sm)) { case LS_F_ONLINE: case LS_RECV_SIG_PART: case LS_READ_PAGES_LOOP: work_fill_leader(leader); ops->work_queue(&leader->work, leader->work_cb, leader_work_done); sm_move(sm, leader_next_state(sm)); break; case LS_HT_WAIT: case LS_PAGE_SENT: case LS_PERSISTED_SIG_PART: sm_move(sm, leader_next_state(sm)); goto again; case LS_FINAL: sm_move(sm, leader_next_state(sm)); break; case LS_PAGE_READ: case LS_SNAP_DONE: case LS_F_NEEDS_SNAP: case LS_REQ_SIG_LOOP: case LS_CHECK_F_HAS_SIGS: leader_rpc_tick(&leader->rpc); switch (sm_state(&leader->rpc.sm)) { case RPC_SENT: leader_to_start(leader, &leader->rpc.timeout, 10000, rpc_to_cb); return; case RPC_REPLIED: leader_to_cancel(leader, &leader->rpc.timeout); rpc_fini(&leader->rpc); sm_move(sm, leader_next_state(sm)); goto again; } rpc_fill_leader(leader); rc = rpc_send(&leader->rpc, ops->sender_send, leader_sent_cb); if (rc != 0) { goto again; } break; case LS_WAIT_SIGS: if (leader_next_state(sm) > sm_state(sm)) { sm_move(sm, leader_next_state(sm)); goto again; } leader_to_start(leader, &leader->timeout, 10000, leader_to_cb); sm_move(sm, leader_next_state(sm)); break; default: IMPOSSIBLE(""); } } __attribute__((unused)) void follower_tick(struct follower *follower, const struct raft_message *incoming) { (void)follower_sm_conf; (void)follower_sm_invariant; int rc; struct sm *sm = &follower->sm; const struct follower_ops *ops = follower->ops; if (!is_a_trigger_follower(follower, incoming) || is_a_duplicate(follower, incoming)) return; PRE(is_main_thread()); again: switch (sm_state(&follower->sm)) { case FS_NORMAL: case FS_SIGS_CALC_LOOP: case FS_SIG_READ: case FS_CHUNCK_APPLIED: follower_rpc_tick(&follower->rpc); if (sm_state(&follower->rpc.sm) == RPC_SENT) { rpc_fini(&follower->rpc); sm_move(sm, follower_next_state(sm)); goto again; } rpc_fill_follower(follower); rc = rpc_send(&follower->rpc, ops->sender_send, follower_sent_cb); if (rc != 0) { goto again; } break; case FS_SIG_PROCESSED: case FS_CHUNCK_PROCESSED: case FS_CHUNCK_REPLIED: case FS_HT_WAIT: sm_move(sm, follower_next_state(sm)); goto again; case FS_HT_CREATE: case FS_SIGS_CALC_STARTED: case FS_SIG_RECEIVING: case FS_CHUNCK_RECEIVING: work_fill_follower(follower); ops->work_queue(&follower->work, follower->work_cb, follower_work_done); sm_move(sm, follower_next_state(sm)); break; case FS_SIG_REPLIED: case FS_SIGS_CALC_DONE: case FS_SIGS_CALC_MSG_RECEIVED: case FS_FINAL: sm_move(sm, follower_next_state(sm)); break; default: IMPOSSIBLE(""); } } static void installSnapshotSendCb(struct raft_io_send *req, int status) { (void)status; raft_free(req); } int recvInstallSnapshot(struct raft *r, const raft_id id, const char *address, struct raft_install_snapshot *args) { struct raft_io_send *req; struct raft_message message; struct raft_append_entries_result *result = &message.append_entries_result; int rv; int match; bool async; assert(address != NULL); tracef( "self:%llu from:%llu@%s conf_index:%llu last_index:%llu " "last_term:%llu " "term:%llu", r->id, id, address, args->conf_index, 
args->last_index, args->last_term, args->term); result->rejected = args->last_index; result->last_log_index = logLastIndex(r->log); result->version = RAFT_APPEND_ENTRIES_RESULT_VERSION; result->features = RAFT_DEFAULT_FEATURE_FLAGS; rv = recvEnsureMatchingTerms(r, args->term, &match); if (rv != 0) { return rv; } if (match < 0) { tracef("local term is higher -> reject "); goto reply; } /* TODO: this logic duplicates the one in the AppendEntries handler */ assert(r->state == RAFT_FOLLOWER || r->state == RAFT_CANDIDATE); assert(r->current_term == args->term); if (r->state == RAFT_CANDIDATE) { assert(match == 0); tracef("discovered leader -> step down "); convertToFollower(r); } rv = recvUpdateLeader(r, id, address); if (rv != 0) { return rv; } r->election_timer_start = r->io->time(r->io); rv = replicationInstallSnapshot(r, args, &result->rejected, &async); if (rv != 0) { tracef("replicationInstallSnapshot failed %d", rv); return rv; } if (async) { return 0; } if (result->rejected == 0) { /* Echo back to the leader the point that we reached. */ result->last_log_index = args->last_index; } reply: result->term = r->current_term; /* Free the snapshot data. */ raft_configuration_close(&args->conf); raft_free(args->data.base); message.type = RAFT_IO_APPEND_ENTRIES_RESULT; message.server_id = id; message.server_address = address; req = raft_malloc(sizeof *req); if (req == NULL) { return RAFT_NOMEM; } req->data = r; rv = r->io->send(r->io, req, &message, installSnapshotSendCb); if (rv != 0) { raft_free(req); return rv; } return 0; } #undef tracef dqlite-1.16.7/src/raft/recv_install_snapshot.h000066400000000000000000000153671465252713400214130ustar00rootroot00000000000000/* InstallSnapshot RPC handlers. */ #ifndef RECV_INSTALL_SNAPSHOT_H_ #define RECV_INSTALL_SNAPSHOT_H_ #include <uv.h> #include "../raft.h" struct work; struct sender; struct timeout; typedef void (*to_cb_op)(uv_timer_t *handle); typedef void (*work_op)(struct work *w); typedef void (*sender_cb_op)(struct sender *s, int rc); struct work { work_op work_cb; work_op after_cb; struct sm sm; }; struct sender { sender_cb_op cb; }; struct timeout { to_cb_op cb; struct sm sm; uv_timer_t handle; }; struct rpc { struct sm sm; struct sender sender; struct raft_message message; struct timeout timeout; }; typedef int (*sender_send_op)(struct sender *s, struct raft_message *payload, sender_cb_op cb); struct leader_ops { work_op ht_create; void (*to_init)(struct timeout *to); void (*to_stop)(struct timeout *to); void (*to_start)(struct timeout *to, unsigned delay, to_cb_op cb); sender_send_op sender_send; void (*work_queue)(struct work *w, work_op work, work_op after_cb); }; struct follower_ops { work_op ht_create; work_op fill_ht; work_op read_sig; work_op write_chunk; sender_send_op sender_send; void (*work_queue)(struct work *w, work_op work, work_op after_cb); }; struct leader { struct sm sm; struct rpc rpc; struct work work; work_op work_cb; struct timeout timeout; const struct leader_ops *ops; /* TODO dummy flags */ bool sigs_calculated; bool sigs_more; bool pages_more; }; struct follower { struct sm sm; struct rpc rpc; struct work work; work_op work_cb; const struct follower_ops *ops; /* TODO dummy flags */ bool sigs_calculated; }; void leader_tick(struct leader *leader, const struct raft_message *incoming); void follower_tick(struct follower *follower, const struct raft_message *incoming); /* TODO make all of these private and static once we can write tests without * depending on the states.
*/ bool leader_sm_invariant(const struct sm *sm, int prev_state); bool follower_sm_invariant(const struct sm *sm, int prev_state); enum leader_states { LS_F_ONLINE, LS_HT_WAIT, LS_F_NEEDS_SNAP, LS_CHECK_F_HAS_SIGS, LS_WAIT_SIGS, LS_REQ_SIG_LOOP, LS_RECV_SIG_PART, LS_PERSISTED_SIG_PART, LS_READ_PAGES_LOOP, LS_PAGE_READ, LS_PAGE_SENT, LS_SNAP_DONE, LS_FINAL, LS_NR, }; /* clang-format off */ static const struct sm_conf leader_sm_conf[LS_NR] = { [LS_F_ONLINE] = { .flags = SM_INITIAL | SM_FINAL, .name = "online", .allowed = BITS(LS_HT_WAIT) | BITS(LS_F_ONLINE), }, [LS_HT_WAIT] = { .name = "ht-wait", .allowed = BITS(LS_F_NEEDS_SNAP), }, [LS_F_NEEDS_SNAP] = { .name = "needs-snapshot", .allowed = BITS(LS_CHECK_F_HAS_SIGS) | BITS(LS_F_NEEDS_SNAP) | BITS(LS_F_ONLINE), }, [LS_CHECK_F_HAS_SIGS] = { .name = "check-f-has-sigs", .allowed = BITS(LS_CHECK_F_HAS_SIGS) | BITS(LS_WAIT_SIGS) | BITS(LS_F_ONLINE), }, [LS_WAIT_SIGS] = { .name = "wait-sigs", .allowed = BITS(LS_CHECK_F_HAS_SIGS) | BITS(LS_REQ_SIG_LOOP) | BITS(LS_F_ONLINE), }, [LS_REQ_SIG_LOOP] = { .name = "req-sig-loop", .allowed = BITS(LS_RECV_SIG_PART) | BITS(LS_F_ONLINE), }, [LS_RECV_SIG_PART] = { .name = "recv-sig", .allowed = BITS(LS_PERSISTED_SIG_PART) | BITS(LS_REQ_SIG_LOOP) | BITS(LS_F_ONLINE), }, [LS_PERSISTED_SIG_PART] = { .name = "pers-sig", .allowed = BITS(LS_READ_PAGES_LOOP) | BITS(LS_REQ_SIG_LOOP) | BITS(LS_F_ONLINE), }, [LS_READ_PAGES_LOOP] = { .name = "read-pages-loop", .allowed = BITS(LS_PAGE_READ) | BITS(LS_F_ONLINE), }, [LS_PAGE_READ] = { .name = "page-read", .allowed = BITS(LS_PAGE_SENT) | BITS(LS_F_ONLINE), }, [LS_PAGE_SENT] = { .name = "page-sent", .allowed = BITS(LS_READ_PAGES_LOOP) | BITS(LS_SNAP_DONE) | BITS(LS_F_ONLINE), }, [LS_SNAP_DONE] = { .name = "snap-done", .allowed = BITS(LS_SNAP_DONE) | BITS(LS_FINAL), }, [LS_FINAL] = { .name = "final", .allowed = BITS(LS_F_ONLINE), }, }; /* clang-format on */ enum follower_states { FS_NORMAL, FS_HT_CREATE, FS_HT_WAIT, FS_SIGS_CALC_STARTED, FS_SIGS_CALC_LOOP, FS_SIGS_CALC_MSG_RECEIVED, FS_SIGS_CALC_DONE, FS_SIG_RECEIVING, FS_SIG_PROCESSED, FS_SIG_READ, FS_SIG_REPLIED, FS_CHUNCK_RECEIVING, FS_CHUNCK_PROCESSED, FS_CHUNCK_APPLIED, FS_CHUNCK_REPLIED, FS_FINAL, FS_NR, }; /* clang-format off */ static const struct sm_conf follower_sm_conf[FS_NR] = { [FS_NORMAL] = { .flags = SM_INITIAL | SM_FINAL, .name = "normal", .allowed = BITS(FS_HT_CREATE) | BITS(FS_NORMAL), }, [FS_HT_CREATE] = { .name = "ht_create", .allowed = BITS(FS_HT_WAIT) | BITS(FS_NORMAL), }, [FS_HT_WAIT] = { .name = "ht_waiting", .allowed = BITS(FS_SIGS_CALC_STARTED) | BITS(FS_NORMAL), }, [FS_SIGS_CALC_STARTED] = { .name = "signatures_calc_started", .allowed = BITS(FS_SIGS_CALC_LOOP) | BITS(FS_NORMAL), }, [FS_SIGS_CALC_LOOP] = { .name = "signatures_calc_loop", .allowed = BITS(FS_SIGS_CALC_MSG_RECEIVED) | BITS(FS_SIGS_CALC_DONE) | BITS(FS_NORMAL), }, [FS_SIGS_CALC_MSG_RECEIVED] = { .name = "signatures_msg_received", .allowed = BITS(FS_SIGS_CALC_LOOP) | BITS(FS_NORMAL), }, [FS_SIGS_CALC_DONE] = { .name = "signatures_calc_done", .allowed = BITS(FS_SIG_RECEIVING) | BITS(FS_NORMAL), }, [FS_SIG_RECEIVING] = { .name = "signature_received", .allowed = BITS(FS_SIG_PROCESSED) | BITS(FS_NORMAL), }, [FS_SIG_PROCESSED] = { .name = "signature_processed", .allowed = BITS(FS_SIG_READ) | BITS(FS_NORMAL), }, [FS_SIG_READ] = { .name = "signature_read", .allowed = BITS(FS_SIG_REPLIED) | BITS(FS_NORMAL), }, [FS_SIG_REPLIED] = { .name = "signature_sent", .allowed = BITS(FS_CHUNCK_RECEIVING) | BITS(FS_SIG_RECEIVING) | BITS(FS_NORMAL), }, 
[FS_CHUNCK_RECEIVING] = { .name = "chunk_received", .allowed = BITS(FS_CHUNCK_PROCESSED) | BITS(FS_NORMAL), }, [FS_CHUNCK_PROCESSED] = { .name = "chunk_processed", .allowed = BITS(FS_CHUNCK_APPLIED) | BITS(FS_NORMAL), }, [FS_CHUNCK_APPLIED] = { .name = "chunk_applied", .allowed = BITS(FS_CHUNCK_REPLIED) | BITS(FS_NORMAL), }, [FS_CHUNCK_REPLIED] = { .name = "chunk_replied", .allowed = BITS(FS_CHUNCK_PROCESSED) | BITS(FS_FINAL) | BITS(FS_NORMAL), }, [FS_FINAL] = { .name = "final", .allowed = BITS(FS_NORMAL), }, }; /* clang-format on */ /* end of TODO make this private */ /* Process an InstallSnapshot RPC from the given server. */ int recvInstallSnapshot(struct raft *r, raft_id id, const char *address, struct raft_install_snapshot *args); #endif /* RECV_INSTALL_SNAPSHOT_H_ */ dqlite-1.16.7/src/raft/recv_request_vote.c000066400000000000000000000106071465252713400205400ustar00rootroot00000000000000#include "recv_request_vote.h" #include "../tracing.h" #include "assert.h" #include "election.h" #include "recv.h" #include "replication.h" static void requestVoteSendCb(struct raft_io_send *req, int status) { (void)status; raft_free(req); } int recvRequestVote(struct raft *r, const raft_id id, const char *address, const struct raft_request_vote *args) { struct raft_io_send *req; struct raft_message message; struct raft_request_vote_result *result = &message.request_vote_result; bool has_leader; int match; int rv; assert(r != NULL); assert(id > 0); assert(args != NULL); tracef( "self:%llu from:%llu@%s candidate_id:%llu disrupt_leader:%d " "last_log_index:%llu " "last_log_term:%llu pre_vote:%d term:%llu", r->id, id, address, args->candidate_id, args->disrupt_leader, args->last_log_index, args->last_log_term, args->pre_vote, args->term); result->vote_granted = false; result->pre_vote = args->pre_vote; result->version = RAFT_REQUEST_VOTE_RESULT_VERSION; /* Reject the request if we have a leader. * * From Section 4.2.3: * * [Removed] servers should not be able to disrupt a leader whose * cluster is receiving heartbeats. [...] If a server receives a * RequestVote request within the minimum election timeout of hearing * from a current leader, it does not update its term or grant its vote * * From Section 4.2.3: * * This change conflicts with the leadership transfer mechanism as * described in Chapter 3, in which a server legitimately starts an * election without waiting an election timeout. In that case, * RequestVote messages should be processed by other servers even when * they believe a current cluster leader exists. Those RequestVote * requests can include a special flag to indicate this behavior ("I * have permission to disrupt the leader - it told me to!"). */ has_leader = r->state == RAFT_LEADER || (r->state == RAFT_FOLLOWER && r->follower_state.current_leader.id != 0); if (has_leader && !args->disrupt_leader) { tracef("local server has a leader -> reject "); goto reply; } /* If this is a pre-vote request, don't actually increment our term or * persist the vote. */ if (args->pre_vote) { recvCheckMatchingTerms(r, args->term, &match); } else { rv = recvEnsureMatchingTerms(r, args->term, &match); if (rv != 0) { return rv; } } /* Reject the request if we are installing a snapshot. * * This condition should only be reachable if the disrupt_leader flag is * set, since otherwise we wouldn't have passed the have_leader check * above (follower state is not cleared while a snapshot is being * installed). 
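* * (For reference: replicationInstallSnapshotBusy() is defined in replication.c and reports true while a snapshot received from the leader is still being written to disk.)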
*/ if (replicationInstallSnapshotBusy(r)) { tracef("installing snapshot -> reject (disrupt_leader:%d)", (int)args->disrupt_leader); goto reply; } /* From Figure 3.1: * * RequestVote RPC: Receiver implementation: Reply false if * term < currentTerm. * */ if (match < 0) { tracef("local term is higher -> reject "); goto reply; } /* Unless this is a pre-vote request, at this point our term must be the * same as the request term (otherwise we would have rejected the * request or bumped our term). */ if (!args->pre_vote) { tracef("no pre_vote: current_term:%llu term:%llu", r->current_term, args->term); assert(r->current_term == args->term); } rv = electionVote(r, args, &result->vote_granted); if (rv != 0) { return rv; } reply: result->term = r->current_term; /* Nodes don't update their term when seeing a Pre-Vote RequestVote RPC. * To prevent the candidate from ignoring the response of this node if * it has a smaller term than the candidate, we include the term of the * request. The smaller term can occur if this node was partitioned from * the cluster and has reestablished connectivity. This prevents a * cluster deadlock when a majority of the nodes is online, but they * fail to establish quorum because the vote of a former partitioned * node with a smaller term is needed for majority.*/ if (args->pre_vote) { result->term = args->term; } message.type = RAFT_IO_REQUEST_VOTE_RESULT; message.server_id = id; message.server_address = address; req = raft_malloc(sizeof *req); if (req == NULL) { return RAFT_NOMEM; } req->data = r; rv = r->io->send(r->io, req, &message, requestVoteSendCb); if (rv != 0) { raft_free(req); return rv; } return 0; } #undef tracef dqlite-1.16.7/src/raft/recv_request_vote.h000066400000000000000000000005111465252713400205360ustar00rootroot00000000000000/* RequestVote RPC handler. */ #ifndef RECV_REQUEST_VOTE_H_ #define RECV_REQUEST_VOTE_H_ #include "../raft.h" /* Process a RequestVote RPC from the given server. */ int recvRequestVote(struct raft *r, raft_id id, const char *address, const struct raft_request_vote *args); #endif /* RECV_REQUEST_VOTE_H_ */ dqlite-1.16.7/src/raft/recv_request_vote_result.c000066400000000000000000000103721465252713400221350ustar00rootroot00000000000000#include "recv_request_vote_result.h" #include "../tracing.h" #include "assert.h" #include "configuration.h" #include "convert.h" #include "election.h" #include "recv.h" #include "replication.h" int recvRequestVoteResult(struct raft *r, raft_id id, const char *address, const struct raft_request_vote_result *result) { size_t votes_index; int match; int rv; (void)address; assert(r != NULL); assert(id > 0); tracef( "self:%llu from:%llu@%s term:%llu vote_granted:%d pre_vote:%d " "version:%d", r->id, id, address, result->term, result->vote_granted, result->pre_vote, result->version); votes_index = configurationIndexOfVoter(&r->configuration, id); if (votes_index == r->configuration.n) { tracef("non-voting or unknown server -> reject"); return 0; } /* Ignore responses if we are not candidate anymore */ if (r->state != RAFT_CANDIDATE) { tracef("local server is not candidate -> ignore"); return 0; } /* If we're in the pre-vote phase, don't actually increment our term * right now (we'll do it later, if we start the second phase), and also * don't step down if the peer is just one term ahead (this is okay as * in the request we sent our current term plus one). 
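* * A worked example of the arithmetic: with current_term = 5 our pre-vote requests carry term 6 while our own term stays at 5, so a response carrying term 6 is expected and must not make us step down; only a response with term 7 or higher signals a genuinely newer term (hence the result->term > r->current_term + 1 check below).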
*/ if (r->candidate_state.in_pre_vote) { recvCheckMatchingTerms(r, result->term, &match); } else { rv = recvEnsureMatchingTerms(r, result->term, &match); if (rv != 0) { return rv; } } /* Converted to follower as a result of seeing a higher term. */ if (r->state != RAFT_CANDIDATE) { tracef("no longer candidate -> ignore"); return 0; } if (match < 0) { /* If the term in the result is older than ours, this is an old * message we should ignore, because the node who voted for us * would have obtained our term. This happens if the network is * pretty choppy. */ tracef("local term is higher -> ignore"); return 0; } /* Avoid counting pre-vote votes as regular votes. */ if (result->version > 1 && result->pre_vote && !r->candidate_state.in_pre_vote) { tracef("receive stale pre-vote response -> ignore"); return 0; } /* This can happen when a candidate wins a pre-vote, bumps its term, * sends real RequestVote RPCs, crashes, comes online, starts a pre-vote * and then receives the response to the RequestVote RPC it sent * out before crashing. */ if (result->version > 1 && !result->pre_vote && r->candidate_state.in_pre_vote) { tracef("receive vote response during pre-vote -> ignore"); return 0; } /* If we're in the pre-vote phase, check that the peer's term is at most * one term ahead (possibly stepping down). If we're in the actual * voting phase, we expect our term to be the same as the response term * (otherwise we would have either ignored the result or bumped our * term). */ if (r->candidate_state.in_pre_vote) { if (match > 0) { if (result->term > r->current_term + 1) { assert(!result->vote_granted); rv = recvBumpCurrentTerm(r, result->term); return rv; } } } else { assert(result->term == r->current_term); } /* If the vote was granted and we reached quorum, convert to leader. * * From Figure 3.1: * * If votes received from majority of servers: become leader. * * From state diagram in Figure 3.3: * * [candidate]: receives votes from majority of servers -> [leader] * * From Section 3.4: * * A candidate wins an election if it receives votes from a majority * of the servers in the full cluster for the same term. Each server * will vote for at most one candidate in a given term, on a * first-come-first-served basis [...]. Once a candidate wins an * election, it becomes leader. */ if (result->vote_granted) { if (electionTally(r, votes_index)) { if (r->candidate_state.in_pre_vote) { tracef( "votes quorum reached -> pre-vote " "successful"); r->candidate_state.in_pre_vote = false; rv = electionStart(r); if (rv != 0) { return rv; } } else { tracef( "votes quorum reached -> convert to " "leader"); rv = convertToLeader(r); if (rv != 0) { return rv; } /* Send initial heartbeat. */ replicationHeartbeat(r); } } else { tracef("votes quorum not reached"); } } else { tracef("vote was not granted"); } return 0; } #undef tracef dqlite-1.16.7/src/raft/recv_request_vote_result.h000066400000000000000000000005731465252713400221440ustar00rootroot00000000000000/* Receive a RequestVote result. */ #ifndef RECV_REQUEST_VOTE_RESULT_H_ #define RECV_REQUEST_VOTE_RESULT_H_ #include "../raft.h" /* Process a RequestVote RPC result from the given server.
*/ int recvRequestVoteResult(struct raft *r, raft_id id, const char *address, const struct raft_request_vote_result *result); #endif /* RECV_REQUEST_VOTE_RESULT_H_ */ dqlite-1.16.7/src/raft/recv_timeout_now.c000066400000000000000000000036611465252713400203660ustar00rootroot00000000000000#include "recv_timeout_now.h" #include "../tracing.h" #include "assert.h" #include "configuration.h" #include "convert.h" #include "log.h" #include "recv.h" int recvTimeoutNow(struct raft *r, const raft_id id, const char *address, const struct raft_timeout_now *args) { const struct raft_server *local_server; raft_index local_last_index; raft_term local_last_term; int match; int rv; assert(r != NULL); assert(id > 0); assert(args != NULL); (void)address; tracef( "self:%llu from:%llu@%s last_log_index:%llu last_log_term:%llu " "term:%llu", r->id, id, address, args->last_log_index, args->last_log_term, args->term); /* Ignore the request if we are not a voter. */ local_server = configurationGet(&r->configuration, r->id); if (local_server == NULL || local_server->role != RAFT_VOTER) { tracef("non-voter"); return 0; } /* Ignore the request if we are not a follower, or we have a different * leader. */ if (r->state != RAFT_FOLLOWER || r->follower_state.current_leader.id != id) { tracef("Ignore - r->state:%d current_leader.id:%llu", r->state, r->follower_state.current_leader.id); return 0; } /* Possibly update our term. Ignore the request if it turns out we have * a higher term. */ rv = recvEnsureMatchingTerms(r, args->term, &match); if (rv != 0) { return rv; } if (match < 0) { return 0; } /* Ignore the request if our log is not up-to-date. */ local_last_index = logLastIndex(r->log); local_last_term = logLastTerm(r->log); if (local_last_index != args->last_log_index || local_last_term != args->last_log_term) { return 0; } /* Finally, ignore the request if we're working on persisting some * entries. */ if (r->follower_state.append_in_flight_count > 0) { return 0; } /* Convert to candidate and start a new election. */ rv = convertToCandidate(r, true /* disrupt leader */); if (rv != 0) { return rv; } return 0; } #undef tracef dqlite-1.16.7/src/raft/recv_timeout_now.h000066400000000000000000000005051465252713400203650ustar00rootroot00000000000000/* Receive a TimeoutNow message. */ #ifndef RECV_TIMEOUT_NOW_H_ #define RECV_TIMEOUT_NOW_H_ #include "../raft.h" /* Process a TimeoutNow RPC from the given server. */ int recvTimeoutNow(struct raft *r, raft_id id, const char *address, const struct raft_timeout_now *args); #endif /* RECV_TIMEOUT_NOW_H_ */ dqlite-1.16.7/src/raft/replication.c000066400000000000000000001402131465252713400173020ustar00rootroot00000000000000#include <string.h> #include "assert.h" #include "configuration.h" #include "convert.h" #include "entry.h" #ifdef __GLIBC__ #include "error.h" #endif #include "../tracing.h" #include "err.h" #include "flags.h" #include "heap.h" #include "lifecycle.h" #include "log.h" #include "membership.h" #include "progress.h" #include "../lib/queue.h" #include "replication.h" #include "request.h" #include "snapshot.h" #ifndef max #define max(a, b) ((a) < (b) ? (b) : (a)) #endif #ifndef min #define min(a, b) ((a) < (b) ? (a) : (b)) #endif /* Context of a RAFT_IO_APPEND_ENTRIES request that was submitted with * raft_io->send(). */ struct sendAppendEntries { struct raft *raft; /* Instance sending the entries. */ struct raft_io_send send; /* Underlying I/O send request. */ raft_index index; /* Index of the first entry in the request.
*/ struct raft_entry *entries; /* Entries referenced in the request. */ unsigned n; /* Length of the entries array. */ raft_id server_id; /* Destination server. */ }; /* Callback invoked after request to send an AppendEntries RPC has completed. */ static void sendAppendEntriesCb(struct raft_io_send *send, const int status) { struct sendAppendEntries *req = send->data; struct raft *r = req->raft; unsigned i = configurationIndexOf(&r->configuration, req->server_id); if (r->state == RAFT_LEADER && i < r->configuration.n) { if (status != 0) { tracef( "failed to send append entries to server %llu: %s", req->server_id, raft_strerror(status)); /* Go back to probe mode. */ progressToProbe(r, i); } } /* Tell the log that we're done referencing these entries. */ logRelease(r->log, req->index, req->entries, req->n); raft_free(req); } /* Send an AppendEntries message to the i'th server, including all log entries * from the given point onwards. */ static int sendAppendEntries(struct raft *r, const unsigned i, const raft_index prev_index, const raft_term prev_term) { struct raft_server *server = &r->configuration.servers[i]; struct raft_message message; struct raft_append_entries *args = &message.append_entries; struct sendAppendEntries *req; raft_index next_index = prev_index + 1; int rv; args->term = r->current_term; args->prev_log_index = prev_index; args->prev_log_term = prev_term; /* TODO: implement a limit to the total size of the entries being sent */ rv = logAcquire(r->log, next_index, &args->entries, &args->n_entries); if (rv != 0) { goto err; } /* From Section 3.5: * * The leader keeps track of the highest index it knows to be * committed, and it includes that index in future AppendEntries RPCs * (including heartbeats) so that the other servers eventually find out. * Once a follower learns that a log entry is committed, it applies the * entry to its local state machine (in log order) */ args->leader_commit = r->commit_index; tracef( "send %u entries starting at %llu to server %llu (last index %llu)", args->n_entries, args->prev_log_index, server->id, logLastIndex(r->log)); message.type = RAFT_IO_APPEND_ENTRIES; message.server_id = server->id; message.server_address = server->address; req = raft_malloc(sizeof *req); if (req == NULL) { rv = RAFT_NOMEM; goto err_after_entries_acquired; } req->raft = r; req->index = args->prev_log_index + 1; req->entries = args->entries; req->n = args->n_entries; req->server_id = server->id; req->send.data = req; rv = r->io->send(r->io, &req->send, &message, sendAppendEntriesCb); if (rv != 0) { goto err_after_req_alloc; } if (progressState(r, i) == PROGRESS__PIPELINE) { /* Optimistically update progress. */ progressOptimisticNextIndex(r, i, req->index + req->n); } progressUpdateLastSend(r, i); return 0; err_after_req_alloc: raft_free(req); err_after_entries_acquired: logRelease(r->log, next_index, args->entries, args->n_entries); err: assert(rv != 0); return rv; } /* Context of a RAFT_IO_INSTALL_SNAPSHOT request that was submitted with * raft_io->send(). */ struct sendInstallSnapshot { struct raft *raft; /* Instance sending the snapshot. */ struct raft_io_snapshot_get get; /* Snapshot get request. */ struct raft_io_send send; /* Underlying I/O send request. */ struct raft_snapshot *snapshot; /* Snapshot to send. */ raft_id server_id; /* Destination server.
*/ }; static void sendInstallSnapshotCb(struct raft_io_send *send, int status) { struct sendInstallSnapshot *req = send->data; struct raft *r = req->raft; const struct raft_server *server; server = configurationGet(&r->configuration, req->server_id); if (status != 0) { tracef("send install snapshot: %s", raft_strerror(status)); if (r->state == RAFT_LEADER && server != NULL) { unsigned i; i = configurationIndexOf(&r->configuration, req->server_id); progressAbortSnapshot(r, i); } } snapshotClose(req->snapshot); raft_free(req->snapshot); raft_free(req); } static void sendSnapshotGetCb(struct raft_io_snapshot_get *get, struct raft_snapshot *snapshot, int status) { struct sendInstallSnapshot *req = get->data; struct raft *r = req->raft; struct raft_message message; struct raft_install_snapshot *args = &message.install_snapshot; const struct raft_server *server = NULL; bool progress_state_is_snapshot = false; unsigned i = 0; int rv; if (status != 0) { tracef("get snapshot %s", raft_strerror(status)); goto abort; } if (r->state != RAFT_LEADER) { goto abort_with_snapshot; } server = configurationGet(&r->configuration, req->server_id); if (server == NULL) { /* Probably the server was removed in the meantime. */ goto abort_with_snapshot; } i = configurationIndexOf(&r->configuration, req->server_id); progress_state_is_snapshot = progressState(r, i) == PROGRESS__SNAPSHOT; if (!progress_state_is_snapshot) { /* Something happened in the meantime. */ goto abort_with_snapshot; } assert(snapshot->n_bufs == 1); message.type = RAFT_IO_INSTALL_SNAPSHOT; message.server_id = server->id; message.server_address = server->address; args->term = r->current_term; args->last_index = snapshot->index; args->last_term = snapshot->term; args->conf_index = snapshot->configuration_index; args->conf = snapshot->configuration; args->data = snapshot->bufs[0]; req->snapshot = snapshot; req->send.data = req; tracef("sending snapshot with last index %llu to %llu", snapshot->index, server->id); rv = r->io->send(r->io, &req->send, &message, sendInstallSnapshotCb); if (rv != 0) { goto abort_with_snapshot; } goto out; abort_with_snapshot: snapshotClose(snapshot); raft_free(snapshot); abort: if (r->state == RAFT_LEADER && server != NULL && progress_state_is_snapshot) { progressAbortSnapshot(r, i); } raft_free(req); out: return; } /* Send the latest snapshot to the i'th server */ static int sendSnapshot(struct raft *r, const unsigned i) { struct raft_server *server = &r->configuration.servers[i]; struct sendInstallSnapshot *request; int rv; progressToSnapshot(r, i); request = raft_malloc(sizeof *request); if (request == NULL) { rv = RAFT_NOMEM; goto err; } request->raft = r; request->server_id = server->id; request->get.data = request; /* TODO: make sure that the I/O implementation really returns the latest * snapshot *at this time* and not any snapshot that might be stored at * a later point. Otherwise the progress snapshot_index would be wrong. 
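* * (Illustrative race, not something the current code defends against: the get request is queued while the latest snapshot has index 100, a newer snapshot at index 200 lands before the callback runs, and the callback then hands us the index-200 snapshot even though progress was set up against index 100.)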
*/ rv = r->io->snapshot_get(r->io, &request->get, sendSnapshotGetCb); if (rv != 0) { goto err_after_req_alloc; } progressUpdateSnapshotLastSend(r, i); return 0; err_after_req_alloc: raft_free(request); err: progressAbortSnapshot(r, i); assert(rv != 0); return rv; } int replicationProgress(struct raft *r, unsigned i) { struct raft_server *server = &r->configuration.servers[i]; bool progress_state_is_snapshot = progressState(r, i) == PROGRESS__SNAPSHOT; raft_index snapshot_index = logSnapshotIndex(r->log); raft_index next_index = progressNextIndex(r, i); raft_index prev_index; raft_term prev_term; assert(r->state == RAFT_LEADER); assert(server->id != r->id); assert(next_index >= 1); if (!progressShouldReplicate(r, i)) { return 0; } /* From Section 3.5: * * When sending an AppendEntries RPC, the leader includes the index * and term of the entry in its log that immediately precedes the new * entries. If the follower does not find an entry in its log with the * same index and term, then it refuses the new entries. The * consistency check acts as an induction step: the initial empty state * of the logs satisfies the Log Matching Property, and the consistency * check preserves the Log Matching Property whenever logs are extended. * As a result, whenever AppendEntries returns successfully, the leader * knows that the follower's log is identical to its own log up through * the new entries (Log Matching Property in Figure 3.2). */ if (next_index == 1) { /* We're including the very first entry, so prevIndex and * prevTerm are null. If the first entry is not available * anymore, send the last snapshot if we're not already sending * one. */ if (snapshot_index > 0 && !progress_state_is_snapshot) { raft_index last_index = logLastIndex(r->log); assert(last_index > 0); /* The log can't be empty */ goto send_snapshot; } prev_index = 0; prev_term = 0; } else { /* Set prevIndex and prevTerm to the index and term of the entry * at next_index - 1. */ prev_index = next_index - 1; prev_term = logTermOf(r->log, prev_index); /* If the entry is no longer in our log, send the last * snapshot if we're not doing so already. */ if (prev_term == 0 && !progress_state_is_snapshot) { assert(prev_index < snapshot_index); tracef("missing entry at index %lld -> send snapshot", prev_index); goto send_snapshot; } } /* Send empty AppendEntries RPC when installing a snapshot */ if (progress_state_is_snapshot) { prev_index = logLastIndex(r->log); prev_term = logLastTerm(r->log); } return sendAppendEntries(r, i, prev_index, prev_term); send_snapshot: if (progressGetRecentRecv(r, i)) { /* Only send a snapshot when we have heard from the server */ return sendSnapshot(r, i); } else { /* Send empty AppendEntries RPC when we haven't heard from the * server */ prev_index = logLastIndex(r->log); prev_term = logLastTerm(r->log); return sendAppendEntries(r, i, prev_index, prev_term); } } /* Possibly trigger I/O requests for newly appended log entries or heartbeats. * * This function loops through all followers and triggers replication on them. * * It must be called only by leaders. */ static int triggerAll(struct raft *r) { unsigned i; int rv; assert(r->state == RAFT_LEADER); /* Trigger replication for servers we didn't hear from recently. */ for (i = 0; i < r->configuration.n; i++) { struct raft_server *server = &r->configuration.servers[i]; if (server->id == r->id) { continue; } /* Skip spare servers, unless they're being promoted.
*/ if (server->role == RAFT_SPARE && server->id != r->leader_state.promotee_id) { continue; } rv = replicationProgress(r, i); if (rv != 0 && rv != RAFT_NOCONNECTION) { /* This is not a critical failure, let's just log it. */ tracef( "failed to send append entries to server %llu: %s " "(%d)", server->id, raft_strerror(rv), rv); } } return 0; } int replicationHeartbeat(struct raft *r) { return triggerAll(r); } /* Context for a write log entries request that was submitted by a leader. */ struct appendLeader { struct raft *raft; /* Instance that has submitted the request */ raft_index index; /* Index of the first entry in the request. */ struct raft_entry *entries; /* Entries referenced in the request. */ unsigned n; /* Length of the entries array. */ struct raft_io_append req; }; /* Called after a successful append entries I/O request to update the index of * the last entry stored on disk. Return the number of entries that were * stored and are still present in our in-memory log. */ static size_t updateLastStored(struct raft *r, raft_index first_index, struct raft_entry *entries, size_t n_entries) { size_t i; /* Check which of these entries are still in our in-memory log */ for (i = 0; i < n_entries; i++) { struct raft_entry *entry = &entries[i]; raft_index index = first_index + i; raft_term local_term = logTermOf(r->log, index); /* If we have no entry at this index, or if the entry we have * now has a different term, it means that this entry got * truncated, so let's stop here. */ if (local_term == 0 || (local_term > 0 && local_term != entry->term)) { break; } /* If we do have an entry at this index, its term must match the * one of the entry we wrote on disk. */ assert(local_term != 0 && local_term == entry->term); } r->last_stored += i; return i; } /* Get the request matching the given @index and @type, if any. * The type check is skipped when @type == -1. */ static struct request *getRequest(struct raft *r, const raft_index index, int type) { queue *head; struct request *req; if (r->state != RAFT_LEADER) { return NULL; } QUEUE_FOREACH(head, &r->leader_state.requests) { req = QUEUE_DATA(head, struct request, queue); if (req->index == index) { if (type != -1) { assert(req->type == type); } lifecycleRequestEnd(r, req); return req; } } return NULL; } /* Invoked once a disk write request for new entries has been completed. */ static void appendLeaderCb(struct raft_io_append *append, int status) { struct appendLeader *request = append->data; struct raft *r = request->raft; size_t server_index; raft_index index; int rv; tracef("leader: written %u entries starting at %lld: status %d", request->n, request->index, status); /* In case of a failed disk write, if we were the leader creating these * entries in the first place, truncate our log too (since we have * appended these entries to it) and fire the request callbacks. * * Afterward, convert immediately to follower state, giving the cluster * a chance to elect another leader that doesn't have a full disk (or * whatever caused our write error).
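* * For example, if a RAFT_COMMAND entry is part of this batch and the write fails, the code below fires its raft_apply callback with the error status and a NULL result, truncates the in-memory log back to the first index of the batch, and finally calls convertToFollower().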
*/ if (status != 0) { ErrMsgTransfer(r->io->errmsg, r->errmsg, "io"); for (unsigned i = 0; i < request->n; i++) { const struct request *req = getRequest(r, request->index + i, -1); if (!req) { tracef("no request found at index %llu", request->index + i); continue; } switch (req->type) { case RAFT_COMMAND: { struct raft_apply *apply = (struct raft_apply *)req; if (apply->cb) { apply->cb(apply, status, NULL); } break; } case RAFT_BARRIER: { struct raft_barrier *barrier = (struct raft_barrier *)req; if (barrier->cb) { barrier->cb(barrier, status); } break; } case RAFT_CHANGE: { struct raft_change *change = (struct raft_change *)req; if (change->cb) { change->cb(change, status); } break; } default: tracef( "unknown request type, shutdown."); assert(false); break; } } goto out; } updateLastStored(r, request->index, request->entries, request->n); /* If we are not leader anymore, just discard the result. */ if (r->state != RAFT_LEADER) { tracef("local server is not leader -> ignore write log result"); goto out; } /* Only update the next index if we are part of the current * configuration. The only case where this is not true is when we were * asked to remove ourselves from the cluster. * * From Section 4.2.2: * * there will be a period of time (while it is committing Cnew) when a * leader can manage a cluster that does not include itself; it * replicates log entries but does not count itself in majorities. */ server_index = configurationIndexOf(&r->configuration, r->id); if (server_index < r->configuration.n) { r->leader_state.progress[server_index].match_index = r->last_stored; } /* Check if we can commit some new entries. */ replicationQuorum(r, r->last_stored); rv = replicationApply(r); if (rv != 0) { /* TODO: just log the error? */ } out: /* Tell the log that we're done referencing these entries. */ logRelease(r->log, request->index, request->entries, request->n); index = request->index; raft_free(request); if (status != 0) { if (index <= logLastIndex(r->log)) { logTruncate(r->log, index); } if (r->state == RAFT_LEADER) { convertToFollower(r); } } } /* Submit a disk write for all entries from the given index onward. */ static int appendLeader(struct raft *r, raft_index index) { struct raft_entry *entries = NULL; unsigned n; struct appendLeader *request; int rv; assert(r->state == RAFT_LEADER); assert(index > 0); assert(index > r->last_stored); /* Acquire all the entries from the given index onwards. */ rv = logAcquire(r->log, index, &entries, &n); if (rv != 0) { goto err; } /* We expect this function to be called only when there are actually * some entries to write. */ if (n == 0) { assert(false); tracef("No log entries found at index %llu", index); ErrMsgPrintf(r->errmsg, "No log entries found at index %llu", index); rv = RAFT_SHUTDOWN; goto err_after_entries_acquired; } /* Allocate a new request. 
*/ request = raft_malloc(sizeof *request); if (request == NULL) { rv = RAFT_NOMEM; goto err_after_entries_acquired; } request->raft = r; request->index = index; request->entries = entries; request->n = n; request->req.data = request; rv = r->io->append(r->io, &request->req, entries, n, appendLeaderCb); if (rv != 0) { ErrMsgTransfer(r->io->errmsg, r->errmsg, "io"); goto err_after_request_alloc; } return 0; err_after_request_alloc: raft_free(request); err_after_entries_acquired: logRelease(r->log, index, entries, n); err: assert(rv != 0); return rv; } int replicationTrigger(struct raft *r, raft_index index) { int rv; rv = appendLeader(r, index); if (rv != 0) { return rv; } return triggerAll(r); } /* Helper to be invoked after a promotion of a non-voting server has been * requested via @raft_assign and that server has caught up with logs. * * This function changes the local configuration marking the server being * promoted as actually voting, appends a RAFT_CHANGE entry with the new * configuration to the local log and triggers its replication. */ static int triggerActualPromotion(struct raft *r) { raft_index index; raft_term term = r->current_term; size_t server_index; struct raft_server *server; int old_role; int rv; assert(r->state == RAFT_LEADER); assert(r->leader_state.promotee_id != 0); server_index = configurationIndexOf(&r->configuration, r->leader_state.promotee_id); assert(server_index < r->configuration.n); server = &r->configuration.servers[server_index]; assert(server->role != RAFT_VOTER); /* Update our current configuration. */ old_role = server->role; server->role = RAFT_VOTER; /* Index of the entry being appended. */ index = logLastIndex(r->log) + 1; /* Encode the new configuration and append it to the log. */ rv = logAppendConfiguration(r->log, term, &r->configuration); if (rv != 0) { goto err; } /* Start writing the new log entry to disk and send it to the followers. */ rv = replicationTrigger(r, index); if (rv != 0) { goto err_after_log_append; } r->leader_state.promotee_id = 0; r->configuration_uncommitted_index = logLastIndex(r->log); return 0; err_after_log_append: logTruncate(r->log, index); err: server->role = old_role; assert(rv != 0); return rv; } int replicationUpdate(struct raft *r, const struct raft_server *server, const struct raft_append_entries_result *result) { bool is_being_promoted; raft_index last_index; unsigned i; int rv; i = configurationIndexOf(&r->configuration, server->id); assert(r->state == RAFT_LEADER); assert(i < r->configuration.n); progressMarkRecentRecv(r, i); progressSetFeatures(r, i, result->features); /* If the RPC failed because of a log mismatch, retry. * * From Figure 3.1: * * [Rules for servers] Leaders: * * - If AppendEntries fails because of log inconsistency: * decrement nextIndex and retry. */ if (result->rejected > 0) { bool retry; retry = progressMaybeDecrement(r, i, result->rejected, result->last_log_index); if (retry) { /* Retry, ignoring errors. */ tracef("log mismatch -> send old entries to %llu", server->id); replicationProgress(r, i); } return 0; } /* In case of success the remote server is expected to send us back the * value of prevLogIndex + len(entriesToAppend). If it has a longer log, * it might be a leftover from previous terms. */ last_index = result->last_log_index; if (last_index > logLastIndex(r->log)) { last_index = logLastIndex(r->log); } /* If the RPC succeeded, update our counters for this server.
* * From Figure 3.1: * * [Rules for servers] Leaders: * * If successful update nextIndex and matchIndex for follower. */ if (!progressMaybeUpdate(r, i, last_index)) { return 0; } switch (progressState(r, i)) { case PROGRESS__SNAPSHOT: /* If a snapshot has been installed, transition back to * probe */ if (progressSnapshotDone(r, i)) { progressToProbe(r, i); } break; case PROGRESS__PROBE: /* Transition to pipeline */ progressToPipeline(r, i); } /* If the server is currently being promoted and is catching up with * logs, update the information about the current catch-up round, and * possibly proceed with the promotion. */ is_being_promoted = r->leader_state.promotee_id != 0 && r->leader_state.promotee_id == server->id; if (is_being_promoted) { bool is_up_to_date = membershipUpdateCatchUpRound(r); if (is_up_to_date) { rv = triggerActualPromotion(r); if (rv != 0) { return rv; } } } /* Check if we can commit some new entries. */ replicationQuorum(r, last_index); rv = replicationApply(r); if (rv != 0) { /* TODO: just log the error? */ } /* Abort here if we have been removed and are no longer leader. */ if (r->state != RAFT_LEADER) { goto out; } /* Get the server index again since it might have been removed from the * configuration. */ i = configurationIndexOf(&r->configuration, server->id); if (i < r->configuration.n) { /* If we are transferring leadership to this follower, check if * its log is now up-to-date and, if so, send it a TimeoutNow * RPC (unless we already did). */ if (r->transfer != NULL && r->transfer->id == server->id) { if (progressPersistedIsUpToDate(r, i) && r->transfer->send.data == NULL) { rv = membershipLeadershipTransferStart(r); if (rv != 0) { membershipLeadershipTransferClose(r); } } } /* If this follower is in pipeline mode, send it more entries. */ if (progressState(r, i) == PROGRESS__PIPELINE) { replicationProgress(r, i); } } out: return 0; } static void sendAppendEntriesResultCb(struct raft_io_send *req, int status) { (void)status; RaftHeapFree(req); } static void sendAppendEntriesResult( struct raft *r, const struct raft_append_entries_result *result) { struct raft_message message; struct raft_io_send *req; int rv; assert(r->state == RAFT_FOLLOWER); message.type = RAFT_IO_APPEND_ENTRIES_RESULT; message.server_id = r->follower_state.current_leader.id; message.server_address = r->follower_state.current_leader.address; message.append_entries_result = *result; req = raft_malloc(sizeof *req); if (req == NULL) { return; } req->data = r; rv = r->io->send(r->io, req, &message, sendAppendEntriesResultCb); if (rv != 0) { raft_free(req); } } /* Context for a write log entries request that was submitted by a follower. */ struct appendFollower { struct raft *raft; /* Instance that has submitted the request */ raft_index index; /* Index of the first entry in the request.
*/ struct raft_append_entries args; struct raft_io_append req; }; static void appendFollowerCb(struct raft_io_append *req, int status) { struct appendFollower *request = req->data; struct raft *r = request->raft; struct raft_append_entries *args = &request->args; struct raft_append_entries_result result; size_t i; size_t j; int rv; tracef("I/O completed on follower: status %d", status); assert(args->entries != NULL); assert(args->n_entries > 0); assert(r->state == RAFT_FOLLOWER || r->state == RAFT_UNAVAILABLE); if (r->state == RAFT_UNAVAILABLE) { goto out; } assert(r->follower_state.append_in_flight_count > 0); r->follower_state.append_in_flight_count -= 1; result.term = r->current_term; result.version = RAFT_APPEND_ENTRIES_RESULT_VERSION; result.features = RAFT_DEFAULT_FEATURE_FLAGS; if (status != 0) { ErrMsgTransfer(r->io->errmsg, r->errmsg, "io"); result.rejected = args->prev_log_index + 1; goto respond; } /* We received an InstallSnapshot RPC while these entries were being * persisted to disk */ if (replicationInstallSnapshotBusy(r)) { goto out; } i = updateLastStored(r, request->index, args->entries, args->n_entries); /* If none of the entries that we persisted is present anymore in our * in-memory log, there's nothing to report or to do. We just discard * them. */ if (i == 0) { goto out; } /* Possibly apply configuration changes as uncommitted. */ for (j = 0; j < i; j++) { struct raft_entry *entry = &args->entries[j]; raft_index index = request->index + j; raft_term local_term = logTermOf(r->log, index); assert(local_term != 0 && local_term == entry->term); if (entry->type == RAFT_CHANGE) { rv = membershipUncommittedChange(r, index, entry); if (rv != 0) { goto out; } } } /* From Figure 3.1: * * AppendEntries RPC: Receiver implementation: If leaderCommit > * commitIndex, set commitIndex = min(leaderCommit, index of last new * entry). */ if (args->leader_commit > r->commit_index && r->last_stored >= r->commit_index) { r->commit_index = min(args->leader_commit, r->last_stored); rv = replicationApply(r); if (rv != 0) { goto out; } } /* If our term number has changed since receiving these entries, * our current_leader may have changed as well, so don't send a response * to that server. */ if (r->current_term != args->term) { tracef( "new role or term since receiving entries -> don't " "respond"); goto out; } result.rejected = 0; respond: result.last_log_index = r->last_stored; sendAppendEntriesResult(r, &result); out: logRelease(r->log, request->index, request->args.entries, request->args.n_entries); /* If the write failed, we need to truncate the log. */ if (status != 0) { if (request->index <= logLastIndex(r->log)) { logTruncate(r->log, request->index); } } raft_free(request); } /* Check the log matching property against an incoming AppendEntries request. * * From Figure 3.1: * * [AppendEntries RPC] Receiver implementation: * * 2. Reply false if log doesn't contain an entry at prevLogIndex whose * term matches prevLogTerm. * * Return 0 if the check passed. * * Return 1 if the check did not pass and the request needs to be rejected. * * Return -1 if there's a conflict and we need to shutdown. */ static int checkLogMatchingProperty(struct raft *r, const struct raft_append_entries *args) { raft_term local_prev_term; /* If this is the very first entry, there's nothing to check. 
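* * Worked example: with prev_log_index = 7 and prev_log_term = 3 the check passes only if our log holds an entry at index 7 with term 3. If index 7 is missing we reject (return 1); if it holds a different term we also reject, unless index 7 is already committed, in which case committed state has diverged and we return -1 to shut down.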
*/ if (args->prev_log_index == 0) { return 0; } local_prev_term = logTermOf(r->log, args->prev_log_index); if (local_prev_term == 0) { tracef("no entry at index %llu -> reject", args->prev_log_index); return 1; } if (local_prev_term != args->prev_log_term) { if (args->prev_log_index <= r->commit_index) { /* Should never happen; something is seriously wrong! */ tracef( "conflicting terms %llu and %llu for entry %llu " "(commit " "index %llu) -> shutdown", local_prev_term, args->prev_log_term, args->prev_log_index, r->commit_index); return -1; } tracef("previous term mismatch -> reject"); return 1; } return 0; } /* Delete from our log all entries that conflict with the ones in the given * AppendEntries request. * * From Figure 3.1: * * [AppendEntries RPC] Receiver implementation: * * 3. If an existing entry conflicts with a new one (same index but * different terms), delete the existing entry and all that follow it. * * The i output parameter will be set to the array index of the first new log * entry that we don't have yet in our log, among the ones included in the given * AppendEntries request. */ static int deleteConflictingEntries(struct raft *r, const struct raft_append_entries *args, size_t *i) { size_t j; int rv; for (j = 0; j < args->n_entries; j++) { struct raft_entry *entry = &args->entries[j]; raft_index entry_index = args->prev_log_index + 1 + j; raft_term local_term = logTermOf(r->log, entry_index); if (local_term > 0 && local_term != entry->term) { if (entry_index <= r->commit_index) { /* Should never happen; something is seriously * wrong! */ tracef( "new index conflicts with committed entry " "-> shutdown"); return RAFT_SHUTDOWN; } tracef("log mismatch -> truncate (%llu)", entry_index); /* Possibly discard uncommitted configuration changes. */ if (r->configuration_uncommitted_index >= entry_index) { rv = membershipRollback(r); if (rv != 0) { return rv; } } /* Delete all entries from this index on because they * don't match. */ rv = r->io->truncate(r->io, entry_index); if (rv != 0) { return rv; } logTruncate(r->log, entry_index); /* Drop information about previously stored entries that * have just been discarded. */ if (r->last_stored >= entry_index) { r->last_stored = entry_index - 1; } /* We want to append all entries from here on, replacing * anything that we had before. */ break; } else if (local_term == 0) { /* We don't have an entry at this index, so we want to * append this new one and all the subsequent ones. */ break; } } *i = j; return 0; } int replicationAppend(struct raft *r, const struct raft_append_entries *args, raft_index *rejected, bool *async) { struct appendFollower *request; int match; size_t n; size_t i; size_t j; bool reinstated; int rv; assert(r != NULL); assert(args != NULL); assert(rejected != NULL); assert(async != NULL); assert(r->state == RAFT_FOLLOWER); *rejected = args->prev_log_index; *async = false; /* Check the log matching property. */ match = checkLogMatchingProperty(r, args); if (match != 0) { assert(match == 1 || match == -1); return match == 1 ? 0 : RAFT_SHUTDOWN; } /* Delete conflicting entries. */ rv = deleteConflictingEntries(r, args, &i); if (rv != 0) { return rv; } *rejected = 0; n = args->n_entries - i; /* Number of new entries */ /* If this is an empty AppendEntries, there's nothing to write. However * we still want to check if we can commit some entry. However, don't * commit anything while a snapshot install is busy, r->last_stored will * be 0 in that case. 
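* * (Numeric example: leader_commit = 10 with last_stored = 8 moves commit_index to min(10, 8) = 8; the remaining entries are committed once a later AppendEntries arrives after they have been persisted locally.)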
* * From Figure 3.1: * * AppendEntries RPC: Receiver implementation: If leaderCommit > * commitIndex, set commitIndex = min(leaderCommit, index of last new * entry). */ if (n == 0) { if ((args->leader_commit > r->commit_index) && r->last_stored >= r->commit_index && !replicationInstallSnapshotBusy(r)) { r->commit_index = min(args->leader_commit, r->last_stored); rv = replicationApply(r); if (rv != 0) { return rv; } } return 0; } *async = true; request = raft_malloc(sizeof *request); if (request == NULL) { rv = RAFT_NOMEM; goto err; } request->raft = r; request->args = *args; /* Index of first new entry */ request->index = args->prev_log_index + 1 + i; /* Update our in-memory log to reflect that we received these entries. * We'll notify the leader of a successful append once the write entries * request that we issue below actually completes. */ for (j = 0; j < n; j++) { struct raft_entry *entry = &args->entries[i + j]; /* We are trying to append an entry at index X with term T to * our in-memory log. If we've gotten this far, we know that the * log *logically* has no entry at this index. However, it's * possible that we're still hanging on to such an entry, * because we previously tried to append and replicate it, and * the associated disk write failed, but some send requests are * still pending that refer to it. Since the log is not capable * of tracking multiple independent entries that share an index * and term, we just piggyback on the already-stored entry in * this case. */ rv = logReinstate(r->log, entry->term, entry->type, &reinstated); if (rv != 0) { goto err_after_request_alloc; } else if (reinstated) { continue; } /* TODO This copy should not strictly be necessary, as the batch * logic will take care of freeing the batch buffer in which the * entries are received. However, this would lead to memory * spikes in certain edge cases. * https://github.com/canonical/dqlite/issues/276 */ struct raft_entry copy = {0}; rv = entryCopy(entry, ©); if (rv != 0) { goto err_after_request_alloc; } rv = logAppend(r->log, copy.term, copy.type, copy.buf, (struct raft_entry_local_data){}, false, NULL); if (rv != 0) { goto err_after_request_alloc; } } /* Acquire the relevant entries from the log. */ rv = logAcquire(r->log, request->index, &request->args.entries, &request->args.n_entries); if (rv != 0) { goto err_after_request_alloc; } assert(request->args.n_entries == n); if (request->args.n_entries == 0) { tracef("No log entries found at index %llu", request->index); ErrMsgPrintf(r->errmsg, "No log entries found at index %llu", request->index); rv = RAFT_SHUTDOWN; goto err_after_acquire_entries; } request->req.data = request; rv = r->io->append(r->io, &request->req, request->args.entries, request->args.n_entries, appendFollowerCb); if (rv != 0) { ErrMsgTransfer(r->io->errmsg, r->errmsg, "io"); goto err_after_acquire_entries; } r->follower_state.append_in_flight_count += 1; entryBatchesDestroy(args->entries, args->n_entries); return 0; err_after_acquire_entries: /* Release the entries related to the IO request */ logRelease(r->log, request->index, request->args.entries, request->args.n_entries); err_after_request_alloc: /* Release all entries added to the in-memory log, making * sure the in-memory log and disk don't diverge, leading * to future log entries not being persisted to disk. 
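* * (Example: if 3 of 5 new entries were added to the in-memory log before the failure, logTruncate() below drops all 3 again, so a future append at the same indexes starts from a clean slate.)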
*/ if (j != 0) { logTruncate(r->log, request->index); } raft_free(request); err: assert(rv != 0); return rv; } struct recvInstallSnapshot { struct raft *raft; struct raft_snapshot snapshot; raft_term term; /* Used to check for state transitions. */ }; static void installSnapshotCb(struct raft_io_snapshot_put *req, int status) { struct recvInstallSnapshot *request = req->data; struct raft *r = request->raft; struct raft_snapshot *snapshot = &request->snapshot; struct raft_append_entries_result result; bool should_respond = true; int rv; /* We avoid converting to candidate state while installing a snapshot. */ assert(r->state == RAFT_FOLLOWER || r->state == RAFT_UNAVAILABLE); r->snapshot.put.data = NULL; result.term = r->current_term; result.version = RAFT_APPEND_ENTRIES_RESULT_VERSION; result.features = RAFT_DEFAULT_FEATURE_FLAGS; result.rejected = 0; /* If we are shutting down, let's discard the result. */ if (r->state == RAFT_UNAVAILABLE) { tracef( "shutting down -> discard result of snapshot installation"); should_respond = false; goto discard; } /* If the request is from a previous term, it means that someone else * became a candidate while we were installing the snapshot. In that * case, we want to install the snapshot anyway, but our "current * leader" may no longer be the same as the server that sent the install * request, so we shouldn't send a response to that server. */ if (request->term != r->current_term) { tracef( "new term since receiving snapshot -> install but don't " "respond"); should_respond = false; } if (status != 0) { tracef("save snapshot %llu: %s", snapshot->index, raft_strerror(status)); goto discard; } /* From Figure 5.3: * * 7. Discard the entire log * 8. Reset state machine using snapshot contents (and load lastConfig * as cluster configuration). */ rv = snapshotRestore(r, snapshot); if (rv != 0) { tracef("restore snapshot %llu: %s", snapshot->index, raft_strerror(status)); goto discard; } tracef("restored snapshot with last index %llu", snapshot->index); goto respond; discard: /* In case of error we must also free the snapshot data buffer and free * the configuration. */ result.rejected = snapshot->index; raft_free(snapshot->bufs[0].base); raft_free(snapshot->bufs); raft_configuration_close(&snapshot->configuration); respond: if (should_respond) { result.last_log_index = r->last_stored; sendAppendEntriesResult(r, &result); } raft_free(request); } int replicationInstallSnapshot(struct raft *r, const struct raft_install_snapshot *args, raft_index *rejected, bool *async) { struct recvInstallSnapshot *request; struct raft_snapshot *snapshot; raft_term local_term; int rv; assert(r->state == RAFT_FOLLOWER); *rejected = args->last_index; *async = false; /* If we are taking a snapshot ourselves or installing a snapshot, * ignore the request, the leader will eventually retry. TODO: we should * do something smarter. */ if (r->snapshot.pending.term != 0 || r->snapshot.put.data != NULL) { *async = true; tracef("already taking or installing snapshot"); return RAFT_BUSY; } /* If our last snapshot is more up-to-date, this is a no-op */ if (r->log->snapshot.last_index >= args->last_index) { tracef("have more recent snapshot"); *rejected = 0; return 0; } /* If we already have all entries in the snapshot, this is a no-op */ local_term = logTermOf(r->log, args->last_index); if (local_term != 0 && local_term >= args->last_term) { tracef("have all entries"); *rejected = 0; return 0; } *async = true; /* Preemptively update our in-memory state. 
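* * logRestore() wipes the whole in-memory log and records last_index/last_term as the new snapshot point, while last_stored drops to 0; together with snapshot.put.data being set below, that is what makes replicationInstallSnapshotBusy() report true until installSnapshotCb runs.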
*/ logRestore(r->log, args->last_index, args->last_term); r->last_stored = 0; request = raft_malloc(sizeof *request); if (request == NULL) { rv = RAFT_NOMEM; goto err; } request->raft = r; request->term = r->current_term; snapshot = &request->snapshot; snapshot->term = args->last_term; snapshot->index = args->last_index; snapshot->configuration_index = args->conf_index; snapshot->configuration = args->conf; snapshot->bufs = raft_malloc(sizeof *snapshot->bufs); if (snapshot->bufs == NULL) { rv = RAFT_NOMEM; goto err_after_request_alloc; } snapshot->bufs[0] = args->data; snapshot->n_bufs = 1; assert(r->snapshot.put.data == NULL); r->snapshot.put.data = request; rv = r->io->snapshot_put(r->io, 0 /* zero trailing means replace everything */, &r->snapshot.put, snapshot, installSnapshotCb); if (rv != 0) { tracef("snapshot_put failed %d", rv); goto err_after_bufs_alloc; } return 0; err_after_bufs_alloc: raft_free(snapshot->bufs); r->snapshot.put.data = NULL; err_after_request_alloc: raft_free(request); err: assert(rv != 0); return rv; } /* Apply a RAFT_COMMAND entry that has been committed. */ static int applyCommand(struct raft *r, const raft_index index, const struct raft_buffer *buf) { struct raft_apply *req; void *result; int rv; rv = r->fsm->apply(r->fsm, buf, &result); if (rv != 0) { return rv; } r->last_applied = index; req = (struct raft_apply *)getRequest(r, index, RAFT_COMMAND); if (req != NULL && req->cb != NULL) { req->cb(req, 0, result); } return 0; } /* Fire the callback of a barrier request whose entry has been committed. */ static void applyBarrier(struct raft *r, const raft_index index) { r->last_applied = index; struct raft_barrier *req; req = (struct raft_barrier *)getRequest(r, index, RAFT_BARRIER); if (req != NULL && req->cb != NULL) { req->cb(req, 0); } } /* Apply a RAFT_CHANGE entry that has been committed. */ static void applyChange(struct raft *r, const raft_index index) { struct raft_change *req; assert(index > 0); /* If this is an uncommitted configuration that we had already applied * when submitting the configuration change (for leaders) or upon * receiving it via an AppendEntries RPC (for followers), then reset the * uncommitted index, since that uncommitted configuration is now * committed. */ if (r->configuration_uncommitted_index == index) { tracef("configuration at index:%llu is committed.", index); r->configuration_uncommitted_index = 0; } r->configuration_committed_index = index; r->last_applied = index; if (r->state == RAFT_LEADER) { const struct raft_server *server; req = r->leader_state.change; r->leader_state.change = NULL; /* If we are leader but not part of this new configuration, step * down. * * From Section 4.2.2: * * In this approach, a leader that is removed from the * configuration steps down once the Cnew entry is committed. */ server = configurationGet(&r->configuration, r->id); if (server == NULL || server->role != RAFT_VOTER) { tracef( "leader removed from config or no longer voter " "server: %p", (void *)server); convertToFollower(r); } if (req != NULL && req->cb != NULL) { req->cb(req, 0); } } } static bool shouldTakeSnapshot(struct raft *r) { /* If we are shutting down, let's not do anything. */ if (r->state == RAFT_UNAVAILABLE) { return false; } /* If a snapshot is already in progress or we're installing a snapshot, * we don't want to start another one. */ if (r->snapshot.pending.term != 0 || r->snapshot.put.data != NULL) { return false; }; /* If we didn't reach the threshold yet, do nothing. 
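* * (E.g. with threshold = 1024 and the last snapshot taken at index 4096, nothing happens while last_applied is below 5120; once it reaches 5120 the difference hits 1024 and a snapshot is taken.)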
*/ if (r->last_applied - r->log->snapshot.last_index < r->snapshot.threshold) { return false; } return true; } /* * When taking a snapshot, ownership of the snapshot data is with raft if * `snapshot_finalize` is NULL. */ static void takeSnapshotClose(struct raft *r, struct raft_snapshot *s) { if (r->fsm->version == 1 || (r->fsm->version > 1 && r->fsm->snapshot_finalize == NULL)) { snapshotClose(s); return; } configurationClose(&s->configuration); r->fsm->snapshot_finalize(r->fsm, &s->bufs, &s->n_bufs); } static void takeSnapshotCb(struct raft_io_snapshot_put *req, int status) { struct raft *r = req->data; struct raft_snapshot *snapshot; int rv; r->snapshot.put.data = NULL; snapshot = &r->snapshot.pending; if (status != 0) { tracef("snapshot %lld at term %lld: %s", snapshot->index, snapshot->term, raft_strerror(status)); goto out; } /* Cache the configuration contained in the snapshot. While the snapshot * was written, new configuration changes could have been committed, * these changes will not be purged from the log by this snapshot. * However we still cache the configuration for consistency. */ configurationClose(&r->configuration_last_snapshot); rv = configurationCopy(&snapshot->configuration, &r->configuration_last_snapshot); if (rv != 0) { /* TODO: make this a hard fault, because if we have no backup * and the log was truncated it will be impossible to rollback * an aborted configuration change. */ tracef("failed to backup last committed configuration."); } logSnapshot(r->log, snapshot->index, r->snapshot.trailing); out: takeSnapshotClose(r, snapshot); r->snapshot.pending.term = 0; } static int putSnapshot(struct raft *r, struct raft_snapshot *snapshot, raft_io_snapshot_put_cb cb) { int rv; assert(r->snapshot.put.data == NULL); r->snapshot.put.data = r; rv = r->io->snapshot_put(r->io, r->snapshot.trailing, &r->snapshot.put, snapshot, cb); if (rv != 0) { takeSnapshotClose(r, snapshot); r->snapshot.pending.term = 0; r->snapshot.put.data = NULL; } return rv; } static void takeSnapshotDoneCb(struct raft_io_async_work *take, int status) { struct raft *r = take->data; struct raft_snapshot *snapshot = &r->snapshot.pending; int rv; raft_free(take); if (status != 0) { tracef("take snapshot failed %s", raft_strerror(status)); takeSnapshotClose(r, snapshot); r->snapshot.pending.term = 0; r->snapshot.put.data = NULL; return; } rv = putSnapshot(r, snapshot, takeSnapshotCb); if (rv != 0) { tracef("put snapshot failed %d", rv); } } static int takeSnapshotAsync(struct raft_io_async_work *take) { struct raft *r = take->data; tracef("take snapshot async at %lld", r->snapshot.pending.index); struct raft_snapshot *snapshot = &r->snapshot.pending; return r->fsm->snapshot_async(r->fsm, &snapshot->bufs, &snapshot->n_bufs); } static int takeSnapshot(struct raft *r) { struct raft_snapshot *snapshot; int rv; tracef("take snapshot at %lld", r->last_applied); snapshot = &r->snapshot.pending; snapshot->index = r->last_applied; snapshot->term = logTermOf(r->log, r->last_applied); snapshot->bufs = NULL; snapshot->n_bufs = 0; rv = membershipFetchLastCommittedConfiguration( r, &snapshot->configuration); if (rv != 0) { goto abort; } snapshot->configuration_index = r->configuration_committed_index; rv = r->fsm->snapshot(r->fsm, &snapshot->bufs, &snapshot->n_bufs); if (rv != 0) { /* Ignore transient errors. We'll retry next time. 
*/ if (rv == RAFT_BUSY) { rv = 0; } raft_configuration_close(&snapshot->configuration); goto abort; } bool sync_snapshot = r->fsm->version < 3 || r->fsm->snapshot_async == NULL; if (sync_snapshot) { /* putSnapshot will clean up config and buffers in case of error */ return putSnapshot(r, snapshot, takeSnapshotCb); } else { struct raft_io_async_work *take = raft_malloc(sizeof(*take)); if (take == NULL) { rv = RAFT_NOMEM; goto abort_after_snapshot; } take->data = r; take->work = takeSnapshotAsync; rv = r->io->async_work(r->io, take, takeSnapshotDoneCb); if (rv != 0) { raft_free(take); goto abort_after_snapshot; } } return 0; abort_after_snapshot: /* Closes config and finalizes snapshot */ takeSnapshotClose(r, snapshot); abort: r->snapshot.pending.term = 0; return rv; } int replicationApply(struct raft *r) { raft_index index; int rv = 0; assert(r->state == RAFT_LEADER || r->state == RAFT_FOLLOWER); assert(r->last_applied <= r->commit_index); if (r->last_applied == r->commit_index) { /* Nothing to do. */ return 0; } for (index = r->last_applied + 1; index <= r->commit_index; index++) { const struct raft_entry *entry = logGet(r->log, index); if (entry == NULL) { /* This can happen while installing a snapshot */ tracef("replicationApply - ENTRY NULL"); return 0; } assert(entry->type == RAFT_COMMAND || entry->type == RAFT_BARRIER || entry->type == RAFT_CHANGE); switch (entry->type) { case RAFT_COMMAND: rv = applyCommand(r, index, &entry->buf); break; case RAFT_BARRIER: applyBarrier(r, index); rv = 0; break; case RAFT_CHANGE: applyChange(r, index); rv = 0; break; default: rv = 0; /* For coverity. This case can't be taken. */ break; } if (rv != 0) { break; } } if (shouldTakeSnapshot(r)) { rv = takeSnapshot(r); } return rv; } void replicationQuorum(struct raft *r, const raft_index index) { size_t votes = 0; size_t i; raft_term term; assert(r->state == RAFT_LEADER); if (index <= r->commit_index) { return; } term = logTermOf(r->log, index); /* TODO: fuzzy-test --seed 0x8db5fccc replication/entries/partitioned * fails the assertion below. */ if (term == 0) { return; } // assert(logTermOf(r->log, index) > 0); assert(!(term > r->current_term)); /* Don't commit entries from previous terms by counting replicas. */ if (term < r->current_term) { return; } for (i = 0; i < r->configuration.n; i++) { struct raft_server *server = &r->configuration.servers[i]; if (server->role != RAFT_VOTER) { continue; } if (r->leader_state.progress[i].match_index >= index) { votes++; } } if (votes > configurationVoterCount(&r->configuration) / 2) { r->commit_index = index; tracef("new commit index %llu", r->commit_index); } return; } inline bool replicationInstallSnapshotBusy(struct raft *r) { return r->last_stored == 0 && r->snapshot.put.data != NULL; } #undef tracef dqlite-1.16.7/src/raft/replication.h000066400000000000000000000073251465252713400173150ustar00rootroot00000000000000/* Log replication logic and helpers. */ #ifndef REPLICATION_H_ #define REPLICATION_H_ #include "../raft.h" /* Send AppendEntries RPC messages to all followers to which no AppendEntries * was sent in the last heartbeat interval. */ int replicationHeartbeat(struct raft *r); /* Start a local disk write for entries from the given index onwards, and * trigger replication against all followers, typically sending AppendEntries * RPC messages with outstanding log entries. */ int replicationTrigger(struct raft *r, raft_index index); /* Possibly send an AppendEntries or an InstallSnapshot RPC message to the * server with the given index. 
* * The rules to decide whether or not to send a message are: * * - If we have sent an InstallSnapshot RPC recently and we haven't yet received * a response, then don't send any new message. * * - If we are probing the follower (i.e. we haven't received a successful * response during the last heartbeat interval), then send a message only if we * haven't sent any during the last heartbeat interval. * * - If we are pipelining entries to the follower, then send any new entries we * haven't yet sent. * * If a message should be sent, the rules to decide what type of message to send * and what it should contain are: * * - If we no longer have the first entry that should be sent to the * follower, then send an InstallSnapshot RPC with the last snapshot. * * - If we still have the first entry to send, then send all entries from that index onward (possibly zero entries). * * This function must be called only by leaders. */ int replicationProgress(struct raft *r, unsigned i); /* Update the replication state (match and next indexes) for the given server * using the given AppendEntries RPC result. * * Possibly send the server a new set of entries or a snapshot if the result * was unsuccessful because of missing entries, or if new entries were added to * our log in the meantime. * * It must be called only by leaders. */ int replicationUpdate(struct raft *r, const struct raft_server *server, const struct raft_append_entries_result *result); /* Append the log entries in the given request if the Log Matching Property is * satisfied. * * The rejected output parameter will be set to 0 if the Log Matching Property * was satisfied, or to args->prev_log_index if not. * * The async output parameter will be set to true if some of the entries in the * request were not present in our log, and a disk write was started to persist * them to disk. The entries will still be appended immediately to our in-memory * copy of the log, but an AppendEntries result message will be sent only once * the disk write completes and the I/O callback is invoked. * * It must be called only by followers. */ int replicationAppend(struct raft *r, const struct raft_append_entries *args, raft_index *rejected, bool *async); int replicationInstallSnapshot(struct raft *r, const struct raft_install_snapshot *args, raft_index *rejected, bool *async); /* Returns `true` if the raft instance is currently installing a snapshot */ bool replicationInstallSnapshotBusy(struct raft *r); /* Apply any committed entry that was not applied yet. * * It must be called by leaders or followers. */ int replicationApply(struct raft *r); /* Check if a quorum has been reached for the given log index, and update the * commit index accordingly if so.
* * From Figure 3.1: * * [Rules for servers] Leaders: * * If there exists an N such that N > commitIndex, a majority of * matchIndex[i] >= N, and log[N].term == currentTerm: set commitIndex = N */ void replicationQuorum(struct raft *r, const raft_index index); #endif /* REPLICATION_H_ */ dqlite-1.16.7/src/raft/request.h000066400000000000000000000005241465252713400164660ustar00rootroot00000000000000#ifndef REQUEST_H_ #define REQUEST_H_ #include "../raft.h" /* Abstract request type */ struct request { /* Must be kept in sync with RAFT__REQUEST in raft.h */ void *data; int type; raft_index index; queue queue; uint8_t req_id[16]; uint8_t client_id[16]; uint8_t unique_id[16]; uint64_t reserved[4]; }; #endif /* REQUEST_H_ */ dqlite-1.16.7/src/raft/snapshot.c000066400000000000000000000047621465252713400166400ustar00rootroot00000000000000#include "snapshot.h" #include #include #include "../tracing.h" #include "assert.h" #include "configuration.h" #include "err.h" #include "log.h" void snapshotClose(struct raft_snapshot *s) { unsigned i; configurationClose(&s->configuration); for (i = 0; i < s->n_bufs; i++) { raft_free(s->bufs[i].base); } raft_free(s->bufs); } void snapshotDestroy(struct raft_snapshot *s) { snapshotClose(s); raft_free(s); } int snapshotRestore(struct raft *r, struct raft_snapshot *snapshot) { int rv; assert(snapshot->n_bufs == 1); rv = r->fsm->restore(r->fsm, &snapshot->bufs[0]); if (rv != 0) { tracef("restore snapshot %llu: %s", snapshot->index, errCodeToString(rv)); return rv; } configurationClose(&r->configuration); r->configuration = snapshot->configuration; r->configuration_committed_index = snapshot->configuration_index; r->configuration_uncommitted_index = 0; /* Make a copy of the configuration contained in the snapshot, in case * r->configuration gets overridden with an uncommitted configuration and * we then need to roll back, but the log no longer contains the * entry at r->configuration_committed_index because it was truncated. */ configurationClose(&r->configuration_last_snapshot); rv = configurationCopy(&r->configuration, &r->configuration_last_snapshot); if (rv != 0) { return rv; } configurationTrace(r, &r->configuration, "configuration restore from snapshot"); r->commit_index = snapshot->index; r->last_applied = snapshot->index; r->last_stored = snapshot->index; /* Don't free the snapshot data buffer, as ownership has been * transferred to the fsm. */ raft_free(snapshot->bufs); return 0; } int snapshotCopy(const struct raft_snapshot *src, struct raft_snapshot *dst) { int rv; unsigned i; size_t size; uint8_t *cursor; dst->term = src->term; dst->index = src->index; dst->configuration_index = src->configuration_index; rv = configurationCopy(&src->configuration, &dst->configuration); if (rv != 0) { return rv; } size = 0; for (i = 0; i < src->n_bufs; i++) { size += src->bufs[i].len; } dst->bufs = raft_malloc(sizeof *dst->bufs); assert(dst->bufs != NULL); dst->bufs[0].base = raft_malloc(size); dst->bufs[0].len = size; if (dst->bufs[0].base == NULL) { return RAFT_NOMEM; } cursor = dst->bufs[0].base; for (i = 0; i < src->n_bufs; i++) { memcpy(cursor, src->bufs[i].base, src->bufs[i].len); cursor += src->bufs[i].len; } dst->n_bufs = 1; return 0; } #undef tracef dqlite-1.16.7/src/raft/snapshot.h000066400000000000000000000017031465252713400166350ustar00rootroot00000000000000#ifndef RAFT_SNAPSHOT_H_ #define RAFT_SNAPSHOT_H_ #include "../raft.h" /* Release all memory associated with the given snapshot.
*/ void snapshotClose(struct raft_snapshot *s); /* Like snapshotClose(), but also release the snapshot object itself. */ void snapshotDestroy(struct raft_snapshot *s); /* Restore a snapshot. * * This will reset the current state of the server as if the last entry * contained in the snapshot had just been persisted, committed and applied. * * The in-memory log must be empty when calling this function. * * If no error occurs, the memory of the snapshot object gets released. */ int snapshotRestore(struct raft *r, struct raft_snapshot *snapshot); /* Make a full deep copy of a snapshot object. * * All data buffers in the source snapshot will be compacted in a single buffer * in the destination snapshot. */ int snapshotCopy(const struct raft_snapshot *src, struct raft_snapshot *dst); #endif /* RAFT_SNAPSHOT_H */ dqlite-1.16.7/src/raft/start.c000066400000000000000000000156341465252713400161360ustar00rootroot00000000000000#include "../raft.h" #include "../tracing.h" #include "assert.h" #include "configuration.h" #include "convert.h" #include "entry.h" #include "err.h" #include "log.h" #include "recv.h" #include "snapshot.h" #include "tick.h" /* Restore the most recent configuration entry found in the log. */ static int restoreMostRecentConfigurationEntry(struct raft *r, struct raft_entry *entry, raft_index index) { struct raft_configuration configuration; int rv; rv = configurationDecode(&entry->buf, &configuration); if (rv != 0) { configurationClose(&configuration); return rv; } configurationClose(&r->configuration); r->configuration = configuration; /* If the configuration comes from entry at index 1 in the log, we know * it's the bootstrap configuration and it's committed by default. * Otherwise we can't know if it's committed or not and treat it as * uncommitted. */ if (index == 1) { assert(r->configuration_uncommitted_index == 0); r->configuration_committed_index = 1; } else { assert(r->configuration_committed_index < index); r->configuration_uncommitted_index = index; } configurationTrace(r, &r->configuration, "restore most recent configuration"); return 0; } /* Restore the entries that were loaded from persistent storage. The most recent * configuration entry will be restored as well, if any. * * Note that if the last configuration entry in the log has index greater than * one we cannot know if it is committed or not. Therefore we also need to track * the second-to-last configuration entry. This second-to-last entry is * committed by default as raft doesn't allow multiple uncommitted configuration * entries. That entry is used in configuration rollback scenarios. If we don't * find the second-to-last configuration entry in the log, it means that the * log was truncated after a snapshot and the second-to-last configuration is * available in r->configuration_last_snapshot, which we populated earlier * when the snapshot was restored. */ static int restoreEntries(struct raft *r, raft_index snapshot_index, raft_term snapshot_term, raft_index start_index, struct raft_entry *entries, size_t n) { struct raft_entry *conf = NULL; raft_index conf_index = 0; size_t i; int rv; logStart(r->log, snapshot_index, snapshot_term, start_index); r->last_stored = start_index - 1; for (i = 0; i < n; i++) { struct raft_entry *entry = &entries[i]; rv = logAppend(r->log, entry->term, entry->type, entry->buf, entry->local_data, entry->is_local, entry->batch); if (rv != 0) { goto err; } r->last_stored++; /* Only take into account configurations that are newer than the * configuration restored from the snapshot.
*/ if (entry->type == RAFT_CHANGE && r->last_stored > r->configuration_committed_index) { /* If there is a previous configuration it must have * been committed as we don't allow multiple uncommitted * configurations. At the end of the loop * r->configuration_committed_index will point to the * second to last configuration entry, if any. */ if (conf_index != 0) { r->configuration_committed_index = conf_index; } conf = entry; conf_index = r->last_stored; } } if (conf != NULL) { rv = restoreMostRecentConfigurationEntry(r, conf, conf_index); if (rv != 0) { goto err; } } raft_free(entries); return 0; err: if (logNumEntries(r->log) > 0) { logDiscard(r->log, r->log->offset + 1); } return rv; } /* If we're the only voting server in the configuration, automatically * self-elect ourselves and convert to leader without waiting for the election * timeout. */ static int maybeSelfElect(struct raft *r) { const struct raft_server *server; int rv; server = configurationGet(&r->configuration, r->id); if (server == NULL || server->role != RAFT_VOTER || configurationVoterCount(&r->configuration) > 1) { return 0; } /* Converting to candidate will notice that we're the only voter and * automatically convert to leader. */ rv = convertToCandidate(r, false /* disrupt leader */); if (rv != 0) { return rv; } assert(r->state == RAFT_LEADER); return 0; } int raft_start(struct raft *r) { struct raft_snapshot *snapshot; raft_index snapshot_index = 0; raft_term snapshot_term = 0; raft_index start_index; struct raft_entry *entries; size_t n_entries; int rv; assert(r != NULL); assert(r->state == RAFT_UNAVAILABLE); assert(r->heartbeat_timeout != 0); assert(r->heartbeat_timeout < r->election_timeout); assert(r->install_snapshot_timeout != 0); assert(logNumEntries(r->log) == 0); assert(logSnapshotIndex(r->log) == 0); assert(r->last_stored == 0); #ifndef RAFT_REVISION #define RAFT_REVISION "unknown" #endif tracef("starting version:%d revision:%s", raft_version_number(), RAFT_REVISION); rv = r->io->load(r->io, &r->current_term, &r->voted_for, &snapshot, &start_index, &entries, &n_entries); if (rv != 0) { ErrMsgTransfer(r->io->errmsg, r->errmsg, "io"); return rv; } assert(start_index >= 1); tracef( "current_term:%llu voted_for:%llu start_index:%llu n_entries:%zu", r->current_term, r->voted_for, start_index, n_entries); /* If we have a snapshot, let's restore it. */ if (snapshot != NULL) { tracef( "restore snapshot with last index %llu and last term %llu", snapshot->index, snapshot->term); rv = snapshotRestore(r, snapshot); if (rv != 0) { snapshotDestroy(snapshot); entryBatchesDestroy(entries, n_entries); return rv; } snapshot_index = snapshot->index; snapshot_term = snapshot->term; raft_free(snapshot); } else if (n_entries > 0) { /* If we don't have a snapshot and the on-disk log is not empty, * then the first entry must be a configuration entry. */ assert(start_index == 1); assert(entries[0].type == RAFT_CHANGE); /* As a small optimization, bump the commit index to 1 since we * require the first entry to be the same on all servers. */ r->commit_index = 1; r->last_applied = 1; } /* Append the entries to the log, possibly restoring the last * configuration. */ tracef("restore %zu entries starting at %llu", n_entries, start_index); rv = restoreEntries(r, snapshot_index, snapshot_term, start_index, entries, n_entries); if (rv != 0) { entryBatchesDestroy(entries, n_entries); return rv; } /* Start the I/O backend. 
The tickCb function is expected to fire every * r->heartbeat_timeout milliseconds and recvCb whenever an RPC is * received. */ rv = r->io->start(r->io, r->heartbeat_timeout, tickCb, recvCb); if (rv != 0) { tracef("io start failed %d", rv); return rv; } /* By default we start as followers. */ convertToFollower(r); /* If there's only one voting server, and that is us, it's safe to * convert to leader right away. If that is not us, we're either joining * the cluster or we're simply configured as non-voter, and we'll stay * follower. */ rv = maybeSelfElect(r); if (rv != 0) { return rv; } return 0; } #undef tracef dqlite-1.16.7/src/raft/state.c000066400000000000000000000017161465252713400161150ustar00rootroot00000000000000#include "assert.h" #include "configuration.h" #include "election.h" #include "log.h" #include "../lib/queue.h" int raft_state(struct raft *r) { return r->state; } void raft_leader(struct raft *r, raft_id *id, const char **address) { switch (r->state) { case RAFT_UNAVAILABLE: case RAFT_CANDIDATE: *id = 0; *address = NULL; return; case RAFT_FOLLOWER: *id = r->follower_state.current_leader.id; *address = r->follower_state.current_leader.address; return; case RAFT_LEADER: if (r->transfer != NULL) { *id = 0; *address = NULL; return; } *id = r->id; *address = r->address; return; } } raft_index raft_last_index(struct raft *r) { return logLastIndex(r->log); } raft_index raft_last_applied(struct raft *r) { return r->last_applied; } int raft_role(struct raft *r) { const struct raft_server *local = configurationGet(&r->configuration, r->id); if (local == NULL) { return -1; } return local->role; } dqlite-1.16.7/src/raft/syscall.c000066400000000000000000000024241465252713400164440ustar00rootroot00000000000000#include "syscall.h" #if HAVE_LINUX_AIO_ABI_H || HAVE_LINUX_IO_URING_H #include #include #endif #if HAVE_LINUX_AIO_ABI_H int io_setup(unsigned nr_events, aio_context_t *ctx_idp) { return (int)syscall(__NR_io_setup, nr_events, ctx_idp); } int io_destroy(aio_context_t ctx_id) { return (int)syscall(__NR_io_destroy, ctx_id); } int io_submit(aio_context_t ctx_id, long nr, struct iocb **iocbpp) { return (int)syscall(__NR_io_submit, ctx_id, nr, iocbpp); } int io_getevents(aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout) { return (int)syscall(__NR_io_getevents, ctx_id, min_nr, nr, events, timeout); } #endif #if HAVE_LINUX_IO_URING_H int io_uring_register(int fd, unsigned int opcode, const void *arg, unsigned int nr_args) { return (int)syscall(__NR_io_uring_register, fd, opcode, arg, nr_args); } int io_uring_setup(unsigned int entries, struct io_uring_params *p) { return (int)syscall(__NR_io_uring_setup, entries, p); } int io_uring_enter(int fd, unsigned int to_submit, unsigned int min_complete, unsigned int flags, sigset_t *sig) { return (int)syscall(__NR_io_uring_enter, fd, to_submit, min_complete, flags, sig, _NSIG / 8); } #endif dqlite-1.16.7/src/raft/syscall.h000066400000000000000000000017241465252713400164530ustar00rootroot00000000000000/* Wrappers for system calls not yet defined in libc. 
*/ #ifndef SYSCALL_H_ #define SYSCALL_H_ #if HAVE_LINUX_AIO_ABI_H #include #include #include #endif #if HAVE_LINUX_IO_URING_H #include #endif #if HAVE_LINUX_AIO_ABI_H /* AIO */ int io_setup(unsigned nr_events, aio_context_t *ctx_idp); int io_destroy(aio_context_t ctx_id); int io_submit(aio_context_t ctx_id, long nr, struct iocb **iocbpp); int io_getevents(aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout); #endif #if HAVE_LINUX_IO_URING_H /* uring */ int io_uring_register(int fd, unsigned int opcode, const void *arg, unsigned int nr_args); int io_uring_setup(unsigned int entries, struct io_uring_params *p); int io_uring_enter(int fd, unsigned int to_submit, unsigned int min_complete, unsigned int flags, sigset_t *sig); #endif #endif /* SYSCALL_ */ dqlite-1.16.7/src/raft/tick.c000066400000000000000000000157271465252713400157360ustar00rootroot00000000000000#include "../raft.h" #include "../tracing.h" #include "assert.h" #include "configuration.h" #include "convert.h" #include "election.h" #include "membership.h" #include "progress.h" #include "replication.h" /* Apply time-dependent rules for followers (Figure 3.1). */ static int tickFollower(struct raft *r) { const struct raft_server *server; int rv; assert(r != NULL); assert(r->state == RAFT_FOLLOWER); server = configurationGet(&r->configuration, r->id); /* If we have been removed from the configuration, or maybe we didn't * receive one yet, just stay follower. */ if (server == NULL) { return 0; } /* Check if we need to start an election. * * From Section 3.3: * * If a follower receives no communication over a period of time * called the election timeout, then it assumes there is no viable * leader and begins an election to choose a new leader. * * Figure 3.1: * * If election timeout elapses without receiving AppendEntries RPC * from current leader or granting vote to candidate, convert to * candidate. */ if (electionTimerExpired(r) && server->role == RAFT_VOTER) { if (replicationInstallSnapshotBusy(r)) { tracef( "installing snapshot -> don't convert to " "candidate"); electionResetTimer(r); return 0; } if (r->follower_state.append_in_flight_count > 0) { tracef( "append in progress -> don't convert to candidate"); electionResetTimer(r); return 0; } tracef("convert to candidate and start new election"); rv = convertToCandidate(r, false /* disrupt leader */); if (rv != 0) { tracef("convert to candidate: %s", raft_strerror(rv)); return rv; } } return 0; } /* Apply time-dependent rules for candidates (Figure 3.1). */ static int tickCandidate(struct raft *r) { assert(r != NULL); assert(r->state == RAFT_CANDIDATE); /* Check if we need to start an election. * * From Section 3.4: * * The third possible outcome is that a candidate neither wins nor * loses the election: if many followers become candidates at the same * time, votes could be split so that no candidate obtains a majority. * When this happens, each candidate will time out and start a new * election by incrementing its term and initiating another round of * RequestVote RPCs */ if (electionTimerExpired(r)) { tracef("start new election"); return electionStart(r); } return 0; } /* Return true if we received an AppendEntries RPC result from a majority of * voting servers since we became leaders or since the last time this function * was called. * * For each server the function checks the recent_recv flag of the associated * progress object, and resets the flag after the check. 
It returns true if a * majority of voting server had the flag set to true. */ static bool checkContactQuorum(struct raft *r) { unsigned i; unsigned contacts = 0; assert(r->state == RAFT_LEADER); for (i = 0; i < r->configuration.n; i++) { struct raft_server *server = &r->configuration.servers[i]; bool recent_recv = progressResetRecentRecv(r, i); if ((server->role == RAFT_VOTER && recent_recv) || server->id == r->id) { contacts++; } } r->leader_state.voter_contacts = contacts; return contacts > configurationVoterCount(&r->configuration) / 2; } /* Apply time-dependent rules for leaders (Figure 3.1). */ static int tickLeader(struct raft *r) { raft_time now = r->io->time(r->io); assert(r->state == RAFT_LEADER); /* Check if we still can reach a majority of servers. * * From Section 6.2: * * A leader in Raft steps down if an election timeout elapses without * a successful round of heartbeats to a majority of its cluster; this * allows clients to retry their requests with another server. */ if (now - r->election_timer_start >= r->election_timeout) { if (!checkContactQuorum(r)) { tracef( "unable to contact majority of cluster -> step " "down"); convertToFollower(r); return 0; } r->election_timer_start = r->io->time(r->io); } /* Possibly send heartbeats. * * From Figure 3.1: * * Send empty AppendEntries RPC during idle periods to prevent * election timeouts. */ replicationHeartbeat(r); /* If a server is being promoted, increment the timer of the current * round or abort the promotion. * * From Section 4.2.1: * * The algorithm waits a fixed number of rounds (such as 10). If the * last round lasts less than an election timeout, then the leader adds * the new server to the cluster, under the assumption that there are * not enough unreplicated entries to create a significant availability * gap. Otherwise, the leader aborts the configuration change with an * error. */ if (r->leader_state.promotee_id != 0) { raft_id id = r->leader_state.promotee_id; unsigned server_index; raft_time round_duration = now - r->leader_state.round_start; bool is_too_slow; bool is_unresponsive; /* If a promotion is in progress, we expect that our * configuration contains an entry for the server being * promoted, and that the server is not yet considered as * voting. */ server_index = configurationIndexOf(&r->configuration, id); assert(server_index < r->configuration.n); assert(r->configuration.servers[server_index].role != RAFT_VOTER); is_too_slow = (r->leader_state.round_number == r->max_catch_up_rounds && round_duration > r->election_timeout); is_unresponsive = round_duration > r->max_catch_up_round_duration; /* Abort the promotion if we are at the 10'th round and it's * still taking too long, or if the server is unresponsive. */ if (is_too_slow || is_unresponsive) { tracef( "server_index:%d is_too_slow:%d is_unresponsive:%d", server_index, is_too_slow, is_unresponsive); struct raft_change *change; r->leader_state.promotee_id = 0; r->leader_state.round_index = 0; r->leader_state.round_number = 0; r->leader_state.round_start = 0; change = r->leader_state.change; r->leader_state.change = NULL; if (change != NULL && change->cb != NULL) { change->cb(change, RAFT_NOCONNECTION); } } } return 0; } static int tick(struct raft *r) { int rv = -1; assert(r->state == RAFT_UNAVAILABLE || r->state == RAFT_FOLLOWER || r->state == RAFT_CANDIDATE || r->state == RAFT_LEADER); /* If we are not available, let's do nothing. 
*/ if (r->state == RAFT_UNAVAILABLE) { return 0; } switch (r->state) { case RAFT_FOLLOWER: rv = tickFollower(r); break; case RAFT_CANDIDATE: rv = tickCandidate(r); break; case RAFT_LEADER: rv = tickLeader(r); break; } return rv; } void tickCb(struct raft_io *io) { struct raft *r; int rv; r = io->data; rv = tick(r); if (rv != 0) { convertToUnavailable(r); return; } /* For all states: if there is a leadership transfer request in * progress, check if it's expired. */ if (r->transfer != NULL) { raft_time now = r->io->time(r->io); if (now - r->transfer->start >= r->election_timeout) { membershipLeadershipTransferClose(r); } } } #undef tracef dqlite-1.16.7/src/raft/tick.h000066400000000000000000000004611465252713400157300ustar00rootroot00000000000000/* Logic to be invoked periodically. */ #ifndef TICK_H_ #define TICK_H_ #include "../raft.h" /* Callback to be passed to the @raft_io implementation. It notifies us that a * certain amount of time has elapsed and will be invoked periodically. */ void tickCb(struct raft_io *io); #endif /* TICK_H_ */ dqlite-1.16.7/src/raft/utils.h000066400000000000000000000005321465252713400161350ustar00rootroot00000000000000#ifndef RAFT_UTILS_H_ #define RAFT_UTILS_H_ #include /* Various utility functions and macros */ #define LIKELY(x) __builtin_expect(!!(x), 1) #define UNLIKELY(x) __builtin_expect(!!(x), 0) #define DBG() fprintf(stderr, "%s:%d\n", __func__, __LINE__) #define ARRAY_SIZE(a) ((sizeof(a)) / (sizeof(a)[0])) #endif /* RAFT_UTILS_H_ */ dqlite-1.16.7/src/raft/uv.c000066400000000000000000000452641465252713400154350ustar00rootroot00000000000000#include "../raft.h" #include #include #include #include #include #include #include #include #include "../raft.h" #include "../tracing.h" #include "assert.h" #include "byte.h" #include "configuration.h" #include "entry.h" #include "heap.h" #include "snapshot.h" #include "uv.h" #include "uv_encoding.h" #include "uv_os.h" /* Retry to connect to peer servers every second. * * TODO: implement an exponential backoff instead. */ #define CONNECT_RETRY_DELAY 1000 /* Cleans up files that are no longer used by the system */ static int uvMaintenance(const char *dir, char *errmsg) { struct uv_fs_s req; struct uv_dirent_s entry; int n; int i; int rv; int rv2; n = uv_fs_scandir(NULL, &req, dir, 0, NULL); if (n < 0) { ErrMsgPrintf(errmsg, "scan data directory: %s", uv_strerror(n)); return RAFT_IOERR; } rv = 0; for (i = 0; i < n; i++) { const char *filename; rv = uv_fs_scandir_next(&req, &entry); assert(rv == 0); /* Can't fail in libuv */ filename = entry.name; /* Remove leftover tmp-files */ if (strncmp(filename, TMP_FILE_PREFIX, strlen(TMP_FILE_PREFIX)) == 0) { UvFsRemoveFile(dir, filename, errmsg); /* Ignore errors */ continue; } /* Remove orphaned snapshot files */ bool orphan = false; if ((UvSnapshotIsOrphan(dir, filename, &orphan) == 0) && orphan) { UvFsRemoveFile(dir, filename, errmsg); /* Ignore errors */ continue; } /* Remove orphaned snapshot metadata files */ if ((UvSnapshotMetaIsOrphan(dir, filename, &orphan) == 0) && orphan) { UvFsRemoveFile(dir, filename, errmsg); /* Ignore errors */ } } rv2 = uv_fs_scandir_next(&req, &entry); assert(rv2 == UV_EOF); return rv; } /* Implementation of raft_io->config. 
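 * (This is wired to io->init by raft_uv_init() further down in this
 * file: it populates the metadata cache and probes the file system's
 * I/O capabilities, matching the UV__PRISTINE state described in uv.h.)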
*/ static int uvInit(struct raft_io *io, raft_id id, const char *address) { struct uv *uv; size_t direct_io; struct uvMetadata metadata; int rv; uv = io->impl; uv->id = id; rv = UvFsCheckDir(uv->dir, io->errmsg); if (rv != 0) { return rv; } /* Probe file system capabilities */ rv = UvFsProbeCapabilities(uv->dir, &direct_io, &uv->async_io, &uv->fallocate, io->errmsg); if (rv != 0) { return rv; } uv->direct_io = direct_io != 0; uv->block_size = direct_io != 0 ? direct_io : 4096; rv = uvMaintenance(uv->dir, io->errmsg); if (rv != 0) { return rv; } rv = uvMetadataLoad(uv->dir, &metadata, io->errmsg); if (rv != 0) { return rv; } uv->metadata = metadata; rv = uv->transport->init(uv->transport, id, address); if (rv != 0) { ErrMsgTransfer(uv->transport->errmsg, io->errmsg, "transport"); return rv; } uv->transport->data = uv; rv = uv_timer_init(uv->loop, &uv->timer); assert(rv == 0); /* This should never fail */ uv->timer.data = uv; return 0; } /* Periodic timer callback */ static void uvTickTimerCb(uv_timer_t *timer) { struct uv *uv; uv = timer->data; if (uv->tick_cb != NULL) { uv->tick_cb(uv->io); } } /* Implementation of raft_io->start. */ static int uvStart(struct raft_io *io, unsigned msecs, raft_io_tick_cb tick_cb, raft_io_recv_cb recv_cb) { struct uv *uv; int rv; uv = io->impl; uv->state = UV__ACTIVE; uv->tick_cb = tick_cb; uv->recv_cb = recv_cb; rv = UvRecvStart(uv); if (rv != 0) { return rv; } rv = uv_timer_start(&uv->timer, uvTickTimerCb, msecs, msecs); assert(rv == 0); return 0; } void uvMaybeFireCloseCb(struct uv *uv) { tracef("uv maybe fire close cb"); if (!uv->closing) { return; } if (uv->transport->data != NULL) { return; } if (uv->timer.data != NULL) { return; } if (!queue_empty(&uv->append_segments)) { return; } if (!queue_empty(&uv->finalize_reqs)) { return; } if (uv->finalize_work.data != NULL) { return; } if (uv->prepare_inflight != NULL) { return; } if (uv->barrier != NULL) { return; } if (uv->snapshot_put_work.data != NULL) { return; } if (!queue_empty(&uv->snapshot_get_reqs)) { return; } if (!queue_empty(&uv->async_work_reqs)) { return; } if (!queue_empty(&uv->aborting)) { return; } assert(uv->truncate_work.data == NULL); if (uv->close_cb != NULL) { uv->close_cb(uv->io); } } static void uvTickTimerCloseCb(uv_handle_t *handle) { struct uv *uv = handle->data; assert(uv->closing); uv->timer.data = NULL; uvMaybeFireCloseCb(uv); } static void uvTransportCloseCb(struct raft_uv_transport *transport) { struct uv *uv = transport->data; assert(uv->closing); uv->transport->data = NULL; uvMaybeFireCloseCb(uv); } /* Implementation of raft_io->close. */ static void uvClose(struct raft_io *io, raft_io_close_cb cb) { struct uv *uv; uv = io->impl; assert(uv != NULL); assert(!uv->closing); uv->close_cb = cb; uv->closing = true; UvSendClose(uv); UvRecvClose(uv); uvAppendClose(uv); if (uv->transport->data != NULL) { uv->transport->close(uv->transport, uvTransportCloseCb); } if (uv->timer.data != NULL) { uv_close((uv_handle_t *)&uv->timer, uvTickTimerCloseCb); } uvMaybeFireCloseCb(uv); } /* Filter the given segment list to find the most recent contiguous chunk of * closed segments that overlaps with the given snapshot last index. */ static int uvFilterSegments(struct uv *uv, raft_index last_index, const char *snapshot_filename, struct uvSegmentInfo **segments, size_t *n) { struct uvSegmentInfo *segment; size_t i; /* First valid closed segment. */ size_t j; /* Last valid closed segment. */ /* If there are not segments at all, or only open segments, there's * nothing to do. 
*/ if (*segments == NULL || (*segments)[0].is_open) { return 0; } /* Find the index of the most recent closed segment. */ for (j = 0; j < *n; j++) { segment = &(*segments)[j]; if (segment->is_open) { break; } } assert(j > 0); j--; segment = &(*segments)[j]; tracef("most recent closed segment is %s", segment->filename); /* If the end index of the last closed segment is lower than the last * snapshot index, there might be no entry that we can keep. We return * an empty segment list, unless there is at least one open segment, in * that case we keep everything hoping that they contain all the entries * since the last closed segment (TODO: we should encode the starting * entry in the open segment). */ if (segment->end_index < last_index) { if (!(*segments)[*n - 1].is_open) { tracef( "discarding all closed segments, since most recent " "is behind " "last snapshot"); raft_free(*segments); *segments = NULL; *n = 0; return 0; } tracef( "most recent closed segment %s is behind last snapshot, " "yet there are open segments", segment->filename); } /* Now scan the segments backwards, searching for the longest list of * contiguous closed segments. */ if (j >= 1) { for (i = j; i > 0; i--) { struct uvSegmentInfo *newer; struct uvSegmentInfo *older; newer = &(*segments)[i]; older = &(*segments)[i - 1]; if (older->end_index != newer->first_index - 1) { tracef("discarding non contiguous segment %s", older->filename); break; } } } else { i = j; } /* Make sure that the first index of the first valid closed segment is * not greater than the snapshot's last index plus one (so there are no * missing entries). */ segment = &(*segments)[i]; if (segment->first_index > last_index + 1) { ErrMsgPrintf(uv->io->errmsg, "closed segment %s is past last snapshot %s", segment->filename, snapshot_filename); return RAFT_CORRUPT; } if (i != 0) { size_t new_n = *n - i; struct uvSegmentInfo *new_segments; new_segments = raft_malloc(new_n * sizeof *new_segments); if (new_segments == NULL) { return RAFT_NOMEM; } memcpy(new_segments, &(*segments)[i], new_n * sizeof *new_segments); raft_free(*segments); *segments = new_segments; *n = new_n; } return 0; } /* Load the last snapshot (if any) and all entries contained in all segment * files of the data directory. This function can be called recursively, `depth` * is there to ensure we don't get stuck in a recursive loop. */ static int uvLoadSnapshotAndEntries(struct uv *uv, struct raft_snapshot **snapshot, raft_index *start_index, struct raft_entry *entries[], size_t *n, int depth) { struct uvSnapshotInfo *snapshots; struct uvSegmentInfo *segments; size_t n_snapshots; size_t n_segments; int rv; *snapshot = NULL; *start_index = 1; *entries = NULL; *n = 0; /* List available snapshots and segments. */ rv = UvList(uv, &snapshots, &n_snapshots, &segments, &n_segments, uv->io->errmsg); if (rv != 0) { goto err; } /* Load the most recent snapshot, if any. */ if (snapshots != NULL) { char snapshot_filename[UV__FILENAME_LEN]; *snapshot = RaftHeapMalloc(sizeof **snapshot); if (*snapshot == NULL) { rv = RAFT_NOMEM; goto err; } rv = UvSnapshotLoad(uv, &snapshots[n_snapshots - 1], *snapshot, uv->io->errmsg); if (rv != 0) { RaftHeapFree(*snapshot); *snapshot = NULL; goto err; } uvSnapshotFilenameOf(&snapshots[n_snapshots - 1], snapshot_filename); tracef("most recent snapshot at %lld", (*snapshot)->index); RaftHeapFree(snapshots); snapshots = NULL; /* Update the start index. 
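 * For example (hypothetical files): given a snapshot at index 1000 and
 * contiguous closed segments 0000000000000900-0000000000001100 and
 * 0000000000001101-0000000000001200, the start index becomes 900, while
 * with only open segments on disk it becomes 1001.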
If there are closed segments on disk * let's make sure that the first index of the first closed * segment is not greater than the snapshot's last index plus * one (so there are no missing entries), and update the start * index accordingly. */ rv = uvFilterSegments(uv, (*snapshot)->index, snapshot_filename, &segments, &n_segments); if (rv != 0) { goto err; } if (segments != NULL) { if (segments[0].is_open) { *start_index = (*snapshot)->index + 1; } else { *start_index = segments[0].first_index; } } else { *start_index = (*snapshot)->index + 1; } } /* Read data from segments, closing any open segments. */ if (segments != NULL) { raft_index last_index; rv = uvSegmentLoadAll(uv, *start_index, segments, n_segments, entries, n); if (rv != 0) { goto err; } /* Check if all entries that we loaded are actually behind the * last snapshot. This can happen if the last closed segment was * behind the last snapshot and there were open segments, but * the entries in the open segments turned out to be behind the * snapshot as well. */ last_index = *start_index + *n - 1; if (*snapshot != NULL && last_index < (*snapshot)->index) { ErrMsgPrintf(uv->io->errmsg, "last entry on disk has index %llu, which " "is behind " "last snapshot's index %llu", last_index, (*snapshot)->index); rv = RAFT_CORRUPT; goto err; } raft_free(segments); segments = NULL; } return 0; err: assert(rv != 0); if (*snapshot != NULL) { snapshotDestroy(*snapshot); *snapshot = NULL; } if (snapshots != NULL) { raft_free(snapshots); } if (segments != NULL) { raft_free(segments); } if (*entries != NULL) { entryBatchesDestroy(*entries, *n); *entries = NULL; *n = 0; } /* Try to recover exactly once when corruption is detected, the first * pass might have cleaned up corrupt data. Most of the arguments are * already reset after the `err` label, except for `start_index`. */ if (rv == RAFT_CORRUPT && uv->auto_recovery && depth == 0) { *start_index = 1; return uvLoadSnapshotAndEntries(uv, snapshot, start_index, entries, n, depth + 1); } return rv; } /* Implementation of raft_io->load. */ static int uvLoad(struct raft_io *io, raft_term *term, raft_id *voted_for, struct raft_snapshot **snapshot, raft_index *start_index, struct raft_entry **entries, size_t *n_entries) { struct uv *uv; int rv; uv = io->impl; *term = uv->metadata.term; *voted_for = uv->metadata.voted_for; *snapshot = NULL; rv = uvLoadSnapshotAndEntries(uv, snapshot, start_index, entries, n_entries, 0); if (rv != 0) { return rv; } tracef("start index %lld, %zu entries", *start_index, *n_entries); if (*snapshot == NULL) { tracef("no snapshot"); } /* Set the index of the next entry that will be appended. */ uv->append_next_index = *start_index + *n_entries; return 0; } /* Implementation of raft_io->set_term. */ static int uvSetTerm(struct raft_io *io, const raft_term term) { struct uv *uv; int rv; uv = io->impl; uv->metadata.version++; uv->metadata.term = term; uv->metadata.voted_for = 0; rv = uvMetadataStore(uv, &uv->metadata); if (rv != 0) { return rv; } return 0; } /* Implementation of raft_io->set_term. */ static int uvSetVote(struct raft_io *io, const raft_id server_id) { struct uv *uv; int rv; uv = io->impl; uv->metadata.version++; uv->metadata.voted_for = server_id; rv = uvMetadataStore(uv, &uv->metadata); if (rv != 0) { return rv; } return 0; } /* Implementation of raft_io->bootstrap. */ static int uvBootstrap(struct raft_io *io, const struct raft_configuration *configuration) { struct uv *uv; int rv; uv = io->impl; /* We shouldn't have written anything else yet. 
*/ if (uv->metadata.term != 0) { ErrMsgPrintf(io->errmsg, "metadata contains term %lld", uv->metadata.term); return RAFT_CANTBOOTSTRAP; } /* Write the term */ rv = uvSetTerm(io, 1); if (rv != 0) { return rv; } /* Create the first closed segment file, containing just one entry. */ rv = uvSegmentCreateFirstClosed(uv, configuration); if (rv != 0) { return rv; } return 0; } /* Implementation of raft_io->recover. */ static int uvRecover(struct raft_io *io, const struct raft_configuration *conf) { struct uv *uv = io->impl; struct raft_snapshot *snapshot; raft_index start_index; raft_index next_index; struct raft_entry *entries; size_t n_entries; int rv; /* Load the current state. This also closes any leftover open segment. */ rv = uvLoadSnapshotAndEntries(uv, &snapshot, &start_index, &entries, &n_entries, 0); if (rv != 0) { return rv; } /* We don't care about the actual data, just index of the last entry. */ if (snapshot != NULL) { snapshotDestroy(snapshot); } if (entries != NULL) { entryBatchesDestroy(entries, n_entries); } assert(start_index > 0); next_index = start_index + n_entries; rv = uvSegmentCreateClosedWithConfiguration(uv, next_index, conf); if (rv != 0) { return rv; } return 0; } /* Implementation of raft_io->time. */ static raft_time uvTime(struct raft_io *io) { struct uv *uv; uv = io->impl; return uv_now(uv->loop); } /* Implementation of raft_io->random. */ static int uvRandom(struct raft_io *io, int min, int max) { (void)io; return min + (abs(rand()) % (max - min)); } static void uvSeedRand(struct uv *uv) { ssize_t sz = -1; unsigned seed = 0; /* fed to srand() */ sz = getrandom(&seed, sizeof seed, GRND_NONBLOCK); if (sz == -1 || sz < ((ssize_t)sizeof seed)) { /* Fall back to an inferior random seed when `getrandom` would * have blocked or when not enough randomness was returned. */ seed ^= (unsigned)uv->id; seed ^= (unsigned)uv_now(uv->loop); struct timeval time = {0}; /* Ignore errors. */ gettimeofday(&time, NULL); seed ^= (unsigned)((time.tv_sec * 1000) + (time.tv_usec / 1000)); } srand(seed); } int raft_uv_init(struct raft_io *io, struct uv_loop_s *loop, const char *dir, struct raft_uv_transport *transport) { struct uv *uv; void *data; int rv; assert(io != NULL); assert(loop != NULL); assert(dir != NULL); assert(transport != NULL); data = io->data; memset(io, 0, sizeof *io); io->data = data; if (transport->version == 0) { ErrMsgPrintf(io->errmsg, "transport->version must be set"); return RAFT_INVALID; } /* Ensure that the given path doesn't exceed our static buffer limit. 
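 * (The path is later copied with strncpy() into the fixed-size uv->dir
 * buffer of UV__DIR_LEN bytes declared in uv.h, hence this check.)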
*/ if (!UV__DIR_HAS_VALID_LEN(dir)) { ErrMsgPrintf(io->errmsg, "directory path too long"); return RAFT_NAMETOOLONG; } /* Allocate the raft_io_uv object */ uv = raft_malloc(sizeof *uv); if (uv == NULL) { rv = RAFT_NOMEM; goto err; } memset(uv, 0, sizeof(struct uv)); uv->io = io; uv->loop = loop; strncpy(uv->dir, dir, sizeof(uv->dir) - 1); uv->dir[sizeof(uv->dir) - 1] = '\0'; uv->transport = transport; uv->transport->data = NULL; uv->tracer = NULL; uv->id = 0; /* Set by raft_io->config() */ uv->state = UV__PRISTINE; uv->errored = false; uv->direct_io = false; uv->async_io = false; uv->fallocate = false; #ifdef LZ4_ENABLED uv->snapshot_compression = true; #else uv->snapshot_compression = false; #endif uv->segment_size = UV__MAX_SEGMENT_SIZE; uv->block_size = 0; queue_init(&uv->clients); queue_init(&uv->servers); uv->connect_retry_delay = CONNECT_RETRY_DELAY; uv->prepare_inflight = NULL; queue_init(&uv->prepare_reqs); queue_init(&uv->prepare_pool); uv->prepare_next_counter = 1; uv->append_next_index = 1; queue_init(&uv->append_segments); queue_init(&uv->append_pending_reqs); queue_init(&uv->append_writing_reqs); uv->barrier = NULL; queue_init(&uv->finalize_reqs); uv->finalize_work.data = NULL; uv->truncate_work.data = NULL; queue_init(&uv->snapshot_get_reqs); queue_init(&uv->async_work_reqs); uv->snapshot_put_work.data = NULL; uv->timer.data = NULL; uv->tick_cb = NULL; /* Set by raft_io->start() */ uv->recv_cb = NULL; /* Set by raft_io->start() */ queue_init(&uv->aborting); uv->closing = false; uv->close_cb = NULL; uv->auto_recovery = true; uvSeedRand(uv); /* Set the raft_io implementation. */ io->version = 2; /* future-proof'ing */ io->impl = uv; io->init = uvInit; io->close = uvClose; io->start = uvStart; io->load = uvLoad; io->bootstrap = uvBootstrap; io->recover = uvRecover; io->set_term = uvSetTerm; io->set_vote = uvSetVote; io->append = UvAppend; io->truncate = UvTruncate; io->send = UvSend; io->snapshot_put = UvSnapshotPut; io->snapshot_get = UvSnapshotGet; io->async_work = UvAsyncWork; io->time = uvTime; io->random = uvRandom; return 0; err: assert(rv != 0); if (rv == RAFT_NOMEM) { ErrMsgOom(io->errmsg); } return rv; } void raft_uv_close(struct raft_io *io) { struct uv *uv; uv = io->impl; io->impl = NULL; raft_free(uv); } void raft_uv_set_segment_size(struct raft_io *io, size_t size) { struct uv *uv; uv = io->impl; uv->segment_size = size; } void raft_uv_set_block_size(struct raft_io *io, size_t size) { struct uv *uv; uv = io->impl; uv->block_size = size; } int raft_uv_set_snapshot_compression(struct raft_io *io, bool compressed) { struct uv *uv; uv = io->impl; #ifndef LZ4_AVAILABLE if (compressed) { return RAFT_INVALID; } #endif uv->snapshot_compression = compressed; return 0; } void raft_uv_set_connect_retry_delay(struct raft_io *io, unsigned msecs) { struct uv *uv; uv = io->impl; uv->connect_retry_delay = msecs; } void raft_uv_set_tracer(struct raft_io *io, struct raft_tracer *tracer) { struct uv *uv; uv = io->impl; uv->tracer = tracer; } void raft_uv_set_auto_recovery(struct raft_io *io, bool flag) { struct uv *uv; uv = io->impl; uv->auto_recovery = flag; } #undef tracef dqlite-1.16.7/src/raft/uv.h000066400000000000000000000375211465252713400154370ustar00rootroot00000000000000/* Implementation of the @raft_io interface based on libuv. 
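 * (See raft_uv_init() in uv.c for how each raft_io method is wired to
 * the functions declared below.)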
*/ #ifndef UV_H_ #define UV_H_ #include "../raft.h" #include "../tracing.h" #include "err.h" #include "../lib/queue.h" #include "uv_fs.h" #include "uv_os.h" /* 8 Megabytes */ #define UV__MAX_SEGMENT_SIZE (8 * 1024 * 1024) /* Template string for closed segment filenames: start index (inclusive), end * index (inclusive). */ #define UV__CLOSED_TEMPLATE "%016llu-%016llu" /* Template string for open segment filenames: incrementing counter. */ #define UV__OPEN_TEMPLATE "open-%llu" /* Enough to hold a segment filename (either open or closed) */ #define UV__SEGMENT_FILENAME_BUF_SIZE 34 /* Template string for snapshot filenames: snapshot term, snapshot index, * creation timestamp (milliseconds since epoch). */ #define UV__SNAPSHOT_TEMPLATE "snapshot-%llu-%llu-%llu" #define UV__SNAPSHOT_META_SUFFIX ".meta" /* Template string for snapshot metadata filenames: snapshot term, snapshot * index, creation timestamp (milliseconds since epoch). */ #define UV__SNAPSHOT_META_TEMPLATE \ UV__SNAPSHOT_TEMPLATE UV__SNAPSHOT_META_SUFFIX /* State codes. */ enum { UV__PRISTINE, /* Metadata cache populated and I/O capabilities probed */ UV__ACTIVE, UV__CLOSED }; /* Open segment counter type */ typedef unsigned long long uvCounter; /* Information persisted in a single metadata file. */ struct uvMetadata { unsigned long long version; /* Monotonically increasing version */ raft_term term; /* Current term */ raft_id voted_for; /* Server ID of last vote, or 0 */ }; /* Hold state of a libuv-based raft_io implementation. */ struct uv { struct raft_io *io; /* I/O object we're implementing */ struct uv_loop_s *loop; /* UV event loop */ char dir[UV__DIR_LEN]; /* Data directory */ struct raft_uv_transport *transport; /* Network transport */ struct raft_tracer *tracer; /* Debug tracing */ raft_id id; /* Server ID */ int state; /* Current state */ bool snapshot_compression; /* If compression is enabled */ bool errored; /* If a disk I/O error was hit */ bool direct_io; /* Whether direct I/O is supported */ bool async_io; /* Whether async I/O is supported */ bool fallocate; /* Whether fallocate is supported */ size_t segment_size; /* Initial size of open segments. */ size_t block_size; /* Block size of the data dir */ queue clients; /* Outbound connections */ queue servers; /* Inbound connections */ unsigned connect_retry_delay; /* Client connection retry delay */ void *prepare_inflight; /* Segment being prepared */ queue prepare_reqs; /* Pending prepare requests. */ queue prepare_pool; /* Prepared open segments */ uvCounter prepare_next_counter; /* Counter of next open segment */ raft_index append_next_index; /* Index of next entry to append */ queue append_segments; /* Open segments in use. */ queue append_pending_reqs; /* Pending append requests. 
*/ queue append_writing_reqs; /* Append requests in flight */ struct UvBarrier *barrier; /* Inflight barrier request */ queue finalize_reqs; /* Segments waiting to be closed */ struct uv_work_s finalize_work; /* Resize and rename segments */ struct uv_work_s truncate_work; /* Execute truncate log requests */ queue snapshot_get_reqs; /* Inflight get snapshot requests */ queue async_work_reqs; /* Inflight async work requests */ struct uv_work_s snapshot_put_work; /* Execute snapshot put requests */ struct uvMetadata metadata; /* Cache of metadata on disk */ struct uv_timer_s timer; /* Timer for periodic ticks */ raft_io_tick_cb tick_cb; /* Invoked when the timer expires */ raft_io_recv_cb recv_cb; /* Invoked when upon RPC messages */ queue aborting; /* Cleanups upon errors or shutdown */ bool closing; /* True if we are closing */ raft_io_close_cb close_cb; /* Invoked when finishing closing */ bool auto_recovery; /* Try to recover from corrupt segments */ }; /* Implementation of raft_io->truncate. */ int UvTruncate(struct raft_io *io, raft_index index); /* Load Raft metadata from disk, choosing the most recent version (either the * metadata1 or metadata2 file). */ int uvMetadataLoad(const char *dir, struct uvMetadata *metadata, char *errmsg); /* Store the given metadata to disk, writing the appropriate metadata file * according to the metadata version (if the version is odd, write metadata1, * otherwise write metadata2). */ int uvMetadataStore(struct uv *uv, const struct uvMetadata *metadata); /* Metadata about a segment file. */ struct uvSegmentInfo { bool is_open; /* Whether the segment is open */ union { struct { raft_index first_index; /* First index in a closed segment */ raft_index end_index; /* Last index in a closed segment */ }; struct { unsigned long long counter; /* Open segment counter */ }; }; char filename[UV__SEGMENT_FILENAME_BUF_SIZE]; /* Segment filename */ }; /* Append a new item to the given segment info list if the given filename * matches either the one of a closed segment (xxx-yyy) or the one of an open * segment (open-xxx). */ int uvSegmentInfoAppendIfMatch(const char *filename, struct uvSegmentInfo *infos[], size_t *n_infos, bool *appended); /* Sort the given list of segments by comparing their filenames. Closed segments * come before open segments. */ void uvSegmentSort(struct uvSegmentInfo *infos, size_t n_infos); /* Keep only the closed segments whose entries are within the given trailing * amount past the given snapshot last index. If the given trailing amount is 0, * unconditionally delete all closed segments. */ int uvSegmentKeepTrailing(struct uv *uv, struct uvSegmentInfo *segments, size_t n, raft_index last_index, size_t trailing, char *errmsg); /* Load all entries contained in the given closed segment. */ int uvSegmentLoadClosed(struct uv *uv, struct uvSegmentInfo *segment, struct raft_entry *entries[], size_t *n); /* Load raft entries from the given segments. The @start_index is the expected * index of the first entry of the first segment. */ int uvSegmentLoadAll(struct uv *uv, const raft_index start_index, struct uvSegmentInfo *segments, size_t n_segments, struct raft_entry **entries, size_t *n_entries); /* Return the number of blocks in a segments. */ #define uvSegmentBlocks(UV) (UV->segment_size / UV->block_size) /* A dynamically allocated buffer holding data to be written into a segment * file. * * The memory is aligned at disk block boundary, to allow for direct I/O. 
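 * A typical write sequence with this buffer might look as follows (a
 * sketch using the helpers declared below; entries and n_entries are
 * placeholders and error handling is omitted):
 *
 *   struct uvSegmentBuffer buf;
 *   uv_buf_t out;
 *   uvSegmentBufferInit(&buf, uv->block_size);
 *   uvSegmentBufferFormat(&buf);
 *   uvSegmentBufferAppend(&buf, entries, n_entries);
 *   uvSegmentBufferFinalize(&buf, &out);
 *   (write out to the segment file, then uvSegmentBufferClose(&buf))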
*/ struct uvSegmentBuffer { size_t block_size; /* Disk block size for direct I/O */ uv_buf_t arena; /* Previously allocated memory that can be re-used */ size_t n; /* Write offset */ }; /* Initialize an empty buffer. */ void uvSegmentBufferInit(struct uvSegmentBuffer *b, size_t block_size); /* Release all memory used by the buffer. */ void uvSegmentBufferClose(struct uvSegmentBuffer *b); /* Encode the format version at the very beginning of the buffer. This function * must be called when the buffer is empty. */ int uvSegmentBufferFormat(struct uvSegmentBuffer *b); /* Extend the segment's buffer by encoding the given entries. * * Previous data in the buffer will be retained, and data for these new entries * will be appended. */ int uvSegmentBufferAppend(struct uvSegmentBuffer *b, const struct raft_entry entries[], unsigned n_entries); /* After all entries to write have been encoded, finalize the buffer by zeroing * the unused memory of the last block. The out parameter will point to the * memory to write. */ void uvSegmentBufferFinalize(struct uvSegmentBuffer *b, uv_buf_t *out); /* Reset the buffer preparing it for the next segment write. * * If the retain parameter is greater than zero, then the data of the retain'th * block will be copied at the beginning of the buffer and the write offset will * be set accordingly. */ void uvSegmentBufferReset(struct uvSegmentBuffer *b, unsigned retain); /* Write a closed segment, containing just one entry at the given index * for the given configuration. */ int uvSegmentCreateClosedWithConfiguration( struct uv *uv, raft_index index, const struct raft_configuration *configuration); /* Write the first closed segment, containing just one entry for the given * configuration. */ int uvSegmentCreateFirstClosed(struct uv *uv, const struct raft_configuration *configuration); /* Truncate a segment that was already closed. */ int uvSegmentTruncate(struct uv *uv, struct uvSegmentInfo *segment, raft_index index); /* Info about a persisted snapshot stored in snapshot metadata file. */ struct uvSnapshotInfo { raft_term term; raft_index index; unsigned long long timestamp; char filename[UV__FILENAME_LEN]; }; /* Render the filename of the data file of a snapshot */ void uvSnapshotFilenameOf(struct uvSnapshotInfo *info, char *filename); /* Upon success `orphan` will be true if filename is a snapshot file without a * sibling .meta file */ int UvSnapshotIsOrphan(const char *dir, const char *filename, bool *orphan); /* Upon success `orphan` will be true if filename is a snapshot .meta file * without a sibling snapshot file */ int UvSnapshotMetaIsOrphan(const char *dir, const char *filename, bool *orphan); /* Append a new item to the given snapshot info list if the given filename * matches the pattern of a snapshot metadata file (snapshot-xxx-yyy-zzz.meta) * and there is actually a matching non-empty snapshot file on disk. */ int UvSnapshotInfoAppendIfMatch(struct uv *uv, const char *filename, struct uvSnapshotInfo *infos[], size_t *n_infos, bool *appended); /* Sort the given list of snapshots by comparing their filenames. Older * snapshots will come first. */ void UvSnapshotSort(struct uvSnapshotInfo *infos, size_t n_infos); /* Load the snapshot associated with the given metadata. */ int UvSnapshotLoad(struct uv *uv, struct uvSnapshotInfo *meta, struct raft_snapshot *snapshot, char *errmsg); /* Implementation raft_io->snapshot_put (defined in uv_snapshot.c). 
*/ int UvSnapshotPut(struct raft_io *io, unsigned trailing, struct raft_io_snapshot_put *req, const struct raft_snapshot *snapshot, raft_io_snapshot_put_cb cb); /* Implementation of raft_io->snapshot_get (defined in uv_snapshot.c). */ int UvSnapshotGet(struct raft_io *io, struct raft_io_snapshot_get *req, raft_io_snapshot_get_cb cb); /* Implementation of raft_io->async_work (defined in uv_work.c). */ int UvAsyncWork(struct raft_io *io, struct raft_io_async_work *req, raft_io_async_work_cb cb); /* Return a list of all snapshots and segments found in the data directory. Both * snapshots and segments are ordered by filename (closed segments come before * open ones). */ int UvList(struct uv *uv, struct uvSnapshotInfo *snapshots[], size_t *n_snapshots, struct uvSegmentInfo *segments[], size_t *n_segments, char *errmsg); /* Request to obtain a newly prepared open segment. */ struct uvPrepare; typedef void (*uvPrepareCb)(struct uvPrepare *req, int status); struct uvPrepare { void *data; /* User data */ uv_file fd; /* Resulting segment file descriptor */ unsigned long long counter; /* Resulting segment counter */ uvPrepareCb cb; /* Completion callback */ queue queue; /* Links in uv_io->prepare_reqs */ }; /* Get a prepared open segment ready for writing. If a prepared open segment is * already available in the pool, it will be returned immediately using the fd * and counter pointers and the request callback won't be invoked. Otherwise the * request will be queued and its callback invoked once a newly prepared segment * is available. */ int UvPrepare(struct uv *uv, uv_file *fd, uvCounter *counter, struct uvPrepare *req, uvPrepareCb cb); /* Cancel all pending prepare requests and start removing all unused prepared * open segments. If a segment currently being created, wait for it to complete * and then remove it immediately. */ void UvPrepareClose(struct uv *uv); /* Implementation of raft_io->append. All the raft_buffers of the raft_entry * structs in the entries array are required to have a len that is a multiple * of 8. */ int UvAppend(struct raft_io *io, struct raft_io_append *req, const struct raft_entry entries[], unsigned n, raft_io_append_cb cb); /* Pause request object and callback. */ struct UvBarrierReq; /* A barrier cb that plans to perform work on the threadpool MUST exit early * and cleanup resources when it detects uv->closing, this is to allow forced * closing on shutdown. */ typedef void (*UvBarrierCb)(struct UvBarrierReq *req); struct UvBarrierReq { bool blocking; /* Whether this barrier should block future writes */ void *data; /* User data */ UvBarrierCb cb; /* Completion callback */ queue queue; /* Queue of reqs triggered by a UvBarrier */ }; struct UvBarrier { bool blocking; /* Whether this barrier should block future writes */ queue reqs; /* Queue of UvBarrierReq */ }; /* Submit a barrier request to interrupt the normal flow of append * operations. * * The following will happen: * * - Replace uv->append_next_index with the given next_index, so the next entry * that will be appended will have the new index. * * - Execution of new writes for subsequent append requests will be blocked * until UvUnblock is called when the barrier is blocking. * * - Wait for all currently pending and inflight append requests against all * open segments to complete, and for those open segments to be finalized, * then invoke the barrier callback. 
* * This API is used to implement truncate and snapshot install operations, which * need to wait until all pending writes have settled and modify the log state, * changing the next index. */ int UvBarrier(struct uv *uv, raft_index next_index, struct UvBarrierReq *req); /* Trigger a callback for a barrier request in this @barrier. Returns true if a * callback was triggered, false if there are no more requests to trigger. * A barrier callback will call UvUnblock, which in turn will try to run the * next callback, if any, from a barrier request in this barrier. */ bool UvBarrierMaybeTrigger(struct UvBarrier *barrier); /* Add a Barrier @req to an existing @barrier. */ void UvBarrierAddReq(struct UvBarrier *barrier, struct UvBarrierReq *req); /* Returns @true if there are no more segments referencing uv->barrier */ bool UvBarrierReady(struct uv *uv); /* Resume writing append requests after UvBarrier has been called. */ void UvUnblock(struct uv *uv); /* Cancel all pending write requests and request the current segment to be * finalized. Must be invoked at closing time. */ void uvAppendClose(struct uv *uv); /* Submit a request to finalize the open segment with the given counter. * * Requests are processed one at a time, to avoid ending up closing open segment * N + 1 before closing open segment N. */ int UvFinalize(struct uv *uv, unsigned long long counter, size_t used, raft_index first_index, raft_index last_index); /* Implementation of raft_io->send. */ int UvSend(struct raft_io *io, struct raft_io_send *req, const struct raft_message *message, raft_io_send_cb cb); /* Stop all clients by closing the outbound stream handles and canceling all * pending send requests. */ void UvSendClose(struct uv *uv); /* Start receiving messages from new incoming connections. */ int UvRecvStart(struct uv *uv); /* Stop all servers by closing the inbound stream handles and aborting all * requests being received. */ void UvRecvClose(struct uv *uv); void uvMaybeFireCloseCb(struct uv *uv); #endif /* UV_H_ */ dqlite-1.16.7/src/raft/uv_append.c000066400000000000000000000720141465252713400167550ustar00rootroot00000000000000#include "assert.h" #include "byte.h" #include "heap.h" #include "../lib/queue.h" #include "uv.h" #include "uv_encoding.h" #include "uv_writer.h" /* The happy path for an append request is: * * - If there is a current segment and it has enough spare capacity to hold * the entries in the request, then queue the request, linking it to the * current segment. * * - If there is no current segment, or it doesn't have enough spare capacity * to hold the entries in the request, then request a new open segment to be * prepared, queue the request and link it to the newly requested segment. * * - Wait for any pending write against the current segment to complete, and * also for the prepare request if we asked for a new segment. Also wait for * any in-progress barrier to be removed. * * - Submit a write request for the entries in this append request. The write * request might contain other append requests targeted to the current segment * that might have accumulated in the meantime, if we have been waiting for a * segment to be prepared, or for the previous write to complete or for a * barrier to be removed. * * - Wait for the write request to finish and fire the append request's * callback. * * Possible failure modes are: * * - The request to prepare a new segment fails. * - The write request fails. * - The request to finalize a new segment fails to be submitted.
* * In all these cases we mark the instance as errored and fire the relevant * callbacks. **/ /* An open segment being written or waiting to be written. */ struct uvAliveSegment { struct uv *uv; /* Our writer */ struct uvPrepare prepare; /* Prepare segment file request */ struct UvWriter writer; /* Writer to perform async I/O */ struct UvWriterReq write; /* Write request */ unsigned long long counter; /* Open segment counter */ raft_index first_index; /* Index of the first entry written */ raft_index pending_last_index; /* Index of the last entry written */ size_t size; /* Total number of bytes used */ unsigned next_block; /* Next segment block to write */ struct uvSegmentBuffer pending; /* Buffer for data yet to be written */ uv_buf_t buf; /* Write buffer for current write */ raft_index last_index; /* Last entry actually written */ size_t written; /* Number of bytes actually written */ queue queue; /* Segment queue */ struct UvBarrier *barrier; /* Barrier waiting on this segment */ bool finalize; /* Finalize the segment after writing */ }; struct uvAppend { struct raft_io_append *req; /* User request */ const struct raft_entry *entries; /* Entries to write */ unsigned n; /* Number of entries */ struct uvAliveSegment *segment; /* Segment to write to */ queue queue; }; static void uvAliveSegmentWriterCloseCb(struct UvWriter *writer) { struct uvAliveSegment *segment = writer->data; struct uv *uv = segment->uv; uvSegmentBufferClose(&segment->pending); RaftHeapFree(segment); uvMaybeFireCloseCb(uv); } /* Submit a request to close the current open segment. */ static void uvAliveSegmentFinalize(struct uvAliveSegment *s) { struct uv *uv = s->uv; int rv; rv = UvFinalize(uv, s->counter, s->written, s->first_index, s->last_index); if (rv != 0) { uv->errored = true; /* We failed to submit the finalize request, but let's still * close the file handle and release the segment memory. */ } queue_remove(&s->queue); UvWriterClose(&s->writer, uvAliveSegmentWriterCloseCb); } /* Flush the append requests in the given queue, firing their callbacks with the * given status. */ static void uvAppendFinishRequestsInQueue(struct uv *uv, queue *q, int status) { queue queue_copy; struct uvAppend *append; queue_init(&queue_copy); while (!queue_empty(q)) { queue *head; head = queue_head(q); append = QUEUE_DATA(head, struct uvAppend, queue); /* Rollback the append next index if the result was * unsuccessful. */ if (status != 0) { tracef("rollback uv->append_next_index was:%llu", uv->append_next_index); uv->append_next_index -= append->n; tracef("rollback uv->append_next_index now:%llu", uv->append_next_index); } queue_remove(head); queue_insert_tail(&queue_copy, head); } while (!queue_empty(&queue_copy)) { queue *head; struct raft_io_append *req; head = queue_head(&queue_copy); append = QUEUE_DATA(head, struct uvAppend, queue); queue_remove(head); req = append->req; RaftHeapFree(append); req->cb(req, status); } } /* Flush the append requests in the writing queue, firing their callbacks with * the given status. */ static void uvAppendFinishWritingRequests(struct uv *uv, int status) { uvAppendFinishRequestsInQueue(uv, &uv->append_writing_reqs, status); } /* Flush the append requests in the pending queue, firing their callbacks with * the given status. */ static void uvAppendFinishPendingRequests(struct uv *uv, int status) { uvAppendFinishRequestsInQueue(uv, &uv->append_pending_reqs, status); } /* Return the segment currently being written, or NULL when no segment has been * written yet. 
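 *
 * A typical caller pattern (illustrative):
 *
 *   struct uvAliveSegment *cur = uvGetCurrentAliveSegment(uv);
 *   if (cur != NULL && cur->counter != 0) {
 *           // prepared and ready for writes (see uvAppendMaybeStart)
 *   }
 *
 * The head of uv->append_segments is the segment currently receiving
 * writes, while newly prepared segments are appended at the tail.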
*/ static struct uvAliveSegment *uvGetCurrentAliveSegment(struct uv *uv) { queue *head; if (queue_empty(&uv->append_segments)) { return NULL; } head = queue_head(&uv->append_segments); return QUEUE_DATA(head, struct uvAliveSegment, queue); } /* Extend the segment's write buffer by encoding the entries in the given * request into it. IOW, previous data in the write buffer will be retained, and * data for these new entries will be appended. */ static int uvAliveSegmentEncodeEntriesToWriteBuf(struct uvAliveSegment *segment, struct uvAppend *append) { int rv; assert(append->segment == segment); /* If this is the very first write to the segment, we need to include * the format version */ if (segment->pending.n == 0 && segment->next_block == 0) { rv = uvSegmentBufferFormat(&segment->pending); if (rv != 0) { return rv; } } rv = uvSegmentBufferAppend(&segment->pending, append->entries, append->n); if (rv != 0) { return rv; } segment->pending_last_index += append->n; return 0; } static int uvAppendMaybeStart(struct uv *uv); static void uvAliveSegmentWriteCb(struct UvWriterReq *write, const int status) { struct uvAliveSegment *s = write->data; struct uv *uv = s->uv; unsigned n_blocks; int rv; assert(uv->state != UV__CLOSED); assert(s->buf.len % uv->block_size == 0); assert(s->buf.len >= uv->block_size); /* Check if the write was successful. */ if (status != 0) { tracef("write: %s", uv->io->errmsg); uv->errored = true; goto out; } s->written = s->next_block * uv->block_size + s->pending.n; s->last_index = s->pending_last_index; /* Update our write markers. * * We have four cases: * * - The data fit completely in the leftover space of the first block * that we wrote and there is more space left. In this case we just keep * the scheduled marker unchanged. * * - The data fit completely in the leftover space of the first block * that we wrote and there is no space left. In this case we advance the * current block counter, reset the first write block and set the * scheduled marker to 0. * * - The data did not fit completely in the leftover space of the first * block that we wrote, so we wrote more than one block. The last * block that we wrote was not filled completely and has leftover space. * In this case we advance the current block counter and copy the memory * used for the last block to the head of the write arena list, updating * the scheduled marker accordingly. * * - The data did not fit completely in the leftover space of the first * block that we wrote, so we wrote more than one block. The last * block that we wrote was filled exactly and has no leftover space. In * this case we advance the current block counter, reset the first * buffer and set the scheduled marker to 0. */ n_blocks = (unsigned)(s->buf.len / uv->block_size); /* Number of blocks written. */ if (s->pending.n < uv->block_size) { /* Nothing to do */ assert(n_blocks == 1); } else if (s->pending.n == uv->block_size) { assert(n_blocks == 1); s->next_block++; uvSegmentBufferReset(&s->pending, 0); } else { assert(s->pending.n > uv->block_size); assert(s->buf.len > uv->block_size); if (s->pending.n % uv->block_size > 0) { s->next_block += n_blocks - 1; uvSegmentBufferReset(&s->pending, n_blocks - 1); } else { s->next_block += n_blocks; uvSegmentBufferReset(&s->pending, 0); } } out: /* Fire the callbacks of all requests that were fulfilled with this * write. */ uvAppendFinishWritingRequests(uv, status); if (status != 0) { /* When the write has failed additionally cancel all future * append related activity. 
This will also rewind * uv->append_next_index. All append requests need to be * canceled because raft assumes all appends happen in order and * if an append fails (and is not retried), we would be missing * a sequence of log entries on disk. The implementation can't * handle that + the accounting of the append index would be * off. */ uvAppendFinishPendingRequests(uv, status); /* Allow this segment to be finalized further down. Don't bother * rewinding state to possibly reuse the segment for writing, * it's too bug-prone. */ s->pending_last_index = s->last_index; s->finalize = true; } /* During the closing sequence we should have already canceled all * pending request. */ if (uv->closing) { assert(queue_empty(&uv->append_pending_reqs)); assert(s->finalize); uvAliveSegmentFinalize(s); return; } /* Possibly process waiting requests. */ if (!queue_empty(&uv->append_pending_reqs)) { rv = uvAppendMaybeStart(uv); if (rv != 0) { uv->errored = true; } } else if (s->finalize && (s->pending_last_index == s->last_index) && !s->writer.closing) { /* If there are no more append_pending_reqs or write requests in * flight, this segment must be finalized here in case we don't * receive AppendEntries RPCs anymore (could happen during a * Snapshot install, causing the BarrierCb to never fire), but * check that the callbacks that fired after completion of this * write didn't already close the segment. */ uvAliveSegmentFinalize(s); } } /* Submit a file write request to append the entries encoded in the write buffer * of the given segment. */ static int uvAliveSegmentWrite(struct uvAliveSegment *s) { int rv; assert(s->counter != 0); assert(s->pending.n > 0); uvSegmentBufferFinalize(&s->pending, &s->buf); rv = UvWriterSubmit(&s->writer, &s->write, &s->buf, 1, s->next_block * s->uv->block_size, uvAliveSegmentWriteCb); if (rv != 0) { return rv; } return 0; } /* Start writing all pending append requests for the current segment, unless we * are already writing, or the segment itself has not yet been prepared or we * are blocked on a barrier. If there are no more requests targeted at the * current segment, make sure it's marked to be finalize and try with the next * segment. */ static int uvAppendMaybeStart(struct uv *uv) { struct uvAliveSegment *segment; struct uvAppend *append; unsigned n_reqs; queue *head; queue q; int rv; assert(!uv->closing); assert(!queue_empty(&uv->append_pending_reqs)); /* If we are already writing, let's wait. */ if (!queue_empty(&uv->append_writing_reqs)) { return 0; } start: segment = uvGetCurrentAliveSegment(uv); assert(segment != NULL); /* If the preparer isn't done yet, let's wait. */ if (segment->counter == 0) { return 0; } /* If there's a blocking barrier in progress, and it's not waiting for * this segment to be finalized, let's wait. * * FIXME shouldn't we wait even if segment->barrier == uv->barrier, if * there are other open segments associated with the same barrier? */ if (uv->barrier != NULL && segment->barrier != uv->barrier && uv->barrier->blocking) { return 0; } /* If there's no barrier in progress and this segment is marked with a * barrier, it means that this was a pending barrier, which we can * become the current barrier now. */ if (uv->barrier == NULL && segment->barrier != NULL) { uv->barrier = segment->barrier; } /* Let's add to the segment's write buffer all pending requests targeted * to this segment. 
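 *
 * For example (illustrative): if requests A (3 entries) and B (2
 * entries) are both pending and both target this segment, the loop
 * below encodes all 5 entries into the same write buffer and a single
 * write is submitted for them further down.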
*/ queue_init(&q); n_reqs = 0; while (!queue_empty(&uv->append_pending_reqs)) { head = queue_head(&uv->append_pending_reqs); append = QUEUE_DATA(head, struct uvAppend, queue); assert(append->segment != NULL); if (append->segment != segment) { break; /* Not targeted to this segment */ } queue_remove(head); queue_insert_tail(&q, head); n_reqs++; rv = uvAliveSegmentEncodeEntriesToWriteBuf(segment, append); if (rv != 0) { goto err; } } /* If we have no more requests for this segment, let's check if it has * been marked for closing, and in that case finalize it and possibly * trigger a write against the next segment (unless there is a truncate * request, in that case we need to wait for it). Otherwise it must mean * we have exhausted the queue of pending append requests. */ if (n_reqs == 0) { assert(queue_empty(&uv->append_writing_reqs)); if (segment->finalize) { uvAliveSegmentFinalize(segment); if (!queue_empty(&uv->append_pending_reqs)) { goto start; } } assert(queue_empty(&uv->append_pending_reqs)); return 0; } while (!queue_empty(&q)) { head = queue_head(&q); queue_remove(head); queue_insert_tail(&uv->append_writing_reqs, head); } rv = uvAliveSegmentWrite(segment); if (rv != 0) { goto err; } return 0; err: assert(rv != 0); return rv; } /* Invoked when a newly added open segment becomes ready for writing, after the * associated UvPrepare request completes (either synchronously or * asynchronously). */ static int uvAliveSegmentReady(struct uv *uv, uv_file fd, uvCounter counter, struct uvAliveSegment *segment) { int rv; rv = UvWriterInit(&segment->writer, uv->loop, fd, uv->direct_io, uv->async_io, 1, uv->io->errmsg); if (rv != 0) { ErrMsgWrapf(uv->io->errmsg, "setup writer for open-%llu", counter); return rv; } segment->counter = counter; return 0; } static void uvAliveSegmentPrepareCb(struct uvPrepare *req, int status) { struct uvAliveSegment *segment = req->data; struct uv *uv = segment->uv; int rv; assert(segment->counter == 0); assert(segment->written == 0); /* If we have been closed, let's discard the segment. */ if (uv->closing) { queue_remove(&segment->queue); assert(status == RAFT_CANCELED); /* UvPrepare cancels pending reqs */ uvSegmentBufferClose(&segment->pending); RaftHeapFree(segment); return; } if (status != 0) { tracef("prepare segment failed (%d)", status); rv = status; goto err; } assert(req->counter > 0); assert(req->fd >= 0); /* There must be pending appends that were waiting for this prepare * requests. */ assert(!queue_empty(&uv->append_pending_reqs)); rv = uvAliveSegmentReady(uv, req->fd, req->counter, segment); if (rv != 0) { tracef("prepare segment ready failed (%d)", rv); goto err; } rv = uvAppendMaybeStart(uv); if (rv != 0) { tracef("prepare segment start failed (%d)", rv); goto err; } return; err: queue_remove(&segment->queue); RaftHeapFree(segment); uv->errored = true; uvAppendFinishPendingRequests(uv, rv); } /* Initialize a new open segment object. 
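 *
 * Note that the initial size is 8 bytes, accounting for the format
 * version word that uvSegmentBufferFormat() will put at the very
 * beginning of the segment, and that pending_last_index starts at
 * first_index - 1, meaning "no entry scheduled yet".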
*/ static void uvAliveSegmentInit(struct uvAliveSegment *s, struct uv *uv) { s->uv = uv; s->prepare.data = s; s->writer.data = s; s->write.data = s; s->counter = 0; s->first_index = uv->append_next_index; s->pending_last_index = s->first_index - 1; s->last_index = 0; s->size = sizeof(uint64_t) /* Format version */; s->next_block = 0; uvSegmentBufferInit(&s->pending, uv->block_size); s->written = 0; s->barrier = NULL; s->finalize = false; } /* Add a new active open segment, since the append request being submitted does * not fit in the last segment we scheduled writes for, or no segment had been * previously requested at all. */ static int uvAppendPushAliveSegment(struct uv *uv) { struct uvAliveSegment *segment; uv_file fd; uvCounter counter; int rv; segment = RaftHeapMalloc(sizeof *segment); if (segment == NULL) { rv = RAFT_NOMEM; goto err; } uvAliveSegmentInit(segment, uv); queue_insert_tail(&uv->append_segments, &segment->queue); rv = UvPrepare(uv, &fd, &counter, &segment->prepare, uvAliveSegmentPrepareCb); if (rv != 0) { goto err_after_alloc; } /* If we've been returned a ready prepared segment right away, start * writing to it immediately. */ if (fd != -1) { rv = uvAliveSegmentReady(uv, fd, counter, segment); if (rv != 0) { goto err_after_prepare; } } return 0; err_after_prepare: UvOsClose(fd); UvFinalize(uv, counter, 0, 0, 0); err_after_alloc: queue_remove(&segment->queue); RaftHeapFree(segment); err: assert(rv != 0); return rv; } /* Return the last segment that we have requested to prepare. */ static struct uvAliveSegment *uvGetLastAliveSegment(struct uv *uv) { queue *tail; if (queue_empty(&uv->append_segments)) { return NULL; } tail = queue_tail(&uv->append_segments); return QUEUE_DATA(tail, struct uvAliveSegment, queue); } /* Return #true if the remaining capacity of the given segment is equal or * greater than @size. */ static bool uvAliveSegmentHasEnoughSpareCapacity(struct uvAliveSegment *s, size_t size) { return s->size + size <= s->uv->segment_size; } /* Add @size bytes to the number of bytes that the segment will hold. The actual * write will happen when the previous write completes, if any. */ static void uvAliveSegmentReserveSegmentCapacity(struct uvAliveSegment *s, size_t size) { s->size += size; } /* Return the number of bytes needed to store the batch of entries of this * append request on disk. */ static size_t uvAppendSize(struct uvAppend *a) { size_t size = sizeof(uint32_t) * 2; /* CRC checksums */ unsigned i; size += uvSizeofBatchHeader(a->n, true); /* Batch header */ for (i = 0; i < a->n; i++) { /* Entries data */ size += bytePad64(a->entries[i].buf.len); } return size; } /* Enqueue an append entries request, assigning it to the appropriate active * open segment. */ static int uvAppendEnqueueRequest(struct uv *uv, struct uvAppend *append) { struct uvAliveSegment *segment; size_t size; bool fits; int rv; assert(append->entries != NULL); assert(append->n > 0); assert(uv->append_next_index > 0); tracef("enqueue %u entries", append->n); size = uvAppendSize(append); /* If we have no segments yet, it means this is the very first append, * and we need to add a new segment. Otherwise we check if the last * segment has enough room for this batch of entries. 
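 *
 * Worked example of the uvAppendSize() arithmetic above (illustrative,
 * without the local data extension): a request carrying 2 entries with
 * payloads of 10 and 16 bytes needs 8 (checksums) + 8 + 2 * 16 (batch
 * header) + 16 + 16 (payloads padded to 8-byte multiples) = 80 bytes of
 * segment capacity.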
*/ segment = uvGetLastAliveSegment(uv); if (segment == NULL || segment->finalize) { fits = false; } else { fits = uvAliveSegmentHasEnoughSpareCapacity(segment, size); if (!fits) { segment->finalize = true; /* Finalize when all writes are done */ } } /* If there's no segment or if this batch does not fit in this segment, * we need to add a new one. */ if (!fits) { rv = uvAppendPushAliveSegment(uv); if (rv != 0) { goto err; } } segment = uvGetLastAliveSegment(uv); /* Get the last added segment */ uvAliveSegmentReserveSegmentCapacity(segment, size); append->segment = segment; queue_insert_tail(&uv->append_pending_reqs, &append->queue); uv->append_next_index += append->n; tracef("set uv->append_next_index %llu", uv->append_next_index); return 0; err: assert(rv != 0); return rv; } /* Check that all entry buffers are 8-byte aligned */ static int uvCheckEntryBuffersAligned(struct uv *uv, const struct raft_entry entries[], unsigned n) { unsigned i; for (i = 0; i < n; i++) { if (entries[i].buf.len % 8) { ErrMsgPrintf(uv->io->errmsg, "entry buffers must be 8-byte aligned"); tracef("%s", uv->io->errmsg); return RAFT_INVALID; } } return 0; } int UvAppend(struct raft_io *io, struct raft_io_append *req, const struct raft_entry entries[], unsigned n, raft_io_append_cb cb) { struct uv *uv; struct uvAppend *append; int rv; uv = io->impl; assert(!uv->closing); append = RaftHeapCalloc(1, sizeof *append); if (append == NULL) { rv = RAFT_NOMEM; goto err; } append->req = req; append->entries = entries; append->n = n; req->cb = cb; rv = uvCheckEntryBuffersAligned(uv, entries, n); if (rv != 0) { goto err_after_req_alloc; } rv = uvAppendEnqueueRequest(uv, append); if (rv != 0) { goto err_after_req_alloc; } assert(append->segment != NULL); assert(!queue_empty(&uv->append_pending_reqs)); /* Try to write immediately. */ rv = uvAppendMaybeStart(uv); if (rv != 0) { return rv; } return 0; err_after_req_alloc: RaftHeapFree(append); err: assert(rv != 0); return rv; } /* Finalize the current segment as soon as all its pending or inflight append * requests get completed. */ static void uvFinalizeCurrentAliveSegmentOnceIdle(struct uv *uv) { struct uvAliveSegment *s; queue *head; bool has_pending_reqs; bool has_writing_reqs; s = uvGetCurrentAliveSegment(uv); if (s == NULL) { return; } /* Check if there are pending append requests targeted to the current * segment. */ has_pending_reqs = false; QUEUE_FOREACH(head, &uv->append_pending_reqs) { struct uvAppend *r = QUEUE_DATA(head, struct uvAppend, queue); if (r->segment == s) { has_pending_reqs = true; break; } } has_writing_reqs = !queue_empty(&uv->append_writing_reqs); /* If there is no pending append request or inflight write against the * current segment, we can submit a request for it to be closed * immediately. Otherwise, we set the finalize flag. * * TODO: is it actually possible to have pending requests with no * writing requests? Probably no. 
*/ if (!has_pending_reqs && !has_writing_reqs) { uvAliveSegmentFinalize(s); } else { s->finalize = true; } } bool UvBarrierReady(struct uv *uv) { if (uv->barrier == NULL) { return true; } queue *head; QUEUE_FOREACH(head, &uv->append_segments) { struct uvAliveSegment *segment; segment = QUEUE_DATA(head, struct uvAliveSegment, queue); if (segment->barrier == uv->barrier) { return false; } } return true; } bool UvBarrierMaybeTrigger(struct UvBarrier *barrier) { if (!barrier) { return false; } if (!queue_empty(&barrier->reqs)) { queue *head; struct UvBarrierReq *r; head = queue_head(&barrier->reqs); queue_remove(head); r = QUEUE_DATA(head, struct UvBarrierReq, queue); r->cb(r); return true; } return false; } /* Used during cleanup. */ static void uvBarrierTriggerAll(struct UvBarrier *barrier) { while (UvBarrierMaybeTrigger(barrier)) { ; } } static struct UvBarrier *uvBarrierCreate(void) { struct UvBarrier *barrier; barrier = RaftHeapCalloc(1, sizeof(*barrier)); if (!barrier) { return NULL; } barrier->blocking = false; queue_init(&barrier->reqs); return barrier; } int UvBarrier(struct uv *uv, raft_index next_index, struct UvBarrierReq *req) { /* The barrier to attach to. */ struct UvBarrier *barrier = NULL; struct uvAliveSegment *segment = NULL; queue *head; assert(!uv->closing); /* The next entry will be appended at this index. */ uv->append_next_index = next_index; tracef("UvBarrier uv->append_next_index:%llu", uv->append_next_index); /* Arrange for all open segments not already involved in other barriers * to be finalized as soon as their append requests get completed and * mark them as involved in this specific barrier request. */ QUEUE_FOREACH(head, &uv->append_segments) { segment = QUEUE_DATA(head, struct uvAliveSegment, queue); if (segment->barrier != NULL) { /* If a non-blocking barrier precedes this blocking * request, we want to also block all future writes. */ if (req->blocking) { segment->barrier->blocking = true; } continue; } if (!barrier) { barrier = uvBarrierCreate(); if (!barrier) { return RAFT_NOMEM; } /* And add the request to the barrier. */ UvBarrierAddReq(barrier, req); } segment->barrier = barrier; if (segment == uvGetCurrentAliveSegment(uv)) { uvFinalizeCurrentAliveSegmentOnceIdle(uv); continue; } segment->finalize = true; } /* Unable to attach to a segment, because all segments are involved in a * barrier, or there are no segments. */ if (barrier == NULL) { /* Attach req to last segment barrier. */ if (segment != NULL) { barrier = segment->barrier; /* There is no segment, attach to uv->barrier. */ } else if (uv->barrier != NULL) { barrier = uv->barrier; /* There is no uv->barrier, make new one. */ } else { barrier = uvBarrierCreate(); if (!barrier) { return RAFT_NOMEM; } } UvBarrierAddReq(barrier, req); } /* Let's not continue writing new entries if something down the line * asked us to stop writing. */ if (uv->barrier != NULL && req->blocking) { uv->barrier->blocking = true; } assert(barrier != NULL); if (uv->barrier == NULL) { uv->barrier = barrier; /* If there's no pending append-related activity, we can fire * the callback immediately. * * TODO: find a way to avoid invoking this synchronously. */ if (queue_empty(&uv->append_segments) && queue_empty(&uv->finalize_reqs) && uv->finalize_work.data == NULL) { /* Not interested in return value. */ UvBarrierMaybeTrigger(barrier); } } return 0; } void UvUnblock(struct uv *uv) { /* First fire all pending barrier requests. Unblock will be called again * when that request's callback is fired. 
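 *
 * Illustrative drain sequence, assuming two queued requests r1 and r2:
 *
 *   UvUnblock() -> fires r1->cb -> ... -> UvUnblock()
 *               -> fires r2->cb -> ... -> UvUnblock()
 *               -> queue empty: free uv->barrier and resume appends.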
*/ if (UvBarrierMaybeTrigger(uv->barrier)) { tracef("UvUnblock triggered barrier request callback."); return; } /* All requests in barrier are finished. */ tracef("UvUnblock queue empty"); RaftHeapFree(uv->barrier); uv->barrier = NULL; if (uv->closing) { uvMaybeFireCloseCb(uv); return; } if (!queue_empty(&uv->append_pending_reqs)) { int rv; rv = uvAppendMaybeStart(uv); if (rv != 0) { uv->errored = true; } } } void UvBarrierAddReq(struct UvBarrier *barrier, struct UvBarrierReq *req) { assert(barrier != NULL); assert(req != NULL); /* Once there's a blocking req, this barrier becomes blocking. */ barrier->blocking |= req->blocking; queue_insert_tail(&barrier->reqs, &req->queue); } /* Fire all pending barrier requests; the barrier callbacks will notice that * we're closing and abort there. */ static void uvBarrierClose(struct uv *uv) { tracef("uv barrier close"); struct UvBarrier *barrier = NULL; queue *head; assert(uv->closing); QUEUE_FOREACH(head, &uv->append_segments) { struct uvAliveSegment *segment; segment = QUEUE_DATA(head, struct uvAliveSegment, queue); if (segment->barrier != NULL && segment->barrier != barrier && segment->barrier != uv->barrier) { barrier = segment->barrier; /* Fire all barrier cb's, this is safe because the * barrier cb exits early when uv->closing is true. */ uvBarrierTriggerAll(barrier); RaftHeapFree(barrier); } /* The segment->barrier field is used: * * - by UvBarrierReady, to check whether it's time to invoke the * barrier callback after successfully finalizing a segment * - by uvAppendMaybeStart, to see whether we should go ahead * with writing to a segment even though a barrier is active * because the barrier is waiting on that same segment to be * finalized (but see the * FIXME in that function) * - to save a barrier for later, if UvBarrier was called when * uv->barrier was already set * * If we're cancelling the barrier, we don't need to save it for * later; the callback will not be invoked a second time in any * case; and uvAppendMaybeStart won't be called while closing. * So it's fine to clear segment->barrier here. */ segment->barrier = NULL; } /* There might still be a current barrier set on uv->barrier, meaning * that the open segment it was associated with has started to be * finalized and is no longer in the append_segments queue. Let's * cancel all untriggered barrier request callbacks too. */ if (uv->barrier != NULL) { uvBarrierTriggerAll(uv->barrier); /* Clear uv->barrier if there's no active work on the thread * pool. When the work on the threadpool finishes, UvUnblock * will notice we're closing, clear and free uv->barrier and * call uvMaybeFireCloseCb. UvUnblock will not try to fire * any more barrier request callbacks because they were * triggered in the line above. */ if (uv->snapshot_put_work.data == NULL && uv->truncate_work.data == NULL) { RaftHeapFree(uv->barrier); uv->barrier = NULL; } } } void uvAppendClose(struct uv *uv) { struct uvAliveSegment *segment; assert(uv->closing); uvBarrierClose(uv); UvPrepareClose(uv); uvAppendFinishPendingRequests(uv, RAFT_CANCELED); uvFinalizeCurrentAliveSegmentOnceIdle(uv); /* Also finalize the segments that we didn't write at all and are just * sitting in the append_segments queue waiting for writes against the * current segment to complete.
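 *
 * The loop below therefore walks the queue from the tail (the most
 * recently prepared segment) towards the head, and stops as soon as it
 * reaches the segment currently being written.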
*/ while (!queue_empty(&uv->append_segments)) { segment = uvGetLastAliveSegment(uv); assert(segment != NULL); if (segment == uvGetCurrentAliveSegment(uv)) { break; /* We reached the head of the queue */ } assert(segment->written == 0); uvAliveSegmentFinalize(segment); } } dqlite-1.16.7/src/raft/uv_encoding.c000066400000000000000000000346721465252713400173040ustar00rootroot00000000000000#include "uv_encoding.h" #include #include #include "../raft.h" #include "assert.h" #include "byte.h" #include "configuration.h" /** * Size of the request preamble. */ #define RAFT_IO_UV__PREAMBLE_SIZE \ (sizeof(uint64_t) /* Message type. */ + \ sizeof(uint64_t) /* Message size. */) static size_t sizeofRequestVoteV1(void) { return sizeof(uint64_t) + /* Term. */ sizeof(uint64_t) + /* Candidate ID. */ sizeof(uint64_t) + /* Last log index. */ sizeof(uint64_t) /* Last log term. */; } static size_t sizeofRequestVote(void) { return sizeofRequestVoteV1() + sizeof(uint64_t) /* Leadership transfer. */; } static size_t sizeofRequestVoteResultV1(void) { return sizeof(uint64_t) + /* Term. */ sizeof(uint64_t) /* Vote granted. */; } static size_t sizeofRequestVoteResult(void) { return sizeofRequestVoteResultV1() + /* Size of older version 1 message */ sizeof(uint64_t) /* Flags. */; } static size_t sizeofAppendEntries(const struct raft_append_entries *p) { return sizeof(uint64_t) + /* Leader's term. */ sizeof(uint64_t) + /* Leader ID */ sizeof(uint64_t) + /* Previous log entry index */ sizeof(uint64_t) + /* Previous log entry term */ sizeof(uint64_t) + /* Leader's commit index */ sizeof(uint64_t) + /* Number of entries in the batch */ 16 * p->n_entries /* One header per entry */; } static size_t sizeofAppendEntriesResultV0(void) { return sizeof(uint64_t) + /* Term. */ sizeof(uint64_t) + /* Success. */ sizeof(uint64_t) /* Last log index. */; } static size_t sizeofAppendEntriesResult(void) { return sizeofAppendEntriesResultV0() + sizeof(uint64_t) /* 64 bit Flags. */; } static size_t sizeofInstallSnapshot(const struct raft_install_snapshot *p) { size_t conf_size = configurationEncodedSize(&p->conf); return sizeof(uint64_t) + /* Leader's term. */ sizeof(uint64_t) + /* Leader ID */ sizeof(uint64_t) + /* Snapshot's last index */ sizeof(uint64_t) + /* Term of last index */ sizeof(uint64_t) + /* Configuration's index */ sizeof(uint64_t) + /* Length of configuration */ conf_size + /* Configuration data */ sizeof(uint64_t); /* Length of snapshot data */ } static size_t sizeofTimeoutNow(void) { return sizeof(uint64_t) + /* Term. */ sizeof(uint64_t) + /* Last log index. */ sizeof(uint64_t) /* Last log term. 
*/; } size_t uvSizeofBatchHeader(size_t n, bool with_local_data) { size_t res = 8 + /* Number of entries in the batch, little endian */ 16 * n; /* One header per entry */; if (with_local_data) { #ifdef DQLITE_NEXT res += 8; /* Local data length, applies to all entries */ #endif } return res; } static void encodeRequestVote(const struct raft_request_vote *p, void *buf) { void *cursor = buf; uint64_t flags = 0; if (p->disrupt_leader) { flags |= 1 << 0; } if (p->pre_vote) { flags |= 1 << 1; } bytePut64(&cursor, p->term); bytePut64(&cursor, p->candidate_id); bytePut64(&cursor, p->last_log_index); bytePut64(&cursor, p->last_log_term); bytePut64(&cursor, flags); } static void encodeRequestVoteResult(const struct raft_request_vote_result *p, void *buf) { void *cursor = buf; uint64_t flags = 0; if (p->pre_vote) { flags |= (1 << 0); } bytePut64(&cursor, p->term); bytePut64(&cursor, p->vote_granted); bytePut64(&cursor, flags); } static void encodeAppendEntries(const struct raft_append_entries *p, void *buf) { void *cursor; cursor = buf; bytePut64(&cursor, p->term); /* Leader's term. */ bytePut64(&cursor, p->prev_log_index); /* Previous index. */ bytePut64(&cursor, p->prev_log_term); /* Previous term. */ bytePut64(&cursor, p->leader_commit); /* Commit index. */ uvEncodeBatchHeader(p->entries, p->n_entries, cursor, false /* no local data */); } static void encodeAppendEntriesResult( const struct raft_append_entries_result *p, void *buf) { void *cursor = buf; bytePut64(&cursor, p->term); bytePut64(&cursor, p->rejected); bytePut64(&cursor, p->last_log_index); bytePut64(&cursor, p->features); } static void encodeInstallSnapshot(const struct raft_install_snapshot *p, void *buf) { void *cursor; size_t conf_size = configurationEncodedSize(&p->conf); cursor = buf; bytePut64(&cursor, p->term); /* Leader's term. */ bytePut64(&cursor, p->last_index); /* Snapshot last index. */ bytePut64(&cursor, p->last_term); /* Term of last index. */ bytePut64(&cursor, p->conf_index); /* Configuration index. */ bytePut64(&cursor, conf_size); /* Configuration length. */ configurationEncodeToBuf(&p->conf, cursor); cursor = (uint8_t *)cursor + conf_size; bytePut64(&cursor, p->data.len); /* Snapshot data size. */ } static void encodeTimeoutNow(const struct raft_timeout_now *p, void *buf) { void *cursor = buf; bytePut64(&cursor, p->term); bytePut64(&cursor, p->last_log_index); bytePut64(&cursor, p->last_log_term); } int uvEncodeMessage(const struct raft_message *message, uv_buf_t **bufs, unsigned *n_bufs) { uv_buf_t header; void *cursor; /* Figure out the length of the header for this request and allocate a * buffer for it. */ header.len = RAFT_IO_UV__PREAMBLE_SIZE; switch (message->type) { case RAFT_IO_REQUEST_VOTE: header.len += sizeofRequestVote(); break; case RAFT_IO_REQUEST_VOTE_RESULT: header.len += sizeofRequestVoteResult(); break; case RAFT_IO_APPEND_ENTRIES: header.len += sizeofAppendEntries(&message->append_entries); break; case RAFT_IO_APPEND_ENTRIES_RESULT: header.len += sizeofAppendEntriesResult(); break; case RAFT_IO_INSTALL_SNAPSHOT: header.len += sizeofInstallSnapshot(&message->install_snapshot); break; case RAFT_IO_TIMEOUT_NOW: header.len += sizeofTimeoutNow(); break; default: return RAFT_MALFORMED; }; header.base = raft_malloc(header.len); if (header.base == NULL) { goto oom; } cursor = header.base; /* Encode the request preamble, with message type and message size. */ bytePut64(&cursor, message->type); bytePut64(&cursor, header.len - RAFT_IO_UV__PREAMBLE_SIZE); /* Encode the request header. 
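 *
 * At this point the buffer starts with the 16-byte preamble written
 * just above:
 *
 *   [8 bytes] Message type, little endian.
 *   [8 bytes] Size of the type-specific header that follows, little
 *             endian (header.len minus the preamble size).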
*/ switch (message->type) { case RAFT_IO_REQUEST_VOTE: encodeRequestVote(&message->request_vote, cursor); break; case RAFT_IO_REQUEST_VOTE_RESULT: encodeRequestVoteResult(&message->request_vote_result, cursor); break; case RAFT_IO_APPEND_ENTRIES: encodeAppendEntries(&message->append_entries, cursor); break; case RAFT_IO_APPEND_ENTRIES_RESULT: encodeAppendEntriesResult( &message->append_entries_result, cursor); break; case RAFT_IO_INSTALL_SNAPSHOT: encodeInstallSnapshot(&message->install_snapshot, cursor); break; case RAFT_IO_TIMEOUT_NOW: encodeTimeoutNow(&message->timeout_now, cursor); break; }; *n_bufs = 1; /* For AppendEntries request we also send the entries payload. */ if (message->type == RAFT_IO_APPEND_ENTRIES) { *n_bufs += message->append_entries.n_entries; } /* For InstallSnapshot request we also send the snapshot payload. */ if (message->type == RAFT_IO_INSTALL_SNAPSHOT) { *n_bufs += 1; } *bufs = raft_calloc(*n_bufs, sizeof **bufs); if (*bufs == NULL) { goto oom_after_header_alloc; } (*bufs)[0] = header; if (message->type == RAFT_IO_APPEND_ENTRIES) { unsigned i; for (i = 0; i < message->append_entries.n_entries; i++) { const struct raft_entry *entry = &message->append_entries.entries[i]; (*bufs)[i + 1].base = entry->buf.base; (*bufs)[i + 1].len = entry->buf.len; } } if (message->type == RAFT_IO_INSTALL_SNAPSHOT) { (*bufs)[1].base = message->install_snapshot.data.base; (*bufs)[1].len = message->install_snapshot.data.len; } return 0; oom_after_header_alloc: raft_free(header.base); oom: return RAFT_NOMEM; } void uvEncodeBatchHeader(const struct raft_entry *entries, unsigned n, void *buf, bool with_local_data) { unsigned i; void *cursor = buf; /* Number of entries in the batch, little endian */ bytePut64(&cursor, n); if (with_local_data) { #ifdef DQLITE_NEXT /* Local data size per entry, little endian */ bytePut64(&cursor, (uint64_t)sizeof(struct raft_entry_local_data)); #endif } for (i = 0; i < n; i++) { const struct raft_entry *entry = &entries[i]; /* Term in which the entry was created, little endian. */ bytePut64(&cursor, entry->term); /* Message type (Either RAFT_COMMAND or RAFT_CHANGE) */ bytePut8(&cursor, (uint8_t)entry->type); cursor = (uint8_t *)cursor + 3; /* Unused */ /* Size of the log entry data, little endian. */ bytePut32(&cursor, (uint32_t)entry->buf.len); } } static void decodeRequestVote(const uv_buf_t *buf, struct raft_request_vote *p) { const void *cursor; cursor = buf->base; p->version = 1; p->term = byteGet64(&cursor); p->candidate_id = byteGet64(&cursor); p->last_log_index = byteGet64(&cursor); p->last_log_term = byteGet64(&cursor); /* Support for legacy request vote that doesn't have disrupt_leader. 
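 *
 * Version-2 messages carry an extra 64-bit flags word, mirroring
 * encodeRequestVote() above:
 *
 *   bit 0: disrupt_leader
 *   bit 1: pre_vote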
*/ if (buf->len == sizeofRequestVoteV1()) { p->disrupt_leader = false; p->pre_vote = false; } else { p->version = 2; uint64_t flags = byteGet64(&cursor); p->disrupt_leader = (bool)(flags & 1 << 0); p->pre_vote = (bool)(flags & 1 << 1); } } static void decodeRequestVoteResult(const uv_buf_t *buf, struct raft_request_vote_result *p) { const void *cursor; cursor = buf->base; p->version = 1; p->term = byteGet64(&cursor); p->vote_granted = byteGet64(&cursor); if (buf->len > sizeofRequestVoteResultV1()) { p->version = 2; uint64_t flags = byteGet64(&cursor); p->pre_vote = (flags & (1 << 0)); } } int uvDecodeBatchHeader(const void *batch, struct raft_entry **entries, unsigned *n, uint64_t *local_data_size) { const void *cursor = batch; size_t i; int rv; *n = (unsigned)byteGet64(&cursor); if (*n == 0) { *entries = NULL; return 0; } if (local_data_size != NULL) { #ifdef DQLITE_NEXT uint64_t z = byteGet64(&cursor); if (z == 0 || z > sizeof(struct raft_entry_local_data) || z % sizeof(uint64_t) != 0) { rv = RAFT_MALFORMED; goto err; } *local_data_size = z; #endif } *entries = raft_malloc(*n * sizeof **entries); if (*entries == NULL) { rv = RAFT_NOMEM; goto err; } for (i = 0; i < *n; i++) { struct raft_entry *entry = &(*entries)[i]; entry->term = byteGet64(&cursor); entry->type = byteGet8(&cursor); if (entry->type != RAFT_COMMAND && entry->type != RAFT_BARRIER && entry->type != RAFT_CHANGE) { rv = RAFT_MALFORMED; goto err_after_alloc; } cursor = (uint8_t *)cursor + 3; /* Unused */ /* Size of the log entry data, little endian. */ entry->buf.len = byteGet32(&cursor); } return 0; err_after_alloc: raft_free(*entries); *entries = NULL; err: assert(rv != 0); return rv; } static int decodeAppendEntries(const uv_buf_t *buf, struct raft_append_entries *args) { const void *cursor; int rv; assert(buf != NULL); assert(args != NULL); cursor = buf->base; args->version = 0; args->term = byteGet64(&cursor); args->prev_log_index = byteGet64(&cursor); args->prev_log_term = byteGet64(&cursor); args->leader_commit = byteGet64(&cursor); rv = uvDecodeBatchHeader(cursor, &args->entries, &args->n_entries, false); if (rv != 0) { return rv; } return 0; } static void decodeAppendEntriesResult(const uv_buf_t *buf, struct raft_append_entries_result *p) { const void *cursor; cursor = buf->base; p->version = 0; p->term = byteGet64(&cursor); p->rejected = byteGet64(&cursor); p->last_log_index = byteGet64(&cursor); p->features = 0; if (buf->len > sizeofAppendEntriesResultV0()) { p->version = 1; p->features = byteGet64(&cursor); } } static int decodeInstallSnapshot(const uv_buf_t *buf, struct raft_install_snapshot *args) { const void *cursor; struct raft_buffer conf; int rv; assert(buf != NULL); assert(args != NULL); cursor = buf->base; args->version = 0; args->term = byteGet64(&cursor); args->last_index = byteGet64(&cursor); args->last_term = byteGet64(&cursor); args->conf_index = byteGet64(&cursor); conf.len = (size_t)byteGet64(&cursor); conf.base = (void *)cursor; rv = configurationDecode(&conf, &args->conf); if (rv != 0) { return rv; } cursor = (uint8_t *)cursor + conf.len; args->data.len = (size_t)byteGet64(&cursor); return 0; } static void decodeTimeoutNow(const uv_buf_t *buf, struct raft_timeout_now *p) { const void *cursor; cursor = buf->base; p->version = 0; p->term = byteGet64(&cursor); p->last_log_index = byteGet64(&cursor); p->last_log_term = byteGet64(&cursor); } int uvDecodeMessage(uint16_t type, const uv_buf_t *header, struct raft_message *message, size_t *payload_len) { unsigned i; int rv = 0; memset(message, 0, 
sizeof(*message)); message->type = (unsigned short)type; *payload_len = 0; /* Decode the header. */ switch (type) { case RAFT_IO_REQUEST_VOTE: decodeRequestVote(header, &message->request_vote); break; case RAFT_IO_REQUEST_VOTE_RESULT: decodeRequestVoteResult(header, &message->request_vote_result); break; case RAFT_IO_APPEND_ENTRIES: rv = decodeAppendEntries(header, &message->append_entries); for (i = 0; i < message->append_entries.n_entries; i++) { *payload_len += message->append_entries.entries[i].buf.len; } break; case RAFT_IO_APPEND_ENTRIES_RESULT: decodeAppendEntriesResult( header, &message->append_entries_result); break; case RAFT_IO_INSTALL_SNAPSHOT: rv = decodeInstallSnapshot(header, &message->install_snapshot); *payload_len += message->install_snapshot.data.len; break; case RAFT_IO_TIMEOUT_NOW: decodeTimeoutNow(header, &message->timeout_now); break; default: rv = RAFT_IOERR; break; }; return rv; } int uvDecodeEntriesBatch(uint8_t *batch, size_t offset, struct raft_entry *entries, unsigned n, uint64_t local_data_size) { uint8_t *cursor; assert(batch != NULL); cursor = batch + offset; for (size_t i = 0; i < n; i++) { struct raft_entry *entry = &entries[i]; entry->batch = batch; entry->buf.base = (entry->buf.len > 0) ? cursor : NULL; cursor += entry->buf.len; if (entry->buf.len % 8 != 0) { /* Add padding */ cursor = cursor + 8 - (entry->buf.len % 8); } entry->is_local = false; entry->local_data = (struct raft_entry_local_data){}; assert(local_data_size <= sizeof(entry->local_data.buf)); assert(local_data_size % 8 == 0); #ifdef DQLITE_NEXT memcpy(entry->local_data.buf, cursor, local_data_size); cursor += local_data_size; #endif } return 0; } dqlite-1.16.7/src/raft/uv_encoding.h000066400000000000000000000036601465252713400173020ustar00rootroot00000000000000/* Encoding routines for the libuv-based @raft_io backend. */ #ifndef UV_ENCODING_H_ #define UV_ENCODING_H_ #include <uv.h> #include "../raft.h" /* Current disk format version. */ #ifdef DQLITE_NEXT #define UV__DISK_FORMAT 2 #else #define UV__DISK_FORMAT 1 #endif int uvEncodeMessage(const struct raft_message *message, uv_buf_t **bufs, unsigned *n_bufs); int uvDecodeMessage(uint16_t type, const uv_buf_t *header, struct raft_message *message, size_t *payload_len); int uvDecodeBatchHeader(const void *batch, struct raft_entry **entries, unsigned *n, uint64_t *local_data_size); int uvDecodeEntriesBatch(uint8_t *batch, size_t offset, struct raft_entry *entries, unsigned n, uint64_t local_data_size); /** * The layout of the memory pointed at by a @batch pointer is the following: * * [8 bytes] Number of entries in the batch, little endian. * [8 bytes] Size of each entry's local data, little endian (only present * in disk format version 2). * [header1] Header data of the first entry of the batch. * [ ... ] More headers * [headerN] Header data of the last entry of the batch. * [data1 ] Payload data of the first entry of the batch. * [ ... ] More data * [dataN ] Payload data of the last entry of the batch. * * An entry header is 16-byte long and has the following layout: * * [8 bytes] Term in which the entry was created, little endian. * [1 byte ] Message type (RAFT_COMMAND, RAFT_BARRIER or RAFT_CHANGE) * [3 bytes] Currently unused. * [4 bytes] Size of the log entry data, little endian. * * A payload data section for an entry is simply a sequence of bytes of * arbitrary length, possibly padded with extra bytes to reach an 8-byte * boundary (which means that all entry data pointers are 8-byte aligned); in * disk format version 2 each payload is followed by the entry's fixed-size * local data buffer.
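 *
 * Worked example (illustrative, disk format version 1, i.e. without the
 * local data size word): a batch with two entries whose payloads are 5
 * and 8 bytes long occupies 8 (count) + 2 * 16 (headers) + 8 (5 padded)
 * + 8 = 56 bytes, with the first payload starting at offset 40.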
*/ size_t uvSizeofBatchHeader(size_t n, bool with_local_data); void uvEncodeBatchHeader(const struct raft_entry *entries, unsigned n, void *buf, bool with_local_data); #endif /* UV_ENCODING_H_ */ dqlite-1.16.7/src/raft/uv_finalize.c000066400000000000000000000104771465252713400173140ustar00rootroot00000000000000#include "assert.h" #include "heap.h" #include "../lib/queue.h" #include "uv.h" #include "uv_os.h" /* Metadata about an open segment not used anymore and that should be closed or * remove (if not written at all). */ struct uvDyingSegment { struct uv *uv; uvCounter counter; /* Segment counter */ size_t used; /* Number of used bytes */ raft_index first_index; /* Index of first entry */ raft_index last_index; /* Index of last entry */ int status; /* Status code of blocking syscalls */ queue queue; /* Link to finalize queue */ }; /* Run all blocking syscalls involved in closing a used open segment. * * An open segment is closed by truncating its length to the number of bytes * that were actually written into it and then renaming it. */ static void uvFinalizeWorkCb(uv_work_t *work) { struct uvDyingSegment *segment = work->data; struct uv *uv = segment->uv; char filename1[UV__FILENAME_LEN]; char filename2[UV__FILENAME_LEN]; char errmsg[RAFT_ERRMSG_BUF_SIZE]; int rv; sprintf(filename1, UV__OPEN_TEMPLATE, segment->counter); sprintf(filename2, UV__CLOSED_TEMPLATE, segment->first_index, segment->last_index); tracef("finalize %s into %s", filename1, filename2); /* If the segment hasn't actually been used (because the writer has been * closed or aborted before making any write), just remove it. */ if (segment->used == 0) { tracef("remove unused segment file: %s", filename1); rv = UvFsRemoveFile(uv->dir, filename1, errmsg); if (rv != 0) { goto err; } goto sync; } /* Truncate and rename the segment.*/ rv = UvFsTruncateAndRenameFile(uv->dir, segment->used, filename1, filename2, errmsg); if (rv != 0) { goto err; } sync: rv = UvFsSyncDir(uv->dir, errmsg); if (rv != 0) { goto err; } segment->status = 0; return; err: tracef("truncate segment %s: %s", filename1, errmsg); assert(rv != 0); segment->status = rv; } static int uvFinalizeStart(struct uvDyingSegment *segment); static void uvFinalizeAfterWorkCb(uv_work_t *work, int status) { struct uvDyingSegment *segment = work->data; struct uv *uv = segment->uv; tracef("uv finalize after work segment %p cb status:%d", (void *)segment, status); queue *head; int rv; assert(status == 0); /* We don't cancel worker requests */ uv->finalize_work.data = NULL; if (segment->status != 0) { uv->errored = true; } RaftHeapFree(segment); /* If we have no more dismissed segments to close, check if there's a * barrier to unblock or if we are done closing. */ if (queue_empty(&uv->finalize_reqs)) { tracef("unblock barrier or close"); if (uv->barrier != NULL && UvBarrierReady(uv)) { UvBarrierMaybeTrigger(uv->barrier); } uvMaybeFireCloseCb(uv); return; } /* Grab a new dismissed segment to close. */ head = queue_head(&uv->finalize_reqs); segment = QUEUE_DATA(head, struct uvDyingSegment, queue); queue_remove(&segment->queue); rv = uvFinalizeStart(segment); if (rv != 0) { RaftHeapFree(segment); uv->errored = true; } } /* Start finalizing an open segment. 
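 *
 * Only one finalization runs on the threadpool at a time:
 * uv->finalize_work.data doubles as a busy flag, and further requests
 * wait in the uv->finalize_reqs queue (see UvFinalize() below).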
*/ static int uvFinalizeStart(struct uvDyingSegment *segment) { struct uv *uv = segment->uv; int rv; assert(uv->finalize_work.data == NULL); assert(segment->counter > 0); uv->finalize_work.data = segment; rv = uv_queue_work(uv->loop, &uv->finalize_work, uvFinalizeWorkCb, uvFinalizeAfterWorkCb); if (rv != 0) { ErrMsgPrintf(uv->io->errmsg, "start to truncate segment file %llu: %s", segment->counter, uv_strerror(rv)); return RAFT_IOERR; } return 0; } int UvFinalize(struct uv *uv, unsigned long long counter, size_t used, raft_index first_index, raft_index last_index) { struct uvDyingSegment *segment; int rv; if (used > 0) { assert(first_index > 0); assert(last_index >= first_index); } segment = RaftHeapMalloc(sizeof *segment); if (segment == NULL) { return RAFT_NOMEM; } segment->uv = uv; segment->counter = counter; segment->used = used; segment->first_index = first_index; segment->last_index = last_index; /* If we're already processing a segment, let's put the request in the * queue and wait. */ if (uv->finalize_work.data != NULL) { queue_insert_tail(&uv->finalize_reqs, &segment->queue); return 0; } rv = uvFinalizeStart(segment); if (rv != 0) { RaftHeapFree(segment); return rv; } return 0; } #undef tracef dqlite-1.16.7/src/raft/uv_fs.c000066400000000000000000000460721465252713400161230ustar00rootroot00000000000000#include "uv_fs.h" #include #include #include #include #include "assert.h" #include "compress.h" #include "err.h" #include "heap.h" #include "uv_os.h" int UvFsCheckDir(const char *dir, char *errmsg) { struct uv_fs_s req; int rv; /* Make sure we have a directory we can write into. */ rv = uv_fs_stat(NULL, &req, dir, NULL); if (rv != 0) { switch (rv) { case UV_ENOENT: ErrMsgPrintf((char *)errmsg, "directory '%s' does not exist", dir); return RAFT_NOTFOUND; case UV_EACCES: ErrMsgPrintf((char *)errmsg, "can't access directory '%s'", dir); return RAFT_UNAUTHORIZED; case UV_ENOTDIR: ErrMsgPrintf((char *)errmsg, "path '%s' is not a directory", dir); return RAFT_INVALID; } ErrMsgPrintf((char *)errmsg, "can't stat '%s': %s", dir, uv_strerror(rv)); return RAFT_IOERR; } if (!(req.statbuf.st_mode & S_IFDIR)) { ErrMsgPrintf((char *)errmsg, "path '%s' is not a directory", dir); return RAFT_INVALID; } if (!(req.statbuf.st_mode & S_IWRITE)) { ErrMsgPrintf((char *)errmsg, "directory '%s' is not writable", dir); return RAFT_INVALID; } return 0; } int UvFsSyncDir(const char *dir, char *errmsg) { uv_file fd; int rv; rv = UvOsOpen(dir, UV_FS_O_RDONLY | UV_FS_O_DIRECTORY, 0, &fd); if (rv != 0) { UvOsErrMsg(errmsg, "open directory", rv); return RAFT_IOERR; } rv = UvOsFsync(fd); UvOsClose(fd); if (rv != 0) { UvOsErrMsg(errmsg, "fsync directory", rv); return RAFT_IOERR; } return 0; } int UvFsFileExists(const char *dir, const char *filename, bool *exists, char *errmsg) { uv_stat_t sb; char path[UV__PATH_SZ]; int rv; rv = UvOsJoin(dir, filename, path); if (rv != 0) { return RAFT_INVALID; } rv = UvOsStat(path, &sb); if (rv != 0) { if (rv == UV_ENOENT) { *exists = false; goto out; } UvOsErrMsg(errmsg, "stat", rv); return RAFT_IOERR; } *exists = true; out: return 0; } /* Get the size of the given file. 
*/ int UvFsFileSize(const char *dir, const char *filename, off_t *size, char *errmsg) { uv_stat_t sb; char path[UV__PATH_SZ]; int rv; rv = UvOsJoin(dir, filename, path); if (rv != 0) { return RAFT_INVALID; } rv = UvOsStat(path, &sb); if (rv != 0) { UvOsErrMsg(errmsg, "stat", rv); return RAFT_IOERR; } *size = (off_t)sb.st_size; return 0; } int UvFsFileIsEmpty(const char *dir, const char *filename, bool *empty, char *errmsg) { off_t size; int rv; rv = UvFsFileSize(dir, filename, &size, errmsg); if (rv != 0) { return rv; } *empty = size == 0 ? true : false; return 0; } /* Open a file in a directory. */ static int uvFsOpenFile(const char *dir, const char *filename, int flags, int mode, uv_file *fd, char *errmsg) { char path[UV__PATH_SZ]; int rv; rv = UvOsJoin(dir, filename, path); if (rv != 0) { return RAFT_INVALID; } rv = UvOsOpen(path, flags, mode, fd); if (rv != 0) { UvOsErrMsg(errmsg, "open", rv); return RAFT_IOERR; } return 0; } int UvFsOpenFileForReading(const char *dir, const char *filename, uv_file *fd, char *errmsg) { char path[UV__PATH_SZ]; int flags = O_RDONLY; int rv; rv = UvOsJoin(dir, filename, path); if (rv != 0) { return RAFT_INVALID; } return uvFsOpenFile(dir, filename, flags, 0, fd, errmsg); } int UvFsAllocateFile(const char *dir, const char *filename, size_t size, uv_file *fd, bool fallocate, char *errmsg) { char path[UV__PATH_SZ]; int flags = O_WRONLY | O_CREAT | O_EXCL; /* Common open flags */ int rv = 0; rv = UvOsJoin(dir, filename, path); if (rv != 0) { return RAFT_INVALID; } /* Allocate the desired size. */ if (fallocate) { /* TODO: use RWF_DSYNC instead, if available. */ flags |= O_DSYNC; rv = uvFsOpenFile(dir, filename, flags, S_IRUSR | S_IWUSR, fd, errmsg); if (rv != 0) { goto err; } rv = UvOsFallocate(*fd, 0, (off_t)size); if (rv == 0) { return 0; } else if (rv == UV_ENOSPC) { ErrMsgPrintf(errmsg, "not enough space to allocate %zu bytes", size); rv = RAFT_NOSPACE; goto err_after_open; } else { UvOsErrMsg(errmsg, "posix_allocate", rv); rv = RAFT_IOERR; goto err_after_open; } } else { /* Emulate fallocate, open without O_DSYNC, because we risk * doing a lot of synced writes. */ rv = uvFsOpenFile(dir, filename, flags, S_IRUSR | S_IWUSR, fd, errmsg); if (rv != 0) { goto err; } rv = UvOsFallocateEmulation(*fd, 0, (off_t)size); if (rv == UV_ENOSPC) { ErrMsgPrintf(errmsg, "not enough space to allocate %zu bytes", size); rv = RAFT_NOSPACE; goto err_after_open; } else if (rv != 0) { ErrMsgPrintf(errmsg, "fallocate emulation %d", rv); rv = RAFT_IOERR; goto err_after_open; } rv = UvOsFsync(*fd); if (rv != 0) { ErrMsgPrintf(errmsg, "fsync %d", rv); rv = RAFT_IOERR; goto err_after_open; } /* Now close and reopen the file with O_DSYNC */ rv = UvOsClose(*fd); if (rv != 0) { ErrMsgPrintf(errmsg, "close %d", rv); goto err_unlink; } /* TODO: use RWF_DSYNC instead, if available. 
*/ flags = O_WRONLY | O_DSYNC; rv = uvFsOpenFile(dir, filename, flags, S_IRUSR | S_IWUSR, fd, errmsg); if (rv != 0) { goto err_unlink; } } return 0; err_after_open: UvOsClose(*fd); err_unlink: UvOsUnlink(path); err: assert(rv != 0); return rv; } static int uvFsWriteFile(const char *dir, const char *filename, int flags, struct raft_buffer *bufs, unsigned n_bufs, char *errmsg) { uv_file fd; int rv; size_t size; unsigned i; size = 0; for (i = 0; i < n_bufs; i++) { size += bufs[i].len; } rv = uvFsOpenFile(dir, filename, flags, S_IRUSR | S_IWUSR, &fd, errmsg); if (rv != 0) { goto err; } rv = UvOsWrite(fd, (const uv_buf_t *)bufs, n_bufs, 0); if (rv != (int)(size)) { if (rv < 0) { UvOsErrMsg(errmsg, "write", rv); } else { ErrMsgPrintf(errmsg, "short write: %d only bytes written", rv); } goto err_after_file_open; } rv = UvOsFsync(fd); if (rv != 0) { UvOsErrMsg(errmsg, "fsync", rv); goto err_after_file_open; } rv = UvOsClose(fd); if (rv != 0) { UvOsErrMsg(errmsg, "close", rv); goto err; } return 0; err_after_file_open: UvOsClose(fd); err: return rv; } int UvFsMakeFile(const char *dir, const char *filename, struct raft_buffer *bufs, unsigned n_bufs, char *errmsg) { int rv; char tmp_filename[UV__FILENAME_LEN + 1] = {0}; char path[UV__PATH_SZ] = {0}; char tmp_path[UV__PATH_SZ] = {0}; /* Create a temp file with the given content * TODO as of libuv 1.34.0, use `uv_fs_mkstemp` */ size_t sz = sizeof(tmp_filename); rv = snprintf(tmp_filename, sz, TMP_FILE_FMT, filename); if (rv < 0 || rv >= (int)sz) { return rv; } int flags = UV_FS_O_WRONLY | UV_FS_O_CREAT | UV_FS_O_EXCL; rv = uvFsWriteFile(dir, tmp_filename, flags, bufs, n_bufs, errmsg); if (rv != 0) { goto err_after_tmp_create; } /* Check if the file exists */ bool exists = false; rv = UvFsFileExists(dir, filename, &exists, errmsg); if (rv != 0) { goto err_after_tmp_create; } if (exists) { rv = -1; goto err_after_tmp_create; } /* Rename the temp file. Remark that there is a race between the * existence check and the rename, there is no `renameat2` equivalent in * libuv. 
However, in the current implementation this should pose no * problems.*/ rv = UvOsJoin(dir, tmp_filename, tmp_path); if (rv != 0) { return RAFT_INVALID; } rv = UvOsJoin(dir, filename, path); if (rv != 0) { return RAFT_INVALID; } rv = UvOsRename(tmp_path, path); if (rv != 0) { UvOsErrMsg(errmsg, "rename", rv); goto err_after_tmp_create; } rv = UvFsSyncDir(dir, errmsg); if (rv != 0) { char ignored[RAFT_ERRMSG_BUF_SIZE]; UvFsRemoveFile(dir, filename, ignored); return rv; } return 0; err_after_tmp_create: UvFsRemoveFile(dir, tmp_filename, errmsg); return rv; } int UvFsMakeOrOverwriteFile(const char *dir, const char *filename, const struct raft_buffer *buf, char *errmsg) { char path[UV__PATH_SZ]; int flags = UV_FS_O_WRONLY; int mode = 0; bool exists = true; uv_file fd; int rv; rv = UvOsJoin(dir, filename, path); if (rv != 0) { return RAFT_INVALID; } open: rv = UvOsOpen(path, flags, mode, &fd); if (rv != 0) { if (rv == UV_ENOENT && !(flags & UV_FS_O_CREAT)) { exists = false; flags |= UV_FS_O_CREAT; mode = S_IRUSR | S_IWUSR; goto open; } goto err; } rv = UvOsWrite(fd, (const uv_buf_t *)buf, 1, 0); if (rv != (int)(buf->len)) { if (rv < 0) { UvOsErrMsg(errmsg, "write", rv); } else { ErrMsgPrintf(errmsg, "short write: %d only bytes written", rv); } goto err_after_file_open; } if (exists) { rv = UvOsFdatasync(fd); if (rv != 0) { UvOsErrMsg(errmsg, "fsync", rv); goto err_after_file_open; } } else { rv = UvOsFsync(fd); if (rv != 0) { UvOsErrMsg(errmsg, "fsync", rv); goto err_after_file_open; } } rv = UvOsClose(fd); if (rv != 0) { UvOsErrMsg(errmsg, "close", rv); goto err; } if (!exists) { rv = UvFsSyncDir(dir, errmsg); if (rv != 0) { goto err; } } return 0; err_after_file_open: UvOsClose(fd); err: return RAFT_IOERR; } int UvFsReadInto(uv_file fd, struct raft_buffer *buf, char *errmsg) { ssize_t rv; size_t offset = 0; /* TODO: use uv_fs_read() */ while (offset < buf->len) { rv = read(fd, (char *)buf->base + offset, buf->len - offset); if (rv == -1) { UvOsErrMsg(errmsg, "read", -errno); return RAFT_IOERR; } /* EOF. Don't think this is reachable, but just make very sure * we don't loop forever. 
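 * If EOF does hit early, the short-read check after the loop reports it
 * as a RAFT_IOERR with a descriptive message.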
*/ if (rv == 0) { break; } assert(rv > 0); offset += (size_t)rv; } if (offset < buf->len) { ErrMsgPrintf(errmsg, "short read: %zu bytes instead of %zu", offset, buf->len); return RAFT_IOERR; } return 0; } int UvFsReadFile(const char *dir, const char *filename, struct raft_buffer *buf, char *errmsg) { uv_stat_t sb; char path[UV__PATH_SZ]; uv_file fd; int rv; rv = UvOsJoin(dir, filename, path); if (rv != 0) { return RAFT_INVALID; } rv = UvOsStat(path, &sb); if (rv != 0) { UvOsErrMsg(errmsg, "stat", rv); rv = RAFT_IOERR; goto err; } rv = uvFsOpenFile(dir, filename, O_RDONLY, 0, &fd, errmsg); if (rv != 0) { goto err; } buf->len = (size_t)sb.st_size; buf->base = RaftHeapMalloc(buf->len); if (buf->base == NULL) { ErrMsgOom(errmsg); rv = RAFT_NOMEM; goto err_after_open; } rv = UvFsReadInto(fd, buf, errmsg); if (rv != 0) { goto err_after_buf_alloc; } UvOsClose(fd); return 0; err_after_buf_alloc: RaftHeapFree(buf->base); err_after_open: UvOsClose(fd); err: return rv; } int UvFsReadFileInto(const char *dir, const char *filename, struct raft_buffer *buf, char *errmsg) { char path[UV__PATH_SZ]; uv_file fd; int rv; rv = UvOsJoin(dir, filename, path); if (rv != 0) { return RAFT_INVALID; } rv = uvFsOpenFile(dir, filename, O_RDONLY, 0, &fd, errmsg); if (rv != 0) { goto err; } rv = UvFsReadInto(fd, buf, errmsg); if (rv != 0) { goto err_after_open; } UvOsClose(fd); return 0; err_after_open: UvOsClose(fd); err: return rv; } int UvFsRemoveFile(const char *dir, const char *filename, char *errmsg) { char path[UV__PATH_SZ]; int rv; rv = UvOsJoin(dir, filename, path); if (rv != 0) { return RAFT_INVALID; } rv = UvOsUnlink(path); if (rv != 0) { UvOsErrMsg(errmsg, "unlink", rv); return RAFT_IOERR; } return 0; } int UvFsRenameFile(const char *dir, const char *filename1, const char *filename2, char *errmsg) { char path1[UV__PATH_SZ]; char path2[UV__PATH_SZ]; int rv; rv = UvOsJoin(dir, filename1, path1); if (rv != 0) { return RAFT_INVALID; } rv = UvOsJoin(dir, filename2, path2); if (rv != 0) { return RAFT_INVALID; } rv = UvOsRename(path1, path2); if (rv != 0) { UvOsErrMsg(errmsg, "rename", rv); return rv; } return 0; } int UvFsTruncateAndRenameFile(const char *dir, size_t size, const char *filename1, const char *filename2, char *errmsg) { char path1[UV__PATH_SZ]; char path2[UV__PATH_SZ]; uv_file fd; int rv; rv = UvOsJoin(dir, filename1, path1); if (rv != 0) { return RAFT_INVALID; } rv = UvOsJoin(dir, filename2, path2); if (rv != 0) { return RAFT_INVALID; } /* Truncate and rename. */ rv = UvOsOpen(path1, UV_FS_O_RDWR, 0, &fd); if (rv != 0) { UvOsErrMsg(errmsg, "open", rv); goto err; } rv = UvOsTruncate(fd, (off_t)size); if (rv != 0) { UvOsErrMsg(errmsg, "truncate", rv); goto err_after_open; } rv = UvOsFsync(fd); if (rv != 0) { UvOsErrMsg(errmsg, "fsync", rv); goto err_after_open; } UvOsClose(fd); rv = UvOsRename(path1, path2); if (rv != 0) { UvOsErrMsg(errmsg, "rename", rv); goto err; } return 0; err_after_open: UvOsClose(fd); err: return RAFT_IOERR; } /* Check if direct I/O is possible on the given fd. */ static int probeDirectIO(int fd, size_t *size, char *errmsg) { struct statfs fs_info; /* To check the file system type. */ void *buf; /* Buffer to use for the probe write. */ int rv; rv = UvOsSetDirectIo(fd); if (rv != 0) { if (rv != UV_EINVAL) { /* UNTESTED: the parameters are ok, so this should never * happen. 
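 * A UV_EINVAL return, by contrast, means the file system rejected
 * O_DIRECT; that case falls through to the fstatfs() probe below.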
*/ UvOsErrMsg(errmsg, "fnctl", rv); return RAFT_IOERR; } rv = fstatfs(fd, &fs_info); if (rv == -1) { /* UNTESTED: in practice ENOMEM should be the only * failure mode */ UvOsErrMsg(errmsg, "fstatfs", -errno); return RAFT_IOERR; } switch (fs_info.f_type) { case 0x01021994: /* TMPFS_MAGIC */ case 0x2fc12fc1: /* ZFS magic */ case 0x24051905: /* UBIFS Support magic */ *size = 0; return 0; default: /* UNTESTED: this is an unsupported file system. */ ErrMsgPrintf(errmsg, "unsupported file system: %llx", (unsigned long long)fs_info.f_type); return RAFT_IOERR; } } /* Try to perform direct I/O, using various buffer size. */ *size = 4096; while (*size >= 512) { buf = raft_aligned_alloc(*size, *size); if (buf == NULL) { ErrMsgOom(errmsg); return RAFT_NOMEM; } memset(buf, 0, *size); rv = (int)write(fd, buf, *size); raft_aligned_free(*size, buf); if (rv > 0) { /* Since we fallocate'ed the file, we should never fail * because of lack of disk space, and all bytes should * have been written. */ assert(rv == (int)(*size)); return 0; } assert(rv == -1); if (errno != EIO && errno != EOPNOTSUPP) { /* UNTESTED: this should basically fail only because of * disk errors, since we allocated the file with * posix_fallocate. */ /* FIXME: this is a workaround because shiftfs doesn't * return EINVAL in the fnctl call above, for example * when the underlying fs is ZFS. */ if (errno == EINVAL && *size == 4096) { *size = 0; return 0; } UvOsErrMsg(errmsg, "write", -errno); return RAFT_IOERR; } *size = *size / 2; } *size = 0; return 0; } /* Check if fully non-blocking async I/O is possible on the given fd. */ static int probeAsyncIO(int fd, size_t size, bool *ok, char *errmsg) { void *buf; /* Buffer to use for the probe write */ aio_context_t ctx = 0; /* KAIO context handle */ struct iocb iocb; /* KAIO request object */ struct iocb *iocbs = &iocb; /* Because the io_submit() API sucks */ struct io_event event; /* KAIO response object */ int n_events; int rv; /* Setup the KAIO context handle */ rv = UvOsIoSetup(1, &ctx); if (rv != 0) { UvOsErrMsg(errmsg, "io_setup", rv); /* UNTESTED: in practice this should fail only with ENOMEM */ return RAFT_IOERR; } /* Allocate the write buffer */ buf = raft_aligned_alloc(size, size); if (buf == NULL) { ErrMsgOom(errmsg); return RAFT_NOMEM; } memset(buf, 0, size); /* Prepare the KAIO request object */ memset(&iocb, 0, sizeof iocb); iocb.aio_lio_opcode = IOCB_CMD_PWRITE; *((void **)(&iocb.aio_buf)) = buf; iocb.aio_nbytes = size; iocb.aio_offset = 0; iocb.aio_fildes = (uint32_t)fd; iocb.aio_reqprio = 0; iocb.aio_rw_flags |= RWF_NOWAIT | RWF_DSYNC; /* Submit the KAIO request */ rv = UvOsIoSubmit(ctx, 1, &iocbs); if (rv != 0) { /* UNTESTED: in practice this should fail only with ENOMEM */ raft_aligned_free(size, buf); UvOsIoDestroy(ctx); /* On ZFS 0.8 this is not properly supported yet. Also, when * running on older kernels a binary compiled on a kernel with * RWF_NOWAIT support, we might get EINVAL. */ if (errno == EOPNOTSUPP || errno == EINVAL) { *ok = false; return 0; } UvOsErrMsg(errmsg, "io_submit", rv); return RAFT_IOERR; } /* Fetch the response: will block until done. */ n_events = UvOsIoGetevents(ctx, 1, 1, &event, NULL); assert(n_events == 1); if (n_events != 1) { /* UNTESTED */ UvOsErrMsg(errmsg, "UvOsIoGetevents", n_events); return RAFT_IOERR; } /* Release the write buffer. */ raft_aligned_free(size, buf); /* Release the KAIO context handle. 
*/ rv = UvOsIoDestroy(ctx); if (rv != 0) { UvOsErrMsg(errmsg, "io_destroy", rv); return RAFT_IOERR; } if (event.res > 0) { assert(event.res == (int)size); *ok = true; } else { /* UNTESTED: this should basically fail only because of disk * errors, since we allocated the file with posix_fallocate and * the block size is supposed to be correct. */ *ok = false; } return 0; } #define UV__FS_PROBE_FALLOCATE_FILE ".probe_fallocate" /* Leave detection of other error conditions to other probe* functions, only * bother checking if posix_fallocate returns success. */ static void probeFallocate(const char *dir, bool *fallocate) { int flags = O_WRONLY | O_CREAT | O_EXCL; /* Common open flags */ char ignored[RAFT_ERRMSG_BUF_SIZE]; int rv = 0; int fd = -1; *fallocate = false; UvFsRemoveFile(dir, UV__FS_PROBE_FALLOCATE_FILE, ignored); rv = uvFsOpenFile(dir, UV__FS_PROBE_FALLOCATE_FILE, flags, S_IRUSR | S_IWUSR, &fd, ignored); if (rv != 0) { goto out; } rv = UvOsFallocate(fd, 0, (off_t)4096); if (rv == 0) { *fallocate = true; } close(fd); out: UvFsRemoveFile(dir, UV__FS_PROBE_FALLOCATE_FILE, ignored); } #define UV__FS_PROBE_FILE ".probe" #define UV__FS_PROBE_FILE_SIZE 4096 int UvFsProbeCapabilities(const char *dir, size_t *direct, bool *async, bool *fallocate, char *errmsg) { int fd; /* File descriptor of the probe file */ int rv; char ignored[RAFT_ERRMSG_BUF_SIZE]; probeFallocate(dir, fallocate); /* Create a temporary probe file. */ UvFsRemoveFile(dir, UV__FS_PROBE_FILE, ignored); rv = UvFsAllocateFile(dir, UV__FS_PROBE_FILE, UV__FS_PROBE_FILE_SIZE, &fd, *fallocate, errmsg); if (rv != 0) { ErrMsgWrapf(errmsg, "create I/O capabilities probe file"); goto err; } UvFsRemoveFile(dir, UV__FS_PROBE_FILE, ignored); /* Check if we can use direct I/O. */ rv = probeDirectIO(fd, direct, errmsg); if (rv != 0) { ErrMsgWrapf(errmsg, "probe Direct I/O"); goto err_after_file_open; } /* If direct I/O is not possible, we can't perform fully asynchronous * I/O, because io_submit might potentially block. */ if (*direct == 0) { *async = false; goto out; } rv = probeAsyncIO(fd, *direct, async, errmsg); if (rv != 0) { ErrMsgWrapf(errmsg, "probe Async I/O"); goto err_after_file_open; } out: close(fd); return 0; err_after_file_open: close(fd); err: return rv; } dqlite-1.16.7/src/raft/uv_fs.h000066400000000000000000000070031465252713400161170ustar00rootroot00000000000000/* File system related utilities. */ #ifndef UV_FS_H_ #define UV_FS_H_ #include #include "../raft.h" #include "err.h" #define TMP_FILE_PREFIX "tmp-" #define TMP_FILE_FMT TMP_FILE_PREFIX "%s" /* Check that the given directory can be used. */ int UvFsCheckDir(const char *dir, char *errmsg); /* Sync the given directory by calling fsync(). */ int UvFsSyncDir(const char *dir, char *errmsg); /* Check whether a the given file exists. */ int UvFsFileExists(const char *dir, const char *filename, bool *exists, char *errmsg); /* Get the size of the given file. */ int UvFsFileSize(const char *dir, const char *filename, off_t *size, char *errmsg); /* Check whether the given file in the given directory is empty. */ int UvFsFileIsEmpty(const char *dir, const char *filename, bool *empty, char *errmsg); /* Create the given file in the given directory and allocate the given size to * it, returning its file descriptor. The file must not exist yet. */ int UvFsAllocateFile(const char *dir, const char *filename, size_t size, uv_file *fd, bool fallocate, char *errmsg); /* Create a file and write the given content into it. 
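 *
 * Usage sketch (hypothetical caller persisting a single buffer; the
 * filename is illustrative):
 *
 *   struct raft_buffer buf = { .base = data, .len = size };
 *   char errmsg[RAFT_ERRMSG_BUF_SIZE];
 *   int rv = UvFsMakeFile(dir, "example", &buf, 1, errmsg);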
*/ int UvFsMakeFile(const char *dir, const char *filename, struct raft_buffer *bufs, unsigned n_bufs, char *errmsg); /* Create or overwrite a file. * * If the file does not exists yet, it gets created, the given content written * to it, and then fully persisted to disk by fsync()'ing the file and the * dir. * * If the file already exists, it gets overwritten. The assumption is that the * file size will stay the same and its content will change, so only fdatasync() * will be used */ int UvFsMakeOrOverwriteFile(const char *dir, const char *filename, const struct raft_buffer *buf, char *errmsg); /* Open a file for reading. */ int UvFsOpenFileForReading(const char *dir, const char *filename, uv_file *fd, char *errmsg); /* Read exactly buf->len bytes from the given file descriptor into buf->base. Fail if less than buf->len bytes are read. */ int UvFsReadInto(uv_file fd, struct raft_buffer *buf, char *errmsg); /* Read all the content of the given file. */ int UvFsReadFile(const char *dir, const char *filename, struct raft_buffer *buf, char *errmsg); /* Read exactly buf->len bytes from the given file into buf->base. Fail if less * than buf->len bytes are read. */ int UvFsReadFileInto(const char *dir, const char *filename, struct raft_buffer *buf, char *errmsg); /* Synchronously remove a file, calling the unlink() system call. */ int UvFsRemoveFile(const char *dir, const char *filename, char *errmsg); /* Synchronously truncate a file to the given size and then rename it. */ int UvFsTruncateAndRenameFile(const char *dir, size_t size, const char *filename1, const char *filename2, char *errmsg); /* Synchronously rename a file. */ int UvFsRenameFile(const char *dir, const char *filename1, const char *filename2, char *errmsg); /* Return information about the I/O capabilities of the underlying file * system. * * The @direct parameter will be set to zero if direct I/O is not possible, or * to the block size to use for direct I/O otherwise. * * The @async parameter will be set to true if fully asynchronous I/O is * possible using the KAIO API. 
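 *
 * The @fallocate parameter will be set to true if posix_fallocate() works
 * on the underlying file system, and to false if pre-allocation has to be
 * emulated with explicit zero-filling writes.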
*/ int UvFsProbeCapabilities(const char *dir, size_t *direct, bool *async, bool *fallocate, char *errmsg); #endif /* UV_FS_H_ */ dqlite-1.16.7/src/raft/uv_ip.c000066400000000000000000000032631465252713400161160ustar00rootroot00000000000000#include #include #include #include "../raft.h" #include "uv_ip.h" static const char *strCpyUntil(char *target, const char *source, size_t target_size, char separator) { size_t i; for (i = 0; i < target_size; ++i) { if (!source[i] || source[i] == separator) { target[i] = 0; return source + i; } else { target[i] = source[i]; } } return NULL; } int uvIpAddrSplit(const char *address, char *host, size_t host_size, char *service, size_t service_size) { char colon = ':'; const char *service_ptr = NULL; if (host) { service_ptr = strCpyUntil(host, address, host_size, colon); if (!service_ptr) { return RAFT_NAMETOOLONG; } } if (service) { if (!service_ptr) { service_ptr = strchr(address, colon); } if (!service_ptr || *service_ptr == 0 || *(++service_ptr) == 0) { service_ptr = "8080"; } if (!strCpyUntil(service, service_ptr, service_size, 0)) { return RAFT_NAMETOOLONG; } } return 0; } /* Synchronoues resolve hostname to IP address */ int uvIpResolveBindAddresses(const char *address, struct addrinfo **ai_result) { static struct addrinfo hints = { .ai_flags = AI_PASSIVE | AI_NUMERICSERV, .ai_family = AF_INET, .ai_socktype = SOCK_STREAM, .ai_protocol = 0}; char hostname[NI_MAXHOST]; char service[NI_MAXSERV]; int rv; rv = uvIpAddrSplit(address, hostname, sizeof(hostname), service, sizeof(service)); if (rv != 0) { return rv; } if (hostname[0]) { rv = getaddrinfo(hostname, service, &hints, ai_result); } else { rv = getaddrinfo(NULL, service, &hints, ai_result); } if (rv != 0) { return RAFT_IOERR; } return 0; } dqlite-1.16.7/src/raft/uv_ip.h000066400000000000000000000006471465252713400161260ustar00rootroot00000000000000/* IP-related utils. */ #ifndef UV_IP_H_ #define UV_IP_H_ #include /* Split @address into @host and @service. */ int uvIpAddrSplit(const char *address, char *host, size_t host_size, char *service, size_t service_size); struct addrinfo; /* Synchronous resolve hostname to IP address */ int uvIpResolveBindAddresses(const char *address, struct addrinfo **ai_result); #endif /* UV_IP_H */ dqlite-1.16.7/src/raft/uv_list.c000066400000000000000000000046171465252713400164650ustar00rootroot00000000000000#include #include #include "assert.h" #include "uv.h" static const char *uvListIgnored[] = {".", "..", "metadata1", "metadata2", NULL}; /* Return true if the given filename should be ignored. 
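 * This covers ".", "..", the two metadata files, and any name too long to
 * be a valid segment or snapshot filename.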
*/ static bool uvListShouldIgnore(const char *filename) { const char **cursor = uvListIgnored; bool result = false; if (strlen(filename) >= UV__FILENAME_LEN) { return true; } while (*cursor != NULL) { if (strcmp(filename, *cursor) == 0) { result = true; break; } cursor++; } return result; } int UvList(struct uv *uv, struct uvSnapshotInfo *snapshots[], size_t *n_snapshots, struct uvSegmentInfo *segments[], size_t *n_segments, char *errmsg) { struct uv_fs_s req; struct uv_dirent_s entry; int n; int i; int rv; n = uv_fs_scandir(NULL, &req, uv->dir, 0, NULL); if (n < 0) { ErrMsgPrintf(errmsg, "scan data directory: %s", uv_strerror(n)); return RAFT_IOERR; } *snapshots = NULL; *n_snapshots = 0; *segments = NULL; *n_segments = 0; rv = 0; for (i = 0; i < n; i++) { const char *filename; bool appended; rv = uv_fs_scandir_next(&req, &entry); assert(rv == 0); /* Can't fail in libuv */ filename = entry.name; /* If this is not a segment or snapshot filename, just skip to * the next entry. */ if (uvListShouldIgnore(filename)) { tracef("ignore %s", filename); continue; } /* Append to the snapshot list if it's a snapshot metadata * filename and a valid associated snapshot file exists. */ rv = UvSnapshotInfoAppendIfMatch(uv, filename, snapshots, n_snapshots, &appended); if (rv != 0) { goto error; } if (appended) { tracef("snapshot %s", filename); continue; } /* Append to the segment list if it's a segment filename */ rv = uvSegmentInfoAppendIfMatch(entry.name, segments, n_segments, &appended); if (rv != 0) { goto error; } if (appended) { tracef("segment %s", filename); continue; } tracef("ignore %s", filename); } rv = uv_fs_scandir_next(&req, &entry); assert(rv == UV_EOF); if (*snapshots != NULL) { UvSnapshotSort(*snapshots, *n_snapshots); } if (*segments != NULL) { uvSegmentSort(*segments, *n_segments); } return 0; error: uv_fs_req_cleanup(&req); raft_free(*segments); *segments = NULL; raft_free(*snapshots); *snapshots = NULL; return rv; } #undef tracef dqlite-1.16.7/src/raft/uv_metadata.c000066400000000000000000000124451465252713400172700ustar00rootroot00000000000000#include "assert.h" #include "byte.h" #include "uv.h" #include "uv_encoding.h" /* We have metadata1 and metadata2. */ #define METADATA_FILENAME_PREFIX "metadata" #define METADATA_FILENAME_SIZE (sizeof(METADATA_FILENAME_PREFIX) + 2) /* Format, version, term, vote */ #define METADATA_CONTENT_SIZE (8 * 4) /* Encode the content of a metadata file. */ static void uvMetadataEncode(const struct uvMetadata *metadata, void *buf) { void *cursor = buf; bytePut64(&cursor, UV__DISK_FORMAT); bytePut64(&cursor, metadata->version); bytePut64(&cursor, metadata->term); bytePut64(&cursor, metadata->voted_for); } /* Decode the content of a metadata file. */ static int uvMetadataDecode(const void *buf, struct uvMetadata *metadata, char *errmsg) { const void *cursor = buf; uint64_t format; format = byteGet64(&cursor); if (format != UV__DISK_FORMAT) { ErrMsgPrintf(errmsg, "bad format version %ju", format); return RAFT_MALFORMED; } metadata->version = byteGet64(&cursor); metadata->term = byteGet64(&cursor); metadata->voted_for = byteGet64(&cursor); /* Coherence checks that values make sense */ if (metadata->version == 0) { ErrMsgPrintf(errmsg, "version is set to zero"); return RAFT_CORRUPT; } return 0; } /* Render the filename of the metadata file with index @n.
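 * For example, n == 1 yields "metadata1" and n == 2 yields "metadata2".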
*/ static void uvMetadataFilename(const unsigned short n, char *filename) { sprintf(filename, METADATA_FILENAME_PREFIX "%d", n); } /* Read the n'th metadata file (with n equal to 1 or 2) and decode the content * of the file, populating the given metadata buffer accordingly. */ static int uvMetadataLoadN(const char *dir, const unsigned short n, struct uvMetadata *metadata, char *errmsg) { char filename[METADATA_FILENAME_SIZE]; /* Filename of the metadata file */ uint8_t content[METADATA_CONTENT_SIZE]; /* Content of metadata file */ off_t size; struct raft_buffer buf; bool exists; int rv; assert(n == 1 || n == 2); /* Render the metadata path */ uvMetadataFilename(n, filename); rv = UvFsFileExists(dir, filename, &exists, errmsg); if (rv != 0) { ErrMsgWrapf(errmsg, "check if %s exists", filename); return rv; } memset(metadata, 0, sizeof *metadata); /* If the file does not exist, just return. */ if (!exists) { return 0; } /* If the file exists but has fewer bytes than expected, assume that the * server crashed while writing this metadata file, and pretend it has * not been written at all. If it has more bytes than expected, return an * error. */ rv = UvFsFileSize(dir, filename, &size, errmsg); if (rv != 0) { ErrMsgWrapf(errmsg, "check size of %s", filename); return rv; } if (size != sizeof content) { if ((size_t)size < sizeof content) { rv = UvFsRemoveFile(dir, filename, errmsg); if (rv != 0) { return rv; } return 0; } ErrMsgPrintf(errmsg, "%s has size %jd instead of %zu", filename, (intmax_t)size, sizeof content); return RAFT_CORRUPT; } /* Read the content of the metadata file. */ buf.base = content; buf.len = sizeof content; rv = UvFsReadFileInto(dir, filename, &buf, errmsg); if (rv != 0) { ErrMsgWrapf(errmsg, "read content of %s", filename); return rv; } /* Decode the content of the metadata file. */ rv = uvMetadataDecode(content, metadata, errmsg); if (rv != 0) { ErrMsgWrapf(errmsg, "decode content of %s", filename); return rv; } return 0; } int uvMetadataLoad(const char *dir, struct uvMetadata *metadata, char *errmsg) { struct uvMetadata metadata1; struct uvMetadata metadata2; int rv; /* Read the two metadata files (if available). */ rv = uvMetadataLoadN(dir, 1, &metadata1, errmsg); if (rv != 0) { return rv; } rv = uvMetadataLoadN(dir, 2, &metadata2, errmsg); if (rv != 0) { return rv; } /* Check the versions. */ if (metadata1.version == 0 && metadata2.version == 0) { /* Neither metadata file exists: this is a brand new server. */ metadata->version = 0; metadata->term = 0; metadata->voted_for = 0; } else if (metadata1.version == metadata2.version) { /* The two metadata files can't have the same version. */ ErrMsgPrintf(errmsg, "metadata1 and metadata2 are both at version %llu", metadata1.version); return RAFT_CORRUPT; } else { /* Pick the metadata with the greater version. */ if (metadata1.version > metadata2.version) { *metadata = metadata1; } else { *metadata = metadata2; } } return 0; } /* Return the metadata file index associated with the given version. */ static unsigned short uvMetadataFileIndex(unsigned long long version) { return version % 2 == 1 ? 1 : 2; } int uvMetadataStore(struct uv *uv, const struct uvMetadata *metadata) { char filename[METADATA_FILENAME_SIZE]; /* Filename of the metadata file */ uint8_t content[METADATA_CONTENT_SIZE]; /* Content of metadata file */ struct raft_buffer buf; unsigned short n; int rv; assert(metadata->version > 0); /* Encode the given metadata. */ uvMetadataEncode(metadata, content); /* Render the metadata file name.
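 * Odd versions map to metadata1 and even versions to metadata2, so
 * successive stores alternate between the two files.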
*/ n = uvMetadataFileIndex(metadata->version); uvMetadataFilename(n, filename); /* Write the metadata file, creating it if it does not exist. */ buf.base = content; buf.len = sizeof content; rv = UvFsMakeOrOverwriteFile(uv->dir, filename, &buf, uv->io->errmsg); if (rv != 0) { ErrMsgWrapf(uv->io->errmsg, "persist %s", filename); return rv; } return 0; } dqlite-1.16.7/src/raft/uv_os.c000066400000000000000000000077721465252713400161400ustar00rootroot00000000000000#include "uv_os.h" #include #include #include #include #include #include #include #include #include #include #include #include "assert.h" #include "err.h" #include "syscall.h" /* Default permissions when creating a directory. */ #define DEFAULT_DIR_PERM 0700 int UvOsOpen(const char *path, int flags, int mode, uv_file *fd) { struct uv_fs_s req; int rv; rv = uv_fs_open(NULL, &req, path, flags, mode, NULL); if (rv < 0) { return rv; } *fd = rv; return 0; } int UvOsClose(uv_file fd) { struct uv_fs_s req; return uv_fs_close(NULL, &req, fd, NULL); } /* Emulate fallocate(). Mostly taken from glibc's implementation. */ int UvOsFallocateEmulation(int fd, off_t offset, off_t len) { ssize_t increment; struct statfs f; int rv; rv = fstatfs(fd, &f); if (rv != 0) { return -errno; } if (f.f_bsize == 0) { increment = 512; } else if (f.f_bsize < 4096) { increment = (ssize_t)f.f_bsize; } else { increment = 4096; } for (offset += (len - 1) % increment; len > 0; offset += increment) { len -= increment; rv = (int)pwrite(fd, "", 1, offset); if (rv != 1) { return -errno; } } return 0; } int UvOsFallocate(uv_file fd, off_t offset, off_t len) { int rv; rv = posix_fallocate(fd, offset, len); if (rv != 0) { /* From the manual page: * * posix_fallocate() returns zero on success, or an error * number on failure. Note that errno is not set. */ return -rv; } return 0; } int UvOsTruncate(uv_file fd, off_t offset) { struct uv_fs_s req; return uv_fs_ftruncate(NULL, &req, fd, offset, NULL); } int UvOsFsync(uv_file fd) { struct uv_fs_s req; return uv_fs_fsync(NULL, &req, fd, NULL); } int UvOsFdatasync(uv_file fd) { struct uv_fs_s req; return uv_fs_fdatasync(NULL, &req, fd, NULL); } int UvOsStat(const char *path, uv_stat_t *sb) { struct uv_fs_s req; int rv; rv = uv_fs_stat(NULL, &req, path, NULL); if (rv != 0) { return rv; } memcpy(sb, &req.statbuf, sizeof *sb); return 0; } int UvOsWrite(uv_file fd, const uv_buf_t bufs[], unsigned int nbufs, int64_t offset) { struct uv_fs_s req; return uv_fs_write(NULL, &req, fd, bufs, nbufs, offset, NULL); } int UvOsUnlink(const char *path) { struct uv_fs_s req; return uv_fs_unlink(NULL, &req, path, NULL); } int UvOsRename(const char *path1, const char *path2) { struct uv_fs_s req; return uv_fs_rename(NULL, &req, path1, path2, NULL); } int UvOsJoin(const char *dir, const char *filename, char *path) { if (!UV__DIR_HAS_VALID_LEN(dir) || !UV__FILENAME_HAS_VALID_LEN(filename)) { return -1; } strcpy(path, dir); strcat(path, "/"); strcat(path, filename); return 0; } int UvOsIoSetup(unsigned nr, aio_context_t *ctxp) { int rv; rv = io_setup(nr, ctxp); if (rv == -1) { return -errno; } return 0; } int UvOsIoDestroy(aio_context_t ctx) { int rv; rv = io_destroy(ctx); if (rv == -1) { return -errno; } return 0; } int UvOsIoSubmit(aio_context_t ctx, long nr, struct iocb **iocbpp) { int rv; rv = io_submit(ctx, nr, iocbpp); if (rv == -1) { return -errno; } assert(rv == nr); /* TODO: can something else be returned? 
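 * (Per io_submit(2), a partial submission returns the number of requests
 * actually queued, so in principle 0 <= rv < nr is possible; with a
 * single iocb, success implies rv == 1.)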
*/ return 0; } int UvOsIoGetevents(aio_context_t ctx, long min_nr, long max_nr, struct io_event *events, struct timespec *timeout) { int rv; do { rv = io_getevents(ctx, min_nr, max_nr, events, timeout); } while (rv == -1 && errno == EINTR); if (rv == -1) { return -errno; } assert(rv >= min_nr); assert(rv <= max_nr); return rv; } int UvOsEventfd(unsigned int initval, int flags) { int rv; /* At the moment only UV_FS_O_NONBLOCK is supported */ assert(flags == UV_FS_O_NONBLOCK); flags = EFD_NONBLOCK | EFD_CLOEXEC; rv = eventfd(initval, flags); if (rv == -1) { return -errno; } return rv; } int UvOsSetDirectIo(uv_file fd) { int flags; /* Current fcntl flags */ int rv; flags = fcntl(fd, F_GETFL); rv = fcntl(fd, F_SETFL, flags | UV_FS_O_DIRECT); if (rv == -1) { return -errno; } return 0; } dqlite-1.16.7/src/raft/uv_os.h000066400000000000000000000055031465252713400161330ustar00rootroot00000000000000/* Operating system related utilities. */ #ifndef UV_OS_H_ #define UV_OS_H_ #include #include #include #include #include /* Maximum size of a full file system path string. */ #define UV__PATH_SZ 1024 /* Maximum length of a filename string. */ #define UV__FILENAME_LEN 128 /* Length of path separator. */ #define UV__SEP_LEN 1 /* strlen("/") */ /* True if STR's length is at most LEN. */ #define LEN_AT_MOST_(STR, LEN) (strnlen(STR, LEN + 1) <= LEN) /* Maximum length of a directory path string. */ #define UV__DIR_LEN (UV__PATH_SZ - UV__SEP_LEN - UV__FILENAME_LEN - 1) /* True if the given DIR string has at most UV__DIR_LEN chars. */ #define UV__DIR_HAS_VALID_LEN(DIR) LEN_AT_MOST_(DIR, UV__DIR_LEN) /* True if the given FILENAME string has at most UV__FILENAME_LEN chars. */ #define UV__FILENAME_HAS_VALID_LEN(FILENAME) \ LEN_AT_MOST_(FILENAME, UV__FILENAME_LEN) /* Portable open() */ int UvOsOpen(const char *path, int flags, int mode, uv_file *fd); /* Portable close() */ int UvOsClose(uv_file fd); /* TODO: figure a portable abstraction. */ int UvOsFallocate(uv_file fd, off_t offset, off_t len); /* Emulation to use in case UvOsFallocate fails with -EONOTSUPP. * This might happen with a libc implementation (e.g. musl) that * doesn't implement a transparent fallback if fallocate() is * not supported by the underlying file system. */ int UvOsFallocateEmulation(int fd, off_t offset, off_t len); /* Portable truncate() */ int UvOsTruncate(uv_file fd, off_t offset); /* Portable fsync() */ int UvOsFsync(uv_file fd); /* Portable fdatasync() */ int UvOsFdatasync(uv_file fd); /* Portable stat() */ int UvOsStat(const char *path, uv_stat_t *sb); /* Portable write() */ int UvOsWrite(uv_file fd, const uv_buf_t bufs[], unsigned int nbufs, int64_t offset); /* Portable unlink() */ int UvOsUnlink(const char *path); /* Portable rename() */ int UvOsRename(const char *path1, const char *path2); /* Join dir and filename into a full OS path. */ int UvOsJoin(const char *dir, const char *filename, char *path); /* TODO: figure a portable abstraction. */ int UvOsIoSetup(unsigned nr, aio_context_t *ctxp); int UvOsIoDestroy(aio_context_t ctx); int UvOsIoSubmit(aio_context_t ctx, long nr, struct iocb **iocbpp); int UvOsIoGetevents(aio_context_t ctx, long min_nr, long max_nr, struct io_event *events, struct timespec *timeout); int UvOsEventfd(unsigned int initval, int flags); int UvOsSetDirectIo(uv_file fd); /* Format an error message caused by a failed system call or stdlib function. 
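 *
 * Usage sketch (mirroring how the callers use it; values are
 * illustrative):
 *
 *   rv = UvOsOpen(path, flags, mode, &fd);
 *   if (rv != 0) {
 *           UvOsErrMsg(errmsg, "open", rv);
 *           return RAFT_IOERR;
 *   }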
*/ #define UvOsErrMsg(ERRMSG, SYSCALL, ERRNUM) \ { \ ErrMsgPrintf(ERRMSG, "%s", uv_strerror(ERRNUM)); \ ErrMsgWrapf(ERRMSG, SYSCALL); \ } #endif /* UV_OS_H_ */ dqlite-1.16.7/src/raft/uv_prepare.c000066400000000000000000000207741465252713400171520ustar00rootroot00000000000000#include #include #include "assert.h" #include "heap.h" #include "uv.h" #include "uv_os.h" /* The happy path for UvPrepare is: * * - If there is an unused open segment available, return its fd and counter * immediately. * * - Otherwise, wait for the creation of a new open segment to complete, * possibly kicking off the creation logic if no segment is being created * currently. * * Possible failure modes are: * * - The create file request fails, in that case we fail all pending prepare * requests and we mark the uv instance as errored. * * On close: * * - Cancel all pending prepare requests. * - Remove unused prepared open segments. * - Wait for any pending internal segment creation and then discard the newly * created segment. */ /* Number of open segments that we try to keep ready for writing. */ #define UV__TARGET_POOL_SIZE 2 /* An open segment being prepared or sitting in the pool */ struct uvIdleSegment { struct uv *uv; /* Open segment file */ size_t size; /* Segment size */ struct uv_work_s work; /* To execute logic in the threadpool */ int status; /* Result of threadpool callback */ char errmsg[RAFT_ERRMSG_BUF_SIZE]; /* Error of threadpool callback */ unsigned long long counter; /* Segment counter */ char filename[UV__FILENAME_LEN]; /* Filename of the segment */ uv_file fd; /* File descriptor of prepared file */ queue queue; /* Pool */ }; static void uvPrepareWorkCb(uv_work_t *work) { struct uvIdleSegment *segment = work->data; struct uv *uv = segment->uv; int rv; rv = UvFsAllocateFile(uv->dir, segment->filename, segment->size, &segment->fd, uv->fallocate, segment->errmsg); if (rv != 0) { goto err; } rv = UvFsSyncDir(uv->dir, segment->errmsg); if (rv != 0) { goto err_after_allocate; } segment->status = 0; return; err_after_allocate: UvOsClose(segment->fd); err: assert(rv != 0); segment->status = rv; return; } /* Flush all pending requests, invoking their callbacks with the given * status. */ static void uvPrepareFinishAllRequests(struct uv *uv, int status) { while (!queue_empty(&uv->prepare_reqs)) { queue *head; struct uvPrepare *req; head = queue_head(&uv->prepare_reqs); req = QUEUE_DATA(head, struct uvPrepare, queue); queue_remove(&req->queue); req->cb(req, status); } } /* Pop the oldest prepared segment in the pool and return its fd and counter * through the given pointers. */ static void uvPrepareConsume(struct uv *uv, uv_file *fd, uvCounter *counter) { queue *head; struct uvIdleSegment *segment; /* Pop a segment from the pool. */ head = queue_head(&uv->prepare_pool); segment = QUEUE_DATA(head, struct uvIdleSegment, queue); assert(segment->fd >= 0); queue_remove(&segment->queue); *fd = segment->fd; *counter = segment->counter; RaftHeapFree(segment); } /* Finish the oldest pending prepare request using the next available prepared * segment. */ static void uvPrepareFinishOldestRequest(struct uv *uv) { queue *head; struct uvPrepare *req; assert(!uv->closing); assert(!queue_empty(&uv->prepare_reqs)); assert(!queue_empty(&uv->prepare_pool)); /* Pop the head of the prepare requests queue. 
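 * Requests are served in strict FIFO order, so the longest-waiting caller
 * receives the next available prepared segment.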
*/ head = queue_head(&uv->prepare_reqs); req = QUEUE_DATA(head, struct uvPrepare, queue); queue_remove(&req->queue); /* Finish the request */ uvPrepareConsume(uv, &req->fd, &req->counter); req->cb(req, 0); } /* Return the number of ready prepared open segments in the pool. */ static unsigned uvPrepareCount(struct uv *uv) { queue *head; unsigned n; n = 0; QUEUE_FOREACH(head, &uv->prepare_pool) { n++; } return n; } static void uvPrepareAfterWorkCb(uv_work_t *work, int status); /* Start creating a new segment file. */ static int uvPrepareStart(struct uv *uv) { struct uvIdleSegment *segment; int rv; assert(uv->prepare_inflight == NULL); assert(uvPrepareCount(uv) < UV__TARGET_POOL_SIZE); segment = RaftHeapMalloc(sizeof *segment); if (segment == NULL) { rv = RAFT_NOMEM; goto err; } memset(segment, 0, sizeof *segment); segment->uv = uv; segment->counter = uv->prepare_next_counter; segment->work.data = segment; segment->fd = -1; segment->size = uv->block_size * uvSegmentBlocks(uv); sprintf(segment->filename, UV__OPEN_TEMPLATE, segment->counter); tracef("create open segment %s", segment->filename); rv = uv_queue_work(uv->loop, &segment->work, uvPrepareWorkCb, uvPrepareAfterWorkCb); if (rv != 0) { /* UNTESTED: with the current libuv implementation this can't * fail. */ tracef("can't create segment %s: %s", segment->filename, uv_strerror(rv)); rv = RAFT_IOERR; goto err_after_segment_alloc; } uv->prepare_inflight = segment; uv->prepare_next_counter++; return 0; err_after_segment_alloc: RaftHeapFree(segment); err: assert(rv != 0); return rv; } static void uvPrepareAfterWorkCb(uv_work_t *work, int status) { struct uvIdleSegment *segment = work->data; struct uv *uv = segment->uv; int rv; assert(status == 0); uv->prepare_inflight = NULL; /* Reset the creation in-progress marker. */ /* If we are closing, let's discard the segment. All pending requests * have already being fired with RAFT_CANCELED. */ if (uv->closing) { assert(queue_empty(&uv->prepare_pool)); assert(queue_empty(&uv->prepare_reqs)); if (segment->status == 0) { char errmsg[RAFT_ERRMSG_BUF_SIZE]; UvOsClose(segment->fd); UvFsRemoveFile(uv->dir, segment->filename, errmsg); } tracef("canceled creation of %s", segment->filename); RaftHeapFree(segment); uvMaybeFireCloseCb(uv); return; } /* If the request has failed, mark all pending requests as failed and * don't try to create any further segment. * * Note that if there's no pending request, we don't set the error * message, to avoid overwriting previous errors. */ if (segment->status != 0) { if (!queue_empty(&uv->prepare_reqs)) { ErrMsgTransferf(segment->errmsg, uv->io->errmsg, "create segment %s", segment->filename); uvPrepareFinishAllRequests(uv, segment->status); } uv->errored = true; RaftHeapFree(segment); return; } assert(segment->fd >= 0); tracef("completed creation of %s", segment->filename); queue_insert_tail(&uv->prepare_pool, &segment->queue); /* Let's process any pending request. */ if (!queue_empty(&uv->prepare_reqs)) { uvPrepareFinishOldestRequest(uv); } /* If we are already creating a segment, we're done. */ if (uv->prepare_inflight != NULL) { return; } /* If we have already enough prepared open segments, we're done. There * can't be any outstanding prepare requests, since if the request queue * was not empty, we would have called uvPrepareFinishOldestRequest() * above, thus reducing the pool size and making it smaller than the * target size. 
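 * Hence, if the pool has already reached the target size at this point,
 * the request queue must be empty, which the assertion below checks.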
*/ if (uvPrepareCount(uv) >= UV__TARGET_POOL_SIZE) { assert(queue_empty(&uv->prepare_reqs)); return; } /* Let's start preparing a new open segment. */ rv = uvPrepareStart(uv); if (rv != 0) { uvPrepareFinishAllRequests(uv, rv); uv->errored = true; } } /* Discard a prepared open segment, closing its file descriptor and removing the * underlying file. */ static void uvPrepareDiscard(struct uv *uv, uv_file fd, uvCounter counter) { char errmsg[RAFT_ERRMSG_BUF_SIZE]; char filename[UV__FILENAME_LEN]; assert(counter > 0); assert(fd >= 0); sprintf(filename, UV__OPEN_TEMPLATE, counter); UvOsClose(fd); UvFsRemoveFile(uv->dir, filename, errmsg); } int UvPrepare(struct uv *uv, uv_file *fd, uvCounter *counter, struct uvPrepare *req, uvPrepareCb cb) { int rv; assert(!uv->closing); if (!queue_empty(&uv->prepare_pool)) { uvPrepareConsume(uv, fd, counter); goto maybe_start; } *fd = -1; *counter = 0; req->cb = cb; queue_insert_tail(&uv->prepare_reqs, &req->queue); maybe_start: /* If we are already creating a segment, let's just wait. */ if (uv->prepare_inflight != NULL) { return 0; } rv = uvPrepareStart(uv); if (rv != 0) { goto err; } return 0; err: if (*fd != -1) { uvPrepareDiscard(uv, *fd, *counter); } else { queue_remove(&req->queue); } assert(rv != 0); return rv; } void UvPrepareClose(struct uv *uv) { assert(uv->closing); /* Cancel all pending prepare requests. */ uvPrepareFinishAllRequests(uv, RAFT_CANCELED); /* Remove any unused prepared segment. */ while (!queue_empty(&uv->prepare_pool)) { queue *head; struct uvIdleSegment *segment; head = queue_head(&uv->prepare_pool); segment = QUEUE_DATA(head, struct uvIdleSegment, queue); queue_remove(&segment->queue); uvPrepareDiscard(uv, segment->fd, segment->counter); RaftHeapFree(segment); } } #undef tracef dqlite-1.16.7/src/raft/uv_recv.c000066400000000000000000000254101465252713400164430ustar00rootroot00000000000000#include #include "../raft.h" #include "assert.h" #include "byte.h" #include "configuration.h" #include "err.h" #include "heap.h" #include "uv.h" #include "uv_encoding.h" /* The happy path for a receiving an RPC message is: * * - When a peer server successfully establishes a new connection with us, the * transport invokes our accept callback. * * - A new server object is created and added to the servers array. It starts * reading from the stream handle of the new connection. * * - The RPC message preamble is read, which contains the message type and the * message length. * * - The RPC message header is read, whose content depends on the message type. * * - Optionally, the RPC message payload is read (for AppendEntries requests). * * - The recv callback passed to raft_io->start() gets fired with the received * message. * * Possible failure modes are: * * - The peer server disconnects. In this case the read callback will fire with * UV_EOF, we'll close the stream handle and then release all memory * associated with the server object. * * - The peer server sends us invalid data. In this case we close the stream * handle and act like above. 
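 * In both cases the server object is moved to an aborting queue until
 * libuv confirms that the stream handle is closed, at which point all of
 * its memory is released.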
*/ struct uvServer { struct uv *uv; /* libuv I/O implementation object */ raft_id id; /* ID of the remote server */ char *address; /* Address of the other server */ struct uv_stream_s *stream; /* Connection handle */ uv_buf_t buf; /* Sliding buffer for reading incoming data */ uint64_t preamble[2]; /* Static buffer with the request preamble */ uv_buf_t header; /* Dynamic buffer with the request header */ uv_buf_t payload; /* Dynamic buffer with the request payload */ struct raft_message message; /* The message being received */ queue queue; /* Servers queue */ }; /* Initialize a new server object for reading requests from an incoming * connection. */ static int uvServerInit(struct uvServer *s, struct uv *uv, const raft_id id, const char *address, struct uv_stream_s *stream) { s->uv = uv; s->id = id; s->address = RaftHeapMalloc(strlen(address) + 1); if (s->address == NULL) { return RAFT_NOMEM; } strcpy(s->address, address); s->stream = stream; s->stream->data = s; s->buf.base = NULL; s->buf.len = 0; s->preamble[0] = 0; s->preamble[1] = 0; s->header.base = NULL; s->header.len = 0; s->message.type = 0; s->payload.base = NULL; s->payload.len = 0; queue_insert_tail(&uv->servers, &s->queue); return 0; } static void uvServerDestroy(struct uvServer *s) { queue_remove(&s->queue); if (s->header.base != NULL) { /* This means we were interrupted while reading the header. */ RaftHeapFree(s->header.base); switch (s->message.type) { case RAFT_IO_APPEND_ENTRIES: RaftHeapFree(s->message.append_entries.entries); break; case RAFT_IO_INSTALL_SNAPSHOT: configurationClose( &s->message.install_snapshot.conf); break; } } if (s->payload.base != NULL) { /* This means we were interrupted while reading the payload. */ RaftHeapFree(s->payload.base); } RaftHeapFree(s->address); RaftHeapFree(s->stream); } /* Invoked to initialize the read buffer for the next asynchronous read on the * socket. */ static void uvServerAllocCb(uv_handle_t *handle, size_t suggested_size, uv_buf_t *buf) { struct uvServer *s = handle->data; (void)suggested_size; assert(!s->uv->closing); /* If this is the first read of the preamble, or of the header, or of * the payload, then initialize the read buffer, according to the chunk * of data that we expect next. */ if (s->buf.len == 0) { assert(s->buf.base == NULL); /* Check if we expect the preamble. */ if (s->header.len == 0) { assert(s->preamble[0] == 0); assert(s->preamble[1] == 0); s->buf.base = (char *)s->preamble; s->buf.len = sizeof s->preamble; goto out; } /* Check if we expect the header. */ if (s->payload.len == 0) { assert(s->header.len > 0); assert(s->header.base == NULL); s->header.base = RaftHeapMalloc(s->header.len); if (s->header.base == NULL) { /* Setting all buffer fields to 0 will make * read_cb fail with ENOBUFS. */ memset(buf, 0, sizeof *buf); return; } s->buf = s->header; goto out; } /* If we get here we should be expecting the payload. */ assert(s->payload.len > 0); s->payload.base = RaftHeapMalloc(s->payload.len); if (s->payload.base == NULL) { /* Setting all buffer fields to 0 will make read_cb fail * with ENOBUFS. */ memset(buf, 0, sizeof *buf); return; } s->buf = s->payload; } out: *buf = s->buf; } /* Callback invoked afer the stream handle of this server connection has been * closed. We can release all resources associated with the server object. 
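 * (libuv guarantees that this close callback runs only once the handle is
 * fully closed, so no read or alloc callback can race with the teardown
 * below.)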
*/ static void uvServerStreamCloseCb(uv_handle_t *handle) { struct uvServer *s = handle->data; struct uv *uv = s->uv; uvServerDestroy(s); RaftHeapFree(s); uvMaybeFireCloseCb(uv); } static void uvServerAbort(struct uvServer *s) { struct uv *uv = s->uv; queue_remove(&s->queue); queue_insert_tail(&uv->aborting, &s->queue); uv_close((struct uv_handle_s *)s->stream, uvServerStreamCloseCb); } /* Invoke the receive callback. */ static void uvFireRecvCb(struct uvServer *s) { s->uv->recv_cb(s->uv->io, &s->message); /* Reset our state as we'll start reading a new message. We don't need * to release the payload buffer, since ownership was transferred to the * user. */ memset(s->preamble, 0, sizeof s->preamble); raft_free(s->header.base); s->message.type = 0; s->header.base = NULL; s->header.len = 0; s->payload.base = NULL; s->payload.len = 0; } /* Callback invoked when data has been read from the socket. */ static void uvServerReadCb(uv_stream_t *stream, ssize_t nread, const uv_buf_t *buf) { struct uvServer *s = stream->data; int rv; (void)buf; assert(!s->uv->closing); /* If the read was successful, let's check if we have received all the * data we expected. */ if (nread > 0) { size_t n = (size_t)nread; /* We shouldn't have read more data than the pending amount. */ assert(n <= s->buf.len); /* Advance the read window */ s->buf.base += n; s->buf.len -= n; /* If there's more data to read in order to fill the current * read buffer, just return, we'll be invoked again. */ if (s->buf.len > 0) { return; } if (s->header.len == 0) { /* If the header buffer is not set, it means that we've * just completed reading the preamble. */ assert(s->header.base == NULL); s->header.len = (size_t)byteFlip64(s->preamble[1]); /* The length of the header must be greater than zero. */ if (s->header.len == 0) { tracef("message has zero length"); goto abort; } } else if (s->payload.len == 0) { /* If the payload buffer is not set, it means we just * completed reading the message header. */ uint64_t type; assert(s->header.base != NULL); type = byteFlip64(s->preamble[0]); /* Only use first 2 bytes of the type. Normally we would * check if type doesn't overflow UINT16_MAX, but we * don't do this to allow future legacy nodes to still * handle messages that include extra information in the * 6 unused bytes of the type field of the preamble. * TODO: This is preparation to add the version of the * message in the raft preamble. Once this change has * been active for sufficiently long time, we can start * encoding the version in some of the remaining bytes * of s->preamble[0]. */ rv = uvDecodeMessage((uint16_t)type, &s->header, &s->message, &s->payload.len); if (rv != 0) { tracef("decode message: %s", errCodeToString(rv)); goto abort; } s->message.server_id = s->id; s->message.server_address = s->address; /* If the message has no payload, we're done. */ if (s->payload.len == 0) { uvFireRecvCb(s); } } else { /* If we get here it means that we've just completed * reading the payload. 
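 * Ownership of the payload buffer passes to the user's recv callback, so
 * it is not released here.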
TODO: avoid converting from * uv_buf_t */ struct raft_buffer payload; assert(s->payload.base != NULL); assert(s->payload.len > 0); switch (s->message.type) { case RAFT_IO_APPEND_ENTRIES: payload.base = s->payload.base; payload.len = s->payload.len; (void)uvDecodeEntriesBatch( payload.base, 0, s->message.append_entries.entries, s->message.append_entries .n_entries, false); break; case RAFT_IO_INSTALL_SNAPSHOT: s->message.install_snapshot.data.base = s->payload.base; break; default: /* We should never have read a payload * in the first place */ assert(0); } uvFireRecvCb(s); } /* Mark that we're done with this chunk. When the alloc callback * will trigger again it will notice that it needs to change the * read buffer. */ assert(s->buf.len == 0); s->buf.base = NULL; return; } /* The if nread>0 condition above should always exit the function with a * goto abort or a return. */ assert(nread <= 0); if (nread == 0) { /* Empty read */ return; } if (nread != UV_EOF) { tracef("receive data: %s", uv_strerror((int)nread)); } abort: uvServerAbort(s); } /* Start reading incoming requests. */ static int uvServerStart(struct uvServer *s) { int rv; rv = uv_read_start(s->stream, uvServerAllocCb, uvServerReadCb); if (rv != 0) { tracef("start reading: %s", uv_strerror(rv)); return RAFT_IOERR; } return 0; } static int uvAddServer(struct uv *uv, raft_id id, const char *address, struct uv_stream_s *stream) { struct uvServer *server; int rv; /* Initialize the new connection */ server = RaftHeapMalloc(sizeof *server); if (server == NULL) { rv = RAFT_NOMEM; goto err; } rv = uvServerInit(server, uv, id, address, stream); if (rv != 0) { goto err_after_server_alloc; } /* This will start reading requests. */ rv = uvServerStart(server); if (rv != 0) { goto err_after_init_server; } return 0; err_after_init_server: uvServerDestroy(server); err_after_server_alloc: raft_free(server); err: assert(rv != 0); return rv; } static void uvRecvAcceptCb(struct raft_uv_transport *transport, raft_id id, const char *address, struct uv_stream_s *stream) { struct uv *uv = transport->data; int rv; assert(!uv->closing); rv = uvAddServer(uv, id, address, stream); if (rv != 0) { tracef("add server: %s", errCodeToString(rv)); uv_close((struct uv_handle_s *)stream, (uv_close_cb)RaftHeapFree); } } int UvRecvStart(struct uv *uv) { int rv; rv = uv->transport->listen(uv->transport, uvRecvAcceptCb); if (rv != 0) { return rv; } return 0; } void UvRecvClose(struct uv *uv) { while (!queue_empty(&uv->servers)) { queue *head; struct uvServer *server; head = queue_head(&uv->servers); server = QUEUE_DATA(head, struct uvServer, queue); uvServerAbort(server); } } #undef tracef dqlite-1.16.7/src/raft/uv_segment.c000066400000000000000000000720221465252713400171470ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "array.h" #include "assert.h" #include "byte.h" #include "configuration.h" #include "entry.h" #include "heap.h" #include "uv.h" #include "uv_encoding.h" /* Check if the given filename matches the one of a closed segment (xxx-yyy), or * of an open segment (open-xxx), and fill the given info structure if so. * * Return true if the filename matched, false otherwise. 
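 *
 * For example, "0000000000000001-0000000000000002" would match as a
 * closed segment and "open-1" as an open segment (assuming the usual
 * zero-padded numeric templates).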
*/ static bool uvSegmentInfoMatch(const char *filename, struct uvSegmentInfo *info) { int consumed; int matched; size_t n; size_t filename_len = strnlen(filename, UV__FILENAME_LEN + 1); assert(filename_len < UV__FILENAME_LEN); matched = sscanf(filename, UV__CLOSED_TEMPLATE "%n", &info->first_index, &info->end_index, &consumed); if (matched == 2 && consumed == (int)filename_len) { info->is_open = false; goto match; } matched = sscanf(filename, UV__OPEN_TEMPLATE "%n", &info->counter, &consumed); if (matched == 1 && consumed == (int)filename_len) { info->is_open = true; goto match; } return false; match: n = sizeof(info->filename) - 1; strncpy(info->filename, filename, n); info->filename[n] = '\0'; return true; } int uvSegmentInfoAppendIfMatch(const char *filename, struct uvSegmentInfo *infos[], size_t *n_infos, bool *appended) { struct uvSegmentInfo info; bool matched; int rv; /* Check if it's a closed or open filename */ matched = uvSegmentInfoMatch(filename, &info); /* If this is neither a closed or an open segment, return. */ if (!matched) { *appended = false; return 0; } ARRAY__APPEND(struct uvSegmentInfo, info, infos, n_infos, rv); if (rv == -1) { return RAFT_NOMEM; } *appended = true; return 0; } /* Compare two segments to decide which one is more recent. */ static int uvSegmentInfoCompare(const void *p1, const void *p2) { struct uvSegmentInfo *s1 = (struct uvSegmentInfo *)p1; struct uvSegmentInfo *s2 = (struct uvSegmentInfo *)p2; /* Closed segments are less recent than open segments. */ if (s1->is_open && !s2->is_open) { return 1; } if (!s1->is_open && s2->is_open) { return -1; } /* If the segments are open, compare the counter. */ if (s1->is_open) { assert(s2->is_open); assert(s1->counter != s2->counter); return s1->counter < s2->counter ? -1 : 1; } /* If the segments are closed, compare the first index. The index ranges * must be disjoint. */ if (s2->first_index > s1->end_index) { return -1; } return 1; } void uvSegmentSort(struct uvSegmentInfo *infos, size_t n_infos) { qsort(infos, n_infos, sizeof *infos, uvSegmentInfoCompare); } int uvSegmentKeepTrailing(struct uv *uv, struct uvSegmentInfo *segments, size_t n, raft_index last_index, size_t trailing, char *errmsg) { raft_index retain_index; size_t i; int rv; assert(last_index > 0); assert(n > 0); if (last_index <= trailing) { return 0; } /* Index of the oldest entry we want to retain. */ retain_index = last_index - trailing + 1; for (i = 0; i < n; i++) { struct uvSegmentInfo *segment = &segments[i]; if (segment->is_open) { break; } if (trailing == 0 || segment->end_index < retain_index) { rv = UvFsRemoveFile(uv->dir, segment->filename, errmsg); if (rv != 0) { ErrMsgWrapf(errmsg, "delete closed segment %s", segment->filename); return rv; } } else { break; } } return 0; } /* Read a segment file and return its format version. */ static int uvReadSegmentFile(struct uv *uv, const char *filename, struct raft_buffer *buf, uint64_t *format) { char errmsg[RAFT_ERRMSG_BUF_SIZE]; int rv; rv = UvFsReadFile(uv->dir, filename, buf, errmsg); if (rv != 0) { ErrMsgTransfer(errmsg, uv->io->errmsg, "read file"); return RAFT_IOERR; } if (buf->len < 8) { ErrMsgPrintf(uv->io->errmsg, "file has only %zu bytes", buf->len); RaftHeapFree(buf->base); return RAFT_IOERR; } *format = byteFlip64(*(uint64_t *)buf->base); return 0; } /* Consume the content buffer, returning a pointer to the current position and * advancing the offset of n bytes. Return an error if not enough bytes are * available. 
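 *
 * Usage sketch (hypothetical caller pulling an 8-byte word out of the
 * content; names are illustrative):
 *
 *   void *word;
 *   rv = uvConsumeContent(content, &offset, sizeof(uint64_t), &word,
 *                         errmsg);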
*/ static int uvConsumeContent(const struct raft_buffer *content, size_t *offset, size_t n, void **data, char *errmsg) { if (*offset + n > content->len) { size_t remaining = content->len - *offset; ErrMsgPrintf(errmsg, "short read: %zu bytes instead of %zu", remaining, n); return RAFT_IOERR; } if (data != NULL) { *data = &((uint8_t *)content->base)[*offset]; } *offset += n; return 0; } /* Load a single batch of entries from a segment. * * Set @last to #true if the loaded batch is the last one. */ static int uvLoadEntriesBatch(struct uv *uv, const struct raft_buffer *content, struct raft_entry **entries, unsigned *n_entries, size_t *offset, /* Offset of last batch */ bool *last) { void *checksums; /* CRC32 checksums */ void *batch; /* Entries batch */ unsigned long n; /* Number of entries in the batch */ unsigned max_n; /* Maximum number of entries we expect */ unsigned i; /* Iterate through the entries */ struct raft_buffer header; /* Batch header */ struct raft_buffer data; /* Batch data */ uint32_t crc1; /* Target checksum */ uint32_t crc2; /* Actual checksum */ char errmsg[RAFT_ERRMSG_BUF_SIZE]; size_t start; int rv; /* Save the current offset, to provide more information when logging. */ start = *offset; /* Read the checksums. */ rv = uvConsumeContent(content, offset, sizeof(uint32_t) * 2, &checksums, errmsg); if (rv != 0) { ErrMsgTransfer(errmsg, uv->io->errmsg, "read preamble"); return RAFT_IOERR; } /* Read the first 8 bytes of the batch, which contains the number of * entries in the batch. */ rv = uvConsumeContent(content, offset, sizeof(uint64_t), &batch, errmsg); if (rv != 0) { ErrMsgTransfer(errmsg, uv->io->errmsg, "read preamble"); return RAFT_IOERR; } n = (size_t)byteFlip64(*(uint64_t *)batch); if (n == 0) { ErrMsgPrintf(uv->io->errmsg, "entries count in preamble is zero"); rv = RAFT_CORRUPT; goto err; } /* Very optimistic upper bound of the number of entries we should * expect. This is mainly a protection against allocating too much * memory. Each entry will consume at least 4 words (for term, type, * size and payload). */ max_n = UV__MAX_SEGMENT_SIZE / (sizeof(uint64_t) * 4); if (n > max_n) { ErrMsgPrintf(uv->io->errmsg, "entries count %lu in preamble is too high", n); rv = RAFT_CORRUPT; goto err; } /* Consume the batch header, excluding the first 8 bytes containing the * number of entries, which we have already read. */ header.len = uvSizeofBatchHeader(n, true); header.base = batch; rv = uvConsumeContent(content, offset, uvSizeofBatchHeader(n, true) - sizeof(uint64_t), NULL, errmsg); if (rv != 0) { ErrMsgTransfer(errmsg, uv->io->errmsg, "read header"); rv = RAFT_IOERR; goto err; } /* Check batch header integrity. */ crc1 = byteFlip32(((uint32_t *)checksums)[0]); crc2 = byteCrc32(header.base, header.len, 0); if (crc1 != crc2) { ErrMsgPrintf(uv->io->errmsg, "header checksum mismatch"); rv = RAFT_CORRUPT; goto err; } /* Decode the batch header, allocating the entries array. */ uint64_t local_data_size = 0; rv = uvDecodeBatchHeader(header.base, entries, n_entries, &local_data_size); if (rv != 0) { goto err; } /* Calculate the total size of the batch data. TODO this computation * should be rolled into the actual parsing part somehow. 
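 * Note that when DQLITE_NEXT is defined, each entry's fixed-size local
 * data buffer is part of the on-disk payload and is counted here too.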
*/ data.len = 0; for (i = 0; i < n; i++) { data.len += (*entries)[i].buf.len; #ifdef DQLITE_NEXT data.len += sizeof((*entries)[i].local_data); #endif } data.base = (uint8_t *)content->base + *offset; /* Consume the batch data */ rv = uvConsumeContent(content, offset, data.len, NULL, errmsg); if (rv != 0) { ErrMsgTransfer(errmsg, uv->io->errmsg, "read data"); rv = RAFT_IOERR; goto err_after_header_decode; } /* Check batch data integrity. */ crc1 = byteFlip32(((uint32_t *)checksums)[1]); crc2 = byteCrc32(data.base, data.len, 0); if (crc1 != crc2) { tracef("batch is bad"); ErrMsgPrintf(uv->io->errmsg, "data checksum mismatch"); rv = RAFT_CORRUPT; goto err_after_header_decode; } rv = uvDecodeEntriesBatch(content->base, *offset - data.len, *entries, *n_entries, local_data_size); if (rv != 0) { goto err_after_header_decode; } *last = *offset == content->len; return 0; err_after_header_decode: RaftHeapFree(*entries); err: *entries = NULL; *n_entries = 0; assert(rv != 0); *offset = start; return rv; } /* Append to @entries2 all entries in @entries1. */ static int extendEntries(const struct raft_entry *entries1, const size_t n_entries1, struct raft_entry **entries2, size_t *n_entries2) { struct raft_entry *entries; /* To re-allocate the given entries */ size_t i; entries = raft_realloc(*entries2, (*n_entries2 + n_entries1) * sizeof *entries); if (entries == NULL) { return RAFT_NOMEM; } for (i = 0; i < n_entries1; i++) { entries[*n_entries2 + i] = entries1[i]; } *entries2 = entries; *n_entries2 += n_entries1; return 0; } int uvSegmentLoadClosed(struct uv *uv, struct uvSegmentInfo *info, struct raft_entry *entries[], size_t *n) { bool empty; /* Whether the file is empty */ uint64_t format; /* Format version */ bool last; /* Whether the last batch was reached */ struct raft_entry *tmp_entries; /* Entries in current batch */ struct raft_buffer buf; /* Segment file content */ size_t offset; /* Content read cursor */ unsigned tmp_n; /* Number of entries in current batch */ unsigned expected_n; /* Number of entries that we expect to find */ int i; char errmsg[RAFT_ERRMSG_BUF_SIZE]; int rv; expected_n = (unsigned)(info->end_index - info->first_index + 1); /* If the segment is completely empty, just bail out. */ rv = UvFsFileIsEmpty(uv->dir, info->filename, &empty, errmsg); if (rv != 0) { tracef("stat %s: %s", info->filename, errmsg); rv = RAFT_IOERR; goto err; } if (empty) { ErrMsgPrintf(uv->io->errmsg, "file is empty"); rv = RAFT_CORRUPT; goto err; } /* Open the segment file. */ rv = uvReadSegmentFile(uv, info->filename, &buf, &format); if (rv != 0) { goto err; } if (format != UV__DISK_FORMAT) { ErrMsgPrintf(uv->io->errmsg, "unexpected format version %ju", format); rv = RAFT_CORRUPT; goto err_after_read; } /* Load all batches in the segment. */ *entries = NULL; *n = 0; last = false; offset = sizeof format; for (i = 1; !last; i++) { rv = uvLoadEntriesBatch(uv, &buf, &tmp_entries, &tmp_n, &offset, &last); if (rv != 0) { ErrMsgWrapf(uv->io->errmsg, "entries batch %u starting at byte %zu", i, offset); /* Clean up the last allocation from extendEntries. */ goto err_after_extend_entries; } rv = extendEntries(tmp_entries, tmp_n, entries, n); if (rv != 0) { goto err_after_batch_load; } raft_free(tmp_entries); } if (*n != expected_n) { ErrMsgPrintf(uv->io->errmsg, "found %zu entries (expected %u)", *n, expected_n); rv = RAFT_CORRUPT; goto err_after_extend_entries; } assert(i > 1); /* At least one batch was loaded. */ assert(*n > 0); /* At least one entry was loaded. 
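 * This follows from the check above: *n == expected_n, and
 * expected_n = end_index - first_index + 1 >= 1.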
*/ return 0; err_after_batch_load: raft_free(tmp_entries[0].batch); raft_free(tmp_entries); err_after_extend_entries: if (*entries != NULL) { RaftHeapFree(*entries); } err_after_read: RaftHeapFree(buf.base); err: assert(rv != 0); return rv; } /* Check if the content of the segment file contains all zeros from the current * offset onward. */ static bool uvContentHasOnlyTrailingZeros(const struct raft_buffer *buf, size_t offset) { size_t i; for (i = offset; i < buf->len; i++) { if (((char *)buf->base)[i] != 0) { return false; } } return true; } /* Load all entries contained in an open segment. */ static int uvSegmentLoadOpen(struct uv *uv, struct uvSegmentInfo *info, struct raft_entry *entries[], size_t *n, raft_index *next_index) { raft_index first_index; /* Index of first entry in segment */ bool all_zeros; /* Whether the file is zero'ed */ bool empty; /* Whether the segment file is empty */ bool remove = false; /* Whether to remove this segment */ bool last = false; /* Whether the last batch was reached */ uint64_t format; /* Format version */ size_t n_batches = 0; /* Number of loaded batches */ struct raft_entry *tmp_entries; /* Entries in current batch */ struct raft_buffer buf = {0}; /* Segment file content */ size_t offset; /* Content read cursor */ unsigned tmp_n_entries; /* Number of entries in current batch */ int i; char errmsg[RAFT_ERRMSG_BUF_SIZE]; int rv; first_index = *next_index; rv = UvFsFileIsEmpty(uv->dir, info->filename, &empty, errmsg); if (rv != 0) { tracef("check if %s is empty: %s", info->filename, errmsg); rv = RAFT_IOERR; goto err; } if (empty) { /* Empty segment, let's discard it. */ tracef("remove empty open segment %s", info->filename); remove = true; goto done; } rv = uvReadSegmentFile(uv, info->filename, &buf, &format); if (rv != 0) { goto err; } /* Check that the format is the expected one, or perhaps 0, indicating * that the segment was allocated but never written. */ offset = sizeof format; if (format != UV__DISK_FORMAT) { if (format == 0) { all_zeros = uvContentHasOnlyTrailingZeros(&buf, offset); if (all_zeros) { /* This is equivalent to the empty case, let's * remove the segment. */ tracef("remove zeroed open segment %s", info->filename); remove = true; RaftHeapFree(buf.base); buf.base = NULL; goto done; } } ErrMsgPrintf(uv->io->errmsg, "unexpected format version %ju", format); rv = RAFT_CORRUPT; goto err_after_read; } /* Load all batches in the segment. */ for (i = 1; !last; i++) { rv = uvLoadEntriesBatch(uv, &buf, &tmp_entries, &tmp_n_entries, &offset, &last); if (rv != 0) { /* If this isn't a decoding error, just bail out. */ if (rv != RAFT_CORRUPT) { ErrMsgWrapf( uv->io->errmsg, "entries batch %u starting at byte %zu", i, offset); goto err_after_read; } /* If this is a decoding error, and not an OS error, * check if the rest of the file is filled with zeros. * In that case we assume that the server shutdown * uncleanly and we just truncate this incomplete data. */ all_zeros = uvContentHasOnlyTrailingZeros(&buf, offset); if (!all_zeros) { tracef("%s has non-zero trail", info->filename); } tracef( "truncate open segment %s at %zu (batch %d), since " "it has " "corrupted " "entries", info->filename, offset, i); break; } rv = extendEntries(tmp_entries, tmp_n_entries, entries, n); if (rv != 0) { goto err_after_batch_load; } raft_free(tmp_entries); n_batches++; *next_index += tmp_n_entries; } if (n_batches == 0) { RaftHeapFree(buf.base); buf.base = NULL; remove = true; } done: /* If the segment has no valid entries in it, we remove it. 
Otherwise we * rename it and keep it. */ if (remove) { rv = UvFsRemoveFile(uv->dir, info->filename, errmsg); if (rv != 0) { tracef("unlink %s: %s", info->filename, errmsg); rv = RAFT_IOERR; goto err_after_read; } } else { char filename[UV__SEGMENT_FILENAME_BUF_SIZE]; raft_index end_index = *next_index - 1; /* At least one entry was loaded */ assert(end_index >= first_index); int nb = snprintf(filename, sizeof(filename), UV__CLOSED_TEMPLATE, first_index, end_index); if ((nb < 0) || ((size_t)nb >= sizeof(filename))) { tracef("snprintf failed: %d", nb); rv = RAFT_IOERR; goto err; } tracef("finalize %s into %s", info->filename, filename); rv = UvFsTruncateAndRenameFile( uv->dir, (size_t)offset, info->filename, filename, errmsg); if (rv != 0) { tracef("finalize %s: %s", info->filename, errmsg); rv = RAFT_IOERR; goto err; } info->is_open = false; info->first_index = first_index; info->end_index = end_index; memset(info->filename, '\0', sizeof(info->filename)); _Static_assert(sizeof(info->filename) >= sizeof(filename), "Destination buffer too small"); /* info->filename is zeroed out, it is at least as large as filename, * and we checked that nb < sizeof(filename) -> we won't overflow and * the result will be zero-terminated. */ memcpy(info->filename, filename, (size_t)nb); } return 0; err_after_batch_load: raft_free(tmp_entries[0].batch); raft_free(tmp_entries); err_after_read: if (buf.base != NULL) { RaftHeapFree(buf.base); } err: assert(rv != 0); return rv; } /* Ensure that the write buffer of the given segment is large enough to hold * the given number of bytes. */ static int uvEnsureSegmentBufferIsLargeEnough(struct uvSegmentBuffer *b, size_t size) { unsigned n = (unsigned)(size / b->block_size); void *base; size_t len; if (b->arena.len >= size) { assert(b->arena.base != NULL); return 0; } if (size % b->block_size != 0) { n++; } len = b->block_size * n; base = raft_aligned_alloc(b->block_size, len); if (base == NULL) { return RAFT_NOMEM; } memset(base, 0, len); /* If the current arena is initialized, we need to copy its content, * since it might have data that we want to retain in the next write.
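 *
 * For example (illustrative numbers): with block_size = 4096, a request
 * for 5000 bytes rounds n up to 2, allocates an 8192-byte aligned arena,
 * and copies the old arena's bytes to the front of the new one.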
*/ if (b->arena.base != NULL) { assert(b->arena.len >= b->block_size); memcpy(base, b->arena.base, b->arena.len); raft_aligned_free(b->block_size, b->arena.base); } b->arena.base = base; b->arena.len = len; return 0; } void uvSegmentBufferInit(struct uvSegmentBuffer *b, size_t block_size) { b->block_size = block_size; b->arena.base = NULL; b->arena.len = 0; b->n = 0; } void uvSegmentBufferClose(struct uvSegmentBuffer *b) { if (b->arena.base != NULL) { raft_aligned_free(b->block_size, b->arena.base); } } int uvSegmentBufferFormat(struct uvSegmentBuffer *b) { int rv; void *cursor; size_t n; assert(b->n == 0); n = sizeof(uint64_t); rv = uvEnsureSegmentBufferIsLargeEnough(b, n); if (rv != 0) { return rv; } b->n = n; cursor = b->arena.base; bytePut64(&cursor, UV__DISK_FORMAT); return 0; } int uvSegmentBufferAppend(struct uvSegmentBuffer *b, const struct raft_entry entries[], unsigned n_entries) { size_t size; /* Total size of the batch */ uint32_t crc1; /* Header checksum */ uint32_t crc2; /* Data checksum */ void *crc1_p; /* Pointer to header checksum slot */ void *crc2_p; /* Pointer to data checksum slot */ void *header; /* Pointer to the header section */ void *cursor; unsigned i; int rv; size = sizeof(uint32_t) * 2; /* CRC checksums */ size += uvSizeofBatchHeader(n_entries, true); /* Batch header */ for (i = 0; i < n_entries; i++) { /* Entries data */ size += bytePad64(entries[i].buf.len); #ifdef DQLITE_NEXT size += sizeof(struct raft_entry_local_data); #endif } rv = uvEnsureSegmentBufferIsLargeEnough(b, b->n + size); if (rv != 0) { return rv; } cursor = b->arena.base + b->n; /* Placeholder of the checksums */ crc1_p = cursor; bytePut32(&cursor, 0); crc2_p = cursor; bytePut32(&cursor, 0); /* Batch header */ header = cursor; uvEncodeBatchHeader(entries, n_entries, cursor, true /* encode local data */); crc1 = byteCrc32(header, uvSizeofBatchHeader(n_entries, true), 0); cursor = (uint8_t *)cursor + uvSizeofBatchHeader(n_entries, true); /* Batch data */ crc2 = 0; for (i = 0; i < n_entries; i++) { const struct raft_entry *entry = &entries[i]; assert(entry->buf.len % sizeof(uint64_t) == 0); memcpy(cursor, entry->buf.base, entry->buf.len); crc2 = byteCrc32(cursor, entry->buf.len, crc2); cursor = (uint8_t *)cursor + entry->buf.len; static_assert(sizeof(entry->local_data.buf) % sizeof(uint64_t) == 0, "bad size for entry local data"); #ifdef DQLITE_NEXT size_t local_data_size = sizeof(entry->local_data.buf); memcpy(cursor, entry->local_data.buf, local_data_size); crc2 = byteCrc32(cursor, local_data_size, crc2); cursor = (uint8_t *)cursor + local_data_size; #endif } bytePut32(&crc1_p, crc1); bytePut32(&crc2_p, crc2); b->n += size; return 0; } void uvSegmentBufferFinalize(struct uvSegmentBuffer *b, uv_buf_t *out) { unsigned n_blocks; unsigned tail; n_blocks = (unsigned)(b->n / b->block_size); if (b->n % b->block_size != 0) { n_blocks++; } /* Set the remainder of the last block to 0 */ tail = (unsigned)(b->n % b->block_size); if (tail != 0) { memset(b->arena.base + b->n, 0, b->block_size - tail); } out->base = b->arena.base; out->len = n_blocks * b->block_size; } void uvSegmentBufferReset(struct uvSegmentBuffer *b, unsigned retain) { assert(b->n > 0); assert(b->arena.base != NULL); if (retain == 0) { b->n = 0; memset(b->arena.base, 0, b->block_size); return; } memcpy(b->arena.base, b->arena.base + retain * b->block_size, b->block_size); b->n = b->n % b->block_size; } /* When a corrupted segment is detected, the segment is renamed. 
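 * For example (hypothetical timestamp), open-3 would become something
 * like corrupt-1721894400000000000-open-3, where the number is the
 * wall-clock time in nanoseconds taken at rename time.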
 * Upon a restart, raft will not detect the segment anymore and will try * to start without it. */ #define CORRUPT_FILE_FMT "corrupt-%" PRId64 "-%s" static void uvMoveCorruptSegment(struct uv *uv, struct uvSegmentInfo *info) { char errmsg[RAFT_ERRMSG_BUF_SIZE] = {0}; char new_filename[UV__FILENAME_LEN + 1] = {0}; size_t sz = sizeof(new_filename); int rv; struct timespec ts = {0}; /* Ignore errors */ clock_gettime(CLOCK_REALTIME, &ts); int64_t ns = ts.tv_sec * 1000000000 + ts.tv_nsec; rv = snprintf(new_filename, sz, CORRUPT_FILE_FMT, ns, info->filename); if (rv < 0 || rv >= (int)sz) { tracef("snprintf %d", rv); return; } rv = UvFsRenameFile(uv->dir, info->filename, new_filename, errmsg); if (rv != 0) { tracef("%s", errmsg); return; } } /* * On startup, raft will try to recover when a corrupt segment is detected. * * When a corrupt open segment is encountered, it, and all subsequent open * segments, are renamed. Not renaming newer, possibly non-corrupt, open * segments could lead to loading inconsistent data. * * When a corrupt closed segment is encountered, it is renamed only when it is * the last closed segment; in that case all open segments are renamed too. */ static void uvRecoverFromCorruptSegment(struct uv *uv, size_t i_corrupt, struct uvSegmentInfo *infos, size_t n_infos) { struct uvSegmentInfo *info = &infos[i_corrupt]; if (info->is_open) { for (size_t i = i_corrupt; i < n_infos; ++i) { info = &infos[i]; uvMoveCorruptSegment(uv, info); } } else { size_t i_next = i_corrupt + 1; /* last segment or last closed segment. */ if (i_next == n_infos || infos[i_next].is_open) { for (size_t i = i_corrupt; i < n_infos; ++i) { info = &infos[i]; uvMoveCorruptSegment(uv, info); } } } } int uvSegmentLoadAll(struct uv *uv, const raft_index start_index, struct uvSegmentInfo *infos, size_t n_infos, struct raft_entry **entries, size_t *n_entries) { raft_index next_index; /* Next entry to load from disk */ struct raft_entry *tmp_entries; /* Entries in current segment */ size_t tmp_n; /* Number of entries in current segment */ size_t i; int rv; assert(start_index >= 1); assert(n_infos > 0); *entries = NULL; *n_entries = 0; next_index = start_index; for (i = 0; i < n_infos; i++) { struct uvSegmentInfo *info = &infos[i]; tracef("load segment %s", info->filename); if (info->is_open) { rv = uvSegmentLoadOpen(uv, info, entries, n_entries, &next_index); ErrMsgWrapf(uv->io->errmsg, "load open segment %s", info->filename); if (rv != 0) { if (rv == RAFT_CORRUPT && uv->auto_recovery) { uvRecoverFromCorruptSegment( uv, i, infos, n_infos); } goto err; } } else { assert(info->first_index >= start_index); assert(info->first_index <= info->end_index); /* Check that the start index encoded in the name of the * segment matches what we expect and there are no gaps * in the sequence.
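 *
 * For example (assuming the zero-padded first-last index naming of
 * UV__CLOSED_TEMPLATE): after loading the closed segment
 * 0000000000000001-0000000000000100, next_index is 101, so the next
 * closed segment's name must start with 0000000000000101; anything else
 * indicates a gap and the log is treated as corrupt.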
*/ if (info->first_index != next_index) { ErrMsgPrintf(uv->io->errmsg, "unexpected closed segment %s: " "first index should " "have been %llu", info->filename, next_index); rv = RAFT_CORRUPT; goto err; } rv = uvSegmentLoadClosed(uv, info, &tmp_entries, &tmp_n); if (rv != 0) { ErrMsgWrapf(uv->io->errmsg, "load closed segment %s", info->filename); if (rv == RAFT_CORRUPT && uv->auto_recovery) { uvRecoverFromCorruptSegment( uv, i, infos, n_infos); } goto err; } assert(tmp_n > 0); rv = extendEntries(tmp_entries, tmp_n, entries, n_entries); if (rv != 0) { /* TODO: release memory of entries in * tmp_entries */ goto err; } raft_free(tmp_entries); next_index += tmp_n; } } return 0; err: assert(rv != 0); /* Free any batch that we might have allocated and the entries array as * well. */ if (*entries != NULL) { void *batch = NULL; for (i = 0; i < *n_entries; i++) { struct raft_entry *entry = &(*entries)[i]; if (entry->batch != batch) { batch = entry->batch; raft_free(batch); } } raft_free(*entries); *entries = NULL; *n_entries = 0; } return rv; } /* Write a closed segment */ static int uvWriteClosedSegment(struct uv *uv, raft_index first_index, raft_index last_index, const struct raft_buffer *conf) { char filename[UV__FILENAME_LEN]; struct uvSegmentBuffer buf = {0}; struct raft_buffer data; struct raft_entry entry = {0}; size_t cap; char errmsg[RAFT_ERRMSG_BUF_SIZE]; int rv; assert(first_index <= last_index); /* Render the path */ sprintf(filename, UV__CLOSED_TEMPLATE, first_index, last_index); /* Make sure that the given encoded configuration fits in the first * block */ cap = uv->block_size - (sizeof(uint64_t) /* Format version */ + sizeof(uint64_t) /* Checksums */ + uvSizeofBatchHeader(1, true /* include local bufs */)); if (conf->len > cap) { return RAFT_TOOBIG; } uvSegmentBufferInit(&buf, uv->block_size); rv = uvSegmentBufferFormat(&buf); if (rv != 0) { return rv; } entry.term = 1; entry.type = RAFT_CHANGE; entry.buf = *conf; rv = uvSegmentBufferAppend(&buf, &entry, 1); if (rv != 0) { uvSegmentBufferClose(&buf); return rv; } data.base = buf.arena.base; data.len = buf.n; rv = UvFsMakeFile(uv->dir, filename, &data, 1, errmsg); uvSegmentBufferClose(&buf); if (rv != 0) { tracef("write segment %s: %s", filename, errmsg); return RAFT_IOERR; } return 0; } int uvSegmentCreateFirstClosed(struct uv *uv, const struct raft_configuration *configuration) { return uvSegmentCreateClosedWithConfiguration(uv, 1, configuration); } int uvSegmentCreateClosedWithConfiguration( struct uv *uv, raft_index index, const struct raft_configuration *configuration) { struct raft_buffer buf; char filename[UV__FILENAME_LEN]; int rv; /* Render the path */ sprintf(filename, UV__CLOSED_TEMPLATE, index, index); /* Encode the given configuration. 
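 * The encoded bytes become the payload of a single RAFT_CHANGE entry with
 * term 1, written by uvWriteClosedSegment above into a closed segment
 * named <index>-<index>.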
*/ rv = configurationEncode(configuration, &buf); if (rv != 0) { goto err; } /* Write the file */ rv = uvWriteClosedSegment(uv, index, index, &buf); if (rv != 0) { goto err_after_configuration_encode; } raft_free(buf.base); rv = UvFsSyncDir(uv->dir, uv->io->errmsg); if (rv != 0) { return RAFT_IOERR; } return 0; err_after_configuration_encode: raft_free(buf.base); err: assert(rv != 0); return rv; } int uvSegmentTruncate(struct uv *uv, struct uvSegmentInfo *segment, raft_index index) { char filename[UV__FILENAME_LEN]; struct raft_entry *entries; struct uvSegmentBuffer buf; struct raft_buffer data; size_t n; unsigned m; char errmsg[RAFT_ERRMSG_BUF_SIZE]; int rv; assert(!segment->is_open); tracef("truncate %llu-%llu at %llu", segment->first_index, segment->end_index, index); rv = uvSegmentLoadClosed(uv, segment, &entries, &n); if (rv != 0) { ErrMsgWrapf(uv->io->errmsg, "load closed segment %s", segment->filename); goto out; } /* Discard all entries after the truncate index (inclusive) */ assert(index - segment->first_index < n); m = (unsigned)(index - segment->first_index); uvSegmentBufferInit(&buf, uv->block_size); rv = uvSegmentBufferFormat(&buf); if (rv != 0) { goto out_after_buffer_init; } rv = uvSegmentBufferAppend(&buf, entries, m); if (rv != 0) { goto out_after_buffer_init; } /* Render the path. * * TODO: we should use a temporary file name so in case of crash we * don't consider this segment as corrupted. */ sprintf(filename, UV__CLOSED_TEMPLATE, segment->first_index, index - 1); data.base = buf.arena.base; data.len = buf.n; rv = UvFsMakeFile(uv->dir, filename, &data, 1, errmsg); if (rv != 0) { tracef("write %s: %s", filename, errmsg); rv = RAFT_IOERR; goto out_after_buffer_init; } out_after_buffer_init: uvSegmentBufferClose(&buf); entryBatchesDestroy(entries, n); out: return rv; } #undef tracef dqlite-1.16.7/src/raft/uv_send.c000066400000000000000000000323701465252713400164400ustar00rootroot00000000000000#include <string.h> #include "../raft.h" #include "assert.h" #include "heap.h" #include "uv.h" #include "uv_encoding.h" /* The happy path for a raft_io_send request is: * * - Get the uvClient object whose address matches the one of the target * server. * - Encode the message and write it using the uvClient's TCP handle. * - Once the write completes, fire the send request callback. * * Possible failure modes are: * * - The uv->clients queue has no client object with a matching address. In * this case, add a new client object to the queue, add the send request to * the queue of pending requests and submit a connection request. Once the * connection request succeeds, try to write the encoded request to the * connected stream handle. If the connection request fails, schedule another * attempt. * * - The uv->clients queue has a client object which is not connected. Add the * send request to the pending queue, and, if there's no connection attempt * already in progress, start a new one. * * - The write request fails (either synchronously or asynchronously). In this * case we fire the request callback with an error, close the connection * stream, and start a re-connection attempt. */ /* Maximum number of requests that can be buffered.
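 *
 * When a connection attempt fails with more than this many requests
 * parked, uvClientConnectCb below evicts the oldest ones, failing them
 * with RAFT_NOCONNECTION.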
*/ #define UV__CLIENT_MAX_PENDING 3 struct uvClient { struct uv *uv; /* libuv I/O implementation object */ struct uv_timer_s timer; /* Schedule connection attempts */ struct raft_uv_connect connect; /* Connection request */ struct uv_stream_s *stream; /* Current connection handle */ struct uv_stream_s *old_stream; /* Connection handle being closed */ unsigned n_connect_attempt; /* Consecutive connection attempts */ raft_id id; /* ID of the other server */ char *address; /* Address of the other server */ queue pending; /* Pending send message requests */ queue queue; /* Clients queue */ bool closing; /* True after calling uvClientAbort */ }; /* Hold state for a single send RPC message request. */ struct uvSend { struct uvClient *client; /* Client connected to the target server */ struct raft_io_send *req; /* User request */ uv_buf_t *bufs; /* Encoded raft RPC message to send */ unsigned n_bufs; /* Number of buffers */ uv_write_t write; /* Stream write request */ queue queue; /* Pending send requests queue */ }; /* Free all memory used by the given send request object, including the object * itself. */ static void uvSendDestroy(struct uvSend *s) { if (s->bufs != NULL) { /* Just release the first buffer. Further buffers are entry or * snapshot payloads, which we were passed but we don't own. */ RaftHeapFree(s->bufs[0].base); /* Release the buffers array. */ RaftHeapFree(s->bufs); } RaftHeapFree(s); } /* Initialize a new client associated with the given server. */ static int uvClientInit(struct uvClient *c, struct uv *uv, raft_id id, const char *address) { int rv; c->uv = uv; c->timer.data = c; c->connect.data = NULL; /* Set upon starting a connect request */ c->stream = NULL; /* Set upon successful connection */ c->old_stream = NULL; /* Set after closing the current connection */ c->n_connect_attempt = 0; c->id = id; c->address = RaftHeapMalloc(strlen(address) + 1); if (c->address == NULL) { return RAFT_NOMEM; } rv = uv_timer_init(c->uv->loop, &c->timer); assert(rv == 0); strcpy(c->address, address); queue_init(&c->pending); c->closing = false; queue_insert_tail(&uv->clients, &c->queue); return 0; } /* If there's no more pending cleanup, remove the client from the abort queue * and destroy it. */ static void uvClientMaybeDestroy(struct uvClient *c) { struct uv *uv = c->uv; assert(c->stream == NULL); if (c->connect.data != NULL) { return; } if (c->timer.data != NULL) { return; } if (c->old_stream != NULL) { return; } while (!queue_empty(&c->pending)) { queue *head; struct uvSend *send; struct raft_io_send *req; head = queue_head(&c->pending); send = QUEUE_DATA(head, struct uvSend, queue); queue_remove(head); req = send->req; uvSendDestroy(send); if (req->cb != NULL) { req->cb(req, RAFT_CANCELED); } } queue_remove(&c->queue); assert(c->address != NULL); RaftHeapFree(c->address); RaftHeapFree(c); uvMaybeFireCloseCb(uv); } /* Forward declaration. */ static void uvClientConnect(struct uvClient *c); static void uvClientDisconnectCloseCb(struct uv_handle_s *handle) { struct uvClient *c = handle->data; assert(c->old_stream != NULL); assert(c->stream == NULL); assert(handle == (struct uv_handle_s *)c->old_stream); RaftHeapFree(c->old_stream); c->old_stream = NULL; if (c->closing) { uvClientMaybeDestroy(c); } else { uvClientConnect(c); /* Trigger a new connection attempt. */ } } /* Close the current connection. 
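 * The handle is parked in old_stream while uv_close() completes; the
 * close callback then either destroys the client (if it is closing) or
 * kicks off a fresh connection attempt.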
*/ static void uvClientDisconnect(struct uvClient *c) { assert(c->stream != NULL); assert(c->old_stream == NULL); c->old_stream = c->stream; c->stream = NULL; uv_close((struct uv_handle_s *)c->old_stream, uvClientDisconnectCloseCb); } /* Invoked once an encoded RPC message has been written out. */ static void uvSendWriteCb(struct uv_write_s *write, const int status) { struct uvSend *send = write->data; struct uvClient *c = send->client; struct raft_io_send *req = send->req; int cb_status = 0; /* If the write failed and we're not currently closing, let's consider * the current stream handle as busted and start disconnecting (unless * we're already doing so). We'll trigger a new connection attempt once * the handle is closed. */ if (status != 0) { cb_status = RAFT_IOERR; if (!c->closing) { if (c->stream != NULL) { uvClientDisconnect(c); } } else if (status == UV_ECANCELED) { cb_status = RAFT_CANCELED; } } uvSendDestroy(send); if (req->cb != NULL) { req->cb(req, cb_status); } } static int uvClientSend(struct uvClient *c, struct uvSend *send) { int rv; assert(!c->closing); send->client = c; /* If there's no connection available, let's queue the request. */ if (c->stream == NULL) { tracef("no connection available -> enqueue message"); queue_insert_tail(&c->pending, &send->queue); return 0; } tracef("connection available -> write message"); send->write.data = send; rv = uv_write(&send->write, c->stream, send->bufs, send->n_bufs, uvSendWriteCb); if (rv != 0) { tracef("write message failed -> rv %d", rv); /* UNTESTED: what are the error conditions? perhaps ENOMEM */ return RAFT_IOERR; } return 0; } /* Try to execute all send requests that were blocked in the queue waiting for a * connection. */ static void uvClientSendPending(struct uvClient *c) { int rv; assert(c->stream != NULL); tracef("send pending messages"); while (!queue_empty(&c->pending)) { queue *head; struct uvSend *send; head = queue_head(&c->pending); send = QUEUE_DATA(head, struct uvSend, queue); queue_remove(head); rv = uvClientSend(c, send); if (rv != 0) { if (send->req->cb != NULL) { send->req->cb(send->req, rv); } uvSendDestroy(send); } } } static void uvClientTimerCb(uv_timer_t *timer) { struct uvClient *c = timer->data; tracef("timer expired -> attempt to reconnect"); uvClientConnect(c); /* Retry to connect. */ } /* Return the number of send requests that have been parked in the send queue * because no connection is available yet. */ static unsigned uvClientPendingCount(struct uvClient *c) { queue *head; unsigned n = 0; QUEUE_FOREACH(head, &c->pending) { n++; } return n; } static void uvClientConnectCb(struct raft_uv_connect *req, struct uv_stream_s *stream, int status) { struct uvClient *c = req->data; unsigned n_pending; int rv; tracef("connect attempt completed -> status %s", errCodeToString(status)); assert(c->connect.data != NULL); assert(c->stream == NULL); assert(c->old_stream == NULL); assert(!uv_is_active((struct uv_handle_s *)&c->timer)); c->connect.data = NULL; /* If we are closing, bail out, possibly discarding the new connection. */ if (c->closing) { if (status == 0) { assert(stream != NULL); c->stream = stream; c->stream->data = c; uvClientDisconnect(c); } else { uvClientMaybeDestroy(c); } return; } /* If the connection attempt was successful, we're good. If we have * pending requests, let's try to execute them.
*/ if (status == 0) { assert(stream != NULL); c->stream = stream; c->n_connect_attempt = 0; c->stream->data = c; uvClientSendPending(c); return; } /* Shrink the queue of pending requests by failing the oldest ones */ n_pending = uvClientPendingCount(c); if (n_pending > UV__CLIENT_MAX_PENDING) { unsigned i; for (i = 0; i < n_pending - UV__CLIENT_MAX_PENDING; i++) { tracef("queue full -> evict oldest message"); queue *head; struct uvSend *old_send; struct raft_io_send *old_req; head = queue_head(&c->pending); old_send = QUEUE_DATA(head, struct uvSend, queue); queue_remove(head); old_req = old_send->req; uvSendDestroy(old_send); if (old_req->cb != NULL) { old_req->cb(old_req, RAFT_NOCONNECTION); } } } /* Let's schedule another attempt. */ rv = uv_timer_start(&c->timer, uvClientTimerCb, c->uv->connect_retry_delay, 0); assert(rv == 0); } /* Perform a single connection attempt, scheduling a retry if it fails. */ static void uvClientConnect(struct uvClient *c) { int rv; assert(!c->closing); assert(c->stream == NULL); assert(c->old_stream == NULL); assert(!uv_is_active((struct uv_handle_s *)&c->timer)); assert(c->connect.data == NULL); c->n_connect_attempt++; c->connect.data = c; rv = c->uv->transport->connect(c->uv->transport, &c->connect, c->id, c->address, uvClientConnectCb); if (rv != 0) { /* Restart the timer, so we can retry. */ c->connect.data = NULL; rv = uv_timer_start(&c->timer, uvClientTimerCb, c->uv->connect_retry_delay, 0); assert(rv == 0); } } /* Final callback in the close chain of an io_uv__client object */ static void uvClientTimerCloseCb(struct uv_handle_s *handle) { struct uvClient *c = handle->data; assert(handle == (struct uv_handle_s *)&c->timer); c->timer.data = NULL; uvClientMaybeDestroy(c); } /* Start shutting down a client. This happens when the `raft_io` instance * has been closed or when the address of the client has changed. */ static void uvClientAbort(struct uvClient *c) { struct uv *uv = c->uv; int rv; assert(c->stream != NULL || c->old_stream != NULL || uv_is_active((struct uv_handle_s *)&c->timer) || c->connect.data != NULL); queue_remove(&c->queue); queue_insert_tail(&uv->aborting, &c->queue); rv = uv_timer_stop(&c->timer); assert(rv == 0); /* If we are connected, let's close the outbound stream handle. This * will eventually complete all inflight write requests, possibly * failing them with UV_ECANCELED. */ if (c->stream != NULL) { uvClientDisconnect(c); } /* Closing the timer implicitly stops it, so the timeout callback won't * fire. */ uv_close((struct uv_handle_s *)&c->timer, uvClientTimerCloseCb); c->closing = true; } /* Find the client object associated with the given server, or create one if * there's none yet. */ static int uvGetClient(struct uv *uv, const raft_id id, const char *address, struct uvClient **client) { queue *head; int rv; /* Check if we already have a client object for this peer server. */ QUEUE_FOREACH(head, &uv->clients) { *client = QUEUE_DATA(head, struct uvClient, queue); if ((*client)->id != id) { continue; } /* The client address has changed: abort the connection and * create a new one. */ if (strcmp((*client)->address, address) != 0) { uvClientAbort(*client); break; } return 0; } /* Initialize the new connection */ *client = RaftHeapMalloc(sizeof **client); if (*client == NULL) { rv = RAFT_NOMEM; goto err; } rv = uvClientInit(*client, uv, id, address); if (rv != 0) { goto err_after_client_alloc; } /* Make a first connection attempt right away.
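 * If this synchronous attempt fails, uvClientConnect itself arms the
 * retry timer, so the caller does not need to handle the failure.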
*/ uvClientConnect(*client); return 0; err_after_client_alloc: RaftHeapFree(*client); err: assert(rv != 0); return rv; } int UvSend(struct raft_io *io, struct raft_io_send *req, const struct raft_message *message, raft_io_send_cb cb) { struct uv *uv = io->impl; struct uvSend *send; struct uvClient *client; int rv; assert(!uv->closing); /* Allocate a new request object. */ send = RaftHeapMalloc(sizeof *send); if (send == NULL) { rv = RAFT_NOMEM; goto err; } send->req = req; req->cb = cb; rv = uvEncodeMessage(message, &send->bufs, &send->n_bufs); if (rv != 0) { send->bufs = NULL; goto err_after_send_alloc; } /* Get a client object connected to the target server, creating it if it * doesn't exist yet. */ rv = uvGetClient(uv, message->server_id, message->server_address, &client); if (rv != 0) { goto err_after_send_alloc; } rv = uvClientSend(client, send); if (rv != 0) { goto err_after_send_alloc; } return 0; err_after_send_alloc: uvSendDestroy(send); err: assert(rv != 0); return rv; } void UvSendClose(struct uv *uv) { assert(uv->closing); while (!queue_empty(&uv->clients)) { queue *head; struct uvClient *client; head = queue_head(&uv->clients); client = QUEUE_DATA(head, struct uvClient, queue); uvClientAbort(client); } } #undef tracef dqlite-1.16.7/src/raft/uv_snapshot.c000066400000000000000000000465001465252713400173460ustar00rootroot00000000000000#include <string.h> #include <unistd.h> #include "array.h" #include "assert.h" #include "byte.h" #include "compress.h" #include "configuration.h" #include "heap.h" #include "uv.h" #include "uv_encoding.h" #include "uv_os.h" /* Arbitrary maximum configuration size. Should practically be enough */ #define UV__META_MAX_CONFIGURATION_SIZE 1024 * 1024 /* Returns true if the filename is a valid snapshot file or snapshot meta * filename depending on the `meta` switch. If the parse is successful, the * arguments will contain the parsed values. */ static bool uvSnapshotParseFilename(const char *filename, bool meta, raft_term *term, raft_index *index, raft_time *timestamp) { /* Check if it's a well-formed snapshot filename */ int consumed = 0; int matched; size_t filename_len = strlen(filename); assert(filename_len < UV__FILENAME_LEN); if (meta) { matched = sscanf(filename, UV__SNAPSHOT_META_TEMPLATE "%n", term, index, timestamp, &consumed); } else { matched = sscanf(filename, UV__SNAPSHOT_TEMPLATE "%n", term, index, timestamp, &consumed); } if (matched != 3 || consumed != (int)filename_len) { return false; } return true; } /* Check if the given filename matches the pattern of a snapshot metadata * filename (snapshot-xxx-yyy-zzz.meta), and fill the given info structure if * so. * * Return true if the filename matched, false otherwise.
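 *
 * For example (hypothetical values, assuming the snapshot filename
 * templates used in this module): with meta set to true, the filename
 * snapshot-1-8192-1721894400.meta would yield term 1, index 8192 and
 * timestamp 1721894400.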
*/ static bool uvSnapshotInfoMatch(const char *filename, struct uvSnapshotInfo *info) { if (!uvSnapshotParseFilename(filename, true, &info->term, &info->index, &info->timestamp)) { return false; } /* Allow room for '\0' terminator */ size_t n = sizeof(info->filename) - 1; strncpy(info->filename, filename, n); info->filename[n] = '\0'; return true; } void uvSnapshotFilenameOf(struct uvSnapshotInfo *info, char *filename) { size_t len = strlen(info->filename) - strlen(".meta"); assert(len < UV__FILENAME_LEN); strcpy(filename, info->filename); filename[len] = 0; } int UvSnapshotInfoAppendIfMatch(struct uv *uv, const char *filename, struct uvSnapshotInfo *infos[], size_t *n_infos, bool *appended) { struct uvSnapshotInfo info; bool matched; char snapshot_filename[UV__FILENAME_LEN]; bool exists; bool is_empty; char errmsg[RAFT_ERRMSG_BUF_SIZE]; int rv; /* Check if it's a snapshot metadata filename */ matched = uvSnapshotInfoMatch(filename, &info); if (!matched) { *appended = false; return 0; } /* Check if there's actually a valid snapshot file for this snapshot * metadata. If there's none or it's empty, it means that we aborted * before finishing the snapshot, or that another thread is still busy * writing the snapshot. */ uvSnapshotFilenameOf(&info, snapshot_filename); rv = UvFsFileExists(uv->dir, snapshot_filename, &exists, errmsg); if (rv != 0) { tracef("stat %s: %s", snapshot_filename, errmsg); rv = RAFT_IOERR; return rv; } if (!exists) { *appended = false; return 0; } /* TODO This check is strictly not needed, snapshot files are created by * renaming fully written and synced tmp-files. Leaving it here, just to * be extra-safe. Can probably be removed once more data integrity * checks are performed at startup. */ rv = UvFsFileIsEmpty(uv->dir, snapshot_filename, &is_empty, errmsg); if (rv != 0) { tracef("is_empty %s: %s", snapshot_filename, errmsg); rv = RAFT_IOERR; return rv; } if (is_empty) { *appended = false; return 0; } ARRAY__APPEND(struct uvSnapshotInfo, info, infos, n_infos, rv); if (rv == -1) { return RAFT_NOMEM; } *appended = true; return 0; } static int uvSnapshotIsOrphanInternal(const char *dir, const char *filename, bool meta, bool *orphan) { int rv; *orphan = false; raft_term term; raft_index index; raft_time timestamp; if (!uvSnapshotParseFilename(filename, meta, &term, &index, ×tamp)) { return 0; } /* filename is a well-formed snapshot filename, check if the sibling * file exists. */ char sibling_filename[UV__FILENAME_LEN]; if (meta) { rv = snprintf(sibling_filename, UV__FILENAME_LEN, UV__SNAPSHOT_TEMPLATE, term, index, timestamp); } else { rv = snprintf(sibling_filename, UV__FILENAME_LEN, UV__SNAPSHOT_META_TEMPLATE, term, index, timestamp); } if (rv >= UV__FILENAME_LEN) { /* Output truncated */ return -1; } bool sibling_exists = false; char ignored[RAFT_ERRMSG_BUF_SIZE]; rv = UvFsFileExists(dir, sibling_filename, &sibling_exists, ignored); if (rv != 0) { return rv; } *orphan = !sibling_exists; return 0; } int UvSnapshotIsOrphan(const char *dir, const char *filename, bool *orphan) { return uvSnapshotIsOrphanInternal(dir, filename, false, orphan); } int UvSnapshotMetaIsOrphan(const char *dir, const char *filename, bool *orphan) { return uvSnapshotIsOrphanInternal(dir, filename, true, orphan); } /* Compare two snapshots to decide which one is more recent. 
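 * The ordering is lexicographic on (term, index, timestamp): e.g. a
 * snapshot with term 2 beats any snapshot with term 1, regardless of
 * index.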
*/ static int uvSnapshotCompare(const void *p1, const void *p2) { struct uvSnapshotInfo *s1 = (struct uvSnapshotInfo *)p1; struct uvSnapshotInfo *s2 = (struct uvSnapshotInfo *)p2; /* If terms are different, the snapshot with the highest term is the * most recent. */ if (s1->term != s2->term) { return s1->term < s2->term ? -1 : 1; } /* If the terms are identical and the indexes differ, the snapshot with * the highest index is the most recent */ if (s1->index != s2->index) { return s1->index < s2->index ? -1 : 1; } /* If term and index are identical, compare the timestamp. */ return s1->timestamp < s2->timestamp ? -1 : 1; } /* Sort the given snapshots. */ void UvSnapshotSort(struct uvSnapshotInfo *infos, size_t n_infos) { qsort(infos, n_infos, sizeof *infos, uvSnapshotCompare); } /* Parse the metadata file of a snapshot and populate the metadata portion of * the given snapshot object accordingly. */ static int uvSnapshotLoadMeta(struct uv *uv, struct uvSnapshotInfo *info, struct raft_snapshot *snapshot, char *errmsg) { uint64_t header[1 + /* Format version */ 1 + /* CRC checksum */ 1 + /* Configuration index */ 1 /* Configuration length */]; struct raft_buffer buf; uint64_t format; uint32_t crc1; uint32_t crc2; uv_file fd; int rv; snapshot->term = info->term; snapshot->index = info->index; rv = UvFsOpenFileForReading(uv->dir, info->filename, &fd, errmsg); if (rv != 0) { tracef("open %s: %s", info->filename, errmsg); rv = RAFT_IOERR; goto err; } buf.base = header; buf.len = sizeof header; rv = UvFsReadInto(fd, &buf, errmsg); if (rv != 0) { tracef("read %s: %s", info->filename, errmsg); rv = RAFT_IOERR; goto err_after_open; } format = byteFlip64(header[0]); if (format != UV__DISK_FORMAT) { tracef("load %s: unsupported format %ju", info->filename, format); rv = RAFT_MALFORMED; goto err_after_open; } crc1 = (uint32_t)byteFlip64(header[1]); snapshot->configuration_index = byteFlip64(header[2]); buf.len = (size_t)byteFlip64(header[3]); if (buf.len > UV__META_MAX_CONFIGURATION_SIZE) { tracef("load %s: configuration data too big (%zu)", info->filename, buf.len); rv = RAFT_CORRUPT; goto err_after_open; } if (buf.len == 0) { tracef("load %s: no configuration data", info->filename); rv = RAFT_CORRUPT; goto err_after_open; } buf.base = RaftHeapMalloc(buf.len); if (buf.base == NULL) { rv = RAFT_NOMEM; goto err_after_open; } rv = UvFsReadInto(fd, &buf, errmsg); if (rv != 0) { tracef("read %s: %s", info->filename, errmsg); rv = RAFT_IOERR; goto err_after_buf_malloc; } crc2 = byteCrc32(header + 2, sizeof header - sizeof(uint64_t) * 2, 0); crc2 = byteCrc32(buf.base, buf.len, crc2); if (crc1 != crc2) { ErrMsgPrintf(errmsg, "read %s: checksum mismatch", info->filename); rv = RAFT_CORRUPT; goto err_after_buf_malloc; } rv = configurationDecode(&buf, &snapshot->configuration); if (rv != 0) { goto err_after_buf_malloc; } RaftHeapFree(buf.base); UvOsClose(fd); return 0; err_after_buf_malloc: RaftHeapFree(buf.base); err_after_open: UvOsClose(fd); err: assert(rv != 0); return rv; } /* Load the snapshot data file and populate the data portion of the given * snapshot object accordingly.
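 * If the file content carries a compression header (see IsCompressed), it
 * is transparently decompressed first and the snapshot buffer receives
 * the decompressed bytes.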
*/ static int uvSnapshotLoadData(struct uv *uv, struct uvSnapshotInfo *info, struct raft_snapshot *snapshot, char *errmsg) { char filename[UV__FILENAME_LEN]; struct raft_buffer buf; int rv; uvSnapshotFilenameOf(info, filename); rv = UvFsReadFile(uv->dir, filename, &buf, errmsg); if (rv != 0) { tracef("read %s: %s", filename, errmsg); goto err; } if (IsCompressed(buf.base, buf.len)) { struct raft_buffer decompressed = {0}; tracef("snapshot decompress start"); rv = Decompress(buf, &decompressed, errmsg); tracef("snapshot decompress end %d", rv); if (rv != 0) { tracef("decompress failed rv:%d", rv); goto err_after_read_file; } RaftHeapFree(buf.base); buf = decompressed; } snapshot->bufs = RaftHeapMalloc(sizeof *snapshot->bufs); if (snapshot->bufs == NULL) { rv = RAFT_NOMEM; goto err_after_read_file; } snapshot->n_bufs = 1; snapshot->bufs[0] = buf; return 0; err_after_read_file: RaftHeapFree(buf.base); err: assert(rv != 0); return rv; } int UvSnapshotLoad(struct uv *uv, struct uvSnapshotInfo *meta, struct raft_snapshot *snapshot, char *errmsg) { int rv; rv = uvSnapshotLoadMeta(uv, meta, snapshot, errmsg); if (rv != 0) { return rv; } rv = uvSnapshotLoadData(uv, meta, snapshot, errmsg); if (rv != 0) { return rv; } return 0; } struct uvSnapshotPut { struct uv *uv; size_t trailing; struct raft_io_snapshot_put *req; const struct raft_snapshot *snapshot; struct { unsigned long long timestamp; uint64_t header[4]; /* Format, CRC, configuration index/len */ struct raft_buffer bufs[2]; /* Preamble and configuration */ } meta; char errmsg[RAFT_ERRMSG_BUF_SIZE]; int status; struct UvBarrierReq barrier; }; struct uvSnapshotGet { struct uv *uv; struct raft_io_snapshot_get *req; struct raft_snapshot *snapshot; struct uv_work_s work; char errmsg[RAFT_ERRMSG_BUF_SIZE]; int status; queue queue; }; static int uvSnapshotKeepLastTwo(struct uv *uv, struct uvSnapshotInfo *snapshots, size_t n) { size_t i; char errmsg[RAFT_ERRMSG_BUF_SIZE]; int rv; /* Leave at least two snapshots, for safety. */ if (n <= 2) { return 0; } for (i = 0; i < n - 2; i++) { struct uvSnapshotInfo *snapshot = &snapshots[i]; char filename[UV__FILENAME_LEN]; rv = UvFsRemoveFile(uv->dir, snapshot->filename, errmsg); if (rv != 0) { tracef("unlink %s: %s", snapshot->filename, errmsg); return RAFT_IOERR; } uvSnapshotFilenameOf(snapshot, filename); rv = UvFsRemoveFile(uv->dir, filename, errmsg); if (rv != 0) { tracef("unlink %s: %s", filename, errmsg); return RAFT_IOERR; } } return 0; } /* Remove all segments and snapshots that are not needed anymore, because they * are past the trailing amount.
*/ static int uvRemoveOldSegmentsAndSnapshots(struct uv *uv, raft_index last_index, size_t trailing, char *errmsg) { struct uvSnapshotInfo *snapshots; struct uvSegmentInfo *segments; size_t n_snapshots; size_t n_segments; int rv = 0; rv = UvList(uv, &snapshots, &n_snapshots, &segments, &n_segments, errmsg); if (rv != 0) { goto out; } rv = uvSnapshotKeepLastTwo(uv, snapshots, n_snapshots); if (rv != 0) { goto out; } if (segments != NULL) { rv = uvSegmentKeepTrailing(uv, segments, n_segments, last_index, trailing, errmsg); if (rv != 0) { goto out; } } rv = UvFsSyncDir(uv->dir, errmsg); out: if (snapshots != NULL) { RaftHeapFree(snapshots); } if (segments != NULL) { RaftHeapFree(segments); } return rv; } static int makeFileCompressed(const char *dir, const char *filename, struct raft_buffer *bufs, unsigned n_bufs, char *errmsg) { int rv; struct raft_buffer compressed = {0}; rv = Compress(bufs, n_bufs, &compressed, errmsg); if (rv != 0) { ErrMsgWrapf(errmsg, "compress %s", filename); return RAFT_IOERR; } rv = UvFsMakeFile(dir, filename, &compressed, 1, errmsg); raft_free(compressed.base); return rv; } static void uvSnapshotPutWorkCb(uv_work_t *work) { struct uvSnapshotPut *put = work->data; struct uv *uv = put->uv; char metadata[UV__FILENAME_LEN]; char snapshot[UV__FILENAME_LEN]; char errmsg[RAFT_ERRMSG_BUF_SIZE]; int rv; sprintf(metadata, UV__SNAPSHOT_META_TEMPLATE, put->snapshot->term, put->snapshot->index, put->meta.timestamp); rv = UvFsMakeFile(uv->dir, metadata, put->meta.bufs, 2, put->errmsg); if (rv != 0) { tracef("snapshot.meta creation failed %d", rv); ErrMsgWrapf(put->errmsg, "write %s", metadata); put->status = RAFT_IOERR; return; } sprintf(snapshot, UV__SNAPSHOT_TEMPLATE, put->snapshot->term, put->snapshot->index, put->meta.timestamp); tracef("snapshot write start"); if (uv->snapshot_compression) { rv = makeFileCompressed(uv->dir, snapshot, put->snapshot->bufs, put->snapshot->n_bufs, put->errmsg); } else { rv = UvFsMakeFile(uv->dir, snapshot, put->snapshot->bufs, put->snapshot->n_bufs, put->errmsg); } tracef("snapshot write end %d", rv); if (rv != 0) { tracef("snapshot creation failed %d", rv); ErrMsgWrapf(put->errmsg, "write %s", snapshot); UvFsRemoveFile(uv->dir, metadata, errmsg); UvFsRemoveFile(uv->dir, snapshot, errmsg); put->status = RAFT_IOERR; return; } rv = UvFsSyncDir(uv->dir, put->errmsg); if (rv != 0) { put->status = RAFT_IOERR; return; } rv = uvRemoveOldSegmentsAndSnapshots(uv, put->snapshot->index, put->trailing, put->errmsg); if (rv != 0) { put->status = rv; return; } put->status = 0; return; } /* Finish the put request, releasing all associated memory and invoking its * callback. */ static void uvSnapshotPutFinish(struct uvSnapshotPut *put) { struct raft_io_snapshot_put *req = put->req; int status = put->status; struct uv *uv = put->uv; assert(uv->snapshot_put_work.data == NULL); RaftHeapFree(put->meta.bufs[1].base); RaftHeapFree(put); req->cb(req, status); } static void uvSnapshotPutAfterWorkCb(uv_work_t *work, int status) { struct uvSnapshotPut *put = work->data; struct uv *uv = put->uv; assert(status == 0); uv->snapshot_put_work.data = NULL; uvSnapshotPutFinish(put); UvUnblock(uv); } /* Start processing the given put request. */ static void uvSnapshotPutStart(struct uvSnapshotPut *put) { struct uv *uv = put->uv; int rv; /* If this is an install request, the barrier callback must have fired. 
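 * (trailing == 0 identifies a snapshot install; in that case the barrier
 * was submitted as blocking and uvSnapshotPutBarrierCb cleared
 * barrier.data before scheduling this function.)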
*/ if (put->trailing == 0) { assert(put->barrier.data == NULL); } uv->snapshot_put_work.data = put; rv = uv_queue_work(uv->loop, &uv->snapshot_put_work, uvSnapshotPutWorkCb, uvSnapshotPutAfterWorkCb); if (rv != 0) { tracef("store snapshot %lld: %s", put->snapshot->index, uv_strerror(rv)); uv->errored = true; } } static void uvSnapshotPutBarrierCb(struct UvBarrierReq *barrier) { /* Ensure that we don't invoke this callback more than once. */ barrier->cb = NULL; struct uvSnapshotPut *put = barrier->data; if (put == NULL) { return; } struct uv *uv = put->uv; put->barrier.data = NULL; /* If we're closing, abort the request. */ if (uv->closing) { put->status = RAFT_CANCELED; uvSnapshotPutFinish(put); uvMaybeFireCloseCb(uv); return; } uvSnapshotPutStart(put); } int UvSnapshotPut(struct raft_io *io, unsigned trailing, struct raft_io_snapshot_put *req, const struct raft_snapshot *snapshot, raft_io_snapshot_put_cb cb) { struct uv *uv; struct uvSnapshotPut *put; void *cursor; unsigned crc; int rv; raft_index next_index; uv = io->impl; if (uv->closing) { return RAFT_CANCELED; } assert(uv->snapshot_put_work.data == NULL); tracef("put snapshot at %lld, keeping %d", snapshot->index, trailing); put = RaftHeapMalloc(sizeof *put); if (put == NULL) { rv = RAFT_NOMEM; goto err; } put->uv = uv; put->req = req; put->snapshot = snapshot; put->meta.timestamp = uv_now(uv->loop); put->trailing = trailing; put->barrier.data = put; put->barrier.blocking = trailing == 0; put->barrier.cb = uvSnapshotPutBarrierCb; req->cb = cb; /* Prepare the buffers for the metadata file. */ put->meta.bufs[0].base = put->meta.header; put->meta.bufs[0].len = sizeof put->meta.header; rv = configurationEncode(&snapshot->configuration, &put->meta.bufs[1]); if (rv != 0) { goto err_after_req_alloc; } cursor = put->meta.header; bytePut64(&cursor, UV__DISK_FORMAT); bytePut64(&cursor, 0); bytePut64(&cursor, snapshot->configuration_index); bytePut64(&cursor, put->meta.bufs[1].len); crc = byteCrc32(&put->meta.header[2], sizeof(uint64_t) * 2, 0); crc = byteCrc32(put->meta.bufs[1].base, put->meta.bufs[1].len, crc); cursor = &put->meta.header[1]; bytePut64(&cursor, crc); /* - If the trailing parameter is set to 0, it means that we're * restoring a snapshot. Submit a barrier request setting the next * append index to the snapshot's last index + 1. * - When we are only writing a snapshot during normal operation, we * close all current open segments. New writes can continue on newly * opened segments that will only contain entries that are newer than * the snapshot, and we don't change append_next_index. */ next_index = (trailing == 0) ? 
(snapshot->index + 1) : uv->append_next_index; rv = UvBarrier(uv, next_index, &put->barrier); if (rv != 0) { goto err_after_configuration_encode; } return 0; err_after_configuration_encode: RaftHeapFree(put->meta.bufs[1].base); err_after_req_alloc: RaftHeapFree(put); err: assert(rv != 0); return rv; } static void uvSnapshotGetWorkCb(uv_work_t *work) { struct uvSnapshotGet *get = work->data; struct uv *uv = get->uv; struct uvSnapshotInfo *snapshots; size_t n_snapshots; struct uvSegmentInfo *segments; size_t n_segments; int rv; get->status = 0; rv = UvList(uv, &snapshots, &n_snapshots, &segments, &n_segments, get->errmsg); if (rv != 0) { get->status = rv; goto out; } if (snapshots != NULL) { rv = UvSnapshotLoad(uv, &snapshots[n_snapshots - 1], get->snapshot, get->errmsg); if (rv != 0) { get->status = rv; } RaftHeapFree(snapshots); } if (segments != NULL) { RaftHeapFree(segments); } out: return; } static void uvSnapshotGetAfterWorkCb(uv_work_t *work, int status) { struct uvSnapshotGet *get = work->data; struct raft_io_snapshot_get *req = get->req; struct raft_snapshot *snapshot = get->snapshot; int req_status = get->status; struct uv *uv = get->uv; assert(status == 0); queue_remove(&get->queue); RaftHeapFree(get); req->cb(req, snapshot, req_status); uvMaybeFireCloseCb(uv); } int UvSnapshotGet(struct raft_io *io, struct raft_io_snapshot_get *req, raft_io_snapshot_get_cb cb) { struct uv *uv; struct uvSnapshotGet *get; int rv; uv = io->impl; assert(!uv->closing); get = RaftHeapMalloc(sizeof *get); if (get == NULL) { rv = RAFT_NOMEM; goto err; } get->uv = uv; get->req = req; req->cb = cb; get->snapshot = RaftHeapMalloc(sizeof *get->snapshot); if (get->snapshot == NULL) { rv = RAFT_NOMEM; goto err_after_req_alloc; } get->work.data = get; queue_insert_tail(&uv->snapshot_get_reqs, &get->queue); rv = uv_queue_work(uv->loop, &get->work, uvSnapshotGetWorkCb, uvSnapshotGetAfterWorkCb); if (rv != 0) { queue_remove(&get->queue); tracef("get last snapshot: %s", uv_strerror(rv)); rv = RAFT_IOERR; goto err_after_snapshot_alloc; } return 0; err_after_snapshot_alloc: RaftHeapFree(get->snapshot); err_after_req_alloc: RaftHeapFree(get); err: assert(rv != 0); return rv; } #undef tracef dqlite-1.16.7/src/raft/uv_tcp.c000066400000000000000000000051241465252713400162720ustar00rootroot00000000000000#include "uv_tcp.h" #include "uv_ip.h" #include #include "../raft.h" #include "assert.h" #include "err.h" #include "heap.h" /* Implementation of raft_uv_transport->init. */ static int uvTcpInit(struct raft_uv_transport *transport, raft_id id, const char *address) { struct UvTcp *t = transport->impl; assert(id > 0); assert(address != NULL); t->id = id; t->address = address; return 0; } /* Implementation of raft_uv_transport->close. 
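 * Closing stops the listeners and aborts in-flight connect requests; the
 * user callback fires from UvTcpMaybeFireCloseCb once every pending
 * handle has been cleaned up.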
*/ static void uvTcpClose(struct raft_uv_transport *transport, raft_uv_transport_close_cb cb) { struct UvTcp *t = transport->impl; assert(!t->closing); t->closing = true; t->close_cb = cb; UvTcpListenClose(t); UvTcpConnectClose(t); UvTcpMaybeFireCloseCb(t); } void UvTcpMaybeFireCloseCb(struct UvTcp *t) { if (!t->closing) { return; } assert(queue_empty(&t->accepting)); assert(queue_empty(&t->connecting)); if (!queue_empty(&t->aborting)) { return; } if (t->listeners != NULL) { return; } if (t->close_cb != NULL) { t->close_cb(t->transport); } } int raft_uv_tcp_init(struct raft_uv_transport *transport, struct uv_loop_s *loop) { struct UvTcp *t; void *data = transport->data; int version = transport->version; if (version != 1) { ErrMsgPrintf(transport->errmsg, "Invalid version: %d", version); return RAFT_INVALID; } memset(transport, 0, sizeof *transport); transport->data = data; transport->version = version; t = raft_malloc(sizeof *t); if (t == NULL) { ErrMsgOom(transport->errmsg); return RAFT_NOMEM; } t->transport = transport; t->loop = loop; t->id = 0; t->address = NULL; t->bind_address = NULL; t->listeners = NULL; t->n_listeners = 0; t->accept_cb = NULL; queue_init(&t->accepting); queue_init(&t->connecting); queue_init(&t->aborting); t->closing = false; t->close_cb = NULL; transport->impl = t; transport->init = uvTcpInit; transport->close = uvTcpClose; transport->listen = UvTcpListen; transport->connect = UvTcpConnect; return 0; } void raft_uv_tcp_close(struct raft_uv_transport *transport) { struct UvTcp *t = transport->impl; raft_free(t->bind_address); raft_free(t); } int raft_uv_tcp_set_bind_address(struct raft_uv_transport *transport, const char *address) { struct UvTcp *t = transport->impl; char hostname[NI_MAXHOST]; char service[NI_MAXSERV]; int rv; rv = uvIpAddrSplit(address, hostname, sizeof(hostname), service, sizeof(service)); if (rv != 0) { return RAFT_INVALID; } t->bind_address = raft_malloc(strlen(address) + 1); if (t->bind_address == NULL) { return RAFT_NOMEM; } strcpy(t->bind_address, address); return 0; } dqlite-1.16.7/src/raft/uv_tcp.h000066400000000000000000000033311465252713400162750ustar00rootroot00000000000000#ifndef UV_TCP_H_ #define UV_TCP_H_ #include "../raft.h" #include "../lib/queue.h" /* Protocol version. */ #define UV__TCP_HANDSHAKE_PROTOCOL 1 struct UvTcp { struct raft_uv_transport *transport; /* Interface object we implement */ struct uv_loop_s *loop; /* Event loop */ raft_id id; /* ID of this raft server */ const char *address; /* Address of this raft server */ unsigned n_listeners; /* Number of listener sockets */ struct uv_tcp_s *listeners; /* Listener sockets */ raft_uv_accept_cb accept_cb; /* Call after accepting a connection */ queue accepting; /* Connections being accepted */ queue connecting; /* Pending connection requests */ queue aborting; /* Connections being aborted */ bool closing; /* True after close() is called */ raft_uv_transport_close_cb close_cb; /* Call when it's safe to free us */ char *bind_address; /* Optional address:port to bind to */ }; /* Implementation of raft_uv_transport->listen. */ int UvTcpListen(struct raft_uv_transport *transport, raft_uv_accept_cb cb); /* Stop accepting new connection and close all connections being accepted. */ void UvTcpListenClose(struct UvTcp *t); /* Implementation of raft_uv_transport->connect. */ int UvTcpConnect(struct raft_uv_transport *transport, struct raft_uv_connect *req, raft_id id, const char *address, raft_uv_connect_cb cb); /* Abort all pending connection requests. 
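 * Each aborted request's callback eventually fires with RAFT_CANCELED.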
*/ void UvTcpConnectClose(struct UvTcp *t); /* Fire the transport close callback if the transport is closing and there's no * more pending callback. */ void UvTcpMaybeFireCloseCb(struct UvTcp *t); #endif /* UV_TCP_H_ */ dqlite-1.16.7/src/raft/uv_tcp_connect.c000066400000000000000000000250751465252713400200120ustar00rootroot00000000000000#include <string.h> #include "assert.h" #include "byte.h" #include "err.h" #include "heap.h" #include "uv_ip.h" #include "uv_tcp.h" /* The happy path of a connection request is: * * - Create a TCP handle and submit a TCP connect request. * - Initiate an asynchronous DNS resolve request. * - Once the name lookup is successful, connect to the first returned IP. * - Once connected over TCP, submit a write request for the handshake. * - Once the write completes, fire the connection request callback. * * Alternative happy path of a connection request, if the hostname resolves to * multiple IPs and the first/second/... IP is reachable: * - Close the TCP handle and initiate a new connect with the next IP in the * close callback. * * Possible failure modes are: * * - The name resolution for the hostname is not successful: close the TCP * handle and fire the request callback. * * - The transport gets closed: close the TCP handle and fire the request * callback with RAFT_CANCELED. * * - Either the TCP connect or the write request fails: close the TCP handle * and fire the request callback with RAFT_NOCONNECTION. */ /* Hold state for a single connection request. */ struct uvTcpConnect { struct UvTcp *t; /* Transport implementation */ struct raft_uv_connect *req; /* User request */ uv_buf_t handshake; /* Handshake data */ struct uv_tcp_s *tcp; /* TCP connection socket handle */ struct uv_getaddrinfo_s getaddrinfo; /* DNS resolve request */ const struct addrinfo *ai_current; /* The current sockaddr to connect to */ struct uv_connect_s connect; /* TCP connection request */ struct uv_write_s write; /* TCP handshake request */ int status; /* Returned to the request callback */ bool resolving; /* Indicate name resolving in progress */ bool retry; /* Indicate tcp connect failure handling */ queue queue; /* Pending connect queue */ }; /* Encode a handshake message into the given buffer. */ static int uvTcpEncodeHandshake(raft_id id, const char *address, uv_buf_t *buf) { void *cursor; size_t address_len = bytePad64(strlen(address) + 1); buf->len = sizeof(uint64_t) + /* Protocol version. */ sizeof(uint64_t) + /* Server ID. */ sizeof(uint64_t) /* Size of the address buffer */; buf->len += address_len; buf->base = RaftHeapMalloc(buf->len); if (buf->base == NULL) { return RAFT_NOMEM; } cursor = buf->base; bytePut64(&cursor, UV__TCP_HANDSHAKE_PROTOCOL); bytePut64(&cursor, id); bytePut64(&cursor, address_len); strcpy(cursor, address); return 0; } /* Finish the connect request, releasing its memory and firing the connect * callback. */ static void uvTcpConnectFinish(struct uvTcpConnect *connect) { struct uv_stream_s *stream = (struct uv_stream_s *)connect->tcp; struct raft_uv_connect *req = connect->req; int status = connect->status; queue_remove(&connect->queue); RaftHeapFree(connect->handshake.base); uv_freeaddrinfo(connect->getaddrinfo.addrinfo); raft_free(connect); req->cb(req, stream, status); } /* The TCP connection handle has been closed as a consequence of an error or * because the transport is closing.
*/ static void uvTcpConnectUvCloseCb(struct uv_handle_s *handle) { struct uvTcpConnect *connect = handle->data; struct UvTcp *t = connect->t; assert(connect->status != 0); assert(handle == (struct uv_handle_s *)connect->tcp); RaftHeapFree(connect->tcp); connect->tcp = NULL; uvTcpConnectFinish(connect); UvTcpMaybeFireCloseCb(t); } /* Abort a connection request. */ static void uvTcpConnectAbort(struct uvTcpConnect *connect) { queue_remove(&connect->queue); queue_insert_tail(&connect->t->aborting, &connect->queue); uv_cancel((struct uv_req_s *)&connect->getaddrinfo); /* Call uv_close on the tcp handle, if there is no getaddrinfo request * in flight and the handle is not currently closed due to next IP * connect attempt. * Data structures may only be freed after the uvGetAddrInfoCb was * triggered. Tcp handle will be closed in the uvGetAddrInfoCb in this * case. uvTcpConnectUvCloseCb will be invoked from * uvTcpTryNextConnectCb in case a next IP connect should be started. */ if (!connect->resolving && !connect->retry) { uv_close((struct uv_handle_s *)connect->tcp, uvTcpConnectUvCloseCb); } } /* The handshake TCP write completes. Fire the connect callback. */ static void uvTcpConnectUvWriteCb(struct uv_write_s *write, int status) { struct uvTcpConnect *connect = write->data; struct UvTcp *t = connect->t; if (t->closing) { connect->status = RAFT_CANCELED; return; } if (status != 0) { assert(status != UV_ECANCELED); /* t->closing would have been true */ connect->status = RAFT_NOCONNECTION; uvTcpConnectAbort(connect); return; } uvTcpConnectFinish(connect); } /* Helper function to connect to the remote node */ static void uvTcpAsyncConnect(struct uvTcpConnect *connect); /* The TCP connect failed, we closed the handle and want to try with next IP */ static void uvTcpTryNextConnectCb(struct uv_handle_s *handle) { struct uvTcpConnect *connect = handle->data; struct UvTcp *t = connect->t; int rv; connect->retry = false; if (t->closing) { connect->status = RAFT_CANCELED; /* We are already in close cb for the tcp handle, simply invoke * final cb */ uvTcpConnectUvCloseCb(handle); return; } rv = uv_tcp_init(t->loop, connect->tcp); assert(rv == 0); uvTcpAsyncConnect(connect); } /* The TCP connection is established. Write the handshake data. */ static void uvTcpConnectUvConnectCb(struct uv_connect_s *req, int status) { struct uvTcpConnect *connect = req->data; struct UvTcp *t = connect->t; int rv; if (t->closing) { connect->status = RAFT_CANCELED; return; } if (status != 0) { assert(status != UV_ECANCELED); /* t->closing would have been true */ connect->ai_current = connect->ai_current->ai_next; if (connect->ai_current) { /* For the next connect attempt we need to close the tcp * handle. */ /* To avoid interference with aborting we set a flag to * indicate the connect attempt */ connect->retry = true; uv_close((struct uv_handle_s *)connect->tcp, uvTcpTryNextConnectCb); return; } connect->status = RAFT_NOCONNECTION; ErrMsgPrintf(t->transport->errmsg, "uv_tcp_connect(): %s", uv_strerror(status)); goto err; } rv = uv_write(&connect->write, (struct uv_stream_s *)connect->tcp, &connect->handshake, 1, uvTcpConnectUvWriteCb); if (rv != 0) { /* UNTESTED: what are the error conditions? 
perhaps ENOMEM */ connect->status = RAFT_NOCONNECTION; goto err; } return; err: uvTcpConnectAbort(connect); } /* Helper function to connect to the remote node */ static void uvTcpAsyncConnect(struct uvTcpConnect *connect) { int rv; rv = uv_tcp_connect(&connect->connect, connect->tcp, connect->ai_current->ai_addr, uvTcpConnectUvConnectCb); if (rv != 0) { /* UNTESTED: since parsing succeeded, this should fail only * because of lack of system resources */ ErrMsgPrintf(connect->t->transport->errmsg, "uv_tcp_connect(): %s", uv_strerror(rv)); connect->status = RAFT_NOCONNECTION; uvTcpConnectAbort(connect); } } /* The hostname resolution has finished */ static void uvGetAddrInfoCb(uv_getaddrinfo_t *req, int status, struct addrinfo *res) { struct uvTcpConnect *connect = req->data; struct UvTcp *t = connect->t; connect->resolving = false; /* The name resolving phase is over */ if (t->closing) { connect->status = RAFT_CANCELED; /* We need to close the tcp handle to abort the connection * attempt */ uv_close((struct uv_handle_s *)connect->tcp, uvTcpConnectUvCloseCb); return; } if (status < 0) { ErrMsgPrintf(t->transport->errmsg, "uv_getaddrinfo(): %s", uv_err_name(status)); connect->status = RAFT_NOCONNECTION; uvTcpConnectAbort(connect); return; } connect->ai_current = res; uvTcpAsyncConnect(connect); } /* Create a new TCP handle and submit a connection request to the event loop. */ static int uvTcpConnectStart(struct uvTcpConnect *r, const char *address) { static struct addrinfo hints = {.ai_flags = 0, .ai_family = AF_INET, .ai_socktype = SOCK_STREAM, .ai_protocol = 0}; struct UvTcp *t = r->t; char hostname[NI_MAXHOST]; char service[NI_MAXSERV]; int rv; r->handshake.base = NULL; /* Initialize the handshake buffer. */ rv = uvTcpEncodeHandshake(t->id, t->address, &r->handshake); if (rv != 0) { assert(rv == RAFT_NOMEM); ErrMsgOom(t->transport->errmsg); goto err; } r->tcp = RaftHeapMalloc(sizeof *r->tcp); if (r->tcp == NULL) { ErrMsgOom(t->transport->errmsg); rv = RAFT_NOMEM; goto err; } rv = uv_tcp_init(r->t->loop, r->tcp); assert(rv == 0); r->tcp->data = r; rv = uvIpAddrSplit(address, hostname, sizeof(hostname), service, sizeof(service)); if (rv) { ErrMsgPrintf( t->transport->errmsg, "uv_tcp_connect(): Cannot split %s into host and service", address); rv = RAFT_NOCONNECTION; goto err_after_tcp_init; } rv = uv_getaddrinfo(r->t->loop, &r->getaddrinfo, &uvGetAddrInfoCb, hostname, service, &hints); if (rv) { ErrMsgPrintf(t->transport->errmsg, "uv_tcp_connect(): Cannot initiate getaddrinfo %s", uv_strerror(rv)); rv = RAFT_NOCONNECTION; goto err_after_tcp_init; } r->resolving = true; /* Indicate we are in the name resolving phase */ return 0; err_after_tcp_init: uv_close((uv_handle_t *)r->tcp, (uv_close_cb)RaftHeapFree); err: RaftHeapFree(r->handshake.base); return rv; } int UvTcpConnect(struct raft_uv_transport *transport, struct raft_uv_connect *req, raft_id id, const char *address, raft_uv_connect_cb cb) { struct UvTcp *t = transport->impl; struct uvTcpConnect *r; int rv; (void)id; assert(!t->closing); /* Create and initialize a new TCP connection request object */ r = RaftHeapMalloc(sizeof *r); if (r == NULL) { rv = RAFT_NOMEM; ErrMsgOom(transport->errmsg); goto err; } r->t = t; r->req = req; r->status = 0; r->write.data = r; r->getaddrinfo.data = r; r->resolving = false; r->retry = false; r->connect.data = r; req->cb = cb; /* Keep track of the pending request */ queue_insert_tail(&t->connecting, &r->queue); /* Start connecting */ rv = uvTcpConnectStart(r, address); if (rv != 0) { goto err_after_alloc; }
return 0; err_after_alloc: queue_remove(&r->queue); RaftHeapFree(r); err: return rv; } void UvTcpConnectClose(struct UvTcp *t) { while (!queue_empty(&t->connecting)) { struct uvTcpConnect *connect; queue *head; head = queue_head(&t->connecting); connect = QUEUE_DATA(head, struct uvTcpConnect, queue); uvTcpConnectAbort(connect); } } dqlite-1.16.7/src/raft/uv_tcp_listen.c000066400000000000000000000263511465252713400176550ustar00rootroot00000000000000#include <string.h> #include "assert.h" #include "byte.h" #include "heap.h" #include "uv_ip.h" #include "uv_tcp.h" /* The happy path of an incoming connection is: * * - The connection callback is fired on the listener TCP handle, and the * incoming connection is uv_accept()'ed. We call uv_read_start() to get * notified about received handshake data. * * - Once the preamble is received, we start waiting for the server address. * * - Once the server address is received, we fire the accept callback. * * Possible failure modes are: * * - The accept process gets canceled in the transport->close() implementation, * by calling UvTcpListenClose(): the incoming TCP connection handle gets * closed, preventing any further handshake data notification, and all * allocated memory gets released in the handle close callback. */ /* Hold state for a connection being accepted. */ struct uvTcpHandshake { uint64_t preamble[3]; /* Preamble buffer */ uv_buf_t address; /* Address buffer */ size_t nread; /* Number of bytes read */ }; /* Hold handshake data for a new connection being established. */ struct uvTcpIncoming { struct UvTcp *t; /* Transport implementation */ struct uv_tcp_s *listener; /* The tcp handle, which accepted this socket */ struct uv_tcp_s *tcp; /* TCP connection socket handle */ struct uvTcpHandshake handshake; /* Handshake data */ queue queue; /* Pending accept queue */ }; /* Decode the handshake preamble, containing the protocol version, the ID of the * connecting server and the length of its address. Also, allocate the buffer to * start reading the server address. */ static int uvTcpDecodePreamble(struct uvTcpHandshake *h) { uint64_t protocol; protocol = byteFlip64(h->preamble[0]); if (protocol != UV__TCP_HANDSHAKE_PROTOCOL) { return RAFT_MALFORMED; } h->address.len = (size_t)byteFlip64(h->preamble[2]); h->address.base = RaftHeapMalloc(h->address.len); if (h->address.base == NULL) { return RAFT_NOMEM; } h->nread = 0; return 0; } /* The accepted TCP client connection has been closed, release all memory * associated with the accept object. We can get here only if an error occurred * during the handshake or if raft_uv_transport->close() has been invoked. */ static void uvTcpIncomingCloseCb(struct uv_handle_s *handle) { struct uvTcpIncoming *incoming = handle->data; struct UvTcp *t = incoming->t; queue_remove(&incoming->queue); if (incoming->handshake.address.base != NULL) { RaftHeapFree(incoming->handshake.address.base); } RaftHeapFree(incoming->tcp); RaftHeapFree(incoming); UvTcpMaybeFireCloseCb(t); } /* Close an incoming TCP connection which hasn't completed the handshake yet. */ static void uvTcpIncomingAbort(struct uvTcpIncoming *incoming) { struct UvTcp *t = incoming->t; /* After uv_close() returns we are guaranteed that no more alloc_cb or * read_cb will be called. */ queue_remove(&incoming->queue); queue_insert_tail(&t->aborting, &incoming->queue); uv_close((struct uv_handle_s *)incoming->tcp, uvTcpIncomingCloseCb); } /* Read the address part of the handshake.
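 *
 * For reference, the complete handshake produced by uvTcpEncodeHandshake()
 * in uv_tcp_connect.c looks like this on the wire (a worked example with
 * server ID 1 and address "127.0.0.1:9001", whose strlen is 14, so the
 * address buffer is bytePad64(14 + 1) = 16 bytes):
 *
 *   offset  0   8 bytes   protocol version (1)
 *   offset  8   8 bytes   server ID        (1)
 *   offset 16   8 bytes   address length   (16)
 *   offset 24  16 bytes   "127.0.0.1:9001" plus NUL padding
 *
 * The first three words form the preamble decoded by uvTcpDecodePreamble()
 * above; the address bytes are then consumed by the callbacks below.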
*/ static void uvTcpIncomingAllocCbAddress(struct uv_handle_s *handle, size_t suggested_size, uv_buf_t *buf) { struct uvTcpIncoming *incoming = handle->data; (void)suggested_size; assert(!incoming->t->closing); buf->base = incoming->handshake.address.base + incoming->handshake.nread; buf->len = incoming->handshake.address.len - incoming->handshake.nread; } static void uvTcpIncomingReadCbAddress(uv_stream_t *stream, ssize_t nread, const uv_buf_t *buf) { struct uvTcpIncoming *incoming = stream->data; char *address; raft_id id; size_t n; int rv; (void)buf; assert(!incoming->t->closing); if (nread == 0) { /* Empty read just ignore it. */ return; } if (nread < 0) { uvTcpIncomingAbort(incoming); return; } /* We shouldn't have read more data than the pending amount. */ n = (size_t)nread; assert(n <= incoming->handshake.address.len - incoming->handshake.nread); /* Advance the read window */ incoming->handshake.nread += n; /* If there's more data to read in order to fill the current * read buffer, just return, we'll be invoked again. */ if (incoming->handshake.nread < incoming->handshake.address.len) { return; } /* If we have completed reading the address, let's fire the callback. */ rv = uv_read_stop(stream); assert(rv == 0); id = byteFlip64(incoming->handshake.preamble[1]); address = incoming->handshake.address.base; queue_remove(&incoming->queue); incoming->t->accept_cb(incoming->t->transport, id, address, (struct uv_stream_s *)incoming->tcp); RaftHeapFree(incoming->handshake.address.base); RaftHeapFree(incoming); } /* Read the preamble of the handshake. */ static void uvTcpIncomingAllocCbPreamble(struct uv_handle_s *handle, size_t suggested_size, uv_buf_t *buf) { struct uvTcpIncoming *incoming = handle->data; (void)suggested_size; buf->base = (char *)incoming->handshake.preamble + incoming->handshake.nread; buf->len = sizeof incoming->handshake.preamble - incoming->handshake.nread; } static void uvTcpIncomingReadCbPreamble(uv_stream_t *stream, ssize_t nread, const uv_buf_t *buf) { struct uvTcpIncoming *incoming = stream->data; size_t n; int rv; (void)buf; if (nread == 0) { /* Empty read just ignore it. */ return; } if (nread < 0) { uvTcpIncomingAbort(incoming); return; } /* We shouldn't have read more data than the pending amount. */ n = (size_t)nread; assert(n <= sizeof incoming->handshake.preamble - incoming->handshake.nread); /* Advance the read window */ incoming->handshake.nread += n; /* If there's more data to read in order to fill the current * read buffer, just return, we'll be invoked again. */ if (incoming->handshake.nread < sizeof incoming->handshake.preamble) { return; } /* If we have completed reading the preamble, let's parse it. */ rv = uvTcpDecodePreamble(&incoming->handshake); if (rv != 0) { uvTcpIncomingAbort(incoming); return; } rv = uv_read_stop(stream); assert(rv == 0); rv = uv_read_start((uv_stream_t *)incoming->tcp, uvTcpIncomingAllocCbAddress, uvTcpIncomingReadCbAddress); assert(rv == 0); } /* Start reading handshake data for a new incoming connection. 
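 *
 * A sketch of the accept-side sequence implemented in this file (arrows
 * are libuv callback hops, not direct calls):
 *
 *   uvTcpListenCb
 *     -> uvTcpIncomingStart: uv_accept() + start reading the preamble
 *     -> uvTcpIncomingReadCbPreamble: decode the preamble, switch buffers
 *     -> uvTcpIncomingReadCbAddress: accumulate the address bytes
 *     -> t->accept_cb(transport, id, address, stream)
 *
 * Partial reads are handled by the handshake.nread window: each alloc
 * callback hands libuv the still-unfilled tail of the current buffer.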
*/ static int uvTcpIncomingStart(struct uvTcpIncoming *incoming) { int rv; memset(&incoming->handshake, 0, sizeof incoming->handshake); incoming->tcp = RaftHeapMalloc(sizeof *incoming->tcp); if (incoming->tcp == NULL) { return RAFT_NOMEM; } incoming->tcp->data = incoming; rv = uv_tcp_init(incoming->t->loop, incoming->tcp); assert(rv == 0); rv = uv_accept((struct uv_stream_s *)incoming->listener, (struct uv_stream_s *)incoming->tcp); if (rv != 0) { rv = RAFT_IOERR; goto err_after_tcp_init; } rv = uv_read_start((uv_stream_t *)incoming->tcp, uvTcpIncomingAllocCbPreamble, uvTcpIncomingReadCbPreamble); assert(rv == 0); return 0; err_after_tcp_init: uv_close((uv_handle_t *)incoming->tcp, (uv_close_cb)RaftHeapFree); return rv; } #define IS_IN_ARRAY(elem, array, array_size) \ (const char *)(elem) >= (const char *)(array) && \ (const char *)(elem) < \ (const char *)(array) + array_size * sizeof(*array) /* Called when there's a new incoming connection: create a new uvTcpIncoming * object and start receiving handshake data. */ static void uvTcpListenCb(struct uv_stream_s *stream, int status) { struct UvTcp *t = stream->data; struct uvTcpIncoming *incoming; int rv; assert(IS_IN_ARRAY(stream, t->listeners, t->n_listeners)); if (status != 0) { rv = RAFT_IOERR; goto err; } incoming = RaftHeapMalloc(sizeof *incoming); if (incoming == NULL) { rv = RAFT_NOMEM; goto err; } incoming->t = t; incoming->listener = (struct uv_tcp_s *)stream; incoming->tcp = NULL; queue_insert_tail(&t->accepting, &incoming->queue); rv = uvTcpIncomingStart(incoming); if (rv != 0) { goto err_after_accept_alloc; } return; err_after_accept_alloc: queue_remove(&incoming->queue); RaftHeapFree(incoming); err: assert(rv != 0); } /* Do bind/listen call on the tcp handle */ static int uvTcpBindListen(struct uv_tcp_s *listener, struct sockaddr *addr) { if (uv_tcp_bind(listener, addr, 0) || uv_listen((uv_stream_t *)listener, 1, uvTcpListenCb)) { return RAFT_IOERR; } return 0; } /* Create a tcp handle and do bind/listen for each IP */ static int uvTcpListenOnMultipleIP(struct raft_uv_transport *transport, struct addrinfo *addr_infos) { struct UvTcp *t; struct addrinfo *current; unsigned n_listeners; int rv; t = transport->impl; n_listeners = 0; for (current = addr_infos; current; current = current->ai_next) { ++n_listeners; } current = addr_infos; t->listeners = raft_malloc(n_listeners * sizeof(*t->listeners)); if (!t->listeners) { rv = RAFT_NOMEM; goto err; } t->n_listeners = n_listeners; for (n_listeners = 0; n_listeners < t->n_listeners; ++n_listeners) { struct uv_tcp_s *listener = &t->listeners[n_listeners]; listener->data = t; if (uv_tcp_init(t->loop, listener) || uvTcpBindListen(listener, current->ai_addr)) { rv = RAFT_IOERR; goto err; } current = current->ai_next; } return 0; err: if (t->listeners) { for (unsigned i = 0; i <= n_listeners; ++i) { uv_close((struct uv_handle_s *)&t->listeners[i], NULL); } raft_free(t->listeners); t->listeners = NULL; t->n_listeners = 0; } return rv; } /* Ignore duplicate entries from glibc getaddrinfo due to * https://bugzilla.redhat.com/show_bug.cgi?id=496300 * in case of resolving localhost */ static bool uvIsAddressDuplication(struct addrinfo *addr_info) { struct addrinfo *next = addr_info->ai_next; /* Check if we have a list of length 2 */ if (!next || next->ai_next) { return false; } if (addr_info->ai_addrlen != next->ai_addrlen || bcmp(addr_info->ai_addr, next->ai_addr, addr_info->ai_addrlen)) { return false; } return true; } int UvTcpListen(struct raft_uv_transport *transport, raft_uv_accept_cb cb)
{ struct UvTcp *t; struct addrinfo *addr_infos; int rv; t = transport->impl; t->accept_cb = cb; if (t->bind_address == NULL) { rv = uvIpResolveBindAddresses(t->address, &addr_infos); } else { rv = uvIpResolveBindAddresses(t->bind_address, &addr_infos); } if (rv != 0 || !addr_infos) { return rv; } if (addr_infos->ai_next && uvIsAddressDuplication(addr_infos)) { rv = uvTcpListenOnMultipleIP(transport, addr_infos->ai_next); } else { rv = uvTcpListenOnMultipleIP(transport, addr_infos); } freeaddrinfo(addr_infos); return rv; } /* Close callback for uvTcp->listener. */ static void uvTcpListenCloseCbListener(struct uv_handle_s *handle) { struct UvTcp *t = handle->data; assert(t->closing); assert(t->n_listeners); assert(t->listeners); if (--t->n_listeners == 0) { raft_free(t->listeners); t->listeners = NULL; UvTcpMaybeFireCloseCb(t); } } void UvTcpListenClose(struct UvTcp *t) { queue *head; assert(t->closing); while (!queue_empty(&t->accepting)) { struct uvTcpIncoming *incoming; head = queue_head(&t->accepting); incoming = QUEUE_DATA(head, struct uvTcpIncoming, queue); uvTcpIncomingAbort(incoming); } if (t->n_listeners) { for (unsigned i = 0; i < t->n_listeners; ++i) { uv_close((struct uv_handle_s *)&t->listeners[i], uvTcpListenCloseCbListener); } } } dqlite-1.16.7/src/raft/uv_truncate.c000066400000000000000000000107521465252713400173340ustar00rootroot00000000000000#include #include #include "assert.h" #include "byte.h" #include "heap.h" #include "uv.h" #include "uv_encoding.h" /* Track a truncate request. */ struct uvTruncate { struct uv *uv; struct UvBarrierReq barrier; raft_index index; int status; }; /* Execute a truncate request in a thread. */ static void uvTruncateWorkCb(uv_work_t *work) { struct uvTruncate *truncate = work->data; struct uv *uv = truncate->uv; tracef("uv truncate work cb"); struct uvSnapshotInfo *snapshots; struct uvSegmentInfo *segments; struct uvSegmentInfo *segment; size_t n_snapshots; size_t n_segments; size_t i; size_t j; char errmsg[RAFT_ERRMSG_BUF_SIZE]; int rv; /* Load all segments on disk. */ rv = UvList(uv, &snapshots, &n_snapshots, &segments, &n_segments, errmsg); if (rv != 0) { goto err; } if (snapshots != NULL) { RaftHeapFree(snapshots); } assert(segments != NULL); /* Find the segment that contains the truncate point. */ segment = NULL; /* Suppress warnings. */ for (i = 0; i < n_segments; i++) { segment = &segments[i]; if (segment->is_open) { continue; } if (truncate->index >= segment->first_index && truncate->index <= segment->end_index) { break; } } assert(i < n_segments); /* If the truncate index is not the first of the segment, we need to * truncate it. */ if (truncate->index > segment->first_index) { rv = uvSegmentTruncate(uv, segment, truncate->index); if (rv != 0) { goto err_after_list; } } /* Remove all closed segments past the one containing the truncate * index. 
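 *
 * A worked example: with closed segments covering entries 1-100, 101-200
 * and 201-300 on disk, truncating at index 150 first has uvSegmentTruncate()
 * above preserve entries 101-149 (by writing them out into a fresh segment
 * file), after which the loop below unlinks the stale 101-200 file and the
 * whole 201-300 segment; the 1-100 segment is left untouched.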
*/ for (j = i; j < n_segments; j++) { segment = &segments[j]; if (segment->is_open) { continue; } rv = UvFsRemoveFile(uv->dir, segment->filename, errmsg); if (rv != 0) { tracef("unlink segment %s: %s", segment->filename, errmsg); rv = RAFT_IOERR; goto err_after_list; } } rv = UvFsSyncDir(uv->dir, errmsg); if (rv != 0) { tracef("sync data directory: %s", errmsg); rv = RAFT_IOERR; goto err_after_list; } RaftHeapFree(segments); truncate->status = 0; tracef("uv truncate work cb ok"); return; err_after_list: RaftHeapFree(segments); err: assert(rv != 0); truncate->status = rv; } static void uvTruncateAfterWorkCb(uv_work_t *work, int status) { assert(work != NULL); struct uvTruncate *truncate = work->data; assert(truncate != NULL); struct uv *uv = truncate->uv; assert(uv != NULL); tracef("uv truncate after work cb status:%d", status); assert(status == 0); if (truncate->status != 0) { uv->errored = true; } tracef("clear truncate work"); uv->truncate_work.data = NULL; RaftHeapFree(truncate); UvUnblock(uv); } static void uvTruncateBarrierCb(struct UvBarrierReq *barrier) { struct uvTruncate *truncate = barrier->data; struct uv *uv = truncate->uv; tracef("uv truncate barrier cb"); int rv; /* Ensure that we don't invoke this callback more than once. */ barrier->cb = NULL; /* If we're closing, don't perform truncation at all and abort here. */ if (uv->closing) { tracef("closing => don't truncate"); RaftHeapFree(truncate); uvMaybeFireCloseCb(uv); return; } assert(queue_empty(&uv->append_writing_reqs)); assert(queue_empty(&uv->finalize_reqs)); assert(uv->finalize_work.data == NULL); assert(uv->truncate_work.data == NULL); tracef("set truncate work"); uv->truncate_work.data = truncate; rv = uv_queue_work(uv->loop, &uv->truncate_work, uvTruncateWorkCb, uvTruncateAfterWorkCb); if (rv != 0) { tracef("truncate index %lld: %s", truncate->index, uv_strerror(rv)); tracef("clear truncate work"); uv->truncate_work.data = NULL; uv->errored = true; } } int UvTruncate(struct raft_io *io, raft_index index) { struct uv *uv; struct uvTruncate *truncate; int rv; uv = io->impl; tracef("uv truncate %llu", index); assert(!uv->closing); /* We should truncate only entries that we were requested to append in * the first place. */ assert(index > 0); assert(index < uv->append_next_index); truncate = RaftHeapMalloc(sizeof *truncate); if (truncate == NULL) { rv = RAFT_NOMEM; goto err; } truncate->uv = uv; truncate->index = index; truncate->barrier.data = truncate; truncate->barrier.blocking = true; truncate->barrier.cb = uvTruncateBarrierCb; /* Make sure that we wait for any inflight writes to finish and then * close the current segment. 
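 *
 * The overall sequence is (a sketch; every step runs on the loop thread
 * unless noted otherwise):
 *
 *   UvTruncate -> UvBarrier (waits for inflight appends to finish)
 *     -> uvTruncateBarrierCb -> uv_queue_work
 *       -> uvTruncateWorkCb (threadpool: rewrite/unlink segments, sync dir)
 *         -> uvTruncateAfterWorkCb -> UvUnblock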
*/ rv = UvBarrier(uv, index, &truncate->barrier); if (rv != 0) { goto err_after_req_alloc; } return 0; err_after_req_alloc: RaftHeapFree(truncate); err: assert(rv != 0); return rv; } #undef tracef dqlite-1.16.7/src/raft/uv_work.c000066400000000000000000000027061465252713400164710ustar00rootroot00000000000000#include "assert.h" #include "heap.h" #include "uv.h" struct uvAsyncWork { struct uv *uv; struct raft_io_async_work *req; struct uv_work_s work; int status; queue queue; }; static void uvAsyncWorkCb(uv_work_t *work) { struct uvAsyncWork *w = work->data; assert(w != NULL); int rv; rv = w->req->work(w->req); w->status = rv; } static void uvAsyncAfterWorkCb(uv_work_t *work, int status) { struct uvAsyncWork *w = work->data; struct raft_io_async_work *req = w->req; int req_status = w->status; struct uv *uv = w->uv; assert(status == 0); queue_remove(&w->queue); RaftHeapFree(w); req->cb(req, req_status); uvMaybeFireCloseCb(uv); } int UvAsyncWork(struct raft_io *io, struct raft_io_async_work *req, raft_io_async_work_cb cb) { struct uv *uv; struct uvAsyncWork *async_work; int rv; uv = io->impl; assert(!uv->closing); async_work = RaftHeapMalloc(sizeof *async_work); if (async_work == NULL) { rv = RAFT_NOMEM; goto err; } async_work->uv = uv; async_work->req = req; async_work->work.data = async_work; req->cb = cb; queue_insert_tail(&uv->async_work_reqs, &async_work->queue); rv = uv_queue_work(uv->loop, &async_work->work, uvAsyncWorkCb, uvAsyncAfterWorkCb); if (rv != 0) { queue_remove(&async_work->queue); tracef("async work: %s", uv_strerror(rv)); rv = RAFT_IOERR; goto err_after_req_alloc; } return 0; err_after_req_alloc: RaftHeapFree(async_work); err: assert(rv != 0); return rv; } #undef tracef dqlite-1.16.7/src/raft/uv_writer.c000066400000000000000000000326611465252713400170260ustar00rootroot00000000000000#include "uv_writer.h" #include #include #include "../raft.h" #include "assert.h" #include "heap.h" /* Copy the error message from the request object to the writer object. */ static void uvWriterReqTransferErrMsg(struct UvWriterReq *req) { ErrMsgPrintf(req->writer->errmsg, "%s", req->errmsg); } /* Set the request status according the given result code. */ static void uvWriterReqSetStatus(struct UvWriterReq *req, int result) { if (result < 0) { ErrMsgPrintf(req->errmsg, "write failed: %d", result); req->status = RAFT_IOERR; } else if ((size_t)result < req->len) { ErrMsgPrintf(req->errmsg, "short write: %d bytes instead of %zu", result, req->len); req->status = RAFT_NOSPACE; } else { req->status = 0; } } /* Remove the request from the queue of inflight writes and invoke the request * callback if set. */ static void uvWriterReqFinish(struct UvWriterReq *req) { queue_remove(&req->queue); if (req->status != 0) { uvWriterReqTransferErrMsg(req); } req->cb(req, req->status); } /* Wrapper around the low-level OS syscall, providing a better error message. */ static int uvWriterIoSetup(unsigned n, aio_context_t *ctx, char *errmsg) { int rv; rv = UvOsIoSetup(n, ctx); if (rv != 0) { switch (rv) { case UV_EAGAIN: ErrMsgPrintf(errmsg, "AIO events user limit exceeded"); rv = RAFT_TOOMANY; break; default: UvOsErrMsg(errmsg, "io_setup", rv); rv = RAFT_IOERR; break; } return rv; } return 0; } /* Run blocking syscalls involved in a file write request. * * Perform a KAIO write request and synchronously wait for it to complete. 
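 *
 * In outline, the blocking submission below boils down to this sequence
 * (a simplified sketch; the real code reuses w->ctx when only one
 * concurrent write is allowed, and maps failures to raft error codes):
 *
 *   aio_context_t ctx = 0;
 *   struct iocb *list[1] = { &req->iocb };
 *   struct io_event event;
 *   UvOsIoSetup(1, &ctx);                      // io_setup(2)
 *   UvOsIoSubmit(ctx, 1, list);                // io_submit(2)
 *   UvOsIoGetevents(ctx, 1, 1, &event, NULL);  // blocks until completion
 *   UvOsIoDestroy(ctx);                        // io_destroy(2)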
*/ static void uvWriterWorkCb(uv_work_t *work) { struct UvWriterReq *req; /* Writer request object */ struct UvWriter *w; /* Writer object */ aio_context_t ctx; /* KAIO handle */ struct iocb *iocbs; /* Pointer to KAIO request object */ struct io_event event; /* KAIO response object */ int n_events; int rv; req = work->data; w = req->writer; iocbs = &req->iocb; /* If more than one write in parallel is allowed, submit the AIO request * using a dedicated context, to avoid synchronization issues between * threads when multiple writes are submitted in parallel. This is * suboptimal, but in the real world users should use file systems and * kernels with proper async write support. */ if (w->n_events > 1) { ctx = 0; rv = uvWriterIoSetup(1 /* Maximum concurrent requests */, &ctx, req->errmsg); if (rv != 0) { goto out; } } else { ctx = w->ctx; } /* Submit the request */ rv = UvOsIoSubmit(ctx, 1, &iocbs); if (rv != 0) { /* UNTESTED: since we're not using NOWAIT and the parameters are * valid, this shouldn't fail. */ UvOsErrMsg(req->errmsg, "io_submit", rv); rv = RAFT_IOERR; goto out_after_io_setup; } /* Wait for the request to complete */ n_events = UvOsIoGetevents(ctx, 1, 1, &event, NULL); assert(n_events == 1); if (n_events != 1) { /* UNTESTED */ rv = n_events >= 0 ? -1 : n_events; } out_after_io_setup: if (w->n_events > 1) { UvOsIoDestroy(ctx); } out: if (rv != 0) { req->status = rv; } else { uvWriterReqSetStatus(req, (int)event.res); } return; } /* Callback run after uvWriterWorkCb has returned. It normally invokes the * write request callback. */ static void uvWriterAfterWorkCb(uv_work_t *work, int status) { struct UvWriterReq *req = work->data; /* Write file request object */ assert(status == 0); /* We don't cancel worker requests */ uvWriterReqFinish(req); } /* Callback fired when the event fd associated with AIO write requests should be * ready for reading (i.e. when a write has completed). */ static void uvWriterPollCb(uv_poll_t *poller, int status, int events) { struct UvWriter *w = poller->data; uint64_t completed; /* Value read from the eventfd counter */ unsigned i; int n_events; int rv; assert(w->event_fd >= 0); assert(status == 0); if (status != 0) { /* UNTESTED libuv docs: If an error happens while polling, * status will be < 0 and corresponds with one of the UV_E* * error codes. */ goto fail_requests; } assert(events & UV_READABLE); /* Read the event file descriptor */ rv = (int)read(w->event_fd, &completed, sizeof completed); if (rv != sizeof completed) { /* UNTESTED: According to eventfd(2) this is the only possible * failure mode, meaning that epoll has indicated that the event * FD is not yet ready. */ assert(errno == EAGAIN); return; } /* TODO: this assertion fails in unit tests */ /* assert(completed == 1); */ /* Try to fetch the write responses. * * If we got here at least one write should have completed and * io_getevents should return immediately without blocking. */ n_events = UvOsIoGetevents(w->ctx, 1, (long int)w->n_events, w->events, NULL); assert(n_events >= 1); if (n_events < 1) { /* UNTESTED */ status = n_events == 0 ? -1 : n_events; goto fail_requests; } for (i = 0; i < (unsigned)n_events; i++) { struct io_event *event = &w->events[i]; struct UvWriterReq *req = *((void **)&event->data); /* If we got EAGAIN, it means it was not possible to perform the * write asynchronously (with RWF_NOWAIT set, the kernel reports * EAGAIN in the event's res field instead of blocking), so let's * fall back to the threadpool.
*/ if (event->res == -EAGAIN) { req->iocb.aio_flags &= (unsigned)~IOCB_FLAG_RESFD; req->iocb.aio_resfd = 0; req->iocb.aio_rw_flags &= ~RWF_NOWAIT; assert(req->work.data == NULL); req->work.data = req; rv = uv_queue_work(w->loop, &req->work, uvWriterWorkCb, uvWriterAfterWorkCb); if (rv != 0) { /* UNTESTED: with the current libuv * implementation this should never fail. */ UvOsErrMsg(req->errmsg, "uv_queue_work", rv); req->status = RAFT_IOERR; goto finish; } continue; } uvWriterReqSetStatus(req, (int)event->res); finish: uvWriterReqFinish(req); } return; fail_requests: while (!queue_empty(&w->poll_queue)) { queue *head; struct UvWriterReq *req; head = queue_head(&w->poll_queue); req = QUEUE_DATA(head, struct UvWriterReq, queue); uvWriterReqSetStatus(req, status); uvWriterReqFinish(req); } } int UvWriterInit(struct UvWriter *w, struct uv_loop_s *loop, uv_file fd, bool direct /* Whether to use direct I/O */, bool async /* Whether async I/O is available */, unsigned max_concurrent_writes, char *errmsg) { void *data = w->data; int rv = 0; memset(w, 0, sizeof *w); w->data = data; w->loop = loop; w->fd = fd; w->async = async; w->ctx = 0; w->events = NULL; w->n_events = max_concurrent_writes; w->event_fd = -1; w->event_poller.data = NULL; w->check.data = NULL; w->close_cb = NULL; queue_init(&w->poll_queue); queue_init(&w->work_queue); w->closing = false; w->errmsg = errmsg; /* Set direct I/O if available. */ if (direct) { rv = UvOsSetDirectIo(w->fd); if (rv != 0) { UvOsErrMsg(errmsg, "fcntl", rv); goto err; } } /* Setup the AIO context. */ rv = uvWriterIoSetup(w->n_events, &w->ctx, errmsg); if (rv != 0) { goto err; } /* Initialize the array of re-usable event objects. */ w->events = RaftHeapCalloc(w->n_events, sizeof *w->events); if (w->events == NULL) { /* UNTESTED: todo */ ErrMsgOom(errmsg); rv = RAFT_NOMEM; goto err_after_io_setup; } /* Create an event file descriptor to get notified when a write has * completed. */ rv = UvOsEventfd(0, UV_FS_O_NONBLOCK); if (rv < 0) { /* UNTESTED: should fail only with ENOMEM */ UvOsErrMsg(errmsg, "eventfd", rv); rv = RAFT_IOERR; goto err_after_events_alloc; } w->event_fd = rv; rv = uv_poll_init(loop, &w->event_poller, w->event_fd); if (rv != 0) { /* UNTESTED: with the current libuv implementation this should * never fail. */ UvOsErrMsg(errmsg, "uv_poll_init", rv); rv = RAFT_IOERR; goto err_after_event_fd; } w->event_poller.data = w; rv = uv_check_init(loop, &w->check); if (rv != 0) { /* UNTESTED: with the current libuv implementation this should * never fail. */ UvOsErrMsg(errmsg, "uv_check_init", rv); rv = RAFT_IOERR; goto err_after_event_fd; } w->check.data = w; rv = uv_poll_start(&w->event_poller, UV_READABLE, uvWriterPollCb); if (rv != 0) { /* UNTESTED: with the current libuv implementation this should * never fail. */ UvOsErrMsg(errmsg, "uv_poll_start", rv); rv = RAFT_IOERR; goto err_after_event_fd; } return 0; err_after_event_fd: UvOsClose(w->event_fd); err_after_events_alloc: RaftHeapFree(w->events); err_after_io_setup: UvOsIoDestroy(w->ctx); err: assert(rv != 0); return rv; } static void uvWriterCleanUpAndFireCloseCb(struct UvWriter *w) { assert(w->closing); UvOsClose(w->fd); RaftHeapFree(w->events); UvOsIoDestroy(w->ctx); if (w->close_cb != NULL) { w->close_cb(w); } } static void uvWriterPollerCloseCb(struct uv_handle_s *handle) { struct UvWriter *w = handle->data; w->event_poller.data = NULL; /* Cancel all pending requests. 
*/ while (!queue_empty(&w->poll_queue)) { queue *head; struct UvWriterReq *req; head = queue_head(&w->poll_queue); req = QUEUE_DATA(head, struct UvWriterReq, queue); assert(req->work.data == NULL); req->status = RAFT_CANCELED; uvWriterReqFinish(req); } if (w->check.data != NULL) { return; } uvWriterCleanUpAndFireCloseCb(w); } static void uvWriterCheckCloseCb(struct uv_handle_s *handle) { struct UvWriter *w = handle->data; w->check.data = NULL; if (w->event_poller.data != NULL) { return; } uvWriterCleanUpAndFireCloseCb(w); } static void uvWriterCheckCb(struct uv_check_s *check) { struct UvWriter *w = check->data; if (!queue_empty(&w->work_queue)) { return; } uv_close((struct uv_handle_s *)&w->check, uvWriterCheckCloseCb); } void UvWriterClose(struct UvWriter *w, UvWriterCloseCb cb) { int rv; assert(!w->closing); w->closing = true; w->close_cb = cb; /* We can close the event file descriptor right away, but we shouldn't * close the main file descriptor or destroy the AIO context since there * might be threadpool requests in flight. */ UvOsClose(w->event_fd); rv = uv_poll_stop(&w->event_poller); assert(rv == 0); /* Can this ever fail? */ uv_close((struct uv_handle_s *)&w->event_poller, uvWriterPollerCloseCb); /* If we have requests executing in the threadpool, we need to wait for * them. That's done in the check callback. */ if (!queue_empty(&w->work_queue)) { uv_check_start(&w->check, uvWriterCheckCb); } else { uv_close((struct uv_handle_s *)&w->check, uvWriterCheckCloseCb); } } /* Return the total lengths of the given buffers. */ static size_t lenOfBufs(const uv_buf_t bufs[], unsigned n) { size_t len = 0; unsigned i; for (i = 0; i < n; i++) { len += bufs[i].len; } return len; } int UvWriterSubmit(struct UvWriter *w, struct UvWriterReq *req, const uv_buf_t bufs[], unsigned n, size_t offset, UvWriterReqCb cb) { int rv = 0; struct iocb *iocbs = &req->iocb; assert(!w->closing); /* TODO: at the moment we are not leveraging the support for concurrent * writes, so ensure that we're getting write requests * sequentially. */ if (w->n_events == 1) { assert(queue_empty(&w->poll_queue)); assert(queue_empty(&w->work_queue)); } assert(w->fd >= 0); assert(w->event_fd >= 0); assert(w->ctx != 0); assert(req != NULL); assert(bufs != NULL); assert(n > 0); req->writer = w; req->len = lenOfBufs(bufs, n); req->status = -1; req->work.data = NULL; req->cb = cb; memset(&req->iocb, 0, sizeof req->iocb); memset(req->errmsg, 0, sizeof req->errmsg); req->iocb.aio_fildes = (uint32_t)w->fd; req->iocb.aio_lio_opcode = IOCB_CMD_PWRITEV; req->iocb.aio_reqprio = 0; *((void **)(&req->iocb.aio_buf)) = (void *)bufs; req->iocb.aio_nbytes = n; req->iocb.aio_offset = (int64_t)offset; *((void **)(&req->iocb.aio_data)) = (void *)req; #if defined(RWF_HIPRI) /* High priority request, if possible */ /* TODO: do proper kernel feature detection for this one. */ /* req->iocb.aio_rw_flags |= RWF_HIPRI; */ #endif #if defined(RWF_DSYNC) /* Use per-request synchronous I/O if available. Otherwise, we have * opened the file with O_DSYNC. */ /* TODO: do proper kernel feature detection for this one. */ /* req->iocb.aio_rw_flags |= RWF_DSYNC; */ #endif /* If io_submit can be run in a 100% non-blocking way, we'll try to * write without using the threadpool. 
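 *
 * Decision sketch for the two submission paths below:
 *
 *   w->async?
 *     yes -> tag the iocb with IOCB_FLAG_RESFD + RWF_NOWAIT and io_submit
 *            from the loop thread; completion wakes event_fd and is
 *            handled in uvWriterPollCb.
 *            io_submit returns UV_EAGAIN? -> strip the flags, fall through.
 *     no  -> queue uvWriterWorkCb on the threadpool, where io_submit and
 *            io_getevents run blocking.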
*/ if (w->async) { req->iocb.aio_flags |= IOCB_FLAG_RESFD; req->iocb.aio_resfd = (uint32_t)w->event_fd; req->iocb.aio_rw_flags |= RWF_NOWAIT; } /* Try to submit the write request asynchronously */ if (w->async) { queue_insert_tail(&w->poll_queue, &req->queue); rv = UvOsIoSubmit(w->ctx, 1, &iocbs); /* If no error occurred, we're done, the write request was * submitted. */ if (rv == 0) { goto done; } queue_remove(&req->queue); /* Check the reason of the error. */ switch (rv) { case UV_EAGAIN: break; default: /* Unexpected error */ UvOsErrMsg(w->errmsg, "io_submit", rv); rv = RAFT_IOERR; goto err; } /* Submitting the write would block, or NOWAIT is not * supported. Let's run this request in the threadpool. */ req->iocb.aio_flags &= (unsigned)~IOCB_FLAG_RESFD; req->iocb.aio_resfd = 0; req->iocb.aio_rw_flags &= ~RWF_NOWAIT; } /* If we got here it means we need to run io_submit in the threadpool. */ queue_insert_tail(&w->work_queue, &req->queue); req->work.data = req; rv = uv_queue_work(w->loop, &req->work, uvWriterWorkCb, uvWriterAfterWorkCb); if (rv != 0) { /* UNTESTED: with the current libuv implementation this can't * fail. */ req->work.data = NULL; queue_remove(&req->queue); UvOsErrMsg(w->errmsg, "uv_queue_work", rv); rv = RAFT_IOERR; goto err; } done: return 0; err: assert(rv != 0); return rv; } dqlite-1.16.7/src/raft/uv_writer.h000066400000000000000000000051231465252713400170240ustar00rootroot00000000000000/* Asynchronous API to write a file. */ #ifndef UV_WRITER_H_ #define UV_WRITER_H_ #include #include "err.h" #include "../lib/queue.h" #include "uv_os.h" /* Perform asynchronous writes to a single file. */ struct UvWriter; /* Callback called after the memory associated with a file handle can be * released. */ typedef void (*UvWriterCloseCb)(struct UvWriter *w); struct UvWriter { void *data; /* User data */ struct uv_loop_s *loop; /* Event loop */ uv_file fd; /* File handle */ bool async; /* Whether fully async I/O is supported */ aio_context_t ctx; /* KAIO handle */ struct io_event *events; /* Array of KAIO response objects */ unsigned n_events; /* Length of the events array */ int event_fd; /* Poll'ed to check if write is finished */ struct uv_poll_s event_poller; /* Poll event_fd for completed poll requests */ struct uv_check_s check; /* Check for completed threadpool requests */ UvWriterCloseCb close_cb; /* Close callback */ queue poll_queue; /* Pollable write requests */ queue work_queue; /* Threadpool write requests */ bool closing; /* Whether we're closing or closed */ char *errmsg; /* Description of last error */ }; /* Initialize a file writer. */ int UvWriterInit(struct UvWriter *w, struct uv_loop_s *loop, uv_file fd, bool direct /* Whether to use direct I/O */, bool async /* Whether async I/O is available */, unsigned max_concurrent_writes, char *errmsg); /* Close the given file and release all associated resources. */ void UvWriterClose(struct UvWriter *w, UvWriterCloseCb cb); /* Write request. */ struct UvWriterReq; /* Callback called after a write request has been completed. 
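 *
 * A hypothetical usage sketch of this API (error handling omitted; the
 * fd, loop and data names are made up for illustration):
 *
 *   static void writeCb(struct UvWriterReq *req, int status)
 *   {
 *       assert(status == 0);
 *   }
 *
 *   struct UvWriter writer;
 *   struct UvWriterReq req;
 *   uv_buf_t buf = { .base = data, .len = 4096 };
 *   char errmsg[256];
 *   UvWriterInit(&writer, loop, fd, false, true, 1, errmsg);
 *   UvWriterSubmit(&writer, &req, &buf, 1, 0, writeCb);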
*/ typedef void (*UvWriterReqCb)(struct UvWriterReq *req, int status); struct UvWriterReq { void *data; /* User data */ struct UvWriter *writer; /* Originating writer */ size_t len; /* Total number of bytes to write */ int status; /* Request result code */ struct uv_work_s work; /* To execute logic in the threadpool */ UvWriterReqCb cb; /* Callback to invoke upon request completion */ struct iocb iocb; /* KAIO request (for writing) */ char errmsg[256]; /* Error description (for thread-safety) */ queue queue; /* Prev/next links in the inflight queue */ }; /* Asynchronously write data to the underlying file. */ int UvWriterSubmit(struct UvWriter *w, struct UvWriterReq *req, const uv_buf_t bufs[], unsigned n, size_t offset, UvWriterReqCb cb); #endif /* UV_WRITER_H_ */ dqlite-1.16.7/src/registry.c000066400000000000000000000015521465252713400157070ustar00rootroot00000000000000#include #include "../include/dqlite.h" #include "lib/assert.h" #include "registry.h" void registry__init(struct registry *r, struct config *config) { r->config = config; queue_init(&r->dbs); } void registry__close(struct registry *r) { while (!queue_empty(&r->dbs)) { struct db *db; queue *head; head = queue_head(&r->dbs); queue_remove(head); db = QUEUE_DATA(head, struct db, queue); db__close(db); sqlite3_free(db); } } int registry__db_get(struct registry *r, const char *filename, struct db **db) { queue *head; QUEUE_FOREACH(head, &r->dbs) { *db = QUEUE_DATA(head, struct db, queue); if (strcmp((*db)->filename, filename) == 0) { return 0; } } *db = sqlite3_malloc(sizeof **db); if (*db == NULL) { return DQLITE_NOMEM; } db__init(*db, r->config, filename); queue_insert_tail(&r->dbs, &(*db)->queue); return 0; } dqlite-1.16.7/src/registry.h000066400000000000000000000007401465252713400157120ustar00rootroot00000000000000#ifndef REGISTRY_H_ #define REGISTRY_H_ #include #include #include "lib/queue.h" #include "db.h" struct registry { struct config *config; queue dbs; }; void registry__init(struct registry *r, struct config *config); void registry__close(struct registry *r); /** * Get the db with the given filename. If no one is registered, create one. */ int registry__db_get(struct registry *r, const char *filename, struct db **db); #endif /* REGISTRY_H_*/ dqlite-1.16.7/src/request.c000066400000000000000000000004231465252713400155230ustar00rootroot00000000000000#include "request.h" #define REQUEST__IMPLEMENT(LOWER, UPPER, _) \ SERIALIZE__IMPLEMENT(request_##LOWER, REQUEST_##UPPER); REQUEST__TYPES(REQUEST__IMPLEMENT, ); SERIALIZE__IMPLEMENT(request_connect, REQUEST_CONNECT); SERIALIZE__IMPLEMENT(request_assign, REQUEST_ASSIGN); dqlite-1.16.7/src/request.h000066400000000000000000000071161465252713400155360ustar00rootroot00000000000000#ifndef REQUEST_H_ #define REQUEST_H_ #include "lib/serialize.h" /** * Request types. */ #define REQUEST_LEADER(X, ...) X(uint64, __unused__, ##__VA_ARGS__) #define REQUEST_CLIENT(X, ...) X(uint64, id, ##__VA_ARGS__) #define REQUEST_OPEN(X, ...) \ X(text, filename, ##__VA_ARGS__) \ X(uint64, flags, ##__VA_ARGS__) \ X(text, vfs, ##__VA_ARGS__) #define REQUEST_PREPARE(X, ...) \ X(uint64, db_id, ##__VA_ARGS__) \ X(text, sql, ##__VA_ARGS__) #define REQUEST_EXEC(X, ...) \ X(uint32, db_id, ##__VA_ARGS__) \ X(uint32, stmt_id, ##__VA_ARGS__) #define REQUEST_QUERY(X, ...) \ X(uint32, db_id, ##__VA_ARGS__) \ X(uint32, stmt_id, ##__VA_ARGS__) #define REQUEST_FINALIZE(X, ...) \ X(uint32, db_id, ##__VA_ARGS__) \ X(uint32, stmt_id, ##__VA_ARGS__) #define REQUEST_EXEC_SQL(X, ...) 
\ X(uint64, db_id, ##__VA_ARGS__) \ X(text, sql, ##__VA_ARGS__) #define REQUEST_QUERY_SQL(X, ...) \ X(uint64, db_id, ##__VA_ARGS__) \ X(text, sql, ##__VA_ARGS__) #define REQUEST_INTERRUPT(X, ...) X(uint64, db_id, ##__VA_ARGS__) #define REQUEST_ADD(X, ...) \ X(uint64, id, ##__VA_ARGS__) \ X(text, address, ##__VA_ARGS__) #define REQUEST_PROMOTE_OR_ASSIGN(X, ...) X(uint64, id, ##__VA_ARGS__) #define REQUEST_REMOVE(X, ...) X(uint64, id, ##__VA_ARGS__) #define REQUEST_DUMP(X, ...) X(text, filename, ##__VA_ARGS__) #define REQUEST_CLUSTER(X, ...) X(uint64, format, ##__VA_ARGS__) #define REQUEST_TRANSFER(X, ...) X(uint64, id, ##__VA_ARGS__) #define REQUEST_DESCRIBE(X, ...) X(uint64, format, ##__VA_ARGS__) #define REQUEST_WEIGHT(X, ...) X(uint64, weight, ##__VA_ARGS__) #define REQUEST__DEFINE(LOWER, UPPER, _) \ SERIALIZE__DEFINE(request_##LOWER, REQUEST_##UPPER); #define REQUEST__TYPES(X, ...) \ X(leader, LEADER, __VA_ARGS__) \ X(client, CLIENT, __VA_ARGS__) \ X(open, OPEN, __VA_ARGS__) \ X(prepare, PREPARE, __VA_ARGS__) \ X(exec, EXEC, __VA_ARGS__) \ X(query, QUERY, __VA_ARGS__) \ X(finalize, FINALIZE, __VA_ARGS__) \ X(exec_sql, EXEC_SQL, __VA_ARGS__) \ X(query_sql, QUERY_SQL, __VA_ARGS__) \ X(interrupt, INTERRUPT, __VA_ARGS__) \ X(add, ADD, __VA_ARGS__) \ X(promote_or_assign, PROMOTE_OR_ASSIGN, __VA_ARGS__) \ X(remove, REMOVE, __VA_ARGS__) \ X(dump, DUMP, __VA_ARGS__) \ X(cluster, CLUSTER, __VA_ARGS__) \ X(transfer, TRANSFER, __VA_ARGS__) \ X(describe, DESCRIBE, __VA_ARGS__) \ X(weight, WEIGHT, __VA_ARGS__) REQUEST__TYPES(REQUEST__DEFINE); #define REQUEST_CONNECT(X, ...) \ X(uint64, id, ##__VA_ARGS__) \ X(text, address, ##__VA_ARGS__) SERIALIZE__DEFINE(request_connect, REQUEST_CONNECT); /* Definition of the ASSIGN request that's used only for serialization. * * The one-field PROMOTE request and the two-field ASSIGN request have the * same type tag, so we can't dispatch deserialization based on that field. * Instead, we deserialize as the least-common-denominator PROMOTE_OR_ASSIGN * and then manually read the second field if appropriate. * * But when serializing, we can just decide to send an ASSIGN request, so * we provide the message definition here for that purpose. */ #define REQUEST_ASSIGN(X, ...) \ X(uint64, id, ##__VA_ARGS__) \ X(uint64, role, ##__VA_ARGS__) SERIALIZE__DEFINE(request_assign, REQUEST_ASSIGN); #endif /* REQUEST_H_ */ dqlite-1.16.7/src/response.c000066400000000000000000000002521465252713400156710ustar00rootroot00000000000000#include "response.h" #define RESPONSE__IMPLEMENT(LOWER, UPPER, _) \ SERIALIZE__IMPLEMENT(response_##LOWER, RESPONSE_##UPPER); RESPONSE__TYPES(RESPONSE__IMPLEMENT, ); dqlite-1.16.7/src/response.h000066400000000000000000000044201465252713400156770ustar00rootroot00000000000000#ifndef RESPONSE_H_ #define RESPONSE_H_ #include "lib/serialize.h" /** * Response types. */ #define RESPONSE_SERVER(X, ...) \ X(uint64, id, ##__VA_ARGS__) \ X(text, address, ##__VA_ARGS__) #define RESPONSE_SERVER_LEGACY(X, ...) X(text, address, ##__VA_ARGS__) #define RESPONSE_WELCOME(X, ...) X(uint64, heartbeat_timeout, ##__VA_ARGS__) #define RESPONSE_FAILURE(X, ...) \ X(uint64, code, ##__VA_ARGS__) \ X(text, message, ##__VA_ARGS__) #define RESPONSE_DB(X, ...) \ X(uint32, id, ##__VA_ARGS__) \ X(uint32, __pad__, ##__VA_ARGS__) #define RESPONSE_STMT(X, ...) \ X(uint32, db_id, ##__VA_ARGS__) \ X(uint32, id, ##__VA_ARGS__) \ X(uint64, params, ##__VA_ARGS__) #define RESPONSE_STMT_WITH_OFFSET(X, ...) 
\ X(uint32, db_id, ##__VA_ARGS__) \ X(uint32, id, ##__VA_ARGS__) \ X(uint64, params, ##__VA_ARGS__) \ X(uint64, offset, ##__VA_ARGS__) #define RESPONSE_RESULT(X, ...) \ X(uint64, last_insert_id, ##__VA_ARGS__) \ X(uint64, rows_affected, ##__VA_ARGS__) #define RESPONSE_ROWS(X, ...) X(uint64, eof, ##__VA_ARGS__) #define RESPONSE_EMPTY(X, ...) X(uint64, __unused__, ##__VA_ARGS__) #define RESPONSE_FILES(X, ...) X(uint64, n, ##__VA_ARGS__) #define RESPONSE_SERVERS(X, ...) X(uint64, n, ##__VA_ARGS__) #define RESPONSE_METADATA(X, ...) \ X(uint64, failure_domain, ##__VA_ARGS__) \ X(uint64, weight, ##__VA_ARGS__) #define RESPONSE__DEFINE(LOWER, UPPER, _) \ SERIALIZE__DEFINE(response_##LOWER, RESPONSE_##UPPER); #define RESPONSE__TYPES(X, ...) \ X(server, SERVER, __VA_ARGS__) \ X(server_legacy, SERVER_LEGACY, __VA_ARGS__) \ X(welcome, WELCOME, __VA_ARGS__) \ X(failure, FAILURE, __VA_ARGS__) \ X(db, DB, __VA_ARGS__) \ X(stmt, STMT, __VA_ARGS__) \ X(stmt_with_offset, STMT_WITH_OFFSET, __VA_ARGS__) \ X(result, RESULT, __VA_ARGS__) \ X(rows, ROWS, __VA_ARGS__) \ X(empty, EMPTY, __VA_ARGS__) \ X(files, FILES, __VA_ARGS__) \ X(servers, SERVERS, __VA_ARGS__) \ X(metadata, METADATA, __VA_ARGS__) RESPONSE__TYPES(RESPONSE__DEFINE); #endif /* RESPONSE_H_ */ dqlite-1.16.7/src/roles.c000066400000000000000000000462121465252713400151650ustar00rootroot00000000000000#include <stdlib.h> #include "client/protocol.h" #include "lib/queue.h" #include "raft.h" #include "roles.h" #include "server.h" #include "translate.h" /* Overview * -------- * * This file implements automatic role management for dqlite servers. When * automatic role management is enabled, servers in a dqlite cluster will * autonomously (without client intervention) promote and demote each other * to maintain a specified number of voters and standbys, taking into account * the health, failure domain, and weight of each server. * * We implement two ingredients of role management: adjustments and handovers. * Adjustment runs on the cluster leader every tick (the frequency is defined * in server.c). The first step is to "poll" every server in the cluster to find * out whether it's online, and if so, its failure domain and weight. It demotes * to spare any servers that appear to have gone offline, then, if the numbers * of (online) voters and standbys don't match the target values, chooses * servers that should be promoted or demoted. The preference ordering for * promotion is based on the failure domains and weights previously gathered, * and is defined in compareNodesForPromotion, below. * * The actual role changes are computed in a batch each time adjustment * occurs, and are stored in a queue. Individual "change records" are taken * off this queue and applied asynchronously. Since we only have a blocking * client implementation available, the exchange of requests and responses * that implements polling a single server happens on the libuv blocking * thread pool (see pollClusterWorkCb). We don't start a new round of * adjustment if a "tick" occurs while the queue of changes from the last * round is still nonempty. * * A handover is triggered when we call dqlite_node_handover on a node that's * the current cluster leader, or is a voter. Before shutting down for real, * the node in question tries to cause another node to become leader (using * raft_transfer), if applicable, and then promotes another node to voter * (if possible) before demoting itself.
This is intended to smooth over * availability problems that can result if a privileged node (leader or * non-leader voter) crashes out of the cluster unceremoniously. The handover * task also needs to poll the cluster to figure out which nodes are good * candidates for promotion to voter. * * Unresolved * ---------- * * - Should the failure-domains accounting for standbys use information about * voters' failure domains? Vice versa? * - Should we try multiple candidates when doing an adjustment, if the * preferred candidate can't be promoted? * - Should we retry when some step in the handover process fails? How, and * how many times? * - Should we have dedicated code somewhere to (possibly) promote newly- * joined nodes? go-dqlite does this, but I'm not convinced it's important, * or that it should run on the server if we do decide we want it. */ /* XXX */ #define NUM_TRACKED_DOMAINS 5 struct change_record { raft_id id; int role; /* dqlite role codes */ queue queue; }; struct counted_failure_domain { unsigned long long domain; int count; }; struct compare_data { unsigned n; struct counted_failure_domain domains[NUM_TRACKED_DOMAINS]; }; struct polling { void (*cb)(struct polling *); struct dqlite_node *node; struct all_node_info *cluster; unsigned *count; unsigned n_cluster; unsigned i; }; struct handover_voter_data { struct dqlite_node *node; dqlite_node_id target_id; char *leader_addr; dqlite_node_id leader_id; }; static int domainCount(uint64_t needle, const struct compare_data *data) { unsigned i; for (i = 0; i < data->n; i += 1) { if (data->domains[i].domain == needle) { return data->domains[i].count; } } return 0; } static void addDomain(uint64_t domain, struct compare_data *data) { unsigned i; for (i = 0; i < data->n; i += 1) { if (data->domains[i].domain == domain) { data->domains[i].count += 1; return; } } if (i < NUM_TRACKED_DOMAINS) { data->domains[i].domain = domain; data->domains[i].count = 1; data->n += 1; } } static void removeDomain(uint64_t domain, struct compare_data *data) { unsigned i; for (i = 0; i < data->n; i += 1) { if (data->domains[i].domain == domain) { if (data->domains[i].count > 0) { data->domains[i].count -= 1; } return; } } } static int compareNodesForPromotion(const void *l, const void *r, void *p) { struct compare_data *data = p; const struct all_node_info *left = l; const struct all_node_info *right = r; int result; /* Nodes whose failure domains appear fewer times are preferred. */ result = domainCount(left->failure_domain, data) - domainCount(right->failure_domain, data); if (result != 0) { return result; } /* Nodes with lower weights are preferred. */ result = (int)(left->weight - right->weight); if (result != 0) { return result; } /* We prefer to promote a standby rather than a spare. If * left->role > right->role, then right is more "senior" than left, * so we want right to come first, so return 1.*/ return (left->role > right->role) - (left->role < right->role); } static int compareNodesForDemotion(const void *l, const void *r, void *p) { /* XXX */ return -compareNodesForPromotion(l, r, p); } static void changeCb(struct raft_change *change, int status); /* Take one role change record off the queue and apply it. 
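 *
 * To illustrate (a sketch with made-up IDs): if adjustClusterCb queued the
 * records [(id=4, voter), (id=2, spare)], the processing chain is
 *
 *   startChange(4 -> voter) -> raft_assign -> changeCb
 *     -> startChange(2 -> spare) -> raft_assign -> changeCb
 *       -> startChange: queue empty, done.
 *
 * Records are applied strictly one at a time; each subsequent change is
 * kicked off from the previous change's completion callback.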
*/ static void startChange(struct dqlite_node *d) { queue *head; struct change_record *rec; struct raft_change *change; uint64_t id; int role; int rv; if (queue_empty(&d->roles_changes)) { return; } head = queue_head(&d->roles_changes); queue_remove(head); rec = QUEUE_DATA(head, struct change_record, queue); id = rec->id; role = rec->role; raft_free(rec); change = raft_malloc(sizeof *change); if (change == NULL) { return; } change->data = d; /* TODO request ID */ rv = raft_assign(&d->raft, change, id, translateDqliteRole(role), changeCb); if (rv != 0) { /* TODO */ raft_free(change); } } /* When a role change has completed, start the next one. */ static void changeCb(struct raft_change *change, int status) { struct dqlite_node *d = change->data; raft_free(change); if (status != 0) { /* TODO */ } startChange(d); } static void queueChange(uint64_t id, int role, void *arg) { struct dqlite_node *d = arg; queue *head; struct change_record *rec; /* If we already queued a role change for this node, just update * that record instead of queueing a new one. */ QUEUE_FOREACH(head, &d->roles_changes) { rec = QUEUE_DATA(head, struct change_record, queue); if (rec->id == id) { rec->role = role; return; } } rec = raft_malloc(sizeof *rec); if (rec == NULL) { return; } rec->id = id; rec->role = role; queue_insert_tail(&d->roles_changes, &rec->queue); } void RolesComputeChanges(int voters, int standbys, struct all_node_info *cluster, unsigned n_cluster, dqlite_node_id my_id, void (*cb)(uint64_t, int, void *), void *arg) { int voter_count = 0; int standby_count = 0; struct compare_data voter_compare = {0}; struct compare_data standby_compare = {0}; unsigned i; /* Count (online) voters and standbys in the cluster, and demote any * offline nodes to spare. */ for (i = 0; i < n_cluster; i += 1) { if (!cluster[i].online && cluster[i].role != DQLITE_SPARE) { cb(cluster[i].id, DQLITE_SPARE, arg); cluster[i].role = DQLITE_SPARE; } else if (cluster[i].online && cluster[i].role == DQLITE_VOTER) { voter_count += 1; addDomain(cluster[i].failure_domain, &voter_compare); } else if (cluster[i].online && cluster[i].role == DQLITE_STANDBY) { standby_count += 1; addDomain(cluster[i].failure_domain, &standby_compare); } } /* If we don't have enough voters, promote some standbys and spares. */ if (voter_count < voters) { qsort_r(cluster, n_cluster, sizeof *cluster, compareNodesForPromotion, &voter_compare); } for (i = 0; i < n_cluster && voter_count < voters; i += 1) { if (!cluster[i].online || cluster[i].role == DQLITE_VOTER) { continue; } cb(cluster[i].id, DQLITE_VOTER, arg); if (cluster[i].role == DQLITE_STANDBY) { standby_count -= 1; removeDomain(cluster[i].failure_domain, &standby_compare); } cluster[i].role = DQLITE_VOTER; voter_count += 1; addDomain(cluster[i].failure_domain, &voter_compare); } /* If we have too many voters, demote some of them. We always demote * to spare in this step -- if it turns out that it would be better * for some of these nodes to end up as standbys, that change will * be picked up in the next step, and the two role changes will be * consolidated by queueChange.
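 *
 * A worked example of the whole computation (hypothetical numbers):
 * the targets are 3 voters / 1 standby, and the cluster holds
 *
 *   id 1 voter   online       id 4 standby online
 *   id 2 voter   online       id 5 spare   online
 *   id 3 voter   offline
 *
 * The offline node 3 is demoted to spare first; the voter count (2) is
 * then below target, so the best candidate by compareNodesForPromotion
 * becomes a voter -- node 4, since a standby is preferred over a spare
 * when failure domains and weights tie; finally node 5 is promoted to
 * standby to backfill the standby slot. The emitted changes are
 * (3, spare), (4, voter), (5, standby).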
*/ if (voter_count > voters) { qsort_r(cluster, n_cluster, sizeof *cluster, compareNodesForDemotion, &voter_compare); } for (i = 0; i < n_cluster && voter_count > voters; i += 1) { if (cluster[i].role != DQLITE_VOTER || cluster[i].id == my_id) { continue; } cb(cluster[i].id, DQLITE_SPARE, arg); cluster[i].role = DQLITE_SPARE; voter_count -= 1; removeDomain(cluster[i].failure_domain, &voter_compare); } /* If we don't have enough standbys, promote some spares. */ if (standby_count < standbys) { qsort_r(cluster, n_cluster, sizeof *cluster, compareNodesForPromotion, &standby_compare); } for (i = 0; i < n_cluster && standby_count < standbys; i += 1) { if (!cluster[i].online || cluster[i].role != DQLITE_SPARE) { continue; } cb(cluster[i].id, DQLITE_STANDBY, arg); cluster[i].role = DQLITE_STANDBY; standby_count += 1; addDomain(cluster[i].failure_domain, &standby_compare); } /* If we have too many standbys, demote some of them. */ if (standby_count > standbys) { qsort_r(cluster, n_cluster, sizeof *cluster, compareNodesForDemotion, &standby_compare); } for (i = 0; i < n_cluster && standby_count > standbys; i += 1) { if (cluster[i].role != DQLITE_STANDBY) { continue; } cb(cluster[i].id, DQLITE_SPARE, arg); cluster[i].role = DQLITE_SPARE; standby_count -= 1; removeDomain(cluster[i].failure_domain, &standby_compare); } } /* Process information about the state of the cluster and queue up any * necessary role adjustments. This runs on the main thread. */ static void adjustClusterCb(struct polling *polling) { struct dqlite_node *d; if (polling == NULL) { return; } d = polling->node; RolesComputeChanges(d->config.voters, d->config.standbys, polling->cluster, polling->n_cluster, d->config.id, queueChange, d); /* Start pulling role changes off the queue. */ startChange(d); } /* Runs on the blocking thread pool to retrieve information about a single * server for use in roles adjustment. */ static void pollClusterWorkCb(uv_work_t *work) { struct polling *polling = work->data; struct dqlite_node *d = polling->node; struct client_proto proto = {0}; struct client_context context; int rv; proto.connect = d->connect_func; proto.connect_arg = d->connect_func_arg; rv = clientOpen(&proto, polling->cluster[polling->i].address, polling->cluster[polling->i].id); if (rv != 0) { return; } clientContextMillis(&context, 5000); rv = clientSendHandshake(&proto, &context); if (rv != 0) { goto close; } rv = clientSendDescribe(&proto, &context); if (rv != 0) { goto close; } rv = clientRecvMetadata(&proto, &polling->cluster[polling->i].failure_domain, &polling->cluster[polling->i].weight, &context); if (rv != 0) { goto close; } polling->cluster[polling->i].online = true; close: clientClose(&proto); } /* Runs on the main thread after polling each server for roles adjustment. */ static void pollClusterAfterWorkCb(uv_work_t *work, int status) { struct polling *polling = work->data; uv_work_t *work_objs; struct polling *polling_objs; unsigned i; /* The only path to status != 0 involves calling uv_cancel on this task, * which we don't do. */ assert(status == 0); *polling->count += 1; /* If all nodes have been polled, invoke the callback. */ if (*polling->count == polling->n_cluster) { polling->cb(polling); /* Free the shared data, now that all tasks have finished.
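 *
 * Note how the array bases are recovered below: every task carries the
 * index i of its own element, and the uv_work_t and struct polling
 * objects live in two parallel arrays allocated in pollCluster(), so
 *
 *   work_objs = work - polling->i;
 *
 * steps back from the i-th element to element 0, which is the pointer
 * originally returned by raft_calloc() and therefore the one to free.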
*/ raft_free(polling->count); for (i = 0; i < polling->n_cluster; i += 1) { raft_free(polling->cluster[i].address); } raft_free(polling->cluster); work_objs = work - polling->i; raft_free(work_objs); polling_objs = polling - polling->i; raft_free(polling_objs); } } /* Poll every node in the cluster to learn whether it's online, and if so, its * weight and failure domain. */ static void pollCluster(struct dqlite_node *d, void (*cb)(struct polling *)) { struct all_node_info *cluster; const struct raft_server *server; struct polling *polling_objs; struct polling *polling; struct uv_work_s *work_objs; struct uv_work_s *work; unsigned *count; unsigned n; unsigned i; unsigned j; unsigned ii; int rv; n = d->raft.configuration.n; cluster = raft_calloc(n, sizeof *cluster); if (cluster == NULL) { goto err; } count = raft_malloc(sizeof *count); if (count == NULL) { goto err_after_alloc_cluster; } *count = 0; for (i = 0; i < n; i += 1) { server = &d->raft.configuration.servers[i]; cluster[i].id = server->id; cluster[i].address = raft_malloc(strlen(server->address) + 1); if (cluster[i].address == NULL) { goto err_after_alloc_addrs; } memcpy(cluster[i].address, server->address, strlen(server->address) + 1); cluster[i].role = translateRaftRole(server->role); } polling_objs = raft_calloc(n, sizeof *polling_objs); if (polling_objs == NULL) { goto err_after_alloc_addrs; } work_objs = raft_calloc(n, sizeof *work_objs); if (work_objs == NULL) { goto err_after_alloc_polling; } for (j = 0; j < n; j += 1) { polling = &polling_objs[j]; polling->cb = cb; polling->node = d; polling->cluster = cluster; polling->n_cluster = n; polling->count = count; polling->i = j; work = &work_objs[j]; work->data = polling; rv = uv_queue_work(&d->loop, work, pollClusterWorkCb, pollClusterAfterWorkCb); /* uv_queue_work can't fail unless a NULL callback is passed. */ assert(rv == 0); } return; err_after_alloc_polling: raft_free(polling_objs); err_after_alloc_addrs: for (ii = 0; ii < i; ii += 1) { raft_free(cluster[ii].address); } raft_free(count); err_after_alloc_cluster: raft_free(cluster); err: cb(NULL); } /* Runs on the thread pool to open a connection to the leader, promote another * node to voter, and demote the calling node to spare. 
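 * The promotion is sent before the demotion on purpose: doing it in the
 * other order would transiently leave the cluster one voter short of the
 * configured target.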
*/ static void handoverVoterWorkCb(uv_work_t *work) { struct handover_voter_data *data = work->data; struct client_proto proto = {0}; struct client_context context; int rv; proto.connect = data->node->connect_func; proto.connect_arg = data->node->connect_func_arg; rv = clientOpen(&proto, data->leader_addr, data->leader_id); if (rv != 0) { return; } clientContextMillis(&context, 5000); rv = clientSendHandshake(&proto, &context); if (rv != 0) { goto close; } rv = clientSendAssign(&proto, data->target_id, DQLITE_VOTER, &context); if (rv != 0) { goto close; } rv = clientRecvEmpty(&proto, &context); if (rv != 0) { goto close; } rv = clientSendAssign(&proto, data->node->config.id, DQLITE_SPARE, &context); if (rv != 0) { goto close; } rv = clientRecvEmpty(&proto, &context); close: clientClose(&proto); } static void handoverVoterAfterWorkCb(uv_work_t *work, int status) { struct handover_voter_data *data = work->data; struct dqlite_node *node = data->node; int handover_status = 0; void (*cb)(struct dqlite_node *, int); if (status != 0) { handover_status = DQLITE_ERROR; } raft_free(data->leader_addr); raft_free(data); raft_free(work); cb = node->handover_done_cb; cb(node, handover_status); node->handover_done_cb = NULL; } /* Having gathered information about the cluster, pick a non-voter node * to promote in our place. */ static void handoverVoterCb(struct polling *polling) { struct dqlite_node *node; raft_id leader_id; const char *borrowed_addr; char *leader_addr; struct compare_data voter_compare = {0}; unsigned i; struct all_node_info *cluster; unsigned n_cluster; dqlite_node_id target_id; struct handover_voter_data *data; uv_work_t *work; void (*cb)(struct dqlite_node *, int); int rv; if (polling == NULL) { return; } node = polling->node; cluster = polling->cluster; n_cluster = polling->n_cluster; cb = node->handover_done_cb; raft_leader(&node->raft, &leader_id, &borrowed_addr); if (leader_id == node->raft.id || leader_id == 0) { goto finish; } leader_addr = raft_malloc(strlen(borrowed_addr) + 1); if (leader_addr == NULL) { goto finish; } memcpy(leader_addr, borrowed_addr, strlen(borrowed_addr) + 1); /* Select a non-voter to transfer to -- the logic is similar to * adjustClusterCb. */ for (i = 0; i < n_cluster; i += 1) { if (cluster[i].online && cluster[i].role == DQLITE_VOTER && cluster[i].id != node->raft.id) { addDomain(cluster[i].failure_domain, &voter_compare); } } qsort_r(cluster, n_cluster, sizeof *cluster, compareNodesForPromotion, &voter_compare); target_id = 0; for (i = 0; i < n_cluster; i += 1) { if (cluster[i].online && cluster[i].role != DQLITE_VOTER && cluster[i].id != node->raft.id) { target_id = cluster[i].id; break; } } /* If no transfer candidates found, give up. */ if (target_id == 0) { goto err_after_alloc_leader_addr; } /* Submit the handover work. 
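 * Ownership of leader_addr passes to the work item here; on success it is
 * released by handoverVoterAfterWorkCb, on failure by the error path below.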
*/ data = raft_malloc(sizeof *data); if (data == NULL) { goto err_after_alloc_leader_addr; } data->node = node; data->target_id = target_id; data->leader_addr = leader_addr; data->leader_id = leader_id; work = raft_malloc(sizeof *work); if (work == NULL) { goto err_after_alloc_data; } work->data = data; rv = uv_queue_work(&node->loop, work, handoverVoterWorkCb, handoverVoterAfterWorkCb); if (rv != 0) { goto err_after_alloc_work; } return; err_after_alloc_work: raft_free(work); err_after_alloc_data: raft_free(data); err_after_alloc_leader_addr: raft_free(leader_addr); finish: node->handover_done_cb = NULL; cb(node, DQLITE_ERROR); } static void handoverTransferCb(struct raft_transfer *req) { struct dqlite_node *d = req->data; raft_free(req); pollCluster(d, handoverVoterCb); } void RolesAdjust(struct dqlite_node *d) { /* Only the leader can assign roles. */ if (raft_state(&d->raft) != RAFT_LEADER) { return; } /* If a series of role adjustments is already in progress, don't kick * off another one. */ if (!queue_empty(&d->roles_changes)) { return; } assert(d->running); pollCluster(d, adjustClusterCb); } void RolesHandover(struct dqlite_node *d, void (*cb)(struct dqlite_node *, int)) { struct raft_transfer *req; int rv; req = raft_malloc(sizeof *req); if (req == NULL) { goto err; } d->handover_done_cb = cb; req->data = d; /* We try the leadership transfer unconditionally -- Raft will tell us * if we're not the leader. */ rv = raft_transfer(&d->raft, req, 0, handoverTransferCb); if (rv == RAFT_NOTLEADER) { raft_free(req); pollCluster(d, handoverVoterCb); return; } else if (rv != 0) { raft_free(req); goto err; } return; err: d->handover_done_cb = NULL; cb(d, DQLITE_ERROR); } void RolesCancelPendingChanges(struct dqlite_node *d) { queue *head; struct change_record *rec; while (!queue_empty(&d->roles_changes)) { head = queue_head(&d->roles_changes); rec = QUEUE_DATA(head, struct change_record, queue); queue_remove(head); raft_free(rec); } } dqlite-1.16.7/src/roles.h000066400000000000000000000034211465252713400151650ustar00rootroot00000000000000#ifndef DQLITE_ROLE_MANAGEMENT_H #define DQLITE_ROLE_MANAGEMENT_H #include "server.h" struct all_node_info { uint64_t id; char *address; int role; bool online; uint64_t failure_domain; uint64_t weight; }; /* Determine what roles changes should be made to the cluster, without * side-effects. The given callback will be invoked for each computed change, * with first argument the node whose role should be adjusted, second argument * the node's new role, and third argument taken from the last argument of this * function. * * The memory pointed to by @cluster is "borrowed" and not freed by this * function, but it may be modified as part of this function's bookkeeping. */ void RolesComputeChanges(int voters, int standbys, struct all_node_info *cluster, unsigned n_cluster, dqlite_node_id my_id, void (*cb)(uint64_t, int, void *), void *arg); /* If necessary, try to assign new roles to nodes in the cluster to achieve * the configured number of voters and standbys. Polling the cluster and * assigning roles happens asynchronously. This can safely be called on any * server, but does nothing if called on a server that is not the leader. */ void RolesAdjust(struct dqlite_node *d); /* Begin a graceful shutdown of this node. Leadership and the voter role will * be transferred to other nodes if necessary, and then the callback will be * invoked on the loop thread. The callback's second argument will be 0 if the * handover succeeded and nonzero otherwise. 
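 *
 * A minimal caller sketch (on_handover_done is a hypothetical user-supplied
 * callback, not part of this API; error handling elided):
 *
 *     static void on_handover_done(struct dqlite_node *d, int status)
 *     {
 *             (void)d;
 *             assert(status == 0); // or fall back to a hard shutdown
 *     }
 *
 *     RolesHandover(node, on_handover_done);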
*/ void RolesHandover(struct dqlite_node *d, void (*cb)(struct dqlite_node *, int)); /* Drain the queue of changes computed by RoleManagementAdjust. This should be * done when the node is shutting down, to avoid a memory leak. */ void RolesCancelPendingChanges(struct dqlite_node *d); #endif dqlite-1.16.7/src/server.c000066400000000000000000001212201465252713400153400ustar00rootroot00000000000000#include "server.h" #include #include #include #include #include #include #include #include "../include/dqlite.h" #include "client/protocol.h" #include "conn.h" #include "fsm.h" #include "id.h" #include "lib/addr.h" #include "lib/assert.h" #include "lib/fs.h" #include "lib/threadpool.h" #include "logger.h" #include "protocol.h" #include "roles.h" #include "tracing.h" #include "translate.h" #include "transport.h" #include "utils.h" #include "vfs.h" /* Special ID for the bootstrap node. Equals to raft_digest("1", 0). */ #define BOOTSTRAP_ID 0x2dc171858c3155be #define DATABASE_DIR_FMT "%s/database" #define NODE_STORE_INFO_FORMAT_V1 "v1" /* Called by raft every time the raft state changes. */ static void state_cb(struct raft *r, unsigned short old_state, unsigned short new_state) { struct dqlite_node *d = r->data; queue *head; struct conn *conn; if (old_state == RAFT_LEADER && new_state != RAFT_LEADER) { tracef("node %llu@%s: leadership lost", r->id, r->address); QUEUE_FOREACH(head, &d->conns) { conn = QUEUE_DATA(head, struct conn, queue); gateway__leader_close(&conn->gateway, RAFT_LEADERSHIPLOST); } } } int dqlite__init(struct dqlite_node *d, dqlite_node_id id, const char *address, const char *dir) { int rv; char db_dir_path[1024]; int urandom; ssize_t count; d->initialized = false; d->lock_fd = -1; memset(d->errmsg, 0, sizeof d->errmsg); rv = snprintf(db_dir_path, sizeof db_dir_path, DATABASE_DIR_FMT, dir); if (rv == -1 || rv >= (int)(sizeof db_dir_path)) { snprintf(d->errmsg, DQLITE_ERRMSG_BUF_SIZE, "failed to init: snprintf(rv:%d)", rv); goto err; } rv = config__init(&d->config, id, address, dir, db_dir_path); if (rv != 0) { snprintf(d->errmsg, DQLITE_ERRMSG_BUF_SIZE, "config__init(rv:%d)", rv); goto err; } rv = VfsInit(&d->vfs, d->config.name); sqlite3_vfs_register(&d->vfs, 0); if (rv != 0) { goto err_after_config_init; } registry__init(&d->registry, &d->config); rv = uv_loop_init(&d->loop); if (rv != 0) { snprintf(d->errmsg, DQLITE_ERRMSG_BUF_SIZE, "uv_loop_init(): %s", uv_strerror(rv)); rv = DQLITE_ERROR; goto err_after_vfs_init; } #ifdef DQLITE_NEXT rv = pool_init(&d->pool, &d->loop, d->config.pool_thread_count, POOL_QOS_PRIO_FAIR); if (rv != 0) { snprintf(d->errmsg, DQLITE_ERRMSG_BUF_SIZE, "pool_init(): %s", uv_strerror(rv)); rv = DQLITE_ERROR; goto err_after_loop_init; } #endif rv = raftProxyInit(&d->raft_transport, &d->loop); if (rv != 0) { goto err_after_pool_init; } rv = raft_uv_init(&d->raft_io, &d->loop, dir, &d->raft_transport); if (rv != 0) { snprintf(d->errmsg, DQLITE_ERRMSG_BUF_SIZE, "raft_uv_init(): %s", d->raft_io.errmsg); rv = DQLITE_ERROR; goto err_after_raft_transport_init; } rv = fsm__init(&d->raft_fsm, &d->config, &d->registry); if (rv != 0) { goto err_after_raft_io_init; } /* TODO: properly handle closing the dqlite server without running it */ rv = raft_init(&d->raft, &d->raft_io, &d->raft_fsm, d->config.id, d->config.address); if (rv != 0) { snprintf(d->errmsg, DQLITE_ERRMSG_BUF_SIZE, "raft_init(): %s", raft_errmsg(&d->raft)); rv = DQLITE_ERROR; goto err; } /* TODO: expose these values through some API */ raft_set_election_timeout(&d->raft, 3000); 
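	/* The 3000 ms default above and the 500 ms heartbeat below give a 6:1
	 * ratio; dqlite_node_set_network_latency_ms later recomputes both from
	 * a single latency figure (15x and 1.5x the latency, respectively). */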
raft_set_heartbeat_timeout(&d->raft, 500); raft_set_snapshot_threshold(&d->raft, 1024); raft_set_snapshot_trailing(&d->raft, 8192); raft_set_pre_vote(&d->raft, true); raft_set_max_catch_up_rounds(&d->raft, 100); raft_set_max_catch_up_round_duration(&d->raft, 50 * 1000); /* 50 secs */ raft_register_state_cb(&d->raft, state_cb); rv = sem_init(&d->ready, 0, 0); if (rv != 0) { snprintf(d->errmsg, DQLITE_ERRMSG_BUF_SIZE, "sem_init(): %s", strerror(errno)); rv = DQLITE_ERROR; goto err_after_raft_fsm_init; } rv = sem_init(&d->stopped, 0, 0); if (rv != 0) { snprintf(d->errmsg, DQLITE_ERRMSG_BUF_SIZE, "sem_init(): %s", strerror(errno)); rv = DQLITE_ERROR; goto err_after_ready_init; } rv = sem_init(&d->handover_done, 0, 0); if (rv != 0) { snprintf(d->errmsg, DQLITE_ERRMSG_BUF_SIZE, "sem_init(): %s", strerror(errno)); rv = DQLITE_ERROR; goto err_after_stopped_init; } queue_init(&d->queue); queue_init(&d->conns); queue_init(&d->roles_changes); d->raft_state = RAFT_UNAVAILABLE; d->running = false; d->listener = NULL; d->bind_address = NULL; d->role_management = false; d->connect_func = transportDefaultConnect; d->connect_func_arg = NULL; urandom = open("/dev/urandom", O_RDONLY); assert(urandom != -1); count = read(urandom, d->random_state.data, sizeof(uint64_t[4])); (void)count; close(urandom); d->initialized = true; return 0; err_after_stopped_init: sem_destroy(&d->stopped); err_after_ready_init: sem_destroy(&d->ready); err_after_raft_fsm_init: fsm__close(&d->raft_fsm); err_after_raft_io_init: raft_uv_close(&d->raft_io); err_after_raft_transport_init: raftProxyClose(&d->raft_transport); err_after_pool_init: #ifdef DQLITE_NEXT pool_close(&d->pool); pool_fini(&d->pool); err_after_loop_init: #endif uv_loop_close(&d->loop); err_after_vfs_init: VfsClose(&d->vfs); err_after_config_init: config__close(&d->config); err: return rv; } void dqlite__close(struct dqlite_node *d) { int rv; if (!d->initialized) { return; } raft_free(d->listener); rv = sem_destroy(&d->stopped); assert(rv == 0); /* Fails only if sem object is not valid */ rv = sem_destroy(&d->ready); assert(rv == 0); /* Fails only if sem object is not valid */ rv = sem_destroy(&d->handover_done); assert(rv == 0); fsm__close(&d->raft_fsm); // TODO assert rv of uv_loop_close after fixing cleanup logic related to // the TODO above referencing the cleanup logic without running the // node. See https://github.com/canonical/dqlite/issues/504. 
#ifdef DQLITE_NEXT pool_fini(&d->pool); #endif uv_loop_close(&d->loop); raftProxyClose(&d->raft_transport); registry__close(&d->registry); sqlite3_vfs_unregister(&d->vfs); VfsClose(&d->vfs); config__close(&d->config); if (d->bind_address != NULL) { sqlite3_free(d->bind_address); } } int dqlite_node_create(dqlite_node_id id, const char *address, const char *data_dir, dqlite_node **t) { *t = sqlite3_malloc(sizeof **t); if (*t == NULL) { return DQLITE_NOMEM; } return dqlite__init(*t, id, address, data_dir); } int dqlite_node_set_bind_address(dqlite_node *t, const char *address) { /* sockaddr_un is large enough for our purposes */ struct sockaddr_un addr_un; struct sockaddr *addr = (struct sockaddr *)&addr_un; socklen_t addr_len = sizeof(addr_un); sa_family_t domain; size_t path_len; int fd; int rv; if (t->running) { return DQLITE_MISUSE; } rv = AddrParse(address, addr, &addr_len, "8080", DQLITE_ADDR_PARSE_UNIX); if (rv != 0) { return rv; } domain = addr->sa_family; fd = socket(domain, SOCK_STREAM, 0); if (fd == -1) { return DQLITE_ERROR; } /* F_SETFD is required here: passing FD_CLOEXEC alone as the command would * only read the flags, without actually setting close-on-exec. */ rv = fcntl(fd, F_SETFD, FD_CLOEXEC); if (rv != 0) { close(fd); return DQLITE_ERROR; } if (domain == AF_INET || domain == AF_INET6) { int reuse = 1; rv = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (const char *)&reuse, sizeof(reuse)); if (rv != 0) { close(fd); return DQLITE_ERROR; } } rv = bind(fd, addr, addr_len); if (rv != 0) { close(fd); return DQLITE_ERROR; } rv = transport__stream(&t->loop, fd, &t->listener); if (rv != 0) { close(fd); return DQLITE_ERROR; } if (domain == AF_INET || domain == AF_INET6) { int sz = ((int)strlen(address)) + 1; /* Room for '\0' */ t->bind_address = sqlite3_malloc(sz); if (t->bind_address == NULL) { close(fd); return DQLITE_NOMEM; } strcpy(t->bind_address, address); } else { path_len = sizeof addr_un.sun_path; t->bind_address = sqlite3_malloc((int)path_len); if (t->bind_address == NULL) { close(fd); return DQLITE_NOMEM; } memset(t->bind_address, 0, path_len); rv = uv_pipe_getsockname((struct uv_pipe_s *)t->listener, t->bind_address, &path_len); if (rv != 0) { close(fd); sqlite3_free(t->bind_address); t->bind_address = NULL; return DQLITE_ERROR; } t->bind_address[0] = '@'; } return 0; } const char *dqlite_node_get_bind_address(dqlite_node *t) { return t->bind_address; } int dqlite_node_set_connect_func(dqlite_node *t, int (*f)(void *arg, const char *address, int *fd), void *arg) { if (t->running) { return DQLITE_MISUSE; } raftProxySetConnectFunc(&t->raft_transport, f, arg); /* Also save this info for use in automatic role management.
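 * (RolesAdjust and RolesHandover open their own client connections from the
 * thread pool, so they need the same connect function the raft transport
 * uses.)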
*/ t->connect_func = f; t->connect_func_arg = arg; return 0; } int dqlite_node_set_network_latency(dqlite_node *t, unsigned long long nanoseconds) { unsigned milliseconds; if (t->running) { return DQLITE_MISUSE; } /* 1 hour latency should be more than sufficient, also avoids overflow * issues when converting to unsigned milliseconds later on */ if (nanoseconds > 3600000000000ULL) { return DQLITE_MISUSE; } milliseconds = (unsigned)(nanoseconds / (1000000ULL)); return dqlite_node_set_network_latency_ms(t, milliseconds); } int dqlite_node_set_network_latency_ms(dqlite_node *t, unsigned milliseconds) { if (t->running) { return DQLITE_MISUSE; } /* Currently we accept at least 1 millisecond latency and maximum 3600 s * of latency */ if (milliseconds == 0 || milliseconds > 3600U * 1000U) { return DQLITE_MISUSE; } raft_set_heartbeat_timeout(&t->raft, (milliseconds * 15) / 10); raft_set_election_timeout(&t->raft, milliseconds * 15); return 0; } int dqlite_node_set_failure_domain(dqlite_node *n, unsigned long long code) { n->config.failure_domain = code; return 0; } int dqlite_node_set_snapshot_params(dqlite_node *n, unsigned snapshot_threshold, unsigned snapshot_trailing) { if (n->running) { return DQLITE_MISUSE; } if (snapshot_trailing < 4) { return DQLITE_MISUSE; } /* This is a safety precaution and allows to recover data from the * second last raft snapshot and segment files in case the last raft * snapshot is unusable. */ if (snapshot_trailing < snapshot_threshold) { return DQLITE_MISUSE; } raft_set_snapshot_threshold(&n->raft, snapshot_threshold); raft_set_snapshot_trailing(&n->raft, snapshot_trailing); return 0; } #define KB(N) (1024 * N) int dqlite_node_set_block_size(dqlite_node *n, size_t size) { if (n->running) { return DQLITE_MISUSE; } switch (size) { case 512: // fallthrough case KB(1): // fallthrough case KB(2): // fallthrough case KB(4): // fallthrough case KB(8): // fallthrough case KB(16): // fallthrough case KB(32): // fallthrough case KB(64): // fallthrough case KB(128): // fallthrough case KB(256): break; default: return DQLITE_ERROR; } raft_uv_set_block_size(&n->raft_io, size); return 0; } int dqlite_node_enable_disk_mode(dqlite_node *n) { int rv; if (n->running) { return DQLITE_MISUSE; } rv = dqlite_vfs_enable_disk(&n->vfs); if (rv != 0) { return rv; } n->registry.config->disk = true; /* Close the default fsm and initialize the disk one. */ fsm__close(&n->raft_fsm); rv = fsm__init_disk(&n->raft_fsm, &n->config, &n->registry); if (rv != 0) { return rv; } return 0; } static int maybeBootstrap(dqlite_node *d, dqlite_node_id id, const char *address) { struct raft_configuration configuration; int rv; if (id != 1 && id != BOOTSTRAP_ID) { return 0; } raft_configuration_init(&configuration); rv = raft_configuration_add(&configuration, id, address, RAFT_VOTER); if (rv != 0) { assert(rv == RAFT_NOMEM); rv = DQLITE_NOMEM; goto out; }; rv = raft_bootstrap(&d->raft, &configuration); if (rv != 0) { if (rv == RAFT_CANTBOOTSTRAP) { rv = 0; } else { snprintf(d->errmsg, DQLITE_ERRMSG_BUF_SIZE, "raft_bootstrap(): %s", raft_errmsg(&d->raft)); rv = DQLITE_ERROR; } goto out; } out: raft_configuration_close(&configuration); return rv; } /* Callback invoked when the stop async handle gets fired. * * This callback will walk through all active handles and close them. After the * last handle is closed, the loop gets stopped. 
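 * raftCloseCb below is the tail end of that sequence: it runs once raft has
 * finished shutting down, and closes the remaining handles so that uv_run
 * can return in taskRun.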
*/ static void raftCloseCb(struct raft *raft) { struct dqlite_node *s = raft->data; raft_uv_close(&s->raft_io); uv_close((struct uv_handle_s *)&s->stop, NULL); uv_close((struct uv_handle_s *)&s->handover, NULL); uv_close((struct uv_handle_s *)&s->startup, NULL); uv_close((struct uv_handle_s *)s->listener, NULL); uv_close((struct uv_handle_s *)&s->timer, NULL); } static void destroy_conn(struct conn *conn) { queue_remove(&conn->queue); sqlite3_free(conn); } static void handoverDoneCb(struct dqlite_node *d, int status) { d->handover_status = status; sem_post(&d->handover_done); } static void handoverCb(uv_async_t *handover) { struct dqlite_node *d = handover->data; int rv; /* Nothing to do. */ if (!d->running) { return; } if (d->role_management) { rv = uv_timer_stop(&d->timer); assert(rv == 0); RolesCancelPendingChanges(d); } RolesHandover(d, handoverDoneCb); } static void stopCb(uv_async_t *stop) { struct dqlite_node *d = stop->data; queue *head; struct conn *conn; int rv; /* Nothing to do. */ if (!d->running) { tracef("not running or already stopped"); return; } #ifdef DQLITE_NEXT pool_close(&d->pool); #endif if (d->role_management) { rv = uv_timer_stop(&d->timer); assert(rv == 0); RolesCancelPendingChanges(d); } d->running = false; QUEUE_FOREACH(head, &d->conns) { conn = QUEUE_DATA(head, struct conn, queue); conn__stop(conn); } raft_close(&d->raft, raftCloseCb); } /* Callback invoked as soon as the loop as started. * * It unblocks the s->ready semaphore. */ static void startup_cb(uv_timer_t *startup) { struct dqlite_node *d = startup->data; int rv; d->running = true; rv = sem_post(&d->ready); assert(rv == 0); /* No reason for which posting should fail */ } static void listenCb(uv_stream_t *listener, int status) { struct dqlite_node *t = listener->data; struct uv_stream_s *stream; struct conn *conn; struct id_state seed; int rv; if (!t->running) { tracef("not running"); return; } if (status != 0) { /* TODO: log the error. */ return; } switch (listener->type) { case UV_TCP: stream = raft_malloc(sizeof(struct uv_tcp_s)); if (stream == NULL) { return; } rv = uv_tcp_init(&t->loop, (struct uv_tcp_s *)stream); assert(rv == 0); break; case UV_NAMED_PIPE: stream = raft_malloc(sizeof(struct uv_pipe_s)); if (stream == NULL) { return; } rv = uv_pipe_init(&t->loop, (struct uv_pipe_s *)stream, 0); assert(rv == 0); break; default: assert(0); } rv = uv_accept(listener, stream); if (rv != 0) { goto err; } /* We accept unix socket connections only from the same process. 
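 * The peer's pid is compared against getpid(), using SO_PEERCRED on Linux
 * or LOCAL_PEERPID on BSD; if neither mechanism is available the connection
 * is rejected outright, since it cannot be verified.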
*/ if (listener->type == UV_NAMED_PIPE) { int fd = stream->io_watcher.fd; #if defined(SO_PEERCRED) // Linux struct ucred cred; socklen_t len = sizeof(cred); rv = getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &cred, &len); if (rv != 0) { goto err; } if (cred.pid != getpid()) { goto err; } #elif defined(LOCAL_PEERPID) // BSD pid_t pid = -1; socklen_t len = sizeof(pid); rv = getsockopt(fd, SOL_LOCAL, LOCAL_PEERPID, &pid, &len); if (rv != 0) { goto err; } if (pid != getpid()) { goto err; } #else // The unix socket connection can't be verified and from // security perspective it's better to block it entirely goto err; #endif } seed = t->random_state; idJump(&t->random_state); conn = sqlite3_malloc(sizeof *conn); if (conn == NULL) { goto err; } rv = conn__start(conn, &t->config, &t->loop, &t->registry, &t->raft, stream, &t->raft_transport, seed, destroy_conn); if (rv != 0) { goto err_after_conn_alloc; } queue_insert_tail(&t->conns, &conn->queue); return; err_after_conn_alloc: sqlite3_free(conn); err: uv_close((struct uv_handle_s *)stream, (uv_close_cb)raft_free); } /* Runs every tick on the main thread to kick off roles adjustment. */ static void roleManagementTimerCb(uv_timer_t *handle) { struct dqlite_node *d = handle->data; RolesAdjust(d); } static int taskRun(struct dqlite_node *d) { int rv; /* TODO: implement proper cleanup upon error by spinning the loop a few * times. */ assert(d->listener != NULL); rv = uv_listen(d->listener, 128, listenCb); if (rv != 0) { return rv; } d->listener->data = d; d->handover.data = d; rv = uv_async_init(&d->loop, &d->handover, handoverCb); assert(rv == 0); /* Initialize notification handles. */ d->stop.data = d; rv = uv_async_init(&d->loop, &d->stop, stopCb); assert(rv == 0); /* Schedule startup_cb to be fired as soon as the loop starts. It will * unblock clients of taskReady. */ d->startup.data = d; rv = uv_timer_init(&d->loop, &d->startup); assert(rv == 0); rv = uv_timer_start(&d->startup, startup_cb, 0, 0); assert(rv == 0); /* Schedule the role management callback. 
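 * When role management is enabled, the timer fires once a second and each
 * tick calls RolesAdjust, which bails out immediately on non-leaders and
 * while a previous batch of role changes is still in flight.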
*/ d->timer.data = d; rv = uv_timer_init(&d->loop, &d->timer); assert(rv == 0); if (d->role_management) { /* TODO make the interval configurable */ rv = uv_timer_start(&d->timer, roleManagementTimerCb, 1000, 1000); assert(rv == 0); } d->raft.data = d; rv = raft_start(&d->raft); if (rv != 0) { snprintf(d->errmsg, DQLITE_ERRMSG_BUF_SIZE, "raft_start(): %s", raft_errmsg(&d->raft)); /* Unblock any client of taskReady */ sem_post(&d->ready); return rv; } rv = uv_run(&d->loop, UV_RUN_DEFAULT); assert(rv == 0); /* Unblock any client of taskReady */ rv = sem_post(&d->ready); assert(rv == 0); /* no reason for which posting should fail */ return 0; } int dqlite_node_set_target_voters(dqlite_node *n, int voters) { n->config.voters = voters; return 0; } int dqlite_node_set_target_standbys(dqlite_node *n, int standbys) { n->config.standbys = standbys; return 0; } int dqlite_node_enable_role_management(dqlite_node *n) { n->role_management = true; return 0; } int dqlite_node_set_snapshot_compression(dqlite_node *n, bool enabled) { return raft_uv_set_snapshot_compression(&n->raft_io, enabled); } int dqlite_node_set_auto_recovery(dqlite_node *n, bool enabled) { raft_uv_set_auto_recovery(&n->raft_io, enabled); return 0; } int dqlite_node_set_pool_thread_count(dqlite_node *n, unsigned thread_count) { n->config.pool_thread_count = thread_count; return 0; } const char *dqlite_node_errmsg(dqlite_node *n) { if (n != NULL) { return n->errmsg; } return "node is NULL"; } static void *taskStart(void *arg) { struct dqlite_node *t = arg; int rv; rv = taskRun(t); if (rv != 0) { uintptr_t result = (uintptr_t)rv; return (void *)result; } return NULL; } void dqlite_node_destroy(dqlite_node *d) { dqlite__close(d); sqlite3_free(d); } /* Wait until a dqlite server is ready and can handle connections. ** ** Returns true if the server has been successfully started, false otherwise. ** ** This is a thread-safe API, but must be invoked before any call to ** dqlite_stop or dqlite_handle. 
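**
** The wait is implemented with the d->ready semaphore: startup_cb posts it
** once the loop is running, and taskRun posts it on failure, so callers are
** never left blocked.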
*/ static bool taskReady(struct dqlite_node *d) { /* Wait for the ready semaphore */ sem_wait(&d->ready); return d->running; } #define LOCK_FILENAME "dqlite-lock" static int acquire_dir(const char *dir, int *fd_out) { char path[PATH_MAX]; int fd; int rv; snprintf(path, sizeof(path), "%s/%s", dir, LOCK_FILENAME); fd = open(path, O_RDWR|O_CREAT|O_CLOEXEC, S_IRUSR|S_IWUSR); if (fd < 0) { return DQLITE_ERROR; } rv = flock(fd, LOCK_EX|LOCK_NB); if (rv != 0) { /* Don't leak the lock file descriptor on failure. */ close(fd); return DQLITE_ERROR; } *fd_out = fd; return 0; } static void release_dir(int fd) { close(fd); } static int dqliteDatabaseDirSetup(dqlite_node *t) { int rv; if (!t->config.disk) { // nothing to do return 0; } rv = FsEnsureDir(t->config.database_dir); if (rv != 0) { snprintf(t->errmsg, DQLITE_ERRMSG_BUF_SIZE, "Error creating database dir: %d", rv); return rv; } rv = FsRemoveDirFiles(t->config.database_dir); if (rv != 0) { snprintf(t->errmsg, DQLITE_ERRMSG_BUF_SIZE, "Error removing files in database dir: %d", rv); return rv; } return rv; } int dqlite_node_start(dqlite_node *t) { int rv; tracef("dqlite node start"); dqliteTracingMaybeEnable(true); rv = dqliteDatabaseDirSetup(t); if (rv != 0) { tracef("database dir setup failed %s", t->errmsg); goto err; } int lock_fd; rv = acquire_dir(t->config.raft_dir, &lock_fd); if (rv != 0) { snprintf(t->errmsg, DQLITE_ERRMSG_BUF_SIZE, "couldn't lock the raft directory"); return rv; } t->lock_fd = lock_fd; rv = maybeBootstrap(t, t->config.id, t->config.address); if (rv != 0) { tracef("bootstrap failed %d", rv); goto err_after_acquire_dir; } rv = pthread_create(&t->thread, 0, &taskStart, t); if (rv != 0) { tracef("pthread create failed %d", rv); rv = DQLITE_ERROR; goto err_after_acquire_dir; } if (!taskReady(t)) { tracef("!taskReady"); rv = DQLITE_ERROR; goto err_after_acquire_dir; } return 0; err_after_acquire_dir: release_dir(t->lock_fd); err: return rv; } int dqlite_node_handover(dqlite_node *d) { int rv; rv = uv_async_send(&d->handover); assert(rv == 0); sem_wait(&d->handover_done); return d->handover_status; } int dqlite_node_stop(dqlite_node *d) { tracef("dqlite node stop"); void *result; int rv; rv = uv_async_send(&d->stop); assert(rv == 0); rv = pthread_join(d->thread, &result); assert(rv == 0); release_dir(d->lock_fd); return (int)((uintptr_t)result); } int dqlite_node_recover(dqlite_node *n, struct dqlite_node_info infos[], int n_info) { tracef("dqlite node recover"); int i; int ret; struct dqlite_node_info_ext *infos_ext = calloc((size_t)n_info, sizeof(*infos_ext)); if (infos_ext == NULL) { return DQLITE_NOMEM; } for (i = 0; i < n_info; i++) { infos_ext[i].size = sizeof(*infos_ext); infos_ext[i].id = infos[i].id; infos_ext[i].address = PTR_TO_UINT64(infos[i].address); infos_ext[i].dqlite_role = DQLITE_VOTER; } ret = dqlite_node_recover_ext(n, infos_ext, n_info); free(infos_ext); return ret; } static bool node_info_valid(struct dqlite_node_info_ext *info) { /* Reject any size smaller than the original definition of the * extensible struct. */ if (info->size < DQLITE_NODE_INFO_EXT_SZ_ORIG) { return false; } /* Require 8 byte alignment */ if (info->size % sizeof(uint64_t)) { return false; } /* If the user uses a newer and larger version of the struct, make sure * the unknown fields are zeroed out.
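 * For example, a client built against a future struct with two extra 64-bit
 * fields would pass size = sizeof(struct dqlite_node_info_ext) + 16, and
 * both trailing words must then be zero for the info to be accepted.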
*/ uint64_t known_size = sizeof(struct dqlite_node_info_ext); if (info->size > known_size) { const uint64_t num_known_fields = known_size / sizeof(uint64_t); const uint64_t num_extra_fields = (info->size - known_size) / sizeof(uint64_t); const uint64_t *extra_fields = ((const uint64_t *)info) + num_known_fields; for (uint64_t i = 0; i < num_extra_fields; i++) { if (extra_fields[i] != (uint64_t)0) { return false; } } } return true; } int dqlite_node_recover_ext(dqlite_node *n, struct dqlite_node_info_ext infos[], int n_info) { tracef("dqlite node recover ext"); struct raft_configuration configuration; int i; int rv; raft_configuration_init(&configuration); for (i = 0; i < n_info; i++) { struct dqlite_node_info_ext *info = &infos[i]; if (!node_info_valid(info)) { rv = DQLITE_MISUSE; goto out; } int raft_role = translateDqliteRole((int)info->dqlite_role); const char *address = UINT64_TO_PTR(info->address, const char *); rv = raft_configuration_add(&configuration, info->id, address, raft_role); if (rv != 0) { assert(rv == RAFT_NOMEM); rv = DQLITE_NOMEM; goto out; }; } int lock_fd; rv = acquire_dir(n->config.raft_dir, &lock_fd); if (rv != 0) { goto out; } rv = raft_recover(&n->raft, &configuration); if (rv != 0) { rv = DQLITE_ERROR; goto out; } release_dir(lock_fd); out: raft_configuration_close(&configuration); return rv; } dqlite_node_id dqlite_generate_node_id(const char *address) { tracef("generate node id"); struct timespec ts; int rv; unsigned long long n; rv = clock_gettime(CLOCK_REALTIME, &ts); assert(rv == 0); n = (unsigned long long)(ts.tv_sec * 1000 * 1000 * 1000 + ts.tv_nsec); return raft_digest(address, n); } static void pushNodeInfo(struct node_store_cache *cache, struct client_node_info info) { unsigned cap = cache->cap; struct client_node_info *new; if (cache->len == cap) { if (cap == 0) { cap = 5; } cap *= 2; new = callocChecked(cap, sizeof *new); memcpy(new, cache->nodes, cache->len * sizeof *new); free(cache->nodes); cache->nodes = new; cache->cap = cap; } cache->nodes[cache->len] = info; cache->len += 1; } static void emptyCache(struct node_store_cache *cache) { unsigned i; for (i = 0; i < cache->len; i += 1) { free(cache->nodes[i].addr); } free(cache->nodes); cache->nodes = NULL; cache->len = 0; cache->cap = 0; } static const struct client_node_info *findNodeInCache( const struct node_store_cache *cache, uint64_t id) { unsigned i; for (i = 0; i < cache->len; i += 1) { if (cache->nodes[i].id == id) { return &cache->nodes[i]; } } return NULL; } /* Called at startup to parse the node store read from disk into an in-memory * representation. */ static int parseNodeStore(char *buf, size_t len, struct node_store_cache *cache) { const char *p = buf; const char *end = buf + len; char *nl; const char *version_str; const char *addr; const char *id_str; const char *dig; unsigned long long id; const char *role_str; int role; struct client_node_info info; version_str = p; nl = memchr(p, '\n', (size_t)(end - version_str)); if (nl == NULL) { return 1; } *nl = '\0'; p = nl + 1; if (strcmp(version_str, NODE_STORE_INFO_FORMAT_V1) != 0) { return 1; } while (p != end) { addr = p; nl = memchr(p, '\n', (size_t)(end - addr)); if (nl == NULL) { return 1; } *nl = '\0'; p = nl + 1; id_str = p; nl = memchr(p, '\n', (size_t)(end - id_str)); if (nl == NULL) { return 1; } *nl = '\0'; p = nl + 1; /* Be stricter than strtoull: digits only! 
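 * (strtoull alone would also accept leading whitespace and a sign, neither
 * of which is valid in the node store file.)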
*/ for (dig = id_str; dig != nl; dig += 1) { if (*dig < '0' || *dig > '9') { return 1; } } errno = 0; id = strtoull(id_str, NULL, 10); if (errno != 0) { return 1; } role_str = p; nl = memchr(p, '\n', (size_t)(end - role_str)); if (nl == NULL) { return 1; } *nl = '\0'; p = nl + 1; if (strcmp(role_str, "spare") == 0) { role = DQLITE_SPARE; } else if (strcmp(role_str, "standby") == 0) { role = DQLITE_STANDBY; } else if (strcmp(role_str, "voter") == 0) { role = DQLITE_VOTER; } else { return 1; } info.addr = strdupChecked(addr); info.id = (uint64_t)id; info.role = role; pushNodeInfo(cache, info); } return 0; } /* Write the in-memory node store to disk. This discards errors, because: * * - we can't do much to handle any of these error cases * - we don't want to stop everything when this encounters an error, since the * persisted node store is an optimization, so it's not disastrous for it to * be missing or out of date * - there is already a "retry" mechanism in the form of the refreshTask thread, * which periodically tries to write the node store file */ static void writeNodeStore(struct dqlite_server *server) { int store_fd; FILE *f; unsigned i; ssize_t k; const char *role_name; int rv; store_fd = openat(server->dir_fd, "node-store-tmp", O_RDWR | O_CREAT | O_TRUNC, 0644); if (store_fd < 0) { return; } f = fdopen(store_fd, "w+"); if (f == NULL) { close(store_fd); return; } k = fprintf(f, "%s\n", NODE_STORE_INFO_FORMAT_V1); if (k < 0) { fclose(f); return; } for (i = 0; i < server->cache.len; i += 1) { role_name = (server->cache.nodes[i].role == DQLITE_SPARE) ? "spare" : ((server->cache.nodes[i].role == DQLITE_STANDBY) ? "standby" : "voter"); k = fprintf(f, "%s\n%" PRIu64 "\n%s\n", server->cache.nodes[i].addr, server->cache.nodes[i].id, role_name); if (k < 0) { fclose(f); return; } } fclose(f); rv = renameat(server->dir_fd, "node-store-tmp", server->dir_fd, "node-store"); (void)rv; } /* Called at startup to parse the node store read from disk into an in-memory * representation. */ static int parseLocalInfo(char *buf, size_t len, char **local_addr, uint64_t *local_id) { const char *p = buf; const char *end = buf + len; char *nl; const char *version_str; const char *addr; const char *id_str; const char *dig; unsigned long long id; version_str = p; nl = memchr(version_str, '\n', (size_t)(end - version_str)); if (nl == NULL) { return 1; } *nl = '\0'; p = nl + 1; if (strcmp(version_str, NODE_STORE_INFO_FORMAT_V1) != 0) { return 1; } addr = p; nl = memchr(addr, '\n', (size_t)(end - addr)); if (nl == NULL) { return 1; } *nl = '\0'; p = nl + 1; id_str = p; nl = memchr(id_str, '\n', (size_t)(end - id_str)); if (nl == NULL) { return 1; } *nl = '\0'; p = nl + 1; for (dig = id_str; dig != nl; dig += 1) { if (*dig < '0' || *dig > '9') { return 1; } } errno = 0; id = strtoull(id_str, NULL, 10); if (errno != 0) { return 1; } if (p != end) { return 1; } *local_addr = strdupChecked(addr); *local_id = (uint64_t)id; return 0; } /* Write the local node's info to disk. 
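 * The resulting file looks like this (address and id are illustrative):
 *
 *     v1
 *     10.0.0.1:9001
 *     3297041220608546238
 *
 * It is written to server-info-tmp first and then atomically renamed over
 * server-info, mirroring what writeNodeStore does for the node store.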
*/ static int writeLocalInfo(struct dqlite_server *server) { int info_fd; FILE *f; ssize_t k; int rv; info_fd = openat(server->dir_fd, "server-info-tmp", O_RDWR | O_CREAT | O_TRUNC, 0664); if (info_fd < 0) { return 1; } f = fdopen(info_fd, "w+"); if (f == NULL) { close(info_fd); return 1; } k = fprintf(f, "%s\n%s\n%" PRIu64 "\n", NODE_STORE_INFO_FORMAT_V1, server->local_addr, server->local_id); if (k < 0) { fclose(f); return 1; } rv = renameat(server->dir_fd, "server-info-tmp", server->dir_fd, "server-info"); if (rv != 0) { fclose(f); return 1; } fclose(f); return 0; } int dqlite_server_create(const char *path, dqlite_server **server) { int rv; *server = callocChecked(1, sizeof **server); rv = pthread_cond_init(&(*server)->cond, NULL); assert(rv == 0); rv = pthread_mutex_init(&(*server)->mutex, NULL); assert(rv == 0); (*server)->dir_path = strdupChecked(path); (*server)->connect = transportDefaultConnect; (*server)->proto.connect = transportDefaultConnect; (*server)->dir_fd = -1; (*server)->refresh_period = 30 * 1000; return 0; } int dqlite_server_set_address(dqlite_server *server, const char *address) { free(server->local_addr); server->local_addr = strdupChecked(address); return 0; } int dqlite_server_set_auto_bootstrap(dqlite_server *server, bool on) { server->bootstrap = on; return 0; } int dqlite_server_set_auto_join(dqlite_server *server, const char *const *addrs, unsigned n) { /* We don't know the ID or role of this server, so leave those fields * zeroed. In dqlite_server_start, we must take care not to use this * initial node store cache to do anything except find a server to * connect to. Once we've done that, we immediately fetch a fresh list * of cluster members that includes ID and role information, and clear * away the temporary node store cache. */ struct client_node_info info = { 0 }; unsigned i; for (i = 0; i < n; i += 1) { info.addr = strdupChecked(addrs[i]); pushNodeInfo(&server->cache, info); } return 0; } int dqlite_server_set_bind_address(dqlite_server *server, const char *addr) { free(server->bind_addr); server->bind_addr = strdupChecked(addr); return 0; } int dqlite_server_set_connect_func(dqlite_server *server, dqlite_connect_func f, void *arg) { server->connect = f; server->connect_arg = arg; server->proto.connect = f; server->proto.connect_arg = arg; return 0; } static int openAndHandshake(struct client_proto *proto, const char *addr, uint64_t id, struct client_context *context) { int rv; rv = clientOpen(proto, addr, id); if (rv != 0) { return 1; } rv = clientSendHandshake(proto, context); if (rv != 0) { clientClose(proto); return 1; } /* TODO client identification? */ return 0; } /* TODO prioritize voters > standbys > spares */ static int connectToSomeServer(struct dqlite_server *server, struct client_context *context) { unsigned i; int rv; for (i = 0; i < server->cache.len; i += 1) { rv = openAndHandshake(&server->proto, server->cache.nodes[i].addr, server->cache.nodes[i].id, context); if (rv == 0) { return 0; } } return 1; } /* Given an open connection, make an honest effort to reopen it as a connection * to the current cluster leader. This bails out rather than retrying on * client/server/network errors, leaving the retry policy up to the caller. On * failure (rv != 0) the given client object may be closed or not: the caller * must check this by comparing proto->fd to -1. 
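 * Three outcomes are possible: the connection already points at the leader
 * and is left untouched, the connection is re-opened against the reported
 * leader, or the attempt fails and the caller inspects proto->fd.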
*/ static int tryReconnectToLeader(struct client_proto *proto, struct client_context *context) { char *addr; uint64_t id; int rv; rv = clientSendLeader(proto, context); if (rv != 0) { clientClose(proto); return 1; } rv = clientRecvServer(proto, &id, &addr, context); if (rv == DQLITE_CLIENT_PROTO_RECEIVED_FAILURE) { return 1; } else if (rv != 0) { clientClose(proto); return 1; } if (id == 0) { free(addr); return 1; } else if (id == proto->server_id) { free(addr); return 0; } clientClose(proto); rv = openAndHandshake(proto, addr, id, context); free(addr); if (rv != 0) { return 1; } return 0; } static int refreshNodeStoreCache(struct dqlite_server *server, struct client_context *context) { struct client_node_info *servers; uint64_t n_servers; int rv; rv = clientSendCluster(&server->proto, context); if (rv != 0) { clientClose(&server->proto); return 1; } rv = clientRecvServers(&server->proto, &servers, &n_servers, context); if (rv != 0) { clientClose(&server->proto); return 1; } emptyCache(&server->cache); server->cache.nodes = servers; server->cache.len = (unsigned)n_servers; assert((uint64_t)server->cache.len == n_servers); server->cache.cap = (unsigned)n_servers; return 0; } static int maybeJoinCluster(struct dqlite_server *server, struct client_context *context) { int rv; if (findNodeInCache(&server->cache, server->local_id) != NULL) { return 0; } rv = clientSendAdd(&server->proto, server->local_id, server->local_addr, context); if (rv != 0) { clientClose(&server->proto); return 1; } rv = clientRecvEmpty(&server->proto, context); if (rv != 0) { clientClose(&server->proto); return 1; } rv = refreshNodeStoreCache(server, context); if (rv != 0) { return 1; } return 0; } static int bootstrapOrJoinCluster(struct dqlite_server *server, struct client_context *context) { struct client_node_info info; int rv; if (server->is_new && server->bootstrap) { rv = openAndHandshake(&server->proto, server->local_addr, server->local_id, context); if (rv != 0) { return 1; } info.addr = strdupChecked(server->local_addr); info.id = server->local_id; info.role = DQLITE_VOTER; pushNodeInfo(&server->cache, info); } else { rv = connectToSomeServer(server, context); if (rv != 0) { return 1; } rv = tryReconnectToLeader(&server->proto, context); if (rv != 0) { return 1; } rv = refreshNodeStoreCache(server, context); if (rv != 0) { return 1; } rv = maybeJoinCluster(server, context); if (rv != 0) { return 1; } } writeNodeStore(server); return 0; } static void *refreshTask(void *arg) { struct dqlite_server *server = arg; struct client_context context; struct timespec ts; unsigned long long nsec; int rv; rv = pthread_mutex_lock(&server->mutex); assert(rv == 0); for (;;) { rv = clock_gettime(CLOCK_REALTIME, &ts); assert(rv == 0); nsec = (unsigned long long)ts.tv_nsec; nsec += server->refresh_period * 1000 * 1000; while (nsec > 1000 * 1000 * 1000) { nsec -= 1000 * 1000 * 1000; ts.tv_sec += 1; } /* The type of tv_nsec is "an implementation-defined signed type * capable of holding [the range 0..=999,999,999]". int is the * narrowest such type (on all the targets we care about), so * cast to that before doing the assignment to avoid warnings. 
*/ ts.tv_nsec = (int)nsec; rv = pthread_cond_timedwait(&server->cond, &server->mutex, &ts); if (server->shutdown) { rv = pthread_mutex_unlock(&server->mutex); assert(rv == 0); break; } assert(rv == 0 || rv == ETIMEDOUT); clientContextMillis(&context, 5000); if (server->proto.fd == -1) { rv = connectToSomeServer(server, &context); if (rv != 0) { continue; } (void)tryReconnectToLeader(&server->proto, &context); if (server->proto.fd == -1) { continue; } } rv = refreshNodeStoreCache(server, &context); if (rv != 0) { continue; } writeNodeStore(server); } return NULL; } int dqlite_server_start(dqlite_server *server) { int info_fd; int store_fd; off_t full_size; ssize_t size; char *buf; ssize_t n_read; struct client_context context; int rv; rv = sqlite3_threadsafe(); if (!(rv == 1 || rv == 2)) { goto err; } if (server->started) { goto err; } if (server->bootstrap && server->cache.len > 0) { goto err; } server->is_new = true; server->dir_fd = open(server->dir_path, O_RDONLY | O_DIRECTORY); if (server->dir_fd < 0) { goto err; } info_fd = openat(server->dir_fd, "server-info", O_RDWR | O_CREAT, 0664); if (info_fd < 0) { goto err_after_open_dir; } store_fd = openat(server->dir_fd, "node-store", O_RDWR | O_CREAT, 0664); if (store_fd < 0) { goto err_after_open_info; } full_size = lseek(info_fd, 0, SEEK_END); assert(full_size >= 0); if (full_size > (off_t)SSIZE_MAX) { goto err_after_open_store; } size = (ssize_t)full_size; if (size > 0) { server->is_new = false; /* TODO mmap it? */ buf = mallocChecked((size_t)size); n_read = pread(info_fd, buf, (size_t)size, 0); if (n_read < size) { free(buf); goto err_after_open_store; } free(server->local_addr); server->local_addr = NULL; rv = parseLocalInfo(buf, (size_t)size, &server->local_addr, &server->local_id); free(buf); if (rv != 0) { goto err_after_open_store; } } full_size = lseek(store_fd, 0, SEEK_END); assert(full_size >= 0); if (full_size > (off_t)SSIZE_MAX) { goto err_after_open_store; } size = (ssize_t)full_size; if (size > 0) { if (server->is_new) { goto err_after_open_store; } /* TODO mmap it? */ buf = mallocChecked((size_t)size); n_read = pread(store_fd, buf, (size_t)size, 0); if (n_read < size) { free(buf); goto err_after_open_store; } emptyCache(&server->cache); rv = parseNodeStore(buf, (size_t)size, &server->cache); free(buf); if (rv != 0) { goto err_after_open_store; } } if (server->is_new) { server->local_id = server->bootstrap ? BOOTSTRAP_ID : dqlite_generate_node_id(server->local_addr); } rv = dqlite_node_create(server->local_id, server->local_addr, server->dir_path, &server->local); if (rv != 0) { goto err_after_create_node; } rv = dqlite_node_set_bind_address( server->local, (server->bind_addr != NULL) ? 
server->bind_addr : server->local_addr); if (rv != 0) { goto err_after_create_node; } rv = dqlite_node_set_connect_func(server->local, server->connect, server->connect_arg); if (rv != 0) { goto err_after_create_node; } rv = dqlite_node_start(server->local); if (rv != 0) { goto err_after_create_node; } /* TODO set weight and failure domain here */ rv = writeLocalInfo(server); if (rv != 0) { goto err_after_start_node; } clientContextMillis(&context, 5000); rv = bootstrapOrJoinCluster(server, &context); if (rv != 0) { goto err_after_start_node; } rv = pthread_create(&server->refresh_thread, NULL, refreshTask, server); assert(rv == 0); close(store_fd); close(info_fd); server->started = true; return 0; err_after_start_node: dqlite_node_stop(server->local); err_after_create_node: dqlite_node_destroy(server->local); server->local = NULL; err_after_open_store: close(store_fd); err_after_open_info: close(info_fd); err_after_open_dir: close(server->dir_fd); server->dir_fd = -1; err: return 1; } dqlite_node_id dqlite_server_get_id(dqlite_server *server) { return server->local_id; } int dqlite_server_handover(dqlite_server *server) { int rv = dqlite_node_handover(server->local); if (rv != 0) { return 1; } return 0; } int dqlite_server_stop(dqlite_server *server) { void *ret; int rv; if (!server->started) { return 1; } rv = pthread_mutex_lock(&server->mutex); assert(rv == 0); server->shutdown = true; rv = pthread_mutex_unlock(&server->mutex); assert(rv == 0); rv = pthread_cond_signal(&server->cond); assert(rv == 0); rv = pthread_join(server->refresh_thread, &ret); assert(rv == 0); emptyCache(&server->cache); clientClose(&server->proto); server->started = false; rv = dqlite_node_stop(server->local); if (rv != 0) { return 1; } return 0; } void dqlite_server_destroy(dqlite_server *server) { pthread_cond_destroy(&server->cond); pthread_mutex_destroy(&server->mutex); emptyCache(&server->cache); free(server->dir_path); if (server->local != NULL) { dqlite_node_destroy(server->local); } free(server->local_addr); free(server->bind_addr); close(server->dir_fd); free(server); } dqlite-1.16.7/src/server.h000066400000000000000000000062741465252713400153600ustar00rootroot00000000000000#ifndef DQLITE_SERVER_H #define DQLITE_SERVER_H #include #include #include "client/protocol.h" #include "config.h" #include "id.h" #include "lib/assert.h" #include "lib/threadpool.h" #include "logger.h" #include "raft.h" #include "registry.h" #define DQLITE_ERRMSG_BUF_SIZE 300 /** * A single dqlite server instance. */ struct dqlite_node { bool initialized; /* dqlite__init succeeded */ int lock_fd; /* Locked file in the data directory */ pthread_t thread; /* Main run loop thread. 
*/ struct config config; /* Config values */ struct sqlite3_vfs vfs; /* In-memory VFS */ struct registry registry; /* Databases */ struct uv_loop_s loop; /* UV loop */ struct pool_s pool; /* Thread pool */ struct raft_uv_transport raft_transport; /* Raft libuv transport */ struct raft_io raft_io; /* libuv I/O */ struct raft_fsm raft_fsm; /* dqlite FSM */ sem_t ready; /* Server is ready */ sem_t stopped; /* Notify loop stopped */ sem_t handover_done; queue queue; /* Incoming connections */ queue conns; /* Active connections */ queue roles_changes; bool running; /* Loop is running */ struct raft raft; /* Raft instance */ struct uv_stream_s *listener; /* Listening socket */ struct uv_async_s handover; int handover_status; void (*handover_done_cb)(struct dqlite_node *, int); struct uv_async_s stop; /* Trigger UV loop stop */ struct uv_timer_s startup; /* Unblock ready sem */ struct uv_timer_s timer; int raft_state; /* Previous raft state */ char *bind_address; /* Listen address */ bool role_management; int (*connect_func)( void *, const char *, int *); /* Connection function for role management */ void *connect_func_arg; /* User data for connection function */ char errmsg[DQLITE_ERRMSG_BUF_SIZE]; /* Last error occurred */ struct id_state random_state; /* For seeding ID generation */ }; /* Dynamic array of node info objects. This is the in-memory representation of * the node store. */ struct node_store_cache { struct client_node_info *nodes; /* owned */ unsigned len; unsigned cap; }; struct dqlite_server { /* Threading stuff: */ pthread_cond_t cond; pthread_mutex_t mutex; pthread_t refresh_thread; /* These fields are protected by the mutex: */ bool shutdown; struct node_store_cache cache; /* We try to keep this pointing at the leader, but it might be out of * date or not open. */ struct client_proto proto; /* These fields are only accessed on the main thread: */ bool started; bool is_new; bool bootstrap; char *dir_path; /* owned */ dqlite_node *local; uint64_t local_id; char *local_addr; /* owned */ char *bind_addr; /* owned */ dqlite_connect_func connect; void *connect_arg; unsigned long long refresh_period; /* in milliseconds */ int dir_fd; }; int dqlite__init(struct dqlite_node *d, dqlite_node_id id, const char *address, const char *dir); void dqlite__close(struct dqlite_node *d); int dqlite__run(struct dqlite_node *d); #endif dqlite-1.16.7/src/stmt.c000066400000000000000000000011571465252713400150270ustar00rootroot00000000000000#include #include "./lib/assert.h" #include "./tuple.h" #include "stmt.h" /* The maximum number of columns we expect (for bindings or rows) is 255, which * can fit in one byte. */ #define STMT__MAX_COLUMNS (1 << 8) - 1 void stmt__init(struct stmt *s) { s->stmt = NULL; } void stmt__close(struct stmt *s) { if (s->stmt != NULL) { /* Ignore the return code, since it will be non-zero in case the * most rececent evaluation of the statement failed. */ sqlite3_finalize(s->stmt); } } const char *stmt__hash(struct stmt *stmt) { (void)stmt; return NULL; } REGISTRY_METHODS(stmt__registry, stmt); dqlite-1.16.7/src/stmt.h000066400000000000000000000055661465252713400150440ustar00rootroot00000000000000/****************************************************************************** * * APIs to decode parameters and bind them to SQLite statement, and to fetch * rows and encode them. * * The dqlite wire format for a list of parameters to be bound to a statement * is divided in header and body. The format of the header is: * * 8 bits: Number of parameters to bind (min is 1, max is 255). 
* 4 bits: Type code of the 1st parameter to bind. * 4 bits: Type code of the 2nd parameter to bind, or 0. * 4 bits: Type code of the 3rd parameter to bind, or 0. * ... * * This repeats until reaching a full 64-bit word. If there are more than 14 * parameters, the header will grow additional 64-bit words as needed, following * the same pattern: a sequence of 4-bit slots with type codes of the parameters * to bind, followed by a sequence of zero bits, until word boundary is reached. * * After the parameters header follows the parameters body, which contains one * value for each parameter to bind, following the normal encoding rules. * * The dqlite wire format for a set of query rows is divided in header and * body. The format of the header is: * * 64 bits: Number of columns in the result set (min is 1). * 64 bits: Name of the first column. If the name is longer, additional words * of 64 bits can be used, like for normal string encoding. * ... If present, name of the 2nd, 3rd, ..., nth column. * * After the result set header follows the result set body, which is a sequence * of zero or more rows. Each row has the following format: * * 4 bits: Type code of the 1st column of the row. * 4 bits: Type code of the 2nd column of the row, or 0. * 4 bits: Type code of the 3rd column of the row, or 0. * * This repeats until reaching a full 64-bit word. If there are more than 16 row * columns, the header will grow additional 64-bit words as needed, following * the same pattern. After this row preamble, the values of all columns of the * row follow, using the normal dqlite encoding conventions. * *****************************************************************************/ #ifndef DQLITE_STMT_H #define DQLITE_STMT_H #include <sqlite3.h> #include "lib/registry.h" /* Hold state for a single prepared SQLite statement */ struct stmt { size_t id; /* Statement ID */ sqlite3_stmt *stmt; /* Underlying SQLite statement handle */ }; /* Initialize a statement state object */ void stmt__init(struct stmt *s); /* Close a statement state object, releasing all associated resources. */ void stmt__close(struct stmt *s); /* No-op hash function (hashing is not supported for stmt). This is * required by the registry interface. */ const char *stmt__hash(struct stmt *stmt); /* TODO: change registry naming pattern */ #define stmt_init stmt__init #define stmt_close stmt__close #define stmt_hash stmt__hash REGISTRY(stmt__registry, stmt); #endif /* DQLITE_STMT_H */ dqlite-1.16.7/src/tracing.c000066400000000000000000000046431465252713400154710ustar00rootroot00000000000000#include "tracing.h" #include <stdio.h> /* stderr */ #include <stdlib.h> #include <string.h> /* strstr, strlen */ #include <sys/syscall.h> /* syscall */ #include <unistd.h> /* syscall, getpid */ #include "assert.h" /* assert */ #include "lib/byte.h" /* ARRAY_SIZE */ #define LIBDQLITE_TRACE "LIBDQLITE_TRACE" bool _dqliteTracingEnabled = false; static unsigned tracer__level; static pid_t tracerPidCached; void dqliteTracingMaybeEnable(bool enable) { const char *trace_level = getenv(LIBDQLITE_TRACE); if (trace_level != NULL) { tracerPidCached = getpid(); _dqliteTracingEnabled = enable; tracer__level = (unsigned)atoi(trace_level); tracer__level = tracer__level < TRACE_NR ? tracer__level : TRACE_NONE; } } static inline const char *tracerShortFileName(const char *fname) { static const char top_src_dir[] = "dqlite/"; const char *p; p = strstr(fname, top_src_dir); return p != NULL ?
p + strlen(top_src_dir) : fname; } static inline const char *tracerTraceLevelName(unsigned int level) { static const char *levels[] = { "NONE", "DEBUG", "INFO", "WARN", "ERROR", "FATAL", }; return level < ARRAY_SIZE(levels) ? levels[level] : levels[0]; } static pid_t tracerPidCached; /* NOTE: on i386 and other platforms there're no specifically imported gettid() functions in unistd.h */ static inline pid_t gettidImpl(void) { return (pid_t)syscall(SYS_gettid); } static inline void tracerEmit(const char *file, unsigned int line, const char *func, unsigned int level, const char *message) { struct timespec ts = {0}; struct tm tm; pid_t tid = gettidImpl(); clock_gettime(CLOCK_REALTIME, &ts); gmtime_r(&ts.tv_sec, &tm); /* Example: LIBDQLITE[182942] 2023-11-27T14:46:24.912050507 001132 INFO uvClientSend src/uv_send.c:218 connection available... */ fprintf(stderr, "LIBDQLITE[%6.6u] %04d-%02d-%02dT%02d:%02d:%02d.%09lu " "%6.6u %-7s %-20s %s:%-3i %s\n", tracerPidCached, tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec, (unsigned long)ts.tv_nsec, (unsigned)tid, tracerTraceLevelName(level), func, tracerShortFileName(file), line, message); } void stderrTracerEmit(const char *file, unsigned int line, const char *func, unsigned int level, const char *message) { assert(tracer__level < TRACE_NR); if (level >= tracer__level) tracerEmit(file, line, func, level, message); } dqlite-1.16.7/src/tracing.h000066400000000000000000000034721465252713400154760ustar00rootroot00000000000000/* Tracing functionality for dqlite */ #ifndef DQLITE_TRACING_H_ #define DQLITE_TRACING_H_ #include #include #include #include #include "../include/dqlite.h" #include "utils.h" /* This global variable is only written once at startup and is only read * from there on. Users should not manipulate the value of this variable. */ DQLITE_VISIBLE_TO_TESTS extern bool _dqliteTracingEnabled; DQLITE_VISIBLE_TO_TESTS void stderrTracerEmit(const char *file, unsigned int line, const char *func, unsigned int level, const char *message); #define tracef0(LEVEL, ...) \ do { \ if (UNLIKELY(_dqliteTracingEnabled)) { \ char _msg[1024]; \ snprintf(_msg, sizeof _msg, __VA_ARGS__); \ stderrTracerEmit(__FILE__, __LINE__, __func__, \ (LEVEL), _msg); \ } \ } while (0) enum dqlite_trace_level { /** Represents an invalid trace level */ TRACE_NONE, /** Lower-level information to debug and analyse incorrect behavior */ TRACE_DEBUG, /** Information about current system's state */ TRACE_INFO, /** * Condition which requires a special handling, something which doesn't * happen normally */ TRACE_WARN, /** Resource unavailable, no connectivity, invalid value, etc. */ TRACE_ERROR, /** System is not able to continue performing its basic function */ TRACE_FATAL, TRACE_NR, }; #define tracef(...) tracef0(TRACE_DEBUG, __VA_ARGS__) /* Enable tracing if the appropriate env variable is set, or disable tracing. */ DQLITE_VISIBLE_TO_TESTS void dqliteTracingMaybeEnable(bool enabled); #endif /* DQLITE_TRACING_H_ */ dqlite-1.16.7/src/translate.c000066400000000000000000000020471465252713400160340ustar00rootroot00000000000000#include "translate.h" #include "assert.h" #include "leader.h" #include "protocol.h" #include "raft.h" /* Translate a raft error to a dqlite one. 
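 * The mapping lets raft failures travel through the SQLite error space that
 * dqlite clients already handle; e.g. SQLITE_IOERR_NOT_LEADER is a hint to
 * re-resolve the leader and retry the statement there.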
*/ int translateRaftErrCode(int code) { switch (code) { case RAFT_NOTLEADER: return SQLITE_IOERR_NOT_LEADER; case RAFT_LEADERSHIPLOST: return SQLITE_IOERR_LEADERSHIP_LOST; case RAFT_CANTCHANGE: return SQLITE_BUSY; default: return SQLITE_ERROR; } } /* Translate a dqlite role code to its raft equivalent. */ int translateDqliteRole(int role) { switch (role) { case DQLITE_VOTER: return RAFT_VOTER; case DQLITE_STANDBY: return RAFT_STANDBY; case DQLITE_SPARE: return RAFT_SPARE; default: /* For backward compat with clients that don't set a * role. */ return DQLITE_VOTER; } } /* Translate a raft role code to its dqlite equivalent. */ int translateRaftRole(int role) { switch (role) { case RAFT_VOTER: return DQLITE_VOTER; case RAFT_STANDBY: return DQLITE_STANDBY; case RAFT_SPARE: return DQLITE_SPARE; default: assert(0); return -1; } } dqlite-1.16.7/src/translate.h000066400000000000000000000006171465252713400160420ustar00rootroot00000000000000/* Translate to/from dqlite types */ #ifndef DQLITE_TRANSLATE_H_ #define DQLITE_TRANSLATE_H_ /* Translate a raft error to a dqlite one. */ int translateRaftErrCode(int code); /* Translate a dqlite role code to its raft equivalent. */ int translateDqliteRole(int role); /* Translate a raft role code to its dqlite equivalent. */ int translateRaftRole(int role); #endif /* DQLITE_TRANSLATE_H_ */ dqlite-1.16.7/src/transport.c000066400000000000000000000135741465252713400161020ustar00rootroot00000000000000#include "lib/transport.h" #include #include #include #include #include "lib/addr.h" #include "message.h" #include "protocol.h" #include "raft.h" #include "request.h" #include "tracing.h" #include "transport.h" struct impl { struct uv_loop_s *loop; struct { int (*f)(void *arg, const char *address, int *fd); void *arg; } connect; raft_id id; const char *address; raft_uv_accept_cb accept_cb; }; struct connect { struct impl *impl; struct raft_uv_connect *req; struct uv_work_s work; raft_id id; const char *address; int fd; int status; }; static int impl_init(struct raft_uv_transport *transport, raft_id id, const char *address) { tracef("impl init"); struct impl *i = transport->impl; i->id = id; i->address = address; return 0; } static int impl_listen(struct raft_uv_transport *transport, raft_uv_accept_cb cb) { tracef("impl listen"); struct impl *i = transport->impl; i->accept_cb = cb; return 0; } static void connect_work_cb(uv_work_t *work) { tracef("connect work cb"); struct connect *r = work->data; struct impl *i = r->impl; struct message message = {0}; struct request_connect request = {0}; uint64_t protocol; void *buf; char *cursor; size_t n; size_t n1; size_t n2; int rv; /* Establish a connection to the other node using the provided connect * function. */ rv = i->connect.f(i->connect.arg, r->address, &r->fd); if (rv != 0) { tracef("connect failed to %llu@%s", r->id, r->address); rv = RAFT_NOCONNECTION; goto err; } /* Send the initial dqlite protocol handshake. */ protocol = ByteFlipLe64(DQLITE_PROTOCOL_VERSION); rv = (int)write(r->fd, &protocol, sizeof protocol); if (rv != sizeof protocol) { tracef("write failed"); rv = RAFT_NOCONNECTION; goto err_after_connect; } /* Send a CONNECT dqlite protocol command, which will transfer control * to the underlying raft UV backend. 
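 *
 * The request carries this node's raft ID and address, which the
 * receiving dqlite server uses to hand the connection over to its raft
 * instance and associate it with the right peer.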
*/ request.id = i->id; request.address = i->address; n1 = message__sizeof(&message); n2 = request_connect__sizeof(&request); message.type = DQLITE_REQUEST_CONNECT; message.words = (uint32_t)(n2 / 8); n = n1 + n2; buf = sqlite3_malloc64(n); if (buf == NULL) { tracef("malloc failed"); rv = RAFT_NOCONNECTION; goto err_after_connect; } cursor = buf; message__encode(&message, &cursor); request_connect__encode(&request, &cursor); rv = (int)write(r->fd, buf, n); sqlite3_free(buf); if (rv != (int)n) { tracef("write failed"); rv = RAFT_NOCONNECTION; goto err_after_connect; } r->status = 0; return; err_after_connect: close(r->fd); err: r->status = rv; return; } static void connect_after_work_cb(uv_work_t *work, int status) { tracef("connect after work cb status %d", status); struct connect *r = work->data; struct impl *i = r->impl; struct uv_stream_s *stream = NULL; int rv; assert(status == 0); if (r->status != 0) { goto out; } rv = transport__stream(i->loop, r->fd, &stream); if (rv != 0) { tracef("transport stream failed %d", rv); r->status = RAFT_NOCONNECTION; close(r->fd); goto out; } out: r->req->cb(r->req, stream, r->status); sqlite3_free(r); } static int impl_connect(struct raft_uv_transport *transport, struct raft_uv_connect *req, raft_id id, const char *address, raft_uv_connect_cb cb) { tracef("impl connect id:%llu address:%s", id, address); struct impl *i = transport->impl; struct connect *r; int rv; r = sqlite3_malloc(sizeof *r); if (r == NULL) { tracef("malloc failed"); rv = DQLITE_NOMEM; goto err; } r->impl = i; r->req = req; r->work.data = r; r->id = id; r->address = address; req->cb = cb; rv = uv_queue_work(i->loop, &r->work, connect_work_cb, connect_after_work_cb); if (rv != 0) { tracef("queue work failed"); rv = RAFT_NOCONNECTION; goto err_after_connect_alloc; } return 0; err_after_connect_alloc: sqlite3_free(r); err: return rv; } static void impl_close(struct raft_uv_transport *transport, raft_uv_transport_close_cb cb) { tracef("impl close"); cb(transport); } int transportDefaultConnect(void *arg, const char *address, int *fd) { struct sockaddr_in addr_in; struct sockaddr *addr = (struct sockaddr *)&addr_in; socklen_t addr_len = sizeof addr_in; int rv; (void)arg; rv = AddrParse(address, addr, &addr_len, "8080", 0); if (rv != 0) { return RAFT_NOCONNECTION; } assert(addr->sa_family == AF_INET || addr->sa_family == AF_INET6); *fd = socket(addr->sa_family, SOCK_STREAM, 0); if (*fd == -1) { return RAFT_NOCONNECTION; } rv = connect(*fd, addr, addr_len); if (rv == -1) { close(*fd); return RAFT_NOCONNECTION; } return 0; } int raftProxyInit(struct raft_uv_transport *transport, struct uv_loop_s *loop) { tracef("raft proxy init"); struct impl *i = sqlite3_malloc(sizeof *i); if (i == NULL) { return DQLITE_NOMEM; } i->loop = loop; i->connect.f = transportDefaultConnect; i->connect.arg = NULL; i->accept_cb = NULL; transport->version = 1; transport->impl = i; transport->init = impl_init; transport->listen = impl_listen; transport->connect = impl_connect; transport->close = impl_close; return 0; } void raftProxyClose(struct raft_uv_transport *transport) { tracef("raft proxy close"); struct impl *i = transport->impl; sqlite3_free(i); } void raftProxyAccept(struct raft_uv_transport *transport, raft_id id, const char *address, struct uv_stream_s *stream) { tracef("raft proxy accept"); struct impl *i = transport->impl; /* If the accept callback is NULL it means we were stopped. 
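 * In that case close the stream handle and free its memory, since
 * nobody else will.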
*/ if (i->accept_cb == NULL) { tracef("raft proxy accept closed"); uv_close((struct uv_handle_s *)stream, (uv_close_cb)raft_free); } else { i->accept_cb(transport, id, address, stream); } } void raftProxySetConnectFunc(struct raft_uv_transport *transport, int (*f)(void *arg, const char *address, int *fd), void *arg) { struct impl *i = transport->impl; i->connect.f = f; i->connect.arg = arg; } dqlite-1.16.7/src/transport.h000066400000000000000000000022651465252713400161020ustar00rootroot00000000000000/* Implementation of the raft_uv_transport interface, proxied by a dqlite * connection. * * Instead of having raft instances connect to each other directly, we pass a * custom connect function that causes dqlite to send a CONNECT request to the * dqlite server where the destination raft instance is running. That server * responds to the CONNECT request by forwarding the dqlite connection to its * raft instance, after which the raft-to-raft connection is transparent. */ #ifndef TRANSPORT_H_ #define TRANSPORT_H_ #include "raft.h" #include "../include/dqlite.h" int transportDefaultConnect(void *arg, const char *address, int *fd); int raftProxyInit(struct raft_uv_transport *transport, struct uv_loop_s *loop); void raftProxyClose(struct raft_uv_transport *transport); /* Invoke the accept callback configured on the transport object. */ void raftProxyAccept(struct raft_uv_transport *transport, raft_id id, const char *address, struct uv_stream_s *stream); /* Set a custom connect function. */ void raftProxySetConnectFunc(struct raft_uv_transport *transport, int (*f)(void *arg, const char *address, int *fd), void *arg); #endif /* TRANSPORT_H_*/ dqlite-1.16.7/src/tuple.c000066400000000000000000000151161465252713400151710ustar00rootroot00000000000000#include #include "assert.h" #include "tuple.h" /* Return the tuple header size in bytes, for a tuple of @n values. * * If the tuple is a row, then each slot is 4 bits, otherwise if the tuple is a * sequence of parameters each slot is 8 bits. */ static size_t calc_header_size(unsigned long n, int format) { size_t size; switch (format) { case TUPLE__ROW: /* Half a byte for each slot, rounded up... */ size = (n + 1) / 2; /* ...and padded to a multiple of 8 bytes. */ size = BytePad64(size); break; case TUPLE__PARAMS: /* 1-byte params count at the beginning of the first * word */ size = n + 1; size = BytePad64(size); /* Params count is not included in the header */ size -= 1; break; case TUPLE__PARAMS32: /* 4-byte params count at the beginning of the first * word */ size = n + 4; size = BytePad64(size); /* Params count is not included in the header */ size -= 4; break; default: assert(0); } return size; } int tuple_decoder__init(struct tuple_decoder *d, unsigned n, int format, struct cursor *cursor) { size_t header_size; uint8_t byte = 0; uint32_t val = 0; int rc = 0; switch (format) { case TUPLE__ROW: assert(n > 0); d->n = n; break; case TUPLE__PARAMS: assert(n == 0); rc = uint8__decode(cursor, &byte); d->n = byte; break; case TUPLE__PARAMS32: assert(n == 0); rc = uint32__decode(cursor, &val); d->n = val; break; default: assert(0); } if (rc != 0) { return rc; } d->format = format; d->i = 0; d->header = (const uint8_t *)cursor->p; /* Check that there is enough room to hold n type code slots. */ header_size = calc_header_size(d->n, d->format); if (header_size > cursor->cap) { return DQLITE_PARSE; } d->cursor = cursor; d->cursor->p += header_size; d->cursor->cap -= header_size; return 0; } /* Return the number of values in the decoder's tuple. 
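 *
 * For TUPLE__PARAMS and TUPLE__PARAMS32 tuples this is the count that
 * was decoded from the tuple's own length field.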
*/ unsigned long tuple_decoder__n(struct tuple_decoder *d) { return d->n; } /* Return the type of the i'th value of the tuple. */ static int get_type(struct tuple_decoder *d, unsigned long i) { int type; /* In row format the type slot size is 4 bits, while in params format * the slot is 8 bits. */ if (d->format == TUPLE__ROW) { type = d->header[i / 2]; if (i % 2 == 0) { type &= 0x0f; } else { type = type >> 4; } } else { type = d->header[i]; } return type; } int tuple_decoder__next(struct tuple_decoder *d, struct value *value) { int rc; assert(d->i < d->n); value->type = get_type(d, d->i); switch (value->type) { case SQLITE_INTEGER: rc = int64__decode(d->cursor, &value->integer); break; case SQLITE_FLOAT: rc = float__decode(d->cursor, &value->float_); break; case SQLITE_BLOB: rc = blob__decode(d->cursor, &value->blob); break; case SQLITE_NULL: /* TODO: allow null to be encoded with 0 bytes? */ rc = uint64__decode(d->cursor, &value->null); break; case SQLITE_TEXT: rc = text__decode(d->cursor, &value->text); break; case DQLITE_ISO8601: rc = text__decode(d->cursor, &value->iso8601); break; case DQLITE_BOOLEAN: rc = uint64__decode(d->cursor, &value->boolean); break; default: rc = DQLITE_PARSE; break; }; if (rc != 0) { return rc; } d->i++; return 0; } /* Return a pointer to the tuple header. */ static uint8_t *encoder__header(struct tuple_encoder *e) { return buffer__cursor(e->buffer, e->header); } int tuple_encoder__init(struct tuple_encoder *e, unsigned long n, int format, struct buffer *buffer) { char *cursor; size_t n_header; e->n = n; e->format = format; e->buffer = buffer; e->i = 0; /* When encoding a tuple of parameters, we need to write the * number of values at the beginning of the header. */ if (e->format == TUPLE__PARAMS) { assert(n <= UINT8_MAX); uint8_t *header = buffer__advance(buffer, 1); if (header == NULL) { return DQLITE_NOMEM; } header[0] = (uint8_t)n; } else if (e->format == TUPLE__PARAMS32) { uint32_t val = (uint32_t)n; assert((unsigned long long)val == (unsigned long long)n); char *header = buffer__advance(buffer, 4); if (header == NULL) { return DQLITE_NOMEM; } uint32__encode(&val, &header); } e->header = buffer__offset(buffer); /* Reset the header */ n_header = calc_header_size(n, format); memset(encoder__header(e), 0, n_header); /* Advance the buffer write pointer past the tuple header. */ cursor = buffer__advance(buffer, n_header); if (cursor == NULL) { return DQLITE_NOMEM; } return 0; } /* Set the type of the i'th value of the tuple. */ static void set_type(struct tuple_encoder *e, unsigned long i, int type) { uint8_t *header = encoder__header(e); /* In row format the type slot size is 4 bits, while in params format * the slot is 8 bits. 
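 *
 * For example (illustrative): in row format the type codes 1, 3 and 5
 * of a three-value tuple are packed as the header bytes 0x31 0x05,
 * i.e. value 0 in the low nibble of byte 0, value 1 in the high nibble
 * of byte 0 and value 2 in the low nibble of byte 1.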
*/ if (e->format == TUPLE__ROW) { uint8_t *slot; slot = &header[i / 2]; if (i % 2 == 0) { *slot = (uint8_t)type; } else { *slot |= (uint8_t)(type << 4); } } else { header[i] = (uint8_t)type; } } int tuple_encoder__next(struct tuple_encoder *e, struct value *value) { char *cursor; size_t size; assert(e->i < e->n); set_type(e, e->i, value->type); switch (value->type) { case SQLITE_INTEGER: size = int64__sizeof(&value->integer); break; case SQLITE_FLOAT: size = float__sizeof(&value->float_); break; case SQLITE_BLOB: size = blob__sizeof(&value->blob); break; case SQLITE_NULL: /* TODO: allow null to be encoded with 0 bytes */ size = uint64__sizeof(&value->null); break; case SQLITE_TEXT: size = text__sizeof(&value->text); break; case DQLITE_UNIXTIME: size = int64__sizeof(&value->unixtime); break; case DQLITE_ISO8601: size = text__sizeof(&value->iso8601); break; case DQLITE_BOOLEAN: size = uint64__sizeof(&value->boolean); break; default: assert(0); }; /* Advance the buffer write pointer. */ cursor = buffer__advance(e->buffer, size); if (cursor == NULL) { return DQLITE_NOMEM; } switch (value->type) { case SQLITE_INTEGER: int64__encode(&value->integer, &cursor); break; case SQLITE_FLOAT: float__encode(&value->float_, &cursor); break; case SQLITE_BLOB: blob__encode(&value->blob, &cursor); break; case SQLITE_NULL: /* TODO: allow null to be encoded with 0 bytes */ uint64__encode(&value->null, &cursor); break; case SQLITE_TEXT: text__encode(&value->text, &cursor); break; case DQLITE_UNIXTIME: int64__encode(&value->unixtime, &cursor); break; case DQLITE_ISO8601: text__encode(&value->iso8601, &cursor); break; case DQLITE_BOOLEAN: uint64__encode(&value->boolean, &cursor); break; }; e->i++; return 0; } dqlite-1.16.7/src/tuple.h000066400000000000000000000104701465252713400151740ustar00rootroot00000000000000/** * Encode and decode tuples of database values. * * A tuple is composed by a header and a body. * * The format of the header changes depending on whether the tuple is a sequence * of parameters to bind to a statement, or a sequence of values of a single row * yielded by a query. * * For a tuple of parameters the format of the header is: * * 8 bits: Number of values in the tuple. * 8 bits: Type code of the 1st value of the tuple. * 8 bits: Type code of the 2nd value of the tuple, or 0. * 8 bits: Type code of the 3rd value of the tuple, or 0. * ... * * This repeats until reaching a full 64-bit word. If there are more than 7 * parameters to bind, the header will grow additional 64-bit words as needed, * following the same pattern: a sequence of 8-bit slots with type codes of the * parameters followed by a sequence of zero bits, until word boundary is * reached. * * For a tuple of row values the format of the header is: * * 4 bits: Type code of the 1st value of the tuple. * 4 bits: Type code of the 2nd value of the tuple, or 0. * 4 bits: Type code of the 3rd value of the tuple, or 0. * ... * * This repeats until reaching a full 64-bit word. If there are more than 16 * values, the header will grow additional 64-bit words as needed, following the * same pattern: a sequence of 4-bit slots with type codes of the values * followed by a sequence of zero bits, until word boundary is reached. * * After the header the body follows immediately, which contains all parameters * or values in sequence, encoded using type-specific rules. 
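 *
 * For example (illustrative): a parameters tuple holding the two values
 * 7 (SQLITE_INTEGER, type code 1) and NULL (SQLITE_NULL, type code 5)
 * gets the single header word
 *
 *   02 01 05 00 00 00 00 00
 *
 * that is, a count byte of 2 and one type-code byte per value, padded
 * with zero bytes up to the word boundary; the body then follows with
 * one 8-byte word for each of the two values.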
*/ #ifndef DQLITE_TUPLE_H_ #define DQLITE_TUPLE_H_ #include #include #include #include "lib/buffer.h" #include "lib/serialize.h" #include "protocol.h" /* Formats for tuple encoding and decoding. */ enum { /* Used for coding a row from the database: length field is implicit, * type codes are 4 bits each. */ TUPLE__ROW = 1, /* Used for coding a short tuple of statement parameters: length field * is 1 byte, type codes are 1 byte each. */ TUPLE__PARAMS, /* Used for coding a longer tuple of statement parameters: length field * is 4 bytes, type codes are 1 byte each. */ TUPLE__PARAMS32 }; /** * Hold a single database value. */ struct value { int type; union { int64_t integer; double float_; uv_buf_t blob; uint64_t null; const char *text; const char *iso8601; /* INT8601 date string */ int64_t unixtime; /* Unix time in seconds since epoch */ uint64_t boolean; }; }; /** * Maintain state while decoding a single tuple. */ struct tuple_decoder { unsigned long n; /* Number of values in the tuple */ struct cursor *cursor; /* Reading cursor */ int format; /* Tuple format */ unsigned long i; /* Index of next value to decode */ const uint8_t *header; /* Pointer to tuple header */ }; /** * Initialize the state of the decoder, before starting to decode a new * tuple. * * If @n is zero, it means that the tuple is a sequence of statement * parameters. In that case the d->n field will be read from the first byte of * @cursor. */ int tuple_decoder__init(struct tuple_decoder *d, unsigned n, int format, struct cursor *cursor); /** * Return the number of values in the tuple being decoded. * * In row format this will be the same @n passed to the constructor. In * parameters format this is the value contained in the first byte of the tuple * header. */ unsigned long tuple_decoder__n(struct tuple_decoder *d); /** * Decode the next value of the tuple. */ int tuple_decoder__next(struct tuple_decoder *d, struct value *value); /** * Maintain state while encoding a single tuple. */ struct tuple_encoder { unsigned long n; /* Number of values in the tuple */ int format; /* Tuple format */ struct buffer *buffer; /* Write buffer */ unsigned long i; /* Index of next value to encode */ size_t header; /* Buffer offset of tuple header */ }; /** * Initialize the state of the encoder, before starting to encode a new * tuple. The @n parameter must always be greater than zero. */ int tuple_encoder__init(struct tuple_encoder *e, unsigned long n, int format, struct buffer *buffer); /** * Encode the next value of the tuple. 
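 *
 * A minimal usage sketch (illustrative only; it assumes `buf` is an
 * already-initialized struct buffer and omits error handling):
 *
 *   struct tuple_encoder e;
 *   struct value v = { .type = SQLITE_INTEGER, .integer = 7 };
 *   tuple_encoder__init(&e, 1, TUPLE__PARAMS, &buf);
 *   tuple_encoder__next(&e, &v);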
*/ int tuple_encoder__next(struct tuple_encoder *e, struct value *value); #endif /* DQLITE_TUPLE_H_ */ dqlite-1.16.7/src/utils.h000066400000000000000000000014231465252713400152010ustar00rootroot00000000000000#ifndef DQLITE_UTILS_H_ #define DQLITE_UTILS_H_ #include #include #include /* Various utility functions and macros */ #define PTR_TO_UINT64(p) ((uint64_t)((uintptr_t)(p))) #define UINT64_TO_PTR(u, ptr_type) ((ptr_type)((uintptr_t)(u))) #define LIKELY(x) __builtin_expect(!!(x), 1) #define UNLIKELY(x) __builtin_expect(!!(x), 0) #define IMPOSSIBLE(why) assert(false && why) #define DBG() fprintf(stderr, "%s:%d\n", __func__, __LINE__) #define CONTAINER_OF(e, type, field) \ ((type *)(uintptr_t)((char *)(e)-offsetof(type, field))) #define PRE(cond) assert((cond)) #define POST(cond) assert((cond)) #define ERGO(a, b) (!(a) || (b)) static inline bool is_po2(unsigned long n) { return n > 0 && (n & (n - 1)) == 0; } #endif /* DQLITE_UTILS_H_ */ dqlite-1.16.7/src/vfs.c000066400000000000000000002512711465252713400146420ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include "../include/dqlite.h" #include "lib/assert.h" #include "lib/byte.h" #include "format.h" #include "raft.h" #include "tracing.h" #include "vfs.h" /* tinycc doesn't have this builtin, nor the warning that it's meant to silence. */ #ifdef __TINYC__ #define __builtin_assume_aligned(x, y) x #endif /* Byte order */ #if defined(DQLITE_LITTLE_ENDIAN) #define VFS__BIGENDIAN 0 #elif defined(DQLITE_BIG_ENDIAN) #define VFS__BIGENDIAN 1 #else const int vfsOne = 1; #define VFS__BIGENDIAN (*(char *)(&vfsOne) == 0) #endif /* Maximum pathname length supported by this VFS. */ #define VFS__MAX_PATHNAME 512 /* WAL magic value. Either this value, or the same value with the least * significant bit also set (FORMAT__WAL_MAGIC | 0x00000001) is stored in 32-bit * big-endian format in the first 4 bytes of a WAL file. * * If the LSB is set, then the checksums for each frame within the WAL file are * calculated by treating all data as an array of 32-bit big-endian * words. Otherwise, they are calculated by interpreting all data as 32-bit * little-endian words. */ #define VFS__WAL_MAGIC 0x377f0682 /* WAL format version (same for WAL index). */ #define VFS__WAL_VERSION 3007000 /* Index of the write lock in the WAL-index header locks area. */ #define VFS__WAL_WRITE_LOCK 0 /* Write ahead log header size. */ #define VFS__WAL_HEADER_SIZE 32 /* Write ahead log frame header size. */ #define VFS__FRAME_HEADER_SIZE 24 /* Size of the first part of the WAL index header. */ #define VFS__WAL_INDEX_HEADER_SIZE 48 /* Size of a single memory-mapped WAL index region. */ #define VFS__WAL_INDEX_REGION_SIZE 32768 #define vfsFrameSize(PAGE_SIZE) (VFS__FRAME_HEADER_SIZE + PAGE_SIZE) /* Hold content for a shared memory mapping. */ struct vfsShm { void **regions; /* Pointers to shared memory regions. */ unsigned n_regions; /* Number of shared memory regions. */ unsigned refcount; /* Number of outstanding mappings. */ unsigned shared[SQLITE_SHM_NLOCK]; /* Count of shared locks */ unsigned exclusive[SQLITE_SHM_NLOCK]; /* Count of exclusive locks */ }; /* Hold the content of a single WAL frame. */ struct vfsFrame { uint8_t header[VFS__FRAME_HEADER_SIZE]; uint8_t *page; /* Content of the page. */ }; /* WAL-specific content. * Watch out when changing the members of this struct, see * comment in `formatWalChecksumBytes`. */ struct vfsWal { uint8_t hdr[VFS__WAL_HEADER_SIZE]; /* Header. 
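 * (32 bytes: magic, format version, page size,
 * checkpoint sequence number, two salts and two
 * checksum words.)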
*/ struct vfsFrame **frames; /* All frames committed. */ unsigned n_frames; /* Number of committed frames. */ struct vfsFrame **tx; /* Frames added by a transaction. */ unsigned n_tx; /* Number of added frames. */ }; /* Database-specific content */ struct vfsDatabase { char *name; /* Database name. */ void **pages; /* All database. */ unsigned page_size; /* Only used for on-disk db */ unsigned n_pages; /* Number of pages. */ struct vfsShm shm; /* Shared memory. */ struct vfsWal wal; /* Associated WAL. */ }; /* * Generate or extend an 8 byte checksum based on the data in array data[] and * the initial values of in[0] and in[1] (or initial values of 0 and 0 if * in==NULL). * * The checksum is written back into out[] before returning. * * n must be a positive multiple of 8. */ static void vfsChecksum( uint8_t *data, /* Content to be checksummed */ unsigned n, /* Bytes of content in a[]. Must be a multiple of 8. */ const uint32_t in[2], /* Initial checksum value input */ uint32_t out[2] /* OUT: Final checksum value output */ ) { assert((((uintptr_t)data) % sizeof(uint32_t)) == 0); uint32_t s1, s2; uint32_t *cur = (uint32_t *)__builtin_assume_aligned(data, sizeof(uint32_t)); uint32_t *end = (uint32_t *)__builtin_assume_aligned(&data[n], sizeof(uint32_t)); if (in) { s1 = in[0]; s2 = in[1]; } else { s1 = s2 = 0; } assert(n >= 8); assert((n & 0x00000007) == 0); assert(n <= 65536); do { s1 += *cur++ + s2; s2 += *cur++ + s1; } while (cur < end); out[0] = s1; out[1] = s2; } /* Create a new frame of a WAL file. */ static struct vfsFrame *vfsFrameCreate(unsigned size) { struct vfsFrame *f; assert(size > 0); f = sqlite3_malloc(sizeof *f); if (f == NULL) { goto oom; } f->page = sqlite3_malloc64(size); if (f->page == NULL) { goto oom_after_page_alloc; } memset(f->header, 0, FORMAT__WAL_FRAME_HDR_SIZE); memset(f->page, 0, (size_t)size); return f; oom_after_page_alloc: sqlite3_free(f); oom: return NULL; } /* Destroy a WAL frame */ static void vfsFrameDestroy(struct vfsFrame *f) { assert(f != NULL); assert(f->page != NULL); sqlite3_free(f->page); sqlite3_free(f); } /* Initialize the shared memory mapping of a database file. */ static void vfsShmInit(struct vfsShm *s) { int i; s->regions = NULL; s->n_regions = 0; s->refcount = 0; for (i = 0; i < SQLITE_SHM_NLOCK; i++) { s->shared[i] = 0; s->exclusive[i] = 0; } } /* Release all resources used by a shared memory mapping. */ static void vfsShmClose(struct vfsShm *s) { void *region; unsigned i; assert(s != NULL); /* Free all regions. */ for (i = 0; i < s->n_regions; i++) { region = *(s->regions + i); assert(region != NULL); sqlite3_free(region); } /* Free the shared memory region array. */ if (s->regions != NULL) { sqlite3_free(s->regions); } } /* Revert the shared mamory to its initial state. */ static void vfsShmReset(struct vfsShm *s) { vfsShmClose(s); vfsShmInit(s); } /* Initialize a new WAL object. */ static void vfsWalInit(struct vfsWal *w) { memset(w->hdr, 0, VFS__WAL_HEADER_SIZE); w->frames = NULL; w->n_frames = 0; w->tx = NULL; w->n_tx = 0; } /* Initialize a new database object. */ static void vfsDatabaseInit(struct vfsDatabase *d) { d->pages = NULL; d->n_pages = 0; d->page_size = 0; vfsShmInit(&d->shm); vfsWalInit(&d->wal); } /* Release all memory used by a WAL object. 
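 * This covers both the committed frames and any frames still belonging
 * to a pending transaction.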
*/ static void vfsWalClose(struct vfsWal *w) { unsigned i; for (i = 0; i < w->n_frames; i++) { vfsFrameDestroy(w->frames[i]); } if (w->frames != NULL) { sqlite3_free(w->frames); } for (i = 0; i < w->n_tx; i++) { vfsFrameDestroy(w->tx[i]); } if (w->tx != NULL) { sqlite3_free(w->tx); } } /* Release all memory used by a database object. */ static void vfsDatabaseClose(struct vfsDatabase *d) { unsigned i; for (i = 0; d->pages != NULL && i < d->n_pages; i++) { sqlite3_free(d->pages[i]); } if (d->pages != NULL) { sqlite3_free(d->pages); } vfsShmClose(&d->shm); vfsWalClose(&d->wal); } /* Destroy the content of a database object. */ static void vfsDatabaseDestroy(struct vfsDatabase *d) { assert(d != NULL); sqlite3_free(d->name); vfsDatabaseClose(d); sqlite3_free(d); } /* * Comment copied entirely for sqlite source code, it is safe to assume * the value 0x40000000 will never change. dq_sqlite_pending_byte is global * to be able to adapt it in the unittest, the value must never be changed. * * ==BEGIN COPY== * The value of the "pending" byte must be 0x40000000 (1 byte past the * 1-gibabyte boundary) in a compatible database. SQLite never uses * the database page that contains the pending byte. It never attempts * to read or write that page. The pending byte page is set aside * for use by the VFS layers as space for managing file locks. * * During testing, it is often desirable to move the pending byte to * a different position in the file. This allows code that has to * deal with the pending byte to run on files that are much smaller * than 1 GiB. The sqlite3_test_control() interface can be used to * move the pending byte. * * IMPORTANT: Changing the pending byte to any value other than * 0x40000000 results in an incompatible database file format! * Changing the pending byte during operation will result in undefined * and incorrect behavior. * ==END COPY== */ DQLITE_VISIBLE_TO_TESTS unsigned dq_sqlite_pending_byte = 0x40000000; /* Get a page from the given database, possibly creating a new one. */ static int vfsDatabaseGetPage(struct vfsDatabase *d, uint32_t page_size, unsigned pgno, void **page) { int rc; assert(d != NULL); assert(pgno > 0); /* SQLite should access pages progressively, without jumping more than * one page after the end unless one would attempt to access a page at * `sqlite_pending_byte` offset, skipping a page is permitted then. */ bool pending_byte_page_reached = (page_size * d->n_pages == dq_sqlite_pending_byte); if ((pgno > d->n_pages + 1) && !pending_byte_page_reached) { rc = SQLITE_IOERR_WRITE; goto err; } if (pgno <= d->n_pages) { /* Return the existing page. */ assert(d->pages != NULL); *page = d->pages[pgno - 1]; return SQLITE_OK; } /* Create a new page, grow the page array, and append the * new page to it. */ *page = sqlite3_malloc64(page_size); if (*page == NULL) { rc = SQLITE_NOMEM; goto err; } void **pages = sqlite3_realloc64(d->pages, sizeof *pages * pgno); if (pages == NULL) { rc = SQLITE_NOMEM; goto err_after_vfs_page_create; } pages[pgno - 1] = *page; /* Allocate a page to store the pending_byte */ if (pending_byte_page_reached) { void *pending_byte_page = sqlite3_malloc64(page_size); if (pending_byte_page == NULL) { rc = SQLITE_NOMEM; goto err_after_pending_byte_page; } pages[d->n_pages] = pending_byte_page; } /* Update the page array. 
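 * and the page count to account for the newly created page(s).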
 */
	d->pages = pages;
	d->n_pages = pgno;

	return SQLITE_OK;

err_after_pending_byte_page:
	d->pages = pages;
err_after_vfs_page_create:
	sqlite3_free(*page);
err:
	*page = NULL;
	return rc;
}

/* Get a frame from the current transaction, possibly creating a new one. */
static int vfsWalFrameGet(struct vfsWal *w,
			  unsigned index,
			  uint32_t page_size,
			  struct vfsFrame **frame)
{
	int rv;

	assert(w != NULL);
	assert(index > 0);

	/* SQLite should access pages progressively, without jumping more than
	 * one page after the end. */
	if (index > w->n_frames + w->n_tx + 1) {
		rv = SQLITE_IOERR_WRITE;
		goto err;
	}

	if (index == w->n_frames + w->n_tx + 1) {
		/* Create a new frame, grow the transaction array, and append
		 * the new frame to it. */
		struct vfsFrame **tx;

		/* We assume that the page size has been set, either by
		 * intercepting the first main database file write, or by
		 * handling a 'PRAGMA page_size=N' command in
		 * vfs__file_control(). This assumption is enforced in
		 * vfsFileWrite(). */
		assert(page_size > 0);

		*frame = vfsFrameCreate(page_size);
		if (*frame == NULL) {
			rv = SQLITE_NOMEM;
			goto err;
		}

		tx = sqlite3_realloc64(w->tx, sizeof *tx * (w->n_tx + 1));
		if (tx == NULL) {
			rv = SQLITE_NOMEM;
			goto err_after_vfs_frame_create;
		}

		/* Append the new page to the new page array. */
		tx[index - w->n_frames - 1] = *frame;

		/* Update the page array. */
		w->tx = tx;
		w->n_tx++;
	} else {
		/* Return the existing page. */
		assert(w->tx != NULL);
		*frame = w->tx[index - w->n_frames - 1];
	}

	return SQLITE_OK;

err_after_vfs_frame_create:
	vfsFrameDestroy(*frame);
err:
	*frame = NULL;
	return rv;
}

/* Lookup a page from the given database, returning NULL if it doesn't exist. */
static void *vfsDatabasePageLookup(struct vfsDatabase *d, unsigned pgno)
{
	void *page;

	assert(d != NULL);
	assert(pgno > 0);

	if (pgno > d->n_pages) {
		/* This page hasn't been written yet. */
		return NULL;
	}

	page = d->pages[pgno - 1];

	assert(page != NULL);

	return page;
}

/* Lookup a frame from the WAL, returning NULL if it doesn't exist. */
static struct vfsFrame *vfsWalFrameLookup(struct vfsWal *w, unsigned n)
{
	struct vfsFrame *frame;

	assert(w != NULL);
	assert(n > 0);

	if (n > w->n_frames + w->n_tx) {
		/* This page hasn't been written yet. */
		return NULL;
	}

	if (n <= w->n_frames) {
		frame = w->frames[n - 1];
	} else {
		frame = w->tx[n - w->n_frames - 1];
	}

	assert(frame != NULL);

	return frame;
}

/* Parse the page size ("Must be a power of two between 512 and 32768
 * inclusive, or the value 1 representing a page size of 65536").
 *
 * Return 0 if the page size is out of bound. */
static uint32_t vfsParsePageSize(uint32_t page_size)
{
	if (page_size == 1) {
		page_size = FORMAT__PAGE_SIZE_MAX;
	} else if (page_size < FORMAT__PAGE_SIZE_MIN) {
		page_size = 0;
	} else if (page_size > (FORMAT__PAGE_SIZE_MAX / 2)) {
		page_size = 0;
	} else if (((page_size - 1) & page_size) != 0) {
		page_size = 0;
	}

	return page_size;
}

static uint32_t vfsDatabaseGetPageSize(struct vfsDatabase *d)
{
	uint8_t *page;

	/* Only set in disk-mode */
	if (d->page_size != 0) {
		return d->page_size;
	}

	assert(d->n_pages > 0);
	page = d->pages[0];

	/* The page size is stored in the 16th and 17th bytes of the first
	 * database page (big-endian) */
	return vfsParsePageSize(ByteGetBe16(&page[16]));
}

/* Truncate a database file to be exactly the given number of pages.
*/ static int vfsDatabaseTruncate(struct vfsDatabase *d, sqlite_int64 size) { void **cursor; uint32_t page_size; unsigned n_pages; unsigned i; if (d->n_pages == 0) { if (size > 0) { return SQLITE_IOERR_TRUNCATE; } return SQLITE_OK; } /* Since the file size is not zero, some content must * have been written and the page size must be known. */ page_size = vfsDatabaseGetPageSize(d); assert(page_size > 0); if ((size % page_size) != 0) { return SQLITE_IOERR_TRUNCATE; } n_pages = (unsigned)(size / page_size); /* We expect callers to only invoke us if some actual content has been * written already. */ assert(d->n_pages > 0); /* Truncate should always shrink a file. */ assert(n_pages <= d->n_pages); assert(d->pages != NULL); /* Destroy pages beyond pages_len. */ cursor = d->pages + n_pages; for (i = 0; i < (d->n_pages - n_pages); i++) { sqlite3_free(*cursor); cursor++; } /* Shrink the page array, possibly to 0. * * TODO: in principle realloc could fail also when shrinking. */ d->pages = sqlite3_realloc64(d->pages, sizeof *d->pages * n_pages); /* Update the page count. */ d->n_pages = n_pages; return SQLITE_OK; } /* Truncate a WAL file to zero. */ static int vfsWalTruncate(struct vfsWal *w, sqlite3_int64 size) { unsigned i; /* We expect SQLite to only truncate to zero, after a * full checkpoint. * * TODO: figure out other case where SQLite might * truncate to a different size. */ if (size != 0) { return SQLITE_PROTOCOL; } if (w->n_frames == 0) { return SQLITE_OK; } assert(w->frames != NULL); /* Restart the header. */ formatWalRestartHeader(w->hdr); /* Destroy all frames. */ for (i = 0; i < w->n_frames; i++) { vfsFrameDestroy(w->frames[i]); } sqlite3_free(w->frames); w->frames = NULL; w->n_frames = 0; return SQLITE_OK; } enum vfsFileType { VFS__DATABASE, /* Main database file */ VFS__JOURNAL, /* Default SQLite journal file */ VFS__WAL /* Write-Ahead Log */ }; /* Implementation of the abstract sqlite3_file base class. */ struct vfsFile { sqlite3_file base; /* Base class. Must be first. */ struct vfs *vfs; /* Pointer to volatile VFS data. */ enum vfsFileType type; /* Associated file (main db or WAL). */ struct vfsDatabase *database; /* Underlying database content. */ int flags; /* Flags passed to xOpen */ sqlite3_file *temp; /* For temp-files, actual VFS. */ sqlite3_file *db; /* For on-disk DB files, actual VFS. */ }; /* Custom dqlite VFS. Contains pointers to all databases that were created. */ struct vfs { struct vfsDatabase **databases; /* Database objects */ unsigned n_databases; /* Number of databases */ int error; /* Last error occurred. */ bool disk; /* True if the database is kept on disk. */ struct sqlite3_vfs *base_vfs; /* Base VFS. */ }; /* Create a new vfs object. */ static struct vfs *vfsCreate(void) { struct vfs *v; v = sqlite3_malloc(sizeof *v); if (v == NULL) { return NULL; } v->databases = NULL; v->n_databases = 0; v->error = 0; v->disk = false; v->base_vfs = sqlite3_vfs_find("unix"); assert(v->base_vfs != NULL); return v; } /* Release the memory used internally by the VFS object. * * All file content will be de-allocated, so dangling open FDs against * those files will be broken. 
*/ static void vfsDestroy(struct vfs *r) { unsigned i; assert(r != NULL); for (i = 0; i < r->n_databases; i++) { struct vfsDatabase *database = r->databases[i]; vfsDatabaseDestroy(database); } if (r->databases != NULL) { sqlite3_free(r->databases); } } static bool vfsFilenameEndsWith(const char *filename, const char *suffix) { size_t n_filename = strlen(filename); size_t n_suffix = strlen(suffix); if (n_suffix > n_filename) { return false; } return strncmp(filename + n_filename - n_suffix, suffix, n_suffix) == 0; } /* Find the database object associated with the given filename. */ static struct vfsDatabase *vfsDatabaseLookup(struct vfs *v, const char *filename) { size_t n = strlen(filename); unsigned i; assert(v != NULL); assert(filename != NULL); if (vfsFilenameEndsWith(filename, "-wal")) { n -= strlen("-wal"); } if (vfsFilenameEndsWith(filename, "-journal")) { n -= strlen("-journal"); } for (i = 0; i < v->n_databases; i++) { struct vfsDatabase *database = v->databases[i]; if (strlen(database->name) == n && strncmp(database->name, filename, n) == 0) { // Found matching file. return database; } } return NULL; } static int vfsDeleteDatabase(struct vfs *r, const char *name) { unsigned i; for (i = 0; i < r->n_databases; i++) { struct vfsDatabase *database = r->databases[i]; unsigned j; if (strcmp(database->name, name) != 0) { continue; } /* Free all memory allocated for this file. */ vfsDatabaseDestroy(database); /* Shift all other contents objects. */ for (j = i + 1; j < r->n_databases; j++) { r->databases[j - 1] = r->databases[j]; } r->n_databases--; return SQLITE_OK; } r->error = ENOENT; return SQLITE_IOERR_DELETE_NOENT; } static int vfsFileClose(sqlite3_file *file) { int rc = SQLITE_OK; struct vfsFile *f = (struct vfsFile *)file; struct vfs *v = (struct vfs *)(f->vfs); if (f->temp != NULL) { /* Close the actual temporary file. */ rc = f->temp->pMethods->xClose(f->temp); sqlite3_free(f->temp); return rc; } if (f->flags & SQLITE_OPEN_DELETEONCLOSE) { rc = vfsDeleteDatabase(v, f->database->name); } return rc; } /* Read data from the main database. */ static int vfsDatabaseRead(struct vfsDatabase *d, void *buf, int amount, sqlite_int64 offset) { unsigned page_size; unsigned pgno; void *page; if (d->n_pages == 0) { return SQLITE_IOERR_SHORT_READ; } /* If the main database file is not empty, we expect the * page size to have been set by an initial write. */ page_size = vfsDatabaseGetPageSize(d); assert(page_size > 0); if (offset < (int)page_size) { /* Reading from page 1. We expect the read to be * at most page_size bytes. */ assert(amount <= (int)page_size); pgno = 1; } else { /* For pages greater than 1, we expect an offset * that starts exactly at a page boundary. The read * size can be less than a full page when SQLite * is compiled with SQLITE_DIRECT_OVERFLOW_READ * (enabled by default since 3.45.1). */ assert(amount <= (int)page_size); assert((offset % (int)page_size) == 0); pgno = (unsigned)(offset / (int)page_size) + 1; } assert(pgno > 0); page = vfsDatabasePageLookup(d, pgno); if (pgno == 1) { /* Read the desired part of page 1. */ memcpy(buf, (char *)page + offset, (size_t)amount); } else { /* Read the page. */ memcpy(buf, page, (size_t)amount); } return SQLITE_OK; } /* Get the page size stored in the WAL header. */ static uint32_t vfsWalGetPageSize(struct vfsWal *w) { /* The page size is stored in the 4 bytes starting at 8 * (big-endian) */ return vfsParsePageSize(ByteGetBe32(&w->hdr[8])); } /* Read data from the WAL. 
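 *
 * SQLite only ever reads a few well-known shapes from a WAL file: the
 * 32-byte WAL header, a 24-byte frame header, the 8-byte checksum pair,
 * a single page, or a whole frame. The offset arithmetic below relies
 * on that.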
*/ static int vfsWalRead(struct vfsWal *w, void *buf, int amount, sqlite_int64 offset) { uint32_t page_size; unsigned index; struct vfsFrame *frame; if (offset == 0) { /* Read the header. */ assert(amount == VFS__WAL_HEADER_SIZE); memcpy(buf, w->hdr, VFS__WAL_HEADER_SIZE); return SQLITE_OK; } page_size = vfsWalGetPageSize(w); assert(page_size > 0); /* For any other frame, we expect either a header read, * a checksum read, a page read or a full frame read. */ if (amount == FORMAT__WAL_FRAME_HDR_SIZE) { assert(((offset - VFS__WAL_HEADER_SIZE) % ((int)page_size + FORMAT__WAL_FRAME_HDR_SIZE)) == 0); index = (unsigned)formatWalCalcFrameIndex((int)page_size, offset); } else if (amount == sizeof(uint32_t) * 2) { if (offset == FORMAT__WAL_FRAME_HDR_SIZE) { /* Read the checksum from the WAL * header. */ memcpy(buf, w->hdr + offset, (size_t)amount); return SQLITE_OK; } assert(((offset - 16 - VFS__WAL_HEADER_SIZE) % ((int)page_size + FORMAT__WAL_FRAME_HDR_SIZE)) == 0); index = (unsigned)((offset - 16 - VFS__WAL_HEADER_SIZE) / ((int)page_size + FORMAT__WAL_FRAME_HDR_SIZE)) + 1; } else if (amount == (int)page_size) { assert(((offset - VFS__WAL_HEADER_SIZE - FORMAT__WAL_FRAME_HDR_SIZE) % ((int)page_size + FORMAT__WAL_FRAME_HDR_SIZE)) == 0); index = (unsigned)formatWalCalcFrameIndex((int)page_size, offset); } else { assert(amount == (FORMAT__WAL_FRAME_HDR_SIZE + (int)page_size)); index = (unsigned)formatWalCalcFrameIndex((int)page_size, offset); } if (index == 0) { // This is an attempt to read a page that was // never written. memset(buf, 0, (size_t)amount); return SQLITE_IOERR_SHORT_READ; } frame = vfsWalFrameLookup(w, index); if (frame == NULL) { // Again, the requested page doesn't exist. memset(buf, 0, (size_t)amount); return SQLITE_IOERR_SHORT_READ; } if (amount == FORMAT__WAL_FRAME_HDR_SIZE) { memcpy(buf, frame->header, (size_t)amount); } else if (amount == sizeof(uint32_t) * 2) { memcpy(buf, frame->header + 16, (size_t)amount); } else if (amount == (int)page_size) { memcpy(buf, frame->page, (size_t)amount); } else { memcpy(buf, frame->header, FORMAT__WAL_FRAME_HDR_SIZE); memcpy(buf + FORMAT__WAL_FRAME_HDR_SIZE, frame->page, page_size); } return SQLITE_OK; } static int vfsFileRead(sqlite3_file *file, void *buf, int amount, sqlite_int64 offset) { struct vfsFile *f = (struct vfsFile *)file; int rv; assert(buf != NULL); assert(amount > 0); assert(offset >= 0); assert(f != NULL); if (f->temp != NULL) { /* Read from the actual temporary file. */ return f->temp->pMethods->xRead(f->temp, buf, amount, offset); } switch (f->type) { case VFS__DATABASE: rv = vfsDatabaseRead(f->database, buf, amount, offset); break; case VFS__WAL: rv = vfsWalRead(&f->database->wal, buf, amount, offset); break; default: rv = SQLITE_IOERR_READ; break; } /* From SQLite docs: * * If xRead() returns SQLITE_IOERR_SHORT_READ it must also fill * in the unread portions of the buffer with zeros. A VFS that * fails to zero-fill short reads might seem to work. However, * failure to zero-fill short reads will eventually lead to * database corruption. */ if (rv == SQLITE_IOERR_SHORT_READ) { memset(buf, 0, (size_t)amount); } return rv; } static int vfsDatabaseWrite(struct vfsDatabase *d, const void *buf, int amount, sqlite_int64 offset) { unsigned pgno; uint32_t page_size; void *page; int rc; if (offset == 0) { const uint8_t *header = buf; /* This is the first database page. We expect * the data to contain at least the header. */ assert(amount >= FORMAT__DB_HDR_SIZE); /* Extract the page size from the header. 
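 * It is stored as a 16-bit big-endian value at byte offset 16 of the
 * first page, with the value 1 meaning 65536.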
*/ page_size = vfsParsePageSize(ByteGetBe16(&header[16])); if (page_size == 0) { return SQLITE_CORRUPT; } pgno = 1; } else { page_size = vfsDatabaseGetPageSize(d); /* The header must have been written and the page size set. */ assert(page_size > 0); /* For pages beyond the first we expect offset to be a multiple * of the page size. */ assert((offset % (int)page_size) == 0); /* We expect that SQLite writes a page at time. */ assert(amount == (int)page_size); pgno = ((unsigned)(offset / (int)page_size)) + 1; } rc = vfsDatabaseGetPage(d, page_size, pgno, &page); if (rc != SQLITE_OK) { return rc; } assert(page != NULL); memcpy(page, buf, (size_t)amount); return SQLITE_OK; } static int vfsWalWrite(struct vfsWal *w, const void *buf, int amount, sqlite_int64 offset) { uint32_t page_size; unsigned index; struct vfsFrame *frame; /* WAL header. */ if (offset == 0) { /* We expect the data to contain exactly 32 * bytes. */ assert(amount == VFS__WAL_HEADER_SIZE); memcpy(w->hdr, buf, (size_t)amount); return SQLITE_OK; } page_size = vfsWalGetPageSize(w); assert(page_size > 0); /* This is a WAL frame write. We expect either a frame * header or page write. */ if (amount == FORMAT__WAL_FRAME_HDR_SIZE) { /* Frame header write. */ assert(((offset - VFS__WAL_HEADER_SIZE) % ((int)page_size + FORMAT__WAL_FRAME_HDR_SIZE)) == 0); index = (unsigned)formatWalCalcFrameIndex((int)page_size, offset); vfsWalFrameGet(w, index, page_size, &frame); if (frame == NULL) { return SQLITE_NOMEM; } memcpy(frame->header, buf, (size_t)amount); } else { /* Frame page write. */ assert(amount == (int)page_size); assert(((offset - VFS__WAL_HEADER_SIZE - FORMAT__WAL_FRAME_HDR_SIZE) % ((int)page_size + FORMAT__WAL_FRAME_HDR_SIZE)) == 0); index = (unsigned)formatWalCalcFrameIndex((int)page_size, offset); /* The header for the this frame must already * have been written, so the page is there. */ frame = vfsWalFrameLookup(w, index); assert(frame != NULL); memcpy(frame->page, buf, (size_t)amount); } return SQLITE_OK; } static int vfsFileWrite(sqlite3_file *file, const void *buf, int amount, sqlite_int64 offset) { struct vfsFile *f = (struct vfsFile *)file; int rv; assert(buf != NULL); assert(amount > 0); assert(f != NULL); if (f->temp != NULL) { /* Write to the actual temporary file. */ return f->temp->pMethods->xWrite(f->temp, buf, amount, offset); } switch (f->type) { case VFS__DATABASE: rv = vfsDatabaseWrite(f->database, buf, amount, offset); break; case VFS__WAL: rv = vfsWalWrite(&f->database->wal, buf, amount, offset); break; case VFS__JOURNAL: /* Silently swallow writes to the journal */ rv = SQLITE_OK; break; default: rv = SQLITE_IOERR_WRITE; break; } return rv; } static int vfsFileTruncate(sqlite3_file *file, sqlite_int64 size) { struct vfsFile *f = (struct vfsFile *)file; int rv; assert(f != NULL); switch (f->type) { case VFS__DATABASE: rv = vfsDatabaseTruncate(f->database, size); break; case VFS__WAL: rv = vfsWalTruncate(&f->database->wal, size); break; default: rv = SQLITE_IOERR_TRUNCATE; break; } return rv; } static int vfsFileSync(sqlite3_file *file, int flags) { (void)file; (void)flags; return SQLITE_IOERR_FSYNC; } /* Return the size of the database file in bytes. */ static size_t vfsDatabaseFileSize(struct vfsDatabase *d) { uint64_t size = 0; if (d->n_pages > 0) { size = (uint64_t)d->n_pages * (uint64_t)vfsDatabaseGetPageSize(d); } /* TODO dqlite is limited to a max database size of SIZE_MAX */ assert(size <= SIZE_MAX); return (size_t)size; } /* Return the size of the WAL file in bytes. 
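 *
 * That is the 32-byte WAL header plus, for each frame, a 24-byte frame
 * header followed by one page of content.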
*/ static size_t vfsWalFileSize(struct vfsWal *w) { uint64_t size = 0; if (w->n_frames > 0) { uint32_t page_size; page_size = vfsWalGetPageSize(w); size += VFS__WAL_HEADER_SIZE; size += (uint64_t)w->n_frames * (uint64_t)(FORMAT__WAL_FRAME_HDR_SIZE + page_size); } /* TODO dqlite is limited to a max database size of SIZE_MAX */ assert(size <= SIZE_MAX); return (size_t)size; } static int vfsFileSize(sqlite3_file *file, sqlite_int64 *size) { struct vfsFile *f = (struct vfsFile *)file; size_t n; switch (f->type) { case VFS__DATABASE: n = vfsDatabaseFileSize(f->database); break; case VFS__WAL: /* TODO? here we assume that FileSize() is never invoked * between a header write and a page write. */ n = vfsWalFileSize(&f->database->wal); break; default: n = 0; break; } *size = (sqlite3_int64)n; return SQLITE_OK; } /* Locking a file is a no-op, since no other process has visibility on it. */ static int vfsFileLock(sqlite3_file *file, int lock) { (void)file; (void)lock; return SQLITE_OK; } /* Unlocking a file is a no-op, since no other process has visibility on it. */ static int vfsFileUnlock(sqlite3_file *file, int lock) { (void)file; (void)lock; return SQLITE_OK; } /* We always report that a lock is held. This routine should be used only in * journal mode, so it doesn't matter. */ static int vfsFileCheckReservedLock(sqlite3_file *file, int *result) { (void)file; *result = 1; return SQLITE_OK; } /* Handle pragma a pragma file control. See the xFileControl * docstring in sqlite.h.in for more details. */ static int vfsFileControlPragma(struct vfsFile *f, char **fnctl) { const char *left; const char *right; assert(f != NULL); assert(fnctl != NULL); left = fnctl[1]; right = fnctl[2]; assert(left != NULL); if (strcmp(left, "page_size") == 0 && right) { /* When the user executes 'PRAGMA page_size=N' we save the * size internally. * * The page size must be between 512 and 65536, and be a * power of two. The check below was copied from * sqlite3BtreeSetPageSize in btree.c. * * Invalid sizes are simply ignored, SQLite will do the same. * * It's not possible to change the size after it's set. */ int page_size = atoi(right); if (page_size >= FORMAT__PAGE_SIZE_MIN && page_size <= FORMAT__PAGE_SIZE_MAX && ((page_size - 1) & page_size) == 0) { if (f->database->n_pages > 0 && page_size != (int)vfsDatabaseGetPageSize(f->database)) { fnctl[0] = sqlite3_mprintf( "changing page size is not supported"); return SQLITE_IOERR; } } } else if (strcmp(left, "journal_mode") == 0 && right) { /* When the user executes 'PRAGMA journal_mode=x' we ensure * that the desired mode is 'wal'. */ if (strcasecmp(right, "wal") != 0) { fnctl[0] = sqlite3_mprintf("only WAL mode is supported"); return SQLITE_IOERR; } } /* We're returning NOTFOUND here to tell SQLite that we wish it to go on * with its own handling as well. If we returned SQLITE_OK the page size * of the journal mode wouldn't be effectively set, as the processing of * the PRAGMA would stop here. */ return SQLITE_NOTFOUND; } /* Return the page number field stored in the header of the given frame. */ static uint32_t vfsFrameGetPageNumber(struct vfsFrame *f) { return ByteGetBe32(&f->header[0]); } /* Return the database size field stored in the header of the given frame. */ static uint32_t vfsFrameGetDatabaseSize(struct vfsFrame *f) { return ByteGetBe32(&f->header[4]); } /* Return the checksum-1 field stored in the header of the given frame. 
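 * (Bytes 16-19 of the frame header, big-endian.)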
 */
static uint32_t vfsFrameGetChecksum1(struct vfsFrame *f)
{
	return ByteGetBe32(&f->header[16]);
}

/* Return the checksum-2 field stored in the header of the given frame. */
static uint32_t vfsFrameGetChecksum2(struct vfsFrame *f)
{
	return ByteGetBe32(&f->header[20]);
}

/* Fill the header and the content of a WAL frame. The given checksum is the
 * rolling one of all preceding frames and is updated by this function. */
static void vfsFrameFill(struct vfsFrame *f,
			 uint32_t page_number,
			 uint32_t database_size,
			 uint32_t salt[2],
			 uint32_t checksum[2],
			 uint8_t *page,
			 uint32_t page_size)
{
	BytePutBe32(page_number, &f->header[0]);
	BytePutBe32(database_size, &f->header[4]);

	vfsChecksum(f->header, 8, checksum, checksum);
	vfsChecksum(page, page_size, checksum, checksum);

	memcpy(&f->header[8], &salt[0], sizeof salt[0]);
	memcpy(&f->header[12], &salt[1], sizeof salt[1]);

	BytePutBe32(checksum[0], &f->header[16]);
	BytePutBe32(checksum[1], &f->header[20]);

	memcpy(f->page, page, page_size);
}

/* This function modifies part of the WAL index header to reflect the current
 * content of the WAL.
 *
 * It is called in two cases. First, after a write transaction gets completed
 * and the SQLITE_FCNTL_COMMIT_PHASETWO file control op code is triggered, in
 * order to "rewind" the mxFrame and nPage fields of the WAL index header back
 * to when the write transaction started, effectively "shadowing" the
 * transaction, which will be replicated asynchronously. Second, when the
 * replication actually succeeds and dqlite_vfs_apply() is called on the VFS
 * that originated the transaction, in order to make the transaction visible.
 *
 * Note that the hash table contained in the WAL index does not get modified,
 * and even after a rewind following a write transaction it will still contain
 * entries for the frames committed by the transaction. That's safe because
 * mxFrame will make clients ignore those hash table entries. However it means
 * that in case the replication is not actually successful and
 * dqlite_vfs_abort() is called the WAL index must be invalidated.
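 *
 * For reference, the WAL index header fields touched below live at these
 * byte offsets in the first shared memory region: mxFrame at 16, nPage
 * at 20, the checksum of the last frame at 24 and 28, and the checksum
 * of the header itself at 40 and 44.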
**/ static void vfsAmendWalIndexHeader(struct vfsDatabase *d) { struct vfsShm *shm = &d->shm; struct vfsWal *wal = &d->wal; uint8_t *index; uint32_t frame_checksum[2] = {0, 0}; uint32_t n_pages = (uint32_t)d->n_pages; uint32_t checksum[2] = {0, 0}; if (wal->n_frames > 0) { struct vfsFrame *last = wal->frames[wal->n_frames - 1]; frame_checksum[0] = vfsFrameGetChecksum1(last); frame_checksum[1] = vfsFrameGetChecksum2(last); n_pages = vfsFrameGetDatabaseSize(last); } assert(shm->n_regions > 0); index = shm->regions[0]; /* index is an alias for shm->regions[0] which is a void* that points to * memory allocated by `sqlite3_malloc64` and has the required alignment */ assert(*(uint32_t *)(__builtin_assume_aligned(&index[0], sizeof(uint32_t))) == VFS__WAL_VERSION); /* iVersion */ assert(index[12] == 1); /* isInit */ assert(index[13] == VFS__BIGENDIAN); /* bigEndCksum */ *(uint32_t *)(__builtin_assume_aligned(&index[16], sizeof(uint32_t))) = wal->n_frames; *(uint32_t *)(__builtin_assume_aligned(&index[20], sizeof(uint32_t))) = n_pages; *(uint32_t *)(__builtin_assume_aligned(&index[24], sizeof(uint32_t))) = frame_checksum[0]; *(uint32_t *)(__builtin_assume_aligned(&index[28], sizeof(uint32_t))) = frame_checksum[1]; vfsChecksum(index, 40, checksum, checksum); *(uint32_t *)__builtin_assume_aligned(&index[40], sizeof(uint32_t)) = checksum[0]; *(uint32_t *)__builtin_assume_aligned(&index[44], sizeof(uint32_t)) = checksum[1]; /* Update the second copy of the first part of the WAL index header. */ memcpy(index + VFS__WAL_INDEX_HEADER_SIZE, index, VFS__WAL_INDEX_HEADER_SIZE); } /* The SQLITE_FCNTL_COMMIT_PHASETWO file control op code is trigged by the * SQLite pager after completing a transaction. */ static int vfsFileControlCommitPhaseTwo(struct vfsFile *f) { struct vfsDatabase *database = f->database; struct vfsWal *wal = &database->wal; if (wal->n_tx > 0) { vfsAmendWalIndexHeader(database); } return 0; } static int vfsFileControl(sqlite3_file *file, int op, void *arg) { struct vfsFile *f = (struct vfsFile *)file; int rv; assert(f->type == VFS__DATABASE); switch (op) { case SQLITE_FCNTL_PRAGMA: rv = vfsFileControlPragma(f, arg); break; case SQLITE_FCNTL_COMMIT_PHASETWO: rv = vfsFileControlCommitPhaseTwo(f); break; case SQLITE_FCNTL_PERSIST_WAL: /* This prevents SQLite from deleting the WAL after the * last connection is closed. */ *(int *)(arg) = 1; rv = SQLITE_OK; break; default: rv = SQLITE_OK; break; } return rv; } static int vfsFileSectorSize(sqlite3_file *file) { (void)file; return 0; } static int vfsFileDeviceCharacteristics(sqlite3_file *file) { (void)file; return 0; } static int vfsShmMap(struct vfsShm *s, unsigned region_index, unsigned region_size, bool extend, void volatile **out) { void *region; int rv; if (s->regions != NULL && region_index < s->n_regions) { /* The region was already allocated. */ region = s->regions[region_index]; assert(region != NULL); } else { if (extend) { void **regions; /* We should grow the map one region at a time. */ assert(region_size == VFS__WAL_INDEX_REGION_SIZE); assert(region_index == s->n_regions); region = sqlite3_malloc64(region_size); if (region == NULL) { rv = SQLITE_NOMEM; goto err; } memset(region, 0, region_size); regions = sqlite3_realloc64( s->regions, sizeof *s->regions * (s->n_regions + 1)); if (regions == NULL) { rv = SQLITE_NOMEM; goto err_after_region_malloc; } s->regions = regions; s->regions[region_index] = region; s->n_regions++; } else { /* The region was not allocated and we don't have to * extend the map. 
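 * Hand back a NULL region, which SQLite interprets as "not mapped
 * yet".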
 */
			region = NULL;
		}
	}

	*out = region;

	if (region_index == 0 && region != NULL) {
		s->refcount++;
	}

	return SQLITE_OK;

err_after_region_malloc:
	sqlite3_free(region);
err:
	assert(rv != SQLITE_OK);
	*out = NULL;
	return rv;
}

/* Simulate shared memory by allocating on the C heap. */
static int vfsFileShmMap(sqlite3_file *file, /* Handle open on database file */
			 int region_index,   /* Region to retrieve */
			 int region_size,    /* Size of regions */
			 int extend, /* True to extend file if necessary */
			 void volatile **out /* OUT: Mapped memory */
)
{
	struct vfsFile *f = (struct vfsFile *)file;

	assert(f->type == VFS__DATABASE);

	return vfsShmMap(&f->database->shm, (unsigned)region_index,
			 (unsigned)region_size, extend != 0, out);
}

static int vfsShmLock(struct vfsShm *s, int ofst, int n, int flags)
{
	int i;

	if (flags & SQLITE_SHM_EXCLUSIVE) {
		/* No shared or exclusive lock must be held in the region. */
		for (i = ofst; i < ofst + n; i++) {
			if (s->shared[i] > 0 || s->exclusive[i] > 0) {
				tracef(
				    "EXCLUSIVE lock contention ofst:%d n:%d "
				    "exclusive[%d]=%d shared[%d]=%d",
				    ofst, n, i, s->exclusive[i], i,
				    s->shared[i]);
				return SQLITE_BUSY;
			}
		}

		for (i = ofst; i < ofst + n; i++) {
			assert(s->exclusive[i] == 0);
			s->exclusive[i] = 1;
		}
	} else {
		/* No exclusive lock must be held in the region. */
		for (i = ofst; i < ofst + n; i++) {
			if (s->exclusive[i] > 0) {
				tracef(
				    "SHARED lock contention ofst:%d n:%d "
				    "exclusive[%d]=%d shared[%d]=%d",
				    ofst, n, i, s->exclusive[i], i,
				    s->shared[i]);
				return SQLITE_BUSY;
			}
		}

		for (i = ofst; i < ofst + n; i++) {
			s->shared[i]++;
		}
	}

	return SQLITE_OK;
}

static int vfsShmUnlock(struct vfsShm *s, int ofst, int n, int flags)
{
	unsigned *these_locks;
	unsigned *other_locks;
	int i;

	if (flags & SQLITE_SHM_SHARED) {
		these_locks = s->shared;
		other_locks = s->exclusive;
	} else {
		these_locks = s->exclusive;
		other_locks = s->shared;
	}

	for (i = ofst; i < ofst + n; i++) {
		/* Coherence check that no lock of the other type is held in
		 * this region. */
		assert(other_locks[i] == 0);

		/* Only decrease the lock count if it's positive. In other
		 * words releasing a never-acquired lock is legal and
		 * idempotent. */
		if (these_locks[i] > 0) {
			these_locks[i]--;
		}
	}

	return SQLITE_OK;
}

/* If there's an uncommitted transaction, roll it back. */
static void vfsWalRollbackIfUncommitted(struct vfsWal *w)
{
	struct vfsFrame *last;
	uint32_t commit;
	unsigned i;

	if (w->n_tx == 0) {
		return;
	}

	tracef("rollback n_tx:%d", w->n_tx);
	last = w->tx[w->n_tx - 1];
	commit = vfsFrameGetDatabaseSize(last);
	if (commit > 0) {
		tracef("rollback commit:%u", commit);
		return;
	}

	for (i = 0; i < w->n_tx; i++) {
		vfsFrameDestroy(w->tx[i]);
	}

	w->n_tx = 0;
}

static int vfsFileShmLock(sqlite3_file *file, int ofst, int n, int flags)
{
	struct vfsFile *f;
	struct vfsShm *shm;
	struct vfsWal *wal;
	int rv;

	assert(file != NULL);
	assert(ofst >= 0);
	assert(n >= 0);

	/* Legal values for the offset and the range */
	assert(ofst >= 0 && ofst + n <= SQLITE_SHM_NLOCK);
	assert(n >= 1);
	assert(n == 1 || (flags & SQLITE_SHM_EXCLUSIVE) != 0);

	/* Legal values for the flags.
	 *
	 * See https://sqlite.org/c3ref/c_shm_exclusive.html. */
	assert(flags == (SQLITE_SHM_LOCK | SQLITE_SHM_SHARED) ||
	       flags == (SQLITE_SHM_LOCK | SQLITE_SHM_EXCLUSIVE) ||
	       flags == (SQLITE_SHM_UNLOCK | SQLITE_SHM_SHARED) ||
	       flags == (SQLITE_SHM_UNLOCK | SQLITE_SHM_EXCLUSIVE));

	/* This is a no-op since shared-memory locking is relevant only for
	 * inter-process concurrency. See also the unix-excl branch from
	 * upstream (git commit cda6b3249167a54a0cf892f949d52760ee557129).
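 *
 * Note that the lock counters in struct vfsShm are still updated, so
 * connections within this process do see each other's shared and
 * exclusive locks.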
*/ f = (struct vfsFile *)file; assert(f->type == VFS__DATABASE); assert(f->database != NULL); shm = &f->database->shm; if (flags & SQLITE_SHM_UNLOCK) { rv = vfsShmUnlock(shm, ofst, n, flags); } else { rv = vfsShmLock(shm, ofst, n, flags); } wal = &f->database->wal; if (rv == SQLITE_OK && ofst == VFS__WAL_WRITE_LOCK) { assert(n == 1); /* When acquiring the write lock, make sure there's no * transaction that hasn't been rolled back or polled. */ if (flags == (SQLITE_SHM_LOCK | SQLITE_SHM_EXCLUSIVE)) { assert(wal->n_tx == 0); } /* When releasing the write lock, if we find a pending * uncommitted transaction then a rollback must have occurred. * In that case we delete the pending transaction. */ tracef("ROLLBACK TIME"); if (flags == (SQLITE_SHM_UNLOCK | SQLITE_SHM_EXCLUSIVE)) { vfsWalRollbackIfUncommitted(wal); } } return rv; } static void vfsFileShmBarrier(sqlite3_file *file) { (void)file; /* This is a no-op since we expect SQLite to be compiled with mutex * support (i.e. SQLITE_MUTEX_OMIT or SQLITE_MUTEX_NOOP are *not* * defined, see sqliteInt.h). */ } static void vfsShmUnmap(struct vfsShm *s) { s->refcount--; if (s->refcount == 0) { vfsShmReset(s); } } static int vfsFileShmUnmap(sqlite3_file *file, int delete_flag) { struct vfsFile *f = (struct vfsFile *)file; (void)delete_flag; vfsShmUnmap(&f->database->shm); return SQLITE_OK; } static const sqlite3_io_methods vfsFileMethods = { 2, // iVersion vfsFileClose, // xClose vfsFileRead, // xRead vfsFileWrite, // xWrite vfsFileTruncate, // xTruncate vfsFileSync, // xSync vfsFileSize, // xFileSize vfsFileLock, // xLock vfsFileUnlock, // xUnlock vfsFileCheckReservedLock, // xCheckReservedLock vfsFileControl, // xFileControl vfsFileSectorSize, // xSectorSize vfsFileDeviceCharacteristics, // xDeviceCharacteristics vfsFileShmMap, // xShmMap vfsFileShmLock, // xShmLock vfsFileShmBarrier, // xShmBarrier vfsFileShmUnmap, // xShmUnmap 0, 0, }; /* Create a database object and add it to the databases array. */ static struct vfsDatabase *vfsCreateDatabase(struct vfs *v, const char *name) { unsigned n = v->n_databases + 1; struct vfsDatabase **databases; struct vfsDatabase *d; assert(name != NULL); /* Create a new entry. */ databases = sqlite3_realloc64(v->databases, sizeof *databases * n); if (databases == NULL) { goto oom; } v->databases = databases; d = sqlite3_malloc(sizeof *d); if (d == NULL) { goto oom; } d->name = sqlite3_malloc64(strlen(name) + 1); if (d->name == NULL) { goto oom_after_database_malloc; } strcpy(d->name, name); vfsDatabaseInit(d); v->databases[n - 1] = d; v->n_databases = n; return d; oom_after_database_malloc: sqlite3_free(d); oom: return NULL; } static int vfsOpen(sqlite3_vfs *vfs, const char *filename, sqlite3_file *file, int flags, int *out_flags) { struct vfs *v; struct vfsFile *f; struct vfsDatabase *database; enum vfsFileType type; bool exists; int exclusive = flags & SQLITE_OPEN_EXCLUSIVE; int create = flags & SQLITE_OPEN_CREATE; int rc; (void)out_flags; assert(vfs != NULL); assert(vfs->pAppData != NULL); assert(file != NULL); /* From sqlite3.h.in: * * The SQLITE_OPEN_EXCLUSIVE flag is always used in conjunction with * the SQLITE_OPEN_CREATE flag, which are both directly analogous to * the O_EXCL and O_CREAT flags of the POSIX open() API. The * SQLITE_OPEN_EXCLUSIVE flag, when paired with the * SQLITE_OPEN_CREATE, is used to indicate that file should always be * created, and that it is an error if it already exists. It is not * used to indicate the file should be opened for exclusive access. 
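*
* Concretely for this VFS (our note, not part of the quote above):
* opening "test.db" with SQLITE_OPEN_CREATE | SQLITE_OPEN_EXCLUSIVE
* fails below with SQLITE_CANTOPEN if vfsDatabaseLookup() already
* finds a database object registered under that name.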
*/ assert(!exclusive || create); v = (struct vfs *)(vfs->pAppData); f = (struct vfsFile *)file; /* This tells SQLite to not call Close() in case we return an error. */ f->base.pMethods = 0; f->temp = NULL; /* Save the flags */ f->flags = flags; /* From SQLite documentation: * * If the zFilename parameter to xOpen is a NULL pointer then xOpen * must invent its own temporary name for the file. Whenever the * xFilename parameter is NULL it will also be the case that the * flags parameter will include SQLITE_OPEN_DELETEONCLOSE. */ if (filename == NULL) { assert(flags & SQLITE_OPEN_DELETEONCLOSE); /* Open an actual temporary file. */ vfs = sqlite3_vfs_find("unix"); assert(vfs != NULL); f->temp = sqlite3_malloc(vfs->szOsFile); if (f->temp == NULL) { v->error = ENOENT; return SQLITE_CANTOPEN; } rc = vfs->xOpen(vfs, NULL, f->temp, flags, out_flags); if (rc != SQLITE_OK) { sqlite3_free(f->temp); return rc; } f->base.pMethods = &vfsFileMethods; f->vfs = NULL; f->database = NULL; return SQLITE_OK; } /* Search if the database object exists already. */ database = vfsDatabaseLookup(v, filename); exists = database != NULL; if (flags & SQLITE_OPEN_MAIN_DB) { type = VFS__DATABASE; } else if (flags & SQLITE_OPEN_MAIN_JOURNAL) { type = VFS__JOURNAL; } else if (flags & SQLITE_OPEN_WAL) { type = VFS__WAL; } else { v->error = ENOENT; return SQLITE_CANTOPEN; } /* If file exists, and the exclusive flag is on, return an error. */ if (exists && exclusive && create && type == VFS__DATABASE) { v->error = EEXIST; rc = SQLITE_CANTOPEN; goto err; } if (!exists) { /* When opening a WAL or journal file we expect the main * database file to have already been created. */ if (type == VFS__WAL || type == VFS__JOURNAL) { v->error = ENOENT; rc = SQLITE_CANTOPEN; goto err; } assert(type == VFS__DATABASE); /* Check the create flag. */ if (!create) { v->error = ENOENT; rc = SQLITE_CANTOPEN; goto err; } database = vfsCreateDatabase(v, filename); if (database == NULL) { v->error = ENOMEM; rc = SQLITE_CANTOPEN; goto err; } } /* Populate the new file handle. */ f->base.pMethods = &vfsFileMethods; f->vfs = v; f->type = type; f->database = database; return SQLITE_OK; err: assert(rc != SQLITE_OK); return rc; } static int vfsDelete(sqlite3_vfs *vfs, const char *filename, int dir_sync) { struct vfs *v; (void)dir_sync; assert(vfs != NULL); assert(vfs->pAppData != NULL); if (vfsFilenameEndsWith(filename, "-journal")) { return SQLITE_OK; } if (vfsFilenameEndsWith(filename, "-wal")) { return SQLITE_OK; } v = (struct vfs *)(vfs->pAppData); return vfsDeleteDatabase(v, filename); } static int vfsAccess(sqlite3_vfs *vfs, const char *filename, int flags, int *result) { struct vfs *v; struct vfsDatabase *database; (void)flags; assert(vfs != NULL); assert(vfs->pAppData != NULL); v = (struct vfs *)(vfs->pAppData); /* If the database object exists, we consider all associated files as * existing and accessible. */ database = vfsDatabaseLookup(v, filename); if (database == NULL) { *result = 0; } else { *result = 1; } return SQLITE_OK; } static int vfsFullPathname(sqlite3_vfs *vfs, const char *filename, int pathname_len, char *pathname) { (void)vfs; /* Just return the path unchanged. 
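* For example, "test.db" is returned as-is: database objects are
* registered and looked up under exactly the name SQLite passes in.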
*/ sqlite3_snprintf(pathname_len, pathname, "%s", filename); return SQLITE_OK; } static void *vfsDlOpen(sqlite3_vfs *vfs, const char *filename) { (void)vfs; (void)filename; return 0; } static void vfsDlError(sqlite3_vfs *vfs, int nByte, char *zErrMsg) { (void)vfs; sqlite3_snprintf(nByte, zErrMsg, "Loadable extensions are not supported"); zErrMsg[nByte - 1] = '\0'; } static void (*vfsDlSym(sqlite3_vfs *vfs, void *pH, const char *z))(void) { (void)vfs; (void)pH; (void)z; return 0; } static void vfsDlClose(sqlite3_vfs *vfs, void *pHandle) { (void)vfs; (void)pHandle; return; } static int vfsRandomness(sqlite3_vfs *vfs, int nByte, char *zByte) { (void)vfs; (void)nByte; (void)zByte; /* TODO (is this needed?) */ return SQLITE_OK; } static int vfsSleep(sqlite3_vfs *vfs, int microseconds) { (void)vfs; /* TODO (is this needed?) */ return microseconds; } static int vfsCurrentTimeInt64(sqlite3_vfs *vfs, sqlite3_int64 *piNow) { static const sqlite3_int64 unixEpoch = 24405875 * (sqlite3_int64)8640000; struct timeval now; (void)vfs; gettimeofday(&now, 0); *piNow = unixEpoch + 1000 * (sqlite3_int64)now.tv_sec + now.tv_usec / 1000; return SQLITE_OK; } static int vfsCurrentTime(sqlite3_vfs *vfs, double *piNow) { // TODO: check if it's always safe to cast a double* to a // sqlite3_int64*. return vfsCurrentTimeInt64(vfs, (sqlite3_int64 *)piNow); } static int vfsGetLastError(sqlite3_vfs *vfs, int x, char *y) { struct vfs *v = (struct vfs *)(vfs->pAppData); int rc; (void)vfs; (void)x; (void)y; rc = v->error; return rc; } int VfsInit(struct sqlite3_vfs *vfs, const char *name) { tracef("vfs init"); vfs->iVersion = 2; vfs->szOsFile = sizeof(struct vfsFile); vfs->mxPathname = VFS__MAX_PATHNAME; vfs->pNext = NULL; vfs->pAppData = vfsCreate(); if (vfs->pAppData == NULL) { return DQLITE_NOMEM; } vfs->xOpen = vfsOpen; vfs->xDelete = vfsDelete; vfs->xAccess = vfsAccess; vfs->xFullPathname = vfsFullPathname; vfs->xDlOpen = vfsDlOpen; vfs->xDlError = vfsDlError; vfs->xDlSym = vfsDlSym; vfs->xDlClose = vfsDlClose; vfs->xRandomness = vfsRandomness; vfs->xSleep = vfsSleep; vfs->xCurrentTime = vfsCurrentTime; vfs->xGetLastError = vfsGetLastError; vfs->xCurrentTimeInt64 = vfsCurrentTimeInt64; vfs->zName = name; return 0; } void VfsClose(struct sqlite3_vfs *vfs) { tracef("vfs close"); struct vfs *v = vfs->pAppData; vfsDestroy(v); sqlite3_free(v); } static int vfsWalPoll(struct vfsWal *w, dqlite_vfs_frame **frames, unsigned *n) { struct vfsFrame *last; uint32_t commit; unsigned i; if (w->n_tx == 0) { *frames = NULL; *n = 0; return 0; } /* Check if the last frame in the transaction has the commit marker. */ last = w->tx[w->n_tx - 1]; commit = vfsFrameGetDatabaseSize(last); if (commit == 0) { *frames = NULL; *n = 0; return 0; } *frames = sqlite3_malloc64(sizeof **frames * w->n_tx); if (*frames == NULL) { return DQLITE_NOMEM; } *n = w->n_tx; for (i = 0; i < w->n_tx; i++) { dqlite_vfs_frame *frame = &(*frames)[i]; uint32_t page_number = vfsFrameGetPageNumber(w->tx[i]); frame->data = w->tx[i]->page; frame->page_number = page_number; /* Release the vfsFrame object, but not its buf attribute, since * responsibility for that memory has been transferred to the * caller. 
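*
* A sketch of the caller's side of this contract (assuming the
* frames came from VfsPoll() and use_page() is a placeholder):
*
*     for (unsigned k = 0; k < n; k++) {
*             use_page(frames[k].page_number, frames[k].data);
*             sqlite3_free(frames[k].data);
*     }
*     sqlite3_free(frames);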
*/ sqlite3_free(w->tx[i]); } w->n_tx = 0; return 0; } int VfsPoll(sqlite3_vfs *vfs, const char *filename, dqlite_vfs_frame **frames, unsigned *n) { tracef("vfs poll filename:%s", filename); struct vfs *v; struct vfsDatabase *database; struct vfsShm *shm; struct vfsWal *wal; int rv; v = (struct vfs *)(vfs->pAppData); database = vfsDatabaseLookup(v, filename); if (database == NULL) { tracef("not found"); return DQLITE_ERROR; } shm = &database->shm; wal = &database->wal; if (wal == NULL) { *frames = NULL; *n = 0; return 0; } rv = vfsWalPoll(wal, frames, n); if (rv != 0) { tracef("wal poll failed %d", rv); return rv; } /* If some frames have been written take the write lock. */ if (*n > 0) { rv = vfsShmLock(shm, 0, 1, SQLITE_SHM_EXCLUSIVE); if (rv != 0) { tracef("shm lock failed %d", rv); return rv; } vfsAmendWalIndexHeader(database); } return 0; } /* Return the salt-1 field stored in the WAL header.*/ static uint32_t vfsWalGetSalt1(struct vfsWal *w) { /* `hdr` field is pointer aligned, cast is safe */ return *(uint32_t *)__builtin_assume_aligned(&w->hdr[16], sizeof(uint32_t)); } /* Return the salt-2 field stored in the WAL header.*/ static uint32_t vfsWalGetSalt2(struct vfsWal *w) { /* `hdr` field is pointer aligned, cast is safe */ return *(uint32_t *)__builtin_assume_aligned(&w->hdr[20], sizeof(uint32_t)); } /* Return the checksum-1 field stored in the WAL header.*/ static uint32_t vfsWalGetChecksum1(struct vfsWal *w) { return ByteGetBe32(&w->hdr[24]); } /* Return the checksum-2 field stored in the WAL header.*/ static uint32_t vfsWalGetChecksum2(struct vfsWal *w) { return ByteGetBe32(&w->hdr[28]); } /* Append the given pages as new frames. */ static int vfsWalAppend(struct vfsWal *w, unsigned database_n_pages, unsigned n, unsigned long *page_numbers, uint8_t *pages) { struct vfsFrame **frames; /* New frames array. */ uint32_t page_size; uint32_t database_size; unsigned i; unsigned j; uint32_t salt[2]; uint32_t checksum[2]; /* No pending transactions. */ assert(w->n_tx == 0); page_size = vfsWalGetPageSize(w); assert(page_size > 0); /* Get the salt from the WAL header. */ salt[0] = vfsWalGetSalt1(w); salt[1] = vfsWalGetSalt2(w); /* If there are currently no frames in the WAL, the starting database * size will be equal to the current number of pages in the main * database, and the starting checksum should be set to the one stored * in the WAL header. Otherwise, the starting database size and checksum * will be the ones stored in the last frame of the WAL. */ if (w->n_frames == 0) { database_size = (uint32_t)database_n_pages; checksum[0] = vfsWalGetChecksum1(w); checksum[1] = vfsWalGetChecksum2(w); } else { struct vfsFrame *frame = w->frames[w->n_frames - 1]; checksum[0] = vfsFrameGetChecksum1(frame); checksum[1] = vfsFrameGetChecksum2(frame); database_size = vfsFrameGetDatabaseSize(frame); } frames = sqlite3_realloc64(w->frames, sizeof *frames * (w->n_frames + n)); if (frames == NULL) { goto oom; } w->frames = frames; for (i = 0; i < n; i++) { struct vfsFrame *frame = vfsFrameCreate(page_size); uint32_t page_number = (uint32_t)page_numbers[i]; uint32_t commit = 0; uint8_t *page = &pages[i * page_size]; if (frame == NULL) { goto oom_after_frames_alloc; } if (page_number > database_size) { database_size = page_number; } /* For commit records, the size of the database file in pages * after the commit. For all other records, zero. 
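*
* Worked example: appending pages 11, 12 and 13 to a 10-page
* database yields commit fields 0, 0 and 13 -- only the last frame
* of the transaction records the new database size.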
*/ if (i == n - 1) { commit = database_size; } vfsFrameFill(frame, page_number, commit, salt, checksum, page, page_size); frames[w->n_frames + i] = frame; } w->n_frames += n; return 0; oom_after_frames_alloc: for (j = 0; j < i; j++) { vfsFrameDestroy(frames[w->n_frames + j]); } oom: return DQLITE_NOMEM; } /* Write the header of a brand new WAL file image. */ static void vfsWalStartHeader(struct vfsWal *w, uint32_t page_size) { assert(page_size > 0); uint32_t checksum[2] = {0, 0}; /* SQLite calculates checksums for the WAL header and frames either * using little endian or big endian byte order when adding up 32-bit * words. The byte order that should be used is recorded in the WAL file * header by setting the least significant bit of the magic value stored * in the first 32 bits. This allows portability of the WAL file across * hosts with different native byte order. * * When creating a brand new WAL file, SQLite will set the byte order * bit to match the host's native byte order, so checksums are a bit * more efficient. * * In Dqlite the WAL file image is always generated at run time on the * host, so we can always use the native byte order. */ BytePutBe32(VFS__WAL_MAGIC | VFS__BIGENDIAN, &w->hdr[0]); BytePutBe32(VFS__WAL_VERSION, &w->hdr[4]); BytePutBe32(page_size, &w->hdr[8]); BytePutBe32(0, &w->hdr[12]); sqlite3_randomness(8, &w->hdr[16]); vfsChecksum(w->hdr, 24, checksum, checksum); BytePutBe32(checksum[0], w->hdr + 24); BytePutBe32(checksum[1], w->hdr + 28); } /* Invalidate the WAL index header, forcing the next connection that tries to * start a read transaction to rebuild the WAL index by reading the WAL. * * No read or write lock must be currently held. */ static void vfsInvalidateWalIndexHeader(struct vfsDatabase *d) { struct vfsShm *shm = &d->shm; uint8_t *header = shm->regions[0]; unsigned i; for (i = 0; i < SQLITE_SHM_NLOCK; i++) { assert(shm->shared[i] == 0); assert(shm->exclusive[i] == 0); } /* The walIndexTryHdr function in sqlite/wal.c (which is indirectly * called by sqlite3WalBeginReadTransaction), compares the first and * second copy of the WAL index header to see if it is valid. Changing * the first byte of each of the two copies is enough to make the check * fail. */ header[0] = 1; header[VFS__WAL_INDEX_HEADER_SIZE] = 0; } int VfsApply(sqlite3_vfs *vfs, const char *filename, unsigned n, unsigned long *page_numbers, void *frames) { tracef("vfs apply filename %s n %u", filename, n); struct vfs *v; struct vfsDatabase *database; struct vfsWal *wal; struct vfsShm *shm; int rv; v = (struct vfs *)(vfs->pAppData); database = vfsDatabaseLookup(v, filename); assert(database != NULL); wal = &database->wal; shm = &database->shm; /* If there's no page size set in the WAL header, it must mean that the * WAL file was never written. In that case we need to initialize the WAL * header. */ if (vfsWalGetPageSize(wal) == 0) { vfsWalStartHeader(wal, vfsDatabaseGetPageSize(database)); } rv = vfsWalAppend(wal, database->n_pages, n, page_numbers, frames); if (rv != 0) { tracef("wal append failed rv:%d n_pages:%u n:%u", rv, database->n_pages, n); return rv; } /* If a write lock is held it means that this is the VFS that originated * this commit and on which dqlite_vfs_poll() was called. In that case * we release the lock and update the WAL index. 
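*
* A sketch of that leader-side sequence, using the wrappers declared
* in vfs.h:
*
*     VfsPoll(vfs, "test.db", &frames, &n);   <- takes the write lock
*     ... replicate the frames through Raft ...
*     VfsApply(vfs, "test.db", n, page_numbers, pages);  <- releases it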
* * Otherwise, if the WAL index header is mapped it means that this VFS * has one or more open connections even if it's not the one that * originated the transaction (this can happen for example when applying * a Raft barrier and replaying the Raft log in order to serve a request * of a newly connected client). */ if (shm->exclusive[0] == 1) { shm->exclusive[0] = 0; vfsAmendWalIndexHeader(database); } else { if (shm->n_regions > 0) { vfsInvalidateWalIndexHeader(database); } } return 0; } int VfsAbort(sqlite3_vfs *vfs, const char *filename) { tracef("vfs abort filename %s", filename); struct vfs *v; struct vfsDatabase *database; int rv; v = (struct vfs *)(vfs->pAppData); database = vfsDatabaseLookup(v, filename); if (database == NULL) { tracef("database: %s does not exist", filename); return DQLITE_ERROR; } rv = vfsShmUnlock(&database->shm, 0, 1, SQLITE_SHM_EXCLUSIVE); if (rv != 0) { tracef("shm unlock failed %d", rv); return rv; } return 0; } /* Extract the number of pages field from the database header. */ static uint32_t vfsDatabaseGetNumberOfPages(struct vfsDatabase *d) { uint8_t *page; assert(d->n_pages > 0); page = d->pages[0]; /* The number of pages is stored in the 28th through 31st bytes of the * first database page (big-endian) */ return ByteGetBe32(&page[28]); } int VfsDatabaseNumPages(sqlite3_vfs *vfs, const char *filename, uint32_t *n) { struct vfs *v; struct vfsDatabase *d; v = (struct vfs *)(vfs->pAppData); d = vfsDatabaseLookup(v, filename); if (d == NULL) { return -1; } *n = vfsDatabaseGetNumberOfPages(d); return 0; } static void vfsDatabaseSnapshot(struct vfsDatabase *d, uint8_t **cursor) { uint32_t page_size; unsigned i; page_size = vfsDatabaseGetPageSize(d); assert(page_size > 0); assert(d->n_pages == vfsDatabaseGetNumberOfPages(d)); for (i = 0; i < d->n_pages; i++) { memcpy(*cursor, d->pages[i], page_size); *cursor += page_size; } } static void vfsWalSnapshot(struct vfsWal *w, uint8_t **cursor) { uint32_t page_size; unsigned i; if (w->n_frames == 0) { return; } memcpy(*cursor, w->hdr, VFS__WAL_HEADER_SIZE); *cursor += VFS__WAL_HEADER_SIZE; page_size = vfsWalGetPageSize(w); assert(page_size > 0); for (i = 0; i < w->n_frames; i++) { struct vfsFrame *frame = w->frames[i]; memcpy(*cursor, frame->header, FORMAT__WAL_FRAME_HDR_SIZE); *cursor += FORMAT__WAL_FRAME_HDR_SIZE; memcpy(*cursor, frame->page, page_size); *cursor += page_size; } } int VfsSnapshot(sqlite3_vfs *vfs, const char *filename, void **data, size_t *n) { tracef("vfs snapshot filename %s", filename); struct vfs *v; struct vfsDatabase *database; struct vfsWal *wal; uint8_t *cursor; v = (struct vfs *)(vfs->pAppData); database = vfsDatabaseLookup(v, filename); if (database == NULL) { tracef("not found"); *data = NULL; *n = 0; return 0; } if (database->n_pages != vfsDatabaseGetNumberOfPages(database)) { tracef("corrupt"); return SQLITE_CORRUPT; } wal = &database->wal; *n = vfsDatabaseFileSize(database) + vfsWalFileSize(wal); /* TODO: we should fix the tests and use sqlite3_malloc instead. 
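* Until then the buffer is allocated with raft_malloc(), so callers
* must release it with raft_free(), e.g.:
*
*     void *data; size_t n;
*     VfsSnapshot(vfs, "test.db", &data, &n);
*     ... persist the n bytes at data ...
*     raft_free(data);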
*/ *data = raft_malloc(*n); if (*data == NULL) { tracef("malloc"); return DQLITE_NOMEM; } cursor = *data; vfsDatabaseSnapshot(database, &cursor); vfsWalSnapshot(wal, &cursor); return 0; } static void vfsDatabaseShallowSnapshot(struct vfsDatabase *d, struct dqlite_buffer *bufs) { uint32_t page_size; page_size = vfsDatabaseGetPageSize(d); assert(page_size > 0); /* Fill the buffers with pointers to all of the database pages */ for (unsigned i = 0; i < d->n_pages; ++i) { bufs[i].base = d->pages[i]; bufs[i].len = page_size; } } int VfsShallowSnapshot(sqlite3_vfs *vfs, const char *filename, struct dqlite_buffer bufs[], uint32_t n) { tracef("vfs shallow snapshot filename %s", filename); struct vfs *v; struct vfsDatabase *database; struct vfsWal *wal; uint8_t *cursor; v = (struct vfs *)(vfs->pAppData); database = vfsDatabaseLookup(v, filename); if (database == NULL) { tracef("not found"); return -1; } if (database->n_pages != vfsDatabaseGetNumberOfPages(database)) { tracef("corrupt"); return SQLITE_CORRUPT; } if (database->n_pages != n - 1) { tracef("wrong number of buffers provided"); return SQLITE_MISUSE; } /* Copy WAL to last buffer. */ wal = &database->wal; bufs[n - 1].len = vfsWalFileSize(wal); bufs[n - 1].base = sqlite3_malloc64(bufs[n - 1].len); /* WAL can have 0 length! */ if (bufs[n - 1].base == NULL && bufs[n - 1].len != 0) { return SQLITE_NOMEM; } cursor = bufs[n - 1].base; vfsWalSnapshot(wal, &cursor); /* Copy page pointers to first n-1 buffers */ vfsDatabaseShallowSnapshot(database, bufs); return 0; } static int vfsDatabaseRestore(struct vfsDatabase *d, const uint8_t *data, size_t n) { uint32_t page_size = vfsParsePageSize(ByteGetBe16(&data[16])); unsigned n_pages; void **pages; unsigned i; size_t offset; int rv; assert(page_size > 0); /* Check that the page size of the snapshot is consistent with what we * have here. */ assert(vfsDatabaseGetPageSize(d) == page_size); n_pages = (unsigned)ByteGetBe32(&data[28]); if (n < (uint64_t)n_pages * (uint64_t)page_size) { return DQLITE_ERROR; } pages = sqlite3_malloc64(sizeof *pages * n_pages); if (pages == NULL) { goto oom; } for (i = 0; i < n_pages; i++) { void *page = sqlite3_malloc64(page_size); if (page == NULL) { unsigned j; for (j = 0; j < i; j++) { sqlite3_free(pages[j]); } goto oom_after_pages_alloc; } pages[i] = page; offset = (size_t)i * (size_t)page_size; memcpy(page, &data[offset], page_size); } /* Truncate any existing content. 
*/ rv = vfsDatabaseTruncate(d, 0); assert(rv == 0); d->pages = pages; d->n_pages = n_pages; return 0; oom_after_pages_alloc: sqlite3_free(pages); oom: return DQLITE_NOMEM; } static int vfsWalRestore(struct vfsWal *w, const uint8_t *data, size_t n, uint32_t page_size) { struct vfsFrame **frames; unsigned n_frames; unsigned i; size_t offset; int rv; if (n == 0) { return 0; } assert(w->n_tx == 0); assert(n > VFS__WAL_HEADER_SIZE); assert(((n - (size_t)VFS__WAL_HEADER_SIZE) % ((size_t)vfsFrameSize(page_size))) == 0); n_frames = (unsigned)((n - (size_t)VFS__WAL_HEADER_SIZE) / ((size_t)vfsFrameSize(page_size))); frames = sqlite3_malloc64(sizeof *frames * n_frames); if (frames == NULL) { goto oom; } for (i = 0; i < n_frames; i++) { struct vfsFrame *frame = vfsFrameCreate(page_size); const uint8_t *p; if (frame == NULL) { unsigned j; for (j = 0; j < i; j++) { vfsFrameDestroy(frames[j]); } goto oom_after_frames_alloc; } frames[i] = frame; offset = (size_t)VFS__WAL_HEADER_SIZE + ((size_t)i * (size_t)vfsFrameSize(page_size)); p = &data[offset]; memcpy(frame->header, p, VFS__FRAME_HEADER_SIZE); memcpy(frame->page, p + VFS__FRAME_HEADER_SIZE, page_size); } memcpy(w->hdr, data, VFS__WAL_HEADER_SIZE); rv = vfsWalTruncate(w, 0); assert(rv == 0); w->frames = frames; w->n_frames = n_frames; return 0; oom_after_frames_alloc: sqlite3_free(frames); oom: return DQLITE_NOMEM; } int VfsRestore(sqlite3_vfs *vfs, const char *filename, const void *data, size_t n) { tracef("vfs restore filename %s size %zd", filename, n); struct vfs *v; struct vfsDatabase *database; struct vfsWal *wal; uint32_t page_size; size_t offset; int rv; v = (struct vfs *)(vfs->pAppData); database = vfsDatabaseLookup(v, filename); assert(database != NULL); wal = &database->wal; /* Truncate any existing content. */ rv = vfsWalTruncate(wal, 0); if (rv != 0) { tracef("wal truncate failed %d", rv); return rv; } /* Restore the content of the main database and of the WAL. */ rv = vfsDatabaseRestore(database, data, n); if (rv != 0) { tracef("database restore failed %d", rv); return rv; } page_size = vfsDatabaseGetPageSize(database); offset = (size_t)database->n_pages * (size_t)page_size; rv = vfsWalRestore(wal, data + offset, n - offset, page_size); if (rv != 0) { tracef("wal restore failed %d", rv); return rv; } return 0; } /****************************************************************************** Disk-based VFS *****************************************************************************/ static int vfsDiskFileClose(sqlite3_file *file) { int rc = SQLITE_OK; struct vfsFile *f = (struct vfsFile *)file; struct vfs *v = (struct vfs *)(f->vfs); if (f->temp != NULL) { /* Close the actual temporary file. */ rc = f->temp->pMethods->xClose(f->temp); sqlite3_free(f->temp); return rc; } if (f->db != NULL) { rc = f->db->pMethods->xClose(f->db); sqlite3_free(f->db); f->db = NULL; if (rc != SQLITE_OK) { return rc; } } if (f->flags & SQLITE_OPEN_DELETEONCLOSE) { rc = vfsDeleteDatabase(v, f->database->name); } return rc; } static int vfsDiskFileRead(sqlite3_file *file, void *buf, int amount, sqlite_int64 offset) { struct vfsFile *f = (struct vfsFile *)file; struct vfs *v; int rv; assert(buf != NULL); assert(amount > 0); assert(f != NULL); if (f->temp != NULL) { /* Read from the actual temporary file. */ return f->temp->pMethods->xRead(f->temp, buf, amount, offset); } if (f->db != NULL) { /* Read from the actual database file. 
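* In disk-mode only the main database lives on disk; the WAL is kept
* in memory, so only VFS__WAL reads fall through to vfsWalRead()
* below.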
*/ return f->db->pMethods->xRead(f->db, buf, amount, offset); } switch (f->type) { case VFS__WAL: rv = vfsWalRead(&f->database->wal, buf, amount, offset); break; case VFS__JOURNAL: rv = SQLITE_IOERR_READ; v = f->vfs; if (v->disk) { rv = SQLITE_OK; } break; default: rv = SQLITE_IOERR_READ; break; } /* From SQLite docs: * * If xRead() returns SQLITE_IOERR_SHORT_READ it must also fill * in the unread portions of the buffer with zeros. A VFS that * fails to zero-fill short reads might seem to work. However, * failure to zero-fill short reads will eventually lead to * database corruption. */ if (rv == SQLITE_IOERR_SHORT_READ) { memset(buf, 0, (size_t)amount); } return rv; } /* Need to keep track of the number of database pages to allow creating correct * WAL headers when in on-disk mode. */ static int vfsDiskDatabaseTrackNumPages(struct vfsDatabase *d, sqlite_int64 offset) { unsigned pgno; if (offset == 0) { pgno = 1; } else { assert(d->page_size != 0); if (d->page_size == 0) { return SQLITE_ERROR; } pgno = ((unsigned)offset / d->page_size) + 1; } if (pgno > d->n_pages) { d->n_pages = pgno; } return SQLITE_OK; } static int vfsDiskFileWrite(sqlite3_file *file, const void *buf, int amount, sqlite_int64 offset) { struct vfsFile *f = (struct vfsFile *)file; int rv; assert(buf != NULL); assert(amount > 0); assert(f != NULL); if (f->temp != NULL) { /* Write to the actual temporary file. */ return f->temp->pMethods->xWrite(f->temp, buf, amount, offset); } if (f->db != NULL) { /* Write to the actual database file. */ vfsDiskDatabaseTrackNumPages(f->database, offset); rv = f->db->pMethods->xWrite(f->db, buf, amount, offset); tracef("vfsDiskFileWrite %s amount:%d rv:%d", "db", amount, rv); return rv; } switch (f->type) { case VFS__WAL: rv = vfsWalWrite(&f->database->wal, buf, amount, offset); break; case VFS__JOURNAL: /* Silently swallow writes to the journal */ rv = SQLITE_OK; break; default: rv = SQLITE_IOERR_WRITE; break; } return rv; } static int vfsDiskFileTruncate(sqlite3_file *file, sqlite_int64 size) { struct vfsFile *f = (struct vfsFile *)file; int rv; assert(f != NULL); if (f->db != NULL) { return f->db->pMethods->xTruncate(f->db, size); } switch (f->type) { case VFS__WAL: rv = vfsWalTruncate(&f->database->wal, size); break; default: rv = SQLITE_IOERR_TRUNCATE; break; } return rv; } static int vfsDiskFileSync(sqlite3_file *file, int flags) { int rv; struct vfsFile *f = (struct vfsFile *)file; if (f->db != NULL) { rv = f->db->pMethods->xSync(f->db, flags); return rv; } return SQLITE_IOERR_FSYNC; } static int vfsDiskFileSize(sqlite3_file *file, sqlite_int64 *size) { struct vfsFile *f = (struct vfsFile *)file; size_t n; if (f->db != NULL) { return f->db->pMethods->xFileSize(f->db, size); } switch (f->type) { case VFS__WAL: /* TODO? here we assume that FileSize() is never invoked * between a header write and a page write. */ n = vfsWalFileSize(&f->database->wal); break; default: n = 0; break; } *size = (sqlite3_int64)n; return SQLITE_OK; } /* Locking a file is a no-op, since no other process has visibility on it, * unless the database resides on disk. */ static int vfsDiskFileLock(sqlite3_file *file, int lock) { struct vfsFile *f = (struct vfsFile *)file; if (f->db != NULL) { return f->db->pMethods->xLock(f->db, lock); } return SQLITE_OK; } /* Unlocking a file is a no-op, since no other process has visibility on it, * unless the database resides on disk. 
*/ static int vfsDiskFileUnlock(sqlite3_file *file, int lock) { struct vfsFile *f = (struct vfsFile *)file; if (f->db != NULL) { return f->db->pMethods->xUnlock(f->db, lock); } return SQLITE_OK; } /* We always report that a lock is held. This routine should be used only in * journal mode, so it doesn't matter. * TODO does this need to be treated differently in disk-mode? * */ static int vfsDiskFileCheckReservedLock(sqlite3_file *file, int *result) { (void)file; *result = 1; return SQLITE_OK; } /* Handle a pragma file control. See the xFileControl * docstring in sqlite.h.in for more details. */ static int vfsDiskFileControlPragma(struct vfsFile *f, char **fnctl) { int rv; const char *left; const char *right; assert(f != NULL); assert(fnctl != NULL); left = fnctl[1]; right = fnctl[2]; assert(left != NULL); if (strcmp(left, "page_size") == 0 && right) { int page_size = atoi(right); /* The first page_size pragma sets the page_size member of the db * and is called by dqlite based on the page_size configuration. * Only used for on-disk databases. * */ if (f->db == NULL) { fnctl[0] = sqlite3_mprintf("no DB file found"); return SQLITE_IOERR; } if (page_size > UINT16_MAX) { fnctl[0] = sqlite3_mprintf("max page_size exceeded"); return SQLITE_IOERR; } if (f->database->page_size == 0) { rv = f->db->pMethods->xFileControl( f->db, SQLITE_FCNTL_PRAGMA, fnctl); if (rv == SQLITE_NOTFOUND || rv == SQLITE_OK) { f->database->page_size = (uint16_t)page_size; } return rv; } else if ((uint16_t)page_size != f->database->page_size) { fnctl[0] = sqlite3_mprintf( "changing page size is not supported"); return SQLITE_IOERR; } } else if (strcmp(left, "journal_mode") == 0 && right) { /* When the user executes 'PRAGMA journal_mode=x' we ensure * that the desired mode is 'wal'. */ if (strcasecmp(right, "wal") != 0) { fnctl[0] = sqlite3_mprintf("only WAL mode is supported"); return SQLITE_IOERR; } } /* We're returning NOTFOUND here to tell SQLite that we wish it to go on * with its own handling as well. If we returned SQLITE_OK the page size * or the journal mode wouldn't actually be set, as the processing of * the PRAGMA would stop here. */ return SQLITE_NOTFOUND; } static int vfsDiskFileControl(sqlite3_file *file, int op, void *arg) { struct vfsFile *f = (struct vfsFile *)file; int rv; assert(f->type == VFS__DATABASE); switch (op) { case SQLITE_FCNTL_PRAGMA: rv = vfsDiskFileControlPragma(f, arg); break; case SQLITE_FCNTL_COMMIT_PHASETWO: rv = vfsFileControlCommitPhaseTwo(f); break; case SQLITE_FCNTL_PERSIST_WAL: /* This prevents SQLite from deleting the WAL after the * last connection is closed. 
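* dqlite manages the WAL image's lifetime itself (vfsDiskDelete
* deliberately ignores "-wal" names), so the WAL must survive
* individual connections.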
*/ *(int *)(arg) = 1; rv = SQLITE_OK; break; default: rv = SQLITE_OK; break; } return rv; } static int vfsDiskFileSectorSize(sqlite3_file *file) { struct vfsFile *f = (struct vfsFile *)file; if (f->db != NULL) { return f->db->pMethods->xSectorSize(f->db); } return 0; } static int vfsDiskFileDeviceCharacteristics(sqlite3_file *file) { struct vfsFile *f = (struct vfsFile *)file; if (f->db != NULL) { return f->db->pMethods->xDeviceCharacteristics(f->db); } return 0; } static const sqlite3_io_methods vfsDiskFileMethods = { 2, // iVersion vfsDiskFileClose, // xClose vfsDiskFileRead, // xRead vfsDiskFileWrite, // xWrite vfsDiskFileTruncate, // xTruncate vfsDiskFileSync, // xSync vfsDiskFileSize, // xFileSize vfsDiskFileLock, // xLock vfsDiskFileUnlock, // xUnlock vfsDiskFileCheckReservedLock, // xCheckReservedLock vfsDiskFileControl, // xFileControl vfsDiskFileSectorSize, // xSectorSize vfsDiskFileDeviceCharacteristics, // xDeviceCharacteristics vfsFileShmMap, // xShmMap vfsFileShmLock, // xShmLock vfsFileShmBarrier, // xShmBarrier vfsFileShmUnmap, // xShmUnmap 0, 0, }; static int vfsDiskOpen(sqlite3_vfs *vfs, const char *filename, sqlite3_file *file, int flags, int *out_flags) { struct vfs *v; struct vfsFile *f; struct vfsDatabase *database; enum vfsFileType type; bool exists; int exclusive = flags & SQLITE_OPEN_EXCLUSIVE; int create = flags & SQLITE_OPEN_CREATE; int rc; tracef("filename:%s", filename); (void)out_flags; assert(vfs != NULL); assert(vfs->pAppData != NULL); assert(file != NULL); /* From sqlite3.h.in: * * The SQLITE_OPEN_EXCLUSIVE flag is always used in conjunction with * the SQLITE_OPEN_CREATE flag, which are both directly analogous to * the O_EXCL and O_CREAT flags of the POSIX open() API. The * SQLITE_OPEN_EXCLUSIVE flag, when paired with the * SQLITE_OPEN_CREATE, is used to indicate that file should always be * created, and that it is an error if it already exists. It is not * used to indicate the file should be opened for exclusive access. */ assert(!exclusive || create); v = (struct vfs *)(vfs->pAppData); f = (struct vfsFile *)file; /* This tells SQLite to not call Close() in case we return an error. */ f->base.pMethods = 0; f->temp = NULL; f->db = NULL; /* Save the flags */ f->flags = flags; /* From SQLite documentation: * * If the zFilename parameter to xOpen is a NULL pointer then xOpen * must invent its own temporary name for the file. Whenever the * xFilename parameter is NULL it will also be the case that the * flags parameter will include SQLITE_OPEN_DELETEONCLOSE. */ if (filename == NULL) { assert(flags & SQLITE_OPEN_DELETEONCLOSE); /* Open an actual temporary file. */ vfs = v->base_vfs; f->temp = sqlite3_malloc(vfs->szOsFile); if (f->temp == NULL) { v->error = ENOENT; return SQLITE_CANTOPEN; } rc = vfs->xOpen(vfs, NULL, f->temp, flags, out_flags); if (rc != SQLITE_OK) { sqlite3_free(f->temp); return rc; } f->base.pMethods = &vfsDiskFileMethods; f->vfs = NULL; f->database = NULL; return SQLITE_OK; } /* Search if the database object exists already. */ database = vfsDatabaseLookup(v, filename); exists = database != NULL; if (flags & SQLITE_OPEN_MAIN_DB) { type = VFS__DATABASE; } else if (flags & SQLITE_OPEN_MAIN_JOURNAL) { type = VFS__JOURNAL; } else if (flags & SQLITE_OPEN_WAL) { type = VFS__WAL; } else { v->error = ENOENT; return SQLITE_CANTOPEN; } /* If file exists, and the exclusive flag is on, return an error. 
*/ if (exists && exclusive && create && type == VFS__DATABASE) { v->error = EEXIST; rc = SQLITE_CANTOPEN; goto err; } if (!exists) { /* When opening a WAL or journal file we expect the main * database file to have already been created. */ if (type == VFS__WAL || type == VFS__JOURNAL) { v->error = ENOENT; rc = SQLITE_CANTOPEN; goto err; } assert(type == VFS__DATABASE); /* Check the create flag. */ if (!create) { v->error = ENOENT; rc = SQLITE_CANTOPEN; goto err; } database = vfsCreateDatabase(v, filename); if (database == NULL) { v->error = ENOMEM; rc = SQLITE_CANTOPEN; goto err; } } if (type == VFS__DATABASE && v->disk) { /* Open an actual database file. */ vfs = v->base_vfs; f->db = sqlite3_malloc(vfs->szOsFile); if (f->db == NULL) { return SQLITE_NOMEM; } rc = vfs->xOpen(vfs, filename, f->db, flags, out_flags); if (rc != SQLITE_OK) { sqlite3_free(f->db); f->db = NULL; return rc; } } /* Populate the new file handle. */ f->base.pMethods = &vfsDiskFileMethods; f->vfs = v; f->type = type; f->database = database; return SQLITE_OK; err: assert(rc != SQLITE_OK); return rc; } static int vfsDiskDelete(sqlite3_vfs *vfs, const char *filename, int dir_sync) { int rv; struct vfs *v; (void)dir_sync; assert(vfs != NULL); assert(vfs->pAppData != NULL); if (vfsFilenameEndsWith(filename, "-journal")) { return SQLITE_OK; } if (vfsFilenameEndsWith(filename, "-wal")) { return SQLITE_OK; } v = (struct vfs *)(vfs->pAppData); rv = vfsDeleteDatabase(v, filename); if (rv != 0) { return rv; } if (!v->disk) { return rv; } return v->base_vfs->xDelete(v->base_vfs, filename, dir_sync); } static int vfsDiskAccess(sqlite3_vfs *vfs, const char *filename, int flags, int *result) { struct vfs *v; struct vfsDatabase *database; (void)flags; assert(vfs != NULL); assert(vfs->pAppData != NULL); v = (struct vfs *)(vfs->pAppData); /* If the database object exists, we consider all associated files as * existing and accessible. */ database = vfsDatabaseLookup(v, filename); if (database == NULL) { *result = 0; } else if (vfsFilenameEndsWith(filename, "-journal")) { *result = 1; } else if (vfsFilenameEndsWith(filename, "-wal")) { *result = 1; } else { /* dqlite database object exists, now check if the regular * SQLite file exists, passing the base VFS its own handle. */ return v->base_vfs->xAccess(v->base_vfs, filename, flags, result); } return SQLITE_OK; } int VfsEnableDisk(struct sqlite3_vfs *vfs) { if (vfs->pAppData == NULL) { return -1; } struct vfs *v = vfs->pAppData; v->disk = true; vfs->xOpen = vfsDiskOpen; vfs->xDelete = vfsDiskDelete; vfs->xAccess = vfsDiskAccess; /* TODO check if below functions need alteration for on-disk case. */ vfs->xFullPathname = vfsFullPathname; vfs->xDlOpen = vfsDlOpen; vfs->xDlError = vfsDlError; vfs->xDlSym = vfsDlSym; vfs->xDlClose = vfsDlClose; vfs->xRandomness = vfsRandomness; vfs->xSleep = vfsSleep; vfs->xCurrentTime = vfsCurrentTime; vfs->xGetLastError = vfsGetLastError; vfs->xCurrentTimeInt64 = vfsCurrentTimeInt64; return 0; } int VfsDiskSnapshotWal(sqlite3_vfs *vfs, const char *path, struct dqlite_buffer *buf) { struct vfs *v; struct vfsDatabase *database; struct vfsWal *wal; uint8_t *cursor; int rv; v = (struct vfs *)(vfs->pAppData); database = vfsDatabaseLookup(v, path); if (database == NULL) { tracef("not found"); rv = SQLITE_NOTFOUND; goto err; } /* Copy the WAL into the provided buffer. */ wal = &database->wal; buf->len = vfsWalFileSize(wal); buf->base = sqlite3_malloc64(buf->len); /* WAL can have 0 length! 
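* vfsWalFileSize() reports 0 when no frames were ever appended, and
* sqlite3_malloc64(0) returns NULL by design, so a NULL base paired
* with a zero length is not an allocation failure here.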
*/ if (buf->base == NULL && buf->len != 0) { rv = SQLITE_NOMEM; goto err; } cursor = buf->base; vfsWalSnapshot(wal, &cursor); return 0; err: return rv; } int VfsDiskSnapshotDb(sqlite3_vfs *vfs, const char *path, struct dqlite_buffer *buf) { struct vfs *v; struct vfsDatabase *database; int fd; int rv; char *addr; struct stat sb; v = (struct vfs *)(vfs->pAppData); database = vfsDatabaseLookup(v, path); if (database == NULL) { tracef("not found"); rv = SQLITE_NOTFOUND; goto err; } /* mmap the database file */ fd = open(path, O_RDONLY); if (fd == -1) { tracef("failed to open %s", path); rv = SQLITE_IOERR; goto err; } rv = fstat(fd, &sb); if (rv == -1) { tracef("fstat failed path:%s fd:%d", path, fd); close(fd); rv = SQLITE_IOERR; goto err; } /* TODO database size limited to whatever fits in a size_t. Multiple * mmap's needed. This limitation also exists in various other places * throughout the codebase. */ addr = mmap(NULL, (size_t)sb.st_size, PROT_READ, MAP_SHARED, fd, 0); close(fd); if (addr == MAP_FAILED) { rv = SQLITE_IOERR; goto err; } buf->base = addr; buf->len = (size_t)sb.st_size; return 0; err: return rv; } static int vfsDiskDatabaseRestore(struct vfsDatabase *d, const char *filename, const uint8_t *data, size_t n) { int rv = 0; int fd; ssize_t sz; /* rv of write */ uint32_t page_size; unsigned n_pages; const uint8_t *cursor; size_t n_left; /* amount of data left to write */ fd = open(filename, O_CREAT | O_TRUNC | O_WRONLY, 0600); if (fd == -1) { tracef("open failed filename:%s", filename); return -1; } n_left = n; cursor = data; while (n_left > 0) { sz = write(fd, cursor, n_left); /* sz == 0 should not be possible when writing a positive amount * of bytes. */ if (sz <= 0) { tracef("write failed n:%zd sz:%zd errno:%d", n_left, sz, errno); rv = DQLITE_ERROR; goto out; } n_left -= (size_t)sz; cursor += sz; } page_size = vfsParsePageSize(ByteGetBe16(&data[16])); assert(page_size > 0); /* Check that the page size of the snapshot is consistent with what we * have here. */ assert(vfsDatabaseGetPageSize(d) == page_size); n_pages = (unsigned)ByteGetBe32(&data[28]); d->n_pages = n_pages; d->page_size = page_size; out: close(fd); return rv; } int VfsDiskRestore(sqlite3_vfs *vfs, const char *path, const void *data, size_t main_size, size_t wal_size) { tracef("vfs restore path %s main_size %zd wal_size %zd", path, main_size, wal_size); struct vfs *v; struct vfsDatabase *database; struct vfsWal *wal; uint32_t page_size; int rv; v = (struct vfs *)(vfs->pAppData); database = vfsDatabaseLookup(v, path); assert(database != NULL); wal = &database->wal; /* Truncate any existing content. */ rv = vfsWalTruncate(wal, 0); if (rv != 0) { tracef("wal truncate failed %d", rv); return rv; } /* Restore the content of the main database and of the WAL. 
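* The snapshot layout is the main file image (main_size bytes)
* followed immediately by the WAL image (wal_size bytes), which is
* why the WAL restore below reads from data + main_size.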
*/ rv = vfsDiskDatabaseRestore(database, path, data, main_size); if (rv != 0) { tracef("database restore failed %d", rv); return rv; } page_size = vfsDatabaseGetPageSize(database); rv = vfsWalRestore(wal, data + main_size, wal_size, page_size); if (rv != 0) { tracef("wal restore failed %d", rv); return rv; } return 0; } uint64_t VfsDatabaseSize(sqlite3_vfs *vfs, const char *path, unsigned n, unsigned page_size) { struct vfs *v; struct vfsDatabase *database; struct vfsWal *wal; uint64_t new_wal_size; v = (struct vfs *)(vfs->pAppData); database = vfsDatabaseLookup(v, path); assert(database != NULL); wal = &database->wal; new_wal_size = (uint64_t)vfsWalFileSize(wal); if (new_wal_size == 0) { new_wal_size += (uint64_t)VFS__WAL_HEADER_SIZE; } new_wal_size += (uint64_t)n * (uint64_t)vfsFrameSize(page_size); return (uint64_t)vfsDatabaseFileSize(database) + new_wal_size; } uint64_t VfsDatabaseSizeLimit(sqlite3_vfs *vfs) { (void)vfs; return (uint64_t)SIZE_MAX; } dqlite-1.16.7/src/vfs.h000066400000000000000000000051041465252713400146370ustar00rootroot00000000000000#ifndef VFS_H_ #define VFS_H_ #include <sqlite3.h> #include "config.h" /* Initialize the given SQLite VFS interface with dqlite's custom * implementation. */ int VfsInit(struct sqlite3_vfs *vfs, const char *name); int VfsEnableDisk(struct sqlite3_vfs *vfs); /* Release all memory associated with the given dqlite in-memory VFS * implementation. * * This function also automatically unregisters the implementation from the * SQLite global registry. */ void VfsClose(struct sqlite3_vfs *vfs); /* Check if the last sqlite3_step() call triggered a write transaction, and * return its content if so. */ int VfsPoll(sqlite3_vfs *vfs, const char *database, dqlite_vfs_frame **frames, unsigned *n); /* Append the given frames to the WAL. */ int VfsApply(sqlite3_vfs *vfs, const char *filename, unsigned n, unsigned long *page_numbers, void *frames); /* Cancel a pending transaction. */ int VfsAbort(sqlite3_vfs *vfs, const char *filename); /* Make a full snapshot of a database. */ int VfsSnapshot(sqlite3_vfs *vfs, const char *filename, void **data, size_t *n); /* Makes a full, shallow snapshot of a database. The first n-1 buffers will each * contain a pointer to one of the actual database pages, while the n'th buffer * will contain a copy of the WAL. `bufs` MUST point to an array of n * `dqlite_buffer` structs and n MUST equal 1 + the number of pages in * the database. */ int VfsShallowSnapshot(sqlite3_vfs *vfs, const char *filename, struct dqlite_buffer bufs[], uint32_t n); /* Copies the WAL into buf */ int VfsDiskSnapshotWal(sqlite3_vfs *vfs, const char *path, struct dqlite_buffer *buf); /* `mmap` the database into buf. */ int VfsDiskSnapshotDb(sqlite3_vfs *vfs, const char *path, struct dqlite_buffer *buf); /* Restore a database snapshot. */ int VfsRestore(sqlite3_vfs *vfs, const char *filename, const void *data, size_t n); /* Restore a disk database snapshot. */ int VfsDiskRestore(sqlite3_vfs *vfs, const char *path, const void *data, size_t main_size, size_t wal_size); /* Number of pages in the database. */ int VfsDatabaseNumPages(sqlite3_vfs *vfs, const char *filename, uint32_t *n); /* Returns the resulting size of the main file, wal file and n additional WAL * frames with the specified page_size. */ uint64_t VfsDatabaseSize(sqlite3_vfs *vfs, const char *path, unsigned n, unsigned page_size); /* Returns the maximum size of the main file and wal file. 
*/ uint64_t VfsDatabaseSizeLimit(sqlite3_vfs *vfs); #endif /* VFS_H_ */ dqlite-1.16.7/src/vfs2.c000066400000000000000000001550711465252713400147250ustar00rootroot00000000000000#include "vfs2.h" #include "lib/byte.h" #include "lib/queue.h" #include "lib/sm.h" #include "tracing.h" #include "utils.h" #include #include #include #include #include #include #include #include #include #include #include #include #define VFS2_WAL_FIXED_SUFFIX1 "-xwal1" #define VFS2_WAL_FIXED_SUFFIX2 "-xwal2" #define VFS2_WAL_INDEX_REGION_SIZE (1 << 15) #define VFS2_WAL_FRAME_HDR_SIZE 24 #define VFS2_EXCLUSIVE UINT_MAX #define WAL_WRITE_LOCK 0 #define WAL_CKPT_LOCK 1 #define WAL_RECOVER_LOCK 2 #define LE_MAGIC 0x377f0682 #define BE_MAGIC 0x377f0683 #define WAL_NREADER (SQLITE_SHM_NLOCK - 3) #define READ_MARK_UNUSED 0xffffffff static const uint32_t invalid_magic = 0x17171717; enum { /* Entry is not yet open. */ WTX_CLOSED, /* Next WAL write will be a header write, causing a WAL swap (WAL-cur is empty or fully checkpointed). */ WTX_EMPTY, /* Non-leader, at least one transaction in WAL-cur is not committed. */ WTX_FOLLOWING, /* Non-leader, all transactions in WAL-cur are committed (but at least one is not checkpointed). */ WTX_FLUSH, /* Leader, all transactions in WAL-cur are committed (but at least one is not checkpointed). */ WTX_BASE, /* Leader, transaction in progress. */ WTX_ACTIVE, /* Leader, transaction committed by SQLite and hidden. */ WTX_HIDDEN, /* Leader, transation committed by SQLite, hidden, and polled. */ WTX_POLLED }; /* Diagram of the state machine (some transitions omitted when they would crowd the diagram even more): +----------------------+ sqlite3_open +------------------------------------------------------------------------------+ | CLOSED | ------------------------> | FOLLOWING | <+ +----------------------+ +------------------------------------------------------------------------------+ | | ^ ^ | | | sqlite3_open | vfs2_apply_uncommitted | vfs2_apply_uncommitted | vfs2_{commit,unapply} | vfs2_apply_uncommitted v | | v | +----------------------------------------------------------------------------+ | +------------------------+ | | | +----------------------- | | | | EMPTY | | FLUSH | | | | <------------------------ | | | +----------------------------------------------------------------------------+ sqlite3_wal_checkpoint +------------------------+ | | ^ | | | vfs2_commit_barrier | sqlite3_wal_checkpoint | | v | | | +----------------------------------------------------------------------------+ vfs2_commit_barrier | | | BASE | <---------------------------+ | +----------------------------------------------------------------------------+ | | ^ | | | sqlite3_step | vfs2_unhide +-------------------------------------------------------------------------------+ v | +----------------------+ | | ACTIVE | | +----------------------+ | | | | COMMIT_PHASETWO | v | +----------------------+ | | HIDDEN | | +----------------------+ | | | | vfs2_poll | v | +----------------------+ | | POLLED | -+ +----------------------+ */ static const struct sm_conf wtx_states[SM_STATES_MAX] = { [WTX_CLOSED] = { .flags = SM_INITIAL|SM_FINAL, .name = "closed", .allowed = BITS(WTX_EMPTY)|BITS(WTX_FOLLOWING)|BITS(WTX_FLUSH), }, [WTX_EMPTY] = { .flags = 0, .name = "empty", .allowed = BITS(WTX_FOLLOWING)|BITS(WTX_FLUSH)|BITS(WTX_ACTIVE)|BITS(WTX_CLOSED), }, [WTX_FOLLOWING] = { .flags = 0, .name = "following", .allowed = BITS(WTX_FOLLOWING)|BITS(WTX_FLUSH)|BITS(WTX_CLOSED), }, [WTX_FLUSH] = { .flags = 0, .name = "flush", .allowed = 
BITS(WTX_FOLLOWING)|BITS(WTX_FLUSH)|BITS(WTX_ACTIVE)|BITS(WTX_CLOSED), }, [WTX_BASE] = { .flags = 0, .name = "base", .allowed = BITS(WTX_FOLLOWING)|BITS(WTX_BASE)|BITS(WTX_ACTIVE)|BITS(WTX_EMPTY)|BITS(WTX_CLOSED), }, [WTX_ACTIVE] = { .flags = 0, .name = "active", .allowed = BITS(WTX_BASE)|BITS(WTX_ACTIVE)|BITS(WTX_HIDDEN)|BITS(WTX_CLOSED), }, [WTX_HIDDEN] = { .flags = 0, .name = "hidden", .allowed = BITS(WTX_BASE)|BITS(WTX_POLLED)|BITS(WTX_CLOSED), }, [WTX_POLLED] = { .flags = 0, .name = "polled", .allowed = BITS(WTX_BASE)|BITS(WTX_CLOSED), }, }; /** * Userdata owned by the VFS. */ struct common { sqlite3_vfs *orig; /* underlying VFS */ pthread_rwlock_t rwlock; /* protects the queue */ queue queue; /* queue of entry */ }; struct cksums { uint32_t cksum1; uint32_t cksum2; }; static bool is_bigendian(void) { int x = 1; return *(char *)(&x) == 0; } static uint32_t native_magic(void) { return is_bigendian() ? BE_MAGIC : LE_MAGIC; } static void update_cksums(uint32_t magic, const uint8_t *p, size_t len, struct cksums *sums) { PRE(magic == BE_MAGIC || magic == LE_MAGIC); PRE(len % 8 == 0); const uint8_t *end = p + len; for (; p != end; p += 8) { if (magic == BE_MAGIC) { sums->cksum1 += ByteGetBe32(p) + sums->cksum2; sums->cksum2 += ByteGetBe32(p + 4) + sums->cksum1; } else { sums->cksum1 += ByteGetLe32(p) + sums->cksum2; sums->cksum2 += ByteGetLe32(p + 4) + sums->cksum1; } } } static bool cksums_equal(struct cksums a, struct cksums b) { return a.cksum1 == b.cksum1 && a.cksum2 == b.cksum2; } /** * Layout-compatible with the first part of the WAL index header. * * Note: everything is native-endian except the salts, hence the use of * native integer types here. */ struct wal_index_basic_hdr { uint32_t iVersion; uint8_t unused[4]; uint32_t iChange; uint8_t isInit; uint8_t bigEndCksum; uint16_t szPage; uint32_t mxFrame; uint32_t nPage; struct cksums frame_cksums; struct vfs2_salts salts; struct cksums cksums; }; struct wal_hdr { uint8_t magic[4]; uint8_t version[4]; uint8_t page_size[4]; uint8_t ckpoint_seqno[4]; struct vfs2_salts salts; uint8_t cksum1[4]; uint8_t cksum2[4]; }; struct wal_frame_hdr { uint8_t page_number[4]; uint8_t commit[4]; struct vfs2_salts salts; uint8_t cksum1[4]; uint8_t cksum2[4]; }; struct wal_index_full_hdr { struct wal_index_basic_hdr basic[2]; uint32_t nBackfill; uint32_t marks[WAL_NREADER]; uint8_t locks[SQLITE_SHM_NLOCK]; uint32_t nBackfillAttempted; uint8_t unused[4]; }; /** * View of the zeroth shm region, which contains the WAL index header. */ union vfs2_shm_region0 { struct wal_index_full_hdr hdr; char bytes[VFS2_WAL_INDEX_REGION_SIZE]; }; struct entry { /* Next/prev entries for this VFS. */ queue link; /* e.g. /path/to/some.db */ char *main_db_name; /* The WALs are represented by two physical files (inodes) * and three filenames. For each of the two physical files * there is a "fixed name" that always points to that file. * The "moving name" always points to one of the two physical * files, but switches between them on every WAL swap. */ /* e.g. /path/to/some.db-wal */ char *wal_moving_name; /* e.g. /path/to/some.db-xwal1 */ char *wal_cur_fixed_name; /* Base VFS file object for WAL-cur */ sqlite3_file *wal_cur; /* e.g. 
/path/to/some.db-xwal2 */ char *wal_prev_fixed_name; /* Base VFS file object for WAL-prev */ sqlite3_file *wal_prev; /* Number of `struct file` with SQLITE_OPEN_MAIN_DB that point to this entry */ unsigned refcount_main_db; /* Number of `struct file` with SQLITE_OPEN_WAL that point to this entry */ unsigned refcount_wal; /* if WAL-cur is nonempty at startup, we read its header, verify the checksum, * and use it to initialize the page size. otherwise, we wait until the first * write to the WAL, which should be the header */ uint32_t page_size; /* For ACTIVE, HIDDEN, POLLED: the header that hides the pending txn */ struct wal_index_basic_hdr prev_txn_hdr; /* For ACTIVE, HIDDEN, POLLED: the header that shows the pending txn */ struct wal_index_basic_hdr pending_txn_hdr; /* shm implementation; holds the WAL index */ void **shm_regions; int shm_regions_len; unsigned shm_refcount; /* Zero for unlocked, positive for read-locked, UINT_MAX for write-locked */ unsigned shm_locks[SQLITE_SHM_NLOCK]; /* For ACTIVE, HIDDEN: the pending txn. start and len * are in units of frames. */ struct vfs2_wal_frame *pending_txn_frames; uint32_t pending_txn_start; uint32_t pending_txn_len; uint32_t pending_txn_last_frame_commit; /* Frame index, points to the physical end of WAL-cur */ uint32_t wal_cursor; /* Cached header of WAL-cur */ struct wal_hdr wal_cur_hdr; /* Cached header of WAL-prev */ struct wal_hdr wal_prev_hdr; struct sm wtx_sm; /* VFS-wide data (immutable) */ struct common *common; }; /** * VFS-specific file object, upcastable to sqlite3_file. */ struct file { /* Our custom sqlite3_io_methods vtable; must come first. Always * present. */ struct sqlite3_file base; /* Flags passed to the xOpen that created this file. */ int flags; /* File object created by the base (unix) VFS. Not used for WAL. */ sqlite3_file *orig; /* Common data between main DB and WAL. Not used for other kinds of * file. 
*/ struct entry *entry; }; static void free_pending_txn(struct entry *e) { if (e->pending_txn_frames != NULL) { for (uint32_t i = 0; i < e->pending_txn_len; i++) { sqlite3_free(e->pending_txn_frames[i].page); } sqlite3_free(e->pending_txn_frames); } e->pending_txn_frames = 0; e->pending_txn_len = 0; e->pending_txn_last_frame_commit = 0; } static uint32_t get_salt1(struct vfs2_salts s) { return ByteGetBe32(s.salt1); } static uint32_t get_salt2(struct vfs2_salts s) { return ByteGetBe32(s.salt2); } static bool salts_equal(struct vfs2_salts a, struct vfs2_salts b) { return get_salt1(a) == get_salt1(b) && get_salt2(a) == get_salt2(b); } static struct wal_index_full_hdr *get_full_hdr(struct entry *e) { PRE(e->shm_regions_len > 0); PRE(e->shm_regions != NULL); return e->shm_regions[0]; } static bool no_pending_txn(const struct entry *e) { return e->pending_txn_len == 0 && e->pending_txn_frames == NULL && e->pending_txn_last_frame_commit == 0; } static bool write_lock_held(const struct entry *e) { return e->shm_locks[WAL_WRITE_LOCK] == VFS2_EXCLUSIVE; } static bool wal_index_basic_hdr_equal(struct wal_index_basic_hdr a, struct wal_index_basic_hdr b) { return memcmp(&a, &b, sizeof(struct wal_index_basic_hdr)) == 0; } static bool wal_index_basic_hdr_zeroed(struct wal_index_basic_hdr h) { return wal_index_basic_hdr_equal(h, (struct wal_index_basic_hdr){}); } static bool wal_index_basic_hdr_advanced(struct wal_index_basic_hdr new, struct wal_index_basic_hdr old) { return new.iChange == old.iChange + 1 && new.nPage >= old.nPage /* no vacuums here */ && ((get_salt1(new.salts) == get_salt1(old.salts) && get_salt2(new.salts) == get_salt2(old.salts)) || /* note the weirdness with zero salts */ (get_salt1(old.salts) == 0 && get_salt2(old.salts) == 0)) && new.mxFrame > old.mxFrame; } /* Check that the hash tables in the WAL index have been initialized * by looking for nonzero bytes after the WAL index header. (TODO: * actually parse the hash tables?) */ static bool wal_index_recovered(const struct entry *e) { PRE(e->shm_regions_len > 0); char *p = e->shm_regions[0]; for (size_t i = sizeof(struct wal_index_full_hdr); i < VFS2_WAL_INDEX_REGION_SIZE; i++) { if (p[i] != 0) { return true; } } return false; } static bool is_valid_page_size(unsigned long n) { return n >= 1 << 9 && n <= 1 << 16 && is_po2(n); } static bool is_open(const struct entry *e) { return e->main_db_name != NULL && e->wal_moving_name != NULL && e->wal_cur_fixed_name != NULL && e->wal_cur != NULL && e->wal_prev_fixed_name != NULL && e->wal_prev != NULL && (e->refcount_main_db > 0 || e->refcount_wal > 0) && e->shm_regions != NULL && e->shm_regions_len > 0 && e->shm_regions[0] != NULL && e->common != NULL; } static bool basic_hdr_valid(struct wal_index_basic_hdr bhdr) { struct cksums sums = {}; update_cksums(bhdr.bigEndCksum ? 
BE_MAGIC : LE_MAGIC, (uint8_t *)&bhdr, offsetof(struct wal_index_basic_hdr, cksums), &sums); return bhdr.iVersion == 3007000 && bhdr.isInit == 1 && cksums_equal(sums, bhdr.cksums); } static bool full_hdr_valid(const struct wal_index_full_hdr *ihdr) { return basic_hdr_valid(ihdr->basic[0]) && wal_index_basic_hdr_equal(ihdr->basic[0], ihdr->basic[1]); } static bool wtx_invariant(const struct sm *sm, int prev) { /* TODO make use of this */ (void)prev; struct entry *e = CONTAINER_OF(sm, struct entry, wtx_sm); if (sm_state(sm) == WTX_CLOSED) { char *region = (char *)e; char zeroed[offsetof(struct entry, wtx_sm)] = {}; return CHECK(memcmp(region, zeroed, sizeof(zeroed)) == 0) && CHECK(e->common != NULL); } if (!CHECK(is_open(e))) { return false; } struct wal_index_full_hdr *ihdr = get_full_hdr(e); if (!CHECK(full_hdr_valid(ihdr))) { return false; } uint32_t mx = ihdr->basic[0].mxFrame; uint32_t backfill = ihdr->nBackfill; uint32_t cursor = e->wal_cursor; if (!CHECK(backfill <= mx) || !CHECK(mx <= cursor)) { return false; } /* TODO any checks applicable to the read marks and read locks? */ if (sm_state(sm) == WTX_EMPTY) { return CHECK(mx == backfill) && CHECK(mx == cursor) && CHECK(no_pending_txn(e)) && CHECK(!write_lock_held(e)); } if (!CHECK(is_valid_page_size(e->page_size))) { return false; } if (sm_state(sm) == WTX_FOLLOWING) { return CHECK(no_pending_txn(e)) && CHECK(write_lock_held(e)) && CHECK(mx < cursor); } if (sm_state(sm) == WTX_FLUSH) { return CHECK(no_pending_txn(e)) && CHECK(!write_lock_held(e)) && CHECK(ERGO(mx > 0, backfill < mx)) && CHECK(mx == cursor); } if (sm_state(sm) == WTX_BASE) { return CHECK(no_pending_txn(e)) && CHECK(!write_lock_held(e)) && CHECK(ERGO(mx > 0, backfill < mx)) && CHECK(mx == cursor) && CHECK(ERGO(mx > 0, wal_index_recovered(e))); } if (sm_state(sm) == WTX_ACTIVE) { return CHECK(wal_index_basic_hdr_equal(get_full_hdr(e)->basic[0], e->prev_txn_hdr)) && CHECK(wal_index_basic_hdr_zeroed(e->pending_txn_hdr)) && CHECK(write_lock_held(e)); } if (!CHECK(mx < cursor) || !CHECK(e->pending_txn_len > 0) || !CHECK(e->pending_txn_start + e->pending_txn_len == e->wal_cursor)) { return false; } if (sm_state(sm) == WTX_HIDDEN) { bool res = CHECK(wal_index_basic_hdr_equal(get_full_hdr(e)->basic[0], e->prev_txn_hdr)) && CHECK(wal_index_basic_hdr_advanced(e->pending_txn_hdr, e->prev_txn_hdr)) && CHECK(!write_lock_held(e)) && CHECK(e->pending_txn_frames != NULL); if (!res) { return false; } for (uint32_t i = 0; i < e->pending_txn_len; i++) { res &= CHECK(e->pending_txn_frames[i].page != NULL); } return res; } if (sm_state(sm) == WTX_POLLED) { return CHECK(wal_index_basic_hdr_equal(get_full_hdr(e)->basic[0], e->prev_txn_hdr)) && CHECK(wal_index_basic_hdr_advanced(e->pending_txn_hdr, e->prev_txn_hdr)) && CHECK(write_lock_held(e)) && CHECK(e->pending_txn_frames == NULL); } assert(0); } static int check_wal_integrity(sqlite3_file *f) { /* TODO */ (void)f; return SQLITE_OK; } /* sqlite3_io_methods implementations begin here */ static sqlite3_file *get_orig(struct file *f) { return (f->flags & SQLITE_OPEN_WAL) ? 
f->entry->wal_cur : f->orig; } static void maybe_close_entry(struct entry *e) { if (e->refcount_main_db > 0 || e->refcount_wal > 0) { return; } sqlite3_free(e->main_db_name); sqlite3_free(e->wal_moving_name); sqlite3_free(e->wal_cur_fixed_name); if (e->wal_cur->pMethods != NULL) { e->wal_cur->pMethods->xClose(e->wal_cur); } sqlite3_free(e->wal_cur); sqlite3_free(e->wal_prev_fixed_name); if (e->wal_prev->pMethods != NULL) { e->wal_prev->pMethods->xClose(e->wal_prev); } sqlite3_free(e->wal_prev); free_pending_txn(e); pthread_rwlock_wrlock(&e->common->rwlock); queue_remove(&e->link); pthread_rwlock_unlock(&e->common->rwlock); sqlite3_free(e); } static int vfs2_close(sqlite3_file *file) { struct file *xfile = (struct file *)file; int rv; rv = SQLITE_OK; if (xfile->flags & SQLITE_OPEN_MAIN_DB) { if (xfile->orig->pMethods != NULL) { rv = xfile->orig->pMethods->xClose(xfile->orig); } sqlite3_free(xfile->orig); xfile->entry->refcount_main_db -= 1; maybe_close_entry(xfile->entry); } else if (xfile->flags & SQLITE_OPEN_WAL) { xfile->entry->refcount_wal -= 1; maybe_close_entry(xfile->entry); } else if (xfile->orig->pMethods != NULL) { rv = xfile->orig->pMethods->xClose(xfile->orig); sqlite3_free(xfile->orig); } return rv; } static int vfs2_read(sqlite3_file *file, void *buf, int amt, sqlite3_int64 ofst) { struct file *xfile = (struct file *)file; sqlite3_file *orig = get_orig(xfile); return orig->pMethods->xRead(orig, buf, amt, ofst); } static int wal_swap(struct entry *e, const struct wal_hdr *wal_hdr) { PRE(e->pending_txn_len == 0); PRE(e->pending_txn_frames == NULL); int rv; e->page_size = ByteGetBe32(wal_hdr->page_size); /* Terminology: the outgoing WAL is the one that's moving * from cur to prev. The incoming WAL is the one that's moving * from prev to cur. */ sqlite3_file *phys_outgoing = e->wal_cur; char *name_outgoing = e->wal_cur_fixed_name; sqlite3_file *phys_incoming = e->wal_prev; char *name_incoming = e->wal_prev_fixed_name; tracef("wal swap outgoing=%s incoming=%s", name_outgoing, name_incoming); /* Write the new header of the incoming WAL. */ rv = phys_incoming->pMethods->xWrite(phys_incoming, wal_hdr, sizeof(struct wal_hdr), 0); if (rv != SQLITE_OK) { return rv; } /* In-memory WAL swap. */ e->wal_cur = phys_incoming; e->wal_cur_fixed_name = name_incoming; e->wal_prev = phys_outgoing; e->wal_prev_fixed_name = name_outgoing; e->wal_cursor = 0; e->wal_prev_hdr = e->wal_cur_hdr; e->wal_cur_hdr = *wal_hdr; /* Move the moving name. */ rv = unlink(e->wal_moving_name); if (rv != 0 && errno != ENOENT) { tracef("unlink = IOERR"); return SQLITE_IOERR; } rv = link(name_incoming, e->wal_moving_name); if (rv != 0) { tracef("link = IOERR"); return SQLITE_IOERR; } /* TODO do we need an fsync here? */ /* Best-effort: invalidate the outgoing physical WAL so that nobody gets * confused. 
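* Only the start of the header (the magic bytes) is overwritten, so any
* code that validates the magic before trusting a WAL will reject the
* old log instead of replaying its stale frames. The return value is
* deliberately ignored: this is best-effort hygiene, not something that
* correctness depends on.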
*/ (void)phys_outgoing->pMethods->xWrite(phys_outgoing, &invalid_magic, sizeof(invalid_magic), 0); return SQLITE_OK; } static int vfs2_wal_write_frame_hdr(struct entry *e, const struct wal_frame_hdr *fhdr, uint32_t x) { struct vfs2_wal_frame *frames = e->pending_txn_frames; if (no_pending_txn(e)) { assert(x == e->wal_cursor); e->pending_txn_start = x; } uint32_t n = e->pending_txn_len; tracef("orig=%u start=%u n=%u", x, e->pending_txn_start, n); x -= e->pending_txn_start; assert(x <= n); if (e->pending_txn_len == 0 && x == 0) { /* check that the WAL-index hdr makes sense and save it */ struct wal_index_basic_hdr hdr = get_full_hdr(e)->basic[0]; assert(hdr.isInit == 1); assert(hdr.mxFrame == e->pending_txn_start); e->prev_txn_hdr = hdr; } if (x == n) { /* FIXME reallocating every time seems bad */ sqlite3_uint64 z = (sqlite3_uint64)sizeof(*frames) * (sqlite3_uint64)(n + 1); e->pending_txn_frames = sqlite3_realloc64(frames, z); if (e->pending_txn_frames == NULL) { return SQLITE_NOMEM; } struct vfs2_wal_frame *frame = &e->pending_txn_frames[n]; uint32_t commit = ByteGetBe32(fhdr->commit); frame->page_number = ByteGetBe32(fhdr->page_number); frame->commit = commit; frame->page = NULL; e->pending_txn_last_frame_commit = commit; e->pending_txn_len++; } else { /* Overwriting a previously-written frame in the current * transaction. */ struct vfs2_wal_frame *frame = &e->pending_txn_frames[x]; frame->page_number = ByteGetBe32(fhdr->page_number); frame->commit = ByteGetBe32(fhdr->commit); sqlite3_free(frame->page); frame->page = NULL; } sm_move(&e->wtx_sm, WTX_ACTIVE); return SQLITE_OK; } static int vfs2_wal_post_write(struct entry *e, const void *buf, int amt, sqlite3_int64 ofst) { uint32_t frame_size = VFS2_WAL_FRAME_HDR_SIZE + e->page_size; if (amt == VFS2_WAL_FRAME_HDR_SIZE) { ofst -= (sqlite3_int64)sizeof(struct wal_hdr); assert(ofst % frame_size == 0); sqlite3_int64 frame_ofst = ofst / (sqlite3_int64)frame_size; return vfs2_wal_write_frame_hdr(e, buf, (uint32_t)frame_ofst); } else if (amt == (int)e->page_size) { sqlite3_int64 x = ofst - VFS2_WAL_FRAME_HDR_SIZE - (sqlite3_int64)sizeof(struct wal_hdr); assert(x % frame_size == 0); x /= frame_size; x -= e->pending_txn_start; assert(0 <= x && x < e->pending_txn_len); struct vfs2_wal_frame *frame = &e->pending_txn_frames[x]; assert(frame->page == NULL); frame->page = sqlite3_malloc(amt); if (frame->page == NULL) { return SQLITE_NOMEM; } memcpy(frame->page, buf, (size_t)amt); sm_move(&e->wtx_sm, WTX_ACTIVE); return SQLITE_OK; } else { assert(0); } } static int vfs2_write(sqlite3_file *file, const void *buf, int amt, sqlite3_int64 ofst) { struct file *xfile = (struct file *)file; int rv; if ((xfile->flags & SQLITE_OPEN_WAL) && ofst == 0) { assert(amt == sizeof(struct wal_hdr)); const struct wal_hdr *hdr = buf; struct entry *e = xfile->entry; tracef("about to wal swap"); rv = wal_swap(e, hdr); if (rv != SQLITE_OK) { return rv; } /* check that the WAL-index hdr makes sense and save it */ struct wal_index_basic_hdr ihdr = get_full_hdr(e)->basic[0]; assert(ihdr.isInit == 1); assert(ihdr.mxFrame == 0); e->prev_txn_hdr = ihdr; sm_move(&e->wtx_sm, WTX_ACTIVE); return SQLITE_OK; } sqlite3_file *orig = get_orig(xfile); rv = orig->pMethods->xWrite(orig, buf, amt, ofst); if (rv != SQLITE_OK) { return rv; } if (xfile->flags & SQLITE_OPEN_WAL) { struct entry *e = xfile->entry; tracef("wrote to WAL name=%s amt=%d ofst=%lld", e->wal_cur_fixed_name, amt, ofst); return vfs2_wal_post_write(e, buf, amt, ofst); } return SQLITE_OK; } static int vfs2_truncate(sqlite3_file 
*file, sqlite3_int64 size) { struct file *xfile = (struct file *)file; sqlite3_file *orig = get_orig(xfile); return orig->pMethods->xTruncate(orig, size); } static int vfs2_sync(sqlite3_file *file, int flags) { struct file *xfile = (struct file *)file; sqlite3_file *orig = get_orig(xfile); return orig->pMethods->xSync(orig, flags); } static int vfs2_file_size(sqlite3_file *file, sqlite3_int64 *size) { struct file *xfile = (struct file *)file; sqlite3_file *orig = get_orig(xfile); return orig->pMethods->xFileSize(orig, size); } static int vfs2_lock(sqlite3_file *file, int mode) { struct file *xfile = (struct file *)file; sqlite3_file *orig = get_orig(xfile); return orig->pMethods->xLock(orig, mode); } static int vfs2_unlock(sqlite3_file *file, int mode) { struct file *xfile = (struct file *)file; sqlite3_file *orig = get_orig(xfile); return orig->pMethods->xUnlock(orig, mode); } static int vfs2_check_reserved_lock(sqlite3_file *file, int *out) { struct file *xfile = (struct file *)file; sqlite3_file *orig = get_orig(xfile); return orig->pMethods->xCheckReservedLock(orig, out); } static int interpret_pragma(char **args) { char **e = &args[0]; char *left = args[1]; PRE(left != NULL); char *right = args[2]; if (strcmp(left, "journal_mode") == 0 && right != NULL && strcasecmp(right, "wal") != 0) { *e = sqlite3_mprintf("dqlite requires WAL mode"); return SQLITE_ERROR; } return SQLITE_NOTFOUND; } static int vfs2_file_control(sqlite3_file *file, int op, void *arg) { struct file *xfile = (struct file *)file; PRE(xfile->flags & SQLITE_OPEN_MAIN_DB); struct entry *e = xfile->entry; int rv; if (op == SQLITE_FCNTL_COMMIT_PHASETWO && e->pending_txn_len != 0) { /* Hide the transaction that was just written by resetting * the WAL index header. */ struct wal_index_full_hdr *hdr = get_full_hdr(e); e->pending_txn_hdr = hdr->basic[0]; hdr->basic[0] = e->prev_txn_hdr; hdr->basic[1] = hdr->basic[0]; e->wal_cursor += e->pending_txn_len; sm_move(&xfile->entry->wtx_sm, WTX_HIDDEN); } else if (op == SQLITE_FCNTL_PRAGMA) { rv = interpret_pragma(arg); if (rv != SQLITE_NOTFOUND) { return rv; } } else if (op == SQLITE_FCNTL_PERSIST_WAL) { /* TODO handle setting as well as getting (?) 
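* For now only the query form is answered, and the answer is always
* "yes": the fixed-name physical WALs are created and retired by this
* VFS itself (see wal_swap), so from SQLite's point of view the WAL is
* effectively always persistent.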
*/ int *out = arg; *out = 1; return SQLITE_OK; } sqlite3_file *orig = get_orig(xfile); rv = orig->pMethods->xFileControl(orig, op, arg); return rv; } static int vfs2_sector_size(sqlite3_file *file) { struct file *xfile = (struct file *)file; sqlite3_file *orig = get_orig(xfile); return orig->pMethods->xSectorSize(orig); } static int vfs2_device_characteristics(sqlite3_file *file) { struct file *xfile = (struct file *)file; sqlite3_file *orig = get_orig(xfile); return orig->pMethods->xDeviceCharacteristics(orig); } static int vfs2_fetch(sqlite3_file *file, sqlite3_int64 ofst, int amt, void **out) { struct file *xfile = (struct file *)file; sqlite3_file *orig = get_orig(xfile); return orig->pMethods->xFetch(orig, ofst, amt, out); } static int vfs2_unfetch(sqlite3_file *file, sqlite3_int64 ofst, void *buf) { struct file *xfile = (struct file *)file; sqlite3_file *orig = get_orig(xfile); return orig->pMethods->xUnfetch(orig, ofst, buf); } static int vfs2_shm_map(sqlite3_file *file, int regno, int regsz, int extend, void volatile **out) { struct file *xfile = (struct file *)file; struct entry *e = xfile->entry; void *region; int rv; if (e->shm_regions != NULL && regno < e->shm_regions_len) { region = e->shm_regions[regno]; assert(region != NULL); } else if (extend != 0) { assert(regno == e->shm_regions_len); region = sqlite3_malloc(regsz); if (region == NULL) { rv = SQLITE_NOMEM; goto err; } memset(region, 0, (size_t)regsz); /* FIXME reallocating every time seems bad */ sqlite3_uint64 z = (sqlite3_uint64)sizeof(*e->shm_regions) * (sqlite3_uint64)(e->shm_regions_len + 1); void **regions = sqlite3_realloc64(e->shm_regions, z); if (regions == NULL) { rv = SQLITE_NOMEM; goto err_after_region_malloc; } e->shm_regions = regions; e->shm_regions[regno] = region; e->shm_regions_len++; } else { region = NULL; } *out = region; if (regno == 0 && region != NULL) { e->shm_refcount++; } return SQLITE_OK; err_after_region_malloc: sqlite3_free(region); err: assert(rv != SQLITE_OK); *out = NULL; return rv; } static __attribute__((noinline)) int busy(void) { return SQLITE_BUSY; } static int vfs2_shm_lock(sqlite3_file *file, int ofst, int n, int flags) { struct file *xfile = (struct file *)file; PRE(xfile != NULL); struct entry *e = xfile->entry; assert(ofst >= 0 && ofst + n <= SQLITE_SHM_NLOCK); assert(n >= 1); assert(n == 1 || (flags & SQLITE_SHM_EXCLUSIVE) != 0); assert(flags == (SQLITE_SHM_LOCK | SQLITE_SHM_SHARED) || flags == (SQLITE_SHM_LOCK | SQLITE_SHM_EXCLUSIVE) || flags == (SQLITE_SHM_UNLOCK | SQLITE_SHM_SHARED) || flags == (SQLITE_SHM_UNLOCK | SQLITE_SHM_EXCLUSIVE)); assert(xfile->flags & SQLITE_OPEN_MAIN_DB); if (flags == (SQLITE_SHM_LOCK | SQLITE_SHM_SHARED)) { for (int i = ofst; i < ofst + n; i++) { if (e->shm_locks[i] == VFS2_EXCLUSIVE) { return busy(); } } for (int i = ofst; i < ofst + n; i++) { e->shm_locks[i]++; } } else if (flags == (SQLITE_SHM_LOCK | SQLITE_SHM_EXCLUSIVE)) { for (int i = ofst; i < ofst + n; i++) { if (e->shm_locks[i] > 0) { return busy(); } } for (int i = ofst; i < ofst + n; i++) { e->shm_locks[i] = VFS2_EXCLUSIVE; } /* XXX maybe this shouldn't be an assertion */ if (ofst == WAL_WRITE_LOCK) { assert(n == 1); assert(e->pending_txn_len == 0); } } else if (flags == (SQLITE_SHM_UNLOCK | SQLITE_SHM_SHARED)) { for (int i = ofst; i < ofst + n; i++) { assert(e->shm_locks[i] > 0); e->shm_locks[i]--; } } else if (flags == (SQLITE_SHM_UNLOCK | SQLITE_SHM_EXCLUSIVE)) { for (int i = ofst; i < ofst + n; i++) { assert(e->shm_locks[i] == VFS2_EXCLUSIVE); e->shm_locks[i] = 0; } if (ofst <= 
WAL_RECOVER_LOCK && WAL_RECOVER_LOCK < ofst + n) { tracef("unlocking the recovery lock!"); } if (ofst == WAL_WRITE_LOCK) { /* Unlocking the write lock: roll back any uncommitted * transaction. */ assert(n == 1); tracef("unlocking write lock"); /* TODO make sure this is correct */ if (e->pending_txn_last_frame_commit == 0) { free_pending_txn(e); sm_move(&e->wtx_sm, WTX_BASE); } } else if (ofst == WAL_CKPT_LOCK && n == 1) { /* End of a checkpoint: if all frames have been backfilled, * move to EMPTY. */ assert(n == 1); struct wal_index_full_hdr *ihdr = get_full_hdr(e); if (ihdr->nBackfill == ihdr->basic[0].mxFrame) { sm_move(&e->wtx_sm, WTX_EMPTY); } } /* else if (ofst <= WAL_RECOVER_LOCK && WAL_RECOVER_LOCK < ofst + n) { sm_move(&e->wtx_sm, WTX_BASE); } */ } else { assert(0); } return SQLITE_OK; } static void vfs2_shm_barrier(sqlite3_file *file) { (void)file; } static int vfs2_shm_unmap(sqlite3_file *file, int delete) { (void)delete; struct file *xfile = (struct file *)file; struct entry *e = xfile->entry; e->shm_refcount--; if (e->shm_refcount == 0) { for (int i = 0; i < e->shm_regions_len; i++) { void *region = e->shm_regions[i]; assert(region != NULL); sqlite3_free(region); } sqlite3_free(e->shm_regions); e->shm_regions = NULL; e->shm_regions_len = 0; memset(e->shm_locks, 0, sizeof(e->shm_locks)); } return SQLITE_OK; } static struct sqlite3_io_methods vfs2_io_methods = { .iVersion = 3, .xClose = vfs2_close, .xRead = vfs2_read, .xWrite = vfs2_write, .xTruncate = vfs2_truncate, .xSync = vfs2_sync, .xFileSize = vfs2_file_size, .xLock = vfs2_lock, .xUnlock = vfs2_unlock, .xCheckReservedLock = vfs2_check_reserved_lock, .xFileControl = vfs2_file_control, .xSectorSize = vfs2_sector_size, .xDeviceCharacteristics = vfs2_device_characteristics, .xShmMap = vfs2_shm_map, .xShmLock = vfs2_shm_lock, .xShmBarrier = vfs2_shm_barrier, .xShmUnmap = vfs2_shm_unmap, .xFetch = vfs2_fetch, .xUnfetch = vfs2_unfetch }; static int compare_wal_headers(struct wal_hdr a, struct wal_hdr b, bool *ordered) { if (get_salt1(a.salts) == get_salt1(b.salts) + 1) { *ordered = true; } else if (get_salt1(b.salts) == get_salt1(a.salts) + 1) { *ordered = false; } else { return SQLITE_ERROR; } return SQLITE_OK; } static int read_wal_hdr(sqlite3_file *wal, sqlite3_int64 *size, struct wal_hdr *hdr) { int rv; rv = wal->pMethods->xFileSize(wal, size); if (rv != SQLITE_OK) { return rv; } if (*size >= (sqlite3_int64)sizeof(struct wal_hdr)) { rv = wal->pMethods->xRead(wal, hdr, sizeof(*hdr), 0); if (rv != SQLITE_OK) { return rv; } } else { *hdr = (struct wal_hdr){}; } return SQLITE_OK; } static struct wal_index_full_hdr initial_full_hdr(struct wal_hdr whdr) { struct wal_index_full_hdr ihdr = {}; ihdr.basic[0].iVersion = 3007000; ihdr.basic[0].isInit = 1; ihdr.basic[0].bigEndCksum = is_bigendian(); ihdr.basic[0].szPage = (uint16_t)ByteGetBe32(whdr.page_size); struct cksums sums = {}; update_cksums(native_magic(), (const void *)&ihdr.basic[0], offsetof(struct wal_index_basic_hdr, cksums), &sums); ihdr.basic[0].cksums = sums; ihdr.basic[1] = ihdr.basic[0]; ihdr.marks[0] = 0; ihdr.marks[1] = 0; ihdr.marks[2] = READ_MARK_UNUSED; ihdr.marks[3] = READ_MARK_UNUSED; ihdr.marks[4] = READ_MARK_UNUSED; return ihdr; } static void set_mx_frame(struct wal_index_full_hdr *ihdr, uint32_t mx, struct wal_frame_hdr fhdr) { uint32_t num_pages = ByteGetBe32(fhdr.commit); PRE(num_pages > 0); ihdr->basic[0].iChange += 1; ihdr->basic[0].mxFrame = mx; ihdr->basic[0].nPage = num_pages; /* XXX byte order */ ihdr->basic[0].frame_cksums.cksum1 = 
ByteGetBe32(fhdr.cksum1); ihdr->basic[0].frame_cksums.cksum2 = ByteGetBe32(fhdr.cksum2); struct cksums sums = {}; update_cksums(native_magic(), (const void *)&ihdr->basic[0], 40, &sums); ihdr->basic[0].cksums = sums; ihdr->basic[1] = ihdr->basic[0]; } static void restart_full_hdr(struct wal_index_full_hdr *ihdr, struct wal_hdr new_whdr) { /* cf. walRestartHdr */ ihdr->basic[0].mxFrame = 0; ihdr->basic[0].salts = new_whdr.salts; struct cksums sums = {}; update_cksums(native_magic(), (const void *)&ihdr->basic[0], 40, &sums); ihdr->basic[0].cksums = sums; ihdr->basic[1] = ihdr->basic[0]; ihdr->nBackfill = 0; ihdr->nBackfillAttempted = 0; } static uint32_t wal_cursor_from_size(uint32_t page_size, sqlite3_int64 size) { sqlite3_int64 whdr_size = (sqlite3_int64)sizeof(struct wal_hdr); if (size < whdr_size) { return 0; } sqlite3_int64 x = (size - whdr_size) / ((sqlite3_int64)sizeof(struct wal_frame_hdr) + (sqlite3_int64)page_size); return (uint32_t)x; } static sqlite3_int64 wal_offset_from_cursor(uint32_t page_size, uint32_t cursor) { return (sqlite3_int64)sizeof(struct wal_hdr) + (sqlite3_int64)cursor * ((sqlite3_int64)sizeof(struct wal_frame_hdr) + (sqlite3_int64)page_size); } static int open_entry(struct common *common, const char *name, struct entry *e) { sqlite3_vfs *v = common->orig; int path_cap = v->mxPathname + 1; int file_cap = v->szOsFile; int rv; *e = (struct entry){}; e->common = common; sm_init(&e->wtx_sm, wtx_invariant, NULL, wtx_states, "wtx", WTX_CLOSED); e->refcount_main_db = 1; e->main_db_name = sqlite3_malloc(path_cap); e->wal_moving_name = sqlite3_malloc(path_cap); e->wal_cur_fixed_name = sqlite3_malloc(path_cap); e->wal_prev_fixed_name = sqlite3_malloc(path_cap); if (e->main_db_name == NULL || e->wal_moving_name == NULL || e->wal_cur_fixed_name == NULL || e->wal_prev_fixed_name == NULL) { return SQLITE_NOMEM; } strcpy(e->main_db_name, name); strcpy(e->wal_moving_name, name); strcat(e->wal_moving_name, "-wal"); strcpy(e->wal_cur_fixed_name, name); strcat(e->wal_cur_fixed_name, "-xwal1"); strcpy(e->wal_prev_fixed_name, name); strcat(e->wal_prev_fixed_name, "-xwal2"); /* TODO EXRESCODE? 
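* (Presumably: whether to add SQLITE_OPEN_EXRESCODE so that these opens
* report extended result codes.) The flags below make the wrapped VFS
* create the fixed-name WALs on demand, and SQLITE_OPEN_NOFOLLOW keeps
* it from opening them through a symlink.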
*/ int phys_wal_flags = SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE|SQLITE_OPEN_WAL|SQLITE_OPEN_NOFOLLOW; e->wal_cur = sqlite3_malloc(file_cap); if (e->wal_cur == NULL) { return SQLITE_NOMEM; } rv = v->xOpen(v, e->wal_cur_fixed_name, e->wal_cur, phys_wal_flags, NULL); if (rv != SQLITE_OK) { return rv; } e->wal_prev = sqlite3_malloc(file_cap); if (e->wal_prev == NULL) { return SQLITE_NOMEM; } rv = v->xOpen(v, e->wal_prev_fixed_name, e->wal_prev, phys_wal_flags, NULL); if (rv != SQLITE_OK) { return rv; } sqlite3_int64 size1; struct wal_hdr hdr1; rv = read_wal_hdr(e->wal_cur, &size1, &hdr1); if (rv != SQLITE_OK) { return rv; } sqlite3_int64 size2; struct wal_hdr hdr2; rv = read_wal_hdr(e->wal_prev, &size2, &hdr2); if (rv != SQLITE_OK) { return rv; } struct wal_hdr hdr_cur = hdr1; sqlite3_int64 size_cur = size1; struct wal_hdr hdr_prev = hdr2; bool wal1_is_fresh; if (size2 < (sqlite3_int64)sizeof(struct wal_hdr)) { wal1_is_fresh = true; } else if (size1 < (sqlite3_int64)sizeof(struct wal_hdr)) { wal1_is_fresh = false; } else { rv = compare_wal_headers(hdr1, hdr2, &wal1_is_fresh); if (rv != SQLITE_OK) { return rv; } } if (!wal1_is_fresh) { void *temp; temp = e->wal_cur; e->wal_cur = e->wal_prev; e->wal_prev = temp; temp = e->wal_cur_fixed_name; e->wal_cur_fixed_name = e->wal_prev_fixed_name; e->wal_prev_fixed_name = temp; hdr_cur = hdr2; size_cur = size2; hdr_prev = hdr1; } e->wal_cur_hdr = hdr_cur; e->wal_prev_hdr = hdr_prev; rv = unlink(e->wal_moving_name); (void)rv; rv = link(e->wal_cur_fixed_name, e->wal_moving_name); (void)rv; e->shm_regions = sqlite3_malloc(sizeof(void *[1])); if (e->shm_regions == NULL) { return SQLITE_NOMEM; } e->shm_regions[0] = sqlite3_malloc(VFS2_WAL_INDEX_REGION_SIZE); if (e->shm_regions[0] == NULL) { return SQLITE_NOMEM; } memset(e->shm_regions[0], 0, VFS2_WAL_INDEX_REGION_SIZE); e->shm_regions_len = 1; *get_full_hdr(e) = initial_full_hdr(hdr_cur); e->wal_cursor = wal_cursor_from_size(e->page_size, size_cur); int next = WTX_EMPTY; if (size_cur >= wal_offset_from_cursor(0 /* this doesn't matter */, 0)) { /* TODO verify the header here */ e->page_size = ByteGetBe32(hdr_cur.page_size); next = WTX_FLUSH; } if (size_cur >= wal_offset_from_cursor(e->page_size, 1)) { e->shm_locks[WAL_WRITE_LOCK] = VFS2_EXCLUSIVE; next = WTX_FOLLOWING; } sm_move(&e->wtx_sm, next); return SQLITE_OK; } static int set_up_entry(struct common *common, const char *name, int flags, struct entry **e) { bool name_is_db = (flags & SQLITE_OPEN_MAIN_DB) != 0; bool name_is_wal = (flags & SQLITE_OPEN_WAL) != 0; assert(name_is_db ^ name_is_wal); int rv; struct entry *res = NULL; pthread_rwlock_rdlock(&common->rwlock); queue *q; QUEUE_FOREACH(q, &common->queue) { struct entry *cur = QUEUE_DATA(q, struct entry, link); if ((name_is_db && strcmp(cur->main_db_name, name) == 0) || (name_is_wal && strcmp(cur->wal_moving_name, name) == 0)) { res = cur; break; } } pthread_rwlock_unlock(&common->rwlock); if (res != NULL) { sqlite3_free(*e); *e = res; unsigned *refcount = name_is_db ? &res->refcount_main_db : &res->refcount_wal; *refcount += 1; return SQLITE_OK; } assert(name_is_db); res = *e; /* If open_entry fails we still want to link in the entry. Since we unconditionally * set pMethods in our file vtable, SQLite will xClose the file and vfs2_close * will run to clean up the partial work of open_entry. 
*/ rv = open_entry(common, name, res); pthread_rwlock_wrlock(&common->rwlock); queue_insert_tail(&common->queue, &res->link); pthread_rwlock_unlock(&common->rwlock); return rv; } static int vfs2_open(sqlite3_vfs *vfs, const char *name, sqlite3_file *out, int flags, int *out_flags) { struct file *xout = (struct file *)out; struct common *common = vfs->pAppData; int rv; *xout = (struct file){}; xout->base.pMethods = &vfs2_io_methods; xout->flags = flags; if ((flags & SQLITE_OPEN_WAL) == 0) { sqlite3_vfs *v = common->orig; xout->orig = sqlite3_malloc(v->szOsFile); if (xout->orig == NULL) { return SQLITE_NOMEM; } rv = v->xOpen(v, name, xout->orig, flags, out_flags); if (rv != SQLITE_OK) { return rv; } } if (flags & (SQLITE_OPEN_MAIN_DB|SQLITE_OPEN_WAL)) { xout->entry = sqlite3_malloc(sizeof(*xout->entry)); if (xout->entry == NULL) { return SQLITE_NOMEM; } rv = set_up_entry(common, name, flags, &xout->entry); if (rv != SQLITE_OK) { return rv; } } if ((flags & SQLITE_OPEN_WAL) && out_flags != NULL) { *out_flags = flags; } return SQLITE_OK; } /* TODO does this need to be customized? should it ever be called on one of our files? */ static int vfs2_delete(sqlite3_vfs *vfs, const char *name, int sync_dir) { struct common *data = vfs->pAppData; return data->orig->xDelete(data->orig, name, sync_dir); } static int vfs2_access(sqlite3_vfs *vfs, const char *name, int flags, int *out) { /* TODO always report that the WAL exists (?) */ /* TODO other customizations? */ struct common *data = vfs->pAppData; return data->orig->xAccess(data->orig, name, flags, out); } static int vfs2_full_pathname(sqlite3_vfs *vfs, const char *name, int n, char *out) { struct common *data = vfs->pAppData; return data->orig->xFullPathname(data->orig, name, n, out); } static void *vfs2_dl_open(sqlite3_vfs *vfs, const char *filename) { struct common *data = vfs->pAppData; return data->orig->xDlOpen(data->orig, filename); } static void vfs2_dl_error(sqlite3_vfs *vfs, int n, char *msg) { struct common *data = vfs->pAppData; return data->orig->xDlError(data->orig, n, msg); } typedef void (*vfs2_sym)(void); static vfs2_sym vfs2_dl_sym(sqlite3_vfs *vfs, void *dl, const char *symbol) { struct common *data = vfs->pAppData; return data->orig->xDlSym(data->orig, dl, symbol); } static void vfs2_dl_close(sqlite3_vfs *vfs, void *dl) { struct common *data = vfs->pAppData; return data->orig->xDlClose(data->orig, dl); } static int vfs2_randomness(sqlite3_vfs *vfs, int n, char *out) { struct common *data = vfs->pAppData; return data->orig->xRandomness(data->orig, n, out); } static int vfs2_sleep(sqlite3_vfs *vfs, int microseconds) { struct common *data = vfs->pAppData; return data->orig->xSleep(data->orig, microseconds); } static int vfs2_current_time(sqlite3_vfs *vfs, double *out) { struct common *data = vfs->pAppData; return data->orig->xCurrentTime(data->orig, out); } /* TODO update this to reflect syscalls that we make ourselves (not through the * base VFS -- store last error in vfs [thread-local?]) */ static int vfs2_get_last_error(sqlite3_vfs *vfs, int n, char *out) { struct common *data = vfs->pAppData; return data->orig->xGetLastError(data->orig, n, out); } static int vfs2_current_time_int64(sqlite3_vfs *vfs, sqlite3_int64 *out) { struct common *data = vfs->pAppData; if (data->orig->iVersion < 2) { return SQLITE_ERROR; } return data->orig->xCurrentTimeInt64(data->orig, out); } /* sqlite3_vfs implementations end here */ sqlite3_vfs *vfs2_make(sqlite3_vfs *orig, const char *name) { struct common *common = sqlite3_malloc(sizeof(*common)); 
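/* `common` holds the state shared by every file opened through this
 * VFS: the wrapped VFS pointer plus the queue of entries that
 * set_up_entry searches, guarded by an rwlock. */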
struct sqlite3_vfs *vfs = sqlite3_malloc(sizeof(*vfs)); if (common == NULL || vfs == NULL) { return NULL; } common->orig = orig; pthread_rwlock_init(&common->rwlock, NULL); queue_init(&common->queue); vfs->iVersion = 2; vfs->szOsFile = sizeof(struct file); vfs->mxPathname = orig->mxPathname; vfs->zName = name; vfs->pAppData = common; vfs->xOpen = vfs2_open; vfs->xDelete = vfs2_delete; vfs->xAccess = vfs2_access; vfs->xFullPathname = vfs2_full_pathname; vfs->xDlOpen = vfs2_dl_open; vfs->xDlError = vfs2_dl_error; vfs->xDlSym = vfs2_dl_sym; vfs->xDlClose = vfs2_dl_close; vfs->xRandomness = vfs2_randomness; vfs->xSleep = vfs2_sleep; vfs->xCurrentTime = vfs2_current_time; vfs->xGetLastError = vfs2_get_last_error; vfs->xCurrentTimeInt64 = vfs2_current_time_int64; return vfs; } int vfs2_unapply(sqlite3_file *file, struct vfs2_wal_slice first_to_unapply) { struct file *xfile = (struct file *)file; PRE(xfile->flags & SQLITE_OPEN_MAIN_DB); struct entry *e = xfile->entry; PRE(salts_equal(first_to_unapply.salts, e->wal_cur_hdr.salts)); PRE(first_to_unapply.start + first_to_unapply.len <= e->wal_cursor); struct wal_index_full_hdr *ihdr = get_full_hdr(e); PRE(first_to_unapply.start >= ihdr->basic[0].mxFrame); PRE(e->shm_locks[WAL_WRITE_LOCK] == VFS2_EXCLUSIVE); e->wal_cursor = first_to_unapply.start; if (e->wal_cursor == ihdr->basic[0].mxFrame) { e->shm_locks[WAL_WRITE_LOCK] = 0; sm_move(&e->wtx_sm, WTX_FLUSH); } else { sm_move(&e->wtx_sm, WTX_FOLLOWING); } return SQLITE_OK; } int vfs2_unhide(sqlite3_file *file) { struct file *xfile = (struct file *)file; PRE(xfile->flags & SQLITE_OPEN_MAIN_DB); struct entry *e = xfile->entry; PRE(e->shm_locks[WAL_WRITE_LOCK] == VFS2_EXCLUSIVE); e->shm_locks[WAL_WRITE_LOCK] = 0; struct wal_index_full_hdr *hdr = get_full_hdr(e); hdr->basic[0] = e->pending_txn_hdr; hdr->basic[1] = e->pending_txn_hdr; e->prev_txn_hdr = e->pending_txn_hdr; e->pending_txn_hdr = (struct wal_index_basic_hdr){}; e->pending_txn_len = 0; e->pending_txn_last_frame_commit = 0; sm_move(&xfile->entry->wtx_sm, WTX_BASE); return 0; } int vfs2_commit(sqlite3_file *file, struct vfs2_wal_slice stop) { struct file *xfile = (struct file *)file; PRE(xfile->flags & SQLITE_OPEN_MAIN_DB); struct entry *e = xfile->entry; uint32_t commit = stop.start + stop.len; PRE(e->wal_cursor >= commit); PRE(salts_equal(stop.salts, e->wal_cur_hdr.salts)); PRE(e->shm_locks[WAL_WRITE_LOCK] == VFS2_EXCLUSIVE); sqlite3_file *wal_cur = e->wal_cur; struct wal_frame_hdr fhdr; int rv = wal_cur->pMethods->xRead(wal_cur, &fhdr, sizeof(fhdr), wal_offset_from_cursor(e->page_size, stop.start + stop.len - 1)); if (rv != SQLITE_OK) { return rv; } set_mx_frame(get_full_hdr(e), commit, fhdr); if (commit == e->wal_cursor) { e->shm_locks[WAL_WRITE_LOCK] = 0; sm_move(&e->wtx_sm, WTX_FLUSH); } else { sm_move(&e->wtx_sm, WTX_FOLLOWING); } return 0; } int vfs2_commit_barrier(sqlite3_file *file) { struct file *xfile = (struct file *)file; PRE(xfile->flags & SQLITE_OPEN_MAIN_DB); struct entry *e = xfile->entry; if (e->wal_cursor > 0) { sqlite3_file *wal_cur = e->wal_cur; struct wal_frame_hdr fhdr; int rv = wal_cur->pMethods->xRead(wal_cur, &fhdr, sizeof(fhdr), wal_offset_from_cursor(e->page_size, e->wal_cursor - 1)); if (rv != SQLITE_OK) { return rv; } set_mx_frame(get_full_hdr(e), e->wal_cursor, fhdr); /* It's okay if the write lock isn't held */ e->shm_locks[WAL_WRITE_LOCK] = 0; get_full_hdr(e)->basic[0].isInit = 0; /* The next transaction will cause SQLite to run recovery which will complete the transition to BASE */ sm_move(&e->wtx_sm,
WTX_FLUSH); } return 0; } int vfs2_poll(sqlite3_file *file, struct vfs2_wal_frame **frames, unsigned *n, struct vfs2_wal_slice *sl) { struct file *xfile = (struct file *)file; PRE(xfile->flags & SQLITE_OPEN_MAIN_DB); struct entry *e = xfile->entry; uint32_t len = e->pending_txn_len; if (len > 0) { /* Don't go through vfs2_shm_lock here since that has additional * checks that assume the context of being called from inside * SQLite. */ if (e->shm_locks[WAL_WRITE_LOCK] > 0) { return 1; } e->shm_locks[WAL_WRITE_LOCK] = VFS2_EXCLUSIVE; } /* Note, not resetting pending_txn_{start,len} because they are used by later states */ if (n != NULL && frames != NULL) { *n = len; *frames = e->pending_txn_frames; } else { for (uint32_t i = 0; i < e->pending_txn_len; i++) { sqlite3_free(e->pending_txn_frames[i].page); } sqlite3_free(e->pending_txn_frames); } e->pending_txn_frames = NULL; if (sl != NULL) { sl->len = len; sl->salts = e->pending_txn_hdr.salts; sl->start = e->prev_txn_hdr.mxFrame; } sm_move(&xfile->entry->wtx_sm, WTX_POLLED); return 0; } void vfs2_destroy(sqlite3_vfs *vfs) { struct common *data = vfs->pAppData; pthread_rwlock_destroy(&data->rwlock); sqlite3_free(data); sqlite3_free(vfs); } int vfs2_abort(sqlite3_file *file) { /* TODO maybe can "followerize" this and get rid of vfs2_unapply_after? */ struct file *xfile = (struct file *)file; PRE(xfile->flags & SQLITE_OPEN_MAIN_DB); struct entry *e = xfile->entry; e->shm_locks[WAL_WRITE_LOCK] = 0; struct wal_index_full_hdr *hdr = get_full_hdr(e); hdr->basic[0] = e->prev_txn_hdr; hdr->basic[1] = e->prev_txn_hdr; e->pending_txn_hdr = (struct wal_index_basic_hdr){}; e->wal_cursor = e->pending_txn_start; free_pending_txn(e); sm_move(&xfile->entry->wtx_sm, WTX_BASE); return 0; } int vfs2_read_wal(sqlite3_file *file, struct vfs2_wal_txn *txns, size_t txns_len) { struct file *xfile = (struct file *)file; PRE(xfile->flags & SQLITE_OPEN_MAIN_DB); struct entry *e = xfile->entry; int rv; /* TODO check wal integrity before reading it */ (void)check_wal_integrity; int page_size = (int)e->page_size; for (size_t i = 0; i < txns_len; i++) { struct vfs2_wal_frame *f = sqlite3_malloc64(txns[i].meta.len * sizeof(*f)); if (f == NULL) { goto oom; } txns[i].frames = f; for (size_t j = 0; j < txns[i].meta.len; j++) { void *p = sqlite3_malloc(page_size); if (p == NULL) { goto oom; } txns[i].frames[j].page = p; } } for (size_t i = 0; i < txns_len; i++) { sqlite3_file *wal; unsigned read_lock; bool from_wal_cur = salts_equal(txns[i].meta.salts, e->wal_cur_hdr.salts); bool from_wal_prev = salts_equal(txns[i].meta.salts, e->wal_prev_hdr.salts); assert(from_wal_cur ^ from_wal_prev); if (from_wal_cur) { rv = vfs2_pseudo_read_begin(file, e->wal_cursor, &read_lock); if (rv != SQLITE_OK) { return 1; } wal = e->wal_cur; } else { wal = e->wal_prev; } uint32_t start = txns[i].meta.start; uint32_t len = txns[i].meta.len; for (uint32_t j = 0; j < len; j++) { sqlite3_int64 off = wal_offset_from_cursor(e->page_size, start + j); struct wal_frame_hdr fhdr; rv = wal->pMethods->xRead(wal, &fhdr, sizeof(fhdr), off); if (rv != SQLITE_OK) { return 1; } off += (sqlite3_int64)sizeof(fhdr); rv = wal->pMethods->xRead(wal, txns[i].frames[j].page, page_size, off); if (rv != SQLITE_OK) { return 1; } txns[i].frames[j].page_number = ByteGetBe32(fhdr.page_number); txns[i].frames[j].commit = ByteGetBe32(fhdr.commit); } if (from_wal_cur) { vfs2_pseudo_read_end(file, read_lock); } } return 0; oom: for (uint32_t i = 0; i < txns_len; i++) { for (uint32_t j = 0; j < txns[i].meta.len; j++) {
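/* Free each page buffer allocated for this transaction; the frame
 * array itself is freed just below. */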
sqlite3_free(txns[i].frames[j].page); } sqlite3_free(txns[i].frames); txns[i].frames = NULL; } return 1; } static int write_one_frame(struct entry *e, struct wal_frame_hdr hdr, void *data) { int rv; sqlite3_int64 off = wal_offset_from_cursor(e->page_size, e->wal_cursor); rv = e->wal_cur->pMethods->xWrite(e->wal_cur, &hdr, sizeof(hdr), off); if (rv != SQLITE_OK) { return rv; } off += (sqlite3_int64)sizeof(hdr); rv = e->wal_cur->pMethods->xWrite(e->wal_cur, data, (int)e->page_size, off); if (rv != SQLITE_OK) { return rv; } e->wal_cursor += 1; return SQLITE_OK; } static struct wal_hdr next_wal_hdr(const struct entry *e) { struct wal_hdr ret; struct wal_hdr old = e->wal_cur_hdr; BytePutBe32(native_magic(), ret.magic); BytePutBe32(3007000, ret.version); BytePutBe32(e->page_size, ret.page_size); uint32_t ckpoint_seqno = ByteGetBe32(old.ckpoint_seqno); BytePutBe32(ckpoint_seqno + 1, ret.ckpoint_seqno); uint32_t salt1; if (ckpoint_seqno == 0) { salt1 = get_salt1(old.salts) + 1; } else { e->common->orig->xRandomness(e->common->orig, sizeof(salt1), (void *)&salt1); } BytePutBe32(salt1, ret.salts.salt1); e->common->orig->xRandomness(e->common->orig, sizeof(ret.salts.salt2), (void *)&ret.salts.salt2); return ret; } static struct wal_frame_hdr txn_frame_hdr(struct entry *e, struct cksums sums, struct vfs2_wal_frame frame) { struct wal_frame_hdr fhdr; BytePutBe32(frame.page_number, fhdr.page_number); BytePutBe32(frame.commit, fhdr.commit); update_cksums(ByteGetBe32(e->wal_cur_hdr.magic), (const void *)(&fhdr), 8, &sums); update_cksums(ByteGetBe32(e->wal_cur_hdr.magic), frame.page, e->page_size, &sums); fhdr.salts = e->wal_cur_hdr.salts; BytePutBe32(sums.cksum1, fhdr.cksum1); BytePutBe32(sums.cksum2, fhdr.cksum2); return fhdr; } int vfs2_apply_uncommitted(sqlite3_file *file, uint32_t page_size, const struct vfs2_wal_frame *frames, unsigned len, struct vfs2_wal_slice *out) { PRE(len > 0); PRE(is_valid_page_size(page_size)); for (unsigned i = 0; i < len - 1; i++) { PRE(frames[i].commit == 0); } PRE(frames[len - 1].commit > 0); struct file *xfile = (struct file *)file; PRE(xfile->flags & SQLITE_OPEN_MAIN_DB); struct entry *e = xfile->entry; PRE(page_size == e->page_size); int rv; /* The write lock is always held if there is at least one * uncommitted frame in WAL-cur. In FOLLOWING state, we allow * adding more frames to WAL-cur even if there are already * some uncommitted frames. Hence we don't check the write * lock here before "acquiring" it, we just make sure that * it's held before returning. * * The write lock will be released when a call to vfs2_commit * or vfs2_unapply causes the number of committed frames in * WAL-cur (mxFrame) to equal the number of applies frames * (wal_cursor). */ e->shm_locks[WAL_WRITE_LOCK] = VFS2_EXCLUSIVE; struct wal_index_full_hdr *ihdr = get_full_hdr(e); uint32_t mx = ihdr->basic[0].mxFrame; if (mx > 0 && ihdr->nBackfill == mx) { struct wal_hdr new_whdr = next_wal_hdr(e); restart_full_hdr(ihdr, new_whdr); rv = wal_swap(e, &new_whdr); if (rv != SQLITE_OK) { return 1; } /* sm_move(&e->wtx_sm, WTX_FLUSH); */ } uint32_t start = e->wal_cursor; struct cksums sums; if (start > 0) { /* TODO cache this in the entry? 
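* WAL frame checksums are cumulative: the checksum stored in a frame
* header covers the entire log up to and including that frame. So to
* append at wal_cursor, the running checksum must be seeded either from
* the last frame currently in WAL-cur (re-read from disk here) or, when
* the WAL is empty, from the checksum in the WAL header (else branch).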
*/ struct wal_frame_hdr prev_fhdr; sqlite3_int64 off = wal_offset_from_cursor(e->page_size, e->wal_cursor - 1); rv = e->wal_cur->pMethods->xRead(e->wal_cur, &prev_fhdr, sizeof(prev_fhdr), off); if (rv != SQLITE_OK) { return 1; } sums.cksum1 = ByteGetBe32(prev_fhdr.cksum1); sums.cksum2 = ByteGetBe32(prev_fhdr.cksum2); } else { sums.cksum1 = ByteGetBe32(e->wal_cur_hdr.cksum1); sums.cksum2 = ByteGetBe32(e->wal_cur_hdr.cksum2); } struct wal_frame_hdr fhdr = txn_frame_hdr(e, sums, frames[0]); rv = write_one_frame(e, fhdr, frames[0].page); if (rv != SQLITE_OK) { return 1; } for (unsigned i = 1; i < len; i++) { sums.cksum1 = ByteGetBe32(fhdr.cksum1); sums.cksum2 = ByteGetBe32(fhdr.cksum2); fhdr = txn_frame_hdr(e, sums, frames[i]); rv = write_one_frame(e, fhdr, frames[i].page); if (rv != SQLITE_OK) { return 1; } } sm_move(&e->wtx_sm, WTX_FOLLOWING); out->salts = e->wal_cur_hdr.salts; out->start = start; out->len = len; return 0; } /* Get the index of the `i`th read lock in the array of * shm locks. There are five read locks, and three non-read * locks that come before the read locks. * See https://sqlite.org/walformat.html#wal_locks. */ static unsigned read_lock(unsigned i) { PRE(i < 5); return 3 + i; } int vfs2_pseudo_read_begin(sqlite3_file *file, uint32_t target, unsigned *out) { struct file *xfile = (struct file *)file; PRE(xfile->flags & SQLITE_OPEN_MAIN_DB); struct entry *e = xfile->entry; struct wal_index_full_hdr *ihdr = get_full_hdr(e); /* adapted from walTryBeginRead */ uint32_t max_mark = 0; unsigned max_index = 0; for (unsigned i = 1; i < WAL_NREADER; i++) { uint32_t cur = ihdr->marks[i]; if (max_mark <= cur && cur <= target) { assert(cur != READ_MARK_UNUSED); max_mark = cur; max_index = i; } } if (max_mark < target || max_index == 0) { for (unsigned i = 1; i < WAL_NREADER; i++) { if (e->shm_locks[read_lock(i)] > 0) { continue; } ihdr->marks[i] = target; max_mark = target; max_index = i; break; } } if (max_index == 0) { return 1; } *out = max_index; return 0; } int vfs2_pseudo_read_end(sqlite3_file *file, unsigned i) { struct file *xfile = (struct file *)file; PRE(xfile->flags & SQLITE_OPEN_MAIN_DB); struct entry *e = xfile->entry; PRE(e->shm_locks[i] > 0); e->shm_locks[i] -= 1; return 0; } dqlite-1.16.7/src/vfs2.h000066400000000000000000000060601465252713400147230ustar00rootroot00000000000000#ifndef DQLITE_VFS2_H #define DQLITE_VFS2_H #include <sqlite3.h> #include <stddef.h> #include <stdint.h> /** * Create a new VFS object that wraps the given VFS object. * * The returned VFS is allocated on the heap and lives until vfs2_destroy is * called. Its methods are thread-safe if those of the wrapped VFS are, but * the methods of the sqlite3_file objects it creates are not thread-safe. * Therefore, a database connection that's created using this VFS should only * be used on the thread that opened it. The functions below that operate on * sqlite3_file objects created by this VFS should also only be used on that * thread. */ sqlite3_vfs *vfs2_make(sqlite3_vfs *orig, const char *name); struct vfs2_salts { uint8_t salt1[4]; uint8_t salt2[4]; }; /** * Identifying information about a write transaction. */ struct vfs2_wal_slice { struct vfs2_salts salts; uint32_t start; uint32_t len; }; struct vfs2_wal_frame { uint32_t page_number; uint32_t commit; void *page; }; /** * Retrieve frames that were appended to the WAL by the last write transaction, * and reacquire the write lock. * * Call this on the database main file object (SQLITE_FCNTL_FILE_POINTER). * * Polling the same transaction more than once is an error.
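*
* A minimal leader-side usage sketch (illustrative only: error handling
* is omitted, `f` is assumed to be the main file object obtained via
* SQLITE_FCNTL_FILE_POINTER, and `replicate` is a hypothetical function
* that ships the frames to the other cluster members):
*
*	struct vfs2_wal_frame *frames;
*	unsigned n;
*	struct vfs2_wal_slice sl;
*	if (vfs2_poll(f, &frames, &n, &sl) == 0) {
*		replicate(frames, n, sl);
*		for (unsigned i = 0; i < n; i++) {
*			sqlite3_free(frames[i].page);
*		}
*		sqlite3_free(frames);
*	}
*
* The frames and their pages come from the SQLite allocator and are
* owned by the caller after a successful poll. Once the polled slice is
* known to be committed, vfs2_unhide publishes the transaction to local
* readers; vfs2_abort is the counterpart for a slice that must be
* discarded.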
*/ int vfs2_poll(sqlite3_file *file, struct vfs2_wal_frame **frames, unsigned *n, struct vfs2_wal_slice *sl); int vfs2_unhide(sqlite3_file *file); int vfs2_commit(sqlite3_file *file, struct vfs2_wal_slice stop); int vfs2_commit_barrier(sqlite3_file *file); int vfs2_apply_uncommitted(sqlite3_file *file, uint32_t page_size, const struct vfs2_wal_frame *frames, unsigned n, struct vfs2_wal_slice *out); int vfs2_unapply(sqlite3_file *file, struct vfs2_wal_slice stop); struct vfs2_wal_txn { struct vfs2_wal_slice meta; struct vfs2_wal_frame *frames; }; /** * Synchronously read some transaction data directly from the WAL. * * Fill the `meta` field of each vfs2_wal_txn with a slice that was previously * returned by vfs2_poll. On return, this function will set the `frames` * field of each vfs2_wal_txn, using memory from the SQLite allocator that the * caller must free, if the transaction was read successfully. Setting this * field to NULL means that the transaction couldn't be read. */ int vfs2_read_wal(sqlite3_file *file, struct vfs2_wal_txn *txns, size_t txns_len); /** * Cancel a pending transaction and release the write lock. * * Call this on the database main file object (SQLITE_FCNTL_FILE_POINTER). * * Calling this function when there is no pending transaction is an error. * It's okay to call it whether or not the transaction has been polled. */ int vfs2_abort(sqlite3_file *file); int vfs2_pseudo_read_begin(sqlite3_file *file, uint32_t target, unsigned *out); int vfs2_pseudo_read_end(sqlite3_file *file, unsigned i); /** * Destroy the VFS object. * * Call this from the same thread that called vfs2_make. No connection may be * open that uses this VFS. */ void vfs2_destroy(sqlite3_vfs *vfs); // TODO access read marks and shm_locks // TODO access information about checkpoints #endif dqlite-1.16.7/test/000077500000000000000000000000001465252713400140605ustar00rootroot00000000000000dqlite-1.16.7/test/integration/000077500000000000000000000000001465252713400164035ustar00rootroot00000000000000dqlite-1.16.7/test/integration/main.c000066400000000000000000000000621465252713400174710ustar00rootroot00000000000000#include "../lib/runner.h" RUNNER("integration") dqlite-1.16.7/test/integration/test_client.c000066400000000000000000000070031465252713400210640ustar00rootroot00000000000000#include "../lib/client.h" #include "../lib/heap.h" #include "../lib/runner.h" #include "../lib/server.h" #include "../lib/sqlite.h" /****************************************************************************** * * Handle client requests * ******************************************************************************/ SUITE(client); static char *bools[] = { "0", "1", NULL }; static MunitParameterEnum client_params[] = { { "disk_mode", bools }, { NULL, NULL }, }; struct fixture { struct test_server server; struct client_proto *client; struct rows rows; }; static void *setUp(const MunitParameter params[], void *user_data) { struct fixture *f = munit_malloc(sizeof *f); (void)user_data; f->rows = (struct rows){}; test_heap_setup(params, user_data); test_sqlite_setup(params); test_server_setup(&f->server, 1, params); test_server_start(&f->server, params); f->client = test_server_client(&f->server); HANDSHAKE; OPEN; return f; } static void tearDown(void *data) { struct fixture *f = data; test_server_tear_down(&f->server); test_sqlite_tear_down(); test_heap_tear_down(data); clientCloseRows(&f->rows); free(f); } TEST(client, exec, setUp, tearDown, 0, client_params) { struct fixture *f = data; uint32_t stmt_id; uint64_t
last_insert_id; uint64_t rows_affected; (void)params; PREPARE("CREATE TABLE test (n INT)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); return MUNIT_OK; } TEST(client, execWithOneParam, setUp, tearDown, 0, client_params) { struct fixture *f = data; uint32_t stmt_id; uint64_t last_insert_id; uint64_t rows_affected; struct value param = { 0 }; int rv; (void)params; PREPARE("CREATE TABLE test (n INT)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); PREPARE("INSERT INTO test (n) VALUES(?)", &stmt_id); param.type = SQLITE_INTEGER; param.integer = 17; rv = clientSendExec(f->client, stmt_id, ¶m, 1, NULL); munit_assert_int(rv, ==, 0); rv = clientRecvResult(f->client, &last_insert_id, &rows_affected, NULL); munit_assert_int(rv, ==, 0); return MUNIT_OK; } TEST(client, execSql, setUp, tearDown, 0, client_params) { struct fixture *f = data; uint64_t last_insert_id; uint64_t rows_affected; (void)params; EXEC_SQL("CREATE TABLE test (n INT)", &last_insert_id, &rows_affected); return MUNIT_OK; } TEST(client, query, setUp, tearDown, 0, client_params) { struct fixture *f = data; uint32_t stmt_id; uint64_t last_insert_id; uint64_t rows_affected; unsigned i; (void)params; PREPARE("CREATE TABLE test (n INT)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); PREPARE("BEGIN", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); PREPARE("INSERT INTO test (n) VALUES(123)", &stmt_id); for (i = 0; i < 256; i++) { EXEC(stmt_id, &last_insert_id, &rows_affected); } PREPARE("COMMIT", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); PREPARE("SELECT n FROM test", &stmt_id); QUERY_DONE(stmt_id, &f->rows, {}); return MUNIT_OK; } TEST(client, querySql, setUp, tearDown, 0, client_params) { struct fixture *f = data; uint32_t stmt_id; uint64_t last_insert_id; uint64_t rows_affected; unsigned i; (void)params; EXEC_SQL("CREATE TABLE test (n INT)", &last_insert_id, &rows_affected); EXEC_SQL("BEGIN", &last_insert_id, &rows_affected); PREPARE("INSERT INTO test (n) VALUES(123)", &stmt_id); for (i = 0; i < 256; i++) { EXEC(stmt_id, &last_insert_id, &rows_affected); } EXEC_SQL("COMMIT", &last_insert_id, &rows_affected); QUERY_SQL_DONE("SELECT n FROM test", &f->rows, {}); return MUNIT_OK; } dqlite-1.16.7/test/integration/test_cluster.c000066400000000000000000000167011465252713400212740ustar00rootroot00000000000000#include "../../src/client/protocol.h" #include "../../src/server.h" #include "../lib/client.h" #include "../lib/endpoint.h" #include "../lib/fs.h" #include "../lib/heap.h" #include "../lib/runner.h" #include "../lib/server.h" #include "../lib/sqlite.h" /****************************************************************************** * * Fixture * ******************************************************************************/ #define N_SERVERS 3 #define FIXTURE \ struct test_server servers[N_SERVERS]; \ struct client_proto *client #define SETUP \ unsigned i_; \ test_heap_setup(params, user_data); \ test_sqlite_setup(params); \ for (i_ = 0; i_ < N_SERVERS; i_++) { \ struct test_server *server = &f->servers[i_]; \ test_server_setup(server, i_ + 1, params); \ } \ test_server_network(f->servers, N_SERVERS); \ for (i_ = 0; i_ < N_SERVERS; i_++) { \ struct test_server *server = &f->servers[i_]; \ test_server_start(server, params); \ } \ SELECT(1) #define TEAR_DOWN \ unsigned i_; \ for (i_ = 0; i_ < N_SERVERS; i_++) { \ test_server_tear_down(&f->servers[i_]); \ } \ test_sqlite_tear_down(); \ test_heap_tear_down(data) /* Use the client connected to the server with the given ID. 
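* For example, after SELECT(2) the HANDSHAKE, OPEN, PREPARE, EXEC and
* QUERY macros all operate on the connection to the server with ID 2.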
*/ #define SELECT(ID) f->client = test_server_client(&f->servers[ID - 1]) /****************************************************************************** * * cluster * ******************************************************************************/ SUITE(cluster) struct fixture { FIXTURE; }; static void *setUp(const MunitParameter params[], void *user_data) { struct fixture *f = munit_malloc(sizeof *f); SETUP; return f; } static void tearDown(void *data) { struct fixture *f = data; TEAR_DOWN; free(f); } static char *bools[] = { "0", "1", NULL }; static char *num_records[] = { "0", "1", "256", /* WAL will just have been checkpointed after 993 writes. */ "993", /* Non-empty WAL, checkpointed twice, 2 snapshots taken */ "2200", NULL }; static MunitParameterEnum cluster_params[] = { { "num_records", num_records }, { "disk_mode", bools }, { NULL, NULL }, }; /* Restart a node and check if all data is there */ TEST(cluster, restart, setUp, tearDown, 0, cluster_params) { struct fixture *f = data; uint32_t stmt_id; uint64_t last_insert_id; uint64_t rows_affected; struct rows rows; long n_records = strtol(munit_parameters_get(params, "num_records"), NULL, 0); char sql[128]; HANDSHAKE; OPEN; PREPARE("CREATE TABLE test (n INT)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); for (int i = 0; i < n_records; ++i) { sprintf(sql, "INSERT INTO test(n) VALUES(%d)", i + 1); PREPARE(sql, &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); } struct test_server *server = &f->servers[0]; test_server_stop(server); test_server_start(server, params); /* The table is visible after restart. */ HANDSHAKE; OPEN; PREPARE("SELECT COUNT(*) from test", &stmt_id); QUERY_DONE(stmt_id, &rows, {}); return MUNIT_OK; } /* Add data to a node, add a new node and make sure data is there. */ TEST(cluster, dataOnNewNode, setUp, tearDown, 0, cluster_params) { struct fixture *f = data; uint32_t stmt_id; uint64_t last_insert_id; uint64_t rows_affected; struct rows rows; long n_records = strtol(munit_parameters_get(params, "num_records"), NULL, 0); char sql[128]; unsigned id = 2; const char *address = "@2"; HANDSHAKE; OPEN; PREPARE("CREATE TABLE test (n INT)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); for (int i = 0; i < n_records; ++i) { sprintf(sql, "INSERT INTO test(n) VALUES(%d)", i + 1); PREPARE(sql, &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); } /* Add a second voting server, this one will receive all data from the * original leader. */ ADD(id, address); ASSIGN(id, DQLITE_VOTER); /* Remove original server so second server becomes leader after election * timeout */ REMOVE(1); sleep(1); /* The full table is visible from the new node */ SELECT(2); HANDSHAKE; OPEN; PREPARE("SELECT COUNT(*) from test", &stmt_id); QUERY(stmt_id, &rows); munit_assert_long(rows.next->values->integer, ==, n_records); clientCloseRows(&rows); return MUNIT_OK; } /* Insert a huge row, causing SQLite to allocate overflow pages. Then * insert the same row again. (Reproducer for * https://github.com/canonical/raft/issues/432.) 
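* A value this large (~20 MB) cannot fit in a single page, so SQLite
* chains it across thousands of overflow pages, producing one very
* large WAL transaction; inserting the identical row a second time
* exercises the same path again when those overflow pages must be
* replaced rather than freshly allocated.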
*/ TEST(cluster, hugeRow, setUp, tearDown, 0, NULL) { struct fixture *f = data; uint32_t stmt_id; uint64_t last_insert_id; uint64_t rows_affected; char *sql; ssize_t n; size_t huge = 20000000; (void)params; HANDSHAKE; OPEN; PREPARE( "CREATE TABLE IF NOT EXISTS model(key TEXT, value TEXT, " "UNIQUE(key))", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); sql = munit_malloc(huge); n = snprintf( sql, huge, "INSERT OR REPLACE INTO model (key, value) VALUES('my-key-1', '"); memset(sql + n, 'A', huge - n); memcpy(sql + huge - 3, "')", 3); PREPARE(sql, &stmt_id); free(sql); EXEC(stmt_id, &last_insert_id, &rows_affected); /* Again */ EXEC(stmt_id, &last_insert_id, &rows_affected); return MUNIT_OK; } TEST(cluster, modifyingQuery, setUp, tearDown, 0, cluster_params) { struct fixture *f = data; uint32_t stmt_id; uint64_t last_insert_id; uint64_t rows_affected; struct rows rows; long n_records = strtol(munit_parameters_get(params, "num_records"), NULL, 0); char sql[128]; unsigned id = 2; const char *address = "@2"; HANDSHAKE; OPEN; PREPARE("CREATE TABLE test (n INT)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); for (int i = 0; i < n_records; ++i) { sprintf(sql, "INSERT INTO test(n) VALUES(%d)", i + 1); PREPARE(sql, &stmt_id); QUERY(stmt_id, &rows); munit_assert_uint64(rows.column_count, ==, 0); munit_assert_ptr(rows.next, ==, NULL); clientCloseRows(&rows); } ADD(id, address); ASSIGN(id, DQLITE_VOTER); REMOVE(1); sleep(1); SELECT(2); HANDSHAKE; OPEN; PREPARE("SELECT COUNT(*) from test", &stmt_id); QUERY(stmt_id, &rows); munit_assert_long(rows.next->values->integer, ==, n_records); clientCloseRows(&rows); return MUNIT_OK; } TEST(cluster, modifyingQuerySql, setUp, tearDown, 0, cluster_params) { struct fixture *f = data; uint32_t stmt_id; uint64_t last_insert_id; uint64_t rows_affected; struct rows rows; long n_records = strtol(munit_parameters_get(params, "num_records"), NULL, 0); char sql[128]; unsigned id = 2; const char *address = "@2"; HANDSHAKE; OPEN; PREPARE("CREATE TABLE test (n INT)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); for (int i = 0; i < n_records; ++i) { sprintf(sql, "INSERT INTO test(n) VALUES(%d)", i + 1); QUERY_SQL(sql, &rows); munit_assert_uint64(rows.column_count, ==, 0); munit_assert_ptr(rows.next, ==, NULL); clientCloseRows(&rows); } ADD(id, address); ASSIGN(id, DQLITE_VOTER); REMOVE(1); sleep(1); SELECT(2); HANDSHAKE; OPEN; PREPARE("SELECT COUNT(*) from test", &stmt_id); QUERY(stmt_id, &rows); munit_assert_long(rows.next->values->integer, ==, n_records); clientCloseRows(&rows); return MUNIT_OK; } dqlite-1.16.7/test/integration/test_fsm.c000066400000000000000000000543451465252713400204060ustar00rootroot00000000000000#include "../../src/client/protocol.h" #include "../../src/command.h" #include "../../src/server.h" #include "../lib/client.h" #include "../lib/heap.h" #include "../lib/runner.h" #include "../lib/server.h" #include "../lib/sqlite.h" /****************************************************************************** * * Fixture * ******************************************************************************/ #define N_SERVERS 1 #define FIXTURE \ struct test_server servers[N_SERVERS]; \ struct client_proto *client #define SETUP \ unsigned i_; \ test_heap_setup(params, user_data); \ test_sqlite_setup(params); \ for (i_ = 0; i_ < N_SERVERS; i_++) { \ struct test_server *server = &f->servers[i_]; \ test_server_setup(server, i_ + 1, params); \ } \ test_server_network(f->servers, N_SERVERS); \ for (i_ = 0; i_ < N_SERVERS; i_++) { \ struct 
test_server *server = &f->servers[i_]; \ test_server_start(server, params); \ } \ SELECT(1) #define TEAR_DOWN \ unsigned i_; \ for (i_ = 0; i_ < N_SERVERS; i_++) { \ test_server_tear_down(&f->servers[i_]); \ } \ test_sqlite_tear_down(); \ test_heap_tear_down(data) /****************************************************************************** * * Helper macros. * ******************************************************************************/ /* Use the client connected to the server with the given ID. */ #define SELECT(ID) f->client = test_server_client(&f->servers[ID - 1]) static char *bools[] = {"0", "1", NULL}; /* Make sure the snapshots scheduled by raft don't interfere with the snapshots * scheduled by the tests. */ static char *snapshot_threshold[] = {"8192", NULL}; static MunitParameterEnum snapshot_params[] = { {SNAPSHOT_THRESHOLD_PARAM, snapshot_threshold}, {SNAPSHOT_COMPRESSION_PARAM, bools}, {"disk_mode", bools}, {NULL, NULL}, }; /****************************************************************************** * * snapshot * ******************************************************************************/ SUITE(fsm) struct fixture { FIXTURE; }; static void *setUp(const MunitParameter params[], void *user_data) { struct fixture *f = munit_malloc(sizeof *f); SETUP; return f; } static void tearDown(void *data) { struct fixture *f = data; TEAR_DOWN; free(f); } TEST(fsm, snapshotFreshDb, setUp, tearDown, 0, snapshot_params) { struct fixture *f = data; struct raft_fsm *fsm = &f->servers[0].dqlite->raft_fsm; struct raft_buffer *bufs; unsigned n_bufs = 0; int rv; bool disk_mode = false; const char *disk_mode_param = munit_parameters_get(params, "disk_mode"); if (disk_mode_param != NULL) { disk_mode = (bool)atoi(disk_mode_param); } rv = fsm->snapshot(fsm, &bufs, &n_bufs); munit_assert_int(rv, ==, 0); munit_assert_uint(n_bufs, ==, 1); /* Snapshot header */ if (disk_mode) { rv = fsm->snapshot_async(fsm, &bufs, &n_bufs); munit_assert_int(rv, ==, 0); } rv = fsm->snapshot_finalize(fsm, &bufs, &n_bufs); munit_assert_int(rv, ==, 0); munit_assert_ptr_null(bufs); munit_assert_uint(n_bufs, ==, 0); return MUNIT_OK; } TEST(fsm, snapshotWrittenDb, setUp, tearDown, 0, snapshot_params) { struct fixture *f = data; struct raft_fsm *fsm = &f->servers[0].dqlite->raft_fsm; struct raft_buffer *bufs; unsigned n_bufs = 0; int rv; uint32_t stmt_id; uint64_t last_insert_id; uint64_t rows_affected; bool disk_mode = false; const char *disk_mode_param = munit_parameters_get(params, "disk_mode"); if (disk_mode_param != NULL) { disk_mode = (bool)atoi(disk_mode_param); } /* Add some data to database */ HANDSHAKE; OPEN; PREPARE("CREATE TABLE test (n INT)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); PREPARE("INSERT INTO test(n) VALUES(1)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); rv = fsm->snapshot(fsm, &bufs, &n_bufs); munit_assert_int(rv, ==, 0); munit_assert_uint(n_bufs, >, 1); if (disk_mode) { rv = fsm->snapshot_async(fsm, &bufs, &n_bufs); munit_assert_int(rv, ==, 0); } rv = fsm->snapshot_finalize(fsm, &bufs, &n_bufs); munit_assert_int(rv, ==, 0); munit_assert_ptr_null(bufs); munit_assert_uint(n_bufs, ==, 0); return MUNIT_OK; } TEST(fsm, snapshotHeapFaultSingleDB, setUp, tearDown, 0, snapshot_params) { struct fixture *f = data; struct raft_fsm *fsm = &f->servers[0].dqlite->raft_fsm; struct raft_buffer *bufs; unsigned n_bufs = 0; int rv; uint32_t stmt_id; uint64_t last_insert_id; uint64_t rows_affected; bool disk_mode = false; const char *disk_mode_param = munit_parameters_get(params, 
"disk_mode"); if (disk_mode_param != NULL) { disk_mode = (bool)atoi(disk_mode_param); } /* Add some data to database */ HANDSHAKE; OPEN; PREPARE("CREATE TABLE test (n INT)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); PREPARE("INSERT INTO test(n) VALUES(1)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); /* Inject heap faults at different stages of fsm__snapshot */ test_heap_fault_config(0, 1); test_heap_fault_enable(); rv = fsm->snapshot(fsm, &bufs, &n_bufs); munit_assert_int(rv, !=, 0); test_heap_fault_config(1, 1); rv = fsm->snapshot(fsm, &bufs, &n_bufs); munit_assert_int(rv, !=, 0); test_heap_fault_config(2, 1); rv = fsm->snapshot(fsm, &bufs, &n_bufs); munit_assert_int(rv, !=, 0); /* disk_mode does fewer allocations */ if (!disk_mode) { test_heap_fault_config(3, 1); rv = fsm->snapshot(fsm, &bufs, &n_bufs); munit_assert_int(rv, !=, 0); } return MUNIT_OK; } /* Inject faults into the async stage of the snapshot process */ TEST(fsm, snapshotHeapFaultSingleDBAsyncDisk, setUp, tearDown, 0, snapshot_params) { struct fixture *f = data; struct raft_fsm *fsm = &f->servers[0].dqlite->raft_fsm; struct raft_buffer *bufs; unsigned n_bufs = 0; int rv; uint32_t stmt_id; uint64_t last_insert_id; uint64_t rows_affected; bool disk_mode = false; const char *disk_mode_param = munit_parameters_get(params, "disk_mode"); if (disk_mode_param != NULL) { disk_mode = (bool)atoi(disk_mode_param); } if (!disk_mode) { return MUNIT_SKIP; } /* Add some data to database */ HANDSHAKE; OPEN; PREPARE("CREATE TABLE test (n INT)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); PREPARE("INSERT INTO test(n) VALUES(1)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); /* Sync stage succeeds */ rv = fsm->snapshot(fsm, &bufs, &n_bufs); munit_assert_int(rv, ==, 0); /* Inject heap fault in first call to encodeDiskDatabaseAsync */ test_heap_fault_config(0, 1); test_heap_fault_enable(); rv = fsm->snapshot_async(fsm, &bufs, &n_bufs); munit_assert_int(rv, !=, 0); /* Cleanup should succeed */ rv = fsm->snapshot_finalize(fsm, &bufs, &n_bufs); munit_assert_int(rv, ==, 0); return MUNIT_OK; } TEST(fsm, snapshotHeapFaultTwoDB, setUp, tearDown, 0, snapshot_params) { struct fixture *f = data; struct raft_fsm *fsm = &f->servers[0].dqlite->raft_fsm; struct raft_buffer *bufs; unsigned n_bufs = 0; int rv; uint32_t stmt_id; uint64_t last_insert_id; uint64_t rows_affected; bool disk_mode = false; const char *disk_mode_param = munit_parameters_get(params, "disk_mode"); if (disk_mode_param != NULL) { disk_mode = (bool)atoi(disk_mode_param); } /* Open 2 databases and add data to them */ HANDSHAKE; OPEN_NAME("test"); PREPARE("CREATE TABLE test (n INT)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); PREPARE("INSERT INTO test(n) VALUES(1)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); /* Close and reopen the client and open a second database */ test_server_client_reconnect(&f->servers[0], &f->servers[0].client); HANDSHAKE; OPEN_NAME("test2"); PREPARE("CREATE TABLE test (n INT)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); PREPARE("INSERT INTO test(n) VALUES(1)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); /* Inject heap faults at different stages of fsm__snapshot */ test_heap_fault_config(0, 1); test_heap_fault_enable(); rv = fsm->snapshot(fsm, &bufs, &n_bufs); munit_assert_int(rv, !=, 0); test_heap_fault_config(1, 1); rv = fsm->snapshot(fsm, &bufs, &n_bufs); munit_assert_int(rv, !=, 0); test_heap_fault_config(2, 1); rv = fsm->snapshot(fsm, &bufs, 
&n_bufs); munit_assert_int(rv, !=, 0); test_heap_fault_config(3, 1); rv = fsm->snapshot(fsm, &bufs, &n_bufs); munit_assert_int(rv, !=, 0); /* disk_mode does fewer allocations */ if (!disk_mode) { test_heap_fault_config(4, 1); rv = fsm->snapshot(fsm, &bufs, &n_bufs); munit_assert_int(rv, !=, 0); test_heap_fault_config(5, 1); rv = fsm->snapshot(fsm, &bufs, &n_bufs); munit_assert_int(rv, !=, 0); } return MUNIT_OK; } TEST(fsm, snapshotHeapFaultTwoDBAsync, setUp, tearDown, 0, snapshot_params) { struct fixture *f = data; struct raft_fsm *fsm = &f->servers[0].dqlite->raft_fsm; struct raft_buffer *bufs; unsigned n_bufs = 0; int rv; uint32_t stmt_id; uint64_t last_insert_id; uint64_t rows_affected; bool disk_mode = false; const char *disk_mode_param = munit_parameters_get(params, "disk_mode"); if (disk_mode_param != NULL) { disk_mode = (bool)atoi(disk_mode_param); } if (!disk_mode) { return MUNIT_SKIP; } /* Open 2 databases and add data to them */ HANDSHAKE; OPEN_NAME("test"); PREPARE("CREATE TABLE test (n INT)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); PREPARE("INSERT INTO test(n) VALUES(1)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); /* Close and reopen the client and open a second database */ test_server_client_reconnect(&f->servers[0], &f->servers[0].client); HANDSHAKE; OPEN_NAME("test2"); PREPARE("CREATE TABLE test (n INT)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); PREPARE("INSERT INTO test(n) VALUES(1)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); /* sync fsm__snapshot succeeds. */ rv = fsm->snapshot(fsm, &bufs, &n_bufs); munit_assert_int(rv, ==, 0); /* async step fails at different stages. */ test_heap_fault_enable(); test_heap_fault_config(0, 1); rv = fsm->snapshot_async(fsm, &bufs, &n_bufs); munit_assert_int(rv, !=, 0); rv = fsm->snapshot_finalize(fsm, &bufs, &n_bufs); munit_assert_int(rv, ==, 0); /* Inject fault when encoding second Database */ /* sync fsm__snapshot succeeds. */ rv = fsm->snapshot(fsm, &bufs, &n_bufs); munit_assert_int(rv, ==, 0); test_heap_fault_config(1, 1); rv = fsm->snapshot_async(fsm, &bufs, &n_bufs); munit_assert_int(rv, !=, 0); rv = fsm->snapshot_finalize(fsm, &bufs, &n_bufs); munit_assert_int(rv, ==, 0); return MUNIT_OK; } TEST(fsm, snapshotNewDbAddedBeforeFinalize, setUp, tearDown, 0, snapshot_params) { struct fixture *f = data; struct raft_fsm *fsm = &f->servers[0].dqlite->raft_fsm; struct raft_buffer *bufs; unsigned n_bufs = 0; int rv; uint32_t stmt_id; uint64_t last_insert_id; uint64_t rows_affected; bool disk_mode = false; const char *disk_mode_param = munit_parameters_get(params, "disk_mode"); if (disk_mode_param != NULL) { disk_mode = (bool)atoi(disk_mode_param); } /* Add some data to database */ HANDSHAKE; OPEN_NAME("test"); PREPARE("CREATE TABLE test (n INT)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); PREPARE("INSERT INTO test(n) VALUES(1)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); rv = fsm->snapshot(fsm, &bufs, &n_bufs); munit_assert_int(rv, ==, 0); munit_assert_uint(n_bufs, >, 1); /* Close and reopen the client and open a second database, * and ensure finalize succeeds. 
*/ test_server_client_reconnect(&f->servers[0], &f->servers[0].client); HANDSHAKE; OPEN_NAME("test2"); PREPARE("CREATE TABLE test (n INT)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); if (disk_mode) { rv = fsm->snapshot_async(fsm, &bufs, &n_bufs); munit_assert_int(rv, ==, 0); } PREPARE("INSERT INTO test(n) VALUES(1)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); rv = fsm->snapshot_finalize(fsm, &bufs, &n_bufs); munit_assert_int(rv, ==, 0); munit_assert_ptr_null(bufs); munit_assert_uint(n_bufs, ==, 0); return MUNIT_OK; } TEST(fsm, snapshotWritesBeforeFinalize, setUp, tearDown, 0, snapshot_params) { struct fixture *f = data; struct raft_fsm *fsm = &f->servers[0].dqlite->raft_fsm; struct raft_buffer *bufs; unsigned n_bufs = 0; uint32_t stmt_id; uint64_t last_insert_id; uint64_t rows_affected; char sql[128]; int rv; bool disk_mode = false; const char *disk_mode_param = munit_parameters_get(params, "disk_mode"); if (disk_mode_param != NULL) { disk_mode = (bool)atoi(disk_mode_param); } /* Add some data to database */ HANDSHAKE; OPEN; PREPARE("CREATE TABLE test (n INT)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); PREPARE("INSERT INTO test(n) VALUES(0)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); rv = fsm->snapshot(fsm, &bufs, &n_bufs); munit_assert_int(rv, ==, 0); munit_assert_uint(n_bufs, >, 1); /* Add (a lot) more data to the database */ for (unsigned i = 0; i < 1000; ++i) { sprintf(sql, "INSERT INTO test(n) VALUES(%d)", i + 1); PREPARE(sql, &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); if (disk_mode && i == 512) { rv = fsm->snapshot_async(fsm, &bufs, &n_bufs); munit_assert_int(rv, ==, 0); } } /* Finalize succeeds */ rv = fsm->snapshot_finalize(fsm, &bufs, &n_bufs); munit_assert_int(rv, ==, 0); munit_assert_ptr_null(bufs); munit_assert_uint(n_bufs, ==, 0); /* Triggers a checkpoint */ PREPARE("INSERT INTO test(n) VALUES(1001)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); return MUNIT_OK; } TEST(fsm, concurrentSnapshots, setUp, tearDown, 0, snapshot_params) { struct fixture *f = data; struct raft_fsm *fsm = &f->servers[0].dqlite->raft_fsm; struct raft_buffer *bufs; struct raft_buffer *bufs2; unsigned n_bufs = 0; unsigned n_bufs2 = 0; uint32_t stmt_id; uint64_t last_insert_id; uint64_t rows_affected; int rv; bool disk_mode = false; const char *disk_mode_param = munit_parameters_get(params, "disk_mode"); if (disk_mode_param != NULL) { disk_mode = (bool)atoi(disk_mode_param); } /* Add some data to database */ HANDSHAKE; OPEN; PREPARE("CREATE TABLE test (n INT)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); /* Second snapshot fails when first isn't finalized */ rv = fsm->snapshot(fsm, &bufs, &n_bufs); munit_assert_int(rv, ==, 0); rv = fsm->snapshot(fsm, &bufs2, &n_bufs2); munit_assert_int(rv, ==, RAFT_BUSY); if (disk_mode) { rv = fsm->snapshot_async(fsm, &bufs, &n_bufs); munit_assert_int(rv, ==, 0); } rv = fsm->snapshot_finalize(fsm, &bufs, &n_bufs); munit_assert_int(rv, ==, 0); /* Second snapshot succeeds after first is finalized */ rv = fsm->snapshot(fsm, &bufs2, &n_bufs2); munit_assert_int(rv, ==, 0); if (disk_mode) { rv = fsm->snapshot_async(fsm, &bufs2, &n_bufs2); munit_assert_int(rv, ==, 0); } rv = fsm->snapshot_finalize(fsm, &bufs2, &n_bufs2); munit_assert_int(rv, ==, 0); return MUNIT_OK; } /* Copies n raft buffers to a single raft buffer */ static struct raft_buffer n_bufs_to_buf(struct raft_buffer bufs[], unsigned n) { uint8_t *cursor; struct raft_buffer buf = {0}; /* Allocate a suitable 
buffer */ for (unsigned i = 0; i < n; ++i) { buf.len += bufs[i].len; } buf.base = raft_malloc(buf.len); munit_assert_ptr_not_null(buf.base); /* Copy all data */ cursor = buf.base; for (unsigned i = 0; i < n; ++i) { memcpy(cursor, bufs[i].base, bufs[i].len); cursor += bufs[i].len; } munit_assert_ullong((uintptr_t)(cursor - (uint8_t *)buf.base), ==, buf.len); return buf; } static char *num_records[] = { "0", "1", "256", /* WAL will just have been checkpointed after 993 writes. */ "993", /* Non-empty WAL, checkpointed twice */ "2200", NULL}; static MunitParameterEnum restore_params[] = { {"num_records", num_records}, {SNAPSHOT_THRESHOLD_PARAM, snapshot_threshold}, {SNAPSHOT_COMPRESSION_PARAM, bools}, {"disk_mode", bools}, {NULL, NULL}, }; TEST(fsm, snapshotRestore, setUp, tearDown, 0, restore_params) { struct fixture *f = data; struct raft_fsm *fsm = &f->servers[0].dqlite->raft_fsm; struct raft_buffer *bufs; struct raft_buffer snapshot; long n_records = strtol(munit_parameters_get(params, "num_records"), NULL, 0); unsigned n_bufs = 0; uint32_t stmt_id; uint64_t last_insert_id; uint64_t rows_affected; struct rows rows; int rv; char sql[128]; bool disk_mode = false; const char *disk_mode_param = munit_parameters_get(params, "disk_mode"); if (disk_mode_param != NULL) { disk_mode = (bool)atoi(disk_mode_param); } /* Add some data to database */ HANDSHAKE; OPEN; PREPARE("CREATE TABLE test (n INT)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); for (int i = 0; i < n_records; ++i) { sprintf(sql, "INSERT INTO test(n) VALUES(%d)", i + 1); PREPARE(sql, &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); } rv = fsm->snapshot(fsm, &bufs, &n_bufs); munit_assert_int(rv, ==, 0); if (disk_mode) { rv = fsm->snapshot_async(fsm, &bufs, &n_bufs); munit_assert_int(rv, ==, 0); } /* Deep copy snapshot */ snapshot = n_bufs_to_buf(bufs, n_bufs); rv = fsm->snapshot_finalize(fsm, &bufs, &n_bufs); munit_assert_int(rv, ==, 0); /* Additionally frees snapshot.base */ rv = fsm->restore(fsm, &snapshot); munit_assert_int(rv, ==, 0); /* Table is there on fresh connection. */ test_server_client_reconnect(&f->servers[0], &f->servers[0].client); HANDSHAKE; OPEN; PREPARE("SELECT COUNT(*) from test", &stmt_id); QUERY(stmt_id, &rows); munit_assert_long(rows.next->values->integer, ==, n_records); clientCloseRows(&rows); /* Still possible to insert entries */ for (int i = 0; i < n_records; ++i) { sprintf(sql, "INSERT INTO test(n) VALUES(%ld)", n_records + i + 1); PREPARE(sql, &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); } return MUNIT_OK; } TEST(fsm, snapshotRestoreMultipleDBs, setUp, tearDown, 0, snapshot_params) { struct fixture *f = data; struct raft_fsm *fsm = &f->servers[0].dqlite->raft_fsm; struct raft_buffer *bufs; struct raft_buffer snapshot; unsigned n_bufs = 0; uint32_t stmt_id; uint64_t last_insert_id; uint64_t rows_affected; struct rows rows; uint64_t code; char *msg; int rv; bool disk_mode = false; const char *disk_mode_param = munit_parameters_get(params, "disk_mode"); if (disk_mode_param != NULL) { disk_mode = (bool)atoi(disk_mode_param); } /* Create 2 databases and add data to them. 
*/ HANDSHAKE; OPEN_NAME("test"); PREPARE("CREATE TABLE test (n INT)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); PREPARE("INSERT INTO test(n) VALUES(1)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); test_server_client_reconnect(&f->servers[0], &f->servers[0].client); HANDSHAKE; OPEN_NAME("test2"); PREPARE("CREATE TABLE test2a (n INT)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); PREPARE("INSERT INTO test2a(n) VALUES(1)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); /* Snapshot both databases and restore the data. */ rv = fsm->snapshot(fsm, &bufs, &n_bufs); munit_assert_int(rv, ==, 0); if (disk_mode) { rv = fsm->snapshot_async(fsm, &bufs, &n_bufs); munit_assert_int(rv, ==, 0); } /* Copy the snapshot to restore it */ snapshot = n_bufs_to_buf(bufs, n_bufs); rv = fsm->snapshot_finalize(fsm, &bufs, &n_bufs); munit_assert_int(rv, ==, 0); /* Create a new table in test2 that shouldn't be visible after * restoring the snapshot. */ PREPARE("CREATE TABLE test2b (n INT)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); PREPARE("INSERT INTO test2b(n) VALUES(1)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); /* Restore snapshot */ rv = fsm->restore(fsm, &snapshot); munit_assert_int(rv, ==, 0); /* Reopen connection */ test_server_client_reconnect(&f->servers[0], &f->servers[0].client); HANDSHAKE; OPEN_NAME("test2"); /* Table before snapshot is there on second DB */ PREPARE("SELECT * from test2a", &stmt_id); QUERY(stmt_id, &rows); clientCloseRows(&rows); /* Table after snapshot is not there on second DB */ PREPARE_FAIL("SELECT * from test2b", &stmt_id, &code, &msg); munit_assert_uint64(code, ==, DQLITE_ERROR); munit_assert_string_equal(msg, "no such table: test2b"); free(msg); /* Table is there on first DB */ test_server_client_reconnect(&f->servers[0], &f->servers[0].client); HANDSHAKE; OPEN_NAME("test"); PREPARE("SELECT * from test", &stmt_id); QUERY(stmt_id, &rows); clientCloseRows(&rows); return MUNIT_OK; } /****************************************************************************** * * apply * ******************************************************************************/ TEST(fsm, applyFail, setUp, tearDown, 0, NULL) { int rv; struct command_frames c; struct raft_buffer buf; struct fixture *f = data; struct raft_fsm *fsm = &f->servers[0].dqlite->raft_fsm; void *result = (void *)(uintptr_t)0xDEADBEEF; /* Create a frames command without data. */ c.filename = "test"; c.tx_id = 0; c.truncate = 0; c.is_commit = 0; c.frames.n_pages = 0; c.frames.page_size = 4096; c.frames.data = NULL; rv = command__encode(COMMAND_FRAMES, &c, &buf); /* Apply the command and expect it to fail. */ rv = fsm->apply(fsm, &buf, &result); munit_assert_int(rv, !=, 0); munit_assert_ptr_null(result); raft_free(buf.base); return MUNIT_OK; } TEST(fsm, applyUnknownTypeFail, setUp, tearDown, 0, NULL) { int rv; struct command_frames c; struct raft_buffer buf; struct fixture *f = data; struct raft_fsm *fsm = &f->servers[0].dqlite->raft_fsm; void *result = (void *)(uintptr_t)0xDEADBEEF; /* Create a frames command without data. */ c.filename = "test"; c.tx_id = 0; c.truncate = 0; c.is_commit = 0; c.frames.n_pages = 0; c.frames.page_size = 4096; c.frames.data = NULL; rv = command__encode(COMMAND_FRAMES, &c, &buf); /* Command type does not exist. */ ((uint8_t *)(buf.base))[1] = COMMAND_CHECKPOINT + 8; /* Apply the command and expect it to fail. 
*/ rv = fsm->apply(fsm, &buf, &result); munit_assert_int(rv, ==, DQLITE_PROTO); munit_assert_ptr_null(result); raft_free(buf.base); return MUNIT_OK; } dqlite-1.16.7/test/integration/test_membership.c000066400000000000000000000247121465252713400217470ustar00rootroot00000000000000#include "../../src/client/protocol.h" #include "../../src/server.h" #include "../lib/client.h" #include "../lib/endpoint.h" #include "../lib/fs.h" #include "../lib/heap.h" #include "../lib/runner.h" #include "../lib/server.h" #include "../lib/sqlite.h" #include "../lib/util.h" /****************************************************************************** * * Fixture * ******************************************************************************/ #define N_SERVERS 3 #define FIXTURE \ struct test_server servers[N_SERVERS]; \ struct client_proto *client; \ struct rows rows; #define SETUP \ unsigned i_; \ test_heap_setup(params, user_data); \ test_sqlite_setup(params); \ for (i_ = 0; i_ < N_SERVERS; i_++) { \ struct test_server *server = &f->servers[i_]; \ test_server_setup(server, i_ + 1, params); \ } \ test_server_network(f->servers, N_SERVERS); \ for (i_ = 0; i_ < N_SERVERS; i_++) { \ struct test_server *server = &f->servers[i_]; \ test_server_start(server, params); \ } \ SELECT(1) #define TEAR_DOWN \ unsigned i_; \ for (i_ = 0; i_ < N_SERVERS; i_++) { \ test_server_tear_down(&f->servers[i_]); \ } \ test_sqlite_tear_down(); \ test_heap_tear_down(data) /****************************************************************************** * * Helper macros. * ******************************************************************************/ /* Use the client connected to the server with the given ID. */ #define SELECT(ID) f->client = test_server_client(&f->servers[ID - 1]) /****************************************************************************** * * join * ******************************************************************************/ static char *bools[] = {"0", "1", NULL}; static MunitParameterEnum membership_params[] = { {"disk_mode", bools}, {NULL, NULL}, }; SUITE(membership) struct fixture { FIXTURE; }; static void *setUp(const MunitParameter params[], void *user_data) { struct fixture *f = munit_malloc(sizeof *f); SETUP; return f; } static void tearDown(void *data) { struct fixture *f = data; TEAR_DOWN; free(f); } TEST(membership, join, setUp, tearDown, 0, membership_params) { struct fixture *f = data; unsigned id = 2; const char *address = "@2"; uint32_t stmt_id; uint64_t last_insert_id; uint64_t rows_affected; HANDSHAKE; ADD(id, address); ASSIGN(id, DQLITE_VOTER); OPEN; PREPARE("CREATE TABLE test (n INT)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); PREPARE("INSERT INTO test(n) VALUES(1)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); /* The table is visible from the new node */ TRANSFER(id, f->client); SELECT(2); HANDSHAKE; OPEN; PREPARE("SELECT * FROM test", &stmt_id); /* TODO: fix the standalone test for remove */ REMOVE(1); return MUNIT_OK; } struct id_last_applied { struct fixture *f; int id; raft_index last_applied; }; static bool last_applied_cond(struct id_last_applied arg) { return arg.f->servers[arg.id].dqlite->raft.last_applied >= arg.last_applied; } TEST(membership, transfer, setUp, tearDown, 0, membership_params) { struct fixture *f = data; unsigned id = 2; const char *address = "@2"; uint32_t stmt_id; uint64_t last_insert_id; uint64_t rows_affected; raft_index last_applied; struct client_proto c_transfer; /* Client used for transfer requests */ struct id_last_applied 
await_arg; HANDSHAKE; ADD(id, address); ASSIGN(id, DQLITE_VOTER); OPEN; PREPARE("CREATE TABLE test (n INT)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); PREPARE("INSERT INTO test(n) VALUES(1)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); /* Transfer leadership and wait until first leader has applied a new * entry replicated from the new leader. */ test_server_client_connect(&f->servers[0], &c_transfer); HANDSHAKE_C(&c_transfer); TRANSFER(2, &c_transfer); test_server_client_close(&f->servers[0], &c_transfer); last_applied = f->servers[0].dqlite->raft.last_applied; SELECT(2); HANDSHAKE; OPEN; PREPARE("INSERT INTO test(n) VALUES(1)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); await_arg.f = f; await_arg.id = 0; await_arg.last_applied = last_applied + 1; AWAIT_TRUE(last_applied_cond, await_arg, 2); return MUNIT_OK; } /* Transfer leadership away from a member that has a pending transaction */ TEST(membership, transferPendingTransaction, setUp, tearDown, 0, membership_params) { struct fixture *f = data; unsigned id = 2; const char *address = "@2"; uint32_t stmt_id; uint64_t last_insert_id; uint64_t rows_affected; raft_index last_applied; struct client_proto c_transfer; /* Client used for transfer requests */ struct id_last_applied await_arg; HANDSHAKE; ADD(id, address); ASSIGN(id, DQLITE_VOTER); OPEN; PREPARE("CREATE TABLE test (n INT)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); PREPARE("INSERT INTO test(n) VALUES(1)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); /* Pending transaction */ PREPARE("BEGIN", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); PREPARE("SELECT * FROM test", &stmt_id); QUERY(stmt_id, &f->rows); clientCloseRows(&f->rows); /* Transfer leadership and wait until first leader has applied a new * entry replicated from the new leader. */ test_server_client_connect(&f->servers[0], &c_transfer); HANDSHAKE_C(&c_transfer); last_applied = f->servers[0].dqlite->raft.last_applied; TRANSFER(2, &c_transfer); test_server_client_close(&f->servers[0], &c_transfer); /* Wait for new leader barrier to be applied. */ await_arg.f = f; await_arg.id = 0; await_arg.last_applied = last_applied + 1; AWAIT_TRUE(last_applied_cond, await_arg, 2); /* New write tx */ SELECT(2); HANDSHAKE; OPEN; PREPARE("INSERT INTO test(n) VALUES(2)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); await_arg.f = f; await_arg.id = 0; /* Wait for the write tx to be applied. */ await_arg.last_applied = last_applied + 2; AWAIT_TRUE(last_applied_cond, await_arg, 2); return MUNIT_OK; } struct fixture_id { struct fixture *f; int id; }; static bool transfer_started_cond(struct fixture_id arg) { return arg.f->servers[arg.id].dqlite->raft.transfer != NULL; } /* Transfer leadership away from a member and immediately try to EXEC a * prepared SQL statement that needs a barrier */ TEST(membership, transferAndSqlExecWithBarrier, setUp, tearDown, 0, NULL) { int rv; struct fixture *f = data; unsigned id = 2; const char *address = "@2"; uint32_t stmt_id; uint64_t last_insert_id; uint64_t rows_affected; struct client_proto c_transfer; /* Client used for transfer requests */ struct fixture_id arg; HANDSHAKE; ADD(id, address); ASSIGN(id, DQLITE_VOTER); OPEN; PREPARE("CREATE TABLE test (n INT)", &stmt_id); /* Initiate transfer of leadership. This will cause a raft_barrier * failure while the node is technically still the leader, so the * gateway functionality that checks for leadership still succeeds.
*/ test_server_client_connect(&f->servers[0], &c_transfer); HANDSHAKE_C(&c_transfer); rv = clientSendTransfer(&c_transfer, 2, NULL); munit_assert_int(rv, ==, 0); /* Wait until transfer is started by raft so the barrier can fail. */ arg.f = f; arg.id = 0; AWAIT_TRUE(transfer_started_cond, arg, 2); /* Force a barrier. * TODO this is hacky, but I can't seem to hit the codepath otherwise */ f->servers[0].dqlite->raft.last_applied = 0; rv = clientSendExec(f->client, stmt_id, NULL, 0, NULL); munit_assert_int(rv, ==, 0); rv = clientRecvResult(f->client, &last_insert_id, &rows_affected, NULL); munit_assert_int(rv, ==, DQLITE_CLIENT_PROTO_ERROR); test_server_client_close(&f->servers[1], &c_transfer); return MUNIT_OK; } /* Transfer leadership back and forth from a member that has a pending * transaction */ TEST(membership, transferTwicePendingTransaction, setUp, tearDown, 0, membership_params) { struct fixture *f = data; unsigned id = 2; const char *address = "@2"; uint32_t stmt_id; uint64_t last_insert_id; uint64_t rows_affected; raft_index last_applied; struct client_proto c_transfer; /* Client used for transfer requests */ struct id_last_applied await_arg; HANDSHAKE; ADD(id, address); ASSIGN(id, DQLITE_VOTER); OPEN; PREPARE("CREATE TABLE test (n INT)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); PREPARE("INSERT INTO test(n) VALUES(1)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); /* Pending transaction */ PREPARE("BEGIN", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); PREPARE("SELECT * FROM test", &stmt_id); QUERY(stmt_id, &f->rows); clientCloseRows(&f->rows); /* Transfer leadership and wait until first leader has applied a new * entry replicated from the new leader. */ test_server_client_connect(&f->servers[0], &c_transfer); HANDSHAKE_C(&c_transfer); last_applied = f->servers[0].dqlite->raft.last_applied; TRANSFER(2, &c_transfer); test_server_client_close(&f->servers[0], &c_transfer); /* Wait for new leader barrier to be applied. */ await_arg.f = f; await_arg.id = 0; await_arg.last_applied = last_applied + 1; AWAIT_TRUE(last_applied_cond, await_arg, 2); /* New write tx. */ SELECT(2); HANDSHAKE; OPEN; PREPARE("INSERT INTO test(n) VALUES(2)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); await_arg.f = f; await_arg.id = 0; /* Wait for new write tx to be applied. */ await_arg.last_applied = last_applied + 2; AWAIT_TRUE(last_applied_cond, await_arg, 2); /* Transfer leadership back to original node, reconnect the client and * ensure queries can be executed. 
*/ test_server_client_connect(&f->servers[1], &c_transfer); HANDSHAKE_C(&c_transfer); TRANSFER(1, &c_transfer); test_server_client_close(&f->servers[1], &c_transfer); last_applied = f->servers[1].dqlite->raft.last_applied; test_server_client_reconnect(&f->servers[0], &f->servers[0].client); SELECT(1); HANDSHAKE; OPEN; PREPARE("INSERT INTO test(n) VALUES(3)", &stmt_id); EXEC(stmt_id, &last_insert_id, &rows_affected); await_arg.id = 1; AWAIT_TRUE(last_applied_cond, await_arg, 2); return MUNIT_OK; } dqlite-1.16.7/test/integration/test_node.c000066400000000000000000000312741465252713400205420ustar00rootroot00000000000000#include "../lib/fs.h" #include "../lib/heap.h" #include "../lib/runner.h" #include "../lib/server.h" #include "../lib/sqlite.h" #include "../../include/dqlite.h" #include "../../src/protocol.h" #include "../../src/utils.h" /****************************************************************************** * * Fixture * ******************************************************************************/ static char *bools[] = {"0", "1", NULL}; static MunitParameterEnum node_params[] = { {"disk_mode", bools}, {SNAPSHOT_COMPRESSION_PARAM, bools}, {NULL, NULL}, }; struct fixture { char *dir; /* Data directory. */ dqlite_node *node; /* Node instance. */ }; static void *setUp(const MunitParameter params[], void *user_data) { struct fixture *f = munit_malloc(sizeof *f); int rv; test_heap_setup(params, user_data); test_sqlite_setup(params); f->dir = test_dir_setup(); rv = dqlite_node_create(1, "1", f->dir, &f->node); munit_assert_int(rv, ==, 0); rv = dqlite_node_set_bind_address(f->node, "@123"); munit_assert_int(rv, ==, 0); const char *disk_mode_param = munit_parameters_get(params, "disk_mode"); if (disk_mode_param != NULL) { bool disk_mode = (bool)atoi(disk_mode_param); if (disk_mode) { rv = dqlite_node_enable_disk_mode(f->node); munit_assert_int(rv, ==, 0); } } return f; } static void *setUpInet(const MunitParameter params[], void *user_data) { struct fixture *f = munit_malloc(sizeof *f); int rv; test_heap_setup(params, user_data); test_sqlite_setup(params); f->dir = test_dir_setup(); rv = dqlite_node_create(1, "1", f->dir, &f->node); munit_assert_int(rv, ==, 0); rv = dqlite_node_set_bind_address(f->node, "127.0.0.1:9001"); munit_assert_int(rv, ==, 0); const char *disk_mode_param = munit_parameters_get(params, "disk_mode"); if (disk_mode_param != NULL) { bool disk_mode = (bool)atoi(disk_mode_param); if (disk_mode) { rv = dqlite_node_enable_disk_mode(f->node); munit_assert_int(rv, ==, 0); } } return f; } /* Tests if node starts/stops successfully and also performs some memory cleanup */ static void startStopNode(struct fixture *f) { munit_assert_int(dqlite_node_start(f->node), ==, 0); munit_assert_int(dqlite_node_stop(f->node), ==, 0); } /* Recovery only works if a node has been started regularly for a first time. 
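 * A minimal sketch of the intended recovery flow, assuming a node that has
 * already run at least once in `dir` (illustration only; names follow the
 * tests below):
 *
 *   struct dqlite_node_info infos[1] = {{.id = 1, .address = "1"}};
 *   dqlite_node_create(1, "1", dir, &node);
 *   dqlite_node_set_bind_address(node, "@1");
 *   dqlite_node_recover(node, infos, 1);  // rewrites the raft configuration
 *   dqlite_node_start(node);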
*/ static void *setUpForRecovery(const MunitParameter params[], void *user_data) { int rv; struct fixture *f = setUp(params, user_data); startStopNode(f); dqlite_node_destroy(f->node); rv = dqlite_node_create(1, "1", f->dir, &f->node); munit_assert_int(rv, ==, 0); rv = dqlite_node_set_bind_address(f->node, "@123"); munit_assert_int(rv, ==, 0); const char *disk_mode_param = munit_parameters_get(params, "disk_mode"); if (disk_mode_param != NULL) { bool disk_mode = (bool)atoi(disk_mode_param); if (disk_mode) { rv = dqlite_node_enable_disk_mode(f->node); munit_assert_int(rv, ==, 0); } } return f; } static void tearDown(void *data) { struct fixture *f = data; dqlite_node_destroy(f->node); test_dir_tear_down(f->dir); test_sqlite_tear_down(); test_heap_tear_down(data); free(f); } SUITE(node); /****************************************************************************** * * dqlite_node_start * ******************************************************************************/ TEST(node, start, setUp, tearDown, 0, node_params) { struct fixture *f = data; int rv; rv = dqlite_node_start(f->node); munit_assert_int(rv, ==, 0); rv = dqlite_node_stop(f->node); munit_assert_int(rv, ==, 0); return MUNIT_OK; } TEST(node, startInet, setUpInet, tearDown, 0, node_params) { struct fixture *f = data; int rv; rv = dqlite_node_start(f->node); munit_assert_int(rv, ==, 0); rv = dqlite_node_stop(f->node); munit_assert_int(rv, ==, 0); return MUNIT_OK; } TEST(node, snapshotParams, setUp, tearDown, 0, node_params) { struct fixture *f = data; int rv; rv = dqlite_node_set_snapshot_params(f->node, 2048, 2048); munit_assert_int(rv, ==, 0); startStopNode(f); return MUNIT_OK; } TEST(node, snapshotParamsRunning, setUp, tearDown, 0, node_params) { struct fixture *f = data; int rv; rv = dqlite_node_start(f->node); munit_assert_int(rv, ==, 0); rv = dqlite_node_set_snapshot_params(f->node, 2048, 2048); munit_assert_int(rv, !=, 0); rv = dqlite_node_stop(f->node); munit_assert_int(rv, ==, 0); return MUNIT_OK; } TEST(node, snapshotParamsTrailingTooSmall, setUp, tearDown, 0, node_params) { struct fixture *f = data; int rv; rv = dqlite_node_set_snapshot_params(f->node, 2, 2); munit_assert_int(rv, !=, 0); startStopNode(f); return MUNIT_OK; } TEST(node, snapshotParamsThresholdLargerThanTrailing, setUp, tearDown, 0, node_params) { struct fixture *f = data; int rv; rv = dqlite_node_set_snapshot_params(f->node, 2049, 2048); munit_assert_int(rv, !=, 0); startStopNode(f); return MUNIT_OK; } TEST(node, networkLatency, setUp, tearDown, 0, node_params) { struct fixture *f = data; int rv; rv = dqlite_node_set_network_latency(f->node, 3600000000000ULL); munit_assert_int(rv, ==, 0); startStopNode(f); return MUNIT_OK; } TEST(node, networkLatencyRunning, setUp, tearDown, 0, node_params) { struct fixture *f = data; int rv; rv = dqlite_node_start(f->node); munit_assert_int(rv, ==, 0); rv = dqlite_node_set_network_latency(f->node, 3600000000000ULL); munit_assert_int(rv, ==, DQLITE_MISUSE); rv = dqlite_node_stop(f->node); munit_assert_int(rv, ==, 0); return MUNIT_OK; } TEST(node, networkLatencyTooLarge, setUp, tearDown, 0, node_params) { struct fixture *f = data; int rv; rv = dqlite_node_set_network_latency(f->node, 3600000000000ULL + 1ULL); munit_assert_int(rv, ==, DQLITE_MISUSE); startStopNode(f); return MUNIT_OK; } TEST(node, networkLatencyMs, setUp, tearDown, 0, node_params) { struct fixture *f = data; int rv; rv = dqlite_node_set_network_latency_ms(f->node, 5); munit_assert_int(rv, ==, 0); rv = dqlite_node_set_network_latency_ms(f->node, (3600U * 
1000U)); munit_assert_int(rv, ==, 0); startStopNode(f); return MUNIT_OK; } TEST(node, networkLatencyMsRunning, setUp, tearDown, 0, node_params) { struct fixture *f = data; int rv; rv = dqlite_node_start(f->node); munit_assert_int(rv, ==, 0); rv = dqlite_node_set_network_latency_ms(f->node, 2); munit_assert_int(rv, ==, DQLITE_MISUSE); rv = dqlite_node_stop(f->node); munit_assert_int(rv, ==, 0); return MUNIT_OK; } TEST(node, networkLatencyMsTooSmall, setUp, tearDown, 0, node_params) { struct fixture *f = data; int rv; rv = dqlite_node_set_network_latency_ms(f->node, 0); munit_assert_int(rv, ==, DQLITE_MISUSE); startStopNode(f); return MUNIT_OK; } TEST(node, networkLatencyMsTooLarge, setUp, tearDown, 0, node_params) { struct fixture *f = data; int rv; rv = dqlite_node_set_network_latency_ms(f->node, (3600U * 1000U) + 1); munit_assert_int(rv, ==, DQLITE_MISUSE); startStopNode(f); return MUNIT_OK; } TEST(node, blockSize, setUp, tearDown, 0, NULL) { struct fixture *f = data; int rv; rv = dqlite_node_set_block_size(f->node, 0); munit_assert_int(rv, ==, DQLITE_ERROR); rv = dqlite_node_set_block_size(f->node, 1); munit_assert_int(rv, ==, DQLITE_ERROR); rv = dqlite_node_set_block_size(f->node, 511); munit_assert_int(rv, ==, DQLITE_ERROR); rv = dqlite_node_set_block_size(f->node, 1024 * 512); munit_assert_int(rv, ==, DQLITE_ERROR); rv = dqlite_node_set_block_size(f->node, 64 * 1024); munit_assert_int(rv, ==, 0); startStopNode(f); return MUNIT_OK; } TEST(node, blockSizeRunning, setUp, tearDown, 0, NULL) { struct fixture *f = data; int rv; rv = dqlite_node_start(f->node); munit_assert_int(rv, ==, 0); rv = dqlite_node_set_block_size(f->node, 64 * 1024); munit_assert_int(rv, ==, DQLITE_MISUSE); rv = dqlite_node_stop(f->node); munit_assert_int(rv, ==, 0); return MUNIT_OK; } /* Our file locking prevents starting a second dqlite instance that * uses the same directory as a running instance. 
*/ TEST(node, locked, setUp, tearDown, 0, NULL) { struct fixture *f = data; int rv; dqlite_node *node2; rv = dqlite_node_create(2, "2", f->dir, &node2); munit_assert_int(rv, ==, 0); rv = dqlite_node_set_bind_address(node2, "@456"); munit_assert_int(rv, ==, 0); rv = dqlite_node_start(f->node); munit_assert_int(rv, ==, 0); char buf[PATH_MAX]; snprintf(buf, sizeof(buf), "%s/dqlite-lock", f->dir); rv = access(buf, F_OK); munit_assert_int(rv, ==, 0); rv = dqlite_node_start(node2); munit_assert_int(rv, ==, DQLITE_ERROR); munit_assert_string_equal(dqlite_node_errmsg(node2), "couldn't lock the raft directory"); rv = dqlite_node_stop(f->node); munit_assert_int(rv, ==, 0); rv = dqlite_node_start(node2); munit_assert_int(rv, ==, 0); rv = dqlite_node_stop(node2); munit_assert_int(rv, ==, 0); dqlite_node_destroy(node2); return MUNIT_OK; } /****************************************************************************** * * dqlite_node_recover * ******************************************************************************/ TEST(node, recover, setUpForRecovery, tearDown, 0, node_params) { struct fixture *f = data; int rv; /* Setup the infos structs */ static struct dqlite_node_info infos[2] = {0}; infos[0].id = 1; infos[0].address = "1"; infos[1].id = 2; infos[1].address = "2"; rv = dqlite_node_recover(f->node, infos, 2); munit_assert_int(rv, ==, 0); startStopNode(f); return MUNIT_OK; } TEST(node, recoverExt, setUpForRecovery, tearDown, 0, node_params) { struct fixture *f = data; int rv; /* Setup the infos structs */ static struct dqlite_node_info_ext infos[2] = {0}; infos[0].size = sizeof(*infos); infos[0].id = dqlite_generate_node_id("1"); infos[0].address = PTR_TO_UINT64("1"); infos[0].dqlite_role = DQLITE_VOTER; infos[1].size = sizeof(*infos); infos[1].id = dqlite_generate_node_id("2"); infos[1].address = PTR_TO_UINT64("2"); infos[1].dqlite_role = DQLITE_SPARE; rv = dqlite_node_recover_ext(f->node, infos, 2); munit_assert_int(rv, ==, 0); startStopNode(f); return MUNIT_OK; } TEST(node, recoverExtUnaligned, setUpForRecovery, tearDown, 0, node_params) { struct fixture *f = data; int rv; /* Setup the infos structs */ static struct dqlite_node_info_ext infos[1] = {0}; infos[0].size = sizeof(*infos) + 1; /* Unaligned */ infos[0].id = 1; infos[0].address = PTR_TO_UINT64("1"); infos[0].dqlite_role = DQLITE_VOTER; rv = dqlite_node_recover_ext(f->node, infos, 1); munit_assert_int(rv, ==, DQLITE_MISUSE); startStopNode(f); return MUNIT_OK; } TEST(node, recoverExtTooSmall, setUpForRecovery, tearDown, 0, node_params) { struct fixture *f = data; int rv; /* Setup the infos structs */ static struct dqlite_node_info_ext infos[1] = {0}; infos[0].size = DQLITE_NODE_INFO_EXT_SZ_ORIG - 1; infos[0].id = 1; infos[0].address = PTR_TO_UINT64("1"); infos[0].dqlite_role = DQLITE_VOTER; rv = dqlite_node_recover_ext(f->node, infos, 1); munit_assert_int(rv, ==, DQLITE_MISUSE); startStopNode(f); return MUNIT_OK; } struct dqlite_node_info_ext_new { struct dqlite_node_info_ext orig; uint64_t new1; uint64_t new2; }; TEST(node, recoverExtNewFields, setUpForRecovery, tearDown, 0, node_params) { struct fixture *f = data; int rv; /* Setup the infos structs */ static struct dqlite_node_info_ext_new infos[1] = {0}; infos[0].orig.size = sizeof(*infos); infos[0].orig.id = 1; infos[0].orig.address = PTR_TO_UINT64("1"); infos[0].orig.dqlite_role = DQLITE_VOTER; infos[0].new1 = 0; infos[0].new2 = 0; rv = dqlite_node_recover_ext(f->node, (struct dqlite_node_info_ext *)infos, 1); munit_assert_int(rv, ==, 0); startStopNode(f); return MUNIT_OK; }
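/* Illustrative helper (a sketch, not used by any test here): one way a
 * caller could fill a dqlite_node_info_ext record so that it satisfies the
 * contract exercised by the recoverExt* tests -- `size` covers the whole
 * struct and is at least DQLITE_NODE_INFO_EXT_SZ_ORIG, and any trailing
 * fields unknown to the library must be zero. The helper name is made up. */
static inline void example_fill_node_info_ext(struct dqlite_node_info_ext *info,
					      dqlite_node_id id,
					      const char *address)
{
	*info = (struct dqlite_node_info_ext){0}; /* zero unknown trailing fields */
	info->size = sizeof *info;
	info->id = id;
	info->address = PTR_TO_UINT64(address);
	info->dqlite_role = DQLITE_VOTER;
}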
TEST(node, recoverExtNewFieldsNotZero, setUpForRecovery, tearDown, 0, node_params) { struct fixture *f = data; int rv; /* Setup the infos structs */ static struct dqlite_node_info_ext_new infos[1] = {0}; infos[0].orig.size = sizeof(*infos); infos[0].orig.id = 1; infos[0].orig.address = PTR_TO_UINT64("1"); infos[0].orig.dqlite_role = DQLITE_VOTER; infos[0].new1 = 0; infos[0].new2 = 1; /* This will cause a failure */ rv = dqlite_node_recover_ext(f->node, (struct dqlite_node_info_ext *)infos, 1); munit_assert_int(rv, ==, DQLITE_MISUSE); startStopNode(f); return MUNIT_OK; } /****************************************************************************** * * dqlite_node_errmsg * ******************************************************************************/ TEST(node, errMsgNodeNull, NULL, NULL, 0, NULL) { munit_assert_string_equal(dqlite_node_errmsg(NULL), "node is NULL"); return MUNIT_OK; } TEST(node, errMsg, setUp, tearDown, 0, node_params) { struct fixture *f = data; int rv; munit_assert_string_equal(dqlite_node_errmsg(f->node), ""); rv = dqlite_node_start(f->node); munit_assert_int(rv, ==, 0); rv = dqlite_node_stop(f->node); munit_assert_int(rv, ==, 0); return MUNIT_OK; } dqlite-1.16.7/test/integration/test_role_management.c000066400000000000000000000073331465252713400227510ustar00rootroot00000000000000#include "../../src/client/protocol.h" #include "../../src/server.h" #include "../lib/client.h" #include "../lib/endpoint.h" #include "../lib/fs.h" #include "../lib/heap.h" #include "../lib/runner.h" #include "../lib/server.h" #include "../lib/sqlite.h" #include "../lib/util.h" #define N_SERVERS 5 #define FIXTURE \ struct test_server servers[N_SERVERS]; \ struct client_proto *client; \ struct rows rows; #define SETUP \ unsigned i_; \ test_heap_setup(params, user_data); \ test_sqlite_setup(params); \ for (i_ = 0; i_ < N_SERVERS; i_++) { \ struct test_server *server = &f->servers[i_]; \ test_server_setup(server, i_ + 1, params); \ } \ test_server_network(f->servers, N_SERVERS); \ for (i_ = 0; i_ < N_SERVERS; i_++) { \ struct test_server *server = &f->servers[i_]; \ test_server_start(server, params); \ } \ SELECT(1) #define TEAR_DOWN \ unsigned i_; \ for (i_ = 0; i_ < N_SERVERS; i_++) { \ tracef("test_server_tear_down(%u)", i_); \ test_server_tear_down(&f->servers[i_]); \ } \ test_sqlite_tear_down(); \ test_heap_tear_down(data) #define SELECT(ID) f->client = test_server_client(&f->servers[ID - 1]) #define TRIES 5 static char *trueonly[] = {"1", NULL}; static char *threeonly[] = {"3", NULL}; static MunitParameterEnum role_management_params[] = { {"role_management", trueonly}, {"target_voters", threeonly}, {"target_standbys", threeonly}, {NULL, NULL}, }; SUITE(role_management) struct fixture { FIXTURE; }; static void *setUp(const MunitParameter params[], void *user_data) { struct fixture *f = munit_malloc(sizeof *f); SETUP; return f; } static void tearDown(void *data) { struct fixture *f = data; TEAR_DOWN; free(f); } static bool hasRole(struct fixture *f, dqlite_node_id id, int role) { struct client_node_info *servers; uint64_t n_servers; struct client_context context; unsigned i; bool ret = false; int rv; clientContextMillis(&context, 5000); rv = clientSendCluster(f->client, &context); munit_assert_int(rv, ==, 0); rv = clientRecvServers(f->client, &servers, &n_servers, &context); munit_assert_int(rv, ==, 0); for (i = 0; i < n_servers; i += 1) { if (servers[i].id == id) { ret = servers[i].role == role; break; } } for (i = 0; i < n_servers; i += 1) { free(servers[i].addr); } free(servers); return 
ret; } TEST(role_management, promote, setUp, tearDown, 0, role_management_params) { struct fixture *f = data; unsigned id = 2; const char *address = "@2"; int tries; HANDSHAKE; id = 2; address = "@2"; ADD(id, address); for (tries = 0; tries < TRIES && !hasRole(f, 2, DQLITE_VOTER); tries += 1) { sleep(1); } if (tries == TRIES) { return MUNIT_FAIL; }; id = 3; address = "@3"; ADD(id, address); for (tries = 0; tries < TRIES && !hasRole(f, 3, DQLITE_VOTER); tries += 1) { sleep(1); } if (tries == TRIES) { return MUNIT_FAIL; }; id = 4; address = "@4"; ADD(id, address); for (tries = 0; tries < TRIES && !hasRole(f, 4, DQLITE_STANDBY); tries += 1) { sleep(1); } if (tries == TRIES) { return MUNIT_FAIL; }; id = 5; address = "@5"; ADD(id, address); for (tries = 0; tries < TRIES && !hasRole(f, 5, DQLITE_STANDBY); tries += 1) { sleep(1); } if (tries == TRIES) { return MUNIT_FAIL; }; return MUNIT_OK; } dqlite-1.16.7/test/integration/test_server.c000066400000000000000000000147461465252713400211300ustar00rootroot00000000000000#include "../../include/dqlite.h" #include "../../src/server.h" #include "../lib/fs.h" #include "../lib/munit.h" #include "../lib/runner.h" #include #include SUITE(server); #define N_SERVERS 3 struct fixture { char *dirs[N_SERVERS]; dqlite_server *servers[N_SERVERS]; }; static void *setup(const MunitParameter params[], void *user_data) { (void)params; (void)user_data; struct fixture *f = munit_malloc(sizeof *f); unsigned i; int rv; for (i = 0; i < N_SERVERS; i += 1) { f->dirs[i] = test_dir_setup(); rv = dqlite_server_create(f->dirs[i], &f->servers[i]); munit_assert_int(rv, ==, 0); } return f; } static void teardown(void *data) { struct fixture *f = data; unsigned i; for (i = 0; i < N_SERVERS; i += 1) { dqlite_server_destroy(f->servers[i]); test_dir_tear_down(f->dirs[i]); } free(f); } #define PREPARE_FILE(i, name, ...) 
\ do { \ char path[100]; \ snprintf(path, 100, "%s/%s", f->dirs[i], name); \ FILE *fp = fopen(path, "w+"); \ fprintf(fp, __VA_ARGS__); \ fclose(fp); \ } while (0) #define NODE(x) x #define NODE0_ID "3297041220608546238" void start_each_server(struct fixture *f) { const char *addrs[] = {"127.0.0.1:8880", "127.0.0.1:8881"}; int rv; rv = dqlite_server_set_address(f->servers[0], "127.0.0.1:8880"); munit_assert_int(rv, ==, 0); rv = dqlite_server_set_auto_bootstrap(f->servers[0], true); munit_assert_int(rv, ==, 0); f->servers[0]->refresh_period = 100; rv = dqlite_server_start(f->servers[0]); munit_assert_int(rv, ==, 0); rv = dqlite_server_set_address(f->servers[1], "127.0.0.1:8881"); munit_assert_int(rv, ==, 0); rv = dqlite_server_set_auto_join(f->servers[1], addrs, 1); munit_assert_int(rv, ==, 0); f->servers[1]->refresh_period = 100; rv = dqlite_server_start(f->servers[1]); munit_assert_int(rv, ==, 0); rv = dqlite_server_set_address(f->servers[2], "127.0.0.1:8882"); munit_assert_int(rv, ==, 0); rv = dqlite_server_set_auto_join(f->servers[2], addrs, 2); munit_assert_int(rv, ==, 0); f->servers[2]->refresh_period = 100; rv = dqlite_server_start(f->servers[2]); munit_assert_int(rv, ==, 0); } void stop_each_server(struct fixture *f) { int rv; rv = dqlite_server_stop(f->servers[2]); munit_assert_int(rv, ==, 0); rv = dqlite_server_stop(f->servers[1]); munit_assert_int(rv, ==, 0); rv = dqlite_server_stop(f->servers[0]); munit_assert_int(rv, ==, 0); } TEST(server, restart_follower, setup, teardown, 0, NULL) { struct fixture *f = data; struct timespec ts = {0}; int rv; /* Between operations we sleep for 200 milliseconds, twice * the configured refresh period, so that the refresh task * has a chance to be triggered. */ ts.tv_nsec = 200 * 1000 * 1000; start_each_server(f); nanosleep(&ts, NULL); rv = dqlite_server_stop(f->servers[1]); munit_assert_int(rv, ==, 0); nanosleep(&ts, NULL); rv = dqlite_server_start(f->servers[1]); munit_assert_int(rv, ==, 0); nanosleep(&ts, NULL); stop_each_server(f); return MUNIT_OK; } TEST(server, restart_leader, setup, teardown, 0, NULL) { struct fixture *f = data; struct timespec ts = {0}; int rv; /* Between operations we sleep for 200 milliseconds, twice * the configured refresh period, so that the refresh task * has a chance to be triggered. 
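 * (start_each_server() above sets refresh_period = 100 on every server,
 * i.e. 100 ms, so twice that is 2 * 100 ms = 200 ms, written below as
 * 200 * 1000 * 1000 nanoseconds.)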
*/ ts.tv_nsec = 200 * 1000 * 1000; start_each_server(f); nanosleep(&ts, NULL); rv = dqlite_server_stop(f->servers[0]); munit_assert_int(rv, ==, 0); nanosleep(&ts, NULL); rv = dqlite_server_start(f->servers[0]); munit_assert_int(rv, ==, 0); nanosleep(&ts, NULL); stop_each_server(f); return MUNIT_OK; } TEST(server, bad_info_file, setup, teardown, 0, NULL) { struct fixture *f = data; int rv; PREPARE_FILE(NODE(0), "server-info", "blah"); rv = dqlite_server_set_address(f->servers[0], "127.0.0.1:8880"); munit_assert_int(rv, ==, 0); rv = dqlite_server_set_auto_bootstrap(f->servers[0], true); munit_assert_int(rv, ==, 0); rv = dqlite_server_start(f->servers[0]); munit_assert_int(rv, !=, 0); return MUNIT_OK; } TEST(server, bad_node_store, setup, teardown, 0, NULL) { struct fixture *f = data; int rv; PREPARE_FILE(NODE(0), "server-info", "v1\n127.0.0.1:8880\n" NODE0_ID "\n"); PREPARE_FILE(NODE(0), "node-store", "blah"); rv = dqlite_server_set_address(f->servers[0], "127.0.0.1:8880"); munit_assert_int(rv, ==, 0); rv = dqlite_server_set_auto_bootstrap(f->servers[0], true); munit_assert_int(rv, ==, 0); rv = dqlite_server_start(f->servers[0]); munit_assert_int(rv, !=, 0); return MUNIT_OK; } TEST(server, node_store_but_no_info, setup, teardown, 0, NULL) { struct fixture *f = data; int rv; PREPARE_FILE(NODE(0), "node-store", "v1\n127.0.0.1:8880\n" NODE0_ID "\nvoter\n"); rv = dqlite_server_set_address(f->servers[0], "127.0.0.1:8880"); munit_assert_int(rv, ==, 0); rv = dqlite_server_set_auto_bootstrap(f->servers[0], true); munit_assert_int(rv, ==, 0); rv = dqlite_server_start(f->servers[0]); munit_assert_int(rv, !=, 0); return MUNIT_OK; } TEST(server, missing_bootstrap, setup, teardown, 0, NULL) { struct fixture *f = data; const char *addrs[] = {"127.0.0.1:8880"}; int rv; rv = dqlite_server_set_address(f->servers[1], "127.0.0.1:8881"); munit_assert_int(rv, ==, 0); rv = dqlite_server_set_auto_join(f->servers[1], addrs, 1); munit_assert_int(rv, ==, 0); rv = dqlite_server_start(f->servers[1]); munit_assert_int(rv, !=, 0); return MUNIT_OK; } TEST(server, start_twice, setup, teardown, 0, NULL) { struct fixture *f = data; int rv; rv = dqlite_server_set_address(f->servers[0], "127.0.0.1:8880"); munit_assert_int(rv, ==, 0); rv = dqlite_server_set_auto_bootstrap(f->servers[0], true); munit_assert_int(rv, ==, 0); rv = dqlite_server_start(f->servers[0]); munit_assert_int(rv, ==, 0); rv = dqlite_server_start(f->servers[0]); munit_assert_int(rv, !=, 0); rv = dqlite_server_stop(f->servers[0]); munit_assert_int(rv, ==, 0); return MUNIT_OK; } TEST(server, stop_twice, setup, teardown, 0, NULL) { struct fixture *f = data; int rv; rv = dqlite_server_set_address(f->servers[0], "127.0.0.1:8880"); munit_assert_int(rv, ==, 0); rv = dqlite_server_set_auto_bootstrap(f->servers[0], true); munit_assert_int(rv, ==, 0); rv = dqlite_server_start(f->servers[0]); munit_assert_int(rv, ==, 0); rv = dqlite_server_stop(f->servers[0]); munit_assert_int(rv, ==, 0); rv = dqlite_server_stop(f->servers[0]); munit_assert_int(rv, !=, 0); return MUNIT_OK; } dqlite-1.16.7/test/integration/test_vfs.c000066400000000000000000001242701465252713400204120ustar00rootroot00000000000000#include #include "../lib/fs.h" #include "../lib/heap.h" #include "../lib/runner.h" #include "../lib/sqlite.h" #include "../../include/dqlite.h" #include "../../src/raft.h" #include SUITE(vfs); #define N_VFS 2 static char *bools[] = {"0", "1", NULL}; #define SNAPSHOT_SHALLOW_PARAM "snapshot-shallow-param" static MunitParameterEnum vfs_params[] = { {SNAPSHOT_SHALLOW_PARAM, bools}, 
{"disk_mode", bools}, {NULL, NULL}, }; struct fixture { struct sqlite3_vfs vfs[N_VFS]; /* A "cluster" of VFS objects. */ char names[8][N_VFS]; /* Registration names */ char *dirs[N_VFS]; /* For the disk vfs. */ }; static void *setUp(const MunitParameter params[], void *user_data) { struct fixture *f = munit_malloc(sizeof *f); unsigned i; int rv; SETUP_HEAP; SETUP_SQLITE; for (i = 0; i < N_VFS; i++) { f->dirs[i] = NULL; sprintf(f->names[i], "%u", i + 1); rv = dqlite_vfs_init(&f->vfs[i], f->names[i]); munit_assert_int(rv, ==, 0); const char *disk_mode_param = munit_parameters_get(params, "disk_mode"); if (disk_mode_param != NULL) { bool disk_mode = (bool)atoi(disk_mode_param); if (disk_mode) { f->dirs[i] = test_dir_setup(); rv = dqlite_vfs_enable_disk(&f->vfs[i]); munit_assert_int(rv, ==, 0); } } rv = sqlite3_vfs_register(&f->vfs[i], 0); munit_assert_int(rv, ==, 0); } return f; } static void tearDown(void *data) { struct fixture *f = data; unsigned i; int rv; for (i = 0; i < N_VFS; i++) { rv = sqlite3_vfs_unregister(&f->vfs[i]); munit_assert_int(rv, ==, 0); dqlite_vfs_close(&f->vfs[i]); test_dir_tear_down(f->dirs[i]); } TEAR_DOWN_SQLITE; TEAR_DOWN_HEAP; free(f); } extern unsigned dq_sqlite_pending_byte; static void tearDownRestorePendingByte(void *data) { sqlite3_test_control(SQLITE_TESTCTRL_PENDING_BYTE, 0x40000000); dq_sqlite_pending_byte = 0x40000000; tearDown(data); } #define PAGE_SIZE 512 #define PRAGMA(DB, COMMAND) \ _rv = sqlite3_exec(DB, "PRAGMA " COMMAND, NULL, NULL, NULL); \ if (_rv != SQLITE_OK) { \ munit_errorf("PRAGMA " COMMAND ": %s (%d)", \ sqlite3_errmsg(DB), _rv); \ } #define VFS_PATH_SZ 512 static void vfsFillDbPath(struct fixture *f, char *vfs, char *filename, char *path) { int rv; char *dir = f->dirs[atoi(vfs) - 1]; if (dir != NULL) { rv = snprintf(path, VFS_PATH_SZ, "%s/%s", dir, filename); } else { rv = snprintf(path, VFS_PATH_SZ, "%s", filename); } munit_assert_int(rv, >, 0); munit_assert_int(rv, <, VFS_PATH_SZ); } /* Open a new database connection on the given VFS. */ #define OPEN(VFS, DB) \ do { \ int _flags = SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE; \ int _rv; \ char path[VFS_PATH_SZ]; \ struct fixture *f = data; \ vfsFillDbPath(f, VFS, "test.db", path); \ _rv = sqlite3_open_v2(path, &DB, _flags, VFS); \ munit_assert_int(_rv, ==, SQLITE_OK); \ _rv = sqlite3_extended_result_codes(DB, 1); \ munit_assert_int(_rv, ==, SQLITE_OK); \ PRAGMA(DB, "page_size=512"); \ PRAGMA(DB, "synchronous=OFF"); \ PRAGMA(DB, "journal_mode=WAL"); \ PRAGMA(DB, "cache_size=1"); \ _rv = sqlite3_db_config(DB, SQLITE_DBCONFIG_NO_CKPT_ON_CLOSE, \ 1, NULL); \ munit_assert_int(_rv, ==, SQLITE_OK); \ } while (0) /* Close a database connection. */ #define CLOSE(DB) \ do { \ int _rv; \ _rv = sqlite3_close(DB); \ munit_assert_int(_rv, ==, SQLITE_OK); \ } while (0) /* Prepare a statement. */ #define PREPARE(DB, STMT, SQL) \ do { \ int _rv; \ _rv = sqlite3_prepare_v2(DB, SQL, -1, &STMT, NULL); \ if (_rv != SQLITE_OK) { \ munit_errorf("prepare '%s': %s (%d)", SQL, \ sqlite3_errmsg(DB), _rv); \ } \ } while (0) /* Reset a statement. */ #define RESET(STMT, RV) \ do { \ int _rv; \ _rv = sqlite3_reset(STMT); \ munit_assert_int(_rv, ==, RV); \ } while (0) /* Finalize a statement. */ #define FINALIZE(STMT) \ do { \ int _rv; \ _rv = sqlite3_finalize(STMT); \ munit_assert_int(_rv, ==, SQLITE_OK); \ } while (0) /* Shortcut for PREPARE, STEP, FINALIZE. 
*/ #define EXEC(DB, SQL) \ do { \ sqlite3_stmt *_stmt; \ PREPARE(DB, _stmt, SQL); \ STEP(_stmt, SQLITE_DONE); \ FINALIZE(_stmt); \ } while (0) /* Step through a statement and assert that the given value is returned. */ #define STEP(STMT, RV) \ do { \ int _rv; \ _rv = sqlite3_step(STMT); \ if (_rv != RV) { \ munit_errorf("step: %s (%d)", \ sqlite3_errmsg(sqlite3_db_handle(STMT)), \ _rv); \ } \ } while (0) /* Hold WAL replication information about a single transaction. */ struct tx { unsigned n; unsigned long *page_numbers; void *frames; }; /* Poll the given VFS object and serialize the transaction data into the given * tx object. */ #define POLL(VFS, TX) \ do { \ sqlite3_vfs *vfs = sqlite3_vfs_find(VFS); \ dqlite_vfs_frame *_frames; \ unsigned _i; \ int _rv; \ memset(&TX, 0, sizeof TX); \ char path[VFS_PATH_SZ]; \ struct fixture *f = data; \ vfsFillDbPath(f, VFS, "test.db", path); \ _rv = dqlite_vfs_poll(vfs, path, &_frames, &TX.n); \ munit_assert_int(_rv, ==, 0); \ if (_frames != NULL) { \ TX.page_numbers = \ munit_malloc(sizeof *TX.page_numbers * TX.n); \ TX.frames = munit_malloc(PAGE_SIZE * TX.n); \ for (_i = 0; _i < TX.n; _i++) { \ dqlite_vfs_frame *_frame = &_frames[_i]; \ TX.page_numbers[_i] = _frame->page_number; \ memcpy(TX.frames + _i * PAGE_SIZE, \ _frame->data, PAGE_SIZE); \ sqlite3_free(_frame->data); \ } \ sqlite3_free(_frames); \ } \ } while (0) /* Apply WAL frames to the given VFS. */ #define APPLY(VFS, TX) \ do { \ sqlite3_vfs *vfs = sqlite3_vfs_find(VFS); \ int _rv; \ char path[VFS_PATH_SZ]; \ struct fixture *f = data; \ vfsFillDbPath(f, VFS, "test.db", path); \ _rv = dqlite_vfs_apply(vfs, path, TX.n, TX.page_numbers, \ TX.frames); \ munit_assert_int(_rv, ==, 0); \ } while (0) /* Abort a transaction on the given VFS. */ #define ABORT(VFS) \ do { \ sqlite3_vfs *vfs = sqlite3_vfs_find(VFS); \ int _rv; \ char path[VFS_PATH_SZ]; \ struct fixture *f = data; \ vfsFillDbPath(f, VFS, "test.db", path); \ _rv = dqlite_vfs_abort(vfs, path); \ munit_assert_int(_rv, ==, 0); \ } while (0) /* Release all memory used by a struct tx object. */ #define DONE(TX) \ do { \ free(TX.frames); \ free(TX.page_numbers); \ } while (0) /* Perform a full checkpoint on the given database. */ #define CHECKPOINT(DB) \ do { \ int _size; \ int _ckpt; \ int _rv; \ _rv = sqlite3_wal_checkpoint_v2( \ DB, "main", SQLITE_CHECKPOINT_TRUNCATE, &_size, &_ckpt); \ if (_rv != SQLITE_OK) { \ munit_errorf("checkpoint: %s (%d)", \ sqlite3_errmsg(DB), _rv); \ } \ munit_assert_int(_size, ==, 0); \ munit_assert_int(_ckpt, ==, 0); \ } while (0) /* Perform a full checkpoint on a fresh connection, mimicking dqlite's * checkpoint behavior. */ #define CHECKPOINT_FRESH(VFS) \ do { \ sqlite3 *_db; \ OPEN(VFS, _db); \ CHECKPOINT(_db); \ CLOSE(_db); \ } while (0) /* Attempt to perform a full checkpoint on the given database, but fail.
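 * RV is the expected sqlite3_wal_checkpoint_v2() return code; for example,
 * a hypothetical caller blocked by a concurrent reader might expect:
 *
 *   CHECKPOINT_FAIL(db, SQLITE_BUSY);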
*/ #define CHECKPOINT_FAIL(DB, RV) \ do { \ int _size; \ int _ckpt; \ int _rv; \ _rv = sqlite3_wal_checkpoint_v2( \ DB, "main", SQLITE_CHECKPOINT_TRUNCATE, &_size, &_ckpt); \ munit_assert_int(_rv, ==, RV); \ } while (0) struct snapshot { void *data; size_t n; size_t main_size; size_t wal_size; }; /* Copies n dqlite_buffers to a single dqlite buffer */ static struct dqlite_buffer n_bufs_to_buf(struct dqlite_buffer bufs[], unsigned n) { uint8_t *cursor; struct dqlite_buffer buf = {0}; /* Allocate a suitable buffer */ for (unsigned i = 0; i < n; ++i) { buf.len += bufs[i].len; tracef("buf.len %zu", buf.len); } buf.base = raft_malloc(buf.len); munit_assert_ptr_not_null(buf.base); /* Copy all data */ cursor = buf.base; for (unsigned i = 0; i < n; ++i) { memcpy(cursor, bufs[i].base, bufs[i].len); cursor += bufs[i].len; } munit_assert_ullong((uintptr_t)(cursor - (uint8_t *)buf.base), ==, buf.len); return buf; } #define SNAPSHOT_DISK(VFS, SNAPSHOT) \ do { \ sqlite3_vfs *vfs = sqlite3_vfs_find(VFS); \ int _rv; \ unsigned _n; \ struct dqlite_buffer *_bufs; \ struct dqlite_buffer _all_data; \ _n = 2; \ _bufs = sqlite3_malloc64(_n * sizeof(*_bufs)); \ char path[VFS_PATH_SZ]; \ struct fixture *f = data; \ vfsFillDbPath(f, VFS, "test.db", path); \ _rv = dqlite_vfs_snapshot_disk(vfs, path, _bufs, _n); \ munit_assert_int(_rv, ==, 0); \ _all_data = n_bufs_to_buf(_bufs, _n); \ /* Free WAL buffer after copy. */ \ SNAPSHOT.main_size = _bufs[0].len; \ SNAPSHOT.wal_size = _bufs[1].len; \ sqlite3_free(_bufs[1].base); \ munmap(_bufs[0].base, _bufs[0].len); \ sqlite3_free(_bufs); \ SNAPSHOT.data = _all_data.base; \ SNAPSHOT.n = _all_data.len; \ } while (0) /* Take a snapshot of the database on the given VFS. */ #define SNAPSHOT_DEEP(VFS, SNAPSHOT) \ do { \ sqlite3_vfs *vfs = sqlite3_vfs_find(VFS); \ int _rv; \ _rv = dqlite_vfs_snapshot(vfs, "test.db", &SNAPSHOT.data, \ &SNAPSHOT.n); \ munit_assert_int(_rv, ==, 0); \ } while (0) /* Take a shallow snapshot of the database on the given VFS. */ #define SNAPSHOT_SHALLOW(VFS, SNAPSHOT) \ do { \ sqlite3_vfs *vfs = sqlite3_vfs_find(VFS); \ int _rv; \ unsigned _n; \ unsigned _n_pages; \ struct dqlite_buffer *_bufs; \ struct dqlite_buffer _all_data; \ _rv = dqlite_vfs_num_pages(vfs, "test.db", &_n_pages); \ munit_assert_int(_rv, ==, 0); \ _n = _n_pages + 1; /* + 1 for WAL */ \ _bufs = sqlite3_malloc64(_n * sizeof(*_bufs)); \ _rv = dqlite_vfs_shallow_snapshot(vfs, "test.db", _bufs, _n); \ munit_assert_int(_rv, ==, 0); \ _all_data = n_bufs_to_buf(_bufs, _n); \ /* Free WAL buffer after copy. */ \ sqlite3_free(_bufs[_n - 1].base); \ sqlite3_free(_bufs); \ SNAPSHOT.data = _all_data.base; \ SNAPSHOT.n = _all_data.len; \ } while (0) #define SNAPSHOT(VFS, SNAPSHOT) \ do { \ bool _shallow = false; \ bool _disk_mode = false; \ if (munit_parameters_get(params, SNAPSHOT_SHALLOW_PARAM) != \ NULL) { \ _shallow = atoi(munit_parameters_get( \ params, SNAPSHOT_SHALLOW_PARAM)); \ } \ if (munit_parameters_get(params, "disk_mode") != NULL) { \ _disk_mode = \ atoi(munit_parameters_get(params, "disk_mode")); \ } \ if (_shallow && !_disk_mode) { \ SNAPSHOT_SHALLOW(VFS, SNAPSHOT); \ } else if (!_shallow && !_disk_mode) { \ SNAPSHOT_DEEP(VFS, SNAPSHOT); \ } else { \ SNAPSHOT_DISK(VFS, SNAPSHOT); \ } \ } while (0) /* Restore a snapshot onto the given VFS. 
*/ #define RESTORE(VFS, SNAPSHOT) \ do { \ bool _disk_mode = false; \ if (munit_parameters_get(params, "disk_mode") != NULL) { \ _disk_mode = \ atoi(munit_parameters_get(params, "disk_mode")); \ } \ sqlite3_vfs *vfs = sqlite3_vfs_find(VFS); \ int _rv; \ char path[VFS_PATH_SZ]; \ struct fixture *f = data; \ vfsFillDbPath(f, VFS, "test.db", path); \ if (_disk_mode) { \ _rv = dqlite_vfs_restore_disk( \ vfs, path, SNAPSHOT.data, SNAPSHOT.main_size, \ SNAPSHOT.wal_size); \ } else { \ _rv = dqlite_vfs_restore(vfs, path, SNAPSHOT.data, \ SNAPSHOT.n); \ } \ munit_assert_int(_rv, ==, 0); \ } while (0) /* Open and close a new connection using the dqlite VFS. */ TEST(vfs, open, setUp, tearDown, 0, vfs_params) { sqlite3 *db; OPEN("1", db); CLOSE(db); return MUNIT_OK; } /* New frames appended to the WAL file by a sqlite3_step() call that has * triggered a write transactions are not immediately visible to other * connections after sqlite3_step() has returned. */ TEST(vfs, writeTransactionNotImmediatelyVisible, setUp, tearDown, 0, vfs_params) { sqlite3 *db1; sqlite3 *db2; sqlite3_stmt *stmt; int rv; OPEN("1", db1); EXEC(db1, "CREATE TABLE test(n INT)"); OPEN("1", db2); rv = sqlite3_prepare_v2(db2, "SELECT * FROM test", -1, &stmt, NULL); munit_assert_int(rv, ==, SQLITE_ERROR); munit_assert_string_equal(sqlite3_errmsg(db2), "no such table: test"); CLOSE(db1); CLOSE(db2); return MUNIT_OK; } /* Invoking dqlite_vfs_poll() after a call to sqlite3_step() has triggered a * write transaction returns the newly appended WAL frames. */ TEST(vfs, pollAfterWriteTransaction, setUp, tearDown, 0, vfs_params) { sqlite3 *db; sqlite3_stmt *stmt; struct tx tx; unsigned i; OPEN("1", db); PREPARE(db, stmt, "CREATE TABLE test(n INT)"); STEP(stmt, SQLITE_DONE); POLL("1", tx); munit_assert_ptr_not_null(tx.frames); munit_assert_int(tx.n, ==, 2); for (i = 0; i < tx.n; i++) { munit_assert_int(tx.page_numbers[i], ==, i + 1); } DONE(tx); FINALIZE(stmt); CLOSE(db); return MUNIT_OK; } /* Invoking dqlite_vfs_poll() after a call to sqlite3_step() has triggered a * write transaction sets a write lock on the WAL, so calls to sqlite3_step() * from other connections return SQLITE_BUSY if they try to start a write * transaction. */ TEST(vfs, pollAcquireWriteLock, setUp, tearDown, 0, vfs_params) { sqlite3 *db1; sqlite3 *db2; sqlite3_stmt *stmt1; sqlite3_stmt *stmt2; struct tx tx; OPEN("1", db1); OPEN("1", db2); PREPARE(db1, stmt1, "CREATE TABLE test(n INT)"); PREPARE(db2, stmt2, "CREATE TABLE test2(n INT)"); STEP(stmt1, SQLITE_DONE); POLL("1", tx); STEP(stmt2, SQLITE_BUSY); RESET(stmt2, SQLITE_BUSY); FINALIZE(stmt1); FINALIZE(stmt2); CLOSE(db1); CLOSE(db2); DONE(tx); return MUNIT_OK; } /* If the page cache limit is exceeded during a call to sqlite3_step() that has * triggered a write transaction, some WAL frames will be written and then * overwritten before the final commit. Only the final version of the frame is * included in the set returned by dqlite_vfs_poll(). 
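 *
 * In terms of the helper macros above, the cycle under test is roughly:
 *
 *   EXEC(db, "COMMIT");
 *   POLL("1", tx);   // each page number appears at most once in tx,
 *                    // holding the final content of that page
 *   APPLY("1", tx);
 *   DONE(tx);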
*/ TEST(vfs, pollAfterPageStress, setUp, tearDown, 0, vfs_params) { sqlite3 *db; sqlite3_stmt *stmt; struct tx tx; unsigned i; char sql[64]; OPEN("1", db); EXEC(db, "CREATE TABLE test(n INT)"); POLL("1", tx); APPLY("1", tx); DONE(tx); EXEC(db, "BEGIN"); for (i = 0; i < 163; i++) { sprintf(sql, "INSERT INTO test(n) VALUES(%d)", i + 1); EXEC(db, sql); POLL("1", tx); munit_assert_int(tx.n, ==, 0); } for (i = 0; i < 163; i++) { sprintf(sql, "UPDATE test SET n=%d WHERE n=%d", i, i + 1); EXEC(db, sql); POLL("1", tx); munit_assert_int(tx.n, ==, 0); } EXEC(db, "COMMIT"); POLL("1", tx); /* Five frames were replicated and the first frame actually contains a * spill of the third page. */ munit_assert_int(tx.n, ==, 6); munit_assert_int(tx.page_numbers[0], ==, 3); munit_assert_int(tx.page_numbers[1], ==, 4); munit_assert_int(tx.page_numbers[2], ==, 5); munit_assert_int(tx.page_numbers[3], ==, 1); munit_assert_int(tx.page_numbers[4], ==, 2); APPLY("1", tx); DONE(tx); /* All records have been inserted. */ PREPARE(db, stmt, "SELECT * FROM test"); for (i = 0; i < 163; i++) { STEP(stmt, SQLITE_ROW); munit_assert_int(sqlite3_column_int(stmt, 0), ==, i); } STEP(stmt, SQLITE_DONE); FINALIZE(stmt); CLOSE(db); return MUNIT_OK; } /* Set the SQLite PENDING_BYTE at the start of the second page and make sure * all data entry is successful. */ TEST(vfs, adaptPendingByte, setUp, tearDownRestorePendingByte, 0, vfs_params) { sqlite3 *db; sqlite3_stmt *stmt; struct tx tx; int i; int n; char sql[64]; /* Set the pending byte at the start of the second page */ const unsigned new_pending_byte = 512; dq_sqlite_pending_byte = new_pending_byte; sqlite3_test_control(SQLITE_TESTCTRL_PENDING_BYTE, new_pending_byte); OPEN("1", db); EXEC(db, "CREATE TABLE test(n INT)"); POLL("1", tx); APPLY("1", tx); DONE(tx); EXEC(db, "BEGIN"); n = 65536; for (i = 0; i < n; i++) { sprintf(sql, "INSERT INTO test(n) VALUES(%d)", i); EXEC(db, sql); POLL("1", tx); munit_assert_uint(tx.n, ==, 0); } EXEC(db, "COMMIT"); POLL("1", tx); APPLY("1", tx); DONE(tx); /* All records have been inserted. */ PREPARE(db, stmt, "SELECT * FROM test"); for (i = 0; i < n; i++) { STEP(stmt, SQLITE_ROW); munit_assert_int(sqlite3_column_int(stmt, 0), ==, i); } STEP(stmt, SQLITE_DONE); FINALIZE(stmt); CLOSE(db); return MUNIT_OK; } /* Use dqlite_vfs_apply() to actually modify the WAL after a write transaction * was triggered by a call to sqlite3_step(), then perform a read transaction * and check that it can see the transaction changes. */ TEST(vfs, applyMakesTransactionVisible, setUp, tearDown, 0, vfs_params) { sqlite3 *db; sqlite3_stmt *stmt; struct tx tx; OPEN("1", db); EXEC(db, "CREATE TABLE test(n INT)"); POLL("1", tx); APPLY("1", tx); DONE(tx); PREPARE(db, stmt, "SELECT * FROM test"); STEP(stmt, SQLITE_DONE); FINALIZE(stmt); CLOSE(db); return MUNIT_OK; } /* Use dqlite_vfs_apply() to actually modify the WAL after a write transaction * was triggered by an explicit "COMMIT" statement and check that changes are * visible. 
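 *
 * Until the COMMIT runs, each intermediate sqlite3_step() leaves nothing
 * to replicate, so dqlite_vfs_poll() is expected to report zero frames;
 * only the COMMIT itself produces the two WAL frames that get applied.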
*/ TEST(vfs, applyExplicitTransaction, setUp, tearDown, 0, vfs_params) { sqlite3 *db; sqlite3_stmt *stmt; struct tx tx; OPEN("1", db); PREPARE(db, stmt, "BEGIN"); STEP(stmt, SQLITE_DONE); POLL("1", tx); munit_assert_int(tx.n, ==, 0); FINALIZE(stmt); PREPARE(db, stmt, "CREATE TABLE test(n INT)"); STEP(stmt, SQLITE_DONE); POLL("1", tx); munit_assert_int(tx.n, ==, 0); FINALIZE(stmt); PREPARE(db, stmt, "COMMIT"); STEP(stmt, SQLITE_DONE); POLL("1", tx); munit_assert_int(tx.n, ==, 2); APPLY("1", tx); DONE(tx); FINALIZE(stmt); PREPARE(db, stmt, "SELECT * FROM test"); STEP(stmt, SQLITE_DONE); FINALIZE(stmt); CLOSE(db); return MUNIT_OK; } /* Perform two consecutive full write transactions using sqlite3_step(), * dqlite_vfs_poll() and dqlite_vfs_apply(), then run a read transaction and * check that it can see all committed changes. */ TEST(vfs, consecutiveWriteTransactions, setUp, tearDown, 0, vfs_params) { sqlite3 *db; sqlite3_stmt *stmt; struct tx tx; OPEN("1", db); EXEC(db, "CREATE TABLE test(n INT)"); POLL("1", tx); APPLY("1", tx); DONE(tx); EXEC(db, "INSERT INTO test(n) VALUES(123)"); POLL("1", tx); APPLY("1", tx); DONE(tx); PREPARE(db, stmt, "SELECT * FROM test"); STEP(stmt, SQLITE_ROW); munit_assert_int(sqlite3_column_int(stmt, 0), ==, 123); STEP(stmt, SQLITE_DONE); FINALIZE(stmt); CLOSE(db); return MUNIT_OK; } /* Perform three consecutive write transactions, then re-open the database and * finally run a read transaction and check that it can see all committed * changes. */ TEST(vfs, reopenAfterConsecutiveWriteTransactions, setUp, tearDown, 0, vfs_params) { sqlite3 *db; sqlite3_stmt *stmt; struct tx tx; OPEN("1", db); EXEC(db, "CREATE TABLE foo(id INT)"); POLL("1", tx); APPLY("1", tx); DONE(tx); EXEC(db, "CREATE TABLE bar (id INT)"); POLL("1", tx); APPLY("1", tx); DONE(tx); EXEC(db, "INSERT INTO foo(id) VALUES(1)"); POLL("1", tx); APPLY("1", tx); DONE(tx); CLOSE(db); OPEN("1", db); PREPARE(db, stmt, "SELECT * FROM sqlite_master"); STEP(stmt, SQLITE_ROW); FINALIZE(stmt); CLOSE(db); return MUNIT_OK; } /* Use dqlite_vfs_apply() to actually modify the WAL after a write transaction * was triggered by sqlite3_step(), and verify that the transaction is visible * from another existing connection. */ TEST(vfs, transactionIsVisibleFromExistingConnection, setUp, tearDown, 0, vfs_params) { sqlite3 *db1; sqlite3 *db2; sqlite3_stmt *stmt; struct tx tx; OPEN("1", db1); OPEN("1", db2); EXEC(db1, "CREATE TABLE test(n INT)"); POLL("1", tx); APPLY("1", tx); DONE(tx); PREPARE(db2, stmt, "SELECT * FROM test"); STEP(stmt, SQLITE_DONE); FINALIZE(stmt); CLOSE(db1); CLOSE(db2); return MUNIT_OK; } /* Use dqlite_vfs_apply() to actually modify the WAL after a write transaction * was triggered by sqlite3_step(), and verify that the transaction is visible * from a brand new connection. */ TEST(vfs, transactionIsVisibleFromNewConnection, setUp, tearDown, 0, vfs_params) { sqlite3 *db1; sqlite3 *db2; sqlite3_stmt *stmt; struct tx tx; OPEN("1", db1); EXEC(db1, "CREATE TABLE test(n INT)"); POLL("1", tx); APPLY("1", tx); DONE(tx); OPEN("1", db2); PREPARE(db2, stmt, "SELECT * FROM test"); STEP(stmt, SQLITE_DONE); FINALIZE(stmt); CLOSE(db1); CLOSE(db2); return MUNIT_OK; } /* Use dqlite_vfs_apply() to actually modify the WAL after a write transaction * was triggered by sqlite3_step(), then close the connection and open a new * one. A read transaction started in the new connection can see the changes * committed by the first one. 
*/ TEST(vfs, transactionIsVisibleFromReopenedConnection, setUp, tearDown, 0, vfs_params) { sqlite3 *db; sqlite3_stmt *stmt; struct tx tx; OPEN("1", db); EXEC(db, "CREATE TABLE test(n INT)"); POLL("1", tx); APPLY("1", tx); DONE(tx); CLOSE(db); OPEN("1", db); PREPARE(db, stmt, "SELECT * FROM test"); STEP(stmt, SQLITE_DONE); FINALIZE(stmt); CLOSE(db); return MUNIT_OK; } /* Use dqlite_vfs_apply() to replicate the very first write transaction on a * different VFS than the one that initially generated it. In that case it's * necessary to initialize the database file on the other VFS by opening and * closing a connection. */ TEST(vfs, firstApplyOnDifferentVfs, setUp, tearDown, 0, vfs_params) { sqlite3 *db1; sqlite3 *db2; sqlite3_stmt *stmt; struct tx tx; OPEN("1", db1); PREPARE(db1, stmt, "CREATE TABLE test(n INT)"); STEP(stmt, SQLITE_DONE); POLL("1", tx); APPLY("1", tx); OPEN("2", db2); CLOSE(db2); APPLY("2", tx); DONE(tx); FINALIZE(stmt); CLOSE(db1); return MUNIT_OK; } /* Use dqlite_vfs_apply() to replicate a second write transaction on a different * VFS than the one that initially generated it. In that case it's not necessary * to do anything special before calling dqlite_vfs_apply(). */ TEST(vfs, secondApplyOnDifferentVfs, setUp, tearDown, 0, vfs_params) { sqlite3 *db1; sqlite3 *db2; struct tx tx; OPEN("1", db1); EXEC(db1, "CREATE TABLE test(n INT)"); POLL("1", tx); APPLY("1", tx); OPEN("2", db2); CLOSE(db2); APPLY("2", tx); DONE(tx); EXEC(db1, "INSERT INTO test(n) VALUES(123)"); POLL("1", tx); APPLY("1", tx); APPLY("2", tx); DONE(tx); CLOSE(db1); return MUNIT_OK; } /* Use dqlite_vfs_apply() to replicate a second write transaction on a different * VFS than the one that initially generated it and that has an open connection * which has built the WAL index header by preparing a statement. */ TEST(vfs, applyOnDifferentVfsWithOpenConnection, setUp, tearDown, 0, vfs_params) { sqlite3 *db1; sqlite3 *db2; sqlite3_stmt *stmt; struct tx tx; OPEN("1", db1); PREPARE(db1, stmt, "CREATE TABLE test(n INT)"); STEP(stmt, SQLITE_DONE); FINALIZE(stmt); POLL("1", tx); APPLY("1", tx); OPEN("2", db2); CLOSE(db2); APPLY("2", tx); DONE(tx); EXEC(db1, "INSERT INTO test(n) VALUES(123)"); POLL("1", tx); CLOSE(db1); OPEN("2", db2); PREPARE(db2, stmt, "PRAGMA cache_size=-5000"); FINALIZE(stmt); APPLY("2", tx); PREPARE(db2, stmt, "SELECT * FROM test"); STEP(stmt, SQLITE_ROW); FINALIZE(stmt); DONE(tx); CLOSE(db2); return MUNIT_OK; } /* A write transaction that gets replicated to a different VFS is visible to a * new connection opened on that VFS. */ TEST(vfs, transactionVisibleOnDifferentVfs, setUp, tearDown, 0, vfs_params) { sqlite3 *db1; sqlite3 *db2; sqlite3_stmt *stmt; struct tx tx; OPEN("1", db1); EXEC(db1, "CREATE TABLE test(n INT)"); POLL("1", tx); APPLY("1", tx); OPEN("2", db2); CLOSE(db2); APPLY("2", tx); DONE(tx); CLOSE(db1); OPEN("2", db1); PREPARE(db1, stmt, "SELECT * FROM test"); STEP(stmt, SQLITE_DONE); FINALIZE(stmt); CLOSE(db1); return MUNIT_OK; } /* Calling dqlite_vfs_abort() to cancel a transaction releases the write * lock on the WAL. 
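 *
 * Sketch of the sequence being verified (helper macros from this file):
 *
 *   STEP(stmt1, SQLITE_DONE);  // connection 1 writes, WAL lock taken
 *   POLL("1", tx);             // frames polled, lock still held
 *   ABORT("1");                // cancel: the write lock is released
 *   STEP(stmt2, SQLITE_DONE);  // a second connection can write again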
*/ TEST(vfs, abort, setUp, tearDown, 0, vfs_params) { sqlite3 *db1; sqlite3 *db2; sqlite3_stmt *stmt1; sqlite3_stmt *stmt2; struct tx tx; OPEN("1", db1); OPEN("1", db2); PREPARE(db1, stmt1, "CREATE TABLE test(n INT)"); PREPARE(db2, stmt2, "CREATE TABLE test2(n INT)"); STEP(stmt1, SQLITE_DONE); POLL("1", tx); ABORT("1"); STEP(stmt2, SQLITE_DONE); FINALIZE(stmt1); FINALIZE(stmt2); CLOSE(db1); CLOSE(db2); DONE(tx); return MUNIT_OK; } /* Perform a checkpoint after a write transaction has completed, then perform * another write transaction and check that changes both before and after the * checkpoint are visible. */ TEST(vfs, checkpoint, setUp, tearDown, 0, vfs_params) { sqlite3 *db1; sqlite3 *db2; sqlite3_stmt *stmt; struct tx tx; OPEN("1", db1); EXEC(db1, "CREATE TABLE test(n INT)"); POLL("1", tx); APPLY("1", tx); DONE(tx); EXEC(db1, "INSERT INTO test(n) VALUES(123)"); POLL("1", tx); APPLY("1", tx); DONE(tx); OPEN("1", db2); CHECKPOINT(db2); CLOSE(db2); EXEC(db1, "INSERT INTO test(n) VALUES(456)"); POLL("1", tx); APPLY("1", tx); DONE(tx); PREPARE(db1, stmt, "SELECT * FROM test"); STEP(stmt, SQLITE_ROW); munit_assert_int(sqlite3_column_int(stmt, 0), ==, 123); STEP(stmt, SQLITE_ROW); munit_assert_int(sqlite3_column_int(stmt, 0), ==, 456); STEP(stmt, SQLITE_DONE); FINALIZE(stmt); CLOSE(db1); return MUNIT_OK; } /* Replicate a write transaction that happens after a checkpoint. */ TEST(vfs, applyOnDifferentVfsAfterCheckpoint, setUp, tearDown, 0, vfs_params) { sqlite3 *db; sqlite3_stmt *stmt; struct tx tx1; struct tx tx2; struct tx tx3; OPEN("1", db); EXEC(db, "CREATE TABLE test(n INT)"); POLL("1", tx1); APPLY("1", tx1); EXEC(db, "INSERT INTO test(n) VALUES(123)"); POLL("1", tx2); APPLY("1", tx2); CHECKPOINT(db); EXEC(db, "INSERT INTO test(n) VALUES(456)"); POLL("1", tx3); APPLY("1", tx3); CLOSE(db); OPEN("2", db); CLOSE(db); APPLY("2", tx1); APPLY("2", tx2); OPEN("2", db); CHECKPOINT(db); CLOSE(db); APPLY("2", tx3); OPEN("2", db); PREPARE(db, stmt, "SELECT * FROM test ORDER BY n"); STEP(stmt, SQLITE_ROW); munit_assert_int(sqlite3_column_int(stmt, 0), ==, 123); STEP(stmt, SQLITE_ROW); munit_assert_int(sqlite3_column_int(stmt, 0), ==, 456); STEP(stmt, SQLITE_DONE); FINALIZE(stmt); CLOSE(db); DONE(tx1); DONE(tx2); DONE(tx3); return MUNIT_OK; } /* Replicate a write transaction that happens after a checkpoint, without * performing the checkpoint on the replicated DB. */ TEST(vfs, applyOnDifferentVfsAfterCheckpointOtherVfsNoCheckpoint, setUp, tearDown, 0, vfs_params) { sqlite3 *db; sqlite3_stmt *stmt; struct tx tx1; struct tx tx2; struct tx tx3; struct tx tx4; /* Create transactions and checkpoint the DB after every transaction */ OPEN("1", db); EXEC(db, "CREATE TABLE test(n INT)"); POLL("1", tx1); APPLY("1", tx1); CHECKPOINT_FRESH("1"); EXEC(db, "CREATE TABLE test2(n INT)"); POLL("1", tx2); APPLY("1", tx2); CHECKPOINT_FRESH("1"); EXEC(db, "INSERT INTO test(n) VALUES(123)"); POLL("1", tx3); APPLY("1", tx3); CHECKPOINT_FRESH("1"); EXEC(db, "INSERT INTO test2(n) VALUES(456)"); POLL("1", tx4); APPLY("1", tx4); CHECKPOINT_FRESH("1"); CLOSE(db); /* Create a second VFS and Apply the transactions without checkpointing * the DB in between. */ OPEN("2", db); APPLY("2", tx1); APPLY("2", tx2); APPLY("2", tx3); APPLY("2", tx4); /* Ensure data is there. 
*/ PREPARE(db, stmt, "SELECT * FROM test ORDER BY n"); STEP(stmt, SQLITE_ROW); munit_assert_int(sqlite3_column_int(stmt, 0), ==, 123); STEP(stmt, SQLITE_DONE); FINALIZE(stmt); PREPARE(db, stmt, "SELECT * FROM test2 ORDER BY n"); STEP(stmt, SQLITE_ROW); munit_assert_int(sqlite3_column_int(stmt, 0), ==, 456); STEP(stmt, SQLITE_DONE); FINALIZE(stmt); /* Make sure checkpoint succeeds */ CHECKPOINT_FRESH("2"); CLOSE(db); DONE(tx1); DONE(tx2); DONE(tx3); DONE(tx4); return MUNIT_OK; } /* Replicate a write transaction that happens before a checkpoint, and is * replicated on a DB that has been checkpointed. */ TEST(vfs, applyOnDifferentVfsExtraCheckpointsOnOtherVfs, setUp, tearDown, 0, vfs_params) { sqlite3 *db; sqlite3_stmt *stmt; struct tx tx1; struct tx tx2; struct tx tx3; struct tx tx4; /* Create transactions */ OPEN("1", db); EXEC(db, "CREATE TABLE test(n INT)"); POLL("1", tx1); APPLY("1", tx1); EXEC(db, "CREATE TABLE test2(n INT)"); POLL("1", tx2); APPLY("1", tx2); EXEC(db, "INSERT INTO test(n) VALUES(123)"); POLL("1", tx3); APPLY("1", tx3); EXEC(db, "INSERT INTO test2(n) VALUES(456)"); POLL("1", tx4); APPLY("1", tx4); CLOSE(db); /* Create a second VFS and Apply the transactions while checkpointing * after every transaction. */ OPEN("2", db); CLOSE(db); APPLY("2", tx1); CHECKPOINT_FRESH("2"); APPLY("2", tx2); CHECKPOINT_FRESH("2"); APPLY("2", tx3); CHECKPOINT_FRESH("2"); APPLY("2", tx4); CHECKPOINT_FRESH("2"); /* Ensure all the data is there. */ OPEN("2", db); PREPARE(db, stmt, "SELECT * FROM test ORDER BY n"); STEP(stmt, SQLITE_ROW); munit_assert_int(sqlite3_column_int(stmt, 0), ==, 123); STEP(stmt, SQLITE_DONE); FINALIZE(stmt); PREPARE(db, stmt, "SELECT * FROM test2 ORDER BY n"); STEP(stmt, SQLITE_ROW); munit_assert_int(sqlite3_column_int(stmt, 0), ==, 456); STEP(stmt, SQLITE_DONE); FINALIZE(stmt); CLOSE(db); DONE(tx1); DONE(tx2); DONE(tx3); DONE(tx4); return MUNIT_OK; } /* Replicate to another VFS a series of changes including a checkpoint, then * perform a new write transaction on that other VFS. */ TEST(vfs, checkpointThenPerformTransaction, setUp, tearDown, 0, vfs_params) { sqlite3 *db1; struct tx tx1; struct tx tx2; struct tx tx3; OPEN("1", db1); EXEC(db1, "CREATE TABLE test(n INT)"); POLL("1", tx1); APPLY("1", tx1); EXEC(db1, "INSERT INTO test(n) VALUES(123)"); POLL("1", tx2); APPLY("1", tx2); CHECKPOINT(db1); EXEC(db1, "INSERT INTO test(n) VALUES(456)"); POLL("1", tx3); APPLY("1", tx3); CLOSE(db1); OPEN("2", db1); APPLY("2", tx1); APPLY("2", tx2); CHECKPOINT_FRESH("2"); APPLY("2", tx3); DONE(tx1); DONE(tx2); DONE(tx3); EXEC(db1, "INSERT INTO test(n) VALUES(789)"); POLL("2", tx1); APPLY("2", tx1); DONE(tx1); CLOSE(db1); return MUNIT_OK; } /* Rollback a transaction that didn't hit the page cache limit and hence didn't * perform any pre-commit WAL writes. */ TEST(vfs, rollbackTransactionWithoutPageStress, setUp, tearDown, 0, vfs_params) { sqlite3 *db; struct tx tx; sqlite3_stmt *stmt; OPEN("1", db); EXEC(db, "CREATE TABLE test(n INT)"); POLL("1", tx); APPLY("1", tx); DONE(tx); EXEC(db, "BEGIN"); EXEC(db, "INSERT INTO test(n) VALUES(1)"); EXEC(db, "ROLLBACK"); POLL("1", tx); munit_assert_int(tx.n, ==, 0); PREPARE(db, stmt, "SELECT * FROM test"); STEP(stmt, SQLITE_DONE); RESET(stmt, SQLITE_OK); EXEC(db, "INSERT INTO test(n) VALUES(1)"); POLL("1", tx); APPLY("1", tx); DONE(tx); STEP(stmt, SQLITE_ROW); FINALIZE(stmt); CLOSE(db); return MUNIT_OK; } /* Rollback a transaction that hit the page cache limit and hence performed some * pre-commit WAL writes. 
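 *
 * (As in the page-stress tests above, 163 single-row inserts at this
 * suite's 512-byte page size are enough to exceed the page cache limit
 * and force SQLite to spill frames to the WAL before COMMIT; here the
 * ROLLBACK must discard those spilled frames.)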
*/ TEST(vfs, rollbackTransactionWithPageStress, setUp, tearDown, 0, vfs_params) { sqlite3 *db; sqlite3_stmt *stmt; struct tx tx; unsigned i; OPEN("1", db); EXEC(db, "CREATE TABLE test(n INT)"); POLL("1", tx); APPLY("1", tx); DONE(tx); EXEC(db, "BEGIN"); for (i = 0; i < 163; i++) { char sql[64]; sprintf(sql, "INSERT INTO test(n) VALUES(%d)", i + 1); EXEC(db, sql); POLL("1", tx); munit_assert_int(tx.n, ==, 0); } EXEC(db, "ROLLBACK"); POLL("1", tx); munit_assert_int(tx.n, ==, 0); PREPARE(db, stmt, "SELECT * FROM test"); STEP(stmt, SQLITE_DONE); RESET(stmt, SQLITE_OK); EXEC(db, "INSERT INTO test(n) VALUES(1)"); POLL("1", tx); APPLY("1", tx); DONE(tx); STEP(stmt, SQLITE_ROW); FINALIZE(stmt); CLOSE(db); return MUNIT_OK; } /* Try and fail to checkpoint a WAL that performed some pre-commit WAL writes. */ TEST(vfs, checkpointTransactionWithPageStress, setUp, tearDown, 0, vfs_params) { sqlite3 *db; struct tx tx; unsigned i; OPEN("1", db); EXEC(db, "CREATE TABLE test(n INT)"); POLL("1", tx); APPLY("1", tx); DONE(tx); EXEC(db, "BEGIN"); for (i = 0; i < 163; i++) { char sql[64]; sprintf(sql, "INSERT INTO test(n) VALUES(%d)", i + 1); EXEC(db, sql); POLL("1", tx); munit_assert_int(tx.n, ==, 0); } CHECKPOINT_FAIL(db, SQLITE_LOCKED); CLOSE(db); return MUNIT_OK; } /* A snapshot of a brand new database that has been just initialized contains * just the first page of the main database file. */ TEST(vfs, snapshotInitialDatabase, setUp, tearDown, 0, vfs_params) { sqlite3 *db; struct snapshot snapshot; uint8_t *page; uint8_t page_size[2] = {2, 0}; /* Big-endian page size */ uint8_t database_size[4] = {0, 0, 0, 1}; /* Big-endian database size */ OPEN("1", db); CLOSE(db); SNAPSHOT("1", snapshot); munit_assert_int(snapshot.n, ==, PAGE_SIZE); page = snapshot.data; munit_assert_int(memcmp(&page[16], page_size, 2), ==, 0); munit_assert_int(memcmp(&page[28], database_size, 4), ==, 0); raft_free(snapshot.data); return MUNIT_OK; } /* A snapshot of a database after the first write transaction gets applied * contains the first page of the database plus the WAL file containing the * transaction frames. */ TEST(vfs, snapshotAfterFirstTransaction, setUp, tearDown, 0, vfs_params) { sqlite3 *db; struct snapshot snapshot; struct tx tx; uint8_t *page; uint8_t page_size[2] = {2, 0}; /* Big-endian page size */ uint8_t database_size[4] = {0, 0, 0, 1}; /* Big-endian database size */ OPEN("1", db); EXEC(db, "CREATE TABLE test(n INT)"); POLL("1", tx); APPLY("1", tx); DONE(tx); CLOSE(db); SNAPSHOT("1", snapshot); munit_assert_int(snapshot.n, ==, PAGE_SIZE + 32 + (24 + PAGE_SIZE) * 2); page = snapshot.data; munit_assert_int(memcmp(&page[16], page_size, 2), ==, 0); munit_assert_int(memcmp(&page[28], database_size, 4), ==, 0); raft_free(snapshot.data); return MUNIT_OK; } /* A snapshot of a database after a checkpoint contains all checkpointed pages * and no WAL frames. 
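 *
 * For reference, the size assertions in these snapshot tests follow the
 * WAL format: a WAL holding N frames takes 32 bytes of WAL header plus
 * N * (24 + PAGE_SIZE) bytes, 24 being the per-frame header size. The
 * previous test therefore expects PAGE_SIZE + 32 + (24 + PAGE_SIZE) * 2
 * for one database page plus a two-frame WAL, while here a checkpointed
 * two-page database with an empty WAL is just PAGE_SIZE * 2.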
*/ TEST(vfs, snapshotAfterCheckpoint, setUp, tearDown, 0, vfs_params) { sqlite3 *db; struct snapshot snapshot; struct tx tx; uint8_t *page; uint8_t page_size[2] = {2, 0}; /* Big-endian page size */ uint8_t database_size[4] = {0, 0, 0, 2}; /* Big-endian database size */ OPEN("1", db); EXEC(db, "CREATE TABLE test(n INT)"); POLL("1", tx); APPLY("1", tx); DONE(tx); CHECKPOINT(db); CLOSE(db); SNAPSHOT("1", snapshot); munit_assert_int(snapshot.n, ==, PAGE_SIZE * 2); page = snapshot.data; munit_assert_int(memcmp(&page[16], page_size, 2), ==, 0); munit_assert_int(memcmp(&page[28], database_size, 4), ==, 0); raft_free(snapshot.data); return MUNIT_OK; } /* Restore a snapshot taken after a brand new database has been just * initialized. */ TEST(vfs, restoreInitialDatabase, setUp, tearDown, 0, vfs_params) { sqlite3 *db; struct snapshot snapshot; OPEN("1", db); CLOSE(db); SNAPSHOT("1", snapshot); OPEN("2", db); CLOSE(db); RESTORE("2", snapshot); raft_free(snapshot.data); return MUNIT_OK; } /* Restore a snapshot of a database taken after the first write transaction gets * applied. */ TEST(vfs, restoreAfterFirstTransaction, setUp, tearDown, 0, vfs_params) { sqlite3 *db; sqlite3_stmt *stmt; struct snapshot snapshot; struct tx tx; OPEN("1", db); EXEC(db, "CREATE TABLE test(n INT)"); POLL("1", tx); APPLY("1", tx); DONE(tx); CLOSE(db); SNAPSHOT("1", snapshot); OPEN("2", db); CLOSE(db); RESTORE("2", snapshot); OPEN("2", db); PREPARE(db, stmt, "SELECT * FROM test"); STEP(stmt, SQLITE_DONE); FINALIZE(stmt); CLOSE(db); raft_free(snapshot.data); return MUNIT_OK; } /* Restore a snapshot of a database while a connection is open. */ TEST(vfs, restoreWithOpenConnection, setUp, tearDown, 0, vfs_params) { sqlite3 *db; sqlite3_stmt *stmt; struct snapshot snapshot; struct tx tx; OPEN("1", db); EXEC(db, "CREATE TABLE test(n INT)"); POLL("1", tx); APPLY("1", tx); DONE(tx); CLOSE(db); SNAPSHOT("1", snapshot); OPEN("2", db); RESTORE("2", snapshot); PREPARE(db, stmt, "SELECT * FROM test"); STEP(stmt, SQLITE_DONE); FINALIZE(stmt); CLOSE(db); raft_free(snapshot.data); return MUNIT_OK; } /* Changing page_size to non-default value fails. */ TEST(vfs, changePageSize, setUp, tearDown, 0, vfs_params) { sqlite3 *db; int rv; OPEN("1", db); rv = sqlite3_exec(db, "PRAGMA page_size=1024", NULL, NULL, NULL); munit_assert_int(rv, !=, 0); CLOSE(db); return MUNIT_OK; } /* Changing page_size to current value succeeds. */ TEST(vfs, changePageSizeSameValue, setUp, tearDown, 0, vfs_params) { sqlite3 *db; int rv; OPEN("1", db); rv = sqlite3_exec(db, "PRAGMA page_size=512", NULL, NULL, NULL); munit_assert_int(rv, ==, 0); CLOSE(db); return MUNIT_OK; } dqlite-1.16.7/test/lib/000077500000000000000000000000001465252713400146265ustar00rootroot00000000000000dqlite-1.16.7/test/lib/client.h000066400000000000000000000214001465252713400162520ustar00rootroot00000000000000/* Setup a test dqlite client. 
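 *
 * Typical usage in a test file (sketch; the FIXTURE_CLIENT, SETUP_CLIENT
 * and TEAR_DOWN_CLIENT macros are defined below):
 *
 *   struct fixture {
 *           FIXTURE_CLIENT;
 *   };
 *   // test setup:    SETUP_CLIENT;
 *   // test teardown: TEAR_DOWN_CLIENT;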
*/ #include "endpoint.h" #ifndef TEST_CLIENT_H #define TEST_CLIENT_H #define FIXTURE_CLIENT \ struct client_proto client; \ struct test_endpoint endpoint; \ int server #define SETUP_CLIENT \ { \ int _rv; \ int _client; \ test_endpoint_setup(&f->endpoint, params); \ _rv = listen(f->endpoint.fd, 16); \ munit_assert_int(_rv, ==, 0); \ test_endpoint_pair(&f->endpoint, &f->server, &_client); \ memset(&f->client, 0, sizeof f->client); \ buffer__init(&f->client.read); \ buffer__init(&f->client.write); \ f->client.fd = _client; \ } #define TEAR_DOWN_CLIENT \ clientClose(&f->client); \ test_endpoint_tear_down(&f->endpoint) /****************************************************************************** * * Helper macros. * ******************************************************************************/ /* Send the initial client handshake. */ #define HANDSHAKE \ { \ int rv_; \ rv_ = clientSendHandshake(f->client, NULL); \ munit_assert_int(rv_, ==, 0); \ } /* Send the initial client handshake for a specific client. */ #define HANDSHAKE_C(CLIENT) \ { \ int rv_; \ rv_ = clientSendHandshake(CLIENT, NULL); \ munit_assert_int(rv_, ==, 0); \ } /* Send an add request. */ #define ADD(ID, ADDRESS) \ { \ int rv_; \ rv_ = clientSendAdd(f->client, ID, ADDRESS, NULL); \ munit_assert_int(rv_, ==, 0); \ rv_ = clientRecvEmpty(f->client, NULL); \ munit_assert_int(rv_, ==, 0); \ } /* Send an assign role request. */ #define ASSIGN(ID, ROLE) \ { \ int rv_; \ rv_ = clientSendAssign(f->client, ID, ROLE, NULL); \ munit_assert_int(rv_, ==, 0); \ rv_ = clientRecvEmpty(f->client, NULL); \ munit_assert_int(rv_, ==, 0); \ } /* Send a remove request. */ #define REMOVE(ID) \ { \ int rv_; \ rv_ = clientSendRemove(f->client, ID, NULL); \ munit_assert_int(rv_, ==, 0); \ rv_ = clientRecvEmpty(f->client, NULL); \ munit_assert_int(rv_, ==, 0); \ } /* Send a transfer request. */ #define TRANSFER(ID, CLIENT) \ { \ int rv_; \ rv_ = clientSendTransfer(CLIENT, ID, NULL); \ munit_assert_int(rv_, ==, 0); \ rv_ = clientRecvEmpty(CLIENT, NULL); \ munit_assert_int(rv_, ==, 0); \ } /* Open a test database. */ #define OPEN \ { \ int rv_; \ rv_ = clientSendOpen(f->client, "test", NULL); \ munit_assert_int(rv_, ==, 0); \ rv_ = clientRecvDb(f->client, NULL); \ munit_assert_int(rv_, ==, 0); \ } /* Open a test database with a specific name. */ #define OPEN_NAME(NAME) \ { \ int rv_; \ rv_ = clientSendOpen(f->client, NAME, NULL); \ munit_assert_int(rv_, ==, 0); \ rv_ = clientRecvDb(f->client, NULL); \ munit_assert_int(rv_, ==, 0); \ } /* Prepare a statement. */ #define PREPARE(SQL, STMT_ID) \ { \ int rv_; \ rv_ = clientSendPrepare(f->client, SQL, NULL); \ munit_assert_int(rv_, ==, 0); \ rv_ = clientRecvStmt(f->client, STMT_ID, NULL, NULL, NULL); \ munit_assert_int(rv_, ==, 0); \ } #define PREPARE_FAIL(SQL, STMT_ID, RV, MSG) \ { \ int rv_; \ rv_ = clientSendPrepare(f->client, SQL, NULL); \ munit_assert_int(rv_, ==, 0); \ rv_ = clientRecvFailure(f->client, RV, MSG, NULL); \ munit_assert_int(rv_, ==, 0); \ } /* Execute a statement. 
*/ #define EXEC(STMT_ID, LAST_INSERT_ID, ROWS_AFFECTED) \ { \ int rv_; \ rv_ = clientSendExec(f->client, STMT_ID, NULL, 0, NULL); \ munit_assert_int(rv_, ==, 0); \ rv_ = clientRecvResult(f->client, LAST_INSERT_ID, \ ROWS_AFFECTED, NULL); \ munit_assert_int(rv_, ==, 0); \ } #define EXEC_SQL(SQL, LAST_INSERT_ID, ROWS_AFFECTED) \ { \ int rv_; \ rv_ = clientSendExecSQL(f->client, SQL, NULL, 0, NULL); \ munit_assert_int(rv_, ==, 0); \ rv_ = clientRecvResult(f->client, LAST_INSERT_ID, \ ROWS_AFFECTED, NULL); \ munit_assert_int(rv_, ==, 0); \ } /* Perform a query. */ #define QUERY(STMT_ID, ROWS) \ { \ int rv_; \ rv_ = clientSendQuery(f->client, STMT_ID, NULL, 0, NULL); \ munit_assert_int(rv_, ==, 0); \ rv_ = clientRecvRows(f->client, ROWS, NULL, NULL); \ munit_assert_int(rv_, ==, 0); \ } #define QUERY_DONE(STMT_ID, ROWS, ROWS_HOOK) \ { \ int rv_; \ bool done; \ rv_ = clientSendQuery(f->client, STMT_ID, NULL, 0, NULL); \ munit_assert_int(rv_, ==, 0); \ do { \ rv_ = clientRecvRows(f->client, (ROWS), &done, NULL); \ munit_assert_int(rv_, ==, 0); \ ROWS_HOOK; \ clientCloseRows((ROWS)); \ *(ROWS) = (struct rows){}; \ } while (!done); \ } #define QUERY_SQL(SQL, ROWS) \ { \ int rv_; \ rv_ = clientSendQuerySQL(f->client, SQL, NULL, 0, NULL); \ munit_assert_int(rv_, ==, 0); \ rv_ = clientRecvRows(f->client, ROWS, NULL, NULL); \ munit_assert_int(rv_, ==, 0); \ } #define QUERY_SQL_DONE(SQL, ROWS, ROWS_HOOK) \ { \ int rv_; \ bool done; \ rv_ = clientSendQuerySQL(f->client, SQL, NULL, 0, NULL); \ munit_assert_int(rv_, ==, 0); \ do { \ rv_ = clientRecvRows(f->client, (ROWS), &done, NULL); \ munit_assert_int(rv_, ==, 0); \ ROWS_HOOK; \ clientCloseRows((ROWS)); \ *(ROWS) = (struct rows){}; \ } while (!done); \ } #endif /* TEST_CLIENT_H */ dqlite-1.16.7/test/lib/cluster.h000066400000000000000000000144571465252713400164730ustar00rootroot00000000000000/** * Helpers to setup a raft cluster in test fixtures. * * Each raft instance will use its own dqlite FSM, which in turn will be created * using its own config, registry and logger. * * The fixture will also register a VFS and a SQLite replication object for each * raft instance, using "test" as registration name, where is the raft * instance index. * * This fixture is meant to be used as base-line fixture for most higher-level * tests. 
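 *
 * (The per-instance registration name is "test<i>", with <i> the raft
 * instance index.)
 *
 * Typical usage (sketch):
 *
 *   struct fixture {
 *           FIXTURE_CLUSTER;
 *   };
 *   // setup:    SETUP_CLUSTER(V2);
 *   // teardown: TEAR_DOWN_CLUSTER;
 *   // in tests: CLUSTER_ELECT(0); CLUSTER_APPLIED(3);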
*/ #ifndef TEST_CLUSTER_H #define TEST_CLUSTER_H #include "../../src/config.h" #include "../../src/fsm.h" #include "../../src/raft.h" #include "../../src/registry.h" #include "../../src/vfs.h" #include "../lib/fs.h" #include "../lib/heap.h" #include "../lib/logger.h" #include "../lib/sqlite.h" #define N_SERVERS 3 #define V1 0 #define V2 1 struct server { struct logger logger; struct config config; sqlite3_vfs vfs; struct registry registry; char *dir; }; #define FIXTURE_CLUSTER \ struct server servers[N_SERVERS]; \ struct raft_fsm fsms[N_SERVERS]; \ struct raft_fixture cluster; #define SETUP_CLUSTER(VERSION) \ { \ struct raft_configuration _configuration; \ unsigned _i; \ int _rv; \ SETUP_HEAP; \ SETUP_SQLITE; \ _rv = raft_fixture_init(&f->cluster); \ munit_assert_int(_rv, ==, 0); \ for (_i = 0; _i < N_SERVERS; _i++) { \ SETUP_SERVER(_i, VERSION); \ raft_fixture_grow(&f->cluster, &f->fsms[_i]); \ } \ _rv = raft_fixture_configuration(&f->cluster, N_SERVERS, \ &_configuration); \ munit_assert_int(_rv, ==, 0); \ _rv = raft_fixture_bootstrap(&f->cluster, &_configuration); \ munit_assert_int(_rv, ==, 0); \ raft_configuration_close(&_configuration); \ _rv = raft_fixture_start(&f->cluster); \ munit_assert_int(_rv, ==, 0); \ } #define SETUP_SERVER(I, VERSION) \ { \ struct server *_s = &f->servers[I]; \ struct raft_fsm *_fsm = &f->fsms[I]; \ char address[16]; \ int _rc; \ \ test_logger_setup(params, &_s->logger); \ \ sprintf(address, "%d", I + 1); \ \ char *dir = test_dir_setup(); \ _s->dir = dir; \ \ _rc = config__init(&_s->config, I + 1, address, NULL, dir); \ munit_assert_int(_rc, ==, 0); \ \ registry__init(&_s->registry, &_s->config); \ \ _rc = VfsInit(&_s->vfs, _s->config.name); \ munit_assert_int(_rc, ==, 0); \ _rc = sqlite3_vfs_register(&_s->vfs, 0); \ munit_assert_int(_rc, ==, 0); \ \ _rc = fsm__init(_fsm, &_s->config, &_s->registry); \ munit_assert_int(_rc, ==, 0); \ } #define TEAR_DOWN_CLUSTER \ { \ int _i; \ for (_i = 0; _i < N_SERVERS; _i++) { \ TEAR_DOWN_SERVER(_i); \ } \ raft_fixture_close(&f->cluster); \ TEAR_DOWN_SQLITE; \ TEAR_DOWN_HEAP; \ } #define TEAR_DOWN_SERVER(I) \ { \ struct server *s = &f->servers[I]; \ struct raft_fsm *fsm = &f->fsms[I]; \ fsm__close(fsm); \ registry__close(&s->registry); \ sqlite3_vfs_unregister(&s->vfs); \ VfsClose(&s->vfs); \ config__close(&s->config); \ test_dir_tear_down(s->dir); \ test_logger_tear_down(&s->logger); \ } #define CLUSTER_CONFIG(I) &f->servers[I].config #define CLUSTER_LOGGER(I) &f->servers[I].logger #define CLUSTER_LEADER(I) &f->servers[I].leader #define CLUSTER_REGISTRY(I) &f->servers[I].registry #define CLUSTER_RAFT(I) raft_fixture_get(&f->cluster, I) #define CLUSTER_LAST_INDEX(I) raft_last_index(CLUSTER_RAFT(I)) #define CLUSTER_DISCONNECT(I, J) raft_fixture_disconnect(&f->cluster, I, J) #define CLUSTER_RECONNECT(I, J) raft_fixture_reconnect(&f->cluster, I, J) #define CLUSTER_ELECT(I) raft_fixture_elect(&f->cluster, I) #define CLUSTER_DEPOSE raft_fixture_depose(&f->cluster) #define CLUSTER_APPLIED(N) \ { \ int _i; \ for (_i = 0; _i < N_SERVERS; _i++) { \ bool done; \ done = raft_fixture_step_until_applied(&f->cluster, \ _i, N, 1000); \ munit_assert_true(done); \ } \ } #define CLUSTER_STEP raft_fixture_step(&f->cluster) #define CLUSTER_SNAPSHOT_THRESHOLD(I, N) \ raft_set_snapshot_threshold(CLUSTER_RAFT(I), N) #define CLUSTER_SNAPSHOT_TRAILING(I, N) \ raft_set_snapshot_trailing(CLUSTER_RAFT(I), N) #endif /* TEST_CLUSTER_H */ dqlite-1.16.7/test/lib/config.h000066400000000000000000000012711465252713400162450ustar00rootroot00000000000000/** 
* Options object for tests. */ #ifndef TEST_OPTIONS_H #define TEST_OPTIONS_H #include "../../src/config.h" #include "logger.h" #define FIXTURE_CONFIG struct config config; #define SETUP_CONFIG \ { \ int rc; \ rc = config__init(&f->config, 1, "1", NULL, "dir"); \ munit_assert_int(rc, ==, 0); \ test_logger_setup(params, &f->config.logger); \ } #define TEAR_DOWN_CONFIG \ test_logger_tear_down(&f->config.logger); \ config__close(&f->config) #endif /* TEST_OPTIONS_H */ dqlite-1.16.7/test/lib/endpoint.c000066400000000000000000000100641465252713400166130ustar00rootroot00000000000000#include <arpa/inet.h> #include <errno.h> #include <fcntl.h> #include <netinet/in.h> #include <stdio.h> #include <string.h> #include <sys/socket.h> #include <sys/un.h> #include <unistd.h> #include "endpoint.h" static int getFamily(const MunitParameter params[]) { const char *family = NULL; if (params != NULL) { family = munit_parameters_get(params, TEST_ENDPOINT_FAMILY); } if (family == NULL) { family = "unix"; } if (strcmp(family, "tcp") == 0) { return AF_INET; } else if (strcmp(family, "unix") == 0) { return AF_UNIX; } munit_errorf("unexpected socket family: %s", family); return -1; } void test_endpoint_setup(struct test_endpoint *e, const MunitParameter params[]) { struct sockaddr *address; socklen_t size; int rv; e->family = getFamily(params); /* Initialize the appropriate socket address structure, depending on the * selected socket family. */ switch (e->family) { case AF_INET: /* TCP socket on loopback device */ memset(&e->in_address, 0, sizeof e->in_address); e->in_address.sin_family = AF_INET; e->in_address.sin_addr.s_addr = inet_addr("127.0.0.1"); e->in_address.sin_port = 0; /* Get a random free port */ address = (struct sockaddr *)(&e->in_address); size = sizeof e->in_address; break; case AF_UNIX: /* Abstract Unix socket */ memset(&e->un_address, 0, sizeof e->un_address); e->un_address.sun_family = AF_UNIX; strcpy(e->un_address.sun_path, ""); /* Random address */ address = (struct sockaddr *)(&e->un_address); size = sizeof e->un_address; break; default: munit_errorf("unexpected socket family: %d", e->family); } /* Create the listener fd. */ e->fd = socket(e->family, SOCK_STREAM, 0); if (e->fd < 0) { munit_errorf("socket(): %s", strerror(errno)); } /* Bind the listener fd. */ rv = bind(e->fd, address, size); if (rv != 0) { munit_errorf("bind(): %s", strerror(errno)); } /* Get the actual address assigned by the kernel and save it back in * the relevant struct test_endpoint field (pointed to by address). */ rv = getsockname(e->fd, address, &size); if (rv != 0) { munit_errorf("getsockname(): %s", strerror(errno)); } /* Render the endpoint address. */ switch (e->family) { case AF_INET: sprintf(e->address, "127.0.0.1:%d", htons(e->in_address.sin_port)); break; case AF_UNIX: /* TODO */ break; } } void test_endpoint_tear_down(struct test_endpoint *e) { close(e->fd); } int test_endpoint_connect(struct test_endpoint *e) { struct sockaddr *address; socklen_t size; int fd; int rv; switch (e->family) { case AF_INET: address = (struct sockaddr *)&e->in_address; size = sizeof e->in_address; break; case AF_UNIX: address = (struct sockaddr *)&e->un_address; size = sizeof e->un_address; break; } /* Create the socket.
*/ fd = socket(e->family, SOCK_STREAM, 0); if (fd < 0) { munit_errorf("socket(): %s", strerror(errno)); } /* Connect to the server */ rv = connect(fd, address, size); if (rv != 0 && errno != ECONNREFUSED) { munit_errorf("connect(): %s", strerror(errno)); } return fd; } int test_endpoint_accept(struct test_endpoint *e) { struct sockaddr_in in_address; struct sockaddr_un un_address; struct sockaddr *address; socklen_t size; int fd; int rv; switch (e->family) { case AF_INET: address = (struct sockaddr *)&in_address; size = sizeof in_address; break; case AF_UNIX: address = (struct sockaddr *)&un_address; size = sizeof un_address; break; } /* Accept the client connection. */ fd = accept(e->fd, address, &size); if (fd < 0) { /* Check if the endpoint has been closed, so this is benign. */ if (errno == EBADF || errno == EINVAL || errno == ENOTSOCK) { return -1; } munit_errorf("accept(): %s", strerror(errno)); } /* Set non-blocking mode */ rv = fcntl(fd, F_SETFL, O_NONBLOCK); if (rv != 0) { munit_errorf("set non-blocking mode: %s", strerror(errno)); } return fd; } void test_endpoint_pair(struct test_endpoint *e, int *server, int *client) { *client = test_endpoint_connect(e); *server = test_endpoint_accept(e); } const char *test_endpoint_address(struct test_endpoint *e) { return e->address; } char *test_endpoint_family_values[] = {"tcp", "unix", NULL}; dqlite-1.16.7/test/lib/endpoint.h000066400000000000000000000035121465252713400166200ustar00rootroot00000000000000/* Helpers to create and connect Unix or TCP sockets. */ #ifndef TEST_ENDPOINT_H #define TEST_ENDPOINT_H #include #include #include "munit.h" /* A few tests depend on knowing that certain reads and writes will not be short * and will happen immediately. */ #define TEST_SOCKET_MIN_BUF_SIZE 4096 /* Munit parameter defining the socket type to use in test_endpoint_setup. * * If set to "unix" a pair of unix abstract sockets will be created. If set to * "tcp" a pair of TCP sockets using the loopback interface will be created. */ #define TEST_ENDPOINT_FAMILY "endpoint-family" /* Null-terminated list of legal values for TEST_ENDPOINT_FAMILY. Currently * "unix" and "tcp". */ extern char *test_endpoint_family_values[]; /* Listening socket endpoint. */ struct test_endpoint { char address[256]; /* Rendered address string. */ sa_family_t family; /* Address family (either AF_INET or AF_UNIX) */ int fd; /* Listening socket. */ union { /* Server address (either a TCP or Unix) */ struct sockaddr_in in_address; struct sockaddr_un un_address; }; }; /* Create a listening endpoint. * * This will bind a random address and start listening to it. */ void test_endpoint_setup(struct test_endpoint *e, const MunitParameter params[]); /* Tear down a listening endpoint. */ void test_endpoint_tear_down(struct test_endpoint *e); /* Establish a new client connection. */ int test_endpoint_connect(struct test_endpoint *e); /* Accept a new client connection. */ int test_endpoint_accept(struct test_endpoint *e); /* Connect and accept a connection, returning the pair of connected sockets. */ void test_endpoint_pair(struct test_endpoint *e, int *server, int *client); /* Return the endpoint address. 
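 *
 * Usage sketch (after test_endpoint_setup(&e, params) and a listen() on
 * e.fd, as done by the client fixture):
 *
 *   int server;
 *   int client;
 *   test_endpoint_pair(&e, &server, &client);
 *   munit_logf(MUNIT_LOG_INFO, "endpoint at %s",
 *              test_endpoint_address(&e));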
*/ const char *test_endpoint_address(struct test_endpoint *e); #endif /* TEST_ENDPOINT_H */ dqlite-1.16.7/test/lib/fault.c000066400000000000000000000022271465252713400161100ustar00rootroot00000000000000#include "fault.h" #include "munit.h" void test_fault_init(struct test_fault *f) { f->countdown = -1; f->n = -1; f->enabled = false; } bool test_fault_tick(struct test_fault *f) { if (MUNIT_UNLIKELY(!f->enabled)) { return false; } /* If the initial delay parameter was set to -1, then never fail. This * is the most common case. */ if (MUNIT_LIKELY(f->countdown < 0)) { return false; } /* If we did not yet reach 'delay' ticks, then just decrease the * countdown. */ if (f->countdown > 0) { f->countdown--; return false; } munit_assert_int(f->countdown, ==, 0); /* We reached 'delay' ticks, let's see how many times we have to trigger * the fault, if any. */ if (f->n < 0) { /* Trigger the fault forever. */ return true; } if (f->n > 0) { /* Trigger the fault at least this time. */ f->n--; return true; } munit_assert_int(f->n, ==, 0); /* We reached 'repeat' ticks, let's stop triggering the fault. */ f->countdown--; return false; } void test_fault_config(struct test_fault *f, int delay, int repeat) { f->countdown = delay; f->n = repeat; } void test_fault_enable(struct test_fault *f) { f->enabled = true; } dqlite-1.16.7/test/lib/fault.h000066400000000000000000000015641465252713400161200ustar00rootroot00000000000000/** * Helper for test components supporting fault injection. */ #ifndef TEST_FAULT_H #define TEST_FAULT_H #include /** * Information about a fault that should occurr in a component. */ struct test_fault { int countdown; /* Trigger the fault when this counter gets to zero. */ int n; /* Repeat the fault this many times. Default is -1. */ bool enabled; /* Enable fault triggering. */ }; /** * Initialize a fault. */ void test_fault_init(struct test_fault *f); /** * Advance the counters of the fault. Return true if the fault should be * triggered, false otherwise. */ bool test_fault_tick(struct test_fault *f); /** * Configure the fault with the given values. */ void test_fault_config(struct test_fault *f, int delay, int repeat); /** * Enable fault triggering. */ void test_fault_enable(struct test_fault *f); #endif /* TEST_FAULT_H */ dqlite-1.16.7/test/lib/fs.c000066400000000000000000000014221465252713400154010ustar00rootroot00000000000000#include #include #include #include #include #include "fs.h" #include "munit.h" char *test_dir_setup() { char *dir = munit_malloc(strlen(TEST__DIR_TEMPLATE) + 1); strcpy(dir, TEST__DIR_TEMPLATE); munit_assert_ptr_not_null(mkdtemp(dir)); return dir; } static int test__dir_tear_down_nftw_fn(const char *path, const struct stat *sb, int type, struct FTW *ftwb) { int rc; (void)sb; (void)type; (void)ftwb; rc = remove(path); munit_assert_int(rc, ==, 0); return 0; } void test_dir_tear_down(char *dir) { int rc; if (dir == NULL) { return; } rc = nftw(dir, test__dir_tear_down_nftw_fn, 10, FTW_DEPTH | FTW_MOUNT | FTW_PHYS); munit_assert_int(rc, ==, 0); free(dir); } dqlite-1.16.7/test/lib/fs.h000066400000000000000000000004221465252713400154050ustar00rootroot00000000000000#ifndef DQLITE_TEST_FS_H #define DQLITE_TEST_FS_H #define TEST__DIR_TEMPLATE "/tmp/dqlite-test-XXXXXX" /* Setup a temporary directory. */ char *test_dir_setup(void); /* Remove the temporary directory. 
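 *
 * Usage sketch:
 *
 *   char *dir = test_dir_setup();  // mkdtemp() under /tmp/dqlite-test-XXXXXX
 *   // ... run the test against dir ...
 *   test_dir_tear_down(dir);       // recursive nftw() removal, then free()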
*/ void test_dir_tear_down(char *dir); #endif /* DQLITE_TEST_FS_H */ dqlite-1.16.7/test/lib/heap.c000066400000000000000000000123161465252713400157120ustar00rootroot00000000000000#include #include "fault.h" #include "heap.h" /* This structure is used to encapsulate the global state variables used by * malloc() fault simulation. */ struct mem_fault { struct test_fault fault; /* Fault trigger */ sqlite3_mem_methods m; /* Actual malloc implementation */ }; /* We need to use a global variable here because after a sqlite3_mem_methods * instance has been installed using sqlite3_config(), and after * sqlite3_initialize() has been called, there's no way to retrieve it back with * sqlite3_config(). */ static struct mem_fault memFault; /* A version of sqlite3_mem_methods.xMalloc() that includes fault simulation * logic.*/ static void *mem_fault_malloc(int n) { void *p = NULL; if (!test_fault_tick(&memFault.fault)) { p = memFault.m.xMalloc(n); } return p; } /* A version of sqlite3_mem_methods.xRealloc() that includes fault simulation * logic. */ static void *mem_fault_realloc(void *old, int n) { void *p = NULL; if (!test_fault_tick(&memFault.fault)) { p = memFault.m.xRealloc(old, n); } return p; } /* The following method calls are passed directly through to the underlying * malloc system: * * xFree * xSize * xRoundup * xInit * xShutdown */ static void mem_fault_free(void *p) { memFault.m.xFree(p); } static int mem_fault_size(void *p) { return memFault.m.xSize(p); } static int mem_fault_roundup(int n) { return memFault.m.xRoundup(n); } static int mem_fault_init(void *p) { (void)p; return memFault.m.xInit(memFault.m.pAppData); } static void mem_fault_shutdown(void *p) { (void)p; memFault.m.xShutdown(memFault.m.pAppData); } /* Wrap the given SQLite memory management instance with the faulty memory * management interface. By default no faults will be triggered. */ static void mem_wrap(sqlite3_mem_methods *m, sqlite3_mem_methods *wrap) { test_fault_init(&memFault.fault); memFault.m = *m; wrap->xMalloc = mem_fault_malloc; wrap->xFree = mem_fault_free; wrap->xRealloc = mem_fault_realloc; wrap->xSize = mem_fault_size; wrap->xRoundup = mem_fault_roundup; wrap->xInit = mem_fault_init; wrap->xShutdown = mem_fault_shutdown; wrap->pAppData = &memFault; } /* Unwrap the given faulty memory management instance returning the original * one. */ static void mem_unwrap(sqlite3_mem_methods *wrap, sqlite3_mem_methods *m) { (void)wrap; *m = memFault.m; } /* Get the current number of outstanding malloc()'s without a matching free() * and the total number of used memory. */ static void mem_stats(int *malloc_count, int *memory_used) { int rc; int watermark; rc = sqlite3_status(SQLITE_STATUS_MALLOC_COUNT, malloc_count, &watermark, 1); if (rc != SQLITE_OK) { munit_errorf("can't get malloc count: %s", sqlite3_errstr(rc)); } rc = sqlite3_status(SQLITE_STATUS_MEMORY_USED, memory_used, &watermark, 1); if (rc != SQLITE_OK) { munit_errorf("can't get memory: %s\n:", sqlite3_errstr(rc)); } } /* Ensure we're starting from a clean memory state with no allocations and * optionally inject malloc failures. 
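 *
 * The "mem-fault-delay" and "mem-fault-repeat" munit parameters must be
 * given together: the former is the number of allocations to let through
 * before failing, the latter the number of consecutive failures; e.g.
 * mem-fault-delay=2 with mem-fault-repeat=1 makes only the third
 * allocation fail.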
*/ void test_heap_setup(const MunitParameter params[], void *user_data) { int malloc_count; int memory_used; const char *fault_delay; const char *fault_repeat; sqlite3_mem_methods mem; sqlite3_mem_methods mem_fault; int rc; (void)params; (void)user_data; /* Install the faulty malloc implementation */ rc = sqlite3_config(SQLITE_CONFIG_GETMALLOC, &mem); if (rc != SQLITE_OK) { munit_errorf("can't get default mem: %s", sqlite3_errstr(rc)); } mem_wrap(&mem, &mem_fault); rc = sqlite3_config(SQLITE_CONFIG_MALLOC, &mem_fault); if (rc != SQLITE_OK) { munit_errorf("can't set faulty mem: %s", sqlite3_errstr(rc)); } /* Check that memory is clean. */ mem_stats(&malloc_count, &memory_used); if (malloc_count > 0 || memory_used > 0) { munit_errorf( "setup memory:\n bytes: %11d\n allocations: %5d\n", memory_used, malloc_count); } /* Optionally inject memory allocation failures. */ fault_delay = munit_parameters_get(params, "mem-fault-delay"); fault_repeat = munit_parameters_get(params, "mem-fault-repeat"); munit_assert((fault_delay != NULL && fault_repeat != NULL) || (fault_delay == NULL && fault_repeat == NULL)); if (fault_delay != NULL) { test_heap_fault_config(atoi(fault_delay), atoi(fault_repeat)); } } /* Ensure we're leaving a clean memory state behind. */ void test_heap_tear_down(void *data) { sqlite3_mem_methods mem; sqlite3_mem_methods mem_fault; int rc; (void)data; int malloc_count; int memory_used; mem_stats(&malloc_count, &memory_used); if (malloc_count > 0 || memory_used > 0) { /* munit_errorf( "teardown memory:\n bytes: %11d\n allocations: %5d\n", memory_used, malloc_count); */ } /* Restore default memory management. */ rc = sqlite3_config(SQLITE_CONFIG_GETMALLOC, &mem_fault); if (rc != SQLITE_OK) { munit_errorf("can't get faulty mem: %s", sqlite3_errstr(rc)); } mem_unwrap(&mem_fault, &mem); rc = sqlite3_config(SQLITE_CONFIG_MALLOC, &mem); if (rc != SQLITE_OK) { munit_errorf("can't reset default mem: %s", sqlite3_errstr(rc)); } } void test_heap_fault_config(int delay, int repeat) { test_fault_config(&memFault.fault, delay, repeat); } void test_heap_fault_enable(void) { test_fault_enable(&memFault.fault); } dqlite-1.16.7/test/lib/heap.h000066400000000000000000000023431465252713400157160ustar00rootroot00000000000000#ifndef DQLITE_TEST_HEAP_H #define DQLITE_TEST_HEAP_H #include "munit.h" /* Munit parameter defining the delay of the faulty memory implementation. */ #define TEST_HEAP_FAULT_DELAY "mem-fault-delay" /* Munit parameter defining the repeat of the faulty memory implementation. */ #define TEST_HEAP_FAULT_REPEAT "mem-fault-repeat" void test_heap_setup(const MunitParameter params[], void *user_data); void test_heap_tear_down(void *data); /* Configure the faulty memory management implementation so malloc()-related * functions start returning NULL pointers after 'delay' calls, and keep failing * for 'repeat' consecutive times. * * Note that the faults won't automatically take place, an explicit call to * test_heap_fault_enable() is needed. This allows configuration and actual * behavior to happen at different times (e.g. configure at test setup time and * enable at test case time). */ void test_heap_fault_config(int delay, int repeat); /* Enable the faulty behavior, which from this point on will honor the * parameters passed to test_heap_fault_config().
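 *
 * Typical pattern (sketch):
 *
 *   test_heap_fault_config(2, 1);  // in setup: fail only the 3rd allocation
 *   test_heap_fault_enable();      // in the test body: arm the fault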
*/ void test_heap_fault_enable(void); #define SETUP_HEAP test_heap_setup(params, user_data); #define TEAR_DOWN_HEAP test_heap_tear_down(data); #endif /* DQLITE_TEST_HEAP_H */ dqlite-1.16.7/test/lib/leader.h000066400000000000000000000013661465252713400162410ustar00rootroot00000000000000/** * Setup a test struct leader object. */ #ifndef TEST_LEADER_H #define TEST_LEADER_H #include "../../src/leader.h" #include "../../src/registry.h" #define FIXTURE_LEADER struct leader leader #define SETUP_LEADER \ { \ struct db *db; \ int rv; \ rv = registry__db_get(&f->registry, "test.db", &db); \ munit_assert_int(rv, ==, 0); \ rv = leader__init(&f->leader, db, &f->raft); \ munit_assert_int(rv, ==, 0); \ } #define TEAR_DOWN_LEADER leader__close(&f->leader) #endif /* TEST_LEADER_H */ dqlite-1.16.7/test/lib/logger.c000066400000000000000000000023541465252713400162550ustar00rootroot00000000000000#include #include #include "../../include/dqlite.h" #include "logger.h" #include "munit.h" void test_logger_emit(void *data, int level, const char *format, va_list args) { struct test_logger *t = data; char buf[1024]; const char *level_name; int i; (void)data; switch (level) { case DQLITE_DEBUG: level_name = "DEBUG"; break; case DQLITE_INFO: level_name = "INFO "; break; case DQLITE_WARN: level_name = "WARN "; break; case DQLITE_LOG_ERROR: level_name = "ERROR"; break; }; buf[0] = 0; sprintf(buf + strlen(buf), "%2d -> [%s] ", t->id, level_name); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wformat-nonliteral" vsnprintf(buf + strlen(buf), 1024 - strlen(buf), format, args); #pragma GCC diagnostic pop munit_log(MUNIT_LOG_INFO, buf); return; snprintf(buf + strlen(buf), 1024 - strlen(buf), " "); for (i = strlen(buf); i < 85; i++) { buf[i] = ' '; } munit_log(MUNIT_LOG_INFO, buf); } void test_logger_setup(const MunitParameter params[], struct logger *l) { struct test_logger *t; (void)params; t = munit_malloc(sizeof *t); t->data = NULL; l->data = t; l->emit = test_logger_emit; } void test_logger_tear_down(struct logger *l) { free(l->data); } dqlite-1.16.7/test/lib/logger.h000066400000000000000000000010641465252713400162570ustar00rootroot00000000000000/** * Test logger. */ #ifndef TEST_LOGGER_H #define TEST_LOGGER_H #include "../../src/logger.h" #include "munit.h" void test_logger_setup(const MunitParameter params[], struct logger *l); void test_logger_tear_down(struct logger *l); struct test_logger { unsigned id; void *data; }; void test_logger_emit(void *data, int level, const char *fmt, va_list args); #define FIXTURE_LOGGER struct logger logger; #define SETUP_LOGGER test_logger_setup(params, &f->logger); #define TEAR_DOWN_LOGGER test_logger_tear_down(&f->logger); #endif /* TEST_LOGGER_H */ dqlite-1.16.7/test/lib/munit.c000066400000000000000000002045651465252713400161420ustar00rootroot00000000000000/* Copyright (c) 2013-2018 Evan Nemerson * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, copy, * modify, merge, publish, distribute, sublicense, and/or sell copies * of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /*** Configuration ***/ /* This is just where the output from the test goes. It's really just * meant to let you choose stdout or stderr, but if anyone really want * to direct it to a file let me know, it would be fairly easy to * support. */ #if !defined(MUNIT_OUTPUT_FILE) # define MUNIT_OUTPUT_FILE stdout #endif /* This is a bit more useful; it tells µnit how to format the seconds in * timed tests. If your tests run for longer you might want to reduce * it, and if your computer is really fast and your tests are tiny you * can increase it. */ #if !defined(MUNIT_TEST_TIME_FORMAT) # define MUNIT_TEST_TIME_FORMAT "0.8f" #endif /* If you have long test names you might want to consider bumping * this. The result information takes 43 characters. */ #if !defined(MUNIT_TEST_NAME_LEN) # define MUNIT_TEST_NAME_LEN 37 #endif /* If you don't like the timing information, you can disable it by * defining MUNIT_DISABLE_TIMING. */ #if !defined(MUNIT_DISABLE_TIMING) # define MUNIT_ENABLE_TIMING #endif /*** End configuration ***/ #if defined(_POSIX_C_SOURCE) && (_POSIX_C_SOURCE < 200809L) # undef _POSIX_C_SOURCE #endif #if !defined(_POSIX_C_SOURCE) # define _POSIX_C_SOURCE 200809L #endif /* Solaris freaks out if you try to use a POSIX or SUS standard without * the "right" C standard. */ #if defined(_XOPEN_SOURCE) # undef _XOPEN_SOURCE #endif #if defined(__STDC_VERSION__) # if __STDC_VERSION__ >= 201112L # define _XOPEN_SOURCE 700 # elif __STDC_VERSION__ >= 199901L # define _XOPEN_SOURCE 600 # endif #endif /* Because, according to Microsoft, POSIX is deprecated. You've got * to appreciate the chutzpah. */ #if defined(_MSC_VER) && !defined(_CRT_NONSTDC_NO_DEPRECATE) # define _CRT_NONSTDC_NO_DEPRECATE #endif #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) # include #elif defined(_WIN32) /* https://msdn.microsoft.com/en-us/library/tf4dy80a.aspx */ #endif #include #include #include #include #include #include #include #include #if !defined(MUNIT_NO_NL_LANGINFO) && !defined(_WIN32) #define MUNIT_NL_LANGINFO #include #include #include #endif #if !defined(_WIN32) # include # include # include #else # include # include # include # if !defined(STDERR_FILENO) # define STDERR_FILENO _fileno(stderr) # endif #endif #include "munit.h" #define MUNIT_STRINGIFY(x) #x #define MUNIT_XSTRINGIFY(x) MUNIT_STRINGIFY(x) #if defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_CC) || defined(__IBMCPP__) # define MUNIT_THREAD_LOCAL __thread #elif (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201102L)) || defined(_Thread_local) # define MUNIT_THREAD_LOCAL _Thread_local #elif defined(_WIN32) # define MUNIT_THREAD_LOCAL __declspec(thread) #endif /* MSVC 12.0 will emit a warning at /W4 for code like 'do { ... } * while (0)', or 'do { ... } while (true)'. I'm pretty sure nobody * at Microsoft compiles with /W4. 
*/ #if defined(_MSC_VER) && (_MSC_VER <= 1800) #pragma warning(disable: 4127) #endif #if defined(_WIN32) || defined(__EMSCRIPTEN__) # define MUNIT_NO_FORK #endif #if defined(__EMSCRIPTEN__) # define MUNIT_NO_BUFFER #endif /*** Logging ***/ static MunitLogLevel munit_log_level_visible = MUNIT_LOG_INFO; static MunitLogLevel munit_log_level_fatal = MUNIT_LOG_ERROR; #if defined(MUNIT_THREAD_LOCAL) static MUNIT_THREAD_LOCAL bool munit_error_jmp_buf_valid = false; static MUNIT_THREAD_LOCAL jmp_buf munit_error_jmp_buf; #endif /* At certain warning levels, mingw will trigger warnings about * suggesting the format attribute, which we've explicity *not* set * because it will then choke on our attempts to use the MS-specific * I64 modifier for size_t (which we have to use since MSVC doesn't * support the C99 z modifier). */ #if defined(__MINGW32__) || defined(__MINGW64__) # pragma GCC diagnostic push # pragma GCC diagnostic ignored "-Wsuggest-attribute=format" #endif MUNIT_PRINTF(5,0) static void munit_logf_exv(MunitLogLevel level, FILE* fp, const char* filename, int line, const char* format, va_list ap) { if (level < munit_log_level_visible) return; switch (level) { case MUNIT_LOG_DEBUG: fputs("Debug", fp); break; case MUNIT_LOG_INFO: fputs("Info", fp); break; case MUNIT_LOG_WARNING: fputs("Warning", fp); break; case MUNIT_LOG_ERROR: fputs("Error", fp); break; default: munit_logf_ex(MUNIT_LOG_ERROR, filename, line, "Invalid log level (%d)", level); return; } fputs(": ", fp); if (filename != NULL) fprintf(fp, "%s:%d: ", filename, line); vfprintf(fp, format, ap); fputc('\n', fp); } MUNIT_PRINTF(3,4) static void munit_logf_internal(MunitLogLevel level, FILE* fp, const char* format, ...) { va_list ap; va_start(ap, format); munit_logf_exv(level, fp, NULL, 0, format, ap); va_end(ap); } static void munit_log_internal(MunitLogLevel level, FILE* fp, const char* message) { munit_logf_internal(level, fp, "%s", message); } void munit_logf_ex(MunitLogLevel level, const char* filename, int line, const char* format, ...) { va_list ap; va_start(ap, format); munit_logf_exv(level, stderr, filename, line, format, ap); va_end(ap); if (level >= munit_log_level_fatal) { #if defined(MUNIT_THREAD_LOCAL) if (munit_error_jmp_buf_valid) longjmp(munit_error_jmp_buf, 1); #endif abort(); } } void munit_errorf_ex(const char* filename, int line, const char* format, ...) 
{ va_list ap; va_start(ap, format); munit_logf_exv(MUNIT_LOG_ERROR, stderr, filename, line, format, ap); va_end(ap); #if defined(MUNIT_THREAD_LOCAL) if (munit_error_jmp_buf_valid) longjmp(munit_error_jmp_buf, 1); #endif abort(); } #if defined(__MINGW32__) || defined(__MINGW64__) #pragma GCC diagnostic pop #endif #if !defined(MUNIT_STRERROR_LEN) # define MUNIT_STRERROR_LEN 80 #endif static void munit_log_errno(MunitLogLevel level, FILE* fp, const char* msg) { #if defined(MUNIT_NO_STRERROR_R) || (defined(__MINGW32__) && !defined(MINGW_HAS_SECURE_API)) munit_logf_internal(level, fp, "%s: %s (%d)", msg, strerror(errno), errno); #else char munit_error_str[MUNIT_STRERROR_LEN]; munit_error_str[0] = '\0'; #if !defined(_WIN32) strerror_r(errno, munit_error_str, MUNIT_STRERROR_LEN); #else strerror_s(munit_error_str, MUNIT_STRERROR_LEN, errno); #endif munit_logf_internal(level, fp, "%s: %s (%d)", msg, munit_error_str, errno); #endif } /*** Memory allocation ***/ void* munit_malloc_ex(const char* filename, int line, size_t size) { void* ptr; if (size == 0) return NULL; ptr = calloc(1, size); if (MUNIT_UNLIKELY(ptr == NULL)) { munit_logf_ex(MUNIT_LOG_ERROR, filename, line, "Failed to allocate %" MUNIT_SIZE_MODIFIER "u bytes.", size); } return ptr; } /*** Timer code ***/ #if defined(MUNIT_ENABLE_TIMING) #define psnip_uint64_t munit_uint64_t #define psnip_uint32_t munit_uint32_t /* Code copied from portable-snippets * . If you need to * change something, please do it there so we can keep the code in * sync. */ /* Clocks (v1) * Portable Snippets - https://gitub.com/nemequ/portable-snippets * Created by Evan Nemerson * * To the extent possible under law, the authors have waived all * copyright and related or neighboring rights to this code. For * details, see the Creative Commons Zero 1.0 Universal license at * https://creativecommons.org/publicdomain/zero/1.0/ */ #if !defined(PSNIP_CLOCK_H) #define PSNIP_CLOCK_H #if !defined(psnip_uint64_t) # include "../exact-int/exact-int.h" #endif #if !defined(PSNIP_CLOCK_STATIC_INLINE) # if defined(__GNUC__) # define PSNIP_CLOCK__COMPILER_ATTRIBUTES __attribute__((__unused__)) # else # define PSNIP_CLOCK__COMPILER_ATTRIBUTES # endif # define PSNIP_CLOCK__FUNCTION PSNIP_CLOCK__COMPILER_ATTRIBUTES static #endif enum PsnipClockType { /* This clock provides the current time, in units since 1970-01-01 * 00:00:00 UTC not including leap seconds. In other words, UNIX * time. Keep in mind that this clock doesn't account for leap * seconds, and can go backwards (think NTP adjustments). */ PSNIP_CLOCK_TYPE_WALL = 1, /* The CPU time is a clock which increases only when the current * process is active (i.e., it doesn't increment while blocking on * I/O). */ PSNIP_CLOCK_TYPE_CPU = 2, /* Monotonic time is always running (unlike CPU time), but it only ever moves forward unless you reboot the system. Things like NTP adjustments have no effect on this clock. 
*/ PSNIP_CLOCK_TYPE_MONOTONIC = 3 }; struct PsnipClockTimespec { psnip_uint64_t seconds; psnip_uint64_t nanoseconds; }; /* Methods we support: */ #define PSNIP_CLOCK_METHOD_CLOCK_GETTIME 1 #define PSNIP_CLOCK_METHOD_TIME 2 #define PSNIP_CLOCK_METHOD_GETTIMEOFDAY 3 #define PSNIP_CLOCK_METHOD_QUERYPERFORMANCECOUNTER 4 #define PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME 5 #define PSNIP_CLOCK_METHOD_CLOCK 6 #define PSNIP_CLOCK_METHOD_GETPROCESSTIMES 7 #define PSNIP_CLOCK_METHOD_GETRUSAGE 8 #define PSNIP_CLOCK_METHOD_GETSYSTEMTIMEPRECISEASFILETIME 9 #define PSNIP_CLOCK_METHOD_GETTICKCOUNT64 10 #include <assert.h> #if defined(HEDLEY_UNREACHABLE) # define PSNIP_CLOCK_UNREACHABLE() HEDLEY_UNREACHABLE() #else # define PSNIP_CLOCK_UNREACHABLE() assert(0) #endif /* Choose an implementation */ /* #undef PSNIP_CLOCK_WALL_METHOD */ /* #undef PSNIP_CLOCK_CPU_METHOD */ /* #undef PSNIP_CLOCK_MONOTONIC_METHOD */ /* We want to be able to detect the libc implementation, so we include <unistd.h> (<features.h> isn't available everywhere). */ #if defined(__unix__) || defined(__unix) || defined(__linux__) # include <unistd.h> # include <sys/types.h> #endif #if defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0) /* These are known to work without librt. If you know of others * please let us know so we can add them. */ # if \ (defined(__GLIBC__) && (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 17))) || \ (defined(__FreeBSD__)) # define PSNIP_CLOCK_HAVE_CLOCK_GETTIME # elif !defined(PSNIP_CLOCK_NO_LIBRT) # define PSNIP_CLOCK_HAVE_CLOCK_GETTIME # endif #endif #if defined(_WIN32) # if !defined(PSNIP_CLOCK_CPU_METHOD) # define PSNIP_CLOCK_CPU_METHOD PSNIP_CLOCK_METHOD_GETPROCESSTIMES # endif # if !defined(PSNIP_CLOCK_MONOTONIC_METHOD) # define PSNIP_CLOCK_MONOTONIC_METHOD PSNIP_CLOCK_METHOD_QUERYPERFORMANCECOUNTER # endif #endif #if defined(__MACH__) && !defined(__gnu_hurd__) # if !defined(PSNIP_CLOCK_MONOTONIC_METHOD) # define PSNIP_CLOCK_MONOTONIC_METHOD PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME # endif #endif #if defined(PSNIP_CLOCK_HAVE_CLOCK_GETTIME) # include <time.h> # if !defined(PSNIP_CLOCK_WALL_METHOD) # if defined(CLOCK_REALTIME_PRECISE) # define PSNIP_CLOCK_WALL_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME # define PSNIP_CLOCK_CLOCK_GETTIME_WALL CLOCK_REALTIME_PRECISE # elif !defined(__sun) # define PSNIP_CLOCK_WALL_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME # define PSNIP_CLOCK_CLOCK_GETTIME_WALL CLOCK_REALTIME # endif # endif # if !defined(PSNIP_CLOCK_CPU_METHOD) # if defined(_POSIX_CPUTIME) || defined(CLOCK_PROCESS_CPUTIME_ID) # define PSNIP_CLOCK_CPU_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME # define PSNIP_CLOCK_CLOCK_GETTIME_CPU CLOCK_PROCESS_CPUTIME_ID # elif defined(CLOCK_VIRTUAL) # define PSNIP_CLOCK_CPU_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME # define PSNIP_CLOCK_CLOCK_GETTIME_CPU CLOCK_VIRTUAL # endif # endif # if !defined(PSNIP_CLOCK_MONOTONIC_METHOD) # if defined(CLOCK_MONOTONIC_RAW) # define PSNIP_CLOCK_MONOTONIC_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME # define PSNIP_CLOCK_CLOCK_GETTIME_MONOTONIC CLOCK_MONOTONIC # elif defined(CLOCK_MONOTONIC_PRECISE) # define PSNIP_CLOCK_MONOTONIC_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME # define PSNIP_CLOCK_CLOCK_GETTIME_MONOTONIC CLOCK_MONOTONIC_PRECISE # elif defined(_POSIX_MONOTONIC_CLOCK) || defined(CLOCK_MONOTONIC) # define PSNIP_CLOCK_MONOTONIC_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME # define PSNIP_CLOCK_CLOCK_GETTIME_MONOTONIC CLOCK_MONOTONIC # endif # endif #endif #if defined(_POSIX_VERSION) && (_POSIX_VERSION >= 200112L) # if !defined(PSNIP_CLOCK_WALL_METHOD) # define PSNIP_CLOCK_WALL_METHOD PSNIP_CLOCK_METHOD_GETTIMEOFDAY # endif
#endif #if !defined(PSNIP_CLOCK_WALL_METHOD) # define PSNIP_CLOCK_WALL_METHOD PSNIP_CLOCK_METHOD_TIME #endif #if !defined(PSNIP_CLOCK_CPU_METHOD) # define PSNIP_CLOCK_CPU_METHOD PSNIP_CLOCK_METHOD_CLOCK #endif /* Primarily here for testing. */ #if !defined(PSNIP_CLOCK_MONOTONIC_METHOD) && defined(PSNIP_CLOCK_REQUIRE_MONOTONIC) # error No monotonic clock found. #endif /* Implementations */ #if \ (defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) || \ (defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) || \ (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) || \ (defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK)) || \ (defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_CLOCK)) || \ (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_CLOCK)) || \ (defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_TIME)) || \ (defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_TIME)) || \ (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_TIME)) # include <time.h> #endif #if \ (defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETTIMEOFDAY)) || \ (defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETTIMEOFDAY)) || \ (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETTIMEOFDAY)) # include <sys/time.h> #endif #if \ (defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETPROCESSTIMES)) || \ (defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETPROCESSTIMES)) || \ (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETPROCESSTIMES)) || \ (defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETTICKCOUNT64)) || \ (defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETTICKCOUNT64)) || \ (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETTICKCOUNT64)) # include <windows.h> #endif #if \ (defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETRUSAGE)) || \ (defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETRUSAGE)) || \ (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETRUSAGE)) # include <sys/time.h> # include <sys/resource.h> #endif #if \ (defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME)) || \ (defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME)) || \ (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME)) # include <CoreServices/CoreServices.h> # include <mach/mach.h> # include <mach/mach_time.h> #endif /*** Implementations ***/ #define PSNIP_CLOCK_NSEC_PER_SEC ((psnip_uint32_t) (1000000000ULL)) #if \ (defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) || \ (defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) || \ (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) PSNIP_CLOCK__FUNCTION psnip_uint32_t psnip_clock__clock_getres (clockid_t clk_id)
{ struct timespec res; int r; r = clock_getres(clk_id, &res); if (r != 0) return 0; return (psnip_uint32_t) (PSNIP_CLOCK_NSEC_PER_SEC / res.tv_nsec); } PSNIP_CLOCK__FUNCTION int psnip_clock__clock_gettime (clockid_t clk_id, struct PsnipClockTimespec* res) { struct timespec ts; if (clock_gettime(clk_id, &ts) != 0) return -10; res->seconds = (psnip_uint64_t) (ts.tv_sec); res->nanoseconds = (psnip_uint64_t) (ts.tv_nsec); return 0; } #endif PSNIP_CLOCK__FUNCTION psnip_uint32_t psnip_clock_wall_get_precision (void) { #if !defined(PSNIP_CLOCK_WALL_METHOD) return 0; #elif defined(PSNIP_CLOCK_WALL_METHOD) && PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME return psnip_clock__clock_getres(PSNIP_CLOCK_CLOCK_GETTIME_WALL); #elif defined(PSNIP_CLOCK_WALL_METHOD) && PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETTIMEOFDAY return 1000000; #elif defined(PSNIP_CLOCK_WALL_METHOD) && PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_TIME return 1; #else return 0; #endif } PSNIP_CLOCK__FUNCTION int psnip_clock_wall_get_time (struct PsnipClockTimespec* res) { (void) res; #if !defined(PSNIP_CLOCK_WALL_METHOD) return -2; #elif defined(PSNIP_CLOCK_WALL_METHOD) && PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME return psnip_clock__clock_gettime(PSNIP_CLOCK_CLOCK_GETTIME_WALL, res); #elif defined(PSNIP_CLOCK_WALL_METHOD) && PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_TIME res->seconds = time(NULL); res->nanoseconds = 0; #elif defined(PSNIP_CLOCK_WALL_METHOD) && PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETTIMEOFDAY struct timeval tv; if (gettimeofday(&tv, NULL) != 0) return -6; res->seconds = tv.tv_sec; res->nanoseconds = tv.tv_usec * 1000; #else return -2; #endif return 0; } PSNIP_CLOCK__FUNCTION psnip_uint32_t psnip_clock_cpu_get_precision (void) { #if !defined(PSNIP_CLOCK_CPU_METHOD) return 0; #elif defined(PSNIP_CLOCK_CPU_METHOD) && PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME return psnip_clock__clock_getres(PSNIP_CLOCK_CLOCK_GETTIME_CPU); #elif defined(PSNIP_CLOCK_CPU_METHOD) && PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK return CLOCKS_PER_SEC; #elif defined(PSNIP_CLOCK_CPU_METHOD) && PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETPROCESSTIMES return PSNIP_CLOCK_NSEC_PER_SEC / 100; #else return 0; #endif } PSNIP_CLOCK__FUNCTION int psnip_clock_cpu_get_time (struct PsnipClockTimespec* res) { #if !defined(PSNIP_CLOCK_CPU_METHOD) (void) res; return -2; #elif defined(PSNIP_CLOCK_CPU_METHOD) && PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME return psnip_clock__clock_gettime(PSNIP_CLOCK_CLOCK_GETTIME_CPU, res); #elif defined(PSNIP_CLOCK_CPU_METHOD) && PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK clock_t t = clock(); if (t == ((clock_t) -1)) return -5; res->seconds = t / CLOCKS_PER_SEC; res->nanoseconds = (t % CLOCKS_PER_SEC) * (PSNIP_CLOCK_NSEC_PER_SEC / CLOCKS_PER_SEC); #elif defined(PSNIP_CLOCK_CPU_METHOD) && PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETPROCESSTIMES FILETIME CreationTime, ExitTime, KernelTime, UserTime; LARGE_INTEGER date, adjust; if (!GetProcessTimes(GetCurrentProcess(), &CreationTime, &ExitTime, &KernelTime, &UserTime)) return -7; /* http://www.frenk.com/2009/12/convert-filetime-to-unix-timestamp/ */ date.HighPart = UserTime.dwHighDateTime; date.LowPart = UserTime.dwLowDateTime; adjust.QuadPart = 11644473600000 * 10000; date.QuadPart -= adjust.QuadPart; res->seconds = date.QuadPart / 10000000; res->nanoseconds = (date.QuadPart % 10000000) * (PSNIP_CLOCK_NSEC_PER_SEC / 100); #elif PSNIP_CLOCK_CPU_METHOD == 
PSNIP_CLOCK_METHOD_GETRUSAGE struct rusage usage; if (getrusage(RUSAGE_SELF, &usage) != 0) return -8; res->seconds = usage.ru_utime.tv_sec; res->nanoseconds = usage.ru_utime.tv_usec * 1000; #else (void) res; return -2; #endif return 0; } PSNIP_CLOCK__FUNCTION psnip_uint32_t psnip_clock_monotonic_get_precision (void) { #if !defined(PSNIP_CLOCK_MONOTONIC_METHOD) return 0; #elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME return psnip_clock__clock_getres(PSNIP_CLOCK_CLOCK_GETTIME_MONOTONIC); #elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME static mach_timebase_info_data_t tbi = { 0, }; if (tbi.denom == 0) mach_timebase_info(&tbi); return (psnip_uint32_t) (tbi.numer / tbi.denom); #elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETTICKCOUNT64 return 1000; #elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_QUERYPERFORMANCECOUNTER LARGE_INTEGER Frequency; QueryPerformanceFrequency(&Frequency); return (psnip_uint32_t) ((Frequency.QuadPart > PSNIP_CLOCK_NSEC_PER_SEC) ? PSNIP_CLOCK_NSEC_PER_SEC : Frequency.QuadPart); #else return 0; #endif } PSNIP_CLOCK__FUNCTION int psnip_clock_monotonic_get_time (struct PsnipClockTimespec* res) { #if !defined(PSNIP_CLOCK_MONOTONIC_METHOD) (void) res; return -2; #elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME return psnip_clock__clock_gettime(PSNIP_CLOCK_CLOCK_GETTIME_MONOTONIC, res); #elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME psnip_uint64_t nsec = mach_absolute_time(); static mach_timebase_info_data_t tbi = { 0, }; if (tbi.denom == 0) mach_timebase_info(&tbi); nsec *= ((psnip_uint64_t) tbi.numer) / ((psnip_uint64_t) tbi.denom); res->seconds = nsec / PSNIP_CLOCK_NSEC_PER_SEC; res->nanoseconds = nsec % PSNIP_CLOCK_NSEC_PER_SEC; #elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_QUERYPERFORMANCECOUNTER LARGE_INTEGER t, f; if (QueryPerformanceCounter(&t) == 0) return -12; QueryPerformanceFrequency(&f); res->seconds = t.QuadPart / f.QuadPart; res->nanoseconds = t.QuadPart % f.QuadPart; if (f.QuadPart > PSNIP_CLOCK_NSEC_PER_SEC) res->nanoseconds /= f.QuadPart / PSNIP_CLOCK_NSEC_PER_SEC; else res->nanoseconds *= PSNIP_CLOCK_NSEC_PER_SEC / f.QuadPart; #elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETTICKCOUNT64 const ULONGLONG msec = GetTickCount64(); res->seconds = msec / 1000; res->nanoseconds = (msec % 1000) * (PSNIP_CLOCK_NSEC_PER_SEC / 1000); #else return -2; #endif return 0; } /* Returns the number of ticks per second for the specified clock. * For example, a clock with millisecond precision would return 1000, * and a clock with 1 second (such as the time() function) would * return 1. * * If the requested clock isn't available, it will return 0. * Hopefully this will be rare, but if it happens to you please let us * know so we can work on finding a way to support your system. * * Note that different clocks on the same system often have * different precisions.
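* As a worked example from the backends above: a clock_gettime()
* backend whose resolution is 1 ns reports 1000000000, a
* clock()-based CPU backend reports CLOCKS_PER_SEC (1000000 on
* POSIX systems), and a GetTickCount64() backend reports 1000.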
*/ PSNIP_CLOCK__FUNCTION psnip_uint32_t psnip_clock_get_precision (enum PsnipClockType clock_type) { switch (clock_type) { case PSNIP_CLOCK_TYPE_MONOTONIC: return psnip_clock_monotonic_get_precision (); case PSNIP_CLOCK_TYPE_CPU: return psnip_clock_cpu_get_precision (); case PSNIP_CLOCK_TYPE_WALL: return psnip_clock_wall_get_precision (); } PSNIP_CLOCK_UNREACHABLE(); return 0; } /* Set the provided timespec to the requested time. Returns 0 on * success, or a negative value on failure. */ PSNIP_CLOCK__FUNCTION int psnip_clock_get_time (enum PsnipClockType clock_type, struct PsnipClockTimespec* res) { assert(res != NULL); switch (clock_type) { case PSNIP_CLOCK_TYPE_MONOTONIC: return psnip_clock_monotonic_get_time (res); case PSNIP_CLOCK_TYPE_CPU: return psnip_clock_cpu_get_time (res); case PSNIP_CLOCK_TYPE_WALL: return psnip_clock_wall_get_time (res); } return -1; } #endif /* !defined(PSNIP_CLOCK_H) */ static psnip_uint64_t munit_clock_get_elapsed(struct PsnipClockTimespec* start, struct PsnipClockTimespec* end) { psnip_uint64_t r = (end->seconds - start->seconds) * PSNIP_CLOCK_NSEC_PER_SEC; if (end->nanoseconds < start->nanoseconds) { r -= (start->nanoseconds - end->nanoseconds); } else { r += (end->nanoseconds - start->nanoseconds); } return r; } #else # include <time.h> #endif /* defined(MUNIT_ENABLE_TIMING) */ /*** PRNG stuff ***/ /* This is (unless I screwed up, which is entirely possible) the * version of PCG with 32-bit state. It was chosen because it has a * small enough state that we should reliably be able to use CAS * instead of requiring a lock for thread-safety. * * If I did screw up, I probably will not bother changing it unless * there is a significant bias. It's really not important this be * particularly strong, as long as it is fairly random it's much more * important that it be reproducible, so bug reports have a better * chance of being reproducible.
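* Concretely, the generator below advances a 32-bit LCG state,
* state' = state * 747796405 + 1729 (mod 2^32), and derives the
* output by permuting the state with a data-dependent shift, an
* xor, a multiply, and a final xorshift (see munit_rand_from_state()).
* The tiny state is what lets munit_rand_uint32() publish each
* update with a single compare-and-swap instead of taking a lock.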
*/ #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__) && !defined(__EMSCRIPTEN__) && (!defined(__GNUC_MINOR__) || (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ > 8)) # define HAVE_STDATOMIC #elif defined(__clang__) # if __has_extension(c_atomic) # define HAVE_CLANG_ATOMICS # endif #endif /* Workaround for http://llvm.org/bugs/show_bug.cgi?id=26911 */ #if defined(__clang__) && defined(_WIN32) # undef HAVE_STDATOMIC # if defined(__c2__) # undef HAVE_CLANG_ATOMICS # endif #endif #if defined(_OPENMP) # define ATOMIC_UINT32_T uint32_t # define ATOMIC_UINT32_INIT(x) (x) #elif defined(HAVE_STDATOMIC) # include <stdatomic.h> # define ATOMIC_UINT32_T _Atomic uint32_t # define ATOMIC_UINT32_INIT(x) ATOMIC_VAR_INIT(x) #elif defined(HAVE_CLANG_ATOMICS) # define ATOMIC_UINT32_T _Atomic uint32_t # define ATOMIC_UINT32_INIT(x) (x) #elif defined(_WIN32) # define ATOMIC_UINT32_T volatile LONG # define ATOMIC_UINT32_INIT(x) (x) #else # define ATOMIC_UINT32_T volatile uint32_t # define ATOMIC_UINT32_INIT(x) (x) #endif static ATOMIC_UINT32_T munit_rand_state = ATOMIC_UINT32_INIT(42); #if defined(_OPENMP) static inline void munit_atomic_store(ATOMIC_UINT32_T* dest, ATOMIC_UINT32_T value) { #pragma omp critical (munit_atomics) *dest = value; } static inline uint32_t munit_atomic_load(ATOMIC_UINT32_T* src) { int ret; #pragma omp critical (munit_atomics) ret = *src; return ret; } static inline uint32_t munit_atomic_cas(ATOMIC_UINT32_T* dest, ATOMIC_UINT32_T* expected, ATOMIC_UINT32_T desired) { bool ret; #pragma omp critical (munit_atomics) { if (*dest == *expected) { *dest = desired; ret = true; } else { ret = false; } } return ret; } #elif defined(HAVE_STDATOMIC) # define munit_atomic_store(dest, value) atomic_store(dest, value) # define munit_atomic_load(src) atomic_load(src) # define munit_atomic_cas(dest, expected, value) atomic_compare_exchange_weak(dest, expected, value) #elif defined(HAVE_CLANG_ATOMICS) # define munit_atomic_store(dest, value) __c11_atomic_store(dest, value, __ATOMIC_SEQ_CST) # define munit_atomic_load(src) __c11_atomic_load(src, __ATOMIC_SEQ_CST) # define munit_atomic_cas(dest, expected, value) __c11_atomic_compare_exchange_weak(dest, expected, value, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) #elif defined(__GNUC__) && (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7) # define munit_atomic_store(dest, value) __atomic_store_n(dest, value, __ATOMIC_SEQ_CST) # define munit_atomic_load(src) __atomic_load_n(src, __ATOMIC_SEQ_CST) # define munit_atomic_cas(dest, expected, value) __atomic_compare_exchange_n(dest, expected, value, true, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) #elif defined(__GNUC__) && (__GNUC__ >= 4) # define munit_atomic_store(dest,value) do { *(dest) = (value); } while (0) # define munit_atomic_load(src) (*(src)) # define munit_atomic_cas(dest, expected, value) __sync_bool_compare_and_swap(dest, *expected, value) #elif defined(_WIN32) /* Untested */ # define munit_atomic_store(dest,value) do { *(dest) = (value); } while (0) # define munit_atomic_load(src) (*(src)) # define munit_atomic_cas(dest, expected, value) InterlockedCompareExchange((dest), (value), *(expected)) #else # warning No atomic implementation, PRNG will not be thread-safe # define munit_atomic_store(dest, value) do { *(dest) = (value); } while (0) # define munit_atomic_load(src) (*(src)) static inline bool munit_atomic_cas(ATOMIC_UINT32_T* dest, ATOMIC_UINT32_T* expected, ATOMIC_UINT32_T desired) { if (*dest == *expected) { *dest = desired; return true; } else { return false; }
} #endif #define MUNIT_PRNG_MULTIPLIER (747796405U) #define MUNIT_PRNG_INCREMENT (1729U) static munit_uint32_t munit_rand_next_state(munit_uint32_t state) { return state * MUNIT_PRNG_MULTIPLIER + MUNIT_PRNG_INCREMENT; } static munit_uint32_t munit_rand_from_state(munit_uint32_t state) { munit_uint32_t res = ((state >> ((state >> 28) + 4)) ^ state) * (277803737U); res ^= res >> 22; return res; } void munit_rand_seed(munit_uint32_t seed) { munit_uint32_t state = munit_rand_next_state(seed + MUNIT_PRNG_INCREMENT); munit_atomic_store(&munit_rand_state, state); } static munit_uint32_t munit_rand_generate_seed(void) { munit_uint32_t seed, state; #if defined(MUNIT_ENABLE_TIMING) struct PsnipClockTimespec wc = { 0, 0 }; psnip_clock_get_time(PSNIP_CLOCK_TYPE_WALL, &wc); seed = (munit_uint32_t) wc.nanoseconds; #else seed = (munit_uint32_t) time(NULL); #endif state = munit_rand_next_state(seed + MUNIT_PRNG_INCREMENT); return munit_rand_from_state(state); } static munit_uint32_t munit_rand_state_uint32(munit_uint32_t* state) { const munit_uint32_t old = *state; *state = munit_rand_next_state(old); return munit_rand_from_state(old); } munit_uint32_t munit_rand_uint32(void) { munit_uint32_t old, state; do { old = munit_atomic_load(&munit_rand_state); state = munit_rand_next_state(old); } while (!munit_atomic_cas(&munit_rand_state, &old, state)); return munit_rand_from_state(old); } static void munit_rand_state_memory(munit_uint32_t* state, size_t size, munit_uint8_t data[MUNIT_ARRAY_PARAM(size)]) { size_t members_remaining = size / sizeof(munit_uint32_t); size_t bytes_remaining = size % sizeof(munit_uint32_t); munit_uint8_t* b = data; munit_uint32_t rv; while (members_remaining-- > 0) { rv = munit_rand_state_uint32(state); memcpy(b, &rv, sizeof(munit_uint32_t)); b += sizeof(munit_uint32_t); } if (bytes_remaining != 0) { rv = munit_rand_state_uint32(state); memcpy(b, &rv, bytes_remaining); } } void munit_rand_memory(size_t size, munit_uint8_t data[MUNIT_ARRAY_PARAM(size)]) { munit_uint32_t old, state; do { state = old = munit_atomic_load(&munit_rand_state); munit_rand_state_memory(&state, size, data); } while (!munit_atomic_cas(&munit_rand_state, &old, state)); } static munit_uint32_t munit_rand_state_at_most(munit_uint32_t* state, munit_uint32_t salt, munit_uint32_t max) { /* We want (UINT32_MAX + 1) % max, which in unsigned arithmetic is the same * as (UINT32_MAX + 1 - max) % max = -max % max. We compute -max using not * to avoid compiler warnings. 
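* As a concrete check with max = 1000: ~1000 + 1 is 4294966296
* (i.e. 2^32 - 1000) in 32-bit arithmetic, and 4294966296 % 1000 ==
* 296 == (2^32) % 1000. The rejection loop below discards raw draws
* smaller than this cutoff so that the final modulo cannot favor the
* low residues.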
*/ const munit_uint32_t min = (~max + 1U) % max; munit_uint32_t x; if (max == (~((munit_uint32_t) 0U))) return munit_rand_state_uint32(state) ^ salt; max++; do { x = munit_rand_state_uint32(state) ^ salt; } while (x < min); return x % max; } static munit_uint32_t munit_rand_at_most(munit_uint32_t salt, munit_uint32_t max) { munit_uint32_t old, state; munit_uint32_t retval; do { state = old = munit_atomic_load(&munit_rand_state); retval = munit_rand_state_at_most(&state, salt, max); } while (!munit_atomic_cas(&munit_rand_state, &old, state)); return retval; } int munit_rand_int_range(int min, int max) { munit_uint64_t range = (munit_uint64_t) max - (munit_uint64_t) min; if (min > max) return munit_rand_int_range(max, min); if (range > (~((munit_uint32_t) 0U))) range = (~((munit_uint32_t) 0U)); return min + munit_rand_at_most(0, (munit_uint32_t) range); } double munit_rand_double(void) { munit_uint32_t old, state; double retval = 0.0; do { state = old = munit_atomic_load(&munit_rand_state); /* See http://mumble.net/~campbell/tmp/random_real.c for how to do * this right. Patches welcome if you feel that this is too * biased. */ retval = munit_rand_state_uint32(&state) / ((~((munit_uint32_t) 0U)) + 1.0); } while (!munit_atomic_cas(&munit_rand_state, &old, state)); return retval; } /*** Test suite handling ***/ typedef struct { unsigned int successful; unsigned int skipped; unsigned int failed; unsigned int errored; #if defined(MUNIT_ENABLE_TIMING) munit_uint64_t cpu_clock; munit_uint64_t wall_clock; #endif } MunitReport; typedef struct { const char* prefix; const MunitSuite* suite; const char** tests; munit_uint32_t seed; unsigned int iterations; MunitParameter* parameters; bool single_parameter_mode; void* user_data; MunitReport report; bool colorize; bool fork; bool show_stderr; bool fatal_failures; } MunitTestRunner; const char* munit_parameters_get(const MunitParameter params[], const char* key) { const MunitParameter* param; for (param = params ; param != NULL && param->name != NULL ; param++) if (strcmp(param->name, key) == 0) return param->value; return NULL; } #if defined(MUNIT_ENABLE_TIMING) static void munit_print_time(FILE* fp, munit_uint64_t nanoseconds) { fprintf(fp, "%" MUNIT_TEST_TIME_FORMAT, ((double) nanoseconds) / ((double) PSNIP_CLOCK_NSEC_PER_SEC)); } #endif /* Add a parameter to an array of parameters. */ static MunitResult munit_parameters_add(size_t* params_size, MunitParameter* params[MUNIT_ARRAY_PARAM(*params_size)], char* name, char* value) { *params = realloc(*params, sizeof(MunitParameter) * (*params_size + 2)); if (*params == NULL) return MUNIT_ERROR; (*params)[*params_size].name = name; (*params)[*params_size].value = value; (*params_size)++; (*params)[*params_size].name = NULL; (*params)[*params_size].value = NULL; return MUNIT_OK; } /* Concatenate two strings, but just return one of the components * unaltered if the other is NULL or "". */ static char* munit_maybe_concat(size_t* len, char* prefix, char* suffix) { char* res; size_t res_l; const size_t prefix_l = prefix != NULL ? strlen(prefix) : 0; const size_t suffix_l = suffix != NULL ?
strlen(suffix) : 0; if (prefix_l == 0 && suffix_l == 0) { res = NULL; res_l = 0; } else if (prefix_l == 0 && suffix_l != 0) { res = suffix; res_l = suffix_l; } else if (prefix_l != 0 && suffix_l == 0) { res = prefix; res_l = prefix_l; } else { res_l = prefix_l + suffix_l; res = malloc(res_l + 1); memcpy(res, prefix, prefix_l); memcpy(res + prefix_l, suffix, suffix_l); res[res_l] = 0; } if (len != NULL) *len = res_l; return res; } /* Possibly free a string returned by munit_maybe_concat. */ static void munit_maybe_free_concat(char* s, const char* prefix, const char* suffix) { if (prefix != s && suffix != s) free(s); } /* Cheap string hash function, just used to salt the PRNG. */ static munit_uint32_t munit_str_hash(const char* name) { const char *p; munit_uint32_t h = 5381U; for (p = name; *p != '\0'; p++) h = (h << 5) + h + *p; return h; } static void munit_splice(int from, int to) { munit_uint8_t buf[1024]; #if !defined(_WIN32) ssize_t len; ssize_t bytes_written; ssize_t write_res; #else int len; int bytes_written; int write_res; #endif do { len = read(from, buf, sizeof(buf)); if (len > 0) { bytes_written = 0; do { write_res = write(to, buf + bytes_written, len - bytes_written); if (write_res < 0) break; bytes_written += write_res; } while (bytes_written < len); } else break; } while (true); } /* This is the part that should be handled in the child process */ static MunitResult munit_test_runner_exec(MunitTestRunner* runner, const MunitTest* test, const MunitParameter params[], MunitReport* report) { unsigned int iterations = runner->iterations; MunitResult result = MUNIT_FAIL; #if defined(MUNIT_ENABLE_TIMING) struct PsnipClockTimespec wall_clock_begin = { 0, 0 }, wall_clock_end = { 0, 0 }; struct PsnipClockTimespec cpu_clock_begin = { 0, 0 }, cpu_clock_end = { 0, 0 }; #endif unsigned int i = 0; if ((test->options & MUNIT_TEST_OPTION_SINGLE_ITERATION) == MUNIT_TEST_OPTION_SINGLE_ITERATION) iterations = 1; else if (iterations == 0) iterations = runner->suite->iterations; munit_rand_seed(runner->seed); do { void* data = (test->setup == NULL) ?
runner->user_data : test->setup(params, runner->user_data); #if defined(MUNIT_ENABLE_TIMING) psnip_clock_get_time(PSNIP_CLOCK_TYPE_WALL, &wall_clock_begin); psnip_clock_get_time(PSNIP_CLOCK_TYPE_CPU, &cpu_clock_begin); #endif result = test->test(params, data); #if defined(MUNIT_ENABLE_TIMING) psnip_clock_get_time(PSNIP_CLOCK_TYPE_WALL, &wall_clock_end); psnip_clock_get_time(PSNIP_CLOCK_TYPE_CPU, &cpu_clock_end); #endif if (test->tear_down != NULL) test->tear_down(data); if (MUNIT_LIKELY(result == MUNIT_OK)) { report->successful++; #if defined(MUNIT_ENABLE_TIMING) report->wall_clock += munit_clock_get_elapsed(&wall_clock_begin, &wall_clock_end); report->cpu_clock += munit_clock_get_elapsed(&cpu_clock_begin, &cpu_clock_end); #endif } else { switch ((int) result) { case MUNIT_SKIP: report->skipped++; break; case MUNIT_FAIL: report->failed++; break; case MUNIT_ERROR: report->errored++; break; default: break; } break; } } while (++i < iterations); return result; } #if defined(MUNIT_EMOTICON) # define MUNIT_RESULT_STRING_OK ":)" # define MUNIT_RESULT_STRING_SKIP ":|" # define MUNIT_RESULT_STRING_FAIL ":(" # define MUNIT_RESULT_STRING_ERROR ":o" # define MUNIT_RESULT_STRING_TODO ":/" #else # define MUNIT_RESULT_STRING_OK "OK " # define MUNIT_RESULT_STRING_SKIP "SKIP " # define MUNIT_RESULT_STRING_FAIL "FAIL " # define MUNIT_RESULT_STRING_ERROR "ERROR" # define MUNIT_RESULT_STRING_TODO "TODO " #endif static void munit_test_runner_print_color(const MunitTestRunner* runner, const char* string, char color) { if (runner->colorize) fprintf(MUNIT_OUTPUT_FILE, "\x1b[3%cm%s\x1b[39m", color, string); else fputs(string, MUNIT_OUTPUT_FILE); } #if !defined(MUNIT_NO_BUFFER) static int munit_replace_stderr(FILE* stderr_buf) { if (stderr_buf != NULL) { const int orig_stderr = dup(STDERR_FILENO); int errfd = fileno(stderr_buf); if (MUNIT_UNLIKELY(errfd == -1)) { exit(EXIT_FAILURE); } dup2(errfd, STDERR_FILENO); return orig_stderr; } return -1; } static void munit_restore_stderr(int orig_stderr) { if (orig_stderr != -1) { dup2(orig_stderr, STDERR_FILENO); close(orig_stderr); } } #endif /* !defined(MUNIT_NO_BUFFER) */ /* Run a test with the specified parameters. 
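* In the forking path below the child runs munit_test_runner_exec()
* and writes its MunitReport back to the parent as raw bytes over a
* pipe; the parent reads sizeof(report) bytes and then waitpid()s on
* the child. The non-forking path runs in-process, guarded by
* setjmp/longjmp when thread-local storage is available so that a
* failed assertion can unwind back here.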
*/ static void munit_test_runner_run_test_with_params(MunitTestRunner* runner, const MunitTest* test, const MunitParameter params[]) { MunitResult result = MUNIT_OK; MunitReport report = { 0, 0, 0, 0, #if defined(MUNIT_ENABLE_TIMING) 0, 0 #endif }; unsigned int output_l; bool first; const MunitParameter* param; FILE* stderr_buf; #if !defined(MUNIT_NO_FORK) int pipefd[2]; pid_t fork_pid; ssize_t bytes_written = 0; ssize_t write_res; ssize_t bytes_read = 0; ssize_t read_res; int status = 0; pid_t changed_pid; #endif if (params != NULL) { output_l = 2; fputs(" ", MUNIT_OUTPUT_FILE); first = true; for (param = params ; param != NULL && param->name != NULL ; param++) { if (!first) { fputs(", ", MUNIT_OUTPUT_FILE); output_l += 2; } else { first = false; } output_l += fprintf(MUNIT_OUTPUT_FILE, "%s=%s", param->name, param->value); } while (output_l++ < MUNIT_TEST_NAME_LEN) { fputc(' ', MUNIT_OUTPUT_FILE); } } fflush(MUNIT_OUTPUT_FILE); stderr_buf = NULL; #if !defined(_WIN32) || defined(__MINGW32__) stderr_buf = tmpfile(); #else tmpfile_s(&stderr_buf); #endif if (stderr_buf == NULL) { munit_log_errno(MUNIT_LOG_ERROR, stderr, "unable to create buffer for stderr"); result = MUNIT_ERROR; goto print_result; } #if !defined(MUNIT_NO_FORK) if (runner->fork) { pipefd[0] = -1; pipefd[1] = -1; if (pipe(pipefd) != 0) { munit_log_errno(MUNIT_LOG_ERROR, stderr, "unable to create pipe"); result = MUNIT_ERROR; goto print_result; } fork_pid = fork(); if (fork_pid == 0) { #if !defined(MUNIT_NO_BUFFER) int orig_stderr; #endif close(pipefd[0]); #if !defined(MUNIT_NO_BUFFER) orig_stderr = munit_replace_stderr(stderr_buf); #endif munit_test_runner_exec(runner, test, params, &report); #if !defined(MUNIT_NO_BUFFER) /* Note that we don't restore stderr. This is so we can buffer * things written to stderr later on (such as by * asan/tsan/ubsan, valgrind, etc.) 
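* The parent keeps stderr_buf around and, once the test finishes,
* rewinds it and splices it to the real stderr via munit_splice()
* when the test failed or --show-stderr was given.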
*/ close(orig_stderr); #endif do { write_res = write(pipefd[1], ((munit_uint8_t*) (&report)) + bytes_written, sizeof(report) - bytes_written); if (write_res < 0) { if (stderr_buf != NULL) { munit_log_errno(MUNIT_LOG_ERROR, stderr, "unable to write to pipe"); } exit(EXIT_FAILURE); } bytes_written += write_res; } while ((size_t) bytes_written < sizeof(report)); if (stderr_buf != NULL) fclose(stderr_buf); close(pipefd[1]); exit(EXIT_SUCCESS); } else if (fork_pid == -1) { close(pipefd[0]); close(pipefd[1]); if (stderr_buf != NULL) { munit_log_errno(MUNIT_LOG_ERROR, stderr, "unable to fork"); } report.errored++; result = MUNIT_ERROR; } else { close(pipefd[1]); do { read_res = read(pipefd[0], ((munit_uint8_t*) (&report)) + bytes_read, sizeof(report) - bytes_read); if (read_res < 1) break; bytes_read += read_res; } while (bytes_read < (ssize_t) sizeof(report)); changed_pid = waitpid(fork_pid, &status, 0); if (MUNIT_LIKELY(changed_pid == fork_pid) && MUNIT_LIKELY(WIFEXITED(status))) { if (bytes_read != sizeof(report)) { munit_logf_internal(MUNIT_LOG_ERROR, stderr_buf, "child exited unexpectedly with status %d", WEXITSTATUS(status)); report.errored++; } else if (WEXITSTATUS(status) != EXIT_SUCCESS) { munit_logf_internal(MUNIT_LOG_ERROR, stderr_buf, "child exited with status %d", WEXITSTATUS(status)); report.errored++; } } else { if (WIFSIGNALED(status)) { #if defined(_XOPEN_VERSION) && (_XOPEN_VERSION >= 700) munit_logf_internal(MUNIT_LOG_ERROR, stderr_buf, "child killed by signal %d (%s)", WTERMSIG(status), strsignal(WTERMSIG(status))); #else munit_logf_internal(MUNIT_LOG_ERROR, stderr_buf, "child killed by signal %d", WTERMSIG(status)); #endif } else if (WIFSTOPPED(status)) { munit_logf_internal(MUNIT_LOG_ERROR, stderr_buf, "child stopped by signal %d", WSTOPSIG(status)); } report.errored++; } close(pipefd[0]); waitpid(fork_pid, NULL, 0); } } else #endif { #if !defined(MUNIT_NO_BUFFER) const volatile int orig_stderr = munit_replace_stderr(stderr_buf); #endif #if defined(MUNIT_THREAD_LOCAL) if (MUNIT_UNLIKELY(setjmp(munit_error_jmp_buf) != 0)) { result = MUNIT_FAIL; report.failed++; } else { munit_error_jmp_buf_valid = true; result = munit_test_runner_exec(runner, test, params, &report); } #else result = munit_test_runner_exec(runner, test, params, &report); #endif #if !defined(MUNIT_NO_BUFFER) munit_restore_stderr(orig_stderr); #endif /* Here just so that the label is used on Windows and we don't get * a warning */ goto print_result; } print_result: fputs("[ ", MUNIT_OUTPUT_FILE); if ((test->options & MUNIT_TEST_OPTION_TODO) == MUNIT_TEST_OPTION_TODO) { if (report.failed != 0 || report.errored != 0 || report.skipped != 0) { munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_TODO, '3'); result = MUNIT_OK; } else { munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_ERROR, '1'); if (MUNIT_LIKELY(stderr_buf != NULL)) munit_log_internal(MUNIT_LOG_ERROR, stderr_buf, "Test marked TODO, but was successful."); runner->report.failed++; result = MUNIT_ERROR; } } else if (report.failed > 0) { munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_FAIL, '1'); runner->report.failed++; result = MUNIT_FAIL; } else if (report.errored > 0) { munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_ERROR, '1'); runner->report.errored++; result = MUNIT_ERROR; } else if (report.skipped > 0) { munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_SKIP, '3'); runner->report.skipped++; result = MUNIT_SKIP; } else if (report.successful > 1) { munit_test_runner_print_color(runner, 
MUNIT_RESULT_STRING_OK, '2'); #if defined(MUNIT_ENABLE_TIMING) fputs(" ] [ ", MUNIT_OUTPUT_FILE); munit_print_time(MUNIT_OUTPUT_FILE, report.wall_clock / report.successful); fputs(" / ", MUNIT_OUTPUT_FILE); munit_print_time(MUNIT_OUTPUT_FILE, report.cpu_clock / report.successful); fprintf(MUNIT_OUTPUT_FILE, " CPU ]\n %-" MUNIT_XSTRINGIFY(MUNIT_TEST_NAME_LEN) "s Total: [ ", ""); munit_print_time(MUNIT_OUTPUT_FILE, report.wall_clock); fputs(" / ", MUNIT_OUTPUT_FILE); munit_print_time(MUNIT_OUTPUT_FILE, report.cpu_clock); fputs(" CPU", MUNIT_OUTPUT_FILE); #endif runner->report.successful++; result = MUNIT_OK; } else if (report.successful > 0) { munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_OK, '2'); #if defined(MUNIT_ENABLE_TIMING) fputs(" ] [ ", MUNIT_OUTPUT_FILE); munit_print_time(MUNIT_OUTPUT_FILE, report.wall_clock); fputs(" / ", MUNIT_OUTPUT_FILE); munit_print_time(MUNIT_OUTPUT_FILE, report.cpu_clock); fputs(" CPU", MUNIT_OUTPUT_FILE); #endif runner->report.successful++; result = MUNIT_OK; } fputs(" ]\n", MUNIT_OUTPUT_FILE); if (stderr_buf != NULL) { if (result == MUNIT_FAIL || result == MUNIT_ERROR || runner->show_stderr) { fflush(MUNIT_OUTPUT_FILE); rewind(stderr_buf); munit_splice(fileno(stderr_buf), STDERR_FILENO); fflush(stderr); } fclose(stderr_buf); } } static void munit_test_runner_run_test_wild(MunitTestRunner* runner, const MunitTest* test, const char* test_name, MunitParameter* params, MunitParameter* p) { const MunitParameterEnum* pe; char** values; MunitParameter* next; for (pe = test->parameters ; pe != NULL && pe->name != NULL ; pe++) { if (p->name == pe->name) break; } if (pe == NULL) return; for (values = pe->values ; *values != NULL ; values++) { next = p + 1; p->value = *values; if (next->name == NULL) { munit_test_runner_run_test_with_params(runner, test, params); } else { munit_test_runner_run_test_wild(runner, test, test_name, params, next); } if (runner->fatal_failures && (runner->report.failed != 0 || runner->report.errored != 0)) break; } } /* Run a single test, with every combination of parameters * requested. */ static void munit_test_runner_run_test(MunitTestRunner* runner, const MunitTest* test, const char* prefix) { char* test_name = munit_maybe_concat(NULL, (char*) prefix, (char*) test->name); /* The array of parameters to pass to * munit_test_runner_run_test_with_params */ MunitParameter* params = NULL; size_t params_l = 0; /* Wildcard parameters are parameters which have possible values * specified in the test, but no specific value was passed to the * CLI. That means we want to run the test once for every * possible combination of parameter values or, if --single was * passed to the CLI, a single time with a random set of * parameters. */ MunitParameter* wild_params = NULL; size_t wild_params_l = 0; const MunitParameterEnum* pe; const MunitParameter* cli_p; bool filled; unsigned int possible; char** vals; size_t first_wild; const MunitParameter* wp; int pidx; munit_rand_seed(runner->seed); fprintf(MUNIT_OUTPUT_FILE, "%-" MUNIT_XSTRINGIFY(MUNIT_TEST_NAME_LEN) "s", test_name); if (test->parameters == NULL) { /* No parameters. Simple, nice. */ munit_test_runner_run_test_with_params(runner, test, NULL); } else { fputc('\n', MUNIT_OUTPUT_FILE); for (pe = test->parameters ; pe != NULL && pe->name != NULL ; pe++) { /* Did we receive a value for this parameter from the CLI?
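* (CLI-supplied values take precedence; otherwise the runner either
* picks one value at random under --single or queues the parameter
* as a wildcard and later iterates over every combination.)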
*/ filled = false; for (cli_p = runner->parameters ; cli_p != NULL && cli_p->name != NULL ; cli_p++) { if (strcmp(cli_p->name, pe->name) == 0) { if (MUNIT_UNLIKELY(munit_parameters_add(&params_l, &params, pe->name, cli_p->value) != MUNIT_OK)) goto cleanup; filled = true; break; } } if (filled) continue; /* Nothing from CLI, is the enum NULL/empty? We're not a * fuzzer… */ if (pe->values == NULL || pe->values[0] == NULL) continue; /* If --single was passed to the CLI, choose a value from the * list of possibilities randomly. */ if (runner->single_parameter_mode) { possible = 0; for (vals = pe->values ; *vals != NULL ; vals++) possible++; /* We want the tests to be reproducible, even if you're only * running a single test, but we don't want every test with * the same number of parameters to choose the same parameter * number, so use the test name as a primitive salt. */ pidx = munit_rand_at_most(munit_str_hash(test_name), possible - 1); if (MUNIT_UNLIKELY(munit_parameters_add(&params_l, &params, pe->name, pe->values[pidx]) != MUNIT_OK)) goto cleanup; } else { /* We want to try every permutation. Put in a placeholder * entry, we'll iterate through them later. */ if (MUNIT_UNLIKELY(munit_parameters_add(&wild_params_l, &wild_params, pe->name, NULL) != MUNIT_OK)) goto cleanup; } } if (wild_params_l != 0) { first_wild = params_l; for (wp = wild_params ; wp != NULL && wp->name != NULL ; wp++) { for (pe = test->parameters ; pe != NULL && pe->name != NULL && pe->values != NULL ; pe++) { if (strcmp(wp->name, pe->name) == 0) { if (MUNIT_UNLIKELY(munit_parameters_add(&params_l, &params, pe->name, pe->values[0]) != MUNIT_OK)) goto cleanup; } } } munit_test_runner_run_test_wild(runner, test, test_name, params, params + first_wild); } else { munit_test_runner_run_test_with_params(runner, test, params); } cleanup: free(params); free(wild_params); } munit_maybe_free_concat(test_name, prefix, test->name); } /* Recurse through the suite and run all the tests. If a list of * tests to run was provided on the command line, run only those * tests. */ static void munit_test_runner_run_suite(MunitTestRunner* runner, const MunitSuite* suite, const char* prefix) { size_t pre_l; char* pre = munit_maybe_concat(&pre_l, (char*) prefix, (char*) suite->prefix); const MunitTest* test; const char** test_name; const MunitSuite* child_suite; /* Run the tests. */ for (test = suite->tests ; test != NULL && test->test != NULL ; test++) { if (runner->tests != NULL) { /* Specific tests were requested on the CLI */ for (test_name = runner->tests ; test_name != NULL && *test_name != NULL ; test_name++) { if ((pre_l == 0 || strncmp(pre, *test_name, pre_l) == 0) && strncmp(test->name, *test_name + pre_l, strlen(*test_name + pre_l)) == 0) { munit_test_runner_run_test(runner, test, pre); if (runner->fatal_failures && (runner->report.failed != 0 || runner->report.errored != 0)) goto cleanup; } } } else { /* Run all tests */ munit_test_runner_run_test(runner, test, pre); } } if (runner->fatal_failures && (runner->report.failed != 0 || runner->report.errored != 0)) goto cleanup; /* Run any child suites.
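* (Each child suite's prefix is concatenated onto the parent's via
* munit_maybe_concat(), so a test's full name on the CLI is the
* chain of suite prefixes followed by the test name.)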
*/ for (child_suite = suite->suites ; child_suite != NULL && child_suite->prefix != NULL ; child_suite++) { munit_test_runner_run_suite(runner, child_suite, pre); } cleanup: munit_maybe_free_concat(pre, prefix, suite->prefix); } static void munit_test_runner_run(MunitTestRunner* runner) { munit_test_runner_run_suite(runner, runner->suite, NULL); } static void munit_print_help(int argc, char* const argv[MUNIT_ARRAY_PARAM(argc)], void* user_data, const MunitArgument arguments[]) { const MunitArgument* arg; (void) argc; printf("USAGE: %s [OPTIONS...] [TEST...]\n\n", argv[0]); puts(" --seed SEED\n" " Value used to seed the PRNG. Must be a 32-bit integer in decimal\n" " notation with no separators (commas, decimals, spaces, etc.), or\n" " hexadecimal prefixed by \"0x\".\n" " --iterations N\n" " Run each test N times. 0 means the default number.\n" " --param name value\n" " A parameter key/value pair which will be passed to any test which\n" " takes a parameter of that name. If not provided, the test will be\n" " run once for each possible parameter value.\n" " --list Write a list of all available tests.\n" " --list-params\n" " Write a list of all available tests and their possible parameters.\n" " --single Run each parameterized test in a single configuration instead of\n" " every possible combination\n" " --log-visible debug|info|warning|error\n" " --log-fatal debug|info|warning|error\n" " Set the level at which messages of different severities are visible,\n" " or cause the test to terminate.\n" #if !defined(MUNIT_NO_FORK) " --no-fork Do not execute tests in a child process. If this option is supplied\n" " and a test crashes (including by failing an assertion), no further\n" " tests will be performed.\n" #endif " --fatal-failures\n" " Stop executing tests as soon as a failure is found.\n" " --show-stderr\n" " Show data written to stderr by the tests, even if the test succeeds.\n" " --color auto|always|never\n" " Colorize (or don't) the output.\n" /* 12345678901234567890123456789012345678901234567890123456789012345678901234567890 */ " --help Print this help message and exit.\n"); #if defined(MUNIT_NL_LANGINFO) setlocale(LC_ALL, ""); fputs((strcasecmp("UTF-8", nl_langinfo(CODESET)) == 0) ?
"µnit" : "munit", stdout); #else puts("munit"); #endif printf(" %d.%d.%d\n" "Full documentation at: https://nemequ.github.io/munit/\n", (MUNIT_CURRENT_VERSION >> 16) & 0xff, (MUNIT_CURRENT_VERSION >> 8) & 0xff, (MUNIT_CURRENT_VERSION >> 0) & 0xff); for (arg = arguments ; arg != NULL && arg->name != NULL ; arg++) arg->write_help(arg, user_data); } static const MunitArgument* munit_arguments_find(const MunitArgument arguments[], const char* name) { const MunitArgument* arg; for (arg = arguments ; arg != NULL && arg->name != NULL ; arg++) if (strcmp(arg->name, name) == 0) return arg; return NULL; } static void munit_suite_list_tests(const MunitSuite* suite, bool show_params, const char* prefix) { size_t pre_l; char* pre = munit_maybe_concat(&pre_l, (char*) prefix, (char*) suite->prefix); const MunitTest* test; const MunitParameterEnum* params; bool first; char** val; const MunitSuite* child_suite; for (test = suite->tests ; test != NULL && test->name != NULL ; test++) { if (pre != NULL) fputs(pre, stdout); puts(test->name); if (show_params) { for (params = test->parameters ; params != NULL && params->name != NULL ; params++) { fprintf(stdout, " - %s: ", params->name); if (params->values == NULL) { puts("Any"); } else { first = true; for (val = params->values ; *val != NULL ; val++ ) { if(!first) { fputs(", ", stdout); } else { first = false; } fputs(*val, stdout); } putc('\n', stdout); } } } } for (child_suite = suite->suites ; child_suite != NULL && child_suite->prefix != NULL ; child_suite++) { munit_suite_list_tests(child_suite, show_params, pre); } munit_maybe_free_concat(pre, prefix, suite->prefix); } static bool munit_stream_supports_ansi(FILE *stream) { #if !defined(_WIN32) return isatty(fileno(stream)); #else #if !defined(__MINGW32__) size_t ansicon_size = 0; #endif if (isatty(fileno(stream))) { #if !defined(__MINGW32__) getenv_s(&ansicon_size, NULL, 0, "ANSICON"); return ansicon_size != 0; #else return getenv("ANSICON") != NULL; #endif } return false; #endif } int munit_suite_main_custom(const MunitSuite* suite, void* user_data, int argc, char* const argv[MUNIT_ARRAY_PARAM(argc)], const MunitArgument arguments[]) { int result = EXIT_FAILURE; MunitTestRunner runner; size_t parameters_size = 0; size_t tests_size = 0; int arg; char* envptr; unsigned long ts; char* endptr; unsigned long long iterations; MunitLogLevel level; const MunitArgument* argument; const char** runner_tests; unsigned int tests_run; unsigned int tests_total; runner.prefix = NULL; runner.suite = NULL; runner.tests = NULL; runner.seed = 0; runner.iterations = 0; runner.parameters = NULL; runner.single_parameter_mode = false; runner.user_data = NULL; runner.report.successful = 0; runner.report.skipped = 0; runner.report.failed = 0; runner.report.errored = 0; #if defined(MUNIT_ENABLE_TIMING) runner.report.cpu_clock = 0; runner.report.wall_clock = 0; #endif runner.colorize = false; #if !defined(_WIN32) runner.fork = true; #else runner.fork = false; #endif runner.show_stderr = false; runner.fatal_failures = false; runner.suite = suite; runner.user_data = user_data; runner.seed = munit_rand_generate_seed(); runner.colorize = munit_stream_supports_ansi(MUNIT_OUTPUT_FILE); for (arg = 1 ; arg < argc ; arg++) { if (strncmp("--", argv[arg], 2) == 0) { if (strcmp("seed", argv[arg] + 2) == 0) { if (arg + 1 >= argc) { munit_logf_internal(MUNIT_LOG_ERROR, stderr, "%s requires an argument", argv[arg]); goto cleanup; } envptr = argv[arg + 1]; ts = strtoul(argv[arg + 1], &envptr, 0); if (*envptr != '\0' || ts > (~((munit_uint32_t) 
0U))) { munit_logf_internal(MUNIT_LOG_ERROR, stderr, "invalid value ('%s') passed to %s", argv[arg + 1], argv[arg]); goto cleanup; } runner.seed = (munit_uint32_t) ts; arg++; } else if (strcmp("iterations", argv[arg] + 2) == 0) { if (arg + 1 >= argc) { munit_logf_internal(MUNIT_LOG_ERROR, stderr, "%s requires an argument", argv[arg]); goto cleanup; } endptr = argv[arg + 1]; iterations = strtoul(argv[arg + 1], &endptr, 0); if (*endptr != '\0' || iterations > UINT_MAX) { munit_logf_internal(MUNIT_LOG_ERROR, stderr, "invalid value ('%s') passed to %s", argv[arg + 1], argv[arg]); goto cleanup; } runner.iterations = (unsigned int) iterations; arg++; } else if (strcmp("param", argv[arg] + 2) == 0) { if (arg + 2 >= argc) { munit_logf_internal(MUNIT_LOG_ERROR, stderr, "%s requires two arguments", argv[arg]); goto cleanup; } runner.parameters = realloc(runner.parameters, sizeof(MunitParameter) * (parameters_size + 2)); if (runner.parameters == NULL) { munit_log_internal(MUNIT_LOG_ERROR, stderr, "failed to allocate memory"); goto cleanup; } runner.parameters[parameters_size].name = (char*) argv[arg + 1]; runner.parameters[parameters_size].value = (char*) argv[arg + 2]; parameters_size++; runner.parameters[parameters_size].name = NULL; runner.parameters[parameters_size].value = NULL; arg += 2; } else if (strcmp("color", argv[arg] + 2) == 0) { if (arg + 1 >= argc) { munit_logf_internal(MUNIT_LOG_ERROR, stderr, "%s requires an argument", argv[arg]); goto cleanup; } if (strcmp(argv[arg + 1], "always") == 0) runner.colorize = true; else if (strcmp(argv[arg + 1], "never") == 0) runner.colorize = false; else if (strcmp(argv[arg + 1], "auto") == 0) runner.colorize = munit_stream_supports_ansi(MUNIT_OUTPUT_FILE); else { munit_logf_internal(MUNIT_LOG_ERROR, stderr, "invalid value ('%s') passed to %s", argv[arg + 1], argv[arg]); goto cleanup; } arg++; } else if (strcmp("help", argv[arg] + 2) == 0) { munit_print_help(argc, argv, user_data, arguments); result = EXIT_SUCCESS; goto cleanup; } else if (strcmp("single", argv[arg] + 2) == 0) { runner.single_parameter_mode = true; } else if (strcmp("show-stderr", argv[arg] + 2) == 0) { runner.show_stderr = true; #if !defined(_WIN32) } else if (strcmp("no-fork", argv[arg] + 2) == 0) { runner.fork = false; #endif } else if (strcmp("fatal-failures", argv[arg] + 2) == 0) { runner.fatal_failures = true; } else if (strcmp("log-visible", argv[arg] + 2) == 0 || strcmp("log-fatal", argv[arg] + 2) == 0) { if (arg + 1 >= argc) { munit_logf_internal(MUNIT_LOG_ERROR, stderr, "%s requires an argument", argv[arg]); goto cleanup; } if (strcmp(argv[arg + 1], "debug") == 0) level = MUNIT_LOG_DEBUG; else if (strcmp(argv[arg + 1], "info") == 0) level = MUNIT_LOG_INFO; else if (strcmp(argv[arg + 1], "warning") == 0) level = MUNIT_LOG_WARNING; else if (strcmp(argv[arg + 1], "error") == 0) level = MUNIT_LOG_ERROR; else { munit_logf_internal(MUNIT_LOG_ERROR, stderr, "invalid value ('%s') passed to %s", argv[arg + 1], argv[arg]); goto cleanup; } if (strcmp("log-visible", argv[arg] + 2) == 0) munit_log_level_visible = level; else munit_log_level_fatal = level; arg++; } else if (strcmp("list", argv[arg] + 2) == 0) { munit_suite_list_tests(suite, false, NULL); result = EXIT_SUCCESS; goto cleanup; } else if (strcmp("list-params", argv[arg] + 2) == 0) { munit_suite_list_tests(suite, true, NULL); result = EXIT_SUCCESS; goto cleanup; } else { argument = munit_arguments_find(arguments, argv[arg] + 2); if (argument == NULL) { munit_logf_internal(MUNIT_LOG_ERROR, stderr, "unknown argument ('%s')", 
argv[arg]); goto cleanup; } if (!argument->parse_argument(suite, user_data, &arg, argc, argv)) goto cleanup; } } else { runner_tests = realloc((void*) runner.tests, sizeof(char*) * (tests_size + 2)); if (runner_tests == NULL) { munit_log_internal(MUNIT_LOG_ERROR, stderr, "failed to allocate memory"); goto cleanup; } runner.tests = runner_tests; runner.tests[tests_size++] = argv[arg]; runner.tests[tests_size] = NULL; } } fflush(stderr); fprintf(MUNIT_OUTPUT_FILE, "Running test suite with seed 0x%08" PRIx32 "...\n", runner.seed); munit_test_runner_run(&runner); tests_run = runner.report.successful + runner.report.failed + runner.report.errored; tests_total = tests_run + runner.report.skipped; if (tests_run == 0) { fprintf(stderr, "No tests run, %d (100%%) skipped.\n", runner.report.skipped); } else { fprintf(MUNIT_OUTPUT_FILE, "%d of %d (%0.0f%%) tests successful, %d (%0.0f%%) tests skipped.\n", runner.report.successful, tests_run, (((double) runner.report.successful) / ((double) tests_run)) * 100.0, runner.report.skipped, (((double) runner.report.skipped) / ((double) tests_total)) * 100.0); } if (runner.report.failed == 0 && runner.report.errored == 0) { result = EXIT_SUCCESS; } cleanup: free(runner.parameters); free((void*) runner.tests); return result; } int munit_suite_main(const MunitSuite* suite, void* user_data, int argc, char* const argv[MUNIT_ARRAY_PARAM(argc)]) { return munit_suite_main_custom(suite, user_data, argc, argv, NULL); } dqlite-1.16.7/test/lib/munit.h000066400000000000000000000422131465252713400161350ustar00rootroot00000000000000/* µnit Testing Framework * Copyright (c) 2013-2017 Evan Nemerson <evan@nemerson.com> * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, copy, * modify, merge, publish, distribute, sublicense, and/or sell copies * of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
*/ #if !defined(MUNIT_H) #define MUNIT_H #include <stdarg.h> #include <stdlib.h> #define MUNIT_VERSION(major, minor, revision) \ (((major) << 16) | ((minor) << 8) | (revision)) #define MUNIT_CURRENT_VERSION MUNIT_VERSION(0, 4, 1) #if defined(_MSC_VER) && (_MSC_VER < 1600) # define munit_int8_t __int8 # define munit_uint8_t unsigned __int8 # define munit_int16_t __int16 # define munit_uint16_t unsigned __int16 # define munit_int32_t __int32 # define munit_uint32_t unsigned __int32 # define munit_int64_t __int64 # define munit_uint64_t unsigned __int64 #else # include <stdint.h> # define munit_int8_t int8_t # define munit_uint8_t uint8_t # define munit_int16_t int16_t # define munit_uint16_t uint16_t # define munit_int32_t int32_t # define munit_uint32_t uint32_t # define munit_int64_t int64_t # define munit_uint64_t uint64_t #endif #if defined(_MSC_VER) && (_MSC_VER < 1800) # if !defined(PRIi8) # define PRIi8 "i" # endif # if !defined(PRIi16) # define PRIi16 "i" # endif # if !defined(PRIi32) # define PRIi32 "i" # endif # if !defined(PRIi64) # define PRIi64 "I64i" # endif # if !defined(PRId8) # define PRId8 "d" # endif # if !defined(PRId16) # define PRId16 "d" # endif # if !defined(PRId32) # define PRId32 "d" # endif # if !defined(PRId64) # define PRId64 "I64d" # endif # if !defined(PRIx8) # define PRIx8 "x" # endif # if !defined(PRIx16) # define PRIx16 "x" # endif # if !defined(PRIx32) # define PRIx32 "x" # endif # if !defined(PRIx64) # define PRIx64 "I64x" # endif # if !defined(PRIu8) # define PRIu8 "u" # endif # if !defined(PRIu16) # define PRIu16 "u" # endif # if !defined(PRIu32) # define PRIu32 "u" # endif # if !defined(PRIu64) # define PRIu64 "I64u" # endif # if !defined(bool) # define bool int # endif # if !defined(true) # define true (!0) # endif # if !defined(false) # define false (!!0) # endif #else # include <stdbool.h> # include <inttypes.h> #endif #if defined(__cplusplus) extern "C" { #endif #if defined(__GNUC__) # define MUNIT_LIKELY(expr) (__builtin_expect ((expr), 1)) # define MUNIT_UNLIKELY(expr) (__builtin_expect ((expr), 0)) # define MUNIT_UNUSED __attribute__((__unused__)) #else # define MUNIT_LIKELY(expr) (expr) # define MUNIT_UNLIKELY(expr) (expr) # define MUNIT_UNUSED #endif #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && !defined(__PGI) # define MUNIT_ARRAY_PARAM(name) name #else # define MUNIT_ARRAY_PARAM(name) #endif #if !defined(_WIN32) # define MUNIT_SIZE_MODIFIER "z" # define MUNIT_CHAR_MODIFIER "hh" # define MUNIT_SHORT_MODIFIER "h" #else # if defined(_M_X64) || defined(__amd64__) # define MUNIT_SIZE_MODIFIER "I64" # else # define MUNIT_SIZE_MODIFIER "" # endif # define MUNIT_CHAR_MODIFIER "" # define MUNIT_SHORT_MODIFIER "" #endif #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L # define MUNIT_NO_RETURN _Noreturn #elif defined(__GNUC__) # define MUNIT_NO_RETURN __attribute__((__noreturn__)) #elif defined(_MSC_VER) # define MUNIT_NO_RETURN __declspec(noreturn) #else # define MUNIT_NO_RETURN #endif #if defined(_MSC_VER) && (_MSC_VER >= 1500) # define MUNIT__PUSH_DISABLE_MSVC_C4127 __pragma(warning(push)) __pragma(warning(disable:4127)) # define MUNIT__POP_DISABLE_MSVC_C4127 __pragma(warning(pop)) #else # define MUNIT__PUSH_DISABLE_MSVC_C4127 # define MUNIT__POP_DISABLE_MSVC_C4127 #endif typedef enum { MUNIT_LOG_DEBUG, MUNIT_LOG_INFO, MUNIT_LOG_WARNING, MUNIT_LOG_ERROR } MunitLogLevel; #if defined(__GNUC__) && !defined(__MINGW32__) # define MUNIT_PRINTF(string_index, first_to_check) __attribute__((format (printf, string_index, first_to_check))) #else # define MUNIT_PRINTF(string_index,
first_to_check) #endif MUNIT_PRINTF(4, 5) void munit_logf_ex(MunitLogLevel level, const char* filename, int line, const char* format, ...); #define munit_logf(level, format, ...) \ munit_logf_ex(level, __FILE__, __LINE__, format, __VA_ARGS__) #define munit_log(level, msg) \ munit_logf(level, "%s", msg) MUNIT_NO_RETURN MUNIT_PRINTF(3, 4) void munit_errorf_ex(const char* filename, int line, const char* format, ...); #define munit_errorf(format, ...) \ munit_errorf_ex(__FILE__, __LINE__, format, __VA_ARGS__) #define munit_error(msg) \ munit_errorf("%s", msg) #define munit_assert(expr) \ do { \ if (!MUNIT_LIKELY(expr)) { \ munit_error("assertion failed: " #expr); \ } \ MUNIT__PUSH_DISABLE_MSVC_C4127 \ } while (0) \ MUNIT__POP_DISABLE_MSVC_C4127 #define munit_assert_true(expr) \ do { \ if (!MUNIT_LIKELY(expr)) { \ munit_error("assertion failed: " #expr " is not true"); \ } \ MUNIT__PUSH_DISABLE_MSVC_C4127 \ } while (0) \ MUNIT__POP_DISABLE_MSVC_C4127 #define munit_assert_false(expr) \ do { \ if (!MUNIT_LIKELY(!(expr))) { \ munit_error("assertion failed: " #expr " is not false"); \ } \ MUNIT__PUSH_DISABLE_MSVC_C4127 \ } while (0) \ MUNIT__POP_DISABLE_MSVC_C4127 #define munit_assert_type_full(prefix, suffix, T, fmt, a, op, b) \ do { \ T munit_tmp_a_ = (a); \ T munit_tmp_b_ = (b); \ if (!(munit_tmp_a_ op munit_tmp_b_)) { \ munit_errorf("assertion failed: %s %s %s (" prefix "%" fmt suffix " %s " prefix "%" fmt suffix ")", \ #a, #op, #b, munit_tmp_a_, #op, munit_tmp_b_); \ } \ MUNIT__PUSH_DISABLE_MSVC_C4127 \ } while (0) \ MUNIT__POP_DISABLE_MSVC_C4127 #define munit_assert_type(T, fmt, a, op, b) \ munit_assert_type_full("", "", T, fmt, a, op, b) #define munit_assert_char(a, op, b) \ munit_assert_type_full("'\\x", "'", char, "02" MUNIT_CHAR_MODIFIER "x", a, op, b) #define munit_assert_uchar(a, op, b) \ munit_assert_type_full("'\\x", "'", unsigned char, "02" MUNIT_CHAR_MODIFIER "x", a, op, b) #define munit_assert_short(a, op, b) \ munit_assert_type(short, MUNIT_SHORT_MODIFIER "d", a, op, b) #define munit_assert_ushort(a, op, b) \ munit_assert_type(unsigned short, MUNIT_SHORT_MODIFIER "u", a, op, b) #define munit_assert_int(a, op, b) \ munit_assert_type(int, "d", a, op, b) #define munit_assert_uint(a, op, b) \ munit_assert_type(unsigned int, "u", a, op, b) #define munit_assert_long(a, op, b) \ munit_assert_type(long int, "ld", a, op, b) #define munit_assert_ulong(a, op, b) \ munit_assert_type(unsigned long int, "lu", a, op, b) #define munit_assert_llong(a, op, b) \ munit_assert_type(long long int, "lld", a, op, b) #define munit_assert_ullong(a, op, b) \ munit_assert_type(unsigned long long int, "llu", a, op, b) #define munit_assert_size(a, op, b) \ munit_assert_type(size_t, MUNIT_SIZE_MODIFIER "u", a, op, b) #define munit_assert_float(a, op, b) \ munit_assert_type(float, "f", a, op, b) #define munit_assert_double(a, op, b) \ munit_assert_type(double, "g", a, op, b) #define munit_assert_ptr(a, op, b) \ munit_assert_type(const void*, "p", a, op, b) #define munit_assert_int8(a, op, b) \ munit_assert_type(munit_int8_t, PRIi8, a, op, b) #define munit_assert_uint8(a, op, b) \ munit_assert_type(munit_uint8_t, PRIu8, a, op, b) #define munit_assert_int16(a, op, b) \ munit_assert_type(munit_int16_t, PRIi16, a, op, b) #define munit_assert_uint16(a, op, b) \ munit_assert_type(munit_uint16_t, PRIu16, a, op, b) #define munit_assert_int32(a, op, b) \ munit_assert_type(munit_int32_t, PRIi32, a, op, b) #define munit_assert_uint32(a, op, b) \ munit_assert_type(munit_uint32_t, PRIu32, a, op, b) #define 
munit_assert_int64(a, op, b) \ munit_assert_type(munit_int64_t, PRIi64, a, op, b) #define munit_assert_uint64(a, op, b) \ munit_assert_type(munit_uint64_t, PRIu64, a, op, b) #define munit_assert_double_equal(a, b, precision) \ do { \ const double munit_tmp_a_ = (a); \ const double munit_tmp_b_ = (b); \ const double munit_tmp_diff_ = ((munit_tmp_a_ - munit_tmp_b_) < 0) ? \ -(munit_tmp_a_ - munit_tmp_b_) : \ (munit_tmp_a_ - munit_tmp_b_); \ if (MUNIT_UNLIKELY(munit_tmp_diff_ > 1e-##precision)) { \ munit_errorf("assertion failed: %s == %s (%0." #precision "g == %0." #precision "g)", \ #a, #b, munit_tmp_a_, munit_tmp_b_); \ } \ MUNIT__PUSH_DISABLE_MSVC_C4127 \ } while (0) \ MUNIT__POP_DISABLE_MSVC_C4127 #include <string.h> #define munit_assert_string_equal(a, b) \ do { \ const char* munit_tmp_a_ = a; \ const char* munit_tmp_b_ = b; \ if (MUNIT_UNLIKELY(strcmp(munit_tmp_a_, munit_tmp_b_) != 0)) { \ munit_errorf("assertion failed: string %s == %s (\"%s\" == \"%s\")", \ #a, #b, munit_tmp_a_, munit_tmp_b_); \ } \ MUNIT__PUSH_DISABLE_MSVC_C4127 \ } while (0) \ MUNIT__POP_DISABLE_MSVC_C4127 #define munit_assert_string_not_equal(a, b) \ do { \ const char* munit_tmp_a_ = a; \ const char* munit_tmp_b_ = b; \ if (MUNIT_UNLIKELY(strcmp(munit_tmp_a_, munit_tmp_b_) == 0)) { \ munit_errorf("assertion failed: string %s != %s (\"%s\" == \"%s\")", \ #a, #b, munit_tmp_a_, munit_tmp_b_); \ } \ MUNIT__PUSH_DISABLE_MSVC_C4127 \ } while (0) \ MUNIT__POP_DISABLE_MSVC_C4127 #define munit_assert_memory_equal(size, a, b) \ do { \ const unsigned char* munit_tmp_a_ = (const unsigned char*) (a); \ const unsigned char* munit_tmp_b_ = (const unsigned char*) (b); \ const size_t munit_tmp_size_ = (size); \ if (MUNIT_UNLIKELY(memcmp(munit_tmp_a_, munit_tmp_b_, munit_tmp_size_)) != 0) { \ size_t munit_tmp_pos_; \ for (munit_tmp_pos_ = 0 ; munit_tmp_pos_ < munit_tmp_size_ ; munit_tmp_pos_++) { \ if (munit_tmp_a_[munit_tmp_pos_] != munit_tmp_b_[munit_tmp_pos_]) { \ munit_errorf("assertion failed: memory %s == %s, at offset %" MUNIT_SIZE_MODIFIER "u", \ #a, #b, munit_tmp_pos_); \ break; \ } \ } \ } \ MUNIT__PUSH_DISABLE_MSVC_C4127 \ } while (0) \ MUNIT__POP_DISABLE_MSVC_C4127 #define munit_assert_memory_not_equal(size, a, b) \ do { \ const unsigned char* munit_tmp_a_ = (const unsigned char*) (a); \ const unsigned char* munit_tmp_b_ = (const unsigned char*) (b); \ const size_t munit_tmp_size_ = (size); \ if (MUNIT_UNLIKELY(memcmp(munit_tmp_a_, munit_tmp_b_, munit_tmp_size_)) == 0) { \ munit_errorf("assertion failed: memory %s != %s (%zu bytes)", \ #a, #b, munit_tmp_size_); \ } \ MUNIT__PUSH_DISABLE_MSVC_C4127 \ } while (0) \ MUNIT__POP_DISABLE_MSVC_C4127 #define munit_assert_ptr_equal(a, b) \ munit_assert_ptr(a, ==, b) #define munit_assert_ptr_not_equal(a, b) \ munit_assert_ptr(a, !=, b) #define munit_assert_null(ptr) \ munit_assert_ptr(ptr, ==, NULL) #define munit_assert_not_null(ptr) \ munit_assert_ptr(ptr, !=, NULL) #define munit_assert_ptr_null(ptr) \ munit_assert_ptr(ptr, ==, NULL) #define munit_assert_ptr_not_null(ptr) \ munit_assert_ptr(ptr, !=, NULL) /*** Memory allocation ***/ void* munit_malloc_ex(const char* filename, int line, size_t size); #define munit_malloc(size) \ munit_malloc_ex(__FILE__, __LINE__, (size)) #define munit_new(type) \ ((type*) munit_malloc(sizeof(type))) #define munit_calloc(nmemb, size) \ munit_malloc((nmemb) * (size)) #define munit_newa(type, nmemb) \ ((type*) munit_calloc((nmemb), sizeof(type))) /*** Random number generation ***/ void munit_rand_seed(munit_uint32_t seed); munit_uint32_t 
munit_rand_uint32(void); int munit_rand_int_range(int min, int max); double munit_rand_double(void); void munit_rand_memory(size_t size, munit_uint8_t buffer[MUNIT_ARRAY_PARAM(size)]); /*** Tests and Suites ***/ typedef enum { /* Test successful */ MUNIT_OK, /* Test failed */ MUNIT_FAIL, /* Test was skipped */ MUNIT_SKIP, /* Test failed due to circumstances not intended to be tested * (things like network errors, invalid parameter value, failure to * allocate memory in the test harness, etc.). */ MUNIT_ERROR } MunitResult; typedef struct { char* name; char** values; } MunitParameterEnum; typedef struct { char* name; char* value; } MunitParameter; const char* munit_parameters_get(const MunitParameter params[], const char* key); typedef enum { MUNIT_TEST_OPTION_NONE = 0, MUNIT_TEST_OPTION_SINGLE_ITERATION = 1 << 0, MUNIT_TEST_OPTION_TODO = 1 << 1 } MunitTestOptions; typedef MunitResult (* MunitTestFunc)(const MunitParameter params[], void* user_data_or_fixture); typedef void* (* MunitTestSetup)(const MunitParameter params[], void* user_data); typedef void (* MunitTestTearDown)(void* fixture); typedef struct { char* name; MunitTestFunc test; MunitTestSetup setup; MunitTestTearDown tear_down; MunitTestOptions options; MunitParameterEnum* parameters; } MunitTest; typedef enum { MUNIT_SUITE_OPTION_NONE = 0 } MunitSuiteOptions; typedef struct MunitSuite_ MunitSuite; struct MunitSuite_ { char* prefix; MunitTest* tests; MunitSuite* suites; unsigned int iterations; MunitSuiteOptions options; }; int munit_suite_main(const MunitSuite* suite, void* user_data, int argc, char* const argv[MUNIT_ARRAY_PARAM(argc)]); /* Note: I'm not very happy with this API; it's likely to change if I * figure out something better. Suggestions welcome. */ typedef struct MunitArgument_ MunitArgument; struct MunitArgument_ { char* name; bool (* parse_argument)(const MunitSuite* suite, void* user_data, int* arg, int argc, char* const argv[MUNIT_ARRAY_PARAM(argc)]); void (* write_help)(const MunitArgument* argument, void* user_data); }; int munit_suite_main_custom(const MunitSuite* suite, void* user_data, int argc, char* const argv[MUNIT_ARRAY_PARAM(argc)], const MunitArgument arguments[]); #if defined(MUNIT_ENABLE_ASSERT_ALIASES) #define assert_true(expr) munit_assert_true(expr) #define assert_false(expr) munit_assert_false(expr) #define assert_char(a, op, b) munit_assert_char(a, op, b) #define assert_uchar(a, op, b) munit_assert_uchar(a, op, b) #define assert_short(a, op, b) munit_assert_short(a, op, b) #define assert_ushort(a, op, b) munit_assert_ushort(a, op, b) #define assert_int(a, op, b) munit_assert_int(a, op, b) #define assert_uint(a, op, b) munit_assert_uint(a, op, b) #define assert_long(a, op, b) munit_assert_long(a, op, b) #define assert_ulong(a, op, b) munit_assert_ulong(a, op, b) #define assert_llong(a, op, b) munit_assert_llong(a, op, b) #define assert_ullong(a, op, b) munit_assert_ullong(a, op, b) #define assert_size(a, op, b) munit_assert_size(a, op, b) #define assert_float(a, op, b) munit_assert_float(a, op, b) #define assert_double(a, op, b) munit_assert_double(a, op, b) #define assert_ptr(a, op, b) munit_assert_ptr(a, op, b) #define assert_int8(a, op, b) munit_assert_int8(a, op, b) #define assert_uint8(a, op, b) munit_assert_uint8(a, op, b) #define assert_int16(a, op, b) munit_assert_int16(a, op, b) #define assert_uint16(a, op, b) munit_assert_uint16(a, op, b) #define assert_int32(a, op, b) munit_assert_int32(a, op, b) #define assert_uint32(a, op, b) munit_assert_uint32(a, op, b) #define assert_int64(a, op, 
b) munit_assert_int64(a, op, b) #define assert_uint64(a, op, b) munit_assert_uint64(a, op, b) #define assert_double_equal(a, b, precision) munit_assert_double_equal(a, b, precision) #define assert_string_equal(a, b) munit_assert_string_equal(a, b) #define assert_string_not_equal(a, b) munit_assert_string_not_equal(a, b) #define assert_memory_equal(size, a, b) munit_assert_memory_equal(size, a, b) #define assert_memory_not_equal(size, a, b) munit_assert_memory_not_equal(size, a, b) #define assert_ptr_equal(a, b) munit_assert_ptr_equal(a, b) #define assert_ptr_not_equal(a, b) munit_assert_ptr_not_equal(a, b) #define assert_ptr_null(ptr) munit_assert_ptr_null(ptr) #define assert_ptr_not_null(ptr) munit_assert_ptr_not_null(ptr) #define assert_null(ptr) munit_assert_null(ptr) #define assert_not_null(ptr) munit_assert_not_null(ptr) #endif /* defined(MUNIT_ENABLE_ASSERT_ALIASES) */ #if defined(__cplusplus) } #endif #endif /* !defined(MUNIT_H) */ #if defined(MUNIT_ENABLE_ASSERT_ALIASES) # if defined(assert) # undef assert # endif # define assert(expr) munit_assert(expr) #endif dqlite-1.16.7/test/lib/raft.h000066400000000000000000000054151465252713400157400ustar00rootroot00000000000000/** * Helpers for setting up a standalone raft instance with a libuv transport. */ #ifndef TEST_RAFT_H #define TEST_RAFT_H #include <uv.h> #include "../../src/fsm.h" #include "../../src/raft.h" #include "../../src/transport.h" #include "fs.h" #include "logger.h" #include "munit.h" #include "uv.h" #include "../../src/lib/threadpool.h" #define FIXTURE_RAFT \ char *dir; \ struct uv_loop_s loop; \ struct raft_uv_transport raft_transport; \ struct raft_io raft_io; \ struct raft_fsm fsm; \ struct raft raft #define SETUP_RAFT \ { \ int rv2; \ f->dir = test_dir_setup(); \ test_uv_setup(params, &f->loop); \ rv2 = raftProxyInit(&f->raft_transport, &f->loop); \ munit_assert_int(rv2, ==, 0); \ rv2 = raft_uv_init(&f->raft_io, &f->loop, f->dir, \ &f->raft_transport); \ munit_assert_int(rv2, ==, 0); \ rv2 = fsm__init(&f->fsm, &f->config, &f->registry); \ munit_assert_int(rv2, ==, 0); \ rv2 = raft_init(&f->raft, &f->raft_io, &f->fsm, 1, "1"); \ munit_assert_int(rv2, ==, 0); \ } #define TEAR_DOWN_RAFT \ { \ raft_close(&f->raft, NULL); \ test_uv_stop(&f->loop); \ raft_uv_close(&f->raft_io); \ fsm__close(&f->fsm); \ test_uv_tear_down(&f->loop); \ raftProxyClose(&f->raft_transport); \ test_dir_tear_down(f->dir); \ } /** * Bootstrap the fixture raft instance with a configuration containing only * itself. 
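 * 
 * A minimal sketch of how these helpers are meant to compose in a test
 * fixture (hypothetical test; SETUP_RAFT assumes the enclosing fixture also
 * provides the f->config and f->registry fields it references):
 * 
 *   struct fixture { struct config config; struct registry registry; FIXTURE_RAFT; };
 * 
 *   in the setup function:     SETUP_RAFT; RAFT_BOOTSTRAP; RAFT_START;
 *   in the tear down function: TEAR_DOWN_RAFT;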
*/ #define RAFT_BOOTSTRAP \ { \ struct raft_configuration configuration; \ int rv2; \ raft_configuration_init(&configuration); \ rv2 = raft_configuration_add(&configuration, 1, "1", \ RAFT_VOTER); \ munit_assert_int(rv2, ==, 0); \ rv2 = raft_bootstrap(&f->raft, &configuration); \ munit_assert_int(rv2, ==, 0); \ raft_configuration_close(&configuration); \ } #define RAFT_START \ { \ int rv2; \ rv2 = raft_start(&f->raft); \ munit_assert_int(rv2, ==, 0); \ } #endif /* TEST_RAFT_H */ dqlite-1.16.7/test/lib/raft_heap.c000066400000000000000000000043551465252713400167320ustar00rootroot00000000000000#include "../../src/raft.h" #include "fault.h" #include "raft_heap.h" struct heapFault { struct test_fault fault; const struct raft_heap *orig_heap; }; static struct heapFault faulty; static void *faultyMalloc(void *data, size_t size) { (void)data; if (test_fault_tick(&faulty.fault)) { return NULL; } else { return faulty.orig_heap->malloc(faulty.orig_heap->data, size); } } static void faultyFree(void *data, void *ptr) { (void)data; faulty.orig_heap->free(faulty.orig_heap->data, ptr); } static void *faultyCalloc(void *data, size_t nmemb, size_t size) { (void)data; if (test_fault_tick(&faulty.fault)) { return NULL; } else { return faulty.orig_heap->calloc(faulty.orig_heap->data, nmemb, size); } } static void *faultyRealloc(void *data, void *ptr, size_t size) { (void)data; if (test_fault_tick(&faulty.fault)) { return NULL; } else { return faulty.orig_heap->realloc(faulty.orig_heap->data, ptr, size); } } static void *faultyAlignedAlloc(void *data, size_t alignment, size_t size) { (void)data; if (test_fault_tick(&faulty.fault)) { return NULL; } else { return faulty.orig_heap->aligned_alloc(faulty.orig_heap->data, alignment, size); } } static void faultyAlignedFree(void *data, size_t alignment, void *ptr) { (void)data; (void)alignment; faulty.orig_heap->aligned_free(faulty.orig_heap->data, alignment, ptr); } void test_raft_heap_setup(const MunitParameter params[], void *user_data) { (void)params; (void)user_data; struct raft_heap *heap = munit_malloc(sizeof(*heap)); test_fault_init(&faulty.fault); faulty.orig_heap = raft_heap_get(); heap->data = NULL; heap->malloc = faultyMalloc; heap->free = faultyFree; heap->calloc = faultyCalloc; heap->realloc = faultyRealloc; heap->aligned_alloc = faultyAlignedAlloc; heap->aligned_free = faultyAlignedFree; raft_heap_set(heap); } void test_raft_heap_tear_down(void *data) { struct raft_heap *heap = (struct raft_heap *)raft_heap_get(); (void)data; raft_heap_set((struct raft_heap *)faulty.orig_heap); faulty.orig_heap = NULL; free(heap); } void test_raft_heap_fault_config(int delay, int repeat) { test_fault_config(&faulty.fault, delay, repeat); } void test_raft_heap_fault_enable(void) { test_fault_enable(&faulty.fault); } dqlite-1.16.7/test/lib/raft_heap.h000066400000000000000000000006241465252713400167320ustar00rootroot00000000000000/** * Helpers for injecting failures into raft's allocator. 
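 * 
 * Usage sketch (hypothetical test body; the injector is a global singleton,
 * so only one faulty heap is active at a time, and it is assumed that
 * test_fault_config(delay, repeat) lets `delay` allocations succeed before
 * failing the next `repeat` of them):
 * 
 *   test_raft_heap_setup(params, user_data);   swap in the faulty heap
 *   test_raft_heap_fault_config(2, 1);         fail the third allocation
 *   test_raft_heap_fault_enable();
 *   ... exercise code that allocates through the raft heap ...
 *   test_raft_heap_tear_down(data);            restore the original heap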
*/ #ifndef DQLITE_TEST_RAFT_HEAP_H #define DQLITE_TEST_RAFT_HEAP_H #include "munit.h" void test_raft_heap_setup(const MunitParameter params[], void *user_data); void test_raft_heap_tear_down(void *data); void test_raft_heap_fault_config(int delay, int repeat); void test_raft_heap_fault_enable(void); #endif /* DQLITE_TEST_RAFT_HEAP_H */ dqlite-1.16.7/test/lib/registry.h000066400000000000000000000004341465252713400166500ustar00rootroot00000000000000#ifndef TEST_REGISTRY_H #define TEST_REGISTRY_H #include "../../src/registry.h" #define FIXTURE_REGISTRY struct registry registry #define SETUP_REGISTRY registry__init(&f->registry, &f->config) #define TEAR_DOWN_REGISTRY registry__close(&f->registry); #endif /* TEST_REGISTRY_H */ dqlite-1.16.7/test/lib/runner.h000066400000000000000000000427451465252713400163220ustar00rootroot00000000000000/* Convenience macros to reduce munit boilerplate. */ #ifndef TEST_RUNNER_H #define TEST_RUNNER_H #include <signal.h> #include <sqlite3.h> #include <stdio.h> #include <stdlib.h> #include "munit.h" #include "../../src/tracing.h" /* Top-level suites array declaration. * * These top-level suites hold all module-level child suites and must be defined * and then set as child suites of a root suite created at runtime by the test * runner's main(). This can be done using the RUNNER macro. */ extern MunitSuite _main_suites[]; extern int _main_suites_n; /* Maximum number of test cases for each suite */ #define SUITE__CAP 128 #define TEST__CAP SUITE__CAP static inline void log_sqlite_error(void *arg, int e, const char *msg) { (void)arg; fprintf(stderr, "SQLITE %d %s\n", e, msg); } /* Define the top-level suites array and the main() function of the test. */ #define RUNNER(NAME) \ MunitSuite _main_suites[SUITE__CAP]; \ int _main_suites_n = 0; \ \ int main(int argc, char *argv[MUNIT_ARRAY_PARAM(argc)]) \ { \ signal(SIGPIPE, SIG_IGN); \ dqliteTracingMaybeEnable(true); \ sqlite3_config(SQLITE_CONFIG_LOG, log_sqlite_error, NULL); \ MunitSuite suite = {(char *)"", NULL, _main_suites, 1, 0}; \ return munit_suite_main(&suite, (void *)NAME, argc, argv); \ } /* Declare and register a new test suite #S belonging to the file's test module. * * A test suite is a pair of static variables: * * static MunitSuite _##S##_suites[SUITE__CAP] * static MunitTest _##S##_tests[SUITE__CAP] * * The tests and suites attributes of the next available MunitSuite slot in the * _module_suites array will be set to the suite's tests and suites arrays, and * the prefix attribute of the slot will be set to /S. */ #define SUITE(S) \ SUITE__DECLARE(S) \ SUITE__ADD_CHILD(main, #S, S) /* Declare and register a new test. */ #define TEST(S, C, SETUP, TEAR_DOWN, OPTIONS, PARAMS) \ static MunitResult test_##S##_##C(const MunitParameter params[], \ void *data); \ TEST__ADD_TO_SUITE(S, C, SETUP, TEAR_DOWN, OPTIONS, PARAMS) \ static MunitResult test_##S##_##C( \ MUNIT_UNUSED const MunitParameter params[], \ MUNIT_UNUSED void *data) #define SKIP_IF_NO_FIXTURE \ if (f == NULL) { \ return MUNIT_SKIP; \ } /* Declare the MunitSuite[] and the MunitTest[] arrays that compose the test * suite identified by S. 
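 * 
 * For reference, a test file typically only needs the public macros above,
 * roughly like this (sketch mirroring how the suites in this tree are
 * written; names are illustrative):
 * 
 *   SUITE(example)
 *   TEST(example, ok, NULL, NULL, 0, NULL) { return MUNIT_OK; }
 * 
 * together with a single RUNNER("example") invocation in the test binary's
 * main translation unit.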
*/ #define SUITE__DECLARE(S) \ static MunitSuite _##S##_suites[SUITE__CAP]; \ static MunitTest _##S##_tests[SUITE__CAP]; \ static MunitTestSetup _##S##_setup = NULL; \ static MunitTestTearDown _##S##_tear_down = NULL; \ static int _##S##_suites_n = 0; \ static int _##S##_tests_n = 0; \ __attribute__((constructor(101))) static void _##S##_init(void) \ { \ memset(_##S##_suites, 0, sizeof(_##S##_suites)); \ memset(_##S##_tests, 0, sizeof(_##S##_tests)); \ (void)_##S##_suites_n; \ (void)_##S##_tests_n; \ (void)_##S##_setup; \ (void)_##S##_tear_down; \ } /* Set the tests and suites attributes of the next available slot of the * MunitSuite[] array of S1 to the MunitTest[] and MunitSuite[] arrays of S2, * using the given PREFIX. */ #define SUITE__ADD_CHILD(S1, PREFIX, S2) \ __attribute__((constructor(102))) static void _##S1##_##S2##_init( \ void) \ { \ int n = _##S1##_suites_n; \ _##S1##_suites[n].prefix = PREFIX; \ _##S1##_suites[n].tests = _##S2##_tests; \ _##S1##_suites[n].suites = _##S2##_suites; \ _##S1##_suites[n].iterations = 0; \ _##S1##_suites[n].options = 0; \ _##S1##_suites_n = n + 1; \ } /* Add a test case to the MunitTest[] array of suite S. */ #define TEST__ADD_TO_SUITE(S, C, SETUP, TEAR_DOWN, OPTIONS, PARAMS) \ __attribute__((constructor(103))) static void _##S##_tests_##C##_init( \ void) \ { \ MunitTest *tests = _##S##_tests; \ int n = _##S##_tests_n; \ TEST__SET_IN_ARRAY(tests, n, "/" #C, test_##S##_##C, SETUP, \ TEAR_DOWN, OPTIONS, PARAMS); \ _##S##_tests_n = n + 1; \ } /* Set the values of the I'th test case slot in the given test array */ #define TEST__SET_IN_ARRAY(TESTS, I, NAME, FUNC, SETUP, TEAR_DOWN, OPTIONS, \ PARAMS) \ TESTS[I].name = NAME; \ TESTS[I].test = FUNC; \ TESTS[I].setup = SETUP; \ TESTS[I].tear_down = TEAR_DOWN; \ TESTS[I].options = OPTIONS; \ TESTS[I].parameters = PARAMS /** * Declare and register a new test module #M. * * A test module is a test suite (i.e. a pair of MunitTest[] and MunitSuite[] * arrays), directly or indirectly containing all test cases in a test file. * * This macro uses hard-coded names to declare the module's tests and suites * arrays static, so they can be easily referenced in other static declarations * generated by the macros below: * * static MunitTest _module_tests[TEST__CAP]; * static MunitSuite _module_suites[TEST__CAP]; * * The tests and suites attributes of the next available MunitSuite slot in the * top-level suites array will be set to the module's tests and suites arrays, * and the prefix attribute of the slot will be set to #M. * * Each test file should declare one and only one test module. */ #define TEST_MODULE(M) \ TEST_SUITE__DECLARE(module); \ TEST_SUITE__ADD_CHILD(main, #M, module); /** * Declare and register a new test suite #S belonging to the file's test module. * * A test suite is a pair of static variables: * * static MunitSuite _##S##_suites[TEST__CAP] * static MunitTest _##S##_tests[TEST__CAP] * * The tests and suites attributes of the next available MunitSuite slot in the * #_module_suites array will be set to the suite's tests and suites arrays, and * the prefix attribute of the slot will be set to /S. * * All tests in the suite will use the same setup and tear down functions. */ #define TEST_SUITE(S) \ TEST_SUITE__DECLARE(S); \ TEST_SUITE__ADD_CHILD(module, "/" #S, S); /** * Declare a setup function. * * Possible signatures are: * * - TEST_SETUP(S): Declare the setup function for suite S inline. * - TEST_SETUP(S, F): Set the setup function for suite S to F. */ #define TEST_SETUP(...) 
TEST_SETUP__MACRO_CHOOSER(__VA_ARGS__)(__VA_ARGS__) #define TEST_SETUP_(S) \ static void *S##_setup(const MunitParameter[], void *); \ _##S##_setup = S##_setup; \ static void *S##_setup(const MunitParameter params[], void *user_data) /** * Declare a tear down function. * * Possible signatures are: * * - TEST_TEAR_DOWN(S): Declare the tear down function for suite S inline. * - TEST_TEAR_DOWN(S, F): Set the tear down function for suite S to F. */ #define TEST_TEAR_DOWN(...) \ TEST_TEAR_DOWN__MACRO_CHOOSER(__VA_ARGS__)(__VA_ARGS__) /** * Declare and register a new group of tests #G, belonging to suite #S in the * file's test module. */ #define TEST_GROUP(C, T) \ static MunitTest _##C##_##T##_tests[TEST__CAP]; \ static int _##C##_##T##_tests_n = 0; \ TEST_SUITE__ADD_GROUP(C, T); /** * Declare and register a new test case. * * Possible signatures are: * * - TEST_CASE(C): C gets added to the tests array of the file module. * - TEST_CASE(S, C): C gets added to the tests array of suite S. * - TEST_CASE(S, G, C): C gets added to the tests array of group G in suite S. * * The test body declaration must follow the macro. */ #define TEST_CASE(...) TEST_CASE__MACRO_CHOOSER(__VA_ARGS__)(__VA_ARGS__) /* Declare the MunitSuite[] and the MunitTest[] arrays that compose the test * suite identified by S. */ #define TEST_SUITE__DECLARE(S) \ static MunitSuite _##S##_suites[TEST__CAP]; \ static MunitTest _##S##_tests[TEST__CAP]; \ static MunitTestSetup _##S##_setup = NULL; \ static MunitTestTearDown _##S##_tear_down = NULL; \ static int _##S##_suites_n = 0; \ static int _##S##_tests_n = 0; \ __attribute__((constructor)) static void _##S##_init(void) \ { \ memset(_##S##_suites, 0, sizeof(_##S##_suites)); \ memset(_##S##_tests, 0, sizeof(_##S##_tests)); \ (void)_##S##_suites_n; \ (void)_##S##_tests_n; \ (void)_##S##_setup; \ (void)_##S##_tear_down; \ } /* Set the tests and suites attributes of the next available slot of the * MunitSuite[] array of S1 to the MunitTest[] and MunitSuite[] arrays of S2, * using the given PREFIX. */ #define TEST_SUITE__ADD_CHILD(S1, PREFIX, S2) \ __attribute__((constructor)) static void _##S1##_##S2##_init(void) \ { \ int n = _##S1##_suites_n; \ _##S1##_suites[n].prefix = PREFIX; \ _##S1##_suites[n].tests = _##S2##_tests; \ _##S1##_suites[n].suites = _##S2##_suites; \ _##S1##_suites[n].iterations = 0; \ _##S1##_suites[n].options = 0; \ _##S1##_suites_n = n + 1; \ } /* Set the tests attribute of the next available slot of the MunitSuite[] array * of S to the MunitTest[] array of G, using /G as prefix. */ #define TEST_SUITE__ADD_GROUP(S, G) \ __attribute__((constructor)) static void _##S##_##G##_init(void) \ { \ int n = _##S##_suites_n; \ _##S##_suites[n].prefix = "/" #G; \ _##S##_suites[n].tests = _##S##_##G##_tests; \ _##S##_suites[n].suites = NULL; \ _##S##_suites[n].iterations = 0; \ _##S##_suites[n].options = 0; \ _##S##_suites_n = n + 1; \ } /* Choose the appropriate TEST_SETUP__N_ARGS() macro depending on the number of * arguments passed to TEST_SETUP(). */ #define TEST_SETUP__MACRO_CHOOSER(...) 
\ TEST__GET_3RD_ARG(__VA_ARGS__, TEST_SETUP__2_ARGS, TEST_SETUP__1_ARGS) #define TEST_SETUP__1_ARGS(S) \ static void *S##__setup(const MunitParameter[], void *); \ __attribute__((constructor)) static void _##S##_setup_init(void) \ { \ _##S##_setup = S##__setup; \ } \ static void *S##__setup(const MunitParameter params[], void *user_data) #define TEST_SETUP__2_ARGS(S, F) \ __attribute__((constructor)) static void _##S##_setup_init(void) \ { \ _##S##_setup = F; \ } /* Choose the appropriate TEST_TEAR_DOWN__N_ARGS() macro depending on the number * of arguments passed to TEST_TEAR_DOWN(). */ #define TEST_TEAR_DOWN__MACRO_CHOOSER(...) \ TEST__GET_3RD_ARG(__VA_ARGS__, TEST_TEAR_DOWN__2_ARGS, \ TEST_TEAR_DOWN__1_ARGS) #define TEST_TEAR_DOWN__1_ARGS(S) \ static void S##__tear_down(void *data); \ __attribute__((constructor)) static void _##S##__tear_down_init(void) \ { \ _##S##_tear_down = S##__tear_down; \ } \ static void S##__tear_down(void *data) #define TEST_TEAR_DOWN__2_ARGS(S, F) \ __attribute__((constructor)) static void _##S##_tear_down_init(void) \ { \ _##S##_tear_down = F; \ } /* Choose the appropriate TEST_CASE__N_ARGS() macro depending on the number of * arguments passed to TEST_CASE(). */ #define TEST_CASE__MACRO_CHOOSER(...) \ TEST__GET_5TH_ARG(__VA_ARGS__, TEST_CASE__4_ARGS, TEST_CASE__3_ARGS, \ TEST_CASE__2_ARGS) /* Add the test case to the module's MunitTest[] array. */ #define TEST_CASE__2_ARGS(C, PARAMS) \ static MunitResult test_##C(const MunitParameter[], void *); \ TEST_CASE__ADD_TO_MODULE(C, PARAMS); \ static MunitResult test_##C(const MunitParameter params[], void *data) /* Add test case C to the MunitTest[] array of suite S. */ #define TEST_CASE__3_ARGS(S, C, PARAMS) \ static MunitResult test_##S##_##C(const MunitParameter[], void *); \ TEST_CASE__ADD_TO_SUITE(S, C, PARAMS); \ static MunitResult test_##S##_##C(const MunitParameter params[], \ void *data) /* Add test case C to the MunitTest[] array of group G of suite S. */ #define TEST_CASE__4_ARGS(S, G, C, PARAMS) \ static MunitResult test_##S##_##G##_##C(const MunitParameter[], \ void *); \ TEST_CASE__ADD_TO_GROUP(S, G, C, PARAMS); \ static MunitResult test_##S##_##G##_##C(const MunitParameter params[], \ void *data) /* Add a test case to the MunitTest[] array of the file module. */ #define TEST_CASE__ADD_TO_MODULE(C, PARAMS) \ __attribute__((constructor)) static void _module_tests_##C##_init( \ void) \ { \ MunitTest *tests = _module_tests; \ int n = _module_tests_n; \ TEST_CASE__SET_IN_ARRAY(tests, n, "/" #C, test_##C, NULL, \ NULL, PARAMS); \ _module_tests_n = n + 1; \ } /* Add a test case to the MunitTest[] array of suite S. */ #define TEST_CASE__ADD_TO_SUITE(S, C, PARAMS) \ __attribute__((constructor)) static void _##S##_tests_##C##_init(void) \ { \ MunitTest *tests = _##S##_tests; \ int n = _##S##_tests_n; \ TEST_CASE__SET_IN_ARRAY(tests, n, "/" #C, test_##S##_##C, \ _##S##_setup, _##S##_tear_down, \ PARAMS); \ _##S##_tests_n = n + 1; \ } /* Add a test case to MunitTest[] array of group G in suite S. 
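 * 
 * E.g., the legacy-style registration flow sketched end to end (hypothetical
 * names, not a test present in this tree):
 * 
 *   TEST_MODULE(example);
 *   TEST_SUITE(io);
 *   TEST_GROUP(io, writes);
 *   TEST_CASE(io, writes, small, NULL) { (void)params; (void)data; return MUNIT_OK; }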
*/ #define TEST_CASE__ADD_TO_GROUP(S, G, C, PARAMS) \ __attribute__(( \ constructor)) static void _##S##_##G##_tests_##C##_init(void) \ { \ MunitTest *tests = _##S##_##G##_tests; \ int n = _##S##_##G##_tests_n; \ TEST_CASE__SET_IN_ARRAY(tests, n, "/" #C, \ test_##S##_##G##_##C, _##S##_setup, \ _##S##_tear_down, PARAMS); \ _##S##_##G##_tests_n = n + 1; \ } /* Set the values of the I'th test case slot in the given test array */ #define TEST_CASE__SET_IN_ARRAY(TESTS, I, NAME, FUNC, SETUP, TEAR_DOWN, \ PARAMS) \ TESTS[I].name = NAME; \ TESTS[I].test = FUNC; \ TESTS[I].setup = SETUP; \ TESTS[I].tear_down = TEAR_DOWN; \ TESTS[I].options = 0; \ TESTS[I].parameters = PARAMS #define TEST__GET_3RD_ARG(arg1, arg2, arg3, ...) arg3 #define TEST__GET_5TH_ARG(arg1, arg2, arg3, arg4, arg5, ...) arg5 #endif /* TEST_RUNNER_H */ dqlite-1.16.7/test/lib/server.c000066400000000000000000000110721465252713400163010ustar00rootroot00000000000000#include <sys/un.h> #include "fs.h" #include "server.h" static int endpointConnect(void *data, const char *address, int *fd) { struct sockaddr_un addr; int rv; (void)address; (void)data; memset(&addr, 0, sizeof addr); addr.sun_family = AF_UNIX; strcpy(addr.sun_path + 1, address + 1); *fd = socket(AF_UNIX, SOCK_STREAM, 0); munit_assert_int(*fd, !=, -1); rv = connect(*fd, (struct sockaddr *)&addr, sizeof(sa_family_t) + strlen(address + 1) + 1); return rv; } void test_server_setup(struct test_server *s, const unsigned id, const MunitParameter params[]) { (void)params; s->id = id; sprintf(s->address, "@%u", id); s->dir = test_dir_setup(); s->role_management = false; memset(s->others, 0, sizeof s->others); } void test_server_stop(struct test_server *s) { int rv; test_server_client_close(s, &s->client); if (s->role_management) { dqlite_node_handover(s->dqlite); rv = dqlite_node_stop(s->dqlite); } else { rv = dqlite_node_stop(s->dqlite); } munit_assert_int(rv, ==, 0); dqlite_node_destroy(s->dqlite); } void test_server_tear_down(struct test_server *s) { test_server_stop(s); test_dir_tear_down(s->dir); } void test_server_start(struct test_server *s, const MunitParameter params[]) { int rv; rv = dqlite_node_create(s->id, s->address, s->dir, &s->dqlite); munit_assert_int(rv, ==, 0); rv = dqlite_node_set_bind_address(s->dqlite, s->address); munit_assert_int(rv, ==, 0); rv = dqlite_node_set_connect_func(s->dqlite, endpointConnect, s); munit_assert_int(rv, ==, 0); rv = dqlite_node_set_network_latency_ms(s->dqlite, 10); munit_assert_int(rv, ==, 0); const char *snapshot_threshold_param = munit_parameters_get(params, SNAPSHOT_THRESHOLD_PARAM); if (snapshot_threshold_param != NULL) { unsigned threshold = (unsigned)atoi(snapshot_threshold_param); rv = dqlite_node_set_snapshot_params(s->dqlite, threshold, threshold); munit_assert_int(rv, ==, 0); } const char *snapshot_compression_param = munit_parameters_get(params, SNAPSHOT_COMPRESSION_PARAM); if (snapshot_compression_param != NULL) { bool snapshot_compression = (bool)atoi(snapshot_compression_param); rv = dqlite_node_set_snapshot_compression(s->dqlite, snapshot_compression); munit_assert_int(rv, ==, 0); } const char *disk_mode_param = munit_parameters_get(params, "disk_mode"); if (disk_mode_param != NULL) { bool disk_mode = (bool)atoi(disk_mode_param); if (disk_mode) { rv = dqlite_node_enable_disk_mode(s->dqlite); munit_assert_int(rv, ==, 0); } } const char *target_voters_param = munit_parameters_get(params, "target_voters"); if (target_voters_param != NULL) { int n = atoi(target_voters_param); rv = dqlite_node_set_target_voters(s->dqlite, n); 
munit_assert_int(rv, ==, 0); } const char *target_standbys_param = munit_parameters_get(params, "target_standbys"); if (target_standbys_param != NULL) { int n = atoi(target_standbys_param); rv = dqlite_node_set_target_standbys(s->dqlite, n); munit_assert_int(rv, ==, 0); } const char *role_management_param = munit_parameters_get(params, "role_management"); if (role_management_param != NULL) { bool role_management = (bool)atoi(role_management_param); s->role_management = role_management; if (role_management) { rv = dqlite_node_enable_role_management(s->dqlite); munit_assert_int(rv, ==, 0); } } rv = dqlite_node_start(s->dqlite); munit_assert_int(rv, ==, 0); test_server_client_connect(s, &s->client); } struct client_proto *test_server_client(struct test_server *s) { return &s->client; } void test_server_client_reconnect(struct test_server *s, struct client_proto *c) { test_server_client_close(s, c); test_server_client_connect(s, c); } void test_server_client_connect(struct test_server *s, struct client_proto *c) { int rv; int fd; rv = endpointConnect(NULL, s->address, &fd); munit_assert_int(rv, ==, 0); memset(c, 0, sizeof *c); buffer__init(&c->read); buffer__init(&c->write); c->fd = fd; } void test_server_client_close(struct test_server *s, struct client_proto *c) { (void)s; clientClose(c); } static void setOther(struct test_server *s, struct test_server *other) { unsigned i = other->id - 1; munit_assert_ptr_null(s->others[i]); s->others[i] = other; } void test_server_network(struct test_server *servers, unsigned n_servers) { unsigned i; unsigned j; for (i = 0; i < n_servers; i++) { for (j = 0; j < n_servers; j++) { struct test_server *server = &servers[i]; struct test_server *other = &servers[j]; if (i == j) { continue; } setOther(server, other); } } } dqlite-1.16.7/test/lib/server.h000066400000000000000000000034141465252713400163070ustar00rootroot00000000000000/* Setup fully blown servers running in standalone threads. */ #ifndef TEST_SERVER_H #define TEST_SERVER_H #include <pthread.h> #include <stdbool.h> #include "../../src/client/protocol.h" #include "../../include/dqlite.h" #include "endpoint.h" #include "munit.h" #define SNAPSHOT_THRESHOLD_PARAM "snapshot-threshold" #define SNAPSHOT_COMPRESSION_PARAM "snapshot_compression" struct test_server { unsigned id; /* Server ID. */ char address[8]; /* Server address. */ char *dir; /* Data directory. */ dqlite_node *dqlite; /* Dqlite instance. */ bool role_management; struct client_proto client; /* Connected client. */ struct test_server *others[5]; /* Other servers, by ID-1. */ }; /* Initialize the test server. */ void test_server_setup(struct test_server *s, unsigned id, const MunitParameter params[]); /* Cleanup the test server. */ void test_server_tear_down(struct test_server *s); /* Start the test server. */ void test_server_start(struct test_server *s, const MunitParameter params[]); /* Stop the test server. */ void test_server_stop(struct test_server *s); /* Connect all the given servers to each other. */ void test_server_network(struct test_server *servers, unsigned n_servers); /* Return a client connected to the server. */ struct client_proto *test_server_client(struct test_server *s); /* Closes and reopens a client connection to the server. */ void test_server_client_reconnect(struct test_server *s, struct client_proto *c); /* Opens a client connection to the server. */ void test_server_client_connect(struct test_server *s, struct client_proto *c); /* Closes a client connection to the server. 
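 * 
 * Typical lifecycle in a test using these helpers (sketch):
 * 
 *   struct test_server server;
 *   test_server_setup(&server, 1, params);
 *   test_server_start(&server, params);
 *   struct client_proto *c = test_server_client(&server);
 *   ... drive the wire protocol through c ...
 *   test_server_tear_down(&server);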
*/ void test_server_client_close(struct test_server *s, struct client_proto *c); #endif /* TEST_SERVER_H */ dqlite-1.16.7/test/lib/sqlite.c000066400000000000000000000007671465252713400163050ustar00rootroot00000000000000#include <sqlite3.h> #include "sqlite.h" void test_sqlite_setup(const MunitParameter params[]) { int rc; (void)params; rc = sqlite3_initialize(); if (rc != SQLITE_OK) { munit_errorf("sqlite_init(): %s", sqlite3_errstr(rc)); } rc = sqlite3_threadsafe(); if (!(rc == 1 || rc == 2)) { munit_errorf("sqlite3_threadsafe(): %d", rc); } } void test_sqlite_tear_down(void) { int rc; rc = sqlite3_shutdown(); if (rc != SQLITE_OK) { munit_errorf("sqlite_shutdown(): %s", sqlite3_errstr(rc)); } } dqlite-1.16.7/test/lib/sqlite.h000066400000000000000000000006031465252713400162770ustar00rootroot00000000000000/* Global SQLite configuration. */ #ifndef TEST_SQLITE_H #define TEST_SQLITE_H #include "munit.h" /* Setup SQLite global state. */ void test_sqlite_setup(const MunitParameter params[]); /* Teardown SQLite global state. */ void test_sqlite_tear_down(void); #define SETUP_SQLITE test_sqlite_setup(params); #define TEAR_DOWN_SQLITE test_sqlite_tear_down(); #endif /* TEST_SQLITE_H */ dqlite-1.16.7/test/lib/stmt.h000066400000000000000000000015511465252713400157700ustar00rootroot00000000000000/** * Setup a test prepared statement. */ #ifndef TEST_STMT_H #define TEST_STMT_H #include <sqlite3.h> #define FIXTURE_STMT sqlite3_stmt *stmt #define STMT_PREPARE(CONN, STMT, SQL) \ { \ int rc; \ rc = sqlite3_prepare_v2(CONN, SQL, -1, &STMT, NULL); \ munit_assert_int(rc, ==, 0); \ } #define STMT_FINALIZE(STMT) sqlite3_finalize(STMT) #define STMT_EXEC(CONN, SQL) \ { \ int rc; \ char *msg; \ rc = sqlite3_exec(CONN, SQL, NULL, NULL, &msg); \ munit_assert_int(rc, ==, SQLITE_OK); \ } #endif /* TEST_STMT_H */ dqlite-1.16.7/test/lib/util.h000066400000000000000000000017501465252713400157570ustar00rootroot00000000000000/** * Utility macros and functions. */ #ifndef TEST_UTIL_H #define TEST_UTIL_H #include "munit.h" #include <time.h> /* Wait a bounded time in seconds until a condition is true. */ #define AWAIT_TRUE(FN, ARG, SEC) \ do { \ struct timespec _start = {0}; \ struct timespec _end = {0}; \ clock_gettime(CLOCK_MONOTONIC, &_start); \ clock_gettime(CLOCK_MONOTONIC, &_end); \ while (!FN(ARG) && ((_end.tv_sec - _start.tv_sec) < SEC)) { \ clock_gettime(CLOCK_MONOTONIC, &_end); \ } \ if (!FN(ARG)) { \ return MUNIT_FAIL; \ } \ } while (0) #endif /* TEST_UTIL_H */ dqlite-1.16.7/test/lib/uv.c000066400000000000000000000023311465252713400154230ustar00rootroot00000000000000#include "uv.h" #define TEST_UV_MAX_LOOP_RUN 10 /* Max n. of loop iterations upon teardown */ void test_uv_setup(const MunitParameter params[], struct uv_loop_s *l) { int rv; (void)params; rv = uv_loop_init(l); munit_assert_int(rv, ==, 0); } int test_uv_run(struct uv_loop_s *l, unsigned n) { unsigned i; int rv; munit_assert_int(n, >, 0); for (i = 0; i < n; i++) { rv = uv_run(l, UV_RUN_ONCE); if (rv < 0) { munit_errorf("uv_run: %s (%d)", uv_strerror(rv), rv); } if (rv == 0) { break; } } return rv; } void test_uv_stop(struct uv_loop_s *l) { unsigned n_handles; /* Spin a few times to trigger pending callbacks. 
*/ n_handles = test_uv_run(l, TEST_UV_MAX_LOOP_RUN); if (n_handles > 0) { munit_errorf("loop still has %u pending active handles", n_handles); } } static void test_uv__walk_cb(uv_handle_t *handle, void *arg) { (void)arg; munit_logf(MUNIT_LOG_INFO, "handle %d", handle->type); } void test_uv_tear_down(struct uv_loop_s *l) { int rv; rv = uv_loop_close(l); if (rv != 0) { uv_walk(l, test_uv__walk_cb, NULL); munit_errorf("uv_loop_close: %s (%d)", uv_strerror(rv), rv); } rv = uv_replace_allocator(malloc, realloc, calloc, free); munit_assert_int(rv, ==, 0); } dqlite-1.16.7/test/lib/uv.h000066400000000000000000000051731465252713400154370ustar00rootroot00000000000000/** * Add support for using the libuv loop in tests. */ #ifndef TEST_UV_H #define TEST_UV_H #include <uv.h> #include "munit.h" /* Max n. of loop iterations run by a single function call */ #define TEST_UV_MAX_LOOP_RUN 10 /** * Initialize the given libuv loop. */ void test_uv_setup(const MunitParameter params[], struct uv_loop_s *l); /** * Run the loop until there are no pending active handles. * * If there are still pending active handles after 10 loop iterations, the test * will fail. * * This is meant to be used in tear down functions. */ void test_uv_stop(struct uv_loop_s *l); /** * Tear down the loop making sure no active handles are left. */ void test_uv_tear_down(struct uv_loop_s *l); /** * Run the loop until there are no pending active handles or the given number of * iterations is reached. * * Return non-zero if there are pending handles. */ int test_uv_run(struct uv_loop_s *l, unsigned n); /** * Run the loop until the given function returns true. * * If the loop exhausts all active handles or if #TEST_UV_MAX_LOOP_RUN is * reached without @f returning #true, the test fails. */ #define test_uv_run_until(DATA, FUNC) \ { \ unsigned i; \ int rv; \ for (i = 0; i < TEST_UV_MAX_LOOP_RUN; i++) { \ if (FUNC(DATA)) { \ break; \ } \ rv = uv_run(&f->loop, UV_RUN_ONCE); \ if (rv < 0) { \ munit_errorf("uv_run: %s", uv_strerror(rv)); \ } \ if (rv == 0) { \ if (FUNC(DATA)) { \ break; \ } \ munit_errorf( \ "uv_run: stopped after %u iterations", \ i + 1); \ } \ } \ if (i == TEST_UV_MAX_LOOP_RUN) { \ munit_errorf( \ "uv_run: condition not met in %d iterations", \ TEST_UV_MAX_LOOP_RUN); \ } \ } #endif /* TEST_UV_H */ dqlite-1.16.7/test/lib/vfs.h000066400000000000000000000012761465252713400156030ustar00rootroot00000000000000/** * Setup an in-memory VFS instance to use in tests. 
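 * 
 * Usage sketch (assumes the enclosing fixture provides the f->config field
 * whose name SETUP_VFS passes to VfsInit; FIXTURE_VFS already carries its own
 * trailing semicolon):
 * 
 *   struct fixture { struct config config; FIXTURE_VFS };
 * 
 *   in setup: SETUP_VFS; in tear down: TEAR_DOWN_VFS;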
*/ #ifndef TEST_VFS_H #define TEST_VFS_H #include "../../src/vfs.h" #define FIXTURE_VFS struct sqlite3_vfs vfs; #define SETUP_VFS \ { \ int rv_; \ rv_ = VfsInit(&f->vfs, f->config.name); \ munit_assert_int(rv_, ==, 0); \ rv_ = sqlite3_vfs_register(&f->vfs, 0); \ munit_assert_int(rv_, ==, 0); \ } #define TEAR_DOWN_VFS \ { \ sqlite3_vfs_unregister(&f->vfs); \ VfsClose(&f->vfs); \ } #endif /* TEST_VFS_H */ dqlite-1.16.7/test/raft/000077500000000000000000000000001465252713400150145ustar00rootroot00000000000000dqlite-1.16.7/test/raft/fuzzy/000077500000000000000000000000001465252713400162035ustar00rootroot00000000000000dqlite-1.16.7/test/raft/fuzzy/main_core.c000066400000000000000000000004531465252713400203050ustar00rootroot00000000000000#include "../lib/runner.h" MunitSuite _main_suites[64]; int _main_suites_n = 0; /* Test runner executable */ int main(int argc, char *argv[MUNIT_ARRAY_PARAM(argc)]) { MunitSuite suite = {(char *)"", NULL, _main_suites, 1, 0}; return munit_suite_main(&suite, (void *)"unit", argc, argv); } dqlite-1.16.7/test/raft/fuzzy/test_election.c000066400000000000000000000051651465252713400212170ustar00rootroot00000000000000#include "../lib/cluster.h" #include "../lib/runner.h" /****************************************************************************** * * Fixture * *****************************************************************************/ struct fixture { FIXTURE_CLUSTER; }; static char *cluster_n[] = {"3", "4", "5", "7", NULL}; static char *cluster_pre_vote[] = {"0", "1", NULL}; static MunitParameterEnum _params[] = { {CLUSTER_N_PARAM, cluster_n}, {CLUSTER_PRE_VOTE_PARAM, cluster_pre_vote}, {NULL, NULL}, }; static void *setup(const MunitParameter params[], MUNIT_UNUSED void *user_data) { struct fixture *f = munit_malloc(sizeof *f); SETUP_CLUSTER(0); CLUSTER_BOOTSTRAP; CLUSTER_RANDOMIZE; CLUSTER_START; return f; } static void tear_down(void *data) { struct fixture *f = data; TEAR_DOWN_CLUSTER; free(f); } /****************************************************************************** * * Tests * *****************************************************************************/ SUITE(election) /* A leader is eventually elected */ TEST(election, win, setup, tear_down, 0, _params) { struct fixture *f = data; CLUSTER_STEP_UNTIL_HAS_LEADER(10000); return MUNIT_OK; } /* A new leader is elected if the current one dies. */ TEST(election, change, setup, tear_down, 0, _params) { struct fixture *f = data; CLUSTER_STEP_UNTIL_HAS_LEADER(10000); CLUSTER_KILL_LEADER; CLUSTER_STEP_UNTIL_HAS_NO_LEADER(10000); CLUSTER_STEP_UNTIL_HAS_LEADER(20000); return MUNIT_OK; } /* A new leader is elected if the current one dies and a previously killed * server with an outdated log and outdated term is revived. */ TEST(election, changeReviveOutdated, setup, tear_down, 0, _params) { struct fixture *f = data; unsigned i; /* Kill a random server */ i = ((unsigned)rand()) % CLUSTER_N; CLUSTER_KILL(i); /* Server i's term will be lower than the term of the election. */ CLUSTER_STEP_UNTIL_HAS_LEADER(20000); /* Add some entries to the log */ CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; CLUSTER_KILL_LEADER; CLUSTER_STEP_UNTIL_HAS_NO_LEADER(10000); /* Revive server i with an outdated log and term, the cluster * should be able to elect a new leader */ CLUSTER_REVIVE(i); CLUSTER_STEP_UNTIL_HAS_LEADER(20000); return MUNIT_OK; } /* If no majority of servers is online, no leader is elected. 
*/ TEST(election, noQuorum, setup, tear_down, 0, _params) { struct fixture *f = data; CLUSTER_KILL_MAJORITY; CLUSTER_STEP_UNTIL_ELAPSED(30000); munit_assert_false(CLUSTER_HAS_LEADER); return MUNIT_OK; } dqlite-1.16.7/test/raft/fuzzy/test_liveness.c000066400000000000000000000101011465252713400212270ustar00rootroot00000000000000#include "../lib/cluster.h" #include "../lib/runner.h" /****************************************************************************** * * Fixture * *****************************************************************************/ /* Maximum number of cluster loop iterations each test should perform. */ #define MAX_ITERATIONS 25000 /* Maximum number of cluster loop iterations a pair of servers should stay * disconnected. */ #define MAX_DISCONNECT 150 struct disconnection { unsigned id1; unsigned id2; int start; int duration; }; struct fixture { FIXTURE_CLUSTER; struct disconnection *disconnections; }; static char *cluster_n[] = {"3", "4", NULL}; static char *cluster_pre_vote[] = {"0", "1", NULL}; static MunitParameterEnum _params[] = { {CLUSTER_N_PARAM, cluster_n}, {CLUSTER_PRE_VOTE_PARAM, cluster_pre_vote}, {NULL, NULL}, }; /* Return the number of distinct server pairs in the cluster. */ static int __server_pairs(struct fixture *f) { return CLUSTER_N * (CLUSTER_N - 1) / 2; } /* Update the cluster connectivity for the given iteration. */ static void __update_connectivity(struct fixture *f, int i) { int p; int pairs = __server_pairs(f); for (p = 0; p < pairs; p++) { struct disconnection *disconnection = &f->disconnections[p]; unsigned id1 = disconnection->id1; unsigned id2 = disconnection->id2; if (disconnection->start == 0) { /* Decide whether to disconnect this pair. */ if (munit_rand_int_range(1, 10) <= 1) { disconnection->start = i; disconnection->duration = munit_rand_int_range(50, MAX_DISCONNECT); raft_fixture_saturate(&f->cluster, id1 - 1, id2 - 1); raft_fixture_saturate(&f->cluster, id2 - 1, id1 - 1); } } else { /* Decide whether to reconnect this pair. */ if (i - disconnection->start > disconnection->duration) { raft_fixture_desaturate(&f->cluster, id1 - 1, id2 - 1); raft_fixture_desaturate(&f->cluster, id2 - 1, id1 - 1); disconnection->start = 0; } } } } static void *setup(const MunitParameter params[], MUNIT_UNUSED void *user_data) { struct fixture *f = munit_malloc(sizeof *f); int pairs; size_t i, j, k; SETUP_CLUSTER(0); CLUSTER_BOOTSTRAP; CLUSTER_RANDOMIZE; CLUSTER_START; /* Number of distinct pairs of servers. */ pairs = __server_pairs(f); f->disconnections = munit_malloc(pairs * sizeof *f->disconnections); k = 0; for (i = 0; i < CLUSTER_N; i++) { for (j = i + 1; j < CLUSTER_N; j++) { struct disconnection *disconnection = &f->disconnections[k]; disconnection->id1 = i + 1; disconnection->id2 = j + 1; disconnection->start = 0; disconnection->duration = 0; k++; } } return f; } static void tear_down(void *data) { struct fixture *f = data; TEAR_DOWN_CLUSTER; free(f->disconnections); free(f); } /****************************************************************************** * * Tests * *****************************************************************************/ SUITE(liveness) static void apply_cb(struct raft_apply *req, int status, void *result) { (void)status; (void)result; free(req); } /* The system makes progress even in case of network disruptions. 
*/ TEST(liveness, networkDisconnect, setup, tear_down, 0, _params) { struct fixture *f = data; int i = 0; (void)params; for (i = 0; i < MAX_ITERATIONS; i++) { __update_connectivity(f, i); raft_fixture_step(&f->cluster); if (CLUSTER_LEADER != CLUSTER_N) { struct raft_apply *req = munit_malloc(sizeof *req); CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, req, 1, apply_cb); if (CLUSTER_LAST_APPLIED(CLUSTER_LEADER) >= 2) { break; } } } // munit_assert_int(CLUSTER_LAST_APPLIED(CLUSTER_LEADER), >=, 2); return MUNIT_OK; } dqlite-1.16.7/test/raft/fuzzy/test_membership.c000066400000000000000000000047751465252713400215560ustar00rootroot00000000000000#include "../lib/cluster.h" #include "../lib/runner.h" /****************************************************************************** * * Fixture * *****************************************************************************/ struct fixture { FIXTURE_CLUSTER; struct raft_change req; }; static char *cluster_n[] = {"3", "4", "5", NULL}; static MunitParameterEnum _params[] = { {CLUSTER_N_PARAM, cluster_n}, {NULL, NULL}, }; static void *setup(const MunitParameter params[], MUNIT_UNUSED void *user_data) { struct fixture *f = munit_malloc(sizeof *f); SETUP_CLUSTER(0); CLUSTER_BOOTSTRAP; CLUSTER_RANDOMIZE; CLUSTER_START; CLUSTER_STEP_UNTIL_HAS_LEADER(10000); return f; } static void tear_down(void *data) { struct fixture *f = data; TEAR_DOWN_CLUSTER; free(f); } /****************************************************************************** * * Tests * *****************************************************************************/ SUITE(membership) TEST(membership, addNonVoting, setup, tear_down, 0, _params) { struct fixture *f = data; const struct raft_server *server; struct raft *raft; CLUSTER_ADD(&f->req); CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_LEADER, 3, 2000); /* Then promote it. */ CLUSTER_ASSIGN(&f->req, RAFT_STANDBY); CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_N, 4, 2000); raft = CLUSTER_RAFT(CLUSTER_LEADER); server = &raft->configuration.servers[CLUSTER_N - 1]; munit_assert_int(server->id, ==, CLUSTER_N); return MUNIT_OK; } TEST(membership, addVoting, setup, tear_down, 0, _params) { struct fixture *f = data; const struct raft_server *server; struct raft *raft; (void)params; CLUSTER_ADD(&f->req); CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_LEADER, 3, 2000); /* Then promote it. 
*/ CLUSTER_ASSIGN(&f->req, RAFT_VOTER); CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_N, 4, 2000); raft = CLUSTER_RAFT(CLUSTER_LEADER); server = &raft->configuration.servers[CLUSTER_N - 1]; munit_assert_int(server->role, ==, RAFT_VOTER); return MUNIT_OK; } TEST(membership, removeVoting, setup, tear_down, 0, _params) { struct fixture *f = data; struct raft *raft; int rv; (void)params; raft = CLUSTER_RAFT(CLUSTER_LEADER); rv = raft_remove(raft, &f->req, CLUSTER_LEADER % CLUSTER_N + 1, NULL); munit_assert_int(rv, ==, 0); CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_LEADER, 3, 2000); munit_assert_int(raft->configuration.n, ==, CLUSTER_N - 1); return 0; } dqlite-1.16.7/test/raft/fuzzy/test_replication.c000066400000000000000000000107641465252713400217270ustar00rootroot00000000000000#include "../lib/cluster.h" #include "../lib/runner.h" /****************************************************************************** * * Fixture * *****************************************************************************/ struct fixture { FIXTURE_CLUSTER; }; static char *cluster_n[] = {"3", "5", "7", NULL}; static MunitParameterEnum _params[] = { {CLUSTER_N_PARAM, cluster_n}, {NULL, NULL}, }; static void *setup(const MunitParameter params[], MUNIT_UNUSED void *user_data) { struct fixture *f = munit_malloc(sizeof *f); SETUP_CLUSTER(0); CLUSTER_BOOTSTRAP; CLUSTER_RANDOMIZE; CLUSTER_START; CLUSTER_STEP_UNTIL_HAS_LEADER(10000); return f; } static void tear_down(void *data) { struct fixture *f = data; TEAR_DOWN_CLUSTER; free(f); } /****************************************************************************** * * Helper macros * *****************************************************************************/ #define APPLY_ADD_ONE(REQ) CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, REQ, 1, NULL) /****************************************************************************** * * Tests * *****************************************************************************/ SUITE(replication) /* New entries on the leader are eventually replicated to followers. */ TEST(replication, appendEntries, setup, tear_down, 0, _params) { struct fixture *f = data; struct raft_apply *req = munit_malloc(sizeof *req); (void)params; APPLY_ADD_ONE(req); CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_N, 3, 2000); free(req); return MUNIT_OK; } /* The cluster remains available even if the current leader dies and a new * leader gets elected. */ TEST(replication, availability, setup, tear_down, 0, _params) { struct fixture *f = data; struct raft_apply *req1 = munit_malloc(sizeof *req1); struct raft_apply *req2 = munit_malloc(sizeof *req2); (void)params; APPLY_ADD_ONE(req1); CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_N, 3, 2000); CLUSTER_KILL_LEADER; CLUSTER_STEP_UNTIL_HAS_NO_LEADER(10000); CLUSTER_STEP_UNTIL_HAS_LEADER(10000); APPLY_ADD_ONE(req2); /* Index 3 -> 5 = APPLY entry + BARRIER entry after becoming leader */ CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_LEADER, 5, 2000); free(req1); free(req2); return MUNIT_OK; } static void apply_cb(struct raft_apply *req, int status, void *result) { (void)status; (void)result; free(req); } /* If no quorum is available, entries don't get committed. 
*/ TEST(replication, noQuorum, setup, tear_down, 0, _params) { struct fixture *f = data; struct raft_apply *req = munit_malloc(sizeof *req); unsigned i; (void)params; CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, req, 1, apply_cb); CLUSTER_KILL_MAJORITY; CLUSTER_STEP_UNTIL_ELAPSED(10000); for (i = 0; i < CLUSTER_N; i++) { munit_assert_int(CLUSTER_LAST_APPLIED(i), ==, 1); } return MUNIT_OK; } /* If the cluster is partitioned, entries don't get committed. */ TEST(replication, partitioned, setup, tear_down, 0, _params) { struct fixture *f = data; struct raft_apply *req1 = munit_malloc(sizeof *req1); struct raft_apply *req2 = munit_malloc(sizeof *req2); unsigned leader_id; size_t i; size_t n; (void)params; leader_id = CLUSTER_LEADER + 1; /* Disconnect the leader from a majority of servers */ n = 0; for (i = 0; n < (CLUSTER_N / 2) + 1; i++) { struct raft *raft = CLUSTER_RAFT(i); if (raft->id == leader_id) { continue; } raft_fixture_saturate(&f->cluster, leader_id - 1, raft->id - 1); raft_fixture_saturate(&f->cluster, raft->id - 1, leader_id - 1); n++; } /* Try to append a new entry using the disconnected leader. */ CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, req1, 1, apply_cb); /* The leader gets deposed. */ CLUSTER_STEP_UNTIL_HAS_NO_LEADER(10000); /* The entry does not get committed. */ CLUSTER_STEP_UNTIL_ELAPSED(5000); /* Reconnect the old leader */ for (i = 0; i < CLUSTER_N; i++) { struct raft *raft = CLUSTER_RAFT(i); if (raft->id == leader_id) { continue; } raft_fixture_desaturate(&f->cluster, leader_id - 1, raft->id - 1); } // TODO this fails with seed 0x3914306f CLUSTER_STEP_UNTIL_HAS_LEADER(30000); /* Re-try now to append the entry. */ CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, req2, 1, apply_cb); CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_LEADER, 2, 10000); return MUNIT_OK; } dqlite-1.16.7/test/raft/integration/000077500000000000000000000000001465252713400173375ustar00rootroot00000000000000dqlite-1.16.7/test/raft/integration/append_helpers.h000066400000000000000000000113751465252713400225100ustar00rootroot00000000000000#include "../../../src/raft/uv.h" #include "../lib/runner.h" /****************************************************************************** * * Helper macros * *****************************************************************************/ struct result { int status; bool done; void *data; }; static void appendCbAssertResult(struct raft_io_append *req, int status) { struct result *result = req->data; munit_assert_int(status, ==, result->status); result->done = true; } /* Declare and fill the entries array for the append request identified by * I. The array will have N entries, and each entry will have a data buffer of * SIZE bytes.*/ #define ENTRIES(I, N, SIZE) \ struct raft_entry _entries##I[N]; \ uint8_t _entries_data##I[N * SIZE]; \ { \ int _i; \ for (_i = 0; _i < N; _i++) { \ struct raft_entry *entry = &_entries##I[_i]; \ entry->term = 1; \ entry->type = RAFT_COMMAND; \ entry->buf.base = &_entries_data##I[_i * SIZE]; \ entry->buf.len = SIZE; \ entry->batch = NULL; \ munit_assert_ptr_not_null(entry->buf.base); \ memset(entry->buf.base, 0, entry->buf.len); \ uint64_t _temporary = f->count; \ memcpy(entry->buf.base, &_temporary, 8); \ f->count++; \ } \ } /* Submit an append request identified by I, with N_ENTRIES entries, each one of * size ENTRY_SIZE. When the append request completes, CB will be called * and DATA will be available in result->data. f->io.append is expected to * return RV. 
*/ #define APPEND_SUBMIT_CB_DATA(I, N_ENTRIES, ENTRY_SIZE, CB, DATA, RV) \ struct raft_io_append _req##I; \ struct result _result##I = {0, false, DATA}; \ int _rv##I; \ ENTRIES(I, N_ENTRIES, ENTRY_SIZE); \ _req##I.data = &_result##I; \ _rv##I = f->io.append(&f->io, &_req##I, _entries##I, N_ENTRIES, CB); \ munit_assert_int(_rv##I, ==, RV) /* Submit an append request identified by I, with N_ENTRIES entries, each one of * size ENTRY_SIZE. The default expectation is for the operation to succeed. A * custom STATUS can be set with APPEND_EXPECT. */ #define APPEND_SUBMIT(I, N_ENTRIES, ENTRY_SIZE) \ APPEND_SUBMIT_CB_DATA(I, N_ENTRIES, ENTRY_SIZE, appendCbAssertResult, \ NULL, 0) /* Try to submit an append request and assert that the given error code and * message are returned. */ #define APPEND_ERROR(N_ENTRIES, ENTRY_SIZE, RV, ERRMSG) \ do { \ struct raft_io_append _req; \ int _rv; \ ENTRIES(0, N_ENTRIES, ENTRY_SIZE); \ _rv = f->io.append(&f->io, &_req, _entries0, N_ENTRIES, NULL); \ munit_assert_int(_rv, ==, RV); \ /* munit_assert_string_equal(f->io.errmsg, ERRMSG);*/ \ } while (0) #define APPEND_EXPECT(I, STATUS) _result##I.status = STATUS /* Wait for the append request identified by I to complete. */ #define APPEND_WAIT(I) LOOP_RUN_UNTIL(&_result##I.done) /* Submit an append request with an entries array with N_ENTRIES entries, each * one of size ENTRY_SIZE, and wait for the operation to successfully * complete. */ #define APPEND(N_ENTRIES, ENTRY_SIZE) \ do { \ APPEND_SUBMIT(0, N_ENTRIES, ENTRY_SIZE); \ APPEND_WAIT(0); \ } while (0) /* Submit an append request with the given parameters and wait for the operation * to fail with the given code and message. */ #define APPEND_FAILURE(N_ENTRIES, ENTRY_SIZE, STATUS, ERRMSG) \ { \ APPEND_SUBMIT(0, N_ENTRIES, ENTRY_SIZE); \ APPEND_EXPECT(0, STATUS); \ APPEND_WAIT(0); \ f->count--; \ munit_assert_string_equal(f->io.errmsg, ERRMSG); \ } dqlite-1.16.7/test/raft/integration/main_core.c000066400000000000000000000000531465252713400214350ustar00rootroot00000000000000#include "../lib/runner.h" RUNNER("core") dqlite-1.16.7/test/raft/integration/main_uv.c000066400000000000000000000000511465252713400211350ustar00rootroot00000000000000#include "../lib/runner.h" RUNNER("uv") dqlite-1.16.7/test/raft/integration/test_apply.c000066400000000000000000000116241465252713400216730ustar00rootroot00000000000000#include "../lib/cluster.h" #include "../lib/runner.h" /****************************************************************************** * * Fixture * *****************************************************************************/ struct fixture { FIXTURE_CLUSTER; }; static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) { struct fixture *f = munit_malloc(sizeof *f); SETUP_CLUSTER(2); CLUSTER_BOOTSTRAP; CLUSTER_START; CLUSTER_ELECT(0); return f; } static void tearDown(void *data) { struct fixture *f = data; TEAR_DOWN_CLUSTER; free(f); } /****************************************************************************** * * Helper macros * *****************************************************************************/ struct result { int status; bool done; raft_index prev_applied; struct raft *raft; }; static void applyCbAssertResult(struct raft_apply *req, int status, void *_) { struct result *result = req->data; (void)_; munit_assert_int(status, ==, result->status); if (status == 0) { munit_assert_ulong(result->prev_applied, <, raft_last_applied(result->raft)); } result->done = true; } static bool applyCbHasFired(struct raft_fixture *f, void *arg) { 
struct result *result = arg; (void)f; return result->done; } /* Submit an apply request. */ #define APPLY_SUBMIT(I, N) \ struct raft_buffer _buf; \ struct raft_apply _req; \ struct raft *r = CLUSTER_RAFT(I); \ struct result _result = {0, false, raft_last_applied(r), r}; \ int _rv; \ FsmEncodeSetX(N, &_buf); \ _req.data = &_result; \ _rv = raft_apply(CLUSTER_RAFT(I), &_req, &_buf, NULL, 1, applyCbAssertResult); \ munit_assert_int(_rv, ==, 0); /* Expect the apply callback to fire with the given status. */ #define APPLY_EXPECT(STATUS) _result.status = STATUS /* Wait until an apply request completes. */ #define APPLY_WAIT CLUSTER_STEP_UNTIL(applyCbHasFired, &_result, 2000) /* Submit to the I'th server a request to apply a new RAFT_COMMAND entry and * wait for the operation to succeed. */ #define APPLY(I, N) \ do { \ APPLY_SUBMIT(I, N); \ APPLY_WAIT; \ } while (0) /* Submit to the I'th server a request to apply a new RAFT_COMMAND entry and * assert that the given error is returned. */ #define APPLY_ERROR(I, RV, ERRMSG) \ do { \ struct raft_buffer _buf; \ struct raft_apply _req; \ int _rv; \ FsmEncodeSetX(123, &_buf); \ _rv = raft_apply(CLUSTER_RAFT(I), &_req, &_buf, NULL, 1, NULL); \ munit_assert_int(_rv, ==, RV); \ munit_assert_string_equal(CLUSTER_ERRMSG(I), ERRMSG); \ raft_free(_buf.base); \ } while (0) /****************************************************************************** * * Success scenarios * *****************************************************************************/ SUITE(raft_apply) /* Append the very first command entry. */ TEST(raft_apply, first, setUp, tearDown, 0, NULL) { struct fixture *f = data; int val = 123; APPLY(0, val); munit_assert_int(FsmGetX(CLUSTER_FSM(0)), ==, val); return MUNIT_OK; } /* Append two command entries. */ TEST(raft_apply, two, setUp, tearDown, 0, NULL) { struct fixture *f = data; int val = 123; APPLY(0, val); munit_assert_int(FsmGetX(CLUSTER_FSM(0)), ==, val); val = 124; APPLY(0, val); munit_assert_int(FsmGetX(CLUSTER_FSM(0)), ==, val); return MUNIT_OK; } /****************************************************************************** * * Failure scenarios * *****************************************************************************/ /* If the raft instance is not in leader state, an error is returned. */ TEST(raft_apply, notLeader, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPLY_ERROR(1, RAFT_NOTLEADER, "server is not the leader"); return MUNIT_OK; } /* If the raft instance steps down from leader state, the apply callback fires * with an error. */ TEST(raft_apply, leadershipLost, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPLY_SUBMIT(0, 123); APPLY_EXPECT(RAFT_LEADERSHIPLOST); CLUSTER_DEPOSE; APPLY_WAIT; return MUNIT_OK; } dqlite-1.16.7/test/raft/integration/test_assign.c000066400000000000000000000364061465252713400220370ustar00rootroot00000000000000#include "../lib/cluster.h" #include "../lib/runner.h" /****************************************************************************** * * Fixture * *****************************************************************************/ struct fixture { FIXTURE_CLUSTER; }; /****************************************************************************** * * Helper macros * *****************************************************************************/ struct result { int status; bool done; }; /* Add an empty server to the cluster and start it. 
*/ #define GROW \ { \ int rv__; \ CLUSTER_GROW; \ rv__ = raft_start(CLUSTER_RAFT(2)); \ munit_assert_int(rv__, ==, 0); \ } static void changeCbAssertResult(struct raft_change *req, int status) { struct result *result = req->data; munit_assert_int(status, ==, result->status); result->done = true; } static bool changeCbHasFired(struct raft_fixture *f, void *arg) { struct result *result = arg; (void)f; return result->done; } /* Submit an add request. */ #define ADD_SUBMIT(I, ID) \ struct raft_change _req; \ char _address[16]; \ struct result _result = {0, false}; \ int _rv; \ _req.data = &_result; \ sprintf(_address, "%d", ID); \ _rv = \ raft_add(CLUSTER_RAFT(I), &_req, ID, _address, changeCbAssertResult); \ munit_assert_int(_rv, ==, 0); #define ADD(I, ID) \ do { \ ADD_SUBMIT(I, ID); \ CLUSTER_STEP_UNTIL(changeCbHasFired, &_result, 2000); \ } while (0) /* Submit an assign role request. */ #define ASSIGN_SUBMIT(I, ID, ROLE) \ struct raft_change _req; \ struct result _result = {0, false}; \ int _rv; \ _req.data = &_result; \ _rv = raft_assign(CLUSTER_RAFT(I), &_req, ID, ROLE, changeCbAssertResult); \ munit_assert_int(_rv, ==, 0); /* Expect the request callback to fire with the given status. */ #define ASSIGN_EXPECT(STATUS) _result.status = STATUS; /* Wait until a promote request completes. */ #define ASSIGN_WAIT CLUSTER_STEP_UNTIL(changeCbHasFired, &_result, 10000) /* Submit a request to assign the I'th server to the given role and wait for the * operation to succeed. */ #define ASSIGN(I, ID, ROLE) \ do { \ ASSIGN_SUBMIT(I, ID, ROLE); \ ASSIGN_WAIT; \ } while (0) /* Invoke raft_assign() against the I'th server and assert it returns the given * error code. */ #define ASSIGN_ERROR(I, ID, ROLE, RV, ERRMSG) \ { \ struct raft_change __req; \ int __rv; \ __rv = raft_assign(CLUSTER_RAFT(I), &__req, ID, ROLE, NULL); \ munit_assert_int(__rv, ==, RV); \ munit_assert_string_equal(ERRMSG, CLUSTER_ERRMSG(I)); \ } /****************************************************************************** * * Set up a cluster of 2 servers, with the first as leader. * *****************************************************************************/ static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) { struct fixture *f = munit_malloc(sizeof *f); SETUP_CLUSTER(2); CLUSTER_BOOTSTRAP; CLUSTER_START; CLUSTER_ELECT(0); return f; } static void tearDown(void *data) { struct fixture *f = data; TEAR_DOWN_CLUSTER; free(f); } /****************************************************************************** * * Assertions * *****************************************************************************/ /* Assert the values of the committed and uncommitted configuration indexes on * the raft instance with the given index. */ #define ASSERT_CONFIGURATION_INDEXES(I, COMMITTED, UNCOMMITTED) \ { \ struct raft *raft_ = CLUSTER_RAFT(I); \ munit_assert_int(raft_->configuration_committed_index, ==, COMMITTED); \ munit_assert_int(raft_->configuration_uncommitted_index, ==, \ UNCOMMITTED); \ } 
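/* The catch-up ("promotion") rounds asserted on below follow chapter 4.2.1 of
 * the Raft dissertation: the leader replicates to the promotee in rounds and
 * submits the configuration change once a round completes within an election
 * timeout. The stand-alone sketch below illustrates that loop with 10ms ticks
 * and hypothetical replication/append rates; it is not raft's implementation
 * and the tests do not use it. */

MUNIT_UNUSED static unsigned catchUpRoundsExample(unsigned match,
                                                  unsigned last,
                                                  unsigned replicated_per_tick,
                                                  unsigned appended_per_tick,
                                                  unsigned election_timeout)
{
        unsigned number = 1; /* like leader_state.round_number */
        unsigned start = 0;  /* like leader_state.round_start, in ms */
        unsigned target = last;
        unsigned now = 0;
        /* Assumes replicated_per_tick > appended_per_tick, so the promotee
         * eventually catches up with the growing log. */
        for (;;) {
                while (match < target) {
                        match += replicated_per_tick;
                        last += appended_per_tick;
                        now += 10;
                }
                if (now - start < election_timeout) {
                        return number; /* Caught up: submit the change. */
                }
                number++;
                start = now;
                target = last; /* The next round chases the new log end. */
        }
}

/* E.g. catchUpRoundsExample(0, 1000, 30, 10, 200) returns 2: the first round
 * takes 340ms and times out, while the second closes the remaining gap in
 * 110ms. */

/* Assert that the state of the current catch-up round matches the given
 * values.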
*/ #define ASSERT_CATCH_UP_ROUND(I, PROMOTEE_ID, NUMBER, DURATION) \ { \ struct raft *raft_ = CLUSTER_RAFT(I); \ munit_assert_int(raft_->leader_state.promotee_id, ==, PROMOTEE_ID); \ munit_assert_int(raft_->leader_state.round_number, ==, NUMBER); \ munit_assert_int( \ raft_->io->time(raft_->io) - raft_->leader_state.round_start, >=, \ DURATION); \ } /****************************************************************************** * * raft_assign * *****************************************************************************/ SUITE(raft_assign) /* Assigning the voter role to a spare server whose log is already up-to-date * results in the relevant configuration change being submitted immediately. */ TEST(raft_assign, promoteUpToDate, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft *raft; const struct raft_server *server; GROW; ADD(0, 3); CLUSTER_STEP_N(3); ASSIGN(0, 3, RAFT_VOTER); /* Server 3 is being considered as voting, even though the configuration * change is not committed yet. */ raft = CLUSTER_RAFT(0); server = &raft->configuration.servers[2]; munit_assert_int(server->role, ==, RAFT_VOTER); /* The configuration change request eventually succeeds. */ CLUSTER_STEP_UNTIL_APPLIED(0, 3, 2000); return MUNIT_OK; } static bool thirdServerHasCaughtUp(struct raft_fixture *f, void *arg) { struct raft *raft = raft_fixture_get(f, 0); (void)arg; return raft->leader_state.promotee_id == 0; } /* Assigning the voter role to a spare server whose log is not up-to-date * results in catch-up rounds to start. When the server has caught up, the * configuration change request gets submitted. */ TEST(raft_assign, promoteCatchUp, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft *raft; const struct raft_server *server; CLUSTER_MAKE_PROGRESS; GROW; ADD(0, 3); ASSIGN_SUBMIT(0, 3, RAFT_VOTER); /* Server 3 is not being considered as voting, since its log is behind. */ raft = CLUSTER_RAFT(0); server = &raft->configuration.servers[2]; munit_assert_int(server->role, ==, RAFT_SPARE); /* Advance the match index of server 3, by acknowledging the AppendEntries * request that the leader has sent to it. */ CLUSTER_STEP_UNTIL_APPLIED(2, 3, 2000); /* Disconnect the second server, so it doesn't participate in the quorum */ CLUSTER_SATURATE_BOTHWAYS(0, 1); /* Eventually the leader notices that the third server has caught up. */ CLUSTER_STEP_UNTIL(thirdServerHasCaughtUp, NULL, 2000); /* The leader has submitted a configuration change request, but it's * uncommitted. */ ASSERT_CONFIGURATION_INDEXES(0, 4, 5); /* The third server notifies that it has appended the new * configuration. Since it's considered voting already, it counts for the * majority and the entry gets committed. */ CLUSTER_STEP_UNTIL_APPLIED(0, 5, 2000); CLUSTER_STEP_UNTIL_APPLIED(2, 5, 2000); /* The promotion is completed. */ ASSERT_CONFIGURATION_INDEXES(0, 5, 0); return MUNIT_OK; } static bool thirdServerHasCompletedFirstRound(struct raft_fixture *f, void *arg) { struct raft *raft = raft_fixture_get(f, 0); (void)arg; return raft->leader_state.round_number != 1; } 
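/* In application code, the add-then-promote sequence that ADD() and ASSIGN()
 * drive synchronously is typically chained through the change callbacks. The
 * sketch below shows one way to do that; reusing the same raft_change request
 * for the second step and the minimal error handling are simplifications of
 * this sketch, not documented requirements. */

struct promotionExample {
        struct raft *raft;
        raft_id id;
        struct raft_change change;
};

static void promotionExampleAssignCb(struct raft_change *req, int status)
{
        (void)req; /* The promotion finished; status reports the outcome. */
        (void)status;
}

static void promotionExampleAddCb(struct raft_change *req, int status)
{
        struct promotionExample *p = req->data;
        if (status != 0) {
                return;
        }
        /* The add change is committed, so no other configuration change is
         * in flight and raft_assign() won't fail with RAFT_CANTCHANGE. */
        (void)raft_assign(p->raft, &p->change, p->id, RAFT_VOTER,
                          promotionExampleAssignCb);
}

MUNIT_UNUSED static int promotionExampleStart(struct promotionExample *p,
                                              struct raft *r,
                                              raft_id id,
                                              const char *address)
{
        p->raft = r;
        p->id = id;
        p->change.data = p;
        return raft_add(r, &p->change, id, address, promotionExampleAddCb);
}

/* Assigning the voter role to a spare server whose log is not up-to-date
 * results in catch-up rounds to start. If new entries are appended after a
 * round is started, a new round is initiated once the former one completes.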
*/ TEST(raft_assign, promoteNewRound, setUp, tearDown, 0, NULL) { struct fixture *f = data; unsigned election_timeout = CLUSTER_RAFT(0)->election_timeout; struct raft_apply *req = munit_malloc(sizeof *req); CLUSTER_MAKE_PROGRESS; GROW; ADD(0, 3); ASSIGN_SUBMIT(0, 3, RAFT_VOTER); ASSERT_CATCH_UP_ROUND(0, 3, 1, 0); /* Now that the catch-up round started, submit a new entry and set a very * high latency on the server being promoted, so it won't deliver * AppendEntry results within the round duration. */ CLUSTER_APPLY_ADD_X(0, req, 1, NULL); CLUSTER_STEP_UNTIL_ELAPSED(election_timeout + 100); // FIXME: unstable with 0xcf1f25b6 // ASSERT_CATCH_UP_ROUND(0, 3, 1, election_timeout + 100); /* The leader eventually receives the AppendEntries result from the * promotee, acknowledging all entries except the last one. The first round * completes and a new one starts. */ CLUSTER_STEP_UNTIL(thirdServerHasCompletedFirstRound, NULL, 2000); /* Eventually the server is promoted and everyone applies the entry. */ CLUSTER_STEP_UNTIL_APPLIED(0, req->index, 5000); /* The promotion is eventually completed. */ CLUSTER_STEP_UNTIL_APPLIED(0, req->index + 1, 5000); ASSERT_CONFIGURATION_INDEXES(0, 6, 0); free(req); return MUNIT_SKIP; } static bool secondServerHasNewConfiguration(struct raft_fixture *f, void *arg) { struct raft *raft = raft_fixture_get(f, 1); (void)arg; return raft->configuration.servers[2].role == RAFT_VOTER; } /* If a follower receives an AppendEntries RPC containing a RAFT_CHANGE entry * which changes the role of a server, the configuration change is immediately * applied locally, even if the entry is not yet committed. Once the entry is * committed, the change becomes permanent. */ TEST(raft_assign, changeIsImmediate, setUp, tearDown, 0, NULL) { struct fixture *f = data; GROW; CLUSTER_MAKE_PROGRESS; ADD(0, 3); CLUSTER_STEP_UNTIL_APPLIED(1, 4, 2000); ASSIGN_SUBMIT(0, 3, RAFT_VOTER); CLUSTER_STEP_UNTIL(secondServerHasNewConfiguration, NULL, 3000); ASSERT_CONFIGURATION_INDEXES(1, 4, 5); ASSIGN_WAIT; return MUNIT_OK; } /* Assign the stand-by role to an idle server. */ TEST(raft_assign, promoteToStandBy, setUp, tearDown, 0, NULL) { struct fixture *f = data; GROW; ADD(0, 3); ASSIGN(0, 3, RAFT_STANDBY); return MUNIT_OK; } /* Trying to promote a server on a raft instance which is not the leader results * in an error. */ TEST(raft_assign, notLeader, setUp, tearDown, 0, NULL) { struct fixture *f = data; ASSIGN_ERROR(1, 3, RAFT_VOTER, RAFT_NOTLEADER, "server is not the leader"); return MUNIT_OK; } /* Trying to change the role of a server whose ID is unknown results in an * error. */ TEST(raft_assign, unknownId, setUp, tearDown, 0, NULL) { struct fixture *f = data; ASSIGN_ERROR(0, 3, RAFT_VOTER, RAFT_NOTFOUND, "no server has ID 3"); return MUNIT_OK; } /* Trying to promote a server to an unknown role results in an error. */ TEST(raft_assign, badRole, setUp, tearDown, 0, NULL) { struct fixture *f = data; ASSIGN_ERROR(0, 3, 999, RAFT_BADROLE, "server role is not valid"); return MUNIT_OK; } /* Trying to assign the voter role to a server which already has it results in * an error. */ TEST(raft_assign, alreadyHasRole, setUp, tearDown, 0, NULL) { struct fixture *f = data; ASSIGN_ERROR(0, 1, RAFT_VOTER, RAFT_BADROLE, "server is already voter"); return MUNIT_OK; } 
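/* Both the immediate effect and the later commit are tracked through the two
 * indexes checked by ASSERT_CONFIGURATION_INDEXES. The toy model below is an
 * illustration of that bookkeeping, not raft's code: */

struct configIndexesExample {
        unsigned committed;   /* like configuration_committed_index */
        unsigned uncommitted; /* like configuration_uncommitted_index */
};

/* A RAFT_CHANGE entry takes effect as soon as it is appended. */
MUNIT_UNUSED static void configChangeAppendedExample(
    struct configIndexesExample *s,
    unsigned index)
{
        s->uncommitted = index; /* The new configuration is in use already. */
}

/* Once the commit index reaches the entry, the change becomes permanent. */
MUNIT_UNUSED static void configCommitAdvancedExample(
    struct configIndexesExample *s,
    unsigned commit_index)
{
        if (s->uncommitted != 0 && commit_index >= s->uncommitted) {
                s->committed = s->uncommitted;
                s->uncommitted = 0;
        }
}

/* Starting from {4, 0}, appending a change at index 5 yields {4, 5}, exactly
 * what changeIsImmediate asserts for server 1; committing then yields
 * {5, 0}. */

/* Trying to assign a new role to a server while a configuration change is in
 * progress results in an error.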
*/ TEST(raft_assign, changeRequestAlreadyInProgress, setUp, tearDown, 0, NULL) { struct fixture *f = data; GROW; ADD(0, 3); ASSIGN_SUBMIT(0, 3, RAFT_VOTER); ASSIGN_ERROR(0, 3, RAFT_VOTER, RAFT_CANTCHANGE, "a configuration change is already in progress"); ASSIGN_WAIT; return MUNIT_OK; } /* If leadership is lost before the configuration change log entry for setting * the new server role is committed, the leader configuration gets rolled back * and the role of server being changed is reverted. */ TEST(raft_assign, leadershipLost, setUp, tearDown, 0, NULL) { struct fixture *f = data; const struct raft_server *server; /* TODO: fix */ return MUNIT_SKIP; GROW; ADD(0, 3); CLUSTER_STEP_N(2); ASSIGN_SUBMIT(0, 3, RAFT_VOTER); /* Server 3 is being considered as voting, even though the configuration * change is not committed yet. */ ASSERT_CATCH_UP_ROUND(0, 0, 0, 0); ASSERT_CONFIGURATION_INDEXES(0, 2, 3); server = configurationGet(&CLUSTER_RAFT(0)->configuration, 3); munit_assert_int(server->role, ==, RAFT_VOTER); /* Lose leadership. */ CLUSTER_DEPOSE; /* A new leader gets elected */ CLUSTER_ELECT(1); CLUSTER_STEP_N(5); /* Server 3 is not being considered voting anymore. */ server = configurationGet(&CLUSTER_RAFT(0)->configuration, 3); munit_assert_int(server->role, ==, RAFT_STANDBY); return MUNIT_OK; } /* Trying to assign the voter role to an unresponsive server eventually * fails. */ TEST(raft_assign, promoteUnresponsive, setUp, tearDown, 0, NULL) { struct fixture *f = data; CLUSTER_MAKE_PROGRESS; GROW; ADD(0, 3); ASSIGN_SUBMIT(0, 3, RAFT_VOTER); CLUSTER_KILL(2); ASSIGN_EXPECT(RAFT_NOCONNECTION); ASSIGN_WAIT; return MUNIT_OK; } /* Demote a voter node to stand-by. */ TEST(raft_assign, demoteToStandBy, setUp, tearDown, 0, NULL) { struct fixture *f = data; ASSIGN(0, 2, RAFT_STANDBY); return MUNIT_OK; } /* The leader can be demoted to stand-by and will no longer act as leader */ TEST(raft_assign, demoteLeader, setUp, tearDown, 0, NULL) { struct fixture *f = data; ASSIGN_SUBMIT(0, 1, RAFT_STANDBY); munit_assert_int(CLUSTER_LEADER, ==, 0); ASSIGN_WAIT; CLUSTER_STEP_UNTIL_HAS_LEADER(5000); munit_assert_int(CLUSTER_LEADER, !=, 0); return MUNIT_OK; } /* The leader can be demoted to spare and will no longer act as leader */ TEST(raft_assign, demoteLeaderToSpare, setUp, tearDown, 0, NULL) { struct fixture *f = data; ASSIGN_SUBMIT(0, 1, RAFT_SPARE); munit_assert_int(CLUSTER_LEADER, ==, 0); ASSIGN_WAIT; CLUSTER_STEP_UNTIL_HAS_LEADER(5000); munit_assert_int(CLUSTER_LEADER, !=, 0); return MUNIT_OK; } dqlite-1.16.7/test/raft/integration/test_barrier.c000066400000000000000000000046651465252713400222030ustar00rootroot00000000000000#include "../lib/cluster.h" #include "../lib/runner.h" /****************************************************************************** * * Fixture * *****************************************************************************/ struct fixture { FIXTURE_CLUSTER; }; static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) { struct fixture *f = munit_malloc(sizeof *f); SETUP_CLUSTER(2); CLUSTER_BOOTSTRAP; CLUSTER_START; CLUSTER_ELECT(0); return f; } static void tearDown(void *data) { struct fixture *f = data; TEAR_DOWN_CLUSTER; free(f); } /****************************************************************************** * * Helper macros * *****************************************************************************/ struct result { int status; bool done; }; static void barrierCbAssertResult(struct raft_barrier *req, int status) { struct result *result = req->data; 
munit_assert_int(status, ==, result->status); result->done = true; } static bool barrierCbHasFired(struct raft_fixture *f, void *arg) { struct result *result = arg; (void)f; return result->done; } /* Submit a barrier request. */ #define BARRIER_SUBMIT(I) \ struct raft_barrier _req; \ struct result _result = {0, false}; \ int _rv; \ _req.data = &_result; \ _rv = raft_barrier(CLUSTER_RAFT(I), &_req, barrierCbAssertResult); \ munit_assert_int(_rv, ==, 0); /* Expect the barrier callback to fire with the given status. */ #define BARRIER_EXPECT(STATUS) _result.status = STATUS /* Wait until the barrier request completes. */ #define BARRIER_WAIT CLUSTER_STEP_UNTIL(barrierCbHasFired, &_result, 2000) /* Submit to the I'th server a barrier request and wait for the operation to * succeed. */ #define BARRIER(I) \ do { \ BARRIER_SUBMIT(I); \ BARRIER_WAIT; \ } while (0) /****************************************************************************** * * Success scenarios * *****************************************************************************/ SUITE(raft_barrier) TEST(raft_barrier, cb, setUp, tearDown, 0, NULL) { struct fixture *f = data; BARRIER(0); return MUNIT_OK; } dqlite-1.16.7/test/raft/integration/test_bootstrap.c000066400000000000000000000025401465252713400225600ustar00rootroot00000000000000#include "../lib/cluster.h" #include "../lib/runner.h" /****************************************************************************** * * Fixture holding a pristine raft instance. * *****************************************************************************/ struct fixture { FIXTURE_CLUSTER; }; static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) { struct fixture *f = munit_malloc(sizeof *f); SETUP_CLUSTER(1); return f; } static void tearDown(void *data) { struct fixture *f = data; TEAR_DOWN_CLUSTER; free(f); } /****************************************************************************** * * Bootstrap tests. * *****************************************************************************/ SUITE(raft_bootstrap) /* Attempting to bootstrap an instance that's already started results in * RAFT_BUSY. */ TEST(raft_bootstrap, busy, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft *raft; struct raft_configuration configuration; int rv; /* Bootstrap and start the first server. */ CLUSTER_BOOTSTRAP_N_VOTING(1); CLUSTER_START; raft = CLUSTER_RAFT(0); CLUSTER_CONFIGURATION(&configuration); rv = raft_bootstrap(raft, &configuration); munit_assert_int(rv, ==, RAFT_BUSY); raft_configuration_close(&configuration); return MUNIT_OK; } dqlite-1.16.7/test/raft/integration/test_digest.c000066400000000000000000000005561465252713400220270ustar00rootroot00000000000000#include "../../../src/raft.h" #include "../lib/runner.h" SUITE(raft_digest) 
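/* dqlite derives the ID of its first node deterministically from the node's
 * network address via raft_digest(). The helper below merely names that
 * idiom; passing 0 as the second argument matches the test that follows, and
 * reading it as an extra seed input is an assumption of this sketch. */

MUNIT_UNUSED static unsigned long long bootstrapServerIdFor(
    const char *address)
{
        return raft_digest(address, 0);
}

/* bootstrapServerIdFor("127.0.0.1:65536") == 138882483, per the test below. */

/* Generation of the ID of the bootstrap dqlite node.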
*/ TEST(raft_digest, bootstrapServerId, NULL, NULL, 0, NULL) { const char *address = "127.0.0.1:65536"; unsigned long long id; id = raft_digest(address, 0); munit_assert_int(id, ==, 138882483); return MUNIT_OK; } dqlite-1.16.7/test/raft/integration/test_election.c000066400000000000000000000576701465252713400223630ustar00rootroot00000000000000#include "../../../src/raft/configuration.h" #include "../lib/cluster.h" #include "../lib/runner.h" /****************************************************************************** * * Fixture * *****************************************************************************/ struct fixture { FIXTURE_CLUSTER; }; static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) { struct fixture *f = munit_malloc(sizeof *f); unsigned i; SETUP_CLUSTER(2); CLUSTER_BOOTSTRAP; for (i = 0; i < CLUSTER_N; i++) { struct raft *raft = CLUSTER_RAFT(i); raft->data = f; } return f; } static void tearDown(void *data) { struct fixture *f = data; TEAR_DOWN_CLUSTER; free(f); } /****************************************************************************** * * Parameters * *****************************************************************************/ static char *cluster_5[] = {"5", NULL}; static MunitParameterEnum cluster_5_params[] = { {CLUSTER_N_PARAM, cluster_5}, {NULL, NULL}, }; static char *cluster_3[] = {"3", NULL}; static MunitParameterEnum cluster_3_params[] = { {CLUSTER_N_PARAM, cluster_3}, {NULL, NULL}, }; /****************************************************************************** * * Helper macros * *****************************************************************************/ /* Wait until the I'th server becomes candidate. */ #define STEP_UNTIL_CANDIDATE(I) \ CLUSTER_STEP_UNTIL_STATE_IS(I, RAFT_CANDIDATE, 2000) /* Wait until the I'th server becomes leader. */ #define STEP_UNTIL_LEADER(I) CLUSTER_STEP_UNTIL_STATE_IS(I, RAFT_LEADER, 2000) /****************************************************************************** * * Assertions * *****************************************************************************/ /* Assert that the I'th server is in follower state. */ #define ASSERT_FOLLOWER(I) munit_assert_int(CLUSTER_STATE(I), ==, RAFT_FOLLOWER) /* Assert that the I'th server is in candidate state. */ #define ASSERT_CANDIDATE(I) \ munit_assert_int(CLUSTER_STATE(I), ==, RAFT_CANDIDATE) /* Assert that the I'th server is in leader state. */ #define ASSERT_LEADER(I) munit_assert_int(CLUSTER_STATE(I), ==, RAFT_LEADER) /* Assert that the I'th server is unavailable. */ #define ASSERT_UNAVAILABLE(I) \ munit_assert_int(CLUSTER_STATE(I), ==, RAFT_UNAVAILABLE) /* Assert that the I'th server has voted for the server with the given ID. */ #define ASSERT_VOTED_FOR(I, ID) munit_assert_int(CLUSTER_VOTED_FOR(I), ==, ID) /* Assert that the I'th server has the given current term. */ #define ASSERT_TERM(I, TERM) \ { \ struct raft *raft_ = CLUSTER_RAFT(I); \ munit_assert_int(raft_->current_term, ==, TERM); \ } /* Assert that the fixture time matches the given value */ #define ASSERT_TIME(TIME) munit_assert_int(CLUSTER_TIME, ==, TIME) /****************************************************************************** * * Successful election round * *****************************************************************************/ SUITE(election) /* Test an election round with two voters. */ TEST(election, twoVoters, setUp, tearDown, 0, NULL) { struct fixture *f = data; (void)params; CLUSTER_START; /* The first server eventually times out and converts to candidate. 
*/ STEP_UNTIL_CANDIDATE(0); ASSERT_TIME(1000); CLUSTER_STEP; /* Server 1 tick */ ASSERT_FOLLOWER(1); CLUSTER_STEP; /* Server 0 completes sending a RequestVote RPC */ CLUSTER_STEP; /* Server 1 receives RequestVote RPC */ ASSERT_VOTED_FOR(1, 1); ASSERT_TIME(1015); CLUSTER_STEP; /* Server 1 completes sending RequestVote RPC */ CLUSTER_STEP; /* Server 1 receives RequestVote RPC result */ ASSERT_LEADER(0); ASSERT_TIME(1030); return MUNIT_OK; } /* If we have already voted and the same candidate requests the vote again, the * vote is granted. */ TEST(election, grantAgain, setUp, tearDown, 0, NULL) { struct fixture *f = data; (void)params; raft_fixture_set_randomized_election_timeout(&f->cluster, 1, 10000); raft_set_election_timeout(CLUSTER_RAFT(1), 10000); CLUSTER_START; /* The first server converts to candidate. */ STEP_UNTIL_CANDIDATE(0); ASSERT_TIME(1000); CLUSTER_STEP; /* Server 1 tick */ ASSERT_FOLLOWER(1); /* Disconnect the second server, so the first server does not receive the * result and eventually starts a new election round. */ CLUSTER_SATURATE_BOTHWAYS(0, 1); CLUSTER_STEP_UNTIL_TERM_IS(0, 3, 2000); ASSERT_CANDIDATE(0); ASSERT_TIME(2000); /* Reconnecting the two servers eventually makes the first server win the * election. */ CLUSTER_DESATURATE_BOTHWAYS(0, 1); STEP_UNTIL_LEADER(0); ASSERT_TIME(2030); return MUNIT_OK; } /* If the requester last log entry index is the same, the vote is granted. */ TEST(election, grantIfLastIndexIsSame, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_entry entry1; struct raft_entry entry2; (void)params; entry1.type = RAFT_COMMAND; entry1.term = 1; FsmEncodeSetX(1, &entry1.buf); entry2.type = RAFT_COMMAND; entry2.term = 1; FsmEncodeSetX(1, &entry2.buf); CLUSTER_ADD_ENTRY(0, &entry1); CLUSTER_ADD_ENTRY(1, &entry2); CLUSTER_SET_TERM(1, 2); CLUSTER_START; /* The first server converts to candidate. */ STEP_UNTIL_CANDIDATE(0); /* The first server eventually receives a RequestVote result RPC and * converts to leader */ STEP_UNTIL_LEADER(0); ASSERT_TIME(1030); return MUNIT_OK; } /* If the requester last log entry index is higher, the vote is granted. */ TEST(election, grantIfLastIndexIsHigher, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_entry entry; (void)params; entry.type = RAFT_COMMAND; entry.term = 1; FsmEncodeSetX(1, &entry.buf); CLUSTER_ADD_ENTRY(0, &entry); CLUSTER_SET_TERM(1, 2); CLUSTER_START; /* The first server converts to candidate. */ STEP_UNTIL_CANDIDATE(0); /* The second server grants its vote. */ CLUSTER_STEP_UNTIL_VOTED_FOR(1, 0, 2000); /* The first server receives a RequestVote result RPC and converts to * leader */ CLUSTER_STEP_N(2); ASSERT_LEADER(0); return MUNIT_OK; } /* If a candidate receives a vote request response granting the vote but the * quorum is not reached, it stays candidate. */ TEST(election, waitQuorum, setUp, tearDown, 0, cluster_5_params) { struct fixture *f = data; (void)params; CLUSTER_START; /* The first server converts to candidate. */ STEP_UNTIL_CANDIDATE(0); /* All servers grant their vote. */ CLUSTER_STEP_UNTIL_VOTED_FOR(1, 0, 2000); CLUSTER_STEP_UNTIL_VOTED_FOR(2, 0, 2000); CLUSTER_STEP_UNTIL_VOTED_FOR(3, 0, 2000); CLUSTER_STEP_UNTIL_VOTED_FOR(4, 0, 2000); ASSERT_TIME(1015); /* The first server receives the first RequestVote result RPC but stays * candidate since it has only 2 votes, and 3 are required. 
*/ CLUSTER_STEP_N(4); /* Send completes on all other servers */ CLUSTER_STEP; /* First message is delivered */ ASSERT_TIME(1030); ASSERT_CANDIDATE(0); /* Eventually we are elected */ CLUSTER_STEP; /* Second message is delivered */ ASSERT_LEADER(0); /* Server 0 reaches the quorum */ ASSERT_TIME(1030); return MUNIT_OK; } /* The vote request gets rejected if our term is higher. */ TEST(election, rejectIfHigherTerm, setUp, tearDown, 0, NULL) { struct fixture *f = data; (void)params; CLUSTER_SET_TERM(1, 3); CLUSTER_START; /* The first server converts to candidate. */ STEP_UNTIL_CANDIDATE(0); CLUSTER_STEP_N(3); /* Server 1 tick and RequestVote send/delivery */ /* The second server receives a RequestVote RPC and rejects the vote for the * first server. */ ASSERT_VOTED_FOR(1, 0); CLUSTER_STEP_N(2); /* RequestVote result send/delivery */ /* The first server receives the RequestVote result RPC and converts to * follower because it discovers the newer term. */ ASSERT_FOLLOWER(0); return 0; } /* If the server already has a leader, the vote is not granted (even if the * request has a higher term). */ TEST(election, rejectIfHasLeader, setUp, tearDown, 0, cluster_3_params) { struct fixture *f = data; (void)params; CLUSTER_START; /* Server 0 wins the elections. */ STEP_UNTIL_LEADER(0); /* Server 2 gets disconnected and becomes candidate. */ CLUSTER_SATURATE_BOTHWAYS(0, 2); STEP_UNTIL_CANDIDATE(2); /* Server 2 stays candidate since its requests get rejected. */ CLUSTER_STEP_N(20); ASSERT_CANDIDATE(2); return MUNIT_OK; } /* If a server has already voted, vote is not granted. */ TEST(election, rejectIfAlreadyVoted, setUp, tearDown, 0, cluster_3_params) { struct fixture *f = data; (void)params; /* Disconnect server 1 from server 0 and change its randomized election * timeout to match the one of server 0. This way server 1 will convert to * candidate but not receive vote requests. */ raft_fixture_set_randomized_election_timeout(&f->cluster, 1, 1000); CLUSTER_SATURATE_BOTHWAYS(0, 1); CLUSTER_START; /* Server 0 and server 1 both become candidates. */ STEP_UNTIL_CANDIDATE(0); STEP_UNTIL_CANDIDATE(1); ASSERT_TIME(1000); /* Server 2 receives the vote request from server 0 and grants it. */ CLUSTER_STEP_UNTIL_VOTED_FOR(2, 0, 2000); ASSERT_TIME(1015); /* Server 0 receives the vote result from server 2 and becomes leader. */ STEP_UNTIL_LEADER(0); ASSERT_TIME(1030); /* Server 1 is still candidate because its vote request got rejected. */ ASSERT_CANDIDATE(1); return MUNIT_OK; } /* If the requester last log entry term is lower than ours, the vote is not * granted. */ TEST(election, rejectIfLastTermIsLower, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_entry entry1; struct raft_entry entry2; (void)params; entry1.type = RAFT_COMMAND; entry1.term = 1; FsmEncodeSetX(123, &entry1.buf); entry2.type = RAFT_COMMAND; entry2.term = 2; FsmEncodeSetX(456, &entry2.buf); CLUSTER_ADD_ENTRY(0, &entry1); CLUSTER_ADD_ENTRY(1, &entry2); CLUSTER_START; /* The first server becomes candidate. */ STEP_UNTIL_CANDIDATE(0); ASSERT_TIME(1000); /* The second server receives a RequestVote RPC and rejects the vote for the * first server. */ CLUSTER_STEP_UNTIL_DELIVERED(0, 1, 100); ASSERT_VOTED_FOR(1, 0); ASSERT_TIME(1015); /* The first server receives the response and stays candidate. */ CLUSTER_STEP_UNTIL_DELIVERED(1, 0, 100); ASSERT_CANDIDATE(0); ASSERT_TIME(1030); /* Eventually the second server becomes leader because it has a longer * log. 
*/ STEP_UNTIL_LEADER(1); ASSERT_TIME(1130); return MUNIT_OK; } /* If the requester's last log entry index is lower than ours, the vote is not * granted. */ TEST(election, rejectIfLastIndexIsLower, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_entry entry; (void)params; entry.type = RAFT_COMMAND; entry.term = 2; FsmEncodeSetX(123, &entry.buf); CLUSTER_ADD_ENTRY(1, &entry); CLUSTER_START; /* The first server becomes candidate. */ STEP_UNTIL_CANDIDATE(0); ASSERT_TIME(1000); /* The second server receives a RequestVote RPC and rejects the vote for the * first server. */ CLUSTER_STEP_UNTIL_DELIVERED(0, 1, 100); ASSERT_VOTED_FOR(1, 0); ASSERT_TIME(1015); /* The first server receives the response and stays candidate. */ CLUSTER_STEP_UNTIL_DELIVERED(1, 0, 100); ASSERT_CANDIDATE(0); ASSERT_TIME(1030); /* Eventually the second server becomes leader because it has a longer * log. */ STEP_UNTIL_LEADER(1); ASSERT_TIME(1130); return MUNIT_OK; } static char *reject_not_voting_n[] = {"3", NULL}; static char *reject_not_voting_n_voting[] = {"2", NULL}; static MunitParameterEnum reject_not_voting_params[] = { {CLUSTER_N_PARAM, reject_not_voting_n}, {CLUSTER_N_VOTING_PARAM, reject_not_voting_n_voting}, {NULL, NULL}, }; /* If we are not a voting server, the vote is not granted. */ TEST(election, rejectIfNotVoter, setUp, tearDown, 0, reject_not_voting_params) { struct fixture *f = data; /* Disconnect server 0 from server 1, so server 0 can't win the elections * (since there are only 2 voting servers). */ CLUSTER_SATURATE_BOTHWAYS(0, 1); CLUSTER_START; /* Server 0 becomes candidate. */ STEP_UNTIL_CANDIDATE(0); ASSERT_TIME(1000); /* Server 0 stays candidate because it can't reach a quorum. */ CLUSTER_STEP_UNTIL_TERM_IS(0, 3, 2000); ASSERT_CANDIDATE(0); ASSERT_TIME(2000); return MUNIT_OK; } /* If a candidate server receives a response indicating that the vote was not * granted, nothing happens (e.g. the server has already voted for someone * else). */ TEST(election, receiveRejectResult, setUp, tearDown, 0, cluster_5_params) { struct fixture *f = data; (void)params; /* Lower the randomized election timeout of server 4, so it becomes * candidate just after server 0 */ raft_fixture_set_randomized_election_timeout(&f->cluster, 4, 1020); /* Disconnect server 0 from all others except server 1. */ CLUSTER_SATURATE_BOTHWAYS(0, 2); CLUSTER_SATURATE_BOTHWAYS(0, 3); CLUSTER_SATURATE_BOTHWAYS(0, 4); /* Disconnect server 4 from all others except server 1. */ CLUSTER_SATURATE_BOTHWAYS(4, 0); CLUSTER_SATURATE_BOTHWAYS(4, 2); CLUSTER_SATURATE_BOTHWAYS(4, 3); CLUSTER_START; /* Server 0 becomes candidate, server 4 is still follower. */ STEP_UNTIL_CANDIDATE(0); ASSERT_TIME(1000); ASSERT_FOLLOWER(4); /* Server 1 receives a RequestVote RPC and grants its vote. */ CLUSTER_STEP_UNTIL_DELIVERED(0, 1, 100); ASSERT_TIME(1015); ASSERT_VOTED_FOR(1, 1); ASSERT_CANDIDATE(0); ASSERT_FOLLOWER(4); /* Disconnect server 0 from server 1, so it doesn't receive further * messages. */ CLUSTER_SATURATE_BOTHWAYS(0, 1); /* Server 4 eventually becomes candidate */ STEP_UNTIL_CANDIDATE(4); ASSERT_TIME(1100); ASSERT_CANDIDATE(0); /* The second server receives a RequestVote RPC but rejects its vote since * it has already voted. */ CLUSTER_STEP_UNTIL_DELIVERED(4, 0, 100); ASSERT_VOTED_FOR(1, 1); ASSERT_CANDIDATE(0); ASSERT_CANDIDATE(4); return MUNIT_OK; } 
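/* The vote arithmetic behind waitQuorum and receiveRejectResult above:
 * winning an election requires a strict majority of the voters, n / 2 + 1,
 * with the candidate's own vote counting towards it. A helper makes the
 * numbers explicit (an illustration; the tests compute none of this): */

MUNIT_UNUSED static unsigned electionQuorumExample(unsigned n_voters)
{
        return n_voters / 2 + 1;
}

/* With 5 voters the quorum is 3: a candidate holding its own vote plus a
 * single grant stays candidate, and the second grant makes it leader. With 2
 * voters the quorum is 2, so both votes are required. */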
/* An I/O error occurs when persisting the new term during the conversion to * candidate. */ TEST(election, ioErrorConvertTerm, setUp, tearDown, 0, NULL) { struct fixture *f = data; CLUSTER_START; raft_fixture_term_fault(&f->cluster, 0, 0); CLUSTER_STEP_UNTIL_STATE_IS(0, RAFT_UNAVAILABLE, 2000); return MUNIT_OK; } /* An I/O error occurs when persisting the vote during the conversion to * candidate. */ TEST(election, ioErrorConvertVote, setUp, tearDown, 0, NULL) { struct fixture *f = data; CLUSTER_START; raft_fixture_vote_fault(&f->cluster, 0, 0); CLUSTER_STEP_UNTIL_STATE_IS(0, RAFT_UNAVAILABLE, 2000); return MUNIT_OK; } /* The I/O error occurs when sending a vote request, and gets ignored. */ TEST(election, ioErrorSendVoteRequest, setUp, tearDown, 0, NULL) { struct fixture *f = data; CLUSTER_START; /* The first server fails to send a RequestVote RPC. */ raft_fixture_send_fault(&f->cluster, 0, 0); CLUSTER_STEP_UNTIL_STATE_IS(0, RAFT_LEADER, 5000); return MUNIT_OK; } /* The I/O error occurs when the second node tries to persist its vote. */ TEST(election, ioErrorPersistVote, setUp, tearDown, 0, NULL) { struct fixture *f = data; CLUSTER_START; /* The first server becomes candidate. */ CLUSTER_STEP_UNTIL_STATE_IS(0, RAFT_CANDIDATE, 2000); /* The second server receives a RequestVote RPC but fails to persist its * vote. */ raft_fixture_vote_fault(&f->cluster, 1, 0); CLUSTER_STEP_UNTIL_STATE_IS(1, RAFT_UNAVAILABLE, 1000); return MUNIT_OK; } /* Test an election round with two voters and pre-vote. */ TEST(election, preVote, setUp, tearDown, 0, NULL) { struct fixture *f = data; raft_set_pre_vote(CLUSTER_RAFT(0), true); raft_set_pre_vote(CLUSTER_RAFT(1), true); CLUSTER_START; /* The first server eventually times out and converts to candidate, but it * does not increment its term yet. */ STEP_UNTIL_CANDIDATE(0); ASSERT_TIME(1000); ASSERT_TERM(0, 1); CLUSTER_STEP; /* Server 1 tick */ ASSERT_FOLLOWER(1); CLUSTER_STEP; /* Server 0 completes sending a pre-vote RequestVote RPC */ CLUSTER_STEP; /* Server 1 receives the pre-vote RequestVote RPC */ ASSERT_TERM(1, 1); /* Server 1 does not increment its term */ ASSERT_VOTED_FOR(1, 0); /* Server 1 does not persist its vote */ ASSERT_TIME(1015); CLUSTER_STEP; /* Server 1 completes sending pre-vote RequestVote result */ CLUSTER_STEP; /* Server 0 receives the pre-vote RequestVote result */ ASSERT_CANDIDATE(0); ASSERT_TERM(0, 2); /* Server 0 has now incremented its term. */ ASSERT_TIME(1030); CLUSTER_STEP; /* Server 0 completes sending an actual RequestVote RPC */ CLUSTER_STEP; /* Server 1 receives the actual RequestVote RPC */ ASSERT_TERM(1, 2); /* Server 1 does increment its term. */ ASSERT_VOTED_FOR(1, 1); /* Server 1 does persist its vote */ CLUSTER_STEP; /* Server 1 completes sending actual RequestVote result */ CLUSTER_STEP; /* Server 0 receives the actual RequestVote result */ ASSERT_LEADER(0); return MUNIT_OK; } 
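/* The pre-vote rules stepped through above can be summarized in a few lines:
 * a pre-vote grant bumps no term and persists nothing, while a real vote
 * bumps the term and is persisted before replying. The model below is an
 * illustration, not raft's code: */

struct preVoteFollowerExample {
        unsigned term;      /* persisted */
        unsigned voted_for; /* persisted; 0 means none */
};

MUNIT_UNUSED static bool preVoteExampleGrant(struct preVoteFollowerExample *f,
                                             unsigned candidate_id,
                                             unsigned candidate_term,
                                             bool pre_vote)
{
        if (candidate_term < f->term) {
                return false;
        }
        if (pre_vote) {
                return true; /* No term bump, no persisted vote. */
        }
        if (candidate_term > f->term) {
                f->term = candidate_term;
                f->voted_for = 0;
        }
        if (f->voted_for != 0 && f->voted_for != candidate_id) {
                return false; /* Already voted for someone else. */
        }
        f->voted_for = candidate_id; /* Persisted before replying. */
        return true;
}

/* Starting from {1, 0}: a pre-vote for candidate 1 at term 1 is granted and
 * leaves {1, 0} untouched; the real vote at term 2 then yields {2, 1}, the
 * same transitions asserted in the preVote test. */

/* A candidate receives votes then crashes.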
*/ TEST(election, preVoteWithcandidateCrash, setUp, tearDown, 0, cluster_3_params) { struct fixture *f = data; raft_set_pre_vote(CLUSTER_RAFT(0), true); raft_set_pre_vote(CLUSTER_RAFT(1), true); raft_set_pre_vote(CLUSTER_RAFT(2), true); CLUSTER_START; /* The first server eventually times out and converts to candidate, but it * does not increment its term yet. */ STEP_UNTIL_CANDIDATE(0); ASSERT_TIME(1000); ASSERT_TERM(0, 1); /* Servers 1 and 2 tick */ CLUSTER_STEP_N(2); ASSERT_FOLLOWER(1); ASSERT_FOLLOWER(2); /* Server 0 completes sending pre-vote RequestVote RPCs */ CLUSTER_STEP_N(2); CLUSTER_STEP; /* Server 1 receives the pre-vote RequestVote RPC */ ASSERT_TERM(1, 1); /* Server 1 does not increment its term */ ASSERT_VOTED_FOR(1, 0); /* Server 1 does not persist its vote */ ASSERT_TIME(1015); CLUSTER_STEP; /* Server 2 receives the pre-vote RequestVote RPC */ ASSERT_TERM(2, 1); /* Server 2 does not increment its term */ ASSERT_VOTED_FOR(2, 0); /* Server 2 does not persist its vote */ ASSERT_TIME(1015); /* Servers 1 and 2 complete sending pre-vote RequestVote results */ CLUSTER_STEP_N(2); /* Server 0 receives the pre-vote RequestVote results */ CLUSTER_STEP_N(2); ASSERT_CANDIDATE(0); ASSERT_TERM(0, 2); /* Server 0 has now incremented its term. */ ASSERT_TIME(1030); /* Server 0 completes sending actual RequestVote RPCs */ CLUSTER_STEP_N(2); CLUSTER_STEP; /* Server 1 receives the actual RequestVote RPC */ ASSERT_TERM(1, 2); /* Server 1 does increment its term. */ ASSERT_VOTED_FOR(1, 1); /* Server 1 does persist its vote */ CLUSTER_STEP; /* Server 2 receives the actual RequestVote RPC */ ASSERT_TERM(2, 2); /* Server 2 does increment its term. */ ASSERT_VOTED_FOR(2, 1); /* Server 2 does persist its vote */ /* Server 0 crashes. */ CLUSTER_KILL(0); /* Server 1 times out and starts an election. * It doesn't increment its term */ STEP_UNTIL_CANDIDATE(1); ASSERT_TIME(2200); ASSERT_TERM(1, 2); /* Server 1 completes sending the pre-vote RequestVote RPCs and server 2 has * received those RPCs. * Since server 2 has no current leader (the leader crashed before sending a * HeartBeat), it will grant its vote to server 1, but will not persist it * due to pre-vote; its persisted vote is still for server 0 (id 1) */ CLUSTER_STEP_N(5); ASSERT_TERM(2, 2); /* Server 2 does not increment its term */ ASSERT_VOTED_FOR(2, 1); /* Server 1 receives the pre-vote RequestVote Result */ CLUSTER_STEP_N(2); /* Server 1 increments its term to start a non-pre-vote election */ ASSERT_TERM(1, 3); /* Server 1 has now incremented its term. */ ASSERT_VOTED_FOR(1, 2); /* Server 1 has persisted its vote */ ASSERT_TIME(2230); /* Server 1 completes sending actual RequestVote RPCs */ CLUSTER_STEP_N(2); /* Server 2 receives the actual RequestVote RPCs */ CLUSTER_STEP_N(2); ASSERT_VOTED_FOR(2, 2); /* Server 2 persists its vote */ /* Server 1 receives the RequestVote results and becomes leader */ CLUSTER_STEP_N(2); ASSERT_LEADER(1); return MUNIT_OK; } /* Ensure delayed pre-vote responses are not counted towards the real election * quorum. */ TEST(election, preVoteNoStaleVotes, setUp, tearDown, 0, cluster_3_params) { struct fixture *f = data; raft_set_pre_vote(CLUSTER_RAFT(0), true); raft_set_pre_vote(CLUSTER_RAFT(1), true); raft_set_pre_vote(CLUSTER_RAFT(2), true); /* Server 2 is 1 term ahead of the other servers; this will allow it to send * stale pre-vote responses that pass the term checks. 
*/ CLUSTER_SET_TERM(2, 2); CLUSTER_START; /* The first server eventually times out and converts to candidate, but it * does not increment its term yet. */ STEP_UNTIL_CANDIDATE(0); ASSERT_TIME(1000); ASSERT_TERM(0, 1); /* Servers 1 and 2 tick */ CLUSTER_STEP_N(2); ASSERT_FOLLOWER(1); ASSERT_FOLLOWER(2); /* Server 0 completes sending pre-vote RequestVote RPCs */ CLUSTER_STEP_N(2); CLUSTER_STEP; /* Server 1 receives the pre-vote RequestVote RPC */ ASSERT_TERM(1, 1); /* Server 1 does not increment its term */ ASSERT_VOTED_FOR(1, 0); /* Server 1 does not persist its vote */ ASSERT_TIME(1015); CLUSTER_STEP; /* Server 2 receives the pre-vote RequestVote RPC */ ASSERT_TERM(2, 2); /* Server 2 does not increment its term */ ASSERT_VOTED_FOR(2, 0); /* Server 2 does not persist its vote */ ASSERT_TIME(1015); /* Slow down responses of Server 2 */ CLUSTER_SET_NETWORK_LATENCY(2, 100); /* Server 1 completes sending pre-vote RequestVote results */ CLUSTER_STEP_N(2); /* Server 0 receives the pre-vote RequestVote results */ CLUSTER_STEP_N(2); ASSERT_CANDIDATE(0); ASSERT_TERM(0, 2); /* Server 0 has now incremented its term. */ ASSERT_TIME(1030); /* Don't send messages from 0; this ensures no real RequestVote RPCs are * sent */ CLUSTER_SATURATE(0, 1); CLUSTER_SATURATE(0, 2); /* Wait until all messages from 2 to 0 are delivered */ CLUSTER_STEP_UNTIL_DELIVERED(2, 0, 100); /* Make sure we haven't counted the pre-vote result as a real vote */ ASSERT_CANDIDATE(0); return MUNIT_OK; } /* A follower doesn't convert to candidate while waiting for log entries to be * persisted. */ TEST(election, inFlightAppendBlocksCandidacy, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_apply req; /* Server 1 takes a long time to persist entries. */ CLUSTER_SET_DISK_LATENCY(1, 10000); CLUSTER_START; /* Server 0 is the leader. It replicates a log entry. */ CLUSTER_ELECT(0); CLUSTER_APPLY_ADD_X(0, &req, 1, NULL); /* Server 1 receives the entry. */ CLUSTER_STEP_UNTIL_DELIVERED(0, 1, 1000); /* Contact is lost between servers 0 and 1. */ CLUSTER_SATURATE(0, 1); CLUSTER_SATURATE(1, 0); /* Several election timeouts lapse, but server 1 does not become a * candidate, because it's waiting for the entry to be persisted. */ CLUSTER_STEP_UNTIL_ELAPSED(5000); munit_assert_int(CLUSTER_STATE(1), ==, RAFT_FOLLOWER); /* Eventually, server 1 finishes persisting the entry and becomes a * candidate. 
*/ CLUSTER_STEP_UNTIL_STATE_IS(1, RAFT_CANDIDATE, 10000); return MUNIT_OK; } dqlite-1.16.7/test/raft/integration/test_fixture.c000066400000000000000000000222561465252713400222370ustar00rootroot00000000000000#include "../../../src/raft.h" #include "../lib/fsm.h" #include "../lib/heap.h" #include "../lib/runner.h" #define N_SERVERS 3 /****************************************************************************** * * Fixture * *****************************************************************************/ struct fixture { FIXTURE_HEAP; struct raft_fsm fsms[N_SERVERS]; struct raft_fixture fixture; }; static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) { struct fixture *f = munit_calloc(1, sizeof *f); struct raft_configuration configuration; unsigned i; int rc; SET_UP_HEAP; for (i = 0; i < N_SERVERS; i++) { FsmInit(&f->fsms[i], 2); } rc = raft_fixture_init(&f->fixture); munit_assert_int(rc, ==, 0); for (i = 0; i < N_SERVERS; i++) { rc = raft_fixture_grow(&f->fixture, &f->fsms[i]); munit_assert_int(rc, ==, 0); } rc = raft_fixture_configuration(&f->fixture, N_SERVERS, &configuration); munit_assert_int(rc, ==, 0); rc = raft_fixture_bootstrap(&f->fixture, &configuration); munit_assert_int(rc, ==, 0); raft_configuration_close(&configuration); rc = raft_fixture_start(&f->fixture); munit_assert_int(rc, ==, 0); return f; } static void tearDown(void *data) { struct fixture *f = data; unsigned i; raft_fixture_close(&f->fixture); for (i = 0; i < N_SERVERS; i++) { FsmClose(&f->fsms[i]); } TEAR_DOWN_HEAP; free(f); } /****************************************************************************** * * Helper macros * *****************************************************************************/ #define GET(I) raft_fixture_get(&f->fixture, I) #define STEP raft_fixture_step(&f->fixture) #define STEP_N(N) raft_fixture_step_n(&f->fixture, N) #define STEP_UNTIL_STATE_IS(I, STATE) \ { \ bool done_; \ done_ = raft_fixture_step_until_state_is(&f->fixture, I, STATE, 2000); \ munit_assert_true(done_); \ } #define STATE(I) raft_state(GET(I)) #define ELECT(I) raft_fixture_elect(&f->fixture, I) #define DEPOSE raft_fixture_depose(&f->fixture) #define APPLY(I, REQ) \ { \ struct raft_buffer buf; \ int rc; \ FsmEncodeAddX(1, &buf); \ rc = raft_apply(GET(I), REQ, &buf, NULL, 1, NULL); \ munit_assert_int(rc, ==, 0); \ } #define STEP_UNTIL_APPLIED(INDEX) \ raft_fixture_step_until_applied(&f->fixture, N_SERVERS, INDEX, INDEX * 1000) /****************************************************************************** * * Assertions * *****************************************************************************/ /* Assert that the fixture time matches the given value */ #define ASSERT_TIME(TIME) \ munit_assert_int(raft_fixture_time(&f->fixture), ==, TIME) /* Assert that the I'th server is in the given state. */ #define ASSERT_STATE(I, S) munit_assert_int(STATE(I), ==, S) /* Assert that the x field of the FSM with the given index matches the given * value. */ #define ASSERT_FSM_X(I, VALUE) munit_assert_int(FsmGetX(&f->fsms[I]), ==, VALUE) /****************************************************************************** * * raft_fixture_step * *****************************************************************************/ SUITE(raft_fixture_step) /* If there is no disk I/O in progress or network messages in flight, the tick * callbacks are called. 
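 *
 * Driving the loop by hand follows the same pattern as the STEP and STEP_N
 * macros: each call to raft_fixture_step() fires exactly one event. The
 * helper below is an illustrative sketch built only from fixture calls used
 * elsewhere in this file; the tests do not rely on it:
 */

MUNIT_UNUSED static unsigned countTicksUntil(struct raft_fixture *fixture,
                                             raft_time deadline)
{
        unsigned n_ticks = 0;
        while (raft_fixture_time(fixture) < deadline) {
                struct raft_fixture_event *event = raft_fixture_step(fixture);
                if (raft_fixture_event_type(event) == RAFT_FIXTURE_TICK) {
                        n_ticks++;
                }
        }
        return n_ticks;
}

/* With no disk I/O in progress or network messages in flight, each step
 * fires the next tick callback, in server order.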
*/ TEST(raft_fixture_step, tick, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_fixture_event *event; (void)params; ASSERT_TIME(0); event = STEP; munit_assert_int(raft_fixture_event_server_index(event), ==, 0); munit_assert_int(raft_fixture_event_type(event), ==, RAFT_FIXTURE_TICK); ASSERT_TIME(100); event = STEP; munit_assert_int(raft_fixture_event_server_index(event), ==, 1); munit_assert_int(raft_fixture_event_type(event), ==, RAFT_FIXTURE_TICK); ASSERT_TIME(100); event = STEP; munit_assert_int(raft_fixture_event_server_index(event), ==, 2); munit_assert_int(raft_fixture_event_type(event), ==, RAFT_FIXTURE_TICK); ASSERT_TIME(100); event = STEP; munit_assert_int(raft_fixture_event_server_index(event), ==, 0); munit_assert_int(raft_fixture_event_type(event), ==, RAFT_FIXTURE_TICK); ASSERT_TIME(200); return MUNIT_OK; } /* By default the election timeout of server 0 is the first to expire. */ TEST(raft_fixture_step, electionTimeout, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_fixture_event *event; (void)params; event = STEP_N(28); munit_assert_int(raft_fixture_event_server_index(event), ==, 0); munit_assert_int(raft_fixture_event_type(event), ==, RAFT_FIXTURE_TICK); ASSERT_TIME(1000); ASSERT_STATE(0, RAFT_CANDIDATE); ASSERT_STATE(1, RAFT_FOLLOWER); ASSERT_STATE(2, RAFT_FOLLOWER); munit_log(MUNIT_LOG_INFO, "done"); return MUNIT_OK; } /* Send requests are flushed immediately. */ TEST(raft_fixture_step, flushSend, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_fixture_event *event; (void)params; STEP_UNTIL_STATE_IS(0, RAFT_CANDIDATE); event = STEP; munit_assert_int(raft_fixture_event_server_index(event), ==, 0); munit_assert_int(raft_fixture_event_type(event), ==, RAFT_FIXTURE_NETWORK); ASSERT_TIME(1000); event = STEP; munit_assert_int(raft_fixture_event_server_index(event), ==, 0); munit_assert_int(raft_fixture_event_type(event), ==, RAFT_FIXTURE_NETWORK); ASSERT_TIME(1000); return MUNIT_OK; } /* Messages are delivered according to the current network latency. */ TEST(raft_fixture_step, deliver, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_fixture_event *event; (void)params; STEP_UNTIL_STATE_IS(0, RAFT_CANDIDATE); /* Server 0 starts election */ STEP_N(2); /* Server 0 sends 2 RequestVote */ STEP_N(2); /* Ticks for server 1 and 2 */ ASSERT_TIME(1000); event = STEP; munit_assert_int(raft_fixture_event_server_index(event), ==, 0); munit_assert_int(raft_fixture_event_type(event), ==, RAFT_FIXTURE_NETWORK); ASSERT_TIME(1015); return MUNIT_OK; } /****************************************************************************** * * raft_fixture_elect * *****************************************************************************/ SUITE(raft_fixture_elect) /* Trigger the election of the first server. */ TEST(raft_fixture_elect, first, setUp, tearDown, 0, NULL) { struct fixture *f = data; ELECT(0); ASSERT_STATE(0, RAFT_LEADER); ASSERT_STATE(1, RAFT_FOLLOWER); ASSERT_STATE(2, RAFT_FOLLOWER); return MUNIT_OK; } /* Trigger the election of the second server. */ TEST(raft_fixture_elect, second, setUp, tearDown, 0, NULL) { struct fixture *f = data; ELECT(1); ASSERT_STATE(0, RAFT_FOLLOWER); ASSERT_STATE(1, RAFT_LEADER); ASSERT_STATE(2, RAFT_FOLLOWER); return MUNIT_OK; } /* Trigger an election change. 
*/ TEST(raft_fixture_elect, change, setUp, tearDown, 0, NULL) { struct fixture *f = data; ELECT(0); DEPOSE; ASSERT_STATE(0, RAFT_FOLLOWER); ASSERT_STATE(1, RAFT_FOLLOWER); ASSERT_STATE(2, RAFT_FOLLOWER); ELECT(1); ASSERT_STATE(0, RAFT_FOLLOWER); ASSERT_STATE(1, RAFT_LEADER); ASSERT_STATE(2, RAFT_FOLLOWER); return MUNIT_OK; } /* Trigger an election that re-elects the same node. */ TEST(raft_fixture_elect, again, setUp, tearDown, 0, NULL) { struct fixture *f = data; ELECT(0); DEPOSE; ASSERT_STATE(0, RAFT_FOLLOWER); ASSERT_STATE(1, RAFT_FOLLOWER); ASSERT_STATE(2, RAFT_FOLLOWER); ELECT(0); ASSERT_STATE(0, RAFT_LEADER); ASSERT_STATE(1, RAFT_FOLLOWER); ASSERT_STATE(2, RAFT_FOLLOWER); return MUNIT_OK; } /****************************************************************************** * * raft_fixture_step_until_applied * *****************************************************************************/ SUITE(raft_fixture_step_until_applied) /* Wait for one entry to be applied. */ TEST(raft_fixture_step_until_applied, one, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_apply *req = munit_malloc(sizeof *req); ELECT(0); APPLY(0, req); STEP_UNTIL_APPLIED(3); ASSERT_FSM_X(0, 1); ASSERT_FSM_X(1, 1); ASSERT_FSM_X(2, 1); free(req); return MUNIT_OK; } /* Wait for two entries to be applied. */ TEST(raft_fixture_step_until_applied, two, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_apply *req1 = munit_malloc(sizeof *req1); struct raft_apply *req2 = munit_malloc(sizeof *req2); ELECT(0); APPLY(0, req1); APPLY(0, req2); STEP_UNTIL_APPLIED(4); ASSERT_FSM_X(0, 2); ASSERT_FSM_X(1, 2); ASSERT_FSM_X(2, 2); free(req1); free(req2); return MUNIT_OK; } dqlite-1.16.7/test/raft/integration/test_heap.c000066400000000000000000000022231465252713400214560ustar00rootroot00000000000000#include "../../../src/raft.h" #include "../lib/runner.h" /****************************************************************************** * * Default heap functions * *****************************************************************************/ SUITE(raft_heap) TEST(raft_heap, malloc, NULL, NULL, 0, NULL) { void *p; p = raft_malloc(8); munit_assert_ptr_not_null(p); raft_free(p); return MUNIT_OK; } TEST(raft_heap, calloc, NULL, NULL, 0, NULL) { void *p; p = raft_calloc(1, 8); munit_assert_ptr_not_null(p); munit_assert_int(*(uint64_t *)p, ==, 0); raft_free(p); return MUNIT_OK; } TEST(raft_heap, realloc, NULL, NULL, 0, NULL) { void *p; p = raft_realloc(NULL, 8); munit_assert_ptr_not_null(p); *(uint64_t *)p = 1; p = raft_realloc(p, 16); munit_assert_ptr_not_null(p); munit_assert_int(*(uint64_t *)p, ==, 1); raft_free(p); return MUNIT_OK; } TEST(raft_heap, aligned_alloc, NULL, NULL, 0, NULL) { void *p; p = raft_aligned_alloc(1024, 2048); munit_assert_ptr_not_null(p); munit_assert_int((uintptr_t)p % 1024, ==, 0); raft_free(p); return MUNIT_OK; } dqlite-1.16.7/test/raft/integration/test_init.c000066400000000000000000000051021465252713400215030ustar00rootroot00000000000000#include "../../../src/raft.h" #include "../lib/runner.h" /****************************************************************************** * * raft_init * *****************************************************************************/ SUITE(raft_init) /* Incompatible raft->io and raft->fsm wrt async snapshots. 
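 *
 * The contract probed here and below: both vtables must declare a version,
 * and a non-NULL fsm->snapshot_async additionally requires io->version > 1
 * plus an async_work method. A sketch of a version-compliant setup (stub
 * vtables, illustrative only):
 */

MUNIT_UNUSED static int initWithVersionsExample(struct raft *r,
                                                struct raft_io *io,
                                                struct raft_fsm *fsm,
                                                raft_id id,
                                                const char *address)
{
        io->version = 2;  /* 0 is rejected: "io->version must be set" */
        fsm->version = 3; /* 0 is rejected: "fsm->version must be set" */
        /* If fsm->snapshot_async is non-NULL, io->async_work must be set as
         * well, otherwise raft_init() fails with -1 and an errmsg. */
        return raft_init(r, io, fsm, id, address);
}

/* Incompatible raft->io and raft->fsm wrt async snapshots: snapshot_async is
 * set but the io vtable is too old to support it.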
*/ TEST(raft_init, incompatIoFsmAsyncSnapshotNotNull, NULL, NULL, 0, NULL) { /* Set incompatible io and fsm versions and non-NULL snapshot_async fn */ struct raft r = {0}; struct raft_io io = {0}; struct raft_fsm fsm = {0}; io.version = 1; /* Too low */ io.async_work = (int (*)(struct raft_io *, struct raft_io_async_work *, raft_io_async_work_cb))(uintptr_t)0xDEADBEEF; fsm.version = 3; fsm.snapshot_async = (int (*)(struct raft_fsm *, struct raft_buffer **, unsigned int *))(uintptr_t)0xDEADBEEF; int rc; rc = raft_init(&r, &io, &fsm, 1, "1"); munit_assert_int(rc, ==, -1); munit_assert_string_equal( r.errmsg, "async snapshot requires io->version > 1 and async_work method."); return MUNIT_OK; } /* Incompatible raft->io and raft->fsm wrt async snapshots. */ TEST(raft_init, incompatIoFsmAsyncSnapshotNull, NULL, NULL, 0, NULL) { /* Set incompatible io and fsm versions and NULL snapshot_async fn */ struct raft r = {0}; struct raft_io io = {0}; struct raft_fsm fsm = {0}; io.version = 2; io.async_work = NULL; fsm.version = 3; fsm.snapshot_async = (int (*)(struct raft_fsm *, struct raft_buffer **, unsigned int *))(uintptr_t)0xDEADBEEF; int rc; rc = raft_init(&r, &io, &fsm, 1, "1"); munit_assert_int(rc, ==, -1); munit_assert_string_equal( r.errmsg, "async snapshot requires io->version > 1 and async_work method."); return MUNIT_OK; } TEST(raft_init, ioVersionNotSet, NULL, NULL, 0, NULL) { struct raft r = {0}; struct raft_io io = {0}; struct raft_fsm fsm = {0}; io.version = 0; fsm.version = 3; int rc; rc = raft_init(&r, &io, &fsm, 1, "1"); munit_assert_int(rc, ==, -1); munit_assert_string_equal(r.errmsg, "io->version must be set"); return MUNIT_OK; } TEST(raft_init, fsmVersionNotSet, NULL, NULL, 0, NULL) { struct raft r = {0}; struct raft_io io = {0}; struct raft_fsm fsm = {0}; io.version = 2; fsm.version = 0; int rc; rc = raft_init(&r, &io, &fsm, 1, "1"); munit_assert_int(rc, ==, -1); munit_assert_string_equal(r.errmsg, "fsm->version must be set"); return MUNIT_OK; } dqlite-1.16.7/test/raft/integration/test_membership.c000066400000000000000000000260251465252713400227020ustar00rootroot00000000000000#include "../../../src/raft/configuration.h" #include "../../../src/raft/progress.h" #include "../lib/cluster.h" #include "../lib/runner.h" /****************************************************************************** * * Fixture * *****************************************************************************/ struct fixture { FIXTURE_CLUSTER; struct raft_change req; }; /* Set up a cluster of 2 servers, with the first as leader. */ static void *setup(const MunitParameter params[], MUNIT_UNUSED void *user_data) { struct fixture *f = munit_malloc(sizeof *f); SETUP_CLUSTER(2); CLUSTER_BOOTSTRAP; CLUSTER_START; CLUSTER_ELECT(0); return f; } static void tear_down(void *data) { struct fixture *f = data; TEAR_DOWN_CLUSTER; free(f); } /****************************************************************************** * * Helper macros * *****************************************************************************/ /* Add an empty server to the cluster and start it. */ #define GROW \ { \ int rv__; \ CLUSTER_GROW; \ rv__ = raft_start(CLUSTER_RAFT(2)); \ munit_assert_int(rv__, ==, 0); \ } 
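/* The synchronous return codes asserted on below are the same ones an
 * application would branch on when submitting a membership change; a sketch,
 * using the NULL-callback style of this file: */

MUNIT_UNUSED static int addNodeExample(struct raft *r,
                                       struct raft_change *req,
                                       raft_id id,
                                       const char *address)
{
        int rv = raft_add(r, req, id, address, NULL);
        switch (rv) {
        case 0: /* Submitted; completion is signalled asynchronously. */
                break;
        case RAFT_NOTLEADER: /* Retry against the current leader. */
                break;
        case RAFT_CANTCHANGE: /* Another change is in flight; retry later. */
                break;
        case RAFT_DUPLICATEID: /* The configuration already has this ID. */
                break;
        default:
                break;
        }
        return rv;
}

/* Invoke raft_add against the I'th node and assert it returns the given
 * value.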
*/ #define ADD(I, ID, RV) \ { \ int rv_; \ char address_[16]; \ sprintf(address_, "%d", ID); \ rv_ = raft_add(CLUSTER_RAFT(I), &f->req, ID, address_, NULL); \ munit_assert_int(rv_, ==, RV); \ } /* Submit a request to assign the given ROLE to the server with the given ID. */ #define ASSIGN(I, ID, ROLE) \ { \ int _rv; \ _rv = raft_assign(CLUSTER_RAFT(I), &f->req, ID, ROLE, NULL); \ munit_assert_int(_rv, ==, 0); \ } /* Invoke raft_remove against the I'th node and assert it returns the given * value. */ #define REMOVE(I, ID, RV) \ { \ int rv_; \ rv_ = raft_remove(CLUSTER_RAFT(I), &f->req, ID, NULL); \ munit_assert_int(rv_, ==, RV); \ } struct result { int status; bool done; }; /* Submit an apply request. */ #define APPLY_SUBMIT(I) \ struct raft_buffer _buf; \ struct raft_apply _req; \ struct result _result = {0, false}; \ int _rv; \ FsmEncodeSetX(123, &_buf); \ _req.data = &_result; \ _rv = raft_apply(CLUSTER_RAFT(I), &_req, &_buf, NULL, 1, NULL); \ munit_assert_int(_rv, ==, 0); /****************************************************************************** * * Assertions * *****************************************************************************/ /* Assert the values of the committed and uncommitted configuration indexes on * the raft instance with the given index. */ #define ASSERT_CONFIGURATION_INDEXES(I, COMMITTED, UNCOMMITTED) \ { \ struct raft *raft_ = CLUSTER_RAFT(I); \ munit_assert_int(raft_->configuration_committed_index, ==, COMMITTED); \ munit_assert_int(raft_->configuration_uncommitted_index, ==, \ UNCOMMITTED); \ } /****************************************************************************** * * raft_add * *****************************************************************************/ SUITE(raft_add) /* After a request to add a new non-voting server is committed, the new * configuration is not marked as uncommitted anymore */ TEST(raft_add, committed, setup, tear_down, 0, NULL) { struct fixture *f = data; struct raft *raft = CLUSTER_RAFT(0); const struct raft_server *server; ADD(0 /* I */, 3 /* ID */, 0); /* The new configuration is already effective. */ munit_assert_int(raft->configuration.n, ==, 3); server = &raft->configuration.servers[2]; munit_assert_int(server->id, ==, 3); munit_assert_string_equal(server->address, "3"); munit_assert_int(server->role, ==, RAFT_SPARE); /* The new configuration is marked as uncommitted. */ ASSERT_CONFIGURATION_INDEXES(0, 1, 3); /* The next/match indexes now include an entry for the new server. */ munit_assert_int(raft->leader_state.progress[2].next_index, ==, 4); munit_assert_int(raft->leader_state.progress[2].match_index, ==, 0); CLUSTER_STEP_UNTIL_APPLIED(0, 3, 2000); ASSERT_CONFIGURATION_INDEXES(0, 3, 0); /* The new configuration is marked as committed. */ return MUNIT_OK; } /* Trying to add a server on a node which is not the leader results in an * error. */ TEST(raft_add, notLeader, setup, tear_down, 0, NULL) { struct fixture *f = data; ADD(1 /* I */, 3 /* ID */, RAFT_NOTLEADER); return MUNIT_OK; } /* Trying to add a server while a configuration change is already in progress * results in an error. */ TEST(raft_add, busy, setup, tear_down, 0, NULL) { struct fixture *f = data; ADD(0 /* I */, 3 /* ID */, 0); ADD(0 /* I */, 4 /* ID */, RAFT_CANTCHANGE); munit_log(MUNIT_LOG_INFO, "done"); return MUNIT_OK; } /* Trying to add a server with an ID which is already in use results in an * error. 
*/ TEST(raft_add, duplicateId, setup, tear_down, 0, NULL) { struct fixture *f = data; ADD(0 /* I */, 2 /* ID */, RAFT_DUPLICATEID); return MUNIT_OK; } /****************************************************************************** * * raft_remove * *****************************************************************************/ SUITE(raft_remove) /* After a request to remove a server is committed, the new configuration is not * marked as uncommitted anymore. */ TEST(raft_remove, committed, setup, tear_down, 0, NULL) { struct fixture *f = data; GROW; ADD(0, 3, 0); CLUSTER_STEP_UNTIL_APPLIED(0, 3, 2000); ASSIGN(0, 3, RAFT_STANDBY); CLUSTER_STEP_UNTIL_APPLIED(2, 2, 2000); CLUSTER_STEP_N(2); REMOVE(0, 3, 0); ASSERT_CONFIGURATION_INDEXES(0, 4, 5); CLUSTER_STEP_UNTIL_APPLIED(0, 5, 2000); ASSERT_CONFIGURATION_INDEXES(0, 5, 0); munit_assert_int(CLUSTER_RAFT(0)->configuration.n, ==, 2); return MUNIT_OK; } /* A leader gets a request to remove itself. */ TEST(raft_remove, self, setup, tear_down, 0, NULL) { struct fixture *f = data; REMOVE(0, 1, 0); CLUSTER_STEP_UNTIL_APPLIED(0, 2, 2000); CLUSTER_STEP_UNTIL_APPLIED(1, 2, 10000); return MUNIT_OK; } /* A leader gets a request to remove itself from a 3-node cluster */ TEST(raft_remove, selfThreeNodeClusterReplicate, setup, tear_down, 0, NULL) { struct fixture *f = data; /* Add a third node */ GROW; ADD(0, 3, 0); CLUSTER_STEP_UNTIL_APPLIED(0, 3, 2000); ASSIGN(0, 3, RAFT_VOTER); CLUSTER_STEP_UNTIL_APPLIED(0, 4, 2000); /* Verify node with id 1 is the leader */ raft_id leader_id = 0xDEADBEEF; const char *leader_address = NULL; raft_leader(CLUSTER_RAFT(0), &leader_id, &leader_address); munit_assert_ulong(leader_id, ==, 1); munit_assert_ptr_not_null(leader_address); /* The leader is requested to remove itself from the configuration */ REMOVE(0, 1, 0); /* The - removed - leader should still replicate entries. * * Raft dissertation 4.2.2 * `First, there will be a period of time (while it is committing Cnew) when * a leader can manage a cluster that does not include itself; it replicates * log entries but does not count itself in majorities.` */ APPLY_SUBMIT(0) /* The removed leader eventually steps down */ CLUSTER_STEP_UNTIL_HAS_NO_LEADER(5000); raft_leader(CLUSTER_RAFT(0), &leader_id, &leader_address); munit_assert_ulong(leader_id, ==, 0); munit_assert_ptr_null(leader_address); /* The original leader has applied the REMOVE entry */ CLUSTER_STEP_UNTIL_APPLIED(0, 5, 10000); /* At this point the other nodes have replicated the new config, but have * not yet applied it: they are still missing a heartbeat from the leader informing them * of the commit index of the new config. */ /* A new leader is elected */ CLUSTER_STEP_UNTIL_HAS_LEADER(5000); /* The other nodes applied the barrier after * the config change and therefore commit the new config. */ CLUSTER_STEP_UNTIL_APPLIED(1, 6, 10000); CLUSTER_STEP_UNTIL_APPLIED(2, 6, 10000); /* The removed leader doesn't know who the leader is */ raft_leader(CLUSTER_RAFT(0), &leader_id, &leader_address); munit_assert_ulong(leader_id, ==, 0); munit_assert_ptr_null(leader_address); /* The new configuration has a leader */ raft_leader(CLUSTER_RAFT(1), &leader_id, &leader_address); munit_assert_ulong(leader_id, !=, 0); munit_assert_ulong(leader_id, !=, 1); munit_assert_ptr_not_null(leader_address); return MUNIT_OK; } /* Trying to remove a server on a node which is not the leader results in an * error. 
*/ TEST(raft_remove, notLeader, setup, tear_down, 0, NULL) { struct fixture *f = data; REMOVE(1 /* I */, 3 /* ID */, RAFT_NOTLEADER); return MUNIT_OK; } /* Trying to remove a server while a configuration change is already in progress * results in an error. */ TEST(raft_remove, inProgress, setup, tear_down, 0, NULL) { struct fixture *f = data; ADD(0, 3, 0); REMOVE(0, 3, RAFT_CANTCHANGE); return MUNIT_OK; } /* Trying to remove a server with an unknown ID results in an error. */ TEST(raft_remove, badId, setup, tear_down, 0, NULL) { struct fixture *f = data; REMOVE(0, 3, RAFT_BADID); return MUNIT_OK; } dqlite-1.16.7/test/raft/integration/test_recover.c000066400000000000000000000024551465252713400222150ustar00rootroot00000000000000#include "../lib/cluster.h" #include "../lib/runner.h" /****************************************************************************** * * Fixture holding a bootstrapped raft cluster. * *****************************************************************************/ struct fixture { FIXTURE_CLUSTER; }; static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) { struct fixture *f = munit_malloc(sizeof *f); SETUP_CLUSTER(3); CLUSTER_BOOTSTRAP; return f; } static void tearDown(void *data) { struct fixture *f = data; TEAR_DOWN_CLUSTER; free(f); } /****************************************************************************** * * Recover tests. * *****************************************************************************/ SUITE(raft_recover) /* Attempting to recover a running instance results in RAFT_BUSY. */ TEST(raft_recover, busy, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft *raft; struct raft_configuration configuration; int rv; /* Start all servers. */ CLUSTER_START; raft = CLUSTER_RAFT(0); CLUSTER_CONFIGURATION(&configuration); rv = raft_recover(raft, &configuration); munit_assert_int(rv, ==, RAFT_BUSY); raft_configuration_close(&configuration); return MUNIT_OK; } dqlite-1.16.7/test/raft/integration/test_replication.c000066400000000000000000001300111465252713400230510ustar00rootroot00000000000000#include "../../../src/raft/configuration.h" #include "../../../src/raft/flags.h" #include "../../../src/raft/progress.h" #include "../lib/cluster.h" #include "../lib/runner.h" /****************************************************************************** * * Fixture * *****************************************************************************/ struct fixture { FIXTURE_CLUSTER; }; /****************************************************************************** * * Helper macros * *****************************************************************************/ /* Standard startup sequence, bootstrapping the cluster and electing server 0 */ #define BOOTSTRAP_START_AND_ELECT \ CLUSTER_BOOTSTRAP;                \ CLUSTER_START;                    \ CLUSTER_ELECT(0);                 \ ASSERT_TIME(1045) /****************************************************************************** * * Set up a cluster with two servers. * *****************************************************************************/ static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) { struct fixture *f = munit_malloc(sizeof *f); SETUP_CLUSTER(2); return f; } static void tearDown(void *data) { struct fixture *f = data; TEAR_DOWN_CLUSTER; free(f); } /****************************************************************************** * * Assertions * *****************************************************************************/ /* Assert that the I'th server is in follower state. 
*/ #define ASSERT_FOLLOWER(I) munit_assert_int(CLUSTER_STATE(I), ==, RAFT_FOLLOWER) /* Assert that the I'th server is in candidate state. */ #define ASSERT_CANDIDATE(I) \ munit_assert_int(CLUSTER_STATE(I), ==, RAFT_CANDIDATE) /* Assert that the I'th server is in leader state. */ #define ASSERT_LEADER(I) munit_assert_int(CLUSTER_STATE(I), ==, RAFT_LEADER) /* Assert that the fixture time matches the given value */ #define ASSERT_TIME(TIME) munit_assert_int(CLUSTER_TIME, ==, TIME) /* Assert that the configuration of the I'th server matches the given one */ #define ASSERT_CONFIGURATION(I, EXPECTED) \ do { \ struct raft *_raft = CLUSTER_RAFT(I); \ struct raft_configuration *_actual = &_raft->configuration; \ unsigned _i; \ \ munit_assert_uint(_actual->n, ==, (EXPECTED)->n); \ for (_i = 0; _i < _actual->n; _i++) { \ struct raft_server *_server1 = &_actual->servers[_i]; \ struct raft_server *_server2 = &(EXPECTED)->servers[_i]; \ munit_assert_ulong(_server1->id, ==, _server2->id); \ munit_assert_int(_server1->role, ==, _server2->role); \ munit_assert_string_equal(_server1->address, _server2->address); \ } \ } while (0) /****************************************************************************** * * Log replication. * *****************************************************************************/ SUITE(replication) /* A leader sends a heartbeat message as soon as it gets elected. */ TEST(replication, sendInitialHeartbeat, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft *raft; CLUSTER_BOOTSTRAP; CLUSTER_START; /* Server 0 becomes candidate and sends vote requests after the election * timeout. */ CLUSTER_STEP_N(19); ASSERT_TIME(1000); ASSERT_CANDIDATE(0); /* Server 0 receives the vote result, becomes leader and sends * heartbeats. */ CLUSTER_STEP_N(6); ASSERT_LEADER(0); ASSERT_TIME(1030); raft = CLUSTER_RAFT(0); munit_assert_int(raft->leader_state.progress[1].last_send, ==, 1030); /* Server 1 receives the heartbeat from server 0 and resets its election * timer. */ raft = CLUSTER_RAFT(1); munit_assert_int(raft->election_timer_start, ==, 1015); CLUSTER_STEP_N(2); munit_assert_int(raft->election_timer_start, ==, 1045); munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 1); munit_assert_int(CLUSTER_N_RECV(1, RAFT_IO_APPEND_ENTRIES), ==, 1); return MUNIT_OK; } /* After receiving an AppendEntriesResult, a leader has set the feature flags of * a node. */ TEST(replication, receiveFlags, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft *raft; CLUSTER_BOOTSTRAP; CLUSTER_START; /* Server 0 becomes leader and sends the initial heartbeat. */ CLUSTER_STEP_N(24); ASSERT_LEADER(0); ASSERT_TIME(1030); /* Flags is empty */ raft = CLUSTER_RAFT(0); munit_assert_ullong(raft->leader_state.progress[1].features, ==, 0); raft = CLUSTER_RAFT(1); /* Server 1 receives the first heartbeat. */ CLUSTER_STEP_N(4); munit_assert_int(raft->election_timer_start, ==, 1045); munit_assert_int(CLUSTER_N_RECV(1, RAFT_IO_APPEND_ENTRIES), ==, 1); /* Server 0 receives the reply to the heartbeat. */ CLUSTER_STEP_N(2); munit_assert_int(CLUSTER_N_RECV(0, RAFT_IO_APPEND_ENTRIES_RESULT), ==, 1); raft = CLUSTER_RAFT(0); munit_assert_ullong(raft->leader_state.progress[1].features, ==, RAFT_DEFAULT_FEATURE_FLAGS); return MUNIT_OK; } /* A leader keeps sending heartbeat messages at regular intervals to * maintain leadership. 
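* The assertions below pin down the fixture's timing: after the initial heartbeat, the follower's election timer is reset at times 1045, 1215, 1315 and 1415, once per heartbeat received.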
*/ TEST(replication, sendFollowupHeartbeat, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft *raft; CLUSTER_BOOTSTRAP; CLUSTER_START; /* Server 0 becomes leader and sends the initial heartbeat. */ CLUSTER_STEP_N(24); ASSERT_LEADER(0); ASSERT_TIME(1030); raft = CLUSTER_RAFT(1); /* Server 1 receives the first heartbeat. */ CLUSTER_STEP_N(4); munit_assert_int(raft->election_timer_start, ==, 1045); munit_assert_int(CLUSTER_N_RECV(1, RAFT_IO_APPEND_ENTRIES), ==, 1); /* Server 1 receives the second heartbeat. */ CLUSTER_STEP_N(8); munit_assert_int(raft->election_timer_start, ==, 1215); munit_assert_int(CLUSTER_N_RECV(1, RAFT_IO_APPEND_ENTRIES), ==, 2); /* Server 1 receives the third heartbeat. */ CLUSTER_STEP_N(7); munit_assert_int(raft->election_timer_start, ==, 1315); munit_assert_int(CLUSTER_N_RECV(1, RAFT_IO_APPEND_ENTRIES), ==, 3); /* Server 1 receives the fourth heartbeat. */ CLUSTER_STEP_N(7); munit_assert_int(raft->election_timer_start, ==, 1415); munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 4); munit_assert_int(CLUSTER_N_RECV(0, RAFT_IO_APPEND_ENTRIES_RESULT), ==, 4); munit_assert_int(CLUSTER_N_RECV(1, RAFT_IO_APPEND_ENTRIES), ==, 4); munit_assert_int(CLUSTER_N_SEND(1, RAFT_IO_APPEND_ENTRIES_RESULT), ==, 4); return MUNIT_OK; } /* If a leader replicates some entries during a given heartbeat interval, it * skips sending the heartbeat for that interval. */ TEST(replication, sendSkipHeartbeat, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft *raft; struct raft_apply req; CLUSTER_BOOTSTRAP; CLUSTER_START; raft = CLUSTER_RAFT(0); /* Server 0 becomes leader and sends the first two heartbeats. */ CLUSTER_STEP_UNTIL_ELAPSED(1215); ASSERT_LEADER(0); munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 2); munit_assert_int(CLUSTER_N_RECV(1, RAFT_IO_APPEND_ENTRIES), ==, 2); /* Server 0 starts replicating a new entry after 15 milliseconds. */ CLUSTER_STEP_UNTIL_ELAPSED(15); ASSERT_TIME(1230); CLUSTER_APPLY_ADD_X(0, &req, 1, NULL); CLUSTER_STEP_N(1); munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 3); munit_assert_int(raft->leader_state.progress[1].last_send, ==, 1230); /* When the heartbeat timeout expires, server 0 does not send an empty * append entries. */ CLUSTER_STEP_UNTIL_ELAPSED(70); ASSERT_TIME(1300); CLUSTER_STEP_N(1); munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 3); munit_assert_int(raft->leader_state.progress[1].last_send, ==, 1230); return MUNIT_OK; } /* The leader doesn't send replication messages to idle servers. */ TEST(replication, skipIdle, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_change req1; struct raft_apply req2; BOOTSTRAP_START_AND_ELECT; CLUSTER_ADD(&req1); CLUSTER_STEP_UNTIL_APPLIED(0, 3, 1000); CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, &req2, 1, NULL); CLUSTER_STEP_UNTIL_ELAPSED(1000); munit_assert_int(CLUSTER_LAST_APPLIED(0), ==, 4); munit_assert_int(CLUSTER_LAST_APPLIED(1), ==, 4); munit_assert_int(CLUSTER_LAST_APPLIED(2), ==, 0); return MUNIT_OK; } /* A follower remains in probe mode until the leader receives a successful * AppendEntries response. */ TEST(replication, sendProbe, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_apply req1; struct raft_apply req2; CLUSTER_BOOTSTRAP; CLUSTER_START; /* Server 0 becomes leader and sends the initial heartbeat. 
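* At this point the follower is still in probe mode: the leader holds back newly applied entries and sends at most one AppendEntries per heartbeat interval until an acknowledgement arrives (see the assertions that follow).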
*/ CLUSTER_STEP_N(25); ASSERT_LEADER(0); ASSERT_TIME(1030); munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 1); /* Set a very high network latency for server 1, so server 0 will send a * second probe AppendEntries without transitioning to pipeline mode. */ munit_assert_int(CLUSTER_N_RECV(1, RAFT_IO_APPEND_ENTRIES), ==, 0); CLUSTER_SET_NETWORK_LATENCY(1, 250); /* Server 0 receives a new entry after 15 milliseconds. Since the follower * is still in probe mode and since an AppendEntries message was already * sent recently, it does not send the new entry immediately. */ CLUSTER_STEP_UNTIL_ELAPSED(15); CLUSTER_APPLY_ADD_X(0, &req1, 1, NULL); CLUSTER_STEP; munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 1); /* A heartbeat timeout elapses without receiving a response, so server 0 * sends a new AppendEntries to server 1. */ CLUSTER_STEP_UNTIL_ELAPSED(85); CLUSTER_STEP; munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 2); /* Server 0 receives a second entry after 15 milliseconds. Since the * follower is still in probe mode and since an AppendEntries message was * already sent recently, it does not send the new entry immediately. */ CLUSTER_STEP_UNTIL_ELAPSED(15); CLUSTER_APPLY_ADD_X(0, &req2, 1, NULL); CLUSTER_STEP; munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 2); /* Eventually server 0 receives AppendEntries results for both entries. */ CLUSTER_STEP_UNTIL_APPLIED(0, 4, 1000); return MUNIT_OK; } static bool indices_updated(struct raft_fixture *f, void *data) { (void)f; const struct raft *r = data; return r->last_stored == 4 && r->leader_state.progress[1].match_index == 3; } /* A follower transitions to pipeline mode after the leader receives a * successful AppendEntries response from it. */ TEST(replication, sendPipeline, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft *raft; struct raft_apply req1; struct raft_apply req2; CLUSTER_BOOTSTRAP; CLUSTER_START; raft = CLUSTER_RAFT(0); /* Server 0 becomes leader and sends the initial heartbeat, receiving a * successful response. */ CLUSTER_STEP_UNTIL_ELAPSED(1070); ASSERT_LEADER(0); ASSERT_TIME(1070); munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 1); /* Server 0 receives a new entry after 15 milliseconds. Since the follower * has transitioned to pipeline mode the new entry is sent immediately and * the next index is optimistically increased. */ CLUSTER_STEP_UNTIL_ELAPSED(15); CLUSTER_APPLY_ADD_X(0, &req1, 1, NULL); CLUSTER_STEP; munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 2); munit_assert_int(raft->leader_state.progress[1].next_index, ==, 4); /* After another 15 milliseconds server 0 receives a second apply request, * which is also sent out immediately */ CLUSTER_STEP_UNTIL_ELAPSED(15); CLUSTER_APPLY_ADD_X(0, &req2, 1, NULL); CLUSTER_STEP; munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 3); munit_assert_int(raft->leader_state.progress[1].next_index, ==, 5); /* Wait until the leader has stored entry 4 and the follower has matched * entry 3. Expect the commit index to have been updated to 3. */ CLUSTER_STEP_UNTIL(indices_updated, CLUSTER_RAFT(0), 2000); munit_assert_ulong(raft->commit_index, ==, 3); /* Eventually server 0 receives AppendEntries results for both entries. */ CLUSTER_STEP_UNTIL_APPLIED(0, 4, 1000); return MUNIT_OK; } /* A follower disconnects while in probe mode. 
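* While in probe mode the leader only retries on heartbeat timeouts, so the test below checks that no AppendEntries message is delivered until the follower reconnects.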
*/ TEST(replication, sendDisconnect, setUp, tearDown, 0, NULL) { struct fixture *f = data; CLUSTER_BOOTSTRAP; CLUSTER_START; /* Server 0 becomes leader and sends the initial heartbeat, however it * fails because server 1 has disconnected. */ CLUSTER_STEP_N(24); ASSERT_LEADER(0); CLUSTER_DISCONNECT(0, 1); CLUSTER_STEP; munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 0); /* After the heartbeat timeout server 0 retries, but still fails. */ CLUSTER_STEP_UNTIL_ELAPSED(100); CLUSTER_STEP; munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 0); /* After another heartbeat timeout server 0 retries and this time * succeeds. */ CLUSTER_STEP_UNTIL_ELAPSED(100); CLUSTER_RECONNECT(0, 1); CLUSTER_STEP; munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 1); return MUNIT_OK; } /* A follower disconnects while in pipeline mode. */ TEST(replication, sendDisconnectPipeline, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_apply req1; struct raft_apply req2; CLUSTER_BOOTSTRAP; CLUSTER_START; /* Server 0 becomes leader and sends a couple of heartbeats. */ CLUSTER_STEP_UNTIL_ELAPSED(1215); munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 2); /* It then starts to replicate a few entries, however the follower * disconnects before delivering results. */ CLUSTER_APPLY_ADD_X(0, &req1, 1, NULL); CLUSTER_STEP; munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 3); CLUSTER_APPLY_ADD_X(0, &req2, 1, NULL); CLUSTER_STEP; munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 4); CLUSTER_DISCONNECT(0, 1); /* The next heartbeat fails, transitioning the follower back to probe * mode. */ CLUSTER_STEP_UNTIL_ELAPSED(115); munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 4); /* After reconnection the follower eventually replicates the entries and * reports back. */ CLUSTER_RECONNECT(0, 1); CLUSTER_STEP_UNTIL_APPLIED(0, 3, 1000); return MUNIT_OK; } static char *send_oom_heap_fault_delay[] = {"5", NULL}; static char *send_oom_heap_fault_repeat[] = {"1", NULL}; static MunitParameterEnum send_oom_params[] = { {TEST_HEAP_FAULT_DELAY, send_oom_heap_fault_delay}, {TEST_HEAP_FAULT_REPEAT, send_oom_heap_fault_repeat}, {NULL, NULL}, }; /* Out of memory failures. */ TEST(replication, sendOom, setUp, tearDown, 0, send_oom_params) { struct fixture *f = data; return MUNIT_SKIP; struct raft_apply req; BOOTSTRAP_START_AND_ELECT; HEAP_FAULT_ENABLE; CLUSTER_APPLY_ADD_X(0, &req, 1, NULL); CLUSTER_STEP; return MUNIT_OK; } /* A failure occurs upon submitting the I/O request. */ TEST(replication, persistError, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_apply req; BOOTSTRAP_START_AND_ELECT; raft_fixture_append_fault(&f->cluster, 0, 0); CLUSTER_APPLY_ADD_X(0, &req, 1, NULL); CLUSTER_STEP; return MUNIT_OK; } /* Receive the same entry a second time, before the first has been persisted. 
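* The duplicate must not be appended twice: the test forces a re-send by giving server 1 a disk latency larger than the leader's heartbeat interval, so the result for the first AppendEntries arrives only after the retry.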
*/ TEST(replication, recvTwice, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_apply *req = munit_malloc(sizeof *req); BOOTSTRAP_START_AND_ELECT; CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, req, 1, NULL); /* Set a high disk latency for server 1, so server 0 won't receive an * AppendEntries result within the heartbeat and will re-send the same * entries */ CLUSTER_SET_DISK_LATENCY(1, 300); CLUSTER_STEP_UNTIL_DELIVERED(0, 1, 100); /* First AppendEntries */ CLUSTER_STEP_UNTIL_ELAPSED(110); /* Heartbeat timeout */ CLUSTER_STEP_UNTIL_DELIVERED(0, 1, 100); /* Second AppendEntries */ CLUSTER_STEP_UNTIL_APPLIED(0, req->index, 500); free(req); return MUNIT_OK; } /* If the term in the request is stale, the server rejects it. */ TEST(replication, recvStaleTerm, setUp, tearDown, 0, NULL) { struct fixture *f = data; CLUSTER_GROW; BOOTSTRAP_START_AND_ELECT; /* Set a very high election timeout and then disconnect the leader, so it will * keep sending heartbeats. */ raft_fixture_set_randomized_election_timeout(&f->cluster, 0, 5000); raft_set_election_timeout(CLUSTER_RAFT(0), 5000); CLUSTER_SATURATE_BOTHWAYS(0, 1); CLUSTER_SATURATE_BOTHWAYS(0, 2); /* Eventually a new leader gets elected. */ CLUSTER_STEP_UNTIL_HAS_NO_LEADER(5000); CLUSTER_STEP_UNTIL_HAS_LEADER(10000); munit_assert_int(CLUSTER_LEADER, ==, 1); /* Reconnect the old leader to the current follower. */ CLUSTER_DESATURATE_BOTHWAYS(0, 2); /* Step a few times, so the old leader sends heartbeats to the follower, * which rejects them. */ CLUSTER_STEP_UNTIL_ELAPSED(200); return MUNIT_OK; } /* If the server's log is shorter than prevLogIndex, the request is rejected. */ TEST(replication, recvMissingEntries, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_entry entry; CLUSTER_BOOTSTRAP; /* Server 0 has an entry that server 1 doesn't have */ entry.type = RAFT_COMMAND; entry.term = 1; FsmEncodeSetX(1, &entry.buf); CLUSTER_ADD_ENTRY(0, &entry); /* Server 0 wins the election because it has a longer log. */ CLUSTER_START; CLUSTER_STEP_UNTIL_HAS_LEADER(5000); munit_assert_int(CLUSTER_LEADER, ==, 0); /* The first server replicates missing entries to the second. */ CLUSTER_STEP_UNTIL_APPLIED(1, 3, 3000); return MUNIT_OK; } /* If the term of the last log entry on the server is different from * prevLogTerm, and the value of prevLogIndex is greater than the server's commit * index (i.e. this is a normal inconsistency), we reject the request. */ TEST(replication, recvPrevLogTermMismatch, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_entry entry1; struct raft_entry entry2; CLUSTER_BOOTSTRAP; /* The servers have an entry with a conflicting term. */ entry1.type = RAFT_COMMAND; entry1.term = 2; FsmEncodeSetX(1, &entry1.buf); CLUSTER_ADD_ENTRY(0, &entry1); entry2.type = RAFT_COMMAND; entry2.term = 1; FsmEncodeSetX(2, &entry2.buf); CLUSTER_ADD_ENTRY(1, &entry2); CLUSTER_START; CLUSTER_ELECT(0); /* The follower eventually replicates the entry */ CLUSTER_STEP_UNTIL_APPLIED(1, 2, 3000); return MUNIT_OK; } /* The follower has an uncommitted log entry that conflicts with a new one sent * by the leader (same index but different term). The follower's conflicting log * entry happens to be a configuration change. In that case the follower * discards the conflicting entry from its log and rolls back its configuration * to the initial one contained in the log entry at index 1. 
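* The rollback is safe because an uncommitted entry that conflicts with the leader's log can never have been committed, so the configuration in the last surviving RAFT_CHANGE entry (here the bootstrap entry at index 1) is the right one to revert to.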
*/ TEST(replication, recvRollbackConfigurationToInitial, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_entry entry1; struct raft_entry entry2; struct raft_configuration base; /* Committed configuration at index 1 */ struct raft_configuration conf; /* Uncommitted configuration at index 2 */ CLUSTER_BOOTSTRAP; CLUSTER_CONFIGURATION(&base); /* Both servers have an entry at index 2, but with conflicting terms. The * entry of the second server is a configuration change. */ entry1.type = RAFT_COMMAND; entry1.term = 2; FsmEncodeSetX(1, &entry1.buf); CLUSTER_ADD_ENTRY(0, &entry1); entry2.type = RAFT_CHANGE; entry2.term = 1; CLUSTER_CONFIGURATION(&conf); raft_configuration_add(&conf, 3, "3", 2); raft_configuration_encode(&conf, &entry2.buf); CLUSTER_ADD_ENTRY(1, &entry2); /* At startup the second server uses the most recent configuration, i.e. the * one contained in the entry that we just added. The server can't know yet * if it's committed or not, and regards it as pending configuration * change. */ CLUSTER_START; ASSERT_CONFIGURATION(1, &conf); /* The first server gets elected. */ CLUSTER_ELECT(0); /* The second server eventually replicates the first server's log entry at * index 2, truncating its own log and rolling back to the configuration * contained in the log entry at index 1. */ CLUSTER_STEP_UNTIL_APPLIED(1, 2, 3000); ASSERT_CONFIGURATION(0, &base); ASSERT_CONFIGURATION(1, &base); raft_configuration_close(&base); raft_configuration_close(&conf); return MUNIT_OK; } /* The follower has an uncommitted log entry that conflicts with a new one sent * by the leader (same index but different term). The follower's conflicting log * entry happens to be a configuration change. There's also an older committed * configuration entry present. In that case the follower discards the * conflicting entry from its log and rolls back its configuration to the * committed one in the older configuration entry. */ TEST(replication, recvRollbackConfigurationToPrevious, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_entry entry1; struct raft_entry entry2; struct raft_entry entry3; struct raft_entry entry4; struct raft_configuration base; /* Committed configuration at index 2 */ struct raft_configuration conf; /* Uncommitted configuration at index 3 */ CLUSTER_BOOTSTRAP; CLUSTER_CONFIGURATION(&base); /* Both servers have a matching configuration entry at index 2. */ CLUSTER_CONFIGURATION(&conf); entry1.type = RAFT_CHANGE; entry1.term = 1; raft_configuration_encode(&conf, &entry1.buf); CLUSTER_ADD_ENTRY(0, &entry1); entry2.type = RAFT_CHANGE; entry2.term = 1; raft_configuration_encode(&conf, &entry2.buf); CLUSTER_ADD_ENTRY(1, &entry2); /* Both servers have an entry at index 3, but with conflicting terms. The * entry of the second server is a configuration change. */ entry3.type = RAFT_COMMAND; entry3.term = 2; FsmEncodeSetX(1, &entry3.buf); CLUSTER_ADD_ENTRY(0, &entry3); entry4.type = RAFT_CHANGE; entry4.term = 1; raft_configuration_add(&conf, 3, "3", 2); raft_configuration_encode(&conf, &entry4.buf); CLUSTER_ADD_ENTRY(1, &entry4); /* At startup the second server uses the most recent configuration, i.e. the * one contained in the log entry at index 3. The server can't know yet if * it's committed or not, and regards it as pending configuration change. */ CLUSTER_START; ASSERT_CONFIGURATION(1, &conf); /* The first server gets elected. 
*/ CLUSTER_ELECT(0); /* The second server eventually replicates the first server's log entry at * index 3, truncating its own log and rolling back to the configuration * contained in the log entry at index 2. */ CLUSTER_STEP_UNTIL_APPLIED(1, 3, 3000); ASSERT_CONFIGURATION(0, &base); ASSERT_CONFIGURATION(1, &base); raft_configuration_close(&base); raft_configuration_close(&conf); return MUNIT_OK; } /* The follower has an uncommitted log entry that conflicts with a new one sent * by the leader (same index but different term). The follower's conflicting log * entry happens to be a configuration change. The follower's log has been * truncated after a snapshot and does not contain the previous committed * configuration anymore. In that case the follower discards the conflicting * entry from its log and rolls back its configuration to the previous committed * one, which was cached when the snapshot was restored. */ TEST(replication, recvRollbackConfigurationToSnapshot, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_entry entry1; struct raft_entry entry2; struct raft_configuration base; /* Committed configuration at index 1 */ struct raft_configuration conf; /* Uncommitted configuration at index 2 */ int rv; CLUSTER_CONFIGURATION(&conf); CLUSTER_CONFIGURATION(&base); /* Bootstrap the first server. This creates a log entry at index 1 * containing the initial configuration. */ rv = raft_bootstrap(CLUSTER_RAFT(0), &conf); munit_assert_int(rv, ==, 0); /* The second server has a snapshot up to entry 1. Entry 1 is not present in * the log. */ CLUSTER_SET_SNAPSHOT(1 /* */, 1 /* last index */, 1 /* last term */, 1 /* conf index */, 5 /* x */, 0 /* y */); CLUSTER_SET_TERM(1, 1); /* Both servers have an entry at index 2, but with conflicting terms. The * entry of the second server is a configuration change and gets appended to * the truncated log. */ entry1.type = RAFT_COMMAND; entry1.term = 3; FsmEncodeSetX(1, &entry1.buf); CLUSTER_ADD_ENTRY(0, &entry1); entry2.type = RAFT_CHANGE; entry2.term = 2; raft_configuration_add(&conf, 3, "3", 2); raft_configuration_encode(&conf, &entry2.buf); CLUSTER_ADD_ENTRY(1, &entry2); /* At startup the second server uses the most recent configuration, i.e. the * one contained in the log entry at index 2. The server can't know yet if * it's committed or not, and regards it as pending configuration change. */ CLUSTER_START; ASSERT_CONFIGURATION(1, &conf); CLUSTER_ELECT(0); /* The second server eventually replicates the first server's log entry at * index 3, truncating its own log and rolling back to the configuration * contained in the snapshot, which is not present in the log anymore but * was cached at startup. */ CLUSTER_STEP_UNTIL_APPLIED(1, 3, 3000); ASSERT_CONFIGURATION(0, &base); ASSERT_CONFIGURATION(1, &base); raft_configuration_close(&base); raft_configuration_close(&conf); return MUNIT_OK; } /* If any of the new entries has the same index as an existing entry in our log, * but a different term, and that entry index is already committed, we bail out * with an error. */ TEST(replication, recvPrevIndexConflict, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_entry entry1; struct raft_entry entry2; CLUSTER_BOOTSTRAP; /* The servers have an entry with a conflicting term. 
*/ entry1.type = RAFT_COMMAND; entry1.term = 2; FsmEncodeSetX(1, &entry1.buf); CLUSTER_ADD_ENTRY(0, &entry1); entry2.type = RAFT_COMMAND; entry2.term = 1; FsmEncodeSetX(2, &entry2.buf); CLUSTER_ADD_ENTRY(1, &entry2); CLUSTER_START; CLUSTER_ELECT(0); /* Artificially bump the commit index on the second server */ CLUSTER_RAFT(1)->commit_index = 2; CLUSTER_STEP; CLUSTER_STEP; return MUNIT_OK; } /* A write log request is submitted for outstanding log entries. If some entries * already exist in the log, they will be skipped. */ TEST(replication, recvSkip, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_apply *req = munit_malloc(sizeof *req); BOOTSTRAP_START_AND_ELECT; /* Submit an entry */ CLUSTER_APPLY_ADD_X(0, req, 1, NULL); /* The leader replicates the entry to the follower, however it does not get * notified about the result, so it sends the entry again. */ CLUSTER_STEP; CLUSTER_SATURATE_BOTHWAYS(0, 1); CLUSTER_STEP_UNTIL_ELAPSED(150); /* The follower reconnects and receives the same entry again. This time the * leader receives the notification. */ CLUSTER_DESATURATE_BOTHWAYS(0, 1); CLUSTER_STEP_UNTIL_APPLIED(0, req->index, 2000); free(req); return MUNIT_OK; } /* If the index and term of the last snapshot on the server match prevLogIndex * and prevLogTerm the request is accepted. */ TEST(replication, recvMatch_last_snapshot, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_entry entry; struct raft_configuration configuration; int rv; CLUSTER_CONFIGURATION(&configuration); rv = raft_bootstrap(CLUSTER_RAFT(0), &configuration); munit_assert_int(rv, ==, 0); raft_configuration_close(&configuration); /* The first server has entry 2 */ entry.type = RAFT_COMMAND; entry.term = 2; FsmEncodeSetX(5, &entry.buf); CLUSTER_ADD_ENTRY(0, &entry); /* The second server has a snapshot up to entry 2 */ CLUSTER_SET_SNAPSHOT(1 /* */, 2 /* last index */, 2 /* last term */, 1 /* conf index */, 5 /* x */, 0 /* y */); CLUSTER_SET_TERM(1, 2); CLUSTER_START; CLUSTER_ELECT(0); /* Apply an additional entry and check that it gets replicated on the * follower. */ CLUSTER_MAKE_PROGRESS; CLUSTER_STEP_UNTIL_APPLIED(1, 3, 3000); return MUNIT_OK; } /* If a candidate server receives a request containing the same term as its * own, it steps down to follower and accepts the request. */ TEST(replication, recvCandidateSameTerm, setUp, tearDown, 0, NULL) { struct fixture *f = data; CLUSTER_GROW; CLUSTER_BOOTSTRAP; /* Disconnect server 2 from the other two and set a low election timeout on * it, so it will immediately start an election. */ CLUSTER_SATURATE_BOTHWAYS(2, 0); CLUSTER_SATURATE_BOTHWAYS(2, 1); raft_fixture_set_randomized_election_timeout(&f->cluster, 2, 800); raft_set_election_timeout(CLUSTER_RAFT(2), 800); /* Server 2 becomes candidate. */ CLUSTER_START; CLUSTER_STEP_UNTIL_STATE_IS(2, RAFT_CANDIDATE, 1000); munit_assert_int(CLUSTER_TERM(2), ==, 2); /* Server 0 wins the election and replicates an entry. */ CLUSTER_STEP_UNTIL_STATE_IS(0, RAFT_LEADER, 2000); munit_assert_int(CLUSTER_TERM(0), ==, 2); munit_assert_int(CLUSTER_TERM(1), ==, 2); munit_assert_int(CLUSTER_TERM(2), ==, 2); CLUSTER_MAKE_PROGRESS; /* Now reconnect the third server, which eventually steps down and * replicates the entry. 
*/ munit_assert_int(CLUSTER_STATE(2), ==, RAFT_CANDIDATE); munit_assert_int(CLUSTER_TERM(2), ==, 2); CLUSTER_DESATURATE_BOTHWAYS(2, 0); CLUSTER_DESATURATE_BOTHWAYS(2, 1); CLUSTER_STEP_UNTIL_STATE_IS(2, RAFT_FOLLOWER, 2000); CLUSTER_STEP_UNTIL_APPLIED(2, 2, 2000); return MUNIT_OK; } /* If a candidate server receives a request containing a higher term than its * own, it steps down to follower and accepts the request. */ TEST(replication, recvCandidateHigherTerm, setUp, tearDown, 0, NULL) { struct fixture *f = data; CLUSTER_GROW; CLUSTER_BOOTSTRAP; /* Set a high election timeout on server 1, so it won't become candidate */ raft_fixture_set_randomized_election_timeout(&f->cluster, 1, 2000); raft_set_election_timeout(CLUSTER_RAFT(1), 2000); /* Disconnect server 2 from the other two. */ CLUSTER_SATURATE_BOTHWAYS(2, 0); CLUSTER_SATURATE_BOTHWAYS(2, 1); /* Set a low election timeout on server 0, and disconnect it from server 1, * so by the time it wins the second round, server 2 will have turned * candidate */ raft_fixture_set_randomized_election_timeout(&f->cluster, 0, 800); raft_set_election_timeout(CLUSTER_RAFT(0), 800); CLUSTER_SATURATE_BOTHWAYS(0, 1); CLUSTER_START; /* Server 2 becomes candidate, and server 0 already is candidate. */ CLUSTER_STEP_UNTIL_STATE_IS(2, RAFT_CANDIDATE, 1500); munit_assert_int(CLUSTER_TERM(2), ==, 2); munit_assert_int(CLUSTER_STATE(0), ==, RAFT_CANDIDATE); munit_assert_int(CLUSTER_TERM(0), ==, 2); /* Server 0 starts a new election, while server 2 is still candidate */ CLUSTER_STEP_UNTIL_TERM_IS(0, 3, 2000); munit_assert_int(CLUSTER_TERM(2), ==, 2); munit_assert_int(CLUSTER_STATE(2), ==, RAFT_CANDIDATE); /* Reconnect the first and second server and let the election succeed and * replicate an entry. */ CLUSTER_DESATURATE_BOTHWAYS(0, 1); CLUSTER_STEP_UNTIL_HAS_LEADER(1000); CLUSTER_MAKE_PROGRESS; /* Now reconnect the third server, which eventually steps down and * replicates the entry. */ munit_assert_int(CLUSTER_STATE(2), ==, RAFT_CANDIDATE); munit_assert_int(CLUSTER_TERM(2), ==, 2); CLUSTER_DESATURATE_BOTHWAYS(2, 0); CLUSTER_DESATURATE_BOTHWAYS(2, 1); CLUSTER_STEP_UNTIL_STATE_IS(2, RAFT_FOLLOWER, 2000); CLUSTER_STEP_UNTIL_APPLIED(2, 2, 2000); return MUNIT_OK; } /* If the server handling the response is not the leader, the result * is ignored. */ TEST(replication, resultNotLeader, setUp, tearDown, 0, NULL) { struct fixture *f = data; BOOTSTRAP_START_AND_ELECT; /* Set a very high latency for the second server's outgoing messages, so the * first server won't get notified about the results for a while. */ CLUSTER_SET_NETWORK_LATENCY(1, 400); /* Set a low election timeout on the first server so it will step down very * soon. */ raft_fixture_set_randomized_election_timeout(&f->cluster, 0, 200); raft_set_election_timeout(CLUSTER_RAFT(0), 200); /* Eventually leader steps down and becomes candidate. */ CLUSTER_STEP_UNTIL_STATE_IS(0, RAFT_CANDIDATE, 2000); /* The AppendEntries result eventually gets delivered, but the candidate * ignores it. */ CLUSTER_STEP_UNTIL_ELAPSED(400); return MUNIT_OK; } /* If the response has a term which is lower than the server's one, it's * ignored. */ TEST(replication, resultLowerTerm, setUp, tearDown, 0, NULL) { struct fixture *f = data; CLUSTER_GROW; BOOTSTRAP_START_AND_ELECT; /* Set a very high latency for the second server's outgoing messages, so the * first server won't get notified about the results for a while. 
*/ CLUSTER_SET_NETWORK_LATENCY(1, 2000); /* Set a high election timeout on server 1, so it won't become candidate */ raft_fixture_set_randomized_election_timeout(&f->cluster, 1, 2000); raft_set_election_timeout(CLUSTER_RAFT(1), 2000); /* Disconnect server 0 and set a low election timeout on it so it will step * down very soon. */ CLUSTER_SATURATE_BOTHWAYS(0, 2); raft_fixture_set_randomized_election_timeout(&f->cluster, 0, 200); raft_set_election_timeout(CLUSTER_RAFT(0), 200); CLUSTER_STEP_UNTIL_STATE_IS(0, RAFT_FOLLOWER, 2000); /* Make server 0 become leader again. */ CLUSTER_DESATURATE_BOTHWAYS(0, 2); CLUSTER_STEP_UNTIL_STATE_IS(0, RAFT_LEADER, 4000); /* Eventually deliver the result message. */ CLUSTER_STEP_UNTIL_ELAPSED(2500); return MUNIT_OK; } /* If the response has a term which is higher than the server's one, step down * to follower. */ TEST(replication, resultHigherTerm, setUp, tearDown, 0, NULL) { struct fixture *f = data; CLUSTER_GROW; BOOTSTRAP_START_AND_ELECT; /* Set a very high election timeout for server 0 so it won't step down. */ raft_fixture_set_randomized_election_timeout(&f->cluster, 0, 5000); raft_set_election_timeout(CLUSTER_RAFT(0), 5000); /* Disconnect server 0 from the rest of the cluster. */ CLUSTER_SATURATE_BOTHWAYS(0, 1); CLUSTER_SATURATE_BOTHWAYS(0, 2); /* Eventually a new leader gets elected */ CLUSTER_STEP_UNTIL_HAS_NO_LEADER(2000); CLUSTER_STEP_UNTIL_HAS_LEADER(4000); munit_assert_int(CLUSTER_LEADER, ==, 1); /* Reconnect the old leader to the current follower, which eventually * replies with an AppendEntries result containing a higher term. */ CLUSTER_DESATURATE_BOTHWAYS(0, 2); CLUSTER_STEP_UNTIL_STATE_IS(0, RAFT_FOLLOWER, 2000); return MUNIT_OK; } /* If the response fails because of a log mismatch, the nextIndex for the server is * updated and the relevant older entries are resent. */ TEST(replication, resultRetry, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_entry entry; CLUSTER_BOOTSTRAP; /* Add an additional entry to the first server that the second server does * not have. */ entry.type = RAFT_COMMAND; entry.term = 1; FsmEncodeSetX(5, &entry.buf); CLUSTER_ADD_ENTRY(0, &entry); CLUSTER_START; CLUSTER_ELECT(0); /* The first server receives an AppendEntries result from the second server * indicating that its log does not have the entry at index 2, so it will * resend it. */ CLUSTER_STEP_UNTIL_APPLIED(1, 3, 2000); return MUNIT_OK; } static void applyAssertStatusCb(struct raft_apply *req, int status, void *result) { (void)result; int status_expected = (int)(intptr_t)(req->data); munit_assert_int(status_expected, ==, status); } /* When the leader fails to write some new entries to disk, it steps down. */ TEST(replication, diskWriteFailure, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_apply *req = munit_malloc(sizeof(*req)); req->data = (void *)(intptr_t)RAFT_IOERR; BOOTSTRAP_START_AND_ELECT; raft_fixture_append_fault(&f->cluster, 0, 0); CLUSTER_APPLY_ADD_X(0, req, 1, applyAssertStatusCb); /* The leader steps down when its disk write fails. */ CLUSTER_STEP_UNTIL_STATE_IS(0, RAFT_FOLLOWER, 2000); free(req); return MUNIT_OK; } /* A follower updates its term number while persisting entries. 
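* The interesting race is between a slow disk write started under the old term and a term bump caused by a new election: the write must complete cleanly even though current_term has moved on in the meantime.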
*/ TEST(replication, newTermWhileAppending, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_apply *req = munit_malloc(sizeof(*req)); raft_term term; CLUSTER_GROW; /* Make sure that persisting entries will take a long time */ CLUSTER_SET_DISK_LATENCY(2, 3000); BOOTSTRAP_START_AND_ELECT; CLUSTER_APPLY_ADD_X(0, req, 1, NULL); /* Wait for the leader to replicate the entry */ CLUSTER_STEP_UNTIL_ELAPSED(500); /* Force a new term */ term = CLUSTER_RAFT(2)->current_term; CLUSTER_DEPOSE; CLUSTER_ELECT(1); CLUSTER_STEP_UNTIL_ELAPSED(500); munit_assert_ullong(CLUSTER_RAFT(2)->current_term, ==, term + 1); /* Wait for the long disk write to complete */ CLUSTER_STEP_UNTIL_ELAPSED(3000); free(req); return MUNIT_OK; } /* A leader with a slow disk commits an entry that it hasn't persisted yet, * because enough followers to form a majority have acknowledged that they have * appended the entry. The leader's last_stored field hence lags behind its * commit_index. A new leader gets elected, with a higher commit index, and sends * first a new entry and then a heartbeat to the old leader, which needs to update * its commit_index taking into account its lagging last_stored. */ TEST(replication, lastStoredLaggingBehindCommitIndex, setUp, tearDown, 0, NULL) { struct fixture *f = data; CLUSTER_GROW; /* Server 0 takes a long time to persist entry 2 (the barrier) */ CLUSTER_SET_DISK_LATENCY(0, 10000); /* Server 0 gets elected and creates a barrier entry at index 2 */ BOOTSTRAP_START_AND_ELECT; /* Server 0 commits and applies barrier entry 2 even though it has not persisted it * yet. */ CLUSTER_STEP_UNTIL_APPLIED(0, 2, 2000); munit_assert_int(CLUSTER_RAFT(0)->last_stored, ==, 1); munit_assert_int(CLUSTER_RAFT(0)->commit_index, ==, 2); munit_assert_int(CLUSTER_RAFT(0)->last_applied, ==, 2); /* Server 1 stored barrier entry 2, but did not yet receive a notification * from server 0 about the new commit index. */ munit_assert_int(CLUSTER_RAFT(1)->last_stored, ==, 2); munit_assert_int(CLUSTER_RAFT(1)->commit_index, ==, 1); munit_assert_int(CLUSTER_RAFT(1)->last_applied, ==, 1); /* Disconnect server 0 from server 1 and 2. */ CLUSTER_DISCONNECT(0, 1); CLUSTER_DISCONNECT(0, 2); /* Set a very high election timeout on server 0, so it won't step down for a * while, even if disconnected. */ raft_fixture_set_randomized_election_timeout(&f->cluster, 0, 10000); raft_set_election_timeout(CLUSTER_RAFT(0), 10000); /* Servers 1 and 2 eventually time out and start an election; server 1 * wins. */ CLUSTER_STEP_UNTIL_HAS_NO_LEADER(4000); CLUSTER_STEP_UNTIL_HAS_LEADER(2000); munit_assert_int(CLUSTER_LEADER, ==, 1); /* Server 1 commits the barrier entry at index 3 that it created at the * start of its term. */ CLUSTER_STEP_UNTIL_APPLIED(1, 3, 2000); /* Reconnect server 0 to server 1, which will start replicating entry 3 to * it. */ CLUSTER_RECONNECT(0, 1); CLUSTER_STEP_UNTIL_APPLIED(0, 3, 20000); return MUNIT_OK; } /* A leader with a faulty disk fails to persist the barrier entry upon election. */ TEST(replication, failPersistBarrier, setUp, tearDown, 0, NULL) { struct fixture *f = data; CLUSTER_GROW; /* Server 0 will fail to persist entry 2, a barrier */ raft_fixture_append_fault(&f->cluster, 0, 0); /* Server 0 gets elected and creates a barrier entry at index 2 */ CLUSTER_BOOTSTRAP; CLUSTER_START; CLUSTER_START_ELECT(0); /* Cluster recovers. */ CLUSTER_STEP_UNTIL_HAS_LEADER(20000); return MUNIT_OK; } /* All servers fail to persist the barrier entry upon election of the first * leader. Ensure the cluster is able to make progress afterwards. 
*/ TEST(replication, failPersistBarrierFollower, setUp, tearDown, 0, NULL) { struct fixture *f = data; CLUSTER_GROW; /* The servers will fail to persist entry 2, a barrier */ raft_fixture_append_fault(&f->cluster, 1, 0); raft_fixture_append_fault(&f->cluster, 2, 0); /* Server 0 gets elected and creates a barrier entry at index 2 */ CLUSTER_BOOTSTRAP; CLUSTER_START; CLUSTER_START_ELECT(0); CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; return MUNIT_OK; } /* A leader originates a log entry, fails to persist it, and steps down. * A follower that received the entry wins the ensuing election and sends * the same entry back to the original leader, while the original leader * still has an outgoing pending message that references its copy of the * entry. This triggers the original leader to reinstate the entry in its * log. */ TEST(replication, receiveSameWithPendingSend, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_apply req; /* Three voters. */ CLUSTER_GROW; /* Server 0 is the leader. */ BOOTSTRAP_START_AND_ELECT; /* Server 1 never gets the entry. */ raft_fixture_set_send_latency(&f->cluster, 0, 1, 10000); /* Disk write fails, but not before the entry gets to server 2. */ CLUSTER_SET_DISK_LATENCY(0, 1000); raft_fixture_append_fault(&f->cluster, 0, 0); req.data = (void *)(intptr_t)RAFT_IOERR; CLUSTER_APPLY_ADD_X(0, &req, 1, NULL); /* Server 0 steps down. */ CLUSTER_STEP_UNTIL_STATE_IS(0, RAFT_FOLLOWER, 1500); munit_assert_ullong(CLUSTER_RAFT(0)->current_term, ==, 2); ASSERT_FOLLOWER(1); ASSERT_FOLLOWER(2); /* Only server 2 has the new entry. */ munit_assert_ullong(CLUSTER_RAFT(0)->last_stored, ==, 2); munit_assert_ullong(CLUSTER_RAFT(1)->last_stored, ==, 2); munit_assert_ullong(CLUSTER_RAFT(2)->last_stored, ==, 3); /* Server 2 times out first and wins the election. */ raft_set_election_timeout(CLUSTER_RAFT(2), 500); raft_fixture_start_elect(&f->cluster, 2); CLUSTER_STEP_UNTIL_STATE_IS(2, RAFT_LEADER, 1000); munit_assert_ullong(CLUSTER_RAFT(2)->current_term, ==, 3); /* Server 0 gets the same entry back from server 2. 
*/ CLUSTER_STEP_UNTIL_APPLIED(2, 3, 1000); return MUNIT_OK; } dqlite-1.16.7/test/raft/integration/test_snapshot.c000066400000000000000000000665711465252713400224200ustar00rootroot00000000000000#include "../lib/cluster.h" #include "../lib/runner.h" /****************************************************************************** * * Fixture * *****************************************************************************/ struct fixture { FIXTURE_CLUSTER; }; static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) { struct fixture *f = munit_malloc(sizeof *f); SETUP_CLUSTER(3); CLUSTER_BOOTSTRAP; CLUSTER_START; CLUSTER_ELECT(0); return f; } static void tearDown(void *data) { struct fixture *f = data; TEAR_DOWN_CLUSTER; free(f); } /****************************************************************************** * * Helper macros * *****************************************************************************/ /* Set the snapshot threshold on all servers of the cluster */ #define SET_SNAPSHOT_THRESHOLD(VALUE) \ { \ unsigned i; \ for (i = 0; i < CLUSTER_N; i++) { \ raft_set_snapshot_threshold(CLUSTER_RAFT(i), VALUE); \ } \ } /* Set the snapshot trailing logs number on all servers of the cluster */ #define SET_SNAPSHOT_TRAILING(VALUE) \ { \ unsigned i; \ for (i = 0; i < CLUSTER_N; i++) { \ raft_set_snapshot_trailing(CLUSTER_RAFT(i), VALUE); \ } \ } /* Set the snapshot timeout on all servers of the cluster */ #define SET_SNAPSHOT_TIMEOUT(VALUE) \ { \ unsigned i; \ for (i = 0; i < CLUSTER_N; i++) { \ raft_set_install_snapshot_timeout(CLUSTER_RAFT(i), VALUE); \ } \ } static int ioMethodSnapshotPutFail(struct raft_io *raft_io, unsigned trailing, struct raft_io_snapshot_put *req, const struct raft_snapshot *snapshot, raft_io_snapshot_put_cb cb) { (void)raft_io; (void)trailing; (void)req; (void)snapshot; (void)cb; return -1; } #define SET_FAULTY_SNAPSHOT_PUT() \ { \ unsigned i; \ for (i = 0; i < CLUSTER_N; i++) { \ CLUSTER_RAFT(i)->io->snapshot_put = ioMethodSnapshotPutFail; \ } \ } static int ioMethodAsyncWorkFail(struct raft_io *raft_io, struct raft_io_async_work *req, raft_io_async_work_cb cb) { (void)raft_io; (void)req; (void)cb; return -1; } #define SET_FAULTY_ASYNC_WORK() \ { \ unsigned i; \ for (i = 0; i < CLUSTER_N; i++) { \ CLUSTER_RAFT(i)->io->async_work = ioMethodAsyncWorkFail; \ } \ } static int fsmSnapshotFail(struct raft_fsm *fsm, struct raft_buffer *bufs[], unsigned *n_bufs) { (void)fsm; (void)bufs; (void)n_bufs; return -1; } #define SET_FAULTY_SNAPSHOT_ASYNC() \ { \ unsigned i; \ for (i = 0; i < CLUSTER_N; i++) { \ CLUSTER_RAFT(i)->fsm->snapshot_async = fsmSnapshotFail; \ } \ } #define RESET_FSM_ASYNC(I) \ { \ struct raft_fsm *fsm = CLUSTER_RAFT(I)->fsm; \ FsmClose(fsm); \ FsmInitAsync(fsm, fsm->version); \ } #define SET_FAULTY_SNAPSHOT() \ { \ unsigned i; \ for (i = 0; i < CLUSTER_N; i++) { \ CLUSTER_RAFT(i)->fsm->snapshot = fsmSnapshotFail; \ } \ } /****************************************************************************** * * Successfully install a snapshot * *****************************************************************************/ SUITE(snapshot) /* Install a snapshot on a follower that has fallen behind. */ TEST(snapshot, installOne, setUp, tearDown, 0, NULL) { struct fixture *f = data; (void)params; /* Set very low threshold and trailing entries number */ SET_SNAPSHOT_THRESHOLD(3); SET_SNAPSHOT_TRAILING(1); CLUSTER_SATURATE_BOTHWAYS(0, 2); /* Apply a few of entries, to force a snapshot to be taken. 
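* With the snapshot threshold set to 3 and trailing set to 1 above, the three apply calls below push the log past the threshold: the leader snapshots, truncates all but the most recent entry, and must later send an InstallSnapshot RPC to the disconnected follower.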
*/ CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; /* Reconnect the follower and wait for it to catch up */ CLUSTER_DESATURATE_BOTHWAYS(0, 2); CLUSTER_STEP_UNTIL_APPLIED(2, 4, 5000); /* Check that the leader has sent a snapshot */ munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); munit_assert_int(CLUSTER_N_RECV(2, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); return MUNIT_OK; } /* Install snapshot times out and leader retries */ TEST(snapshot, installOneTimeOut, setUp, tearDown, 0, NULL) { struct fixture *f = data; (void)params; /* Set very low threshold and trailing entries number */ SET_SNAPSHOT_THRESHOLD(3); SET_SNAPSHOT_TRAILING(1); SET_SNAPSHOT_TIMEOUT(200); /* Apply a few entries, to force a snapshot to be taken. Drop all network * traffic between servers 0 and 2 in order for AppendEntries RPCs to not be * replicated */ CLUSTER_SATURATE_BOTHWAYS(0, 2); CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; /* Reconnect both servers and set a high disk latency on server 2 so that * the InstallSnapshot RPC will time out */ CLUSTER_SET_DISK_LATENCY(2, 300); CLUSTER_DESATURATE_BOTHWAYS(0, 2); /* Wait a while and check that the leader has sent a snapshot */ CLUSTER_STEP_UNTIL_ELAPSED(300); munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); munit_assert_int(CLUSTER_N_RECV(2, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); /* Wait for the snapshot to be installed */ CLUSTER_STEP_UNTIL_APPLIED(2, 4, 5000); /* Assert that the leader has retried the InstallSnapshot RPC */ munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 2); munit_assert_int(CLUSTER_N_RECV(2, RAFT_IO_INSTALL_SNAPSHOT), ==, 2); return MUNIT_OK; } /* Install snapshot to an offline node */ TEST(snapshot, installOneDisconnectedFromBeginningReconnects, setUp, tearDown, 0, NULL) { struct fixture *f = data; (void)params; /* Set very low threshold and trailing entries number */ SET_SNAPSHOT_THRESHOLD(3); SET_SNAPSHOT_TRAILING(1); SET_SNAPSHOT_TIMEOUT(200); /* Apply a few entries, to force a snapshot to be taken. 
Disconnect * servers 0 and 2 so that the network calls return failure status */ CLUSTER_DISCONNECT(0, 2); CLUSTER_DISCONNECT(2, 0); CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; /* Wait a while so leader detects offline node */ CLUSTER_STEP_UNTIL_ELAPSED(2000); /* Assert that the leader doesn't try sending a snapshot to an offline node */ munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 0); munit_assert_int(CLUSTER_N_RECV(2, RAFT_IO_INSTALL_SNAPSHOT), ==, 0); CLUSTER_RECONNECT(0, 2); CLUSTER_RECONNECT(2, 0); /* Wait for the snapshot to be installed */ CLUSTER_STEP_UNTIL_APPLIED(2, 4, 5000); /* Assert that the leader has sent an InstallSnapshot RPC */ munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); munit_assert_int(CLUSTER_N_RECV(2, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); return MUNIT_OK; } /* Install snapshot to an offline node that went down during operation */ TEST(snapshot, installOneDisconnectedDuringOperationReconnects, setUp, tearDown, 0, NULL) { struct fixture *f = data; (void)params; /* Set very low threshold and trailing entries number */ SET_SNAPSHOT_THRESHOLD(3); SET_SNAPSHOT_TRAILING(1); SET_SNAPSHOT_TIMEOUT(200); /* Apply a few entries */ CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; /* Wait for follower to catch up */ CLUSTER_STEP_UNTIL_APPLIED(2, 5, 5000); /* Assert that the leader hasn't sent an InstallSnapshot RPC */ munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 0); CLUSTER_DISCONNECT(0, 2); CLUSTER_DISCONNECT(2, 0); /* Wait a while so leader detects offline node */ CLUSTER_STEP_UNTIL_ELAPSED(2000); /* Apply a few more entries */ CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; /* Assert that the leader doesn't try sending snapshot to an offline node */ munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 0); munit_assert_int(CLUSTER_N_RECV(2, RAFT_IO_INSTALL_SNAPSHOT), ==, 0); CLUSTER_RECONNECT(0, 2); CLUSTER_RECONNECT(2, 0); CLUSTER_STEP_UNTIL_APPLIED(2, 8, 5000); /* Assert that the leader has tried sending an InstallSnapshot RPC */ munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); munit_assert_int(CLUSTER_N_RECV(2, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); return MUNIT_OK; } /* No snapshots sent to killed nodes */ TEST(snapshot, noSnapshotInstallToKilled, setUp, tearDown, 0, NULL) { struct fixture *f = data; (void)params; /* Set very low threshold and trailing entries number */ SET_SNAPSHOT_THRESHOLD(3); SET_SNAPSHOT_TRAILING(1); SET_SNAPSHOT_TIMEOUT(200); /* Kill a server */ CLUSTER_KILL(2); /* Apply a few entries */ CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; /* Wait a while */ CLUSTER_STEP_UNTIL_ELAPSED(4000); /* Assert that the leader hasn't sent an InstallSnapshot RPC */ munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 0); return MUNIT_OK; } /* Install snapshot times out and leader retries, afterwards AppendEntries * resume */ TEST(snapshot, installOneTimeOutAppendAfter, setUp, tearDown, 0, NULL) { struct fixture *f = data; (void)params; /* Set very low threshold and trailing entries number */ SET_SNAPSHOT_THRESHOLD(3); SET_SNAPSHOT_TRAILING(1); SET_SNAPSHOT_TIMEOUT(200); /* Apply a few entries, to force a snapshot to be taken. 
Drop all network * traffic between servers 0 and 2 in order for AppendEntries RPCs to not be * replicated */ CLUSTER_SATURATE_BOTHWAYS(0, 2); CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; /* Reconnect both servers and set a high disk latency on server 2 so that * the InstallSnapshot RPC will time out */ CLUSTER_SET_DISK_LATENCY(2, 300); CLUSTER_DESATURATE_BOTHWAYS(0, 2); /* Wait for the snapshot to be installed */ CLUSTER_STEP_UNTIL_APPLIED(2, 4, 5000); /* Append a few entries and check if they are replicated */ CLUSTER_MAKE_PROGRESS; CLUSTER_STEP_UNTIL_APPLIED(2, 5, 5000); /* Assert that the leader has retried the InstallSnapshot RPC */ munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 2); return MUNIT_OK; } /* Install 2 snapshots that both time out and assure the follower catches up */ TEST(snapshot, installMultipleTimeOut, setUp, tearDown, 0, NULL) { struct fixture *f = data; (void)params; /* Set very low threshold and trailing entries number */ SET_SNAPSHOT_THRESHOLD(3); SET_SNAPSHOT_TRAILING(1); SET_SNAPSHOT_TIMEOUT(200); /* Apply a few entries, to force a snapshot to be taken. Drop all network * traffic between servers 0 and 2 in order for AppendEntries RPCs to not be * replicated */ CLUSTER_SATURATE_BOTHWAYS(0, 2); CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; /* Reconnect both servers and set a high disk latency on server 2 so that * the InstallSnapshot RPC will time out */ CLUSTER_SET_DISK_LATENCY(2, 300); CLUSTER_DESATURATE_BOTHWAYS(0, 2); /* Step until the snapshot times out */ CLUSTER_STEP_UNTIL_ELAPSED(400); /* Apply a few more entries, to force a new snapshot to be taken. Drop * all traffic between servers 0 and 2 in order for AppendEntries RPCs to * not be replicated */ CLUSTER_SATURATE_BOTHWAYS(0, 2); CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; /* Reconnect the follower */ CLUSTER_DESATURATE_BOTHWAYS(0, 2); CLUSTER_STEP_UNTIL_APPLIED(2, 7, 5000); /* Assert that the leader has sent multiple InstallSnapshot RPCs */ munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), >=, 2); return MUNIT_OK; } /* Install 2 snapshots that both time out, launch a few regular AppendEntries * and assure the follower catches up */ TEST(snapshot, installMultipleTimeOutAppendAfter, setUp, tearDown, 0, NULL) { struct fixture *f = data; (void)params; /* Set very low threshold and trailing entries number */ SET_SNAPSHOT_THRESHOLD(3); SET_SNAPSHOT_TRAILING(1); SET_SNAPSHOT_TIMEOUT(200); /* Apply a few entries, to force a snapshot to be taken. Drop all network * traffic between servers 0 and 2 in order for AppendEntries RPCs to not be * replicated */ CLUSTER_SATURATE_BOTHWAYS(0, 2); CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; /* Reconnect both servers and set a high disk latency on server 2 so that * the InstallSnapshot RPC will time out */ CLUSTER_SET_DISK_LATENCY(2, 300); CLUSTER_DESATURATE_BOTHWAYS(0, 2); /* Step until the snapshot times out */ CLUSTER_STEP_UNTIL_ELAPSED(400); /* Apply a few more entries, to force a new snapshot to be taken. 
Drop * all traffic between servers 0 and 2 in order for AppendEntries RPCs to * not be replicated */ CLUSTER_SATURATE_BOTHWAYS(0, 2); CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; /* Reconnect the follower */ CLUSTER_DESATURATE_BOTHWAYS(0, 2); /* Append a few entries and make sure the follower catches up */ CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; CLUSTER_STEP_UNTIL_APPLIED(2, 9, 5000); /* Assert that the leader has sent multiple InstallSnapshot RPCs */ munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), >=, 2); return MUNIT_OK; } static bool server_installing_snapshot(struct raft_fixture *f, void *data) { (void)f; const struct raft *r = data; return r->snapshot.put.data != NULL && r->last_stored == 0; } static bool server_taking_snapshot(struct raft_fixture *f, void *data) { (void)f; const struct raft *r = data; return r->snapshot.put.data != NULL && r->last_stored != 0; } static bool server_snapshot_done(struct raft_fixture *f, void *data) { (void)f; const struct raft *r = data; return r->snapshot.put.data == NULL; } /* Follower receives HeartBeats during the installation of a snapshot */ TEST(snapshot, installSnapshotHeartBeats, setUp, tearDown, 0, NULL) { struct fixture *f = data; (void)params; /* Set very low threshold and trailing entries number */ SET_SNAPSHOT_THRESHOLD(3); SET_SNAPSHOT_TRAILING(1); CLUSTER_SATURATE_BOTHWAYS(0, 1); /* Apply a few entries, to force a snapshot to be taken. */ CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; /* Set a large disk latency on the follower, this will allow some * heartbeats to be sent during the snapshot installation */ CLUSTER_SET_DISK_LATENCY(1, 2000); munit_assert_uint(CLUSTER_N_RECV(1, RAFT_IO_INSTALL_SNAPSHOT), ==, 0); /* Step the cluster until server 1 installs a snapshot */ const struct raft *r = CLUSTER_RAFT(1); CLUSTER_DESATURATE_BOTHWAYS(0, 1); CLUSTER_STEP_UNTIL(server_installing_snapshot, (void *)r, 2000); munit_assert_uint(CLUSTER_N_RECV(1, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); /* Count the number of AppendEntries RPCs received during the snapshot * install*/ unsigned before = CLUSTER_N_RECV(1, RAFT_IO_APPEND_ENTRIES); CLUSTER_STEP_UNTIL(server_snapshot_done, (void *)r, 5000); unsigned after = CLUSTER_N_RECV(1, RAFT_IO_APPEND_ENTRIES); munit_assert_uint(before, <, after); /* Check that the InstallSnapshot RPC was not resent */ munit_assert_uint(CLUSTER_N_RECV(1, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); /* Check that the snapshot was applied and we can still make progress */ CLUSTER_STEP_UNTIL_APPLIED(1, 4, 5000); CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; CLUSTER_STEP_UNTIL_APPLIED(1, 6, 5000); return MUNIT_OK; } /* InstallSnapshot RPC arrives while persisting Entries */ TEST(snapshot, installSnapshotDuringEntriesWrite, setUp, tearDown, 0, NULL) { struct fixture *f = data; (void)params; /* Set a large disk latency on the follower, this will allow an * InstallSnapshot RPC to arrive while the entries are being persisted. */ CLUSTER_SET_DISK_LATENCY(1, 2000); SET_SNAPSHOT_THRESHOLD(3); SET_SNAPSHOT_TRAILING(1); /* Replicate some entries, these will take a while to persist */ CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; /* Make sure leader can't successfully send any more entries */ CLUSTER_DISCONNECT(0, 1); CLUSTER_MAKE_PROGRESS; /* Snapshot taken here */ CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; /* Snapshot taken here */ CLUSTER_MAKE_PROGRESS; /* Snapshot with index 6 is sent while follower is still writing the entries * to disk that arrived before the disconnect. 
*/ CLUSTER_RECONNECT(0, 1); /* Make sure follower is up to date */ CLUSTER_STEP_UNTIL_APPLIED(1, 7, 5000); return MUNIT_OK; } static char *fsm_version[] = {"1", "2", "3", NULL}; static char *fsm_snapshot_async[] = {"0", "1", NULL}; static MunitParameterEnum fsm_snapshot_async_params[] = { {CLUSTER_SS_ASYNC_PARAM, fsm_snapshot_async}, {CLUSTER_FSM_VERSION_PARAM, fsm_version}, {NULL, NULL}, }; static char *fsm_snapshot_only_async[] = {"1", NULL}; static char *fsm_version_only_async[] = {"3", NULL}; static MunitParameterEnum fsm_snapshot_only_async_params[] = { {CLUSTER_SS_ASYNC_PARAM, fsm_snapshot_only_async}, {CLUSTER_FSM_VERSION_PARAM, fsm_version_only_async}, {NULL, NULL}, }; /* Follower receives AppendEntries RPCs while taking a snapshot */ TEST(snapshot, takeSnapshotAppendEntries, setUp, tearDown, 0, fsm_snapshot_async_params) { struct fixture *f = data; (void)params; /* Set very low threshold and trailing entries number */ SET_SNAPSHOT_THRESHOLD(3); SET_SNAPSHOT_TRAILING(1); /* Set a large disk latency on the follower, this will allow AppendEntries * to be sent while a snapshot is taken */ CLUSTER_SET_DISK_LATENCY(1, 2000); /* Apply a few of entries, to force a snapshot to be taken. */ CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; /* Step the cluster until server 1 takes a snapshot */ const struct raft *r = CLUSTER_RAFT(1); CLUSTER_STEP_UNTIL(server_taking_snapshot, (void *)r, 3000); /* Send AppendEntries RPCs while server 1 is taking a snapshot */ static struct raft_apply reqs[5]; for (int i = 0; i < 5; i++) { CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, &reqs[i], 1, NULL); } CLUSTER_STEP_UNTIL(server_snapshot_done, (void *)r, 5000); /* Make sure the AppendEntries are applied and we can make progress */ CLUSTER_STEP_UNTIL_APPLIED(1, 9, 5000); CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; CLUSTER_STEP_UNTIL_APPLIED(1, 11, 5000); return MUNIT_OK; } TEST(snapshot, takeSnapshotSnapshotPutFail, setUp, tearDown, 0, fsm_snapshot_async_params) { struct fixture *f = data; (void)params; SET_FAULTY_SNAPSHOT_PUT(); /* Set very low threshold and trailing entries number */ SET_SNAPSHOT_THRESHOLD(3); SET_SNAPSHOT_TRAILING(1); /* Apply a few of entries, to force a snapshot to be taken. */ CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; /* No crash or leaks have occurred */ return MUNIT_OK; } TEST(snapshot, takeSnapshotAsyncWorkFail, setUp, tearDown, 0, fsm_snapshot_async_params) { struct fixture *f = data; (void)params; SET_FAULTY_ASYNC_WORK(); /* Set very low threshold and trailing entries number */ SET_SNAPSHOT_THRESHOLD(3); SET_SNAPSHOT_TRAILING(1); /* Apply a few of entries, to force a snapshot to be taken. */ CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; /* No crash or leaks have occurred */ return MUNIT_OK; } TEST(snapshot, takeSnapshotAsyncFail, setUp, tearDown, 0, fsm_snapshot_only_async_params) { struct fixture *f = data; (void)params; SET_FAULTY_SNAPSHOT_ASYNC(); /* Set very low threshold and trailing entries number */ SET_SNAPSHOT_THRESHOLD(3); SET_SNAPSHOT_TRAILING(1); /* Apply a few of entries, to force a snapshot to be taken. 
*/ CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; /* No crash or leaks have occurred */ return MUNIT_OK; } TEST(snapshot, takeSnapshotAsyncFailOnce, setUp, tearDown, 0, fsm_snapshot_only_async_params) { struct fixture *f = data; (void)params; SET_FAULTY_SNAPSHOT_ASYNC(); /* Set very low threshold and trailing entries number */ SET_SNAPSHOT_THRESHOLD(3); SET_SNAPSHOT_TRAILING(1); CLUSTER_SATURATE_BOTHWAYS(0, 2); /* Apply a few entries, to force a snapshot to be taken. */ CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; /* Wait for snapshot to fail. */ CLUSTER_STEP_UNTIL_ELAPSED(200); /* Snapshot will have failed here. */ /* Set the non-faulty fsm->snapshot_async function */ RESET_FSM_ASYNC(CLUSTER_LEADER); CLUSTER_MAKE_PROGRESS; /* Wait for snapshot to be finished */ CLUSTER_STEP_UNTIL_ELAPSED(200); /* Reconnect the follower and wait for it to catch up */ CLUSTER_DESATURATE_BOTHWAYS(0, 2); CLUSTER_STEP_UNTIL_APPLIED(2, 4, 5000); /* Check that the leader has sent a snapshot */ munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); munit_assert_int(CLUSTER_N_RECV(2, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); return MUNIT_OK; } TEST(snapshot, takeSnapshotFail, setUp, tearDown, 0, fsm_snapshot_async_params) { struct fixture *f = data; (void)params; SET_FAULTY_SNAPSHOT(); /* Set very low threshold and trailing entries number */ SET_SNAPSHOT_THRESHOLD(3); SET_SNAPSHOT_TRAILING(1); /* Apply a few entries, to force a snapshot to be taken. */ CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; /* No crash or leaks have occurred */ return MUNIT_OK; } /* A follower doesn't convert to candidate state while it's installing a * snapshot. */ TEST(snapshot, snapshotBlocksCandidate, setUp, tearDown, 0, NULL) { struct fixture *f = data; (void)params; /* Set very low threshold and trailing entries number */ SET_SNAPSHOT_THRESHOLD(3); SET_SNAPSHOT_TRAILING(1); /* Apply a few entries, to force a snapshot to be taken. Drop all network * traffic between servers 0 and 2 in order for AppendEntries RPCs to not be * replicated */ CLUSTER_SATURATE_BOTHWAYS(0, 2); CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; /* Reconnect both servers and set a high disk latency on server 2 */ CLUSTER_SET_DISK_LATENCY(2, 5000); CLUSTER_DESATURATE_BOTHWAYS(0, 2); /* Wait a while and check that the leader has sent a snapshot */ CLUSTER_STEP_UNTIL_ELAPSED(500); munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); munit_assert_int(CLUSTER_N_RECV(2, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); /* Disconnect the servers again so that heartbeats, etc. won't arrive */ CLUSTER_SATURATE_BOTHWAYS(0, 2); munit_assert_int(CLUSTER_STATE(2), ==, RAFT_FOLLOWER); munit_assert_ptr(CLUSTER_RAFT(2)->snapshot.put.data, !=, NULL); CLUSTER_STEP_UNTIL_ELAPSED(4000); munit_assert_int(CLUSTER_STATE(2), ==, RAFT_FOLLOWER); return MUNIT_OK; } /* An UNAVAILABLE node doesn't install snapshots. */ TEST(snapshot, unavailableDiscardsSnapshot, setUp, tearDown, 0, NULL) { struct fixture *f = data; (void)params; /* Set very low threshold and trailing entries number */ SET_SNAPSHOT_THRESHOLD(3); SET_SNAPSHOT_TRAILING(1); /* Apply a few entries, to force a snapshot to be taken. 
Drop all network * traffic between servers 0 and 2 in order for AppendEntries RPCs to not be * replicated */ CLUSTER_SATURATE_BOTHWAYS(0, 2); CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; /* Reconnect both servers */ CLUSTER_SET_DISK_LATENCY(2, 600); CLUSTER_DESATURATE_BOTHWAYS(0, 2); /* Wait a while and check that the leader has sent a snapshot */ CLUSTER_STEP_UNTIL_ELAPSED(500); munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); munit_assert_int(CLUSTER_N_RECV(2, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); raft_fixture_make_unavailable(&f->cluster, 2); CLUSTER_STEP_UNTIL_ELAPSED(500); munit_assert_uint64(raft_last_applied(CLUSTER_RAFT(2)), ==, 1); return MUNIT_OK; } /* A new term starts while a node is installing a snapshot. */ TEST(snapshot, newTermWhileInstalling, setUp, tearDown, 0, NULL) { struct fixture *f = data; (void)params; /* Set very low threshold and trailing entries number */ SET_SNAPSHOT_THRESHOLD(3); SET_SNAPSHOT_TRAILING(1); /* Apply a few of entries, to force a snapshot to be taken. Drop all network * traffic between servers 0 and 2 in order for AppendEntries RPCs to not be * replicated */ CLUSTER_SATURATE_BOTHWAYS(0, 2); CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; CLUSTER_MAKE_PROGRESS; /* Reconnect both servers */ CLUSTER_SET_DISK_LATENCY(2, 3000); CLUSTER_DESATURATE_BOTHWAYS(0, 2); /* Wait a while and check that the leader has sent a snapshot */ CLUSTER_STEP_UNTIL_ELAPSED(500); munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); munit_assert_int(CLUSTER_N_RECV(2, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); /* Force a new term to start */ CLUSTER_DEPOSE; CLUSTER_ELECT(1); CLUSTER_STEP_UNTIL_ELAPSED(1000); return MUNIT_OK; } dqlite-1.16.7/test/raft/integration/test_start.c000066400000000000000000000150661465252713400217070ustar00rootroot00000000000000#include "../lib/cluster.h" #include "../lib/runner.h" /****************************************************************************** * * Fixture with a fake raft_io instance. * *****************************************************************************/ struct fixture { FIXTURE_CLUSTER; }; /****************************************************************************** * * Helper macros * *****************************************************************************/ /* Bootstrap the I'th server. */ #define BOOTSTRAP(I) \ do { \ struct raft_configuration _configuration; \ int _rv; \ struct raft *_raft; \ CLUSTER_CONFIGURATION(&_configuration); \ _raft = CLUSTER_RAFT(I); \ _rv = raft_bootstrap(_raft, &_configuration); \ munit_assert_int(_rv, ==, 0); \ raft_configuration_close(&_configuration); \ } while (0) /****************************************************************************** * * Set up a cluster with a single server. * *****************************************************************************/ static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) { struct fixture *f = munit_malloc(sizeof *f); SETUP_CLUSTER(1); return f; } static void tearDown(void *data) { struct fixture *f = data; TEAR_DOWN_CLUSTER; free(f); } /****************************************************************************** * * raft_start * *****************************************************************************/ SUITE(raft_start) /* There are two servers. The first has a snapshot present and no other * entries. 
*/ TEST(raft_start, oneSnapshotAndNoEntries, setUp, tearDown, 0, NULL) { struct fixture *f = data; CLUSTER_GROW; CLUSTER_SET_SNAPSHOT(0 /* server index */, 6 /* last index */, 2 /* last term */, 1 /* conf index */, 5 /* x */, 7 /* y */); CLUSTER_SET_TERM(0, 2); BOOTSTRAP(1); CLUSTER_START; CLUSTER_MAKE_PROGRESS; return MUNIT_OK; } /* There are two servers. The first has a snapshot along with some follow-up * entries. */ TEST(raft_start, oneSnapshotAndSomeFollowUpEntries, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_entry entries[2]; struct raft_fsm *fsm; CLUSTER_GROW; BOOTSTRAP(1); entries[0].type = RAFT_COMMAND; entries[0].term = 2; FsmEncodeSetX(6, &entries[0].buf); entries[1].type = RAFT_COMMAND; entries[1].term = 2; FsmEncodeAddY(2, &entries[1].buf); CLUSTER_SET_SNAPSHOT(0 /* */, 6 /* last index */, 2 /* last term */, 1 /* conf index */, 5 /* x */, 7 /* y */); CLUSTER_ADD_ENTRY(0, &entries[0]); CLUSTER_ADD_ENTRY(1, &entries[1]); CLUSTER_SET_TERM(0, 2); CLUSTER_START; CLUSTER_MAKE_PROGRESS; fsm = CLUSTER_FSM(0); munit_assert_int(FsmGetX(fsm), ==, 7); return MUNIT_OK; } /****************************************************************************** * * Start with entries present on disk. * *****************************************************************************/ /* There are 3 servers. The first has no entries present at all */ TEST(raft_start, noEntries, setUp, tearDown, 0, NULL) { struct fixture *f = data; CLUSTER_GROW; CLUSTER_GROW; BOOTSTRAP(1); BOOTSTRAP(2); CLUSTER_START; CLUSTER_MAKE_PROGRESS; return MUNIT_OK; } /* There are 3 servers, the first has some entries, the others don't. */ TEST(raft_start, twoEntries, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_configuration configuration; struct raft_entry entry; struct raft_fsm *fsm; unsigned i; int rv; CLUSTER_GROW; CLUSTER_GROW; CLUSTER_CONFIGURATION(&configuration); rv = raft_bootstrap(CLUSTER_RAFT(0), &configuration); munit_assert_int(rv, ==, 0); raft_configuration_close(&configuration); entry.type = RAFT_COMMAND; entry.term = 3; FsmEncodeSetX(123, &entry.buf); CLUSTER_ADD_ENTRY(0, &entry); CLUSTER_SET_TERM(0, 3); BOOTSTRAP(1); BOOTSTRAP(2); CLUSTER_START; CLUSTER_ELECT(0); CLUSTER_MAKE_PROGRESS; CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_N, 3, 3000); for (i = 0; i < CLUSTER_N; i++) { fsm = CLUSTER_FSM(i); munit_assert_int(FsmGetX(fsm), ==, 124); } return MUNIT_OK; } /* There is a single voting server in the cluster, which immediately elects * itself when starting. */ TEST(raft_start, singleVotingSelfElect, setUp, tearDown, 0, NULL) { struct fixture *f = data; CLUSTER_BOOTSTRAP; CLUSTER_START; munit_assert_int(CLUSTER_STATE(0), ==, RAFT_LEADER); CLUSTER_MAKE_PROGRESS; return MUNIT_OK; } /* There are two servers in the cluster, one is voting and the other is * not. When started, the non-voting server does not elect itself. */ TEST(raft_start, singleVotingNotUs, setUp, tearDown, 0, NULL) { struct fixture *f = data; CLUSTER_GROW; CLUSTER_BOOTSTRAP_N_VOTING(1); CLUSTER_START; munit_assert_int(CLUSTER_STATE(1), ==, RAFT_FOLLOWER); CLUSTER_MAKE_PROGRESS; return MUNIT_OK; } static void state_cb(struct raft *r, unsigned short old, unsigned short new) { munit_assert_ushort(old, !=, new); r->data = (void *)(uintptr_t)0xFEEDBEEF; } /* There is a single voting server in the cluster, register a state_cb and * assert that it's called because the node will progress to leader. 
*/ TEST(raft_start, singleVotingWithStateCb, setUp, tearDown, 0, NULL) { struct fixture *f = data; CLUSTER_BOOTSTRAP; struct raft *r = CLUSTER_RAFT(0); r->data = (void *)(uintptr_t)0; raft_register_state_cb(r, state_cb); CLUSTER_START; munit_assert_uint((uintptr_t)r->data, ==, 0xFEEDBEEF); return MUNIT_OK; } dqlite-1.16.7/test/raft/integration/test_strerror.c000066400000000000000000000026361465252713400224330ustar00rootroot00000000000000#include "../../../src/raft.h" #include "../lib/runner.h" /****************************************************************************** * * raft_strerror * *****************************************************************************/ SUITE(raft_strerror) #define ERR_CODE_MAP(X) \ X(RAFT_NOMEM) \ X(RAFT_BADID) \ X(RAFT_DUPLICATEID) \ X(RAFT_DUPLICATEADDRESS) \ X(RAFT_BADROLE) \ X(RAFT_MALFORMED) \ X(RAFT_NOTLEADER) \ X(RAFT_LEADERSHIPLOST) \ X(RAFT_SHUTDOWN) \ X(RAFT_CANTBOOTSTRAP) \ X(RAFT_CANTCHANGE) \ X(RAFT_CORRUPT) \ X(RAFT_CANCELED) \ X(RAFT_NAMETOOLONG) \ X(RAFT_TOOBIG) \ X(RAFT_NOCONNECTION) \ X(RAFT_BUSY) \ X(RAFT_IOERR) #define TEST_CASE_STRERROR(CODE) \ TEST(raft_strerror, CODE, NULL, NULL, 0, NULL) \ { \ (void)data; \ (void)params; \ munit_assert_not_null(raft_strerror(CODE)); \ return MUNIT_OK; \ } ERR_CODE_MAP(TEST_CASE_STRERROR) TEST(raft_strerror, default, NULL, NULL, 0, NULL) { (void)data; (void)params; munit_assert_string_equal(raft_strerror(666), "unknown error"); return MUNIT_OK; } dqlite-1.16.7/test/raft/integration/test_tick.c000066400000000000000000000162751465252713400215070ustar00rootroot00000000000000#include "../lib/cluster.h" #include "../lib/runner.h" /****************************************************************************** * * Fixture * *****************************************************************************/ struct fixture { FIXTURE_CLUSTER; }; static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) { struct fixture *f = munit_malloc(sizeof *f); const char *n_voting_param = munit_parameters_get(params, "n_voting"); unsigned n = 3; unsigned n_voting = n; if (n_voting_param != NULL) { n_voting = atoi(n_voting_param); } SETUP_CLUSTER(n); CLUSTER_BOOTSTRAP_N_VOTING(n_voting); CLUSTER_START; return f; } static void tearDown(void *data) { struct fixture *f = data; TEAR_DOWN_CLUSTER; free(f); } /****************************************************************************** * * Assertions * *****************************************************************************/ /* Assert the current value of the timer of the I'th raft instance */ #define ASSERT_ELECTION_TIMER(I, MSECS) \ { \ struct raft *raft_ = CLUSTER_RAFT(I); \ munit_assert_int( \ raft_->io->time(raft_->io) - raft_->election_timer_start, ==, \ MSECS); \ } /* Assert the current state of the I'th raft instance. */ #define ASSERT_STATE(I, STATE) munit_assert_int(CLUSTER_STATE(I), ==, STATE); /****************************************************************************** * * Tick callback * *****************************************************************************/ SUITE(tick) /* Internal timers are updated according to the given time delta. 
*/ TEST(tick, electionTimer, setUp, tearDown, 0, NULL) { struct fixture *f = data; (void)params; CLUSTER_STEP; ASSERT_ELECTION_TIMER(0, 100); CLUSTER_STEP; ASSERT_ELECTION_TIMER(1, 100); CLUSTER_STEP; ASSERT_ELECTION_TIMER(2, 100); CLUSTER_STEP; ASSERT_ELECTION_TIMER(0, 200); return MUNIT_OK; } /* If the election timeout expires while the follower is a voting server that * hasn't voted yet in this term, it becomes candidate and starts a new * election. */ TEST(tick, candidate, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft *raft = CLUSTER_RAFT(0); (void)params; CLUSTER_STEP_UNTIL_ELAPSED( raft->follower_state.randomized_election_timeout); /* The term has been incremented. */ munit_assert_int(raft->current_term, ==, 2); /* We have voted for ourselves. */ munit_assert_int(raft->voted_for, ==, 1); /* We are candidate */ ASSERT_STATE(0, RAFT_CANDIDATE); /* The votes array is initialized */ munit_assert_ptr_not_null(raft->candidate_state.votes); munit_assert_true(raft->candidate_state.votes[0]); munit_assert_false(raft->candidate_state.votes[1]); return MUNIT_OK; } /* If the election timeout has not elapsed, stay follower. */ TEST(tick, electionTimerNotExpired, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft *raft = CLUSTER_RAFT(0); (void)params; CLUSTER_STEP_UNTIL_ELAPSED( raft->follower_state.randomized_election_timeout - 100); ASSERT_STATE(0, RAFT_FOLLOWER); return MUNIT_OK; } static char *elapse_non_voter_n_voting[] = {"1", NULL}; static MunitParameterEnum elapse_non_voter_params[] = { {"n_voting", elapse_non_voter_n_voting}, {NULL, NULL}, }; /* If the election timeout has elapsed, but we're not voters, stay follower. */ TEST(tick, not_voter, setUp, tearDown, 0, elapse_non_voter_params) { struct fixture *f = data; struct raft *raft = CLUSTER_RAFT(1); (void)params; /* Prevent the timer of the first server from expiring. */ raft_fixture_set_randomized_election_timeout(&f->cluster, 0, 2000); raft_set_election_timeout(CLUSTER_RAFT(0), 2000); CLUSTER_STEP_UNTIL_ELAPSED( raft->follower_state.randomized_election_timeout + 100); ASSERT_STATE(1, RAFT_FOLLOWER); return MUNIT_OK; } /* If we're leader and the election timeout elapses without hearing from a * majority of the cluster, step down. */ TEST(tick, no_contact, setUp, tearDown, 0, NULL) { struct fixture *f = data; (void)params; CLUSTER_ELECT(0); CLUSTER_SATURATE_BOTHWAYS(0, 1); CLUSTER_SATURATE_BOTHWAYS(0, 2); /* Wait for the leader to step down. */ CLUSTER_STEP_UNTIL_STATE_IS(0, RAFT_FOLLOWER, 2000); return MUNIT_OK; } /* If we're candidate and the election timeout has elapsed, start a new * election. */ TEST(tick, new_election, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft *raft = CLUSTER_RAFT(0); (void)params; CLUSTER_SATURATE_BOTHWAYS(0, 1); CLUSTER_SATURATE_BOTHWAYS(0, 2); /* Become candidate */ CLUSTER_STEP_UNTIL_ELAPSED( raft->follower_state.randomized_election_timeout); /* Expire the election timeout */ CLUSTER_STEP_UNTIL_ELAPSED( raft->candidate_state.randomized_election_timeout); /* The term has been incremented and saved to stable store. */ munit_assert_int(raft->current_term, ==, 3); /* We have voted for ourselves. 
*/ munit_assert_int(raft->voted_for, ==, 1); /* We are still candidate */ ASSERT_STATE(0, RAFT_CANDIDATE); /* The votes array is initialized */ munit_assert_ptr_not_null(raft->candidate_state.votes); munit_assert_true(raft->candidate_state.votes[0]); munit_assert_false(raft->candidate_state.votes[1]); return MUNIT_OK; } /* If the election timeout has not elapsed, stay candidate. */ TEST(tick, during_election, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft *raft = CLUSTER_RAFT(0); (void)params; CLUSTER_SATURATE_BOTHWAYS(0, 1); CLUSTER_SATURATE_BOTHWAYS(0, 2); /* Become candidate */ CLUSTER_STEP_UNTIL_ELAPSED( raft->follower_state.randomized_election_timeout); /* Make some time elapse, but not enough to trigger the timeout */ CLUSTER_STEP_UNTIL_ELAPSED( raft->candidate_state.randomized_election_timeout - 100); /* We are still candidate at the same term */ ASSERT_STATE(0, RAFT_CANDIDATE); munit_assert_int(raft->current_term, ==, 2); return MUNIT_OK; } static char *elapse_request_vote_only_to_voters_n_voting[] = {"2", NULL}; static MunitParameterEnum elapse_request_vote_only_to_voters_params[] = { {"n_voting", elapse_request_vote_only_to_voters_n_voting}, {NULL, NULL}, }; /* Vote requests are sent only to voting servers. */ TEST(tick, request_vote_only_to_voters, setUp, tearDown, 0, elapse_request_vote_only_to_voters_params) { struct fixture *f = data; struct raft *raft = CLUSTER_RAFT(0); (void)params; CLUSTER_SATURATE_BOTHWAYS(0, 1); CLUSTER_SATURATE_BOTHWAYS(0, 2); /* Become candidate */ CLUSTER_STEP_UNTIL_ELAPSED( raft->follower_state.randomized_election_timeout); /* We have sent vote requests only to the voting server */ //__assert_request_vote(f, 2, 2, 1, 1); return MUNIT_OK; } dqlite-1.16.7/test/raft/integration/test_transfer.c000066400000000000000000000144241465252713400223730ustar00rootroot00000000000000#include "../lib/cluster.h" #include "../lib/runner.h" /****************************************************************************** * * Fixture with a test raft cluster. * *****************************************************************************/ struct fixture { FIXTURE_CLUSTER; }; /****************************************************************************** * * Helper macros * *****************************************************************************/ static void transferCb(struct raft_transfer *req) { bool *done = req->data; munit_assert_false(*done); *done = true; } static bool transferCbHasFired(struct raft_fixture *f, void *arg) { bool *done = arg; (void)f; return *done; } /* Submit a transfer leadership request against the I'th server. */ #define TRANSFER_SUBMIT(I, ID) \ struct raft *_raft = CLUSTER_RAFT(I); \ struct raft_transfer _req; \ bool _done = false; \ int _rv; \ _req.data = &_done; \ _rv = raft_transfer(_raft, &_req, ID, transferCb); \ munit_assert_int(_rv, ==, 0); /* Wait until the transfer leadership request completes. */ #define TRANSFER_WAIT CLUSTER_STEP_UNTIL(transferCbHasFired, &_done, 2000) /* Submit a transfer leadership request and wait for it to complete. */ #define TRANSFER(I, ID) \ do { \ TRANSFER_SUBMIT(I, ID); \ TRANSFER_WAIT; \ } while (0) /* Submit a transfer leadership request against the I'th server and assert that * the given error is returned. 
*/ #define TRANSFER_ERROR(I, ID, RV, ERRMSG) \ do { \ struct raft_transfer __req; \ int __rv; \ __rv = raft_transfer(CLUSTER_RAFT(I), &__req, ID, NULL); \ munit_assert_int(__rv, ==, RV); \ munit_assert_string_equal(CLUSTER_ERRMSG(I), ERRMSG); \ } while (0) /****************************************************************************** * * Set up a cluster with three servers. * *****************************************************************************/ static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) { struct fixture *f = munit_malloc(sizeof *f); SETUP_CLUSTER(3); CLUSTER_BOOTSTRAP; CLUSTER_START; CLUSTER_ELECT(0); return f; } static void tearDown(void *data) { struct fixture *f = data; TEAR_DOWN_CLUSTER; free(f); } /****************************************************************************** * * raft_transfer * *****************************************************************************/ SUITE(raft_transfer) /* The follower we ask to transfer leadership to is up-to-date. */ TEST(raft_transfer, upToDate, setUp, tearDown, 0, NULL) { struct fixture *f = data; TRANSFER(0, 2); CLUSTER_STEP_UNTIL_HAS_LEADER(1000); munit_assert_int(CLUSTER_LEADER, ==, 1); return MUNIT_OK; } /* The follower we ask to transfer leadership to needs to catch up. */ TEST(raft_transfer, catchUp, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_apply req; CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, &req, 1, NULL); TRANSFER(0, 2); CLUSTER_STEP_UNTIL_HAS_LEADER(1000); munit_assert_int(CLUSTER_LEADER, ==, 1); return MUNIT_OK; } /* The follower we ask to transfer leadership to is down and the leadership * transfer does not succeed. */ TEST(raft_transfer, expire, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_apply req; CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, &req, 1, NULL); CLUSTER_KILL(1); TRANSFER(0, 2); munit_assert_int(CLUSTER_LEADER, ==, 0); return MUNIT_OK; } /* The given ID doesn't match any server in the current configuration. */ TEST(raft_transfer, unknownServer, setUp, tearDown, 0, NULL) { struct fixture *f = data; TRANSFER_ERROR(0, 4, RAFT_BADID, "server ID is not valid"); return MUNIT_OK; } /* Submitting a transfer request twice is an error. */ TEST(raft_transfer, twice, setUp, tearDown, 0, NULL) { struct fixture *f = data; TRANSFER_SUBMIT(0, 2); TRANSFER_ERROR(0, 3, RAFT_NOTLEADER, "server is not the leader"); TRANSFER_WAIT; return MUNIT_OK; } /* If the given ID is zero, the target is selected automatically. */ TEST(raft_transfer, autoSelect, setUp, tearDown, 0, NULL) { struct fixture *f = data; TRANSFER(0, 0); CLUSTER_STEP_UNTIL_HAS_LEADER(1000); munit_assert_int(CLUSTER_LEADER, !=, 0); return MUNIT_OK; } /* If the given ID is zero, the target is selected automatically. Followers that * are up-to-date are preferred. */ TEST(raft_transfer, autoSelectUpToDate, setUp, tearDown, 0, NULL) { struct fixture *f = data; CLUSTER_KILL(1); CLUSTER_MAKE_PROGRESS; TRANSFER(0, 0); CLUSTER_STEP_UNTIL_HAS_LEADER(1000); munit_assert_int(CLUSTER_LEADER, ==, 2); return MUNIT_OK; } /* It's not possible to transfer leadership after the server has been * demoted. 
*/ TEST(raft_transfer, afterDemotion, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_change req; struct raft *raft = CLUSTER_RAFT(0); int rv; CLUSTER_ADD(&req); CLUSTER_STEP_UNTIL_APPLIED(0, 3, 1000); CLUSTER_ASSIGN(&req, RAFT_VOTER); CLUSTER_STEP_UNTIL_APPLIED(0, 4, 1000); rv = raft_assign(raft, &req, raft->id, RAFT_SPARE, NULL); munit_assert_int(rv, ==, 0); CLUSTER_STEP_UNTIL_APPLIED(0, 5, 1000); TRANSFER_ERROR(0, 2, RAFT_NOTLEADER, "server is not the leader"); return MUNIT_OK; } static char *cluster_pre_vote[] = {"0", "1", NULL}; static char *cluster_heartbeat[] = {"1", "100", NULL}; static MunitParameterEnum _params[] = { {CLUSTER_PRE_VOTE_PARAM, cluster_pre_vote}, {CLUSTER_HEARTBEAT_PARAM, cluster_heartbeat}, {NULL, NULL}, }; /* It's possible to transfer leadership also when pre-vote is active */ TEST(raft_transfer, preVote, setUp, tearDown, 0, _params) { struct fixture *f = data; TRANSFER(0, 2); CLUSTER_STEP_UNTIL_HAS_LEADER(1000); munit_assert_int(CLUSTER_LEADER, ==, 1); return MUNIT_OK; } dqlite-1.16.7/test/raft/integration/test_uv_append.c000066400000000000000000001013321465252713400225230ustar00rootroot00000000000000#include "../../../src/raft/uv.h" #include "../lib/aio.h" #include "../lib/runner.h" #include "../lib/uv.h" #include "append_helpers.h" #include /* Maximum number of blocks a segment can have */ #define MAX_SEGMENT_BLOCKS 4 /* This block size should work fine for all file systems. */ #define SEGMENT_BLOCK_SIZE 4096 /* Default segment size */ #define SEGMENT_SIZE 4096 * MAX_SEGMENT_BLOCKS /****************************************************************************** * * Fixture with a libuv-based raft_io instance. * *****************************************************************************/ struct fixture { FIXTURE_UV_DEPS; FIXTURE_UV; int count; /* To generate deterministic entry data */ }; /****************************************************************************** * * Set up and tear down. * *****************************************************************************/ static void *setUp(const MunitParameter params[], void *user_data) { struct fixture *f = munit_malloc(sizeof *f); SETUP_UV_DEPS; SETUP_UV; raft_uv_set_block_size(&f->io, SEGMENT_BLOCK_SIZE); raft_uv_set_segment_size(&f->io, SEGMENT_SIZE); f->count = 0; return f; } static void tearDownDeps(void *data) { struct fixture *f = data; if (f == NULL) { return; } TEAR_DOWN_UV_DEPS; free(f); } static void tearDown(void *data) { struct fixture *f = data; if (f == NULL) { return; } TEAR_DOWN_UV; tearDownDeps(f); } /****************************************************************************** * * Assertions * *****************************************************************************/ /* Shutdown the fixture's raft_io instance, then load all entries on disk using * a new raft_io instance, and assert that there are N entries with a total data * size of TOTAL_DATA_SIZE bytes. 
*/ #define ASSERT_ENTRIES(N, TOTAL_DATA_SIZE) \ TEAR_DOWN_UV; \ do { \ struct uv_loop_s _loop; \ struct raft_uv_transport _transport; \ struct raft_io _io; \ raft_term _term; \ raft_id _voted_for; \ struct raft_snapshot *_snapshot; \ raft_index _start_index; \ struct raft_entry *_entries; \ size_t _i; \ size_t _n; \ void *_batch = NULL; \ size_t _total_data_size = 0; \ int _rv; \ \ _rv = uv_loop_init(&_loop); \ munit_assert_int(_rv, ==, 0); \ _transport.version = 1; \ _rv = raft_uv_tcp_init(&_transport, &_loop); \ munit_assert_int(_rv, ==, 0); \ _rv = raft_uv_init(&_io, &_loop, f->dir, &_transport); \ munit_assert_int(_rv, ==, 0); \ _rv = _io.init(&_io, 1, "1"); \ if (_rv != 0) { \ munit_errorf("io->init(): %s (%d)", _io.errmsg, _rv); \ } \ _rv = _io.load(&_io, &_term, &_voted_for, &_snapshot, &_start_index, \ &_entries, &_n); \ if (_rv != 0) { \ munit_errorf("io->load(): %s (%d)", _io.errmsg, _rv); \ } \ _io.close(&_io, NULL); \ uv_run(&_loop, UV_RUN_NOWAIT); \ raft_uv_close(&_io); \ raft_uv_tcp_close(&_transport); \ uv_loop_close(&_loop); \ \ munit_assert_ptr_null(_snapshot); \ munit_assert_int(_n, ==, N); \ for (_i = 0; _i < _n; _i++) { \ struct raft_entry *_entry = &_entries[_i]; \ uint64_t _value = *(uint64_t *)_entry->buf.base; \ munit_assert_int(_entry->term, ==, 1); \ munit_assert_int(_entry->type, ==, RAFT_COMMAND); \ munit_assert_int(_value, ==, _i); \ munit_assert_ptr_not_null(_entry->batch); \ } \ for (_i = 0; _i < _n; _i++) { \ struct raft_entry *_entry = &_entries[_i]; \ if (_entry->batch != _batch) { \ _batch = _entry->batch; \ raft_free(_batch); \ } \ _total_data_size += _entry->buf.len; \ } \ raft_free(_entries); \ munit_assert_int(_total_data_size, ==, TOTAL_DATA_SIZE); \ } while (0); /****************************************************************************** * * raft_io->append() * *****************************************************************************/ SUITE(append) /* Append an entries array containing unaligned buffers. */ TEST(append, unaligned, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND_SUBMIT_CB_DATA(0, 1, 9, NULL, NULL, RAFT_INVALID); munit_assert_string_equal(f->io.errmsg, "entry buffers must be 8-byte aligned"); APPEND_SUBMIT_CB_DATA(1, 3, 63, NULL, NULL, RAFT_INVALID); munit_assert_string_equal(f->io.errmsg, "entry buffers must be 8-byte aligned"); return MUNIT_OK; } /* Append the very first batch of entries. */ TEST(append, first, setUp, tearDownDeps, 0, NULL) { struct fixture *f = data; APPEND(1, 64); ASSERT_ENTRIES(1, 64); return MUNIT_OK; } /* As soon as the backend starts writing the first open segment, a second one * and a third one get prepared. */ TEST(append, prepareSegments, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND(1, 64); while (!DirHasFile(f->dir, "open-3")) { LOOP_RUN(1); } munit_assert_true(DirHasFile(f->dir, "open-1")); munit_assert_true(DirHasFile(f->dir, "open-2")); munit_assert_true(DirHasFile(f->dir, "open-3")); return MUNIT_OK; } /* Once the first segment fills up, it gets finalized, and an additional one * gets prepared, to maintain the available segments pool size. 
*/ TEST(append, finalizeSegment, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND(MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE); APPEND(1, 64); while (!DirHasFile(f->dir, "open-4")) { LOOP_RUN(1); } munit_assert_true(DirHasFile(f->dir, "0000000000000001-0000000000000004")); munit_assert_false(DirHasFile(f->dir, "open-1")); munit_assert_true(DirHasFile(f->dir, "open-4")); return MUNIT_OK; } /* The very first batch of entries to append is bigger than the regular open * segment size. */ TEST(append, firstBig, setUp, tearDownDeps, 0, NULL) { struct fixture *f = data; APPEND(MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE); ASSERT_ENTRIES(MAX_SEGMENT_BLOCKS, MAX_SEGMENT_BLOCKS * SEGMENT_BLOCK_SIZE); return MUNIT_OK; } /* The second batch of entries to append is bigger than the regular open * segment size. */ TEST(append, secondBig, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND(1, 64); APPEND(MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE); return MUNIT_OK; } /* Schedule multiple appends each one exceeding the segment size. */ TEST(append, severalBig, setUp, tearDownDeps, 0, NULL) { struct fixture *f = data; APPEND_SUBMIT(0, 2, MAX_SEGMENT_BLOCKS * SEGMENT_BLOCK_SIZE); APPEND_SUBMIT(1, 2, MAX_SEGMENT_BLOCKS * SEGMENT_BLOCK_SIZE); APPEND_SUBMIT(2, 2, MAX_SEGMENT_BLOCKS * SEGMENT_BLOCK_SIZE); APPEND_WAIT(0); APPEND_WAIT(1); APPEND_WAIT(2); ASSERT_ENTRIES(6, 6 * MAX_SEGMENT_BLOCKS * SEGMENT_BLOCK_SIZE); return MUNIT_OK; } /* Write the very first entry and then another one, both fitting in the same * block. */ TEST(append, fitBlock, setUp, tearDownDeps, 0, NULL) { struct fixture *f = data; APPEND(1, 64); APPEND(1, 64); ASSERT_ENTRIES(2, 128); return MUNIT_OK; } /* Write an entry that fills the first block exactly and then another one. */ TEST(append, matchBlock, setUp, tearDownDeps, 0, NULL) { struct fixture *f = data; size_t size; size = SEGMENT_BLOCK_SIZE; size -= sizeof(uint64_t) + /* Format */ sizeof(uint64_t) + /* Checksums */ 8 + 16; /* Header */ APPEND(1, size); APPEND(1, 64); ASSERT_ENTRIES(2, size + 64); return MUNIT_OK; } /* Write an entry that exceeds the first block, then another one that fits in * the second block, then a third one that fills the rest of the second block * plus the whole third block exactly, and finally a fourth entry that fits in * the fourth block */ TEST(append, exceedBlock, setUp, tearDownDeps, 0, NULL) { struct fixture *f = data; size_t written; size_t size1; size_t size2; size1 = SEGMENT_BLOCK_SIZE; APPEND(1, size1); APPEND(1, 64); written = sizeof(uint64_t) + /* Format version */ 2 * sizeof(uint32_t) + /* CRC sums of first batch */ 8 + 16 + /* Header of first batch */ size1 + /* Size of first batch */ 2 * sizeof(uint32_t) + /* CRC of second batch */ 8 + 16 + /* Header of second batch */ 64; /* Size of second batch */ /* Write a third entry that fills the second block exactly */ size2 = SEGMENT_BLOCK_SIZE - (written % SEGMENT_BLOCK_SIZE); size2 -= (2 * sizeof(uint32_t) + 8 + 16); size2 += SEGMENT_BLOCK_SIZE; APPEND(1, size2); /* Write a fourth entry */ APPEND(1, 64); ASSERT_ENTRIES(4, size1 + 64 + size2 + 64); return MUNIT_OK; } /* If an append request is submitted before the write operation of the previous * append request is started, then a single write will be performed for both * requests. 
*/ TEST(append, batch, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND_SUBMIT(0, 1, 64); APPEND_SUBMIT(1, 1, 64); APPEND_WAIT(0); APPEND_WAIT(1); return MUNIT_OK; } /* An append request submitted while a write operation is in progress gets * executed only when the write completes. */ TEST(append, wait, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND_SUBMIT(0, 1, 64); LOOP_RUN(1); APPEND_SUBMIT(1, 1, 64); APPEND_WAIT(0); APPEND_WAIT(1); return MUNIT_OK; } /* Several batches with different sizes get appended at a fast pace, forcing the * segment arena to grow. */ TEST(append, resizeArena, setUp, tearDownDeps, 0, NULL) { struct fixture *f = data; APPEND_SUBMIT(0, 2, 64); APPEND_SUBMIT(1, 1, SEGMENT_BLOCK_SIZE); APPEND_SUBMIT(2, 2, 64); APPEND_SUBMIT(3, 1, SEGMENT_BLOCK_SIZE); APPEND_SUBMIT(4, 1, SEGMENT_BLOCK_SIZE); APPEND_WAIT(0); APPEND_WAIT(1); APPEND_WAIT(2); APPEND_WAIT(3); APPEND_WAIT(4); ASSERT_ENTRIES(7, 64 * 4 + SEGMENT_BLOCK_SIZE * 3); return MUNIT_OK; } /* A few append requests get queued, then a truncate request comes in and other * append requests right after, before truncation is fully completed. */ TEST(append, truncate, setUp, tearDown, 0, NULL) { struct fixture *f = data; int rv; return MUNIT_SKIP; /* FIXME: flaky */ APPEND(2, 64); APPEND_SUBMIT(0, 2, 64); rv = f->io.truncate(&f->io, 2); munit_assert_int(rv, ==, 0); APPEND_SUBMIT(1, 2, 64); APPEND_WAIT(0); APPEND_WAIT(1); return MUNIT_OK; } /* A few append requests get queued, then a truncate request comes in and other * append requests right after, before truncation is fully completed. However * the backend is closed before the truncation request can be processed. */ TEST(append, truncateClosing, setUp, tearDownDeps, 0, NULL) { struct fixture *f = data; int rv; APPEND(2, 64); APPEND_SUBMIT(0, 2, 64); rv = f->io.truncate(&f->io, 2); munit_assert_int(rv, ==, 0); APPEND_SUBMIT(1, 2, 64); APPEND_EXPECT(1, RAFT_CANCELED); TEAR_DOWN_UV; return MUNIT_OK; } /* A few append requests get queued; however, the backend is closed before * preparing the second segment completes. */ TEST(append, prepareClosing, setUp, tearDownDeps, 0, NULL) { struct fixture *f = data; APPEND_SUBMIT(0, 2, 64); LOOP_RUN(1); TEAR_DOWN_UV; return MUNIT_OK; } /* The counters of the open segments get increased as they are closed. */ TEST(append, counter, setUp, tearDown, 0, NULL) { struct fixture *f = data; size_t size = SEGMENT_BLOCK_SIZE; int i; for (i = 0; i < 10; i++) { APPEND(1, size); } munit_assert_true(DirHasFile(f->dir, "0000000000000001-0000000000000003")); munit_assert_true(DirHasFile(f->dir, "0000000000000004-0000000000000006")); munit_assert_true(DirHasFile(f->dir, "open-4")); return MUNIT_OK; } /* If the I/O instance is closed, all pending append requests get canceled. */ TEST(append, cancel, setUp, tearDownDeps, 0, NULL) { struct fixture *f = data; APPEND_SUBMIT(0, 1, 64); APPEND_EXPECT(0, RAFT_CANCELED); TEAR_DOWN_UV; return MUNIT_OK; } /* The creation of the current open segment fails because there's no space. */ TEST(append, noSpaceUponPrepareCurrent, setUp, tearDown, 0, DirTmpfsParams) { struct fixture *f = data; SKIP_IF_NO_FIXTURE; raft_uv_set_segment_size(&f->io, SEGMENT_BLOCK_SIZE * 32768); APPEND_FAILURE( 1, 64, RAFT_NOSPACE, "create segment open-1: not enough space to allocate 134217728 bytes"); return MUNIT_OK; }
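/* --------------------------------------------------------------------------
 * Illustrative sketch (editor's addition, not part of the original suite):
 * the APPEND_SUBMIT/APPEND_WAIT pattern used by the batch and wait tests
 * above models concurrent writers. Requests queued before the event loop
 * runs may be coalesced by the backend into a single disk write; requests
 * submitted while a write is in flight are executed afterwards. The test
 * name below is hypothetical. */
TEST(append, submitWaitSketch, setUp, tearDown, 0, NULL)
{
	struct fixture *f = data;
	/* Queue three requests up front; the backend is free to batch them. */
	APPEND_SUBMIT(0, 1, 64);
	APPEND_SUBMIT(1, 1, 64);
	APPEND_SUBMIT(2, 1, 64);
	/* Wait for each request's callback to fire successfully. */
	APPEND_WAIT(0);
	APPEND_WAIT(1);
	APPEND_WAIT(2);
	return MUNIT_OK;
}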
/* The creation of a spare open segment fails because there's no space. */ TEST(append, noSpaceUponPrepareSpare, setUp, tearDown, 0, DirTmpfsParams) { struct fixture *f = data; SKIP_IF_NO_FIXTURE; #if defined(__powerpc64__) /* XXX: fails on ppc64el */ return MUNIT_SKIP; #endif raft_uv_set_segment_size(&f->io, SEGMENT_BLOCK_SIZE * 2); DirFill(f->dir, SEGMENT_BLOCK_SIZE * 3); APPEND(1, SEGMENT_BLOCK_SIZE); APPEND_SUBMIT(0, 1, SEGMENT_BLOCK_SIZE); APPEND_EXPECT(0, RAFT_NOSPACE); APPEND_WAIT(0); return MUNIT_OK; } /* The write request fails because there's not enough space. */ TEST(append, noSpaceUponWrite, setUp, tearDownDeps, 0, DirTmpfsParams) { struct fixture *f = data; SKIP_IF_NO_FIXTURE; #if defined(__powerpc64__) /* XXX: fails on ppc64el */ TEAR_DOWN_UV; return MUNIT_SKIP; #endif raft_uv_set_segment_size(&f->io, SEGMENT_BLOCK_SIZE); DirFill(f->dir, SEGMENT_BLOCK_SIZE * 2); APPEND(1, 64); APPEND_FAILURE(1, (SEGMENT_BLOCK_SIZE + 128), RAFT_NOSPACE, "short write: 4096 bytes instead of 8192"); DirRemoveFile(f->dir, ".fill"); LOOP_RUN(50); APPEND(5, 64); ASSERT_ENTRIES(6, 384); return MUNIT_OK; } /* A few requests fail because not enough disk space is available. Eventually * the space is released and the request succeeds. */ TEST(append, noSpaceResolved, setUp, tearDownDeps, 0, DirTmpfsParams) { struct fixture *f = data; SKIP_IF_NO_FIXTURE; #if defined(__powerpc64__) /* XXX: fails on ppc64el */ TEAR_DOWN_UV; return MUNIT_SKIP; #endif DirFill(f->dir, SEGMENT_BLOCK_SIZE); APPEND_FAILURE( 1, 64, RAFT_NOSPACE, "create segment open-1: not enough space to allocate 16384 bytes"); APPEND_FAILURE( 1, 64, RAFT_NOSPACE, "create segment open-2: not enough space to allocate 16384 bytes"); DirRemoveFile(f->dir, ".fill"); f->count = 0; /* Reset the data counter */ APPEND(1, 64); ASSERT_ENTRIES(1, 64); return MUNIT_OK; } /* An error occurs while performing a write. */ TEST(append, writeError, setUp, tearDown, 0, NULL) { struct fixture *f = data; aio_context_t ctx = 0; /* FIXME: doesn't fail anymore after * https://github.com/CanonicalLtd/raft/pull/49 */ return MUNIT_SKIP; APPEND_SUBMIT(0, 1, 64); AioFill(&ctx, 0); APPEND_WAIT(0); AioDestroy(ctx); return MUNIT_OK; } static char *oomHeapFaultDelay[] = {"1", /* FIXME "2", */ NULL}; static char *oomHeapFaultRepeat[] = {"1", NULL}; static MunitParameterEnum oomParams[] = { {TEST_HEAP_FAULT_DELAY, oomHeapFaultDelay}, {TEST_HEAP_FAULT_REPEAT, oomHeapFaultRepeat}, {NULL, NULL}, }; /* Out of memory conditions. */ TEST(append, oom, setUp, tearDown, 0, oomParams) { struct fixture *f = data; HEAP_FAULT_ENABLE; APPEND_ERROR(1, 64, RAFT_NOMEM, ""); return MUNIT_OK; } /* The uv instance is closed while a write request is in progress. */ TEST(append, closeDuringWrite, setUp, tearDown, 0, NULL) { struct fixture *f = data; /* TODO: broken */ return MUNIT_SKIP; APPEND_SUBMIT(0, 1, 64); LOOP_RUN(1); TEAR_DOWN_UV; return MUNIT_OK; } /* When the backend is closed, all unused open segments get removed. */ TEST(append, removeSegmentUponClose, setUp, tearDownDeps, 0, NULL) { struct fixture *f = data; APPEND(1, 64); while (!DirHasFile(f->dir, "open-2")) { LOOP_RUN(1); } TEAR_DOWN_UV; munit_assert_false(DirHasFile(f->dir, "open-2")); return MUNIT_OK; }
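/* --------------------------------------------------------------------------
 * Illustrative sketch (editor's addition, not part of the original suite):
 * the durability pattern used throughout this file is append, tear down the
 * backend, then reload everything from disk with a fresh raft_io instance.
 * ASSERT_ENTRIES performs the reload and checks both the entry count and
 * the total payload size. The test name below is hypothetical. */
TEST(append, reloadSketch, setUp, tearDownDeps, 0, NULL)
{
	struct fixture *f = data;
	APPEND(2, 64);          /* Two 64-byte entries, written and synced */
	ASSERT_ENTRIES(2, 128); /* Tears down the backend, reloads from disk */
	return MUNIT_OK;
}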
/* When the backend is closed, all pending prepare get requests get canceled. */ TEST(append, cancelPrepareRequest, setUp, tearDown, 0, NULL) { struct fixture *f = data; /* TODO: find a way to test a prepare request cancelation */ return MUNIT_SKIP; APPEND(MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE); APPEND_SUBMIT(0, 1, 64); APPEND_EXPECT(0, RAFT_CANCELED); TEAR_DOWN_UV; return MUNIT_OK; } /* When the backend gets closed it tells the writer to close the segment that * it's currently writing. */ TEST(append, currentSegment, setUp, tearDownDeps, 0, NULL) { struct fixture *f = data; APPEND(1, 64); TEAR_DOWN_UV; munit_assert_true(DirHasFile(f->dir, "0000000000000001-0000000000000001")); return MUNIT_OK; } /* The kernel has run out of available AIO events. */ TEST(append, ioSetupError, setUp, tearDown, 0, NULL) { struct fixture *f = data; aio_context_t ctx = 0; int rv; rv = AioFill(&ctx, 0); if (rv != 0) { return MUNIT_SKIP; } APPEND_FAILURE(1, 64, RAFT_TOOMANY, "setup writer for open-1: AIO events user limit exceeded"); return MUNIT_OK; } /*=========================================================================== Test interaction between UvAppend and UvBarrier ===========================================================================*/ struct barrierData { int current; /* Count the number of finished AppendEntries RPCs */ int expected; /* Expected number of finished AppendEntries RPCs */ bool done; /* @true if the Barrier CB has fired */ bool expectDone; /* Expect the Barrier CB to have fired or not */ char **files; /* Expected files in the directory, NULL terminated */ struct uv *uv; }; static void barrierCbCompareCounter(struct UvBarrierReq *barrier) { struct barrierData *bd = barrier->data; munit_assert_false(bd->done); bd->done = true; struct uv *uv = bd->uv; UvUnblock(uv); munit_assert_int(bd->current, ==, bd->expected); if (bd->files != NULL) { int i = 0; while (bd->files[i] != NULL) { munit_assert_true(DirHasFile(uv->dir, bd->files[i])); ++i; } } } static void barrierDoneCb(struct UvBarrierReq *barrier) { struct barrierData *bd = barrier->data; munit_assert_false(bd->done); bd->done = true; } static void appendCbIncreaseCounterAssertResult(struct raft_io_append *req, int status) { struct result *result = req->data; munit_assert_int(status, ==, result->status); result->done = true; struct barrierData *bd = result->data; munit_assert_true(bd->done == bd->expectDone); bd->current += 1; } static void appendDummyCb(struct raft_io_append *req, int status) { (void)req; (void)status; } static char *bools[] = {"0", "1", NULL}; static MunitParameterEnum blocking_bool_params[] = { {"bool", bools}, {NULL, NULL}, };
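/* --------------------------------------------------------------------------
 * Illustrative sketch (editor's addition, not part of the original suite):
 * the minimal UvBarrier usage pattern exercised by the tests below. A
 * request carries a data pointer, a blocking flag and a callback; the
 * callback fires once the barrier is reached, and these tests unblock the
 * backend from inside the callback via UvUnblock. The struct, callback and
 * test names here are hypothetical. */
struct barrierSketchData {
	bool done;     /* Set once the barrier callback has fired */
	struct uv *uv; /* Needed to unblock the backend afterwards */
};
static void barrierSketchCb(struct UvBarrierReq *barrier)
{
	struct barrierSketchData *bd = barrier->data;
	bd->done = true;
	UvUnblock(bd->uv); /* Let queued appends proceed */
}
TEST(append, barrierSketch, setUp, tearDown, 0, NULL)
{
	struct fixture *f = data;
	struct barrierSketchData bd = {false, f->io.impl};
	struct UvBarrierReq barrier = {0};
	APPEND_SUBMIT(0, 1, 64); /* One append in flight */
	barrier.data = (void *)&bd;
	barrier.blocking = false;
	barrier.cb = barrierSketchCb;
	UvBarrier(f->io.impl, 1, &barrier);
	LOOP_RUN_UNTIL(&bd.done);
	APPEND_WAIT(0);
	return MUNIT_OK;
}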
/* Fill up 3 segments worth of AppendEntries RPCs. * Request a Barrier and expect that the AppendEntries RPCs are finished before * the Barrier callback is fired. */ TEST(append, barrierOpenSegments, setUp, tearDown, 0, blocking_bool_params) { struct fixture *f = data; struct barrierData bd = {0}; bd.current = 0; bd.expected = 3; bd.done = false; bd.expectDone = false; bd.uv = f->io.impl; char *files[] = {"0000000000000001-0000000000000004", "0000000000000005-0000000000000008", "0000000000000009-0000000000000012", NULL}; bd.files = files; APPEND_SUBMIT_CB_DATA(0, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE, appendCbIncreaseCounterAssertResult, &bd, 0); APPEND_SUBMIT_CB_DATA(1, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE, appendCbIncreaseCounterAssertResult, &bd, 0); APPEND_SUBMIT_CB_DATA(2, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE, appendCbIncreaseCounterAssertResult, &bd, 0); struct UvBarrierReq barrier = {0}; barrier.data = (void *)&bd; barrier.blocking = (bool)strtoul(munit_parameters_get(params, "bool"), NULL, 0); barrier.cb = barrierCbCompareCounter; UvBarrier(f->io.impl, 1, &barrier); /* Make sure every callback fired */ LOOP_RUN_UNTIL(&bd.done); APPEND_WAIT(0); APPEND_WAIT(1); APPEND_WAIT(2); return MUNIT_OK; } /* Fill up 3 segments worth of AppendEntries RPCs. * Request a Barrier and stop early. */ TEST(append, barrierOpenSegmentsExitEarly, setUp, NULL, 0, blocking_bool_params) { struct fixture *f = data; struct barrierData bd = {0}; bd.current = 0; bd.expected = 3; bd.done = false; bd.expectDone = false; bd.uv = f->io.impl; char *files[] = {"0000000000000001-0000000000000004", "0000000000000005-0000000000000008", "0000000000000009-0000000000000012", NULL}; bd.files = files; APPEND_SUBMIT_CB_DATA(0, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE, appendDummyCb, NULL, 0); APPEND_SUBMIT_CB_DATA(1, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE, appendDummyCb, NULL, 0); APPEND_SUBMIT_CB_DATA(2, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE, appendDummyCb, NULL, 0); struct UvBarrierReq barrier = {0}; barrier.data = (void *)&bd; barrier.blocking = (bool)strtoul(munit_parameters_get(params, "bool"), NULL, 0); barrier.cb = barrierDoneCb; UvBarrier(f->io.impl, 1, &barrier); /* Exit early. */ tearDown(data); munit_assert_true(bd.done); return MUNIT_OK; }
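/* --------------------------------------------------------------------------
 * Illustrative sketch (editor's addition, not part of the original suite):
 * the fields of struct UvBarrierReq that the surrounding tests exercise,
 * annotated. This helper is hypothetical and exists only for illustration;
 * MUNIT_UNUSED keeps the compiler quiet about it being unreferenced. */
MUNIT_UNUSED static void initBarrierSketch(struct UvBarrierReq *req,
					   void *data,
					   bool blocking,
					   void (*cb)(struct UvBarrierReq *))
{
	req->data = data;         /* Opaque pointer handed back to cb */
	req->blocking = blocking; /* A blocking barrier also stalls appends */
	req->cb = cb;             /* Fired once the barrier is reached */
}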
/* Fill up 3 segments worth of AppendEntries RPCs. * Request 2 barriers and expect their callbacks to fire. */ TEST(append, twoBarriersOpenSegments, setUp, tearDown, 0, blocking_bool_params) { struct fixture *f = data; struct barrierData bd1 = {0}; bd1.current = 0; bd1.expected = 3; bd1.done = false; bd1.expectDone = false; bd1.uv = f->io.impl; char *files[] = {"0000000000000001-0000000000000004", "0000000000000005-0000000000000008", "0000000000000009-0000000000000012", NULL}; bd1.files = files; /* Only expect the callback to eventually fire. */ struct barrierData bd2 = {0}; bd2.uv = f->io.impl; APPEND_SUBMIT_CB_DATA(0, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE, appendCbIncreaseCounterAssertResult, &bd1, 0); APPEND_SUBMIT_CB_DATA(1, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE, appendCbIncreaseCounterAssertResult, &bd1, 0); APPEND_SUBMIT_CB_DATA(2, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE, appendCbIncreaseCounterAssertResult, &bd1, 0); struct UvBarrierReq barrier1 = {0}; barrier1.data = (void *)&bd1; barrier1.blocking = (bool)strtoul(munit_parameters_get(params, "bool"), NULL, 0); barrier1.cb = barrierCbCompareCounter; UvBarrier(f->io.impl, 1, &barrier1); struct UvBarrierReq barrier2 = {0}; barrier2.data = (void *)&bd2; barrier2.blocking = (bool)strtoul(munit_parameters_get(params, "bool"), NULL, 0); barrier2.cb = barrierCbCompareCounter; UvBarrier(f->io.impl, 1, &barrier2); /* Make sure every callback fired */ LOOP_RUN_UNTIL(&bd1.done); LOOP_RUN_UNTIL(&bd2.done); APPEND_WAIT(0); APPEND_WAIT(1); APPEND_WAIT(2); return MUNIT_OK; } /* Fill up 3 segments worth of AppendEntries RPCs. * Request 2 barriers and exit early. */ TEST(append, twoBarriersExitEarly, setUp, NULL, 0, blocking_bool_params) { struct fixture *f = data; struct barrierData bd1 = {0}; bd1.current = 0; bd1.expected = 3; bd1.done = false; bd1.expectDone = false; bd1.uv = f->io.impl; char *files[] = {"0000000000000001-0000000000000004", "0000000000000005-0000000000000008", "0000000000000009-0000000000000012", NULL}; bd1.files = files; /* Only expect the callback to eventually fire. */ struct barrierData bd2 = {0}; bd2.uv = f->io.impl; APPEND_SUBMIT_CB_DATA(0, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE, appendDummyCb, NULL, 0); APPEND_SUBMIT_CB_DATA(1, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE, appendDummyCb, NULL, 0); APPEND_SUBMIT_CB_DATA(2, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE, appendDummyCb, NULL, 0); struct UvBarrierReq barrier1 = {0}; barrier1.data = (void *)&bd1; barrier1.blocking = (bool)strtoul(munit_parameters_get(params, "bool"), NULL, 0); barrier1.cb = barrierDoneCb; UvBarrier(f->io.impl, 1, &barrier1); struct UvBarrierReq barrier2 = {0}; barrier2.data = (void *)&bd2; barrier2.blocking = (bool)strtoul(munit_parameters_get(params, "bool"), NULL, 0); barrier2.cb = barrierDoneCb; UvBarrier(f->io.impl, 1, &barrier2); /* Exit early. */ tearDown(data); munit_assert_true(bd1.done); munit_assert_true(bd2.done); return MUNIT_OK; } /* Request a blocking Barrier and expect that no AppendEntries RPCs are * finished before the Barrier callback is fired. */ TEST(append, blockingBarrierNoOpenSegments, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct barrierData bd = {0}; bd.current = 0; bd.expected = 0; bd.done = false; bd.expectDone = true; bd.uv = f->io.impl; struct UvBarrierReq barrier = {0}; barrier.data = (void *)&bd; barrier.blocking = true; barrier.cb = barrierCbCompareCounter; UvBarrier(f->io.impl, 1, &barrier); APPEND_SUBMIT_CB_DATA(0, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE, appendCbIncreaseCounterAssertResult, &bd, 0); APPEND_SUBMIT_CB_DATA(1, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE, appendCbIncreaseCounterAssertResult, &bd, 0); APPEND_SUBMIT_CB_DATA(2, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE, appendCbIncreaseCounterAssertResult, &bd, 0); /* Make sure every callback fired */ LOOP_RUN_UNTIL(&bd.done); APPEND_WAIT(0); APPEND_WAIT(1); APPEND_WAIT(2); return MUNIT_OK; }
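/* --------------------------------------------------------------------------
 * Illustrative sketch (editor's addition, not part of the original suite):
 * as the comment in blockingBarrierSingleOpenSegment below notes, a barrier
 * submitted while no open segment exists is expected to fire its callback
 * immediately. This hypothetical test captures just that corner case; the
 * immediate-fire behavior is an assumption based on that comment. */
TEST(append, immediateBarrierSketch, setUp, tearDown, 0, NULL)
{
	struct fixture *f = data;
	struct barrierData bd = {0};
	bd.expected = 0; /* No appends have been submitted */
	bd.expectDone = true;
	bd.uv = f->io.impl;
	struct UvBarrierReq barrier = {0};
	barrier.data = (void *)&bd;
	barrier.blocking = true;
	barrier.cb = barrierCbCompareCounter;
	/* No open segments yet, so the callback should fire right away. */
	UvBarrier(f->io.impl, 1, &barrier);
	LOOP_RUN_UNTIL(&bd.done);
	return MUNIT_OK;
}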
/* Request a blocking Barrier and expect that no AppendEntries RPCs are * finished before the Barrier callback is fired. */ TEST(append, blockingBarrierSingleOpenSegment, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct barrierData bd = {0}; bd.current = 0; bd.expected = 0; bd.done = false; bd.expectDone = true; bd.uv = f->io.impl; char *files[] = {"0000000000000001-0000000000000001", NULL}; bd.files = files; /* Wait until there is at least 1 open segment, otherwise * the barrier Cb is fired immediately. */ APPEND(1, 64); while (!DirHasFile(f->dir, "open-1")) { LOOP_RUN(1); } struct UvBarrierReq barrier = {0}; barrier.data = (void *)&bd; barrier.blocking = true; barrier.cb = barrierCbCompareCounter; UvBarrier(f->io.impl, 1, &barrier); APPEND_SUBMIT_CB_DATA(0, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE, appendCbIncreaseCounterAssertResult, &bd, 0); APPEND_SUBMIT_CB_DATA(1, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE, appendCbIncreaseCounterAssertResult, &bd, 0); APPEND_SUBMIT_CB_DATA(2, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE, appendCbIncreaseCounterAssertResult, &bd, 0); /* Make sure every callback fired */ LOOP_RUN_UNTIL(&bd.done); APPEND_WAIT(0); APPEND_WAIT(1); APPEND_WAIT(2); return MUNIT_OK; } static void longWorkCb(uv_work_t *work) { (void)work; sleep(1); } static void longAfterWorkCb(uv_work_t *work, int status) { struct barrierData *bd = work->data; munit_assert_false(bd->done); bd->done = true; munit_assert_int(status, ==, 0); struct uv *uv = bd->uv; UvUnblock(uv); munit_assert_int(bd->current, ==, bd->expected); free(work); } static void barrierCbLongWork(struct UvBarrierReq *barrier) { struct barrierData *bd = barrier->data; munit_assert_false(bd->done); struct uv *uv = bd->uv; int rv; uv_work_t *work = munit_malloc(sizeof(*work)); munit_assert_ptr_not_null(work); work->data = bd; rv = uv_queue_work(uv->loop, work, longWorkCb, longAfterWorkCb); munit_assert_int(rv, ==, 0); } /* Request a non-blocking Barrier that triggers a long-running task; the barrier * is removed when the long-running task completes. This simulates a large * snapshot write. Ensure Append requests complete before the long-running task * completes.*/ TEST(append, nonBlockingBarrierLongBlockingTask, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct barrierData bd = {0}; bd.current = 0; bd.expected = 1; bd.done = false; bd.expectDone = false; bd.uv = f->io.impl; struct UvBarrierReq barrier = {0}; barrier.data = (void *)&bd; barrier.blocking = false; barrier.cb = barrierCbLongWork; UvBarrier(f->io.impl, bd.uv->append_next_index, &barrier); APPEND_SUBMIT_CB_DATA(0, 1, 64, appendCbIncreaseCounterAssertResult, &bd, 0); /* Make sure every callback fired */ LOOP_RUN_UNTIL(&bd.done); APPEND_WAIT(0); return MUNIT_OK; } /* Request a blocking Barrier that triggers a long-running task; the barrier * is unblocked and removed when the long-running task completes. This simulates * a large snapshot install. 
Ensure Append requests complete after the work * completes.*/ TEST(append, blockingBarrierLongBlockingTask, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct barrierData bd = {0}; bd.current = 0; bd.expected = 0; bd.done = false; bd.expectDone = true; bd.uv = f->io.impl; struct UvBarrierReq barrier = {0}; barrier.data = (void *)&bd; barrier.blocking = true; barrier.cb = barrierCbLongWork; UvBarrier(f->io.impl, bd.uv->append_next_index, &barrier); APPEND_SUBMIT_CB_DATA(0, 1, 64, appendCbIncreaseCounterAssertResult, &bd, 0); /* Make sure every callback fired */ LOOP_RUN_UNTIL(&bd.done); APPEND_WAIT(0); return MUNIT_OK; } dqlite-1.16.7/test/raft/integration/test_uv_bootstrap.c000066400000000000000000000057501465252713400233000ustar00rootroot00000000000000#include "../lib/runner.h" #include "../lib/uv.h" /****************************************************************************** * * Fixture with a libuv-based raft_io instance and an empty configuration. * *****************************************************************************/ struct fixture { FIXTURE_UV_DEPS; FIXTURE_UV; struct raft_configuration conf; }; /****************************************************************************** * * Helper macros * *****************************************************************************/ /* Add a server to the fixture's configuration. */ #define CONFIGURATION_ADD(ID, ADDRESS) \ { \ int rv_; \ rv_ = raft_configuration_add(&f->conf, ID, ADDRESS, RAFT_VOTER); \ munit_assert_int(rv_, ==, 0); \ } /* Invoke f->io->bootstrap() and assert that no error occurs. */ #define BOOTSTRAP \ { \ int rv_; \ rv_ = f->io.bootstrap(&f->io, &f->conf); \ munit_assert_int(rv_, ==, 0); \ } /****************************************************************************** * * Set up and tear down. * *****************************************************************************/ static void *setUp(const MunitParameter params[], void *user_data) { struct fixture *f = munit_malloc(sizeof *f); SETUP_UV_DEPS; SETUP_UV; raft_configuration_init(&f->conf); return f; } static void tearDown(void *data) { struct fixture *f = data; raft_configuration_close(&f->conf); TEAR_DOWN_UV; TEAR_DOWN_UV_DEPS; free(f); } /****************************************************************************** * * raft_io->bootstrap() * *****************************************************************************/ SUITE(bootstrap) /* Invoke f->io->bootstrap() and assert that it returns the given error code and * message. */ #define BOOTSTRAP_ERROR(RV, ERRMSG) \ { \ int rv_; \ rv_ = f->io.bootstrap(&f->io, &f->conf); \ munit_assert_int(rv_, ==, RV); \ munit_assert_string_equal(f->io.errmsg, ERRMSG); \ } /* Bootstrap a pristine server. */ TEST(bootstrap, pristine, setUp, tearDown, 0, NULL) { struct fixture *f = data; CONFIGURATION_ADD(1, "1"); BOOTSTRAP; return MUNIT_OK; } /* The data directory already has metadata files with a non-zero term. 
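 *
 * The first BOOTSTRAP below writes metadata recording term 1, so the
 * second bootstrap attempt must refuse to overwrite that state and fail
 * with RAFT_CANTBOOTSTRAP instead.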
*/ TEST(bootstrap, termIsNonZero, setUp, tearDown, 0, NULL) { struct fixture *f = data; CONFIGURATION_ADD(1, "1"); BOOTSTRAP; BOOTSTRAP_ERROR(RAFT_CANTBOOTSTRAP, "metadata contains term 1"); return MUNIT_OK; } dqlite-1.16.7/test/raft/integration/test_uv_init.c000066400000000000000000000236771465252713400222360ustar00rootroot00000000000000#include "../../../src/raft.h" #include "../../../src/raft/byte.h" #include "../../../src/raft/uv_encoding.h" #include "../lib/runner.h" #include "../lib/uv.h" #include #include #define BAD_FORMAT 3 #define BAD_FORMAT_STR "3" /****************************************************************************** * * Fixture with a non-initialized raft_io instance and uv dependencies. * *****************************************************************************/ struct fixture { FIXTURE_UV_DEPS; FIXTURE_UV; bool closed; }; /****************************************************************************** * * Helper macros * *****************************************************************************/ static void closeCb(struct raft_io *io) { struct fixture *f = io->data; f->closed = true; } /* Invoke raft_uv_init() and assert that no error occurs. */ #define INIT(DIR) \ do { \ int _rv; \ _rv = raft_uv_init(&f->io, &f->loop, DIR, &f->transport); \ munit_assert_int(_rv, ==, 0); \ _rv = f->io.init(&f->io, 1, "1"); \ munit_assert_int(_rv, ==, 0); \ } while (0) /* Invoke raft_io->close(). */ #define CLOSE \ do { \ f->io.close(&f->io, closeCb); \ LOOP_RUN_UNTIL(&f->closed); \ raft_uv_close(&f->io); \ } while (0) /* Invoke raft_uv_init() and assert that the given error code is returned and * the given error message set. */ #define INIT_ERROR(DIR, RV, ERRMSG) \ do { \ int _rv; \ _rv = raft_uv_init(&f->io, &f->loop, DIR, &f->transport); \ munit_assert_int(_rv, ==, 0); \ _rv = f->io.init(&f->io, 1, "1"); \ munit_assert_int(_rv, ==, RV); \ munit_assert_string_equal(f->io.errmsg, ERRMSG); \ CLOSE; \ } while (0) /* Write either the metadata1 or metadata2 file, filling it with the given * values. 
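 *
 * The resulting file is four 64-bit words (32 bytes), in the order the
 * macro below writes them:
 *
 *   word 0: on-disk format version
 *   word 1: metadata version (presumably used to tell which of the two
 *           files is the more recent one)
 *   word 2: current term
 *   word 3: voted for (server id, 0 when no vote was cast)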
*/ #define WRITE_METADATA_FILE(N, FORMAT, VERSION, TERM, VOTED_FOR) \ { \ uint8_t buf[8 * 4]; \ void *cursor = buf; \ char filename[strlen("metadataN") + 1]; \ sprintf(filename, "metadata%d", N); \ bytePut64(&cursor, FORMAT); \ bytePut64(&cursor, VERSION); \ bytePut64(&cursor, TERM); \ bytePut64(&cursor, VOTED_FOR); \ DirWriteFile(f->dir, filename, buf, sizeof buf); \ } #define LONG_DIR \ "/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" \ "/bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" \ "/ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc" \ "/ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd" \ "/eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee" \ "/fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff" \ "/ggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggg" \ "/hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh" \ "/iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii" \ "/jjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjj" \ "/kkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkk" \ "/lllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllll" \ "/mmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm" static void *setUp(const MunitParameter params[], void *user_data) { struct fixture *f = munit_malloc(sizeof *f); SETUP_UV_DEPS; f->io.data = f; f->closed = false; return f; } static void tearDown(void *data) { struct fixture *f = data; if (f == NULL) { return; } TEAR_DOWN_UV_DEPS; free(f); } /****************************************************************************** * * raft_io->init() * *****************************************************************************/ SUITE(init) TEST(init, dirTooLong, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_io io = {0}; int rv; rv = raft_uv_init(&io, &f->loop, LONG_DIR, &f->transport); munit_assert_int(rv, ==, RAFT_NAMETOOLONG); munit_assert_string_equal(io.errmsg, "directory path too long"); return 0; } /* Out of memory conditions upon probing for direct I/O. */ TEST(init, probeDirectIoOom, setUp, tearDown, 0, NULL) { struct fixture *f = data; /* XXX: tmpfs seems to not support O_DIRECT */ struct statfs info; int rv; rv = statfs(f->dir, &info); munit_assert_int(rv, ==, 0); if (info.f_type == TMPFS_MAGIC) { return MUNIT_SKIP; } #if defined(__powerpc64__) /* XXX: fails on ppc64el */ return MUNIT_SKIP; #endif HeapFaultConfig(&f->heap, 1 /* delay */, 1 /* repeat */); HEAP_FAULT_ENABLE; INIT_ERROR(f->dir, RAFT_NOMEM, "probe Direct I/O: out of memory"); return 0; } /* Out of memory conditions upon probing for async I/O. */ TEST(init, probeAsyncIoOom, setUp, tearDown, 0, NULL) { struct fixture *f = data; /* XXX: tmpfs seems to not support O_DIRECT */ struct statfs info; int rv; rv = statfs(f->dir, &info); munit_assert_int(rv, ==, 0); if (info.f_type == TMPFS_MAGIC) { return MUNIT_SKIP; } #if defined(__powerpc64__) /* XXX: fails on ppc64el */ return MUNIT_SKIP; #endif HeapFaultConfig(&f->heap, 2 /* delay */, 1 /* repeat */); HEAP_FAULT_ENABLE; INIT_ERROR(f->dir, RAFT_NOMEM, "probe Async I/O: out of memory"); return 0; } /* The given directory does not exist. 
*/ TEST(init, dirDoesNotExist, setUp, tearDown, 0, NULL) { struct fixture *f = data; INIT_ERROR("/foo/bar/egg/baz", RAFT_NOTFOUND, "directory '/foo/bar/egg/baz' does not exist"); return MUNIT_OK; } /* The given directory not accessible */ TEST(init, dirNotAccessible, setUp, tearDown, 0, NULL) { struct fixture *f = data; char errmsg[RAFT_ERRMSG_BUF_SIZE]; sprintf(errmsg, "directory '%s' is not writable", f->dir); DirMakeUnexecutable(f->dir); INIT_ERROR(f->dir, RAFT_INVALID, errmsg); return MUNIT_OK; } /* No space is left for probing I/O capabilities. */ TEST(init, noSpace, setUp, tearDown, 0, DirTmpfsParams) { struct fixture *f = data; SKIP_IF_NO_FIXTURE; DirFill(f->dir, 4); INIT_ERROR(f->dir, RAFT_NOSPACE, "create I/O capabilities probe file: not enough space to " "allocate 4096 bytes"); return MUNIT_OK; } /* The metadata1 file has not the expected number of bytes. In this case the * file is not considered at all, and the effect is as if this was a brand new * server. */ TEST(init, metadataOneTooShort, setUp, tearDown, 0, NULL) { struct fixture *f = data; uint8_t buf[16] = {0}; DirWriteFile(f->dir, "metadata1", buf, sizeof buf); INIT(f->dir); CLOSE; return MUNIT_OK; } /* The metadata1 file has not the expected format. */ TEST(init, metadataOneBadFormat, setUp, tearDown, 0, NULL) { struct fixture *f = data; WRITE_METADATA_FILE(1, /* Metadata file index */ BAD_FORMAT, /* Format */ 1, /* Version */ 1, /* Term */ 0 /* Voted for */); INIT_ERROR(f->dir, RAFT_MALFORMED, "decode content of metadata1: bad format version " BAD_FORMAT_STR); return MUNIT_OK; } /* The metadata1 file has not a valid version. */ TEST(init, metadataOneBadVersion, setUp, tearDown, 0, NULL) { struct fixture *f = data; WRITE_METADATA_FILE(1, /* Metadata file index */ UV__DISK_FORMAT, /* Format */ 0, /* Version */ 1, /* Term */ 0 /* Voted for */); INIT_ERROR(f->dir, RAFT_CORRUPT, "decode content of metadata1: version is set to zero"); return MUNIT_OK; } /* The data directory has both metadata files, but they have the same * version. */ TEST(init, metadataOneAndTwoSameVersion, setUp, tearDown, 0, NULL) { struct fixture *f = data; WRITE_METADATA_FILE(1, /* Metadata file index */ UV__DISK_FORMAT, /* Format */ 2, /* Version */ 3, /* Term */ 0 /* Voted for */); WRITE_METADATA_FILE(2, /* Metadata file index */ UV__DISK_FORMAT, /* Format */ 2, /* Version */ 2, /* Term */ 0 /* Voted for */); INIT_ERROR(f->dir, RAFT_CORRUPT, "metadata1 and metadata2 are both at version 2"); return MUNIT_OK; } dqlite-1.16.7/test/raft/integration/test_uv_load.c000066400000000000000000002115411465252713400221770ustar00rootroot00000000000000#include #include "../../../src/raft/byte.h" #include "../../../src/raft/uv.h" #include "../../../src/raft/uv_encoding.h" #include "../lib/runner.h" #include "../lib/uv.h" /****************************************************************************** * * Fixture with a non-initialized libuv-based raft_io instance. 
* *****************************************************************************/ struct fixture { FIXTURE_UV_DEPS; FIXTURE_UV; }; /****************************************************************************** * * Helper macros * *****************************************************************************/ static void closeCb(struct raft_io *io) { bool *done = io->data; *done = true; } static void appendCb(struct raft_io_append *req, int status) { bool *done = req->data; munit_assert_int(status, ==, 0); *done = true; } static void snapshotPutCb(struct raft_io_snapshot_put *req, int status) { bool *done = req->data; munit_assert_int(status, ==, 0); *done = true; } struct snapshot { raft_term term; raft_index index; uint64_t data; }; #define WORD_SIZE 8 /* Maximum number of blocks a segment can have */ #define MAX_SEGMENT_BLOCKS 4 /* This block size should work fine for all file systems. */ #define SEGMENT_BLOCK_SIZE 4096 /* Desired segment size */ #define SEGMENT_SIZE SEGMENT_BLOCK_SIZE *MAX_SEGMENT_BLOCKS #define CLOSED_SEGMENT_FILENAME(START, END) \ "000000000000000" #START \ "-" \ "000000000000000" #END /* Check if open segment file exists. */ #define HAS_OPEN_SEGMENT_FILE(COUNT) DirHasFile(f->dir, "open-" #COUNT) /* Check if closed segment file exists. */ #define HAS_CLOSED_SEGMENT_FILE(START, END) \ DirHasFile(f->dir, CLOSED_SEGMENT_FILENAME(START, END)) /* Initialize a standalone raft_io instance and use it to append N batches of * entries, each containing one entry. DATA should be an integer that will be * used as base value for the data of the first entry, and will be then * incremented for subsequent entries. */ #define APPEND(N, DATA) \ do { \ struct raft_uv_transport _transport; \ struct raft_io _io; \ raft_term _term; \ raft_id _voted_for; \ struct raft_snapshot *_snapshot; \ raft_index _start_index; \ struct raft_entry *_entries; \ size_t _i; \ size_t _n; \ void *_batch = NULL; \ struct raft_entry _new_entry; \ uint64_t _new_entry_data; \ uint64_t _data = DATA; \ struct raft_io_append _req; \ bool _done = false; \ int _rv; \ \ /* Initialize the instance, loading existing data, but discarding \ * it. This makes sure that the start index is correctly set. */ \ _transport.version = 1; \ _rv = raft_uv_tcp_init(&_transport, &f->loop); \ munit_assert_int(_rv, ==, 0); \ _rv = raft_uv_init(&_io, &f->loop, f->dir, &_transport); \ munit_assert_int(_rv, ==, 0); \ _rv = _io.init(&_io, 1, "1"); \ munit_assert_int(_rv, ==, 0); \ raft_uv_set_block_size(&_io, SEGMENT_BLOCK_SIZE); \ raft_uv_set_segment_size(&_io, SEGMENT_SIZE); \ _rv = _io.load(&_io, &_term, &_voted_for, &_snapshot, &_start_index, \ &_entries, &_n); \ munit_assert_int(_rv, ==, 0); \ for (_i = 0; _i < _n; _i++) { \ struct raft_entry *_entry = &_entries[_i]; \ if (_entry->batch != _batch) { \ _batch = _entry->batch; \ raft_free(_batch); \ } \ } \ if (_entries != NULL) { \ raft_free(_entries); \ } \ if (_snapshot != NULL) { \ raft_configuration_close(&_snapshot->configuration); \ munit_assert_int(_snapshot->n_bufs, ==, 1); \ raft_free(_snapshot->bufs[0].base); \ raft_free(_snapshot->bufs); \ raft_free(_snapshot); \ } \ \ /* Append the new entries. 
*/ \ for (_i = 0; _i < N; _i++) { \ struct raft_entry *entry = &_new_entry; \ entry->term = 1; \ entry->type = RAFT_COMMAND; \ entry->buf.base = &_new_entry_data; \ entry->buf.len = sizeof _new_entry_data; \ entry->batch = NULL; \ munit_assert_ptr_not_null(entry->buf.base); \ memset(entry->buf.base, 0, entry->buf.len); \ *(uint64_t *)entry->buf.base = _data; \ _data++; \ _req.data = &_done; \ _rv = _io.append(&_io, &_req, entry, 1, appendCb); \ munit_assert_int(_rv, ==, 0); \ LOOP_RUN_UNTIL(&_done); \ _done = false; \ } \ \ /* Shutdown the standalone raft_io instance. */ \ _done = false; \ _io.data = &_done; \ _io.close(&_io, closeCb); \ LOOP_RUN_UNTIL(&_done); \ raft_uv_close(&_io); \ raft_uv_tcp_close(&_transport); \ } while (0); /* Initialize a standalone raft_io instance and use it to persist a new snapshot * at the given INDEX and TERM. DATA should be an integer that will be used as * as snapshot content. */ #define SNAPSHOT_PUT(TERM, INDEX, DATA) \ do { \ struct raft_uv_transport _transport; \ struct raft_io _io; \ raft_term _term; \ raft_id _voted_for; \ struct raft_snapshot *_snapshot; \ raft_index _start_index; \ struct raft_entry *_entries; \ size_t _i; \ size_t _n; \ void *_batch = NULL; \ struct raft_snapshot _new_snapshot; \ struct raft_buffer _new_snapshot_buf; \ uint64_t _new_snapshot_data = DATA; \ struct raft_io_snapshot_put _req; \ bool _done = false; \ int _rv; \ \ /* Initialize the instance, loading existing data, but discarding \ * it. This makes sure that the start index is correctly set. */ \ _transport.version = 1; \ _rv = raft_uv_tcp_init(&_transport, &f->loop); \ munit_assert_int(_rv, ==, 0); \ _rv = raft_uv_init(&_io, &f->loop, f->dir, &_transport); \ munit_assert_int(_rv, ==, 0); \ _rv = _io.init(&_io, 1, "1"); \ munit_assert_int(_rv, ==, 0); \ raft_uv_set_block_size(&_io, SEGMENT_BLOCK_SIZE); \ raft_uv_set_segment_size(&_io, SEGMENT_SIZE); \ _rv = _io.load(&_io, &_term, &_voted_for, &_snapshot, &_start_index, \ &_entries, &_n); \ munit_assert_int(_rv, ==, 0); \ for (_i = 0; _i < _n; _i++) { \ struct raft_entry *_entry = &_entries[_i]; \ if (_entry->batch != _batch) { \ _batch = _entry->batch; \ raft_free(_batch); \ } \ } \ if (_entries != NULL) { \ raft_free(_entries); \ } \ if (_snapshot != NULL) { \ raft_configuration_close(&_snapshot->configuration); \ munit_assert_int(_snapshot->n_bufs, ==, 1); \ raft_free(_snapshot->bufs[0].base); \ raft_free(_snapshot->bufs); \ raft_free(_snapshot); \ } \ \ /* Persist the new snapshot. */ \ _new_snapshot.index = INDEX; \ _new_snapshot.term = TERM; \ raft_configuration_init(&_new_snapshot.configuration); \ _rv = raft_configuration_add(&_new_snapshot.configuration, 1, "1", \ RAFT_VOTER); \ munit_assert_int(_rv, ==, 0); \ _new_snapshot.bufs = &_new_snapshot_buf; \ _new_snapshot.n_bufs = 1; \ _new_snapshot_buf.base = &_new_snapshot_data; \ _new_snapshot_buf.len = sizeof _new_snapshot_data; \ _req.data = &_done; \ _rv = \ _io.snapshot_put(&_io, 10, &_req, &_new_snapshot, snapshotPutCb); \ munit_assert_int(_rv, ==, 0); \ LOOP_RUN_UNTIL(&_done); \ raft_configuration_close(&_new_snapshot.configuration); \ \ /* Shutdown the standalone raft_io instance. */ \ _done = false; \ _io.data = &_done; \ _io.close(&_io, closeCb); \ LOOP_RUN_UNTIL(&_done); \ raft_uv_close(&_io); \ raft_uv_tcp_close(&_transport); \ } while (0); /* Forcibly turn a closed segment into an open one, by renaming the underlying * file and growing its size. 
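 *
 * Segment filenames encode their state: a closed segment is named after
 * the inclusive range of entry indexes it holds, while an open segment
 * that is still being written is named open-N. For example
 *
 *   UNFINALIZE(4, 4, 1);
 *
 * renames 0000000000000004-0000000000000004 to open-1 and grows the
 * file to SEGMENT_SIZE, so that it looks like a preallocated segment
 * the server was still writing when it stopped.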
*/ #define UNFINALIZE(FIRST_INDEX, LAST_INDEX, COUNTER) \ do { \ const char *_filename1 = \ CLOSED_SEGMENT_FILENAME(FIRST_INDEX, LAST_INDEX); \ char _filename2[64]; \ sprintf(_filename2, "open-%u", (unsigned)COUNTER); \ munit_assert_true(DirHasFile(f->dir, _filename1)); \ munit_assert_false(DirHasFile(f->dir, _filename2)); \ DirRenameFile(f->dir, _filename1, _filename2); \ DirGrowFile(f->dir, _filename2, SEGMENT_SIZE); \ } while (0) #define LOAD_VARS \ int _rv; \ raft_term _term; \ raft_id _voted_for; \ struct raft_snapshot *_snapshot; \ raft_index _start_index; \ struct raft_entry *_entries; \ size_t _n; /* Initialize the raft_io instance, then call raft_io->load() and assert that it * returns the given error code and message. */ #define LOAD_ERROR(RV, ERRMSG) \ do { \ LOAD_VARS; \ SETUP_UV; \ _rv = f->io.load(&f->io, &_term, &_voted_for, &_snapshot, \ &_start_index, &_entries, &_n); \ munit_assert_int(_rv, ==, RV); \ munit_assert_string_equal(f->io.errmsg, ERRMSG); \ } while (0) #define LOAD_ERROR_NO_SETUP(RV, ERRMSG) \ do { \ LOAD_VARS; \ _rv = f->io.load(&f->io, &_term, &_voted_for, &_snapshot, \ &_start_index, &_entries, &_n); \ munit_assert_int(_rv, ==, RV); \ munit_assert_string_equal(f->io.errmsg, ERRMSG); \ } while (0) #define LOAD_ERROR_NO_RECOVER(RV, ERRMSG) \ do { \ LOAD_VARS; \ SETUP_UV; \ _rv = f->io.load(&f->io, &_term, &_voted_for, &_snapshot, \ &_start_index, &_entries, &_n); \ munit_assert_int(_rv, ==, RV); \ munit_assert_string_equal(f->io.errmsg, ERRMSG); \ } while (0) #define _LOAD(TERM, VOTED_FOR, SNAPSHOT, START_INDEX, N_ENTRIES) \ _rv = f->io.load(&f->io, &_term, &_voted_for, &_snapshot, &_start_index, \ &_entries, &_n); \ munit_assert_int(_rv, ==, 0); \ munit_assert_int(_term, ==, TERM); \ munit_assert_int(_voted_for, ==, VOTED_FOR); \ munit_assert_int(_start_index, ==, START_INDEX); \ if (_snapshot != NULL) { \ struct snapshot *_expected = (struct snapshot *)(SNAPSHOT); \ munit_assert_ptr_not_null(_snapshot); \ munit_assert_int(_snapshot->term, ==, _expected->term); \ munit_assert_int(_snapshot->index, ==, _expected->index); \ munit_assert_int(_snapshot->n_bufs, ==, 1); \ munit_assert_int(*(uint64_t *)_snapshot->bufs[0].base, ==, \ _expected->data); \ raft_configuration_close(&_snapshot->configuration); \ raft_free(_snapshot->bufs[0].base); \ raft_free(_snapshot->bufs); \ raft_free(_snapshot); \ } \ if (_n != 0) { \ munit_assert_int(_n, ==, N_ENTRIES); \ for (_i = 0; _i < _n; _i++) { \ struct raft_entry *_entry = &_entries[_i]; \ uint64_t _value = *(uint64_t *)_entry->buf.base; \ munit_assert_int(_value, ==, _data); \ _data++; \ } \ for (_i = 0; _i < _n; _i++) { \ struct raft_entry *_entry = &_entries[_i]; \ if (_entry->batch != _batch) { \ _batch = _entry->batch; \ raft_free(_batch); \ } \ } \ raft_free(_entries); \ } /* Initialize the raft_io instance, then invoke raft_io->load() and assert that * it returns the given state. If non-NULL, SNAPSHOT points to a struct snapshot * object whose attributes must match the loaded snapshot. ENTRIES_DATA is * supposed to be the integer stored in the data of first loaded entry. */ #define LOAD(TERM, VOTED_FOR, SNAPSHOT, START_INDEX, ENTRIES_DATA, N_ENTRIES) \ do { \ LOAD_VARS; \ void *_batch = NULL; \ uint64_t _data = ENTRIES_DATA; \ unsigned _i; \ SETUP_UV; \ _LOAD(TERM, VOTED_FOR, SNAPSHOT, START_INDEX, N_ENTRIES) \ } while (0) /* Same as LOAD but with auto recovery turned on. 
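 *
 * The only difference is an extra raft_uv_set_auto_recovery(&f->io,
 * true) call right after SETUP_UV, which makes load discard damaged or
 * inconsistent segments where the default behavior is to fail with
 * RAFT_CORRUPT.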
*/ #define LOAD_WITH_AUTO_RECOVERY(TERM, VOTED_FOR, SNAPSHOT, START_INDEX, \ ENTRIES_DATA, N_ENTRIES) \ do { \ LOAD_VARS; \ void *_batch = NULL; \ uint64_t _data = ENTRIES_DATA; \ unsigned _i; \ SETUP_UV; \ raft_uv_set_auto_recovery(&f->io, true); \ _LOAD(TERM, VOTED_FOR, SNAPSHOT, START_INDEX, N_ENTRIES) \ } while (0) /* Same as LOAD without SETUP_UV */ #define LOAD_NO_SETUP(TERM, VOTED_FOR, SNAPSHOT, START_INDEX, ENTRIES_DATA, \ N_ENTRIES) \ do { \ LOAD_VARS; \ void *_batch = NULL; \ uint64_t _data = ENTRIES_DATA; \ unsigned _i; \ _LOAD(TERM, VOTED_FOR, SNAPSHOT, START_INDEX, N_ENTRIES) \ } while (0) /****************************************************************************** * * Set up and tear down. * *****************************************************************************/ static void *setUp(const MunitParameter params[], void *user_data) { struct fixture *f = munit_malloc(sizeof *f); SETUP_UV_DEPS; return f; } static void tearDown(void *data) { struct fixture *f = data; TEAR_DOWN_UV; TEAR_DOWN_UV_DEPS; free(f); } /****************************************************************************** * * raft_io->load() * *****************************************************************************/ SUITE(load) /* Load the initial state of a pristine server. */ TEST(load, emptyDir, setUp, tearDown, 0, NULL) { struct fixture *f = data; LOAD(0, /* term */ 0, /* voted for */ NULL, /* snapshot */ 1, /* start index */ 0, /* data for first loaded entry */ 0 /* n entries */ ); return MUNIT_OK; } static char *unknownFiles[] = { "garbage", "0000000000000000000000000001-00000000001garbage", "open-1garbage", NULL, }; static MunitParameterEnum unknownFilesParams[] = { {"filename", unknownFiles}, {NULL, NULL}, }; /* Files that are not part of the raft state are ignored. */ TEST(load, ignoreUnknownFiles, setUp, tearDown, 0, unknownFilesParams) { struct fixture *f = data; const char *filename = munit_parameters_get(params, "filename"); DirWriteFileWithZeros(f->dir, filename, 128); LOAD(0, /* term */ 0, /* voted for */ NULL, /* snapshot */ 1, /* start index */ 0, /* data for first loaded entry */ 0 /* n entries */ ); return MUNIT_OK; } static char *unusableFiles[] = {"tmp-0000000001221212-0000000001221217", "tmp-snapshot-15-8260687-512469866", "snapshot-525-43326736-880259052", "snapshot-999-13371337-880259052.meta", "snapshot-20-8260687-512469866", "snapshot-88-8260687-512469866.meta", "snapshot-88-8260999-512469866.meta", "tmp-snapshot-88-8260999-512469866.meta", "tmp-snapshot-33-8260687-512469866", "snapshot-33-8260687-512469866.meta", "tmp-metadata1", "tmp-metadata2", "tmp-open1", "tmp-open13", NULL}; static MunitParameterEnum unusableFilesParams[] = { {"filename", unusableFiles}, {NULL, NULL}, }; /* Files that can no longer be used are removed. */ TEST(load, removeUnusableFiles, setUp, tearDown, 0, unusableFilesParams) { struct fixture *f = data; const char *filename = munit_parameters_get(params, "filename"); DirWriteFileWithZeros(f->dir, filename, 128); munit_assert_true(DirHasFile(f->dir, filename)); LOAD(0, /* term */ 0, /* voted for */ NULL, /* snapshot */ 1, /* start index */ 0, /* data for first loaded entry */ 0 /* n entries */ ); munit_assert_false(DirHasFile(f->dir, filename)); return MUNIT_OK; } /* The data directory has an empty open segment. 
*/ TEST(load, emptyOpenSegment, setUp, tearDown, 0, NULL) { struct fixture *f = data; DirWriteFile(f->dir, "open-1", NULL, 0); LOAD(0, /* term */ 0, /* voted for */ NULL, /* snapshot */ 1, /* start index */ 0, /* data for first loaded entry */ 0 /* n entries */ ); /* The empty segment has been removed. */ munit_assert_false(HAS_OPEN_SEGMENT_FILE(1)); return MUNIT_OK; } /* The data directory has a freshly allocated open segment filled with zeros. */ TEST(load, openSegmentWithTrailingZeros, setUp, tearDown, 0, NULL) { struct fixture *f = data; DirWriteFileWithZeros(f->dir, "open-1", 256); LOAD(0, /* term */ 0, /* voted for */ NULL, /* snapshot */ 1, /* start index */ 0, /* data for first loaded entry */ 0 /* n entries */ ); /* The empty segment has been removed. */ munit_assert_false(HAS_OPEN_SEGMENT_FILE(1)); return MUNIT_OK; } /* The data directory has a valid closed and open segments. */ TEST(load, bothOpenAndClosedSegments, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND(2, 1); APPEND(1, 3); APPEND(1, 4); UNFINALIZE(4, 4, 1); LOAD(0, /* term */ 0, /* voted for */ NULL, /* snapshot */ 1, /* start index */ 1, /* data for first loaded entry */ 4 /* n entries */ ); return MUNIT_OK; } /* The data directory has an allocated open segment which contains non-zero * corrupted data in its second batch. */ TEST(load, openSegmentWithNonZeroData, setUp, tearDown, 0, NULL) { struct fixture *f = data; uint64_t corrupt = 123456789; APPEND(2, 1); UNFINALIZE(1, 2, 1); DirOverwriteFile(f->dir, "open-1", &corrupt, sizeof corrupt, 60); LOAD(0, /* term */ 0, /* voted for */ NULL, /* snapshot */ 1, /* start index */ 1, /* data for first loaded entry */ 1 /* n entries */ ); /* The segment has been removed. */ munit_assert_false(HAS_OPEN_SEGMENT_FILE(1)); return MUNIT_OK; } /* The data directory has an open segment with a partially written batch that * needs to be truncated. */ TEST(load, openSegmentWithIncompleteBatch, setUp, tearDown, 0, NULL) { struct fixture *f = data; uint8_t zero[256]; APPEND(2, 1); UNFINALIZE(1, 2, 1); memset(zero, 0, sizeof zero); DirOverwriteFile(f->dir, "open-1", &zero, sizeof zero, 62); LOAD(0, /* term */ 0, /* voted for */ NULL, /* snapshot */ 1, /* start index */ 1, /* data for first loaded entry */ 1 /* n entries */ ); return MUNIT_OK; } /* The data directory has an open segment whose first batch is only * partially written. In that case the segment gets removed. */ TEST(load, openSegmentWithIncompleteFirstBatch, setUp, tearDown, 0, NULL) { struct fixture *f = data; uint8_t buf[5 * WORD_SIZE] = { UV__DISK_FORMAT, 0, 0, 0, 0, 0, 0, 0, /* Format version */ 0, 0, 0, 0, 0, 0, 0, 0, /* CRC32 checksums */ 0, 0, 0, 0, 0, 0, 0, 0, /* Number of entries */ 0, 0, 0, 0, 0, 0, 0, 0, /* Local data size */ 0, 0, 0, 0, 0, 0, 0, 0 /* Batch data */ }; APPEND(1, 1); UNFINALIZE(1, 1, 1); DirOverwriteFile(f->dir, "open-1", buf, sizeof buf, 0); LOAD(0, /* term */ 0, /* voted for */ NULL, /* snapshot */ 1, /* start index */ 0, /* data for first loaded entry */ 0 /* n entries */ ); return MUNIT_OK; } /* The data directory has two segments, with the second having an entry. */ TEST(load, twoOpenSegments, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND(1, 1); APPEND(1, 2); UNFINALIZE(1, 1, 1); UNFINALIZE(2, 2, 2); LOAD(0, /* term */ 0, /* voted for */ NULL, /* snapshot */ 1, /* start index */ 1, /* data for first loaded entry */ 2 /* n entries */ ); /* The first and second segments have been renamed. 
*/ munit_assert_false(HAS_OPEN_SEGMENT_FILE(1)); munit_assert_false(HAS_OPEN_SEGMENT_FILE(2)); munit_assert_true(HAS_CLOSED_SEGMENT_FILE(1, 1)); munit_assert_true(HAS_CLOSED_SEGMENT_FILE(2, 2)); return MUNIT_OK; } /* The data directory has two open segments, with the second one filled with * zeros. */ TEST(load, secondOpenSegmentIsAllZeros, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND(1, 1); UNFINALIZE(1, 1, 1); DirWriteFileWithZeros(f->dir, "open-2", SEGMENT_SIZE); LOAD(0, /* term */ 0, /* voted for */ NULL, /* snapshot */ 1, /* start index */ 1, /* data for first loaded entry */ 1 /* n entries */ ); /* The first segment has been renamed. */ munit_assert_false(HAS_OPEN_SEGMENT_FILE(1)); munit_assert_true(HAS_CLOSED_SEGMENT_FILE(1, 1)); /* The second segment has been removed. */ munit_assert_false(HAS_OPEN_SEGMENT_FILE(2)); return MUNIT_OK; } /* The data directory has two open segments, the first one has a corrupt header * and auto-recovery is on. */ TEST(load, twoOpenSegmentsFirstCorruptAutoRecovery, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND(1, 1); UNFINALIZE(1, 1, 1); DirWriteFileWithZeros(f->dir, "open-2", SEGMENT_SIZE); /* Corrupt open segment */ uint64_t version = 0 /* Format version */; DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0); /* Load is successful and equals pristine condition. */ LOAD_WITH_AUTO_RECOVERY(0, /* term */ 0, /* voted for */ NULL, /* snapshot */ 1, /* start index */ 0, /* data for first loaded entry */ 0 /* n entries */ ); /* The open segments are renamed, and there is no closed segment. */ munit_assert_false(HAS_OPEN_SEGMENT_FILE(1)); munit_assert_false(HAS_OPEN_SEGMENT_FILE(2)); munit_assert_false(HAS_CLOSED_SEGMENT_FILE(1, 1)); return MUNIT_OK; } /* The data directory has two open segments, the first one has a corrupt header. */ TEST(load, twoOpenSegmentsFirstCorrupt, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND(1, 1); UNFINALIZE(1, 1, 1); DirWriteFileWithZeros(f->dir, "open-2", SEGMENT_SIZE); /* Corrupt open segment */ uint64_t version = 0 /* Format version */; DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0); LOAD_ERROR(RAFT_CORRUPT, "load open segment open-1: unexpected format version 0"); /* The open segments are renamed, and there is no closed segment. */ munit_assert_true(HAS_OPEN_SEGMENT_FILE(1)); munit_assert_true(HAS_OPEN_SEGMENT_FILE(2)); return MUNIT_OK; } /* The data directory has a valid open segment. */ TEST(load, openSegment, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND(1, 1); UNFINALIZE(1, 1, 1); LOAD(0, /* term */ 0, /* voted for */ NULL, /* snapshot */ 1, /* start index */ 1, /* data for first loaded entry */ 1 /* n entries */ ); return MUNIT_OK; } /* There is exactly one snapshot and no segments. */ TEST(load, onlyOneSnapshot, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct snapshot snapshot = { 1, /* term */ 1, /* index */ 1 /* data */ }; SNAPSHOT_PUT(1, 1, 1); LOAD(0, /* term */ 0, /* voted for */ &snapshot, /* snapshot */ 2, /* start index */ 0, /* data for first loaded entry */ 0 /* n entries */ ); return MUNIT_OK; } /* There are several snapshots, including an incomplete one. The last one is * loaded and the incomplete or older ones are removed. 
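 *
 * Snapshot files come in pairs: snapshot-TERM-INDEX-TIMESTAMP holds the
 * data, and the matching file with the UV__SNAPSHOT_META_SUFFIX
 * extension holds its metadata; a snapshot missing either half is
 * considered incomplete. The test below fabricates such a half-written
 * snapshot by deleting the data file right after taking it:
 *
 *   sprintf(filename, "snapshot-1-8-%ju", now);
 *   SNAPSHOT_PUT(1, 8, 1);
 *   DirRemoveFile(f->dir, filename);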
*/ TEST(load, manySnapshots, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct snapshot snapshot = { 2, /* term */ 9, /* index */ 4 /* data */ }; char filename[64]; uint64_t now; /* Take a snapshot but then remove the data file, as if the server crashed * before it could complete writing it. */ uv_update_time(&f->loop); now = uv_now(&f->loop); sprintf(filename, "snapshot-1-8-%ju", now); SNAPSHOT_PUT(1, 8, 1); DirRemoveFile(f->dir, filename); SNAPSHOT_PUT(1, 8, 2); SNAPSHOT_PUT(2, 6, 3); SNAPSHOT_PUT(2, 9, 4); LOAD(0, /* term */ 0, /* voted for */ &snapshot, /* snapshot */ 10, /* start index */ 0, /* data for first loaded entry */ 0 /* n entries */ ); /* The orphaned .meta file is removed */ char meta_filename[128]; sprintf(meta_filename, "%s%s", filename, UV__SNAPSHOT_META_SUFFIX); munit_assert_false(DirHasFile(f->dir, meta_filename)); return MUNIT_OK; } /* There are two snapshots, but the last one has an empty data file. The first * one is loaded and the empty one is discarded. */ TEST(load, emptySnapshot, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct snapshot snapshot = { 1, /* term */ 4, /* index */ 1 /* data */ }; char filename[64]; uint64_t now; SNAPSHOT_PUT(1, 4, 1); /* Take a snapshot but then truncate the data file, as if the server ran out * of space before it could write it. */ uv_update_time(&f->loop); now = uv_now(&f->loop); sprintf(filename, "snapshot-2-6-%ju", now); SNAPSHOT_PUT(2, 6, 2); DirTruncateFile(f->dir, filename, 0); LOAD(0, /* term */ 0, /* voted for */ &snapshot, /* snapshot */ 5, /* start index */ 0, /* data for first loaded entry */ 0 /* n entries */ ); return MUNIT_OK; } /* There is an orphaned snapshot and an orphaned snapshot .meta file, * make sure they are removed */ TEST(load, orphanedSnapshotFiles, setUp, tearDown, 0, NULL) { struct fixture *f = data; uv_update_time(&f->loop); uint64_t now = uv_now(&f->loop); struct snapshot expected_snapshot = { 2, /* term */ 16, /* index */ 4 /* data */ }; char filename1_removed[64]; char metafilename1_removed[64]; char filename2_removed[64]; char metafilename2_removed[64]; /* Take a snapshot but then remove the data file, as if the server crashed * before it could complete writing it. */ sprintf(filename1_removed, "snapshot-2-18-%ju", now); sprintf(metafilename1_removed, "snapshot-2-18-%ju%s", now, UV__SNAPSHOT_META_SUFFIX); SNAPSHOT_PUT(2, 18, 1); munit_assert_true(DirHasFile(f->dir, filename1_removed)); munit_assert_true(DirHasFile(f->dir, metafilename1_removed)); DirRemoveFile(f->dir, filename1_removed); /* Take a snapshot but then remove the .meta file */ now = uv_now(&f->loop); sprintf(filename2_removed, "snapshot-2-19-%ju", now); sprintf(metafilename2_removed, "snapshot-2-19-%ju%s", now, UV__SNAPSHOT_META_SUFFIX); SNAPSHOT_PUT(2, 19, 2); munit_assert_true(DirHasFile(f->dir, filename2_removed)); munit_assert_true(DirHasFile(f->dir, metafilename2_removed)); DirRemoveFile(f->dir, metafilename2_removed); /* Take a valid snapshot and make sure it's loaded */ SNAPSHOT_PUT(2, 16, 4); LOAD(0, /* term */ 0, /* voted for */ &expected_snapshot, /* snapshot */ 17, /* start index */ 0, /* data for first loaded entry */ 0 /* n entries */ ); /* The orphaned files are removed */ munit_assert_false(DirHasFile(f->dir, metafilename1_removed)); munit_assert_false(DirHasFile(f->dir, filename2_removed)); return MUNIT_OK; } /* The data directory has a closed segment with entries that are no longer * needed, since they are included in a snapshot. 
We still keep those segments * and just let the next snapshot logic delete them. */ TEST(load, closedSegmentWithEntriesBehindSnapshot, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct snapshot snapshot = { 1, /* term */ 2, /* index */ 1 /* data */ }; APPEND(1, 1); SNAPSHOT_PUT(1, 2, 1); LOAD(0, /* term */ 0, /* voted for */ &snapshot, /* snapshot */ 3, /* start index */ 0, /* data for first loaded entry */ 0 /* n entries */ ); munit_assert_true(HAS_CLOSED_SEGMENT_FILE(1, 1)); return MUNIT_OK; } /* The data directory has a closed segment with entries that are no longer * needed, since they are included in a snapshot. However it also has an open * segment that has enough entries to reach the snapshot last index. */ TEST(load, openSegmentWithEntriesPastSnapshot, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct snapshot snapshot = { 1, /* term */ 2, /* index */ 1 /* data */ }; APPEND(1, 1); APPEND(1, 2); SNAPSHOT_PUT(1, 2, 1); UNFINALIZE(2, 2, 1); LOAD(0, /* term */ 0, /* voted for */ &snapshot, /* snapshot */ 1, /* start index */ 1, /* data for first loaded entry */ 2 /* n entries */ ); return MUNIT_OK; } /* The data directory has a closed segment whose filename encodes a number of * entries which is different then ones it actually contains. */ TEST(load, closedSegmentWithInconsistentFilename, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND(3, 1); DirRenameFile(f->dir, "0000000000000001-0000000000000003", "0000000000000001-0000000000000004"); LOAD_ERROR(RAFT_CORRUPT, "load closed segment 0000000000000001-0000000000000004: found 3 " "entries (expected 4)"); return MUNIT_OK; } /* The data directory has a closed segment whose filename encodes a number of * entries which is different then ones it actually contains, and auto-recovery * is turned on. */ TEST(load, closedSegmentWithInconsistentFilenameAutoRecovery, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND(3, 1); DirRenameFile(f->dir, "0000000000000001-0000000000000003", "0000000000000001-0000000000000004"); /* Load in pristine condition */ LOAD_WITH_AUTO_RECOVERY(0, /* term */ 0, /* voted for */ NULL, /* snapshot */ 1, /* start index */ 0, /* data for first loaded entry */ 0 /* n entries */ ); return MUNIT_OK; } /* The data directory has a closed segment with entries that are no longer * needed, since they are included in a snapshot. It also has an open segment, * however that does not have enough entries to reach the snapshot last * index. */ TEST(load, openSegmentWithEntriesBehindSnapshot, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND(1, 1); APPEND(1, 2); SNAPSHOT_PUT(1, 3, 1); UNFINALIZE(2, 2, 1); LOAD_ERROR(RAFT_CORRUPT, "last entry on disk has index 2, which is behind last " "snapshot's index 3"); return MUNIT_OK; } /* The data directory has a closed segment with entries that are no longer * needed, since they are included in a snapshot. It also has an open segment, * however that does not have enough entries to reach the snapshot last * index, and auto-receovery is turned on. 
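 *
 * With auto-recovery enabled the stale entries are discarded instead of
 * being reported as corruption: the test below expects load to come up
 * with the snapshot only, the start index placed at the snapshot's last
 * index plus one (4 here), and no entries.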
*/ TEST(load, openSegmentWithEntriesBehindSnapshotAutoRecovery, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct snapshot snapshot = { 1, /* term */ 3, /* index */ 1 /* data */ }; APPEND(1, 1); APPEND(1, 2); SNAPSHOT_PUT(1, 3, 1); UNFINALIZE(2, 2, 1); LOAD_WITH_AUTO_RECOVERY(0, /* term */ 0, /* voted for */ &snapshot, /* snapshot */ 4, /* start index */ 0, /* data for first loaded entry */ 0 /* n entries */ ); return MUNIT_OK; } /* The data directory contains a snapshot and an open segment containing a valid * entry, and no closed segments. */ TEST(load, openSegmentNoClosedSegmentsSnapshotPresent, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct snapshot snapshot = { 1, /* term */ 3, /* index */ 1 /* data */ }; SNAPSHOT_PUT(1, 3, 1); APPEND(1, 4); UNFINALIZE(4, 4, 1); LOAD(0, /* term */ 0, /* voted for */ &snapshot, /* snapshot */ 4, /* start index */ 4, /* data for first loaded entry */ 1 /* n entries */ ); return MUNIT_OK; } /* The data directory contains a snapshot and an open segment with a corrupt * format header and no closed segments. */ TEST(load, corruptOpenSegmentNoClosedSegmentsSnapshotPresent, setUp, tearDown, 0, NULL) { struct fixture *f = data; SNAPSHOT_PUT(1, 3, 1); APPEND(1, 4); UNFINALIZE(4, 4, 1); /* Corrupt open segment */ uint64_t version = 0 /* Format version */; DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0); LOAD_ERROR(RAFT_CORRUPT, "load open segment open-1: unexpected format version 0"); return MUNIT_OK; } /* The data directory contains a snapshot and an open segment with a corrupt * format header and no closed segments. Auto-recovery is turned on. */ TEST(load, corruptOpenSegmentNoClosedSegmentsSnapshotPresentWithAutoRecovery, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct snapshot snapshot = { 1, /* term */ 3, /* index */ 1 /* data */ }; SNAPSHOT_PUT(1, 3, 1); APPEND(1, 4); UNFINALIZE(4, 4, 1); /* Corrupt open segment */ uint64_t version = 0 /* Format version */; DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0); /* Load is successful. */ LOAD_WITH_AUTO_RECOVERY(0, /* term */ 0, /* voted for */ &snapshot, /* snapshot */ 4, /* start index */ 1, /* data for first loaded entry */ 1 /* n entries */ ); return MUNIT_OK; } /* The data directory contains a snapshot and an open segment with a corrupt * format header and a closed segment. */ TEST(load, corruptOpenSegmentClosedSegmentSnapshotPresent, setUp, tearDown, 0, NULL) { struct fixture *f = data; SNAPSHOT_PUT(1, 3, 1); APPEND(1, 4); APPEND(1, 5); UNFINALIZE(5, 5, 1); /* Corrupt open segment */ uint64_t version = 0 /* Format version */; DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0); LOAD_ERROR(RAFT_CORRUPT, "load open segment open-1: unexpected format version 0"); return MUNIT_OK; } /* The data directory contains a snapshot and an open segment with a corrupt * format header and a closed segment. Auto-recovery is turned on. */ TEST(load, corruptOpenSegmentClosedSegmentSnapshotPresentWithAutoRecovery, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct snapshot snapshot = { 1, /* term */ 3, /* index */ 1 /* data */ }; SNAPSHOT_PUT(1, 3, 1); APPEND(1, 4); APPEND(1, 5); UNFINALIZE(5, 5, 1); /* Corrupt open segment */ uint64_t version = 0 /* Format version */; DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0); /* Load is successful. 
*/ LOAD_WITH_AUTO_RECOVERY(0, /* term */ 0, /* voted for */ &snapshot, /* snapshot */ 4, /* start index */ 4, /* data for first loaded entry */ 1 /* n entries */ ); /* Open segment has been renamed */ munit_assert_false(DirHasFile(f->dir, "open-1")); return MUNIT_OK; } /* The data directory contains a snapshot and an open segment with a corrupt * format header and multiple closed segment. Auto-recovery is turned on. */ TEST(load, corruptOpenSegmentClosedSegmentsSnapshotPresentWithAutoRecovery, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct snapshot snapshot = { 1, /* term */ 3, /* index */ 1 /* data */ }; SNAPSHOT_PUT(1, 3, 1); APPEND(1, 4); APPEND(1, 5); APPEND(1, 6); UNFINALIZE(6, 6, 1); /* Corrupt open segment */ uint64_t version = 0 /* Format version */; DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0); LOAD_WITH_AUTO_RECOVERY(0, /* term */ 0, /* voted for */ &snapshot, /* snapshot */ 4, /* start index */ 4, /* data for first loaded entry */ 2 /* n entries */ ); /* Open segment has been renamed during the first load */ munit_assert_false(DirHasFile(f->dir, "open-1")); return MUNIT_OK; } /* The data directory contains a snapshot and an open segment with a corrupt * format header and multiple closed segment. */ TEST(load, corruptOpenSegmentClosedSegmentsSnapshotPresent, setUp, tearDown, 0, NULL) { struct fixture *f = data; SNAPSHOT_PUT(1, 3, 1); APPEND(1, 4); APPEND(1, 5); APPEND(1, 6); UNFINALIZE(6, 6, 1); /* Corrupt open segment */ uint64_t version = 0 /* Format version */; DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0); LOAD_ERROR(RAFT_CORRUPT, "load open segment open-1: unexpected format version 0"); return MUNIT_OK; } /* The data directory contains a closed segment and an open segment with a * corrupt format header and no snapshot. */ TEST(load, corruptOpenSegmentClosedSegments, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND(4, 1); APPEND(1, 5); UNFINALIZE(5, 5, 1); /* Corrupt open segment */ uint64_t version = 0 /* Format version */; DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0); LOAD_ERROR(RAFT_CORRUPT, "load open segment open-1: unexpected format version 0"); return MUNIT_OK; } /* The data directory contains a closed segment and an open segment with a * corrupt format header and no snapshot. Auto-recovery is turned on. */ TEST(load, corruptOpenSegmentClosedSegmentsWithAutoRecovery, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND(4, 1); APPEND(1, 5); UNFINALIZE(5, 5, 1); /* Corrupt open segment */ uint64_t version = 0 /* Format version */; DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0); /* load is successful. */ LOAD_WITH_AUTO_RECOVERY(0, /* term */ 0, /* voted for */ NULL, /* snapshot */ 1, /* start index */ 1, /* data for first loaded entry */ 4 /* n entries */ ); /* Open segment has been renamed */ munit_assert_false(DirHasFile(f->dir, "open-1")); return MUNIT_OK; } /* The data directory contains a closed segment and two open segments. * The first open segment has a corrupt header. */ TEST(load, corruptOpenSegmentsClosedSegments, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND(3, 1); APPEND(1, 4); APPEND(1, 5); UNFINALIZE(4, 4, 1); UNFINALIZE(5, 5, 2); /* Corrupt open segment */ uint64_t version = 0 /* Format version */; DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0); LOAD_ERROR(RAFT_CORRUPT, "load open segment open-1: unexpected format version 0"); return MUNIT_OK; } /* The data directory contains a closed segment and two open segments. 
* The first open segment has a corrupt header. Auto-recovery is turned on. */ TEST(load, corruptOpenSegmentsClosedSegmentsWithAutoRecovery, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND(3, 1); APPEND(1, 4); APPEND(1, 5); UNFINALIZE(4, 4, 1); UNFINALIZE(5, 5, 2); /* Corrupt open segment */ uint64_t version = 0 /* Format version */; DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0); LOAD_WITH_AUTO_RECOVERY(0, /* term */ 0, /* voted for */ NULL, /* snapshot */ 1, /* start index */ 1, /* data for first loaded entry */ 3 /* n entries */ ); /* Open segments have been renamed */ munit_assert_false(DirHasFile(f->dir, "open-1")); munit_assert_false(DirHasFile(f->dir, "open-2")); return MUNIT_OK; } /* The data directory contains a closed segment and two open segments. * The second open segment has a corrupt header. */ TEST(load, corruptLastOpenSegmentClosedSegments, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND(3, 1); APPEND(1, 4); APPEND(1, 5); UNFINALIZE(4, 4, 1); UNFINALIZE(5, 5, 2); /* Corrupt open segment */ uint64_t version = 0 /* Format version */; DirOverwriteFile(f->dir, "open-2", &version, sizeof version, 0); LOAD_ERROR(RAFT_CORRUPT, "load open segment open-2: unexpected format version 0"); return MUNIT_OK; } /* The data directory contains a closed segment and two open segments. * The second open segment has a corrupt header. Auto-recovery is turned on. */ TEST(load, corruptLastOpenSegmentClosedSegmentsWithAutoRecovery, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND(3, 1); APPEND(1, 4); APPEND(1, 5); UNFINALIZE(4, 4, 1); UNFINALIZE(5, 5, 2); /* Corrupt open segment */ uint64_t version = 0 /* Format version */; DirOverwriteFile(f->dir, "open-2", &version, sizeof version, 0); LOAD_WITH_AUTO_RECOVERY(0, /* term */ 0, /* voted for */ NULL, /* snapshot */ 1, /* start index */ 1, /* data for first loaded entry */ 4 /* n entries */ ); /* Open segment has been renamed during the first load */ munit_assert_false(DirHasFile(f->dir, "open-2")); return MUNIT_OK; } /* The data directory has several closed segments, all with entries compatible * with the snapshot. */ TEST(load, closedSegmentsOverlappingWithSnapshot, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct snapshot snapshot = { 1, /* term */ 4, /* index */ 1 /* data */ }; APPEND(1, 1); APPEND(2, 2); APPEND(3, 4); SNAPSHOT_PUT(1, 4, 1); LOAD(0, /* term */ 0, /* voted for */ &snapshot, /* snapshot */ 1, /* start index */ 1, /* data for first loaded entry */ 6 /* n entries */ ); return MUNIT_OK; } /* The data directory has several closed segments, the last of which is corrupt. * There is a snapshot. */ TEST(load, closedSegmentsWithSnapshotLastSegmentCorrupt, setUp, tearDown, 0, NULL) { struct fixture *f = data; SNAPSHOT_PUT(1, 4, 1); APPEND(1, 5); APPEND(2, 6); APPEND(2, 8); /* Corrupt the last closed segment */ size_t offset = WORD_SIZE /* Format version */ + WORD_SIZE / 2 /* Header checksum */; uint32_t corrupted = 123456789; DirOverwriteFile(f->dir, CLOSED_SEGMENT_FILENAME(8, 9), &corrupted, sizeof corrupted, offset); LOAD_ERROR(RAFT_CORRUPT, "load closed segment 0000000000000008-0000000000000009: entries " "batch 1 starting at byte 8: data checksum mismatch"); return MUNIT_OK; } /* The data directory has several closed segments, the last of which is corrupt. * There is a snapshot. Auto-recovery is turned on. 
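 *
 * The corruption recipe used by these tests relies on the segment
 * layout: an 8-byte format version word is followed by an 8-byte
 * checksum pair (4 bytes for the batch header, 4 bytes for the batch
 * data), so overwriting 4 bytes at offset 12 clobbers the stored data
 * checksum:
 *
 *   size_t offset = WORD_SIZE       // format version
 *                 + WORD_SIZE / 2;  // skip the header checksum
 *   uint32_t corrupted = 123456789;
 *   DirOverwriteFile(f->dir, CLOSED_SEGMENT_FILENAME(8, 9),
 *                    &corrupted, sizeof corrupted, offset);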
*/ TEST(load, closedSegmentsWithSnapshotLastSegmentCorruptAutoRecovery, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct snapshot snapshot = { 1, /* term */ 4, /* index */ 1 /* data */ }; SNAPSHOT_PUT(1, 4, 1); APPEND(1, 5); APPEND(2, 6); APPEND(2, 8); /* Corrupt the last closed segment */ size_t offset = WORD_SIZE /* Format version */ + WORD_SIZE / 2 /* Header checksum */; uint32_t corrupted = 123456789; DirOverwriteFile(f->dir, CLOSED_SEGMENT_FILENAME(8, 9), &corrupted, sizeof corrupted, offset); LOAD_WITH_AUTO_RECOVERY(0, /* term */ 0, /* voted for */ &snapshot, /* snapshot */ 5, /* start index */ 5, /* data for first loaded entry */ 3 /* n entries */ ); return MUNIT_OK; } /* The data directory has several closed segments, the last of which is corrupt. * There is an open segment and a snapshot. Auto-recovery is turned on. */ TEST(load, closedSegmentsWithSnapshotLastSegmentCorruptOpenSegmentWithAutoRecovery, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct snapshot snapshot = { 1, /* term */ 4, /* index */ 1 /* data */ }; SNAPSHOT_PUT(1, 4, 1); APPEND(1, 5); APPEND(2, 6); APPEND(1, 8); APPEND(1, 9); UNFINALIZE(9, 9, 1); /* Corrupt the last closed segment */ size_t offset = WORD_SIZE /* Format version */ + WORD_SIZE / 2 /* Header checksum */; uint32_t corrupted = 123456789; DirOverwriteFile(f->dir, CLOSED_SEGMENT_FILENAME(8, 8), &corrupted, sizeof corrupted, offset); munit_assert_true(HAS_OPEN_SEGMENT_FILE(1)); LOAD_WITH_AUTO_RECOVERY(0, /* term */ 0, /* voted for */ &snapshot, /* snapshot */ 5, /* start index */ 5, /* data for first loaded entry */ 3 /* n entries */ ); munit_assert_false(HAS_OPEN_SEGMENT_FILE(1)); return MUNIT_OK; } /* The data directory has several closed segments, the last of which is corrupt. * There is an open segment and a snapshot. */ TEST(load, closedSegmentsWithSnapshotLastSegmentCorruptOpenSegment, setUp, tearDown, 0, NULL) { struct fixture *f = data; SNAPSHOT_PUT(1, 4, 1); APPEND(1, 5); APPEND(2, 6); APPEND(1, 8); APPEND(1, 9); UNFINALIZE(9, 9, 1); /* Corrupt the last closed segment */ size_t offset = WORD_SIZE /* Format version */ + WORD_SIZE / 2 /* Header checksum */; uint32_t corrupted = 123456789; DirOverwriteFile(f->dir, CLOSED_SEGMENT_FILENAME(8, 8), &corrupted, sizeof corrupted, offset); munit_assert_true(HAS_OPEN_SEGMENT_FILE(1)); LOAD_ERROR(RAFT_CORRUPT, "load closed segment 0000000000000008-0000000000000008: entries " "batch 1 starting at byte 8: data checksum mismatch"); return MUNIT_OK; } /* The data directory has several closed segments, the second to last one of * which is corrupt. There is a snapshot. */ TEST(load, closedSegmentsWithSnapshotSecondLastSegmentCorrupt, setUp, tearDown, 0, NULL) { struct fixture *f = data; SNAPSHOT_PUT(1, 4, 1); APPEND(1, 5); APPEND(2, 6); APPEND(2, 8); /* Corrupt the second last closed segment */ size_t offset = WORD_SIZE /* Format version */ + WORD_SIZE / 2 /* Header checksum */; uint32_t corrupted = 123456789; DirOverwriteFile(f->dir, CLOSED_SEGMENT_FILENAME(6, 7), &corrupted, sizeof corrupted, offset); LOAD_ERROR(RAFT_CORRUPT, "load closed segment 0000000000000006-0000000000000007: entries " "batch 1 starting at byte 8: data checksum mismatch"); /* Second load still fails. */ LOAD_ERROR_NO_SETUP( RAFT_CORRUPT, "load closed segment 0000000000000006-0000000000000007: entries " "batch 1 starting at byte 8: data checksum mismatch"); return MUNIT_OK; } /* The data directory has several closed segments, some of which have a gap, * which is still compatible with the snapshot. 
*/ TEST(load, nonContiguousClosedSegments, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct snapshot snapshot = { 1, /* term */ 4, /* index */ 1 /* data */ }; APPEND(1, 1); APPEND(2, 2); APPEND(3, 4); SNAPSHOT_PUT(1, 4, 1); DirRemoveFile(f->dir, CLOSED_SEGMENT_FILENAME(2, 3)); LOAD(0, /* term */ 0, /* voted for */ &snapshot, /* snapshot */ 4, /* start index */ 4, /* data for first loaded entry */ 3 /* n entries */ ); return MUNIT_OK; } /* If the data directory has a closed segment whose start index is beyond the * snapshot's last index, an error is returned. */ TEST(load, closedSegmentWithEntriesPastSnapshot, setUp, tearDown, 0, NULL) { struct fixture *f = data; uint64_t now; char errmsg[128]; APPEND(5, 1); APPEND(1, 5); uv_update_time(&f->loop); now = uv_now(&f->loop); sprintf(errmsg, "closed segment 0000000000000006-0000000000000006 is past last " "snapshot snapshot-1-4-%ju", now); SNAPSHOT_PUT(1, 4, 1); DirRemoveFile(f->dir, CLOSED_SEGMENT_FILENAME(1, 5)); LOAD_ERROR(RAFT_CORRUPT, errmsg); return MUNIT_OK; } /* The data directory has an open segment which has incomplete format data. */ TEST(load, openSegmentWithIncompleteFormat, setUp, tearDown, 0, NULL) { struct fixture *f = data; DirWriteFileWithZeros(f->dir, "open-1", WORD_SIZE / 2); LOAD_ERROR(RAFT_IOERR, "load open segment open-1: file has only 4 bytes"); return MUNIT_OK; } /* The data directory has an open segment which has an incomplete batch * preamble. */ TEST(load, openSegmentWithIncompletePreamble, setUp, tearDown, 0, NULL) { struct fixture *f = data; size_t offset = WORD_SIZE /* Format version */ + WORD_SIZE /* Checksums */; APPEND(1, 1); UNFINALIZE(1, 1, 1); DirTruncateFile(f->dir, "open-1", offset); LOAD_ERROR(RAFT_IOERR, "load open segment open-1: entries batch 1 starting at byte 16: " "read preamble: short read: 0 bytes instead of 8"); return MUNIT_OK; } /* The data directory has an open segment which has incomplete batch header. */ TEST(load, openSegmentWithIncompleteBatchHeader, setUp, tearDown, 0, NULL) { struct fixture *f = data; size_t offset = WORD_SIZE + /* Format version */ WORD_SIZE + /* Checksums */ WORD_SIZE + /* Number of entries */ WORD_SIZE /* Partial batch header */; APPEND(1, 1); UNFINALIZE(1, 1, 1); DirTruncateFile(f->dir, "open-1", offset); #ifdef DQLITE_NEXT const char *msg = "load open segment open-1: entries batch 1 starting at byte 8: " "read header: short read: 8 bytes instead of 24"; #else const char *msg = "load open segment open-1: entries batch 1 starting at byte 8: " "read header: short read: 8 bytes instead of 16"; #endif LOAD_ERROR(RAFT_IOERR, msg); return MUNIT_OK; } /* The data directory has an open segment which has incomplete batch data. 
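 *
 * The offset computed below doubles as documentation of the on-disk
 * batch layout: an 8-byte format version, an 8-byte checksum pair, the
 * number of entries, then one header per entry (term, followed by type
 * and data size, plus a local data size word when DQLITE_NEXT is
 * defined), and finally the entry data itself.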
*/ TEST(load, openSegmentWithIncompleteBatchData, setUp, tearDown, 0, NULL) { struct fixture *f = data; size_t offset = WORD_SIZE + /* Format version */ WORD_SIZE + /* Checksums */ WORD_SIZE + /* Number of entries */ WORD_SIZE + /* Entry term */ WORD_SIZE + /* Entry type and data size */ WORD_SIZE / 2 /* Partial entry data */; #ifdef DQLITE_NEXT offset += WORD_SIZE; /* Local data size */ #endif APPEND(1, 1); UNFINALIZE(1, 1, 1); DirTruncateFile(f->dir, "open-1", offset); #ifdef DQLITE_NEXT const char *msg = "load open segment open-1: entries batch 1 starting at byte 8: " "read data: short read: 4 bytes instead of 24"; #else const char *msg = "load open segment open-1: entries batch 1 starting at byte 8: " "read data: short read: 4 bytes instead of 8"; #endif LOAD_ERROR(RAFT_IOERR, msg); return MUNIT_OK; } /* The data directory has a closed segment which has corrupted batch header. */ TEST(load, closedSegmentWithCorruptedBatchHeader, setUp, tearDown, 0, NULL) { struct fixture *f = data; size_t offset = WORD_SIZE /* Format version */; uint64_t corrupted = 12345678; APPEND(1, 1); DirOverwriteFile(f->dir, CLOSED_SEGMENT_FILENAME(1, 1), &corrupted, sizeof corrupted, offset); LOAD_ERROR(RAFT_CORRUPT, "load closed segment 0000000000000001-0000000000000001: entries " "batch 1 starting at byte 8: header checksum mismatch"); return MUNIT_OK; } /* The data directory has a closed segment which has corrupted batch data. */ TEST(load, closedSegmentWithCorruptedBatchData, setUp, tearDown, 0, NULL) { struct fixture *f = data; size_t offset = WORD_SIZE /* Format version */ + WORD_SIZE / 2 /* Header checksum */; uint32_t corrupted = 123456789; APPEND(1, 1); DirOverwriteFile(f->dir, CLOSED_SEGMENT_FILENAME(1, 1), &corrupted, sizeof corrupted, offset); LOAD_ERROR(RAFT_CORRUPT, "load closed segment 0000000000000001-0000000000000001: entries " "batch 1 starting at byte 8: data checksum mismatch"); return MUNIT_OK; } /* The data directory has a closed segment whose first index does not match what * we expect. */ TEST(load, closedSegmentWithBadIndex, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND(1, 1); APPEND(1, 2); DirRemoveFile(f->dir, CLOSED_SEGMENT_FILENAME(1, 1)); LOAD_ERROR(RAFT_CORRUPT, "unexpected closed segment 0000000000000002-0000000000000002: " "first index should have been 1"); return MUNIT_OK; } /* The data directory has an empty closed segment. */ TEST(load, emptyClosedSegment, setUp, tearDown, 0, NULL) { struct fixture *f = data; DirWriteFile(f->dir, CLOSED_SEGMENT_FILENAME(1, 1), NULL, 0); LOAD_ERROR( RAFT_CORRUPT, "load closed segment 0000000000000001-0000000000000001: file is empty"); return MUNIT_OK; } /* The data directory has a closed segment with an unexpected format. */ TEST(load, closedSegmentWithBadFormat, setUp, tearDown, 0, NULL) { struct fixture *f = data; uint8_t buf[8] = {3, 0, 0, 0, 0, 0, 0, 0}; DirWriteFile(f->dir, CLOSED_SEGMENT_FILENAME(1, 1), buf, sizeof buf); LOAD_ERROR(RAFT_CORRUPT, "load closed segment 0000000000000001-0000000000000001: " "unexpected format version 3"); return MUNIT_OK; } /* The data directory has an open segment which is not readable. */ TEST(load, openSegmentWithNoAccessPermission, setUp, tearDown, 0, NULL) { struct fixture *f = data; /* Skip the test when running as root, since EACCES would not be triggered * in that case. */ if (getuid() == 0) { SETUP_UV; /* Setup the uv object since teardown expects it. 
*/ return MUNIT_SKIP; } APPEND(1, 1); UNFINALIZE(1, 1, 1); DirMakeFileUnreadable(f->dir, "open-1"); LOAD_ERROR(RAFT_IOERR, "load open segment open-1: read file: open: permission denied"); return MUNIT_OK; } /* The data directory has an open segment with format set to 0 and non-zero * content. */ TEST(load, openSegmentWithZeroFormatAndThenData, setUp, tearDown, 0, NULL) { struct fixture *f = data; uint64_t version = 0 /* Format version */; APPEND(1, 1); UNFINALIZE(1, 1, 1); DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0); LOAD_ERROR(RAFT_CORRUPT, "load open segment open-1: unexpected format version 0"); return MUNIT_OK; } /* The data directory has an open segment with an unexpected format. */ TEST(load, openSegmentWithBadFormat, setUp, tearDown, 0, NULL) { struct fixture *f = data; uint8_t version[8] = {3, 0, 0, 0, 0, 0, 0, 0}; APPEND(1, 1); UNFINALIZE(1, 1, 1); DirOverwriteFile(f->dir, "open-1", version, sizeof version, 0); LOAD_ERROR(RAFT_CORRUPT, "load open segment open-1: unexpected format version 3"); return MUNIT_OK; } dqlite-1.16.7/test/raft/integration/test_uv_recover.c000066400000000000000000000042741465252713400227300ustar00rootroot00000000000000#include "../lib/runner.h" #include "../lib/uv.h" /****************************************************************************** * * Fixture * *****************************************************************************/ struct fixture { FIXTURE_UV_DEPS; FIXTURE_UV; }; static void *setUp(const MunitParameter params[], void *user_data) { struct fixture *f = munit_malloc(sizeof *f); SETUP_UV_DEPS; SETUP_UV; return f; } static void tearDown(void *data) { struct fixture *f = data; TEAR_DOWN_UV; TEAR_DOWN_UV_DEPS; free(f); } /****************************************************************************** * * raft_io->recover() * *****************************************************************************/ SUITE(recover) /* Invoke recover and assert that it fails with the given error. */ #define RECOVER_ERROR(RV, CONF) \ { \ int rv_; \ rv_ = f->io.recover(&f->io, CONF); \ munit_assert_int(rv_, ==, RV); \ } /* Invoke recover and assert that it succeeds */ #define RECOVER(CONF) RECOVER_ERROR(0, CONF) /* If the instance has been already initialized, an error is returned. */ /* A new configuration is saved as last entry on disk. */ TEST(recover, newConfiguration, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_configuration configuration1; struct raft_configuration configuration2; int rv; /* Bootstrap using an initial configuration */ raft_configuration_init(&configuration1); rv = raft_configuration_add(&configuration1, 1, "1", RAFT_VOTER); munit_assert_int(rv, ==, 0); rv = raft_configuration_add(&configuration1, 2, "2", RAFT_VOTER); munit_assert_int(rv, ==, 0); rv = f->io.bootstrap(&f->io, &configuration1); munit_assert_int(rv, ==, 0); /* Bootstrap using a different configuration */ raft_configuration_init(&configuration2); rv = raft_configuration_add(&configuration2, 1, "1", RAFT_VOTER); munit_assert_int(rv, ==, 0); RECOVER(&configuration2); raft_configuration_close(&configuration1); raft_configuration_close(&configuration2); return 0; } dqlite-1.16.7/test/raft/integration/test_uv_recv.c000066400000000000000000000422451465252713400222220ustar00rootroot00000000000000#include "../lib/runner.h" #include "../lib/tcp.h" #include "../lib/uv.h" /****************************************************************************** * * Fixture with a libuv-based raft_io instance. 
* *****************************************************************************/ struct peer { struct uv_loop_s loop; struct raft_uv_transport transport; struct raft_io io; }; struct fixture { FIXTURE_UV_DEPS; FIXTURE_TCP; FIXTURE_UV; struct peer peer; bool closed; }; /****************************************************************************** * * Helper macros * *****************************************************************************/ struct result { struct raft_message *message; bool done; }; static void recvCb(struct raft_io *io, struct raft_message *m1) { struct result *result = io->data; struct raft_message *m2 = result->message; unsigned i; munit_assert_int(m1->type, ==, m2->type); switch (m1->type) { case RAFT_IO_REQUEST_VOTE: munit_assert_int(m1->request_vote.term, ==, m2->request_vote.term); munit_assert_int(m1->request_vote.candidate_id, ==, m2->request_vote.candidate_id); munit_assert_int(m1->request_vote.last_log_index, ==, m2->request_vote.last_log_index); munit_assert_int(m1->request_vote.last_log_term, ==, m2->request_vote.last_log_term); munit_assert_int(m1->request_vote.disrupt_leader, ==, m2->request_vote.disrupt_leader); break; case RAFT_IO_REQUEST_VOTE_RESULT: munit_assert_int(m1->request_vote_result.term, ==, m2->request_vote_result.term); munit_assert_int(m1->request_vote_result.vote_granted, ==, m2->request_vote_result.vote_granted); break; case RAFT_IO_APPEND_ENTRIES: munit_assert_int(m1->append_entries.n_entries, ==, m2->append_entries.n_entries); for (i = 0; i < m1->append_entries.n_entries; i++) { struct raft_entry *entry1 = &m1->append_entries.entries[i]; struct raft_entry *entry2 = &m2->append_entries.entries[i]; munit_assert_int(entry1->term, ==, entry2->term); munit_assert_int(entry1->type, ==, entry2->type); munit_assert_int(entry1->buf.len, ==, entry2->buf.len); munit_assert_int( memcmp(entry1->buf.base, entry2->buf.base, entry1->buf.len), ==, 0); } if (m1->append_entries.n_entries > 0) { raft_free(m1->append_entries.entries[0].batch); raft_free(m1->append_entries.entries); } break; case RAFT_IO_APPEND_ENTRIES_RESULT: munit_assert_int(m1->append_entries_result.term, ==, m2->append_entries_result.term); munit_assert_int(m1->append_entries_result.rejected, ==, m2->append_entries_result.rejected); munit_assert_int(m1->append_entries_result.last_log_index, ==, m2->append_entries_result.last_log_index); break; case RAFT_IO_INSTALL_SNAPSHOT: munit_assert_int(m1->install_snapshot.conf.n, ==, m2->install_snapshot.conf.n); for (i = 0; i < m1->install_snapshot.conf.n; i++) { struct raft_server *s1 = &m1->install_snapshot.conf.servers[i]; struct raft_server *s2 = &m2->install_snapshot.conf.servers[i]; munit_assert_int(s1->id, ==, s2->id); munit_assert_string_equal(s1->address, s2->address); munit_assert_int(s1->role, ==, s2->role); } munit_assert_int(m1->install_snapshot.data.len, ==, m2->install_snapshot.data.len); munit_assert_int(memcmp(m1->install_snapshot.data.base, m2->install_snapshot.data.base, m2->install_snapshot.data.len), ==, 0); raft_configuration_close(&m1->install_snapshot.conf); raft_free(m1->install_snapshot.data.base); break; case RAFT_IO_TIMEOUT_NOW: munit_assert_int(m1->timeout_now.term, ==, m2->timeout_now.term); munit_assert_int(m1->timeout_now.last_log_index, ==, m2->timeout_now.last_log_index); munit_assert_int(m1->timeout_now.last_log_term, ==, m2->timeout_now.last_log_term); break; }; result->done = true; } static void peerSendCb(struct raft_io_send *req, int status) { bool *done = req->data; munit_assert_int(status, ==, 0); *done = 
true;
}

static void peerCloseCb(struct raft_io *io)
{
        bool *done = io->data;
        *done = true;
}

/* Set up the fixture's peer raft_io instance. */
#define PEER_SETUP                                                         \
        do {                                                               \
                struct uv_loop_s *_loop = &f->peer.loop;                   \
                struct raft_uv_transport *_transport = &f->peer.transport; \
                struct raft_io *_io = &f->peer.io;                         \
                int _rv;                                                   \
                _rv = uv_loop_init(_loop);                                 \
                munit_assert_int(_rv, ==, 0);                              \
                _transport->version = 1;                                   \
                _rv = raft_uv_tcp_init(_transport, _loop);                 \
                munit_assert_int(_rv, ==, 0);                              \
                _rv = raft_uv_init(_io, _loop, f->dir, _transport);        \
                munit_assert_int(_rv, ==, 0);                              \
                _rv = _io->init(_io, 2, "127.0.0.1:9002");                 \
                munit_assert_int(_rv, ==, 0);                              \
        } while (0)

/* Tear down the fixture's peer raft_io instance. */
#define PEER_TEAR_DOWN                                                     \
        do {                                                               \
                struct uv_loop_s *_loop = &f->peer.loop;                   \
                struct raft_uv_transport *_transport = &f->peer.transport; \
                struct raft_io *_io = &f->peer.io;                         \
                bool _done = false;                                        \
                int _i;                                                    \
                _done = false;                                             \
                _io->data = &_done;                                        \
                _io->close(_io, peerCloseCb);                              \
                for (_i = 0; _i < 10; _i++) {                              \
                        if (_done) {                                       \
                                break;                                     \
                        }                                                  \
                        uv_run(_loop, UV_RUN_ONCE);                        \
                }                                                          \
                uv_run(_loop, UV_RUN_DEFAULT);                             \
                munit_assert_true(_done);                                  \
                raft_uv_close(_io);                                        \
                raft_uv_tcp_close(_transport);                             \
                uv_loop_close(_loop);                                      \
        } while (0)

/* Send a message to the main fixture's raft_io instance using the fixture's
 * peer instance. */
#define PEER_SEND(MESSAGE)                                                 \
        do {                                                               \
                struct uv_loop_s *_loop = &f->peer.loop;                   \
                struct raft_io *_io = &f->peer.io;                         \
                struct raft_io_send _req;                                  \
                bool _done = false;                                        \
                int _i;                                                    \
                int _rv;                                                   \
                (MESSAGE)->server_id = 1;                                  \
                (MESSAGE)->server_address = "127.0.0.1:9001";              \
                _req.data = &_done;                                        \
                _rv = _io->send(_io, &_req, MESSAGE, peerSendCb);          \
                munit_assert_int(_rv, ==, 0);                              \
                for (_i = 0; _i < 10; _i++) {                              \
                        if (_done) {                                       \
                                break;                                     \
                        }                                                  \
                        uv_run(_loop, UV_RUN_ONCE);                        \
                }                                                          \
                munit_assert_true(_done);                                  \
        } while (0)

/* Establish a connection and send a handshake using plain TCP. */
#define PEER_HANDSHAKE                                                     \
        do {                                                               \
                uint8_t _handshake[] = {                                   \
                    1, 0, 0, 0, 0, 0, 0, 0, /* Protocol */                 \
                    1, 0, 0, 0, 0, 0, 0, 0, /* Server ID */                \
                    2, 0, 0, 0, 0, 0, 0, 0, /* Address length, in words */ \
                    0, 0, 0, 0, 0, 0, 0, 0, /* First address word */       \
                    0, 0, 0, 0, 0, 0, 0, 0  /* Second address word */      \
                };                                                         \
                sprintf((char *)&_handshake[24], "127.0.0.1:666");         \
                TCP_CLIENT_CONNECT(9001);                                  \
                TCP_CLIENT_SEND(_handshake, sizeof _handshake);            \
        } while (0);

/* Run the loop until a new message is received. Assert that the received
 * message matches the given one. */
#define RECV(MESSAGE)                                                      \
        do {                                                               \
                struct result _result = {MESSAGE, false};                  \
                f->io.data = &_result;                                     \
                LOOP_RUN_UNTIL(&_result.done);                             \
                f->io.data = NULL;                                         \
        } while (0)
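/* A minimal sketch (not used by the suite) of the wire layout that
 * PEER_HANDSHAKE above hard-codes as raw bytes: three little-endian 64-bit
 * words -- protocol version, server ID, and address length in words --
 * followed by the zero-padded address. It assumes <string.h> and <stdint.h>
 * are available through the test headers, and that buf has room for
 * 24 + n_words * 8 bytes. */
static inline void sketchEncodeHandshake(uint8_t *buf,
                                         uint64_t protocol,
                                         uint64_t id,
                                         const char *address)
{
        size_t n = strlen(address) + 1;   /* Include the terminator. */
        size_t n_words = (n + 7) / 8;     /* Round up to 8-byte words. */
        uint64_t words[] = {protocol, id, n_words};
        unsigned i, j;
        for (i = 0; i < 3; i++) {
                for (j = 0; j < 8; j++) {
                        buf[i * 8 + j] = (uint8_t)(words[i] >> (8 * j));
                }
        }
        memset(buf + 24, 0, n_words * 8); /* Zero the padding bytes. */
        memcpy(buf + 24, address, n);
}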
/******************************************************************************
 *
 * Set up and tear down.
 *
 *****************************************************************************/

static void *setUpDeps(const MunitParameter params[], void *user_data)
{
        struct fixture *f = munit_malloc(sizeof *f);
        SETUP_UV_DEPS;
        SETUP_TCP;
        PEER_SETUP;
        f->io.data = f;
        f->closed = false;
        return f;
}

static void tearDownDeps(void *data)
{
        struct fixture *f = data;
        PEER_TEAR_DOWN;
        TEAR_DOWN_TCP;
        TEAR_DOWN_UV_DEPS;
        free(f);
}

static void *setUp(const MunitParameter params[], void *user_data)
{
        struct fixture *f = setUpDeps(params, user_data);
        int rv;
        SETUP_UV;
        f->io.data = f;
        rv = f->io.start(&f->io, 10000, NULL, recvCb);
        munit_assert_int(rv, ==, 0);
        return f;
}

static void tearDown(void *data)
{
        struct fixture *f = data;
        TEAR_DOWN_UV;
        tearDownDeps(f);
}

/******************************************************************************
 *
 * raft_io_recv_cb
 *
 *****************************************************************************/

SUITE(recv)

/* Receive the very first message over the connection. */
TEST(recv, first, setUp, tearDown, 0, NULL)
{
        struct fixture *f = data;
        struct raft_message message;
        message.type = RAFT_IO_REQUEST_VOTE;
        message.request_vote.candidate_id = 2;
        message.request_vote.last_log_index = 123;
        message.request_vote.last_log_term = 2;
        message.request_vote.disrupt_leader = false;
        PEER_SEND(&message);
        RECV(&message);
        return MUNIT_OK;
}

/* Receive a first message, then another one. */
TEST(recv, second, setUp, tearDown, 0, NULL)
{
        struct fixture *f = data;
        struct raft_message message;
        message.type = RAFT_IO_REQUEST_VOTE;
        message.request_vote.candidate_id = 2;
        message.request_vote.last_log_index = 123;
        message.request_vote.last_log_term = 2;
        message.request_vote.disrupt_leader = true;
        PEER_SEND(&message);
        RECV(&message);
        PEER_SEND(&message);
        RECV(&message);
        return MUNIT_OK;
}

/* Receive a RequestVote result message. */
TEST(recv, requestVoteResult, setUp, tearDown, 0, NULL)
{
        struct fixture *f = data;
        struct raft_message message;
        message.type = RAFT_IO_REQUEST_VOTE_RESULT;
        message.request_vote_result.term = 3;
        message.request_vote_result.vote_granted = true;
        message.request_vote_result.pre_vote = false;
        PEER_SEND(&message);
        RECV(&message);
        return MUNIT_OK;
}

/* Receive an AppendEntries message with two entries. */
TEST(recv, appendEntries, setUp, tearDown, 0, NULL)
{
        struct fixture *f = data;
        struct raft_entry entries[2];
        struct raft_message message;
        uint8_t data1[8] = {1, 2, 3, 4, 5, 6, 7, 8};
        uint8_t data2[8] = {8, 7, 6, 5, 4, 3, 2, 1};

        entries[0].type = RAFT_COMMAND;
        entries[0].buf.base = data1;
        entries[0].buf.len = sizeof data1;

        entries[1].type = RAFT_COMMAND;
        entries[1].buf.base = data2;
        entries[1].buf.len = sizeof data2;

        message.type = RAFT_IO_APPEND_ENTRIES;
        message.append_entries.entries = entries;
        message.append_entries.n_entries = 2;

        PEER_SEND(&message);
        RECV(&message);

        return MUNIT_OK;
}

/* Receive an AppendEntries message with no entries (i.e. a heartbeat). */
TEST(recv, heartbeat, setUp, tearDown, 0, NULL)
{
        struct fixture *f = data;
        struct raft_message message;
        message.type = RAFT_IO_APPEND_ENTRIES;
        message.append_entries.entries = NULL;
        message.append_entries.n_entries = 0;
        PEER_SEND(&message);
        RECV(&message);
        return MUNIT_OK;
}
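/* A small sketch (not referenced by the suite) of the ownership convention
 * recvCb above follows for received AppendEntries messages: all entry
 * payloads arrive in a single "batch" allocation, so the receiver frees the
 * batch once, through any entry's batch pointer, plus the entries array
 * itself, instead of freeing each buffer individually. */
static inline void sketchReleaseEntries(struct raft_entry *entries, unsigned n)
{
        if (n > 0) {
                raft_free(entries[0].batch); /* Shared payload allocation. */
                raft_free(entries);          /* The array of descriptors. */
        }
}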
/* Receive an AppendEntries result message. */
TEST(recv, appendEntriesResult, setUp, tearDown, 0, NULL)
{
        struct fixture *f = data;
        struct raft_message message;
        message.type = RAFT_IO_APPEND_ENTRIES_RESULT;
        message.append_entries_result.term = 3;
        message.append_entries_result.rejected = 0;
        message.append_entries_result.last_log_index = 123;
        PEER_SEND(&message);
        RECV(&message);
        return MUNIT_OK;
}

/* Receive an InstallSnapshot message. */
TEST(recv, installSnapshot, setUp, tearDown, 0, NULL)
{
        struct fixture *f = data;
        struct raft_message message;
        uint8_t snapshot_data[8] = {1, 2, 3, 4, 5, 6, 7, 8};
        int rv;

        message.type = RAFT_IO_INSTALL_SNAPSHOT;

        message.install_snapshot.term = 2;
        message.install_snapshot.last_index = 123;
        message.install_snapshot.last_term = 1;

        raft_configuration_init(&message.install_snapshot.conf);
        rv = raft_configuration_add(&message.install_snapshot.conf, 1, "1",
                                    RAFT_VOTER);
        munit_assert_int(rv, ==, 0);

        message.install_snapshot.data.len = sizeof snapshot_data;
        message.install_snapshot.data.base = snapshot_data;

        PEER_SEND(&message);
        RECV(&message);

        raft_configuration_close(&message.install_snapshot.conf);

        return MUNIT_OK;
}

/* Receive a TimeoutNow message. */
TEST(recv, timeoutNow, setUp, tearDown, 0, NULL)
{
        struct fixture *f = data;
        struct raft_message message;
        message.type = RAFT_IO_TIMEOUT_NOW;
        message.timeout_now.term = 3;
        message.timeout_now.last_log_index = 123;
        message.timeout_now.last_log_term = 2;
        PEER_SEND(&message);
        RECV(&message);
        return MUNIT_OK;
}

/* The handshake fails because of an unexpected protocol version. */
TEST(recv, badProtocol, setUp, tearDown, 0, NULL)
{
        struct fixture *f = data;
        uint8_t handshake[] = {
            6, 6, 6, 0, 0, 0, 0, 0, /* Protocol */
            1, 0, 0, 0, 0, 0, 0, 0, /* Server ID */
            2, 0, 0, 0, 0, 0, 0, 0  /* Address length */
        };
        TCP_CLIENT_CONNECT(9001);
        TCP_CLIENT_SEND(handshake, sizeof handshake);
        LOOP_RUN(2);
        return MUNIT_OK;
}

/* A message can't have zero length. */
TEST(recv, badSize, setUp, tearDown, 0, NULL)
{
        struct fixture *f = data;
        uint8_t header[] = {
            1, 0, 0, 0, 0, 0, 0, 0, /* Message type */
            0, 0, 0, 0, 0, 0, 0, 0  /* Message size */
        };
        PEER_HANDSHAKE;
        TCP_CLIENT_SEND(header, sizeof header);
        LOOP_RUN(2);
        return MUNIT_OK;
}

/* A message with a bad type causes the connection to be aborted. */
TEST(recv, badType, setUp, tearDown, 0, NULL)
{
        struct fixture *f = data;
        uint8_t header[] = {
            1, 2, 3, 4, 5, 6, 7, 8, /* Message type */
            0, 0, 0, 0, 0, 0, 0, 0  /* Message size */
        };
        PEER_HANDSHAKE;
        TCP_CLIENT_SEND(header, sizeof header);
        LOOP_RUN(2);
        return MUNIT_OK;
}

/* The backend is closed just before accepting a new connection. */
TEST(recv, closeBeforeAccept, setUp, tearDownDeps, 0, NULL)
{
        struct fixture *f = data;
        uint8_t header[] = {
            1, 2, 3, 4, 5, 6, 7, 8, /* Message type */
            0, 0, 0, 0, 0, 0, 0, 0  /* Message size */
        };
        PEER_HANDSHAKE;
        TCP_CLIENT_SEND(header, sizeof header);
        LOOP_RUN(1);
        TEAR_DOWN_UV;
        return MUNIT_OK;
}

/* The backend is closed after receiving the header of an AppendEntries
 * message. */
TEST(recv, closeAfterAppendEntriesHeader, setUp, tearDown, 0, NULL)
{
        /* TODO */
        return MUNIT_SKIP;
}
dqlite-1.16.7/test/raft/integration/test_uv_send.c000066400000000000000000000301731465252713400222110ustar00rootroot00000000000000#include <unistd.h>

#include "../lib/runner.h"
#include "../lib/tcp.h"
#include "../lib/uv.h"

/******************************************************************************
 *
 * Fixture with a libuv-based raft_io instance and some pre-set messages.
* *****************************************************************************/ #define N_MESSAGES 5 struct fixture { FIXTURE_UV_DEPS; FIXTURE_TCP_SERVER; FIXTURE_UV; struct raft_message messages[N_MESSAGES]; }; /****************************************************************************** * * Helper macros * *****************************************************************************/ struct result { int status; bool done; }; static void sendCbAssertResult(struct raft_io_send *req, int status) { struct result *result = req->data; munit_assert_int(status, ==, result->status); result->done = true; } /* Get I'th fixture's message. */ #define MESSAGE(I) (&f->messages[I]) /* Submit a send request for the I'th fixture's message. */ #define SEND_SUBMIT(I, RV, STATUS) \ struct raft_io_send _req##I; \ struct result _result##I = {STATUS, false}; \ int _rv##I; \ _req##I.data = &_result##I; \ _rv##I = \ f->io.send(&f->io, &_req##I, &f->messages[I], sendCbAssertResult); \ munit_assert_int(_rv##I, ==, RV) /* Wait for the submit request of the I'th message to finish. */ #define SEND_WAIT(I) LOOP_RUN_UNTIL(&_result##I.done) /* Submit a send request for the I'th fixture's message and wait for the * operation to successfully complete. */ #define SEND(I) \ do { \ SEND_SUBMIT(I, 0 /* rv */, 0 /* status */); \ SEND_WAIT(I); \ } while (0) /* Submit a send request and assert that it fails synchronously with the * given error code and message. */ #define SEND_ERROR(I, RV, ERRMSG) \ do { \ SEND_SUBMIT(I, RV, 0 /* status */); \ /* munit_assert_string_equal(f->transport.errmsg, ERRMSG);*/ \ } while (0) /* Submit a send request and wait for the operation to fail with the given code * and message. */ #define SEND_FAILURE(I, STATUS, ERRMSG) \ do { \ SEND_SUBMIT(I, 0 /* rv */, STATUS); \ SEND_WAIT(I); \ /*munit_assert_string_equal(f->transport.errmsg, ERRMSG);*/ \ } while (0) /****************************************************************************** * * Set up and tear down. * *****************************************************************************/ static void *setUpDeps(const MunitParameter params[], void *user_data) { struct fixture *f = munit_malloc(sizeof *f); SETUP_UV_DEPS; SETUP_TCP_SERVER; f->io.data = f; return f; } static void tearDownDeps(void *data) { struct fixture *f = data; TEAR_DOWN_TCP_SERVER; TEAR_DOWN_UV_DEPS; free(f); } static void *setUp(const MunitParameter params[], void *user_data) { struct fixture *f = setUpDeps(params, user_data); unsigned i; SETUP_UV; raft_uv_set_connect_retry_delay(&f->io, 1); for (i = 0; i < N_MESSAGES; i++) { struct raft_message *message = &f->messages[i]; message->type = RAFT_IO_REQUEST_VOTE; message->server_id = 1; message->server_address = f->server.address; } return f; } static void tearDown(void *data) { struct fixture *f = data; TEAR_DOWN_UV; tearDownDeps(f); } /****************************************************************************** * * raft_io->send() * *****************************************************************************/ SUITE(send) /* The first time a request is sent to a server a connection attempt is * triggered. If the connection succeeds the request gets written out. */ TEST(send, first, setUp, tearDown, 0, NULL) { struct fixture *f = data; SEND(0); return MUNIT_OK; } /* The second time a request is sent it re-uses the connection that was already * established */ TEST(send, second, setUp, tearDown, 0, NULL) { struct fixture *f = data; SEND(0); SEND(0); return MUNIT_OK; } /* Submit a few send requests in parallel. 
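 * (SEND_SUBMIT is hand-expanded in the sketch below.) */

/* For illustration only: roughly what SEND_SUBMIT(0, 0, 0) from the helper
 * macros above expands to. The ##I token pasting gives every in-flight
 * request its own raft_io_send object and result tracker, which is what
 * lets the test below keep two requests pending at once. (Hand-expanded
 * sketch, kept out of the build since it only makes sense inside a test
 * function body.) */
#if 0
struct raft_io_send _req0;
struct result _result0 = {0 /* status */, false /* done */};
int _rv0;
_req0.data = &_result0;
_rv0 = f->io.send(&f->io, &_req0, &f->messages[0], sendCbAssertResult);
munit_assert_int(_rv0, ==, 0);
#endif

/* Submit a few send requests in parallel.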
*/ TEST(send, parallel, setUp, tearDown, 0, NULL) { struct fixture *f = data; SEND_SUBMIT(0 /* message */, 0 /* rv */, 0 /* status */); SEND_SUBMIT(1 /* message */, 0 /* rv */, 0 /* status */); SEND_WAIT(0); SEND_WAIT(1); return MUNIT_OK; } /* Send a request vote result message. */ TEST(send, voteResult, setUp, tearDown, 0, NULL) { struct fixture *f = data; MESSAGE(0)->type = RAFT_IO_REQUEST_VOTE_RESULT; SEND(0); return MUNIT_OK; } /* Send an append entries message. */ TEST(send, appendEntries, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_entry entries[2]; entries[0].buf.base = raft_malloc(16); entries[0].buf.len = 16; entries[1].buf.base = raft_malloc(8); entries[1].buf.len = 8; MESSAGE(0)->type = RAFT_IO_APPEND_ENTRIES; MESSAGE(0)->append_entries.entries = entries; MESSAGE(0)->append_entries.n_entries = 2; SEND(0); raft_free(entries[0].buf.base); raft_free(entries[1].buf.base); return MUNIT_OK; } /* Send an append entries message with zero entries (i.e. a heartbeat). */ TEST(send, heartbeat, setUp, tearDown, 0, NULL) { struct fixture *f = data; MESSAGE(0)->type = RAFT_IO_APPEND_ENTRIES; MESSAGE(0)->append_entries.entries = NULL; MESSAGE(0)->append_entries.n_entries = 0; SEND(0); return MUNIT_OK; } /* Send an append entries result message. */ TEST(send, appendEntriesResult, setUp, tearDown, 0, NULL) { struct fixture *f = data; MESSAGE(0)->type = RAFT_IO_APPEND_ENTRIES_RESULT; SEND(0); return MUNIT_OK; } /* Send an install snapshot message. */ TEST(send, installSnapshot, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_install_snapshot *p = &MESSAGE(0)->install_snapshot; int rv; MESSAGE(0)->type = RAFT_IO_INSTALL_SNAPSHOT; raft_configuration_init(&p->conf); rv = raft_configuration_add(&p->conf, 1, "1", RAFT_VOTER); munit_assert_int(rv, ==, 0); p->data.len = 8; p->data.base = raft_malloc(p->data.len); SEND(0); raft_configuration_close(&p->conf); raft_free(p->data.base); return MUNIT_OK; } /* A connection attempt fails asynchronously after the connect function * returns. */ TEST(send, noConnection, setUp, tearDownDeps, 0, NULL) { struct fixture *f = data; MESSAGE(0)->server_address = "127.0.0.1:123456"; SEND_SUBMIT(0 /* message */, 0 /* rv */, RAFT_CANCELED /* status */); TEAR_DOWN_UV; return MUNIT_OK; } /* The message has an invalid IPv4 address. */ TEST(send, badAddress, setUp, tearDownDeps, 0, NULL) { struct fixture *f = data; MESSAGE(0)->server_address = "1"; SEND_SUBMIT(0 /* message */, 0 /* rv */, RAFT_CANCELED /* status */); TEAR_DOWN_UV; return MUNIT_OK; } /* Make sure UvSend doesn't use a stale connection for a certain server id * by first sending a message to a valid address and then sending a message to * an invalid address, making sure the valid connection is not reused. * Afterwards assert that a send to the correct address still succeeds. */ TEST(send, changeToUnconnectedAddress, setUp, tearDownDeps, 0, NULL) { struct fixture *f = data; /* Send a message to a server and a connected address */ SEND(0); /* Send a message to the same server, but update the address to an * unconnected address and assert it fails. 
         */
        munit_assert_ullong(MESSAGE(0)->server_id, ==, MESSAGE(1)->server_id);
        MESSAGE(1)->server_address = "127.0.0.2:1";
        SEND_SUBMIT(1 /* message */, 0 /* rv */, RAFT_CANCELED /* status */);

        /* Send another message to the same server and connected address */
        munit_assert_ullong(MESSAGE(0)->server_id, ==, MESSAGE(2)->server_id);
        SEND(2);

        /* Send another message to the same server and connected address */
        munit_assert_ullong(MESSAGE(0)->server_id, ==, MESSAGE(3)->server_id);
        SEND(3);

        TEAR_DOWN_UV;
        return MUNIT_OK;
}

/* The message has an invalid type. */
TEST(send, badMessage, setUp, tearDown, 0, NULL)
{
        struct fixture *f = data;
        MESSAGE(0)->type = 666;
        SEND_ERROR(0, RAFT_MALFORMED, "");
        return MUNIT_OK;
}

/* Old send requests that have accumulated and could not yet be sent are
 * progressively evicted. */
TEST(send, evictOldPending, setUp, tearDownDeps, 0, NULL)
{
        struct fixture *f = data;
        TCP_SERVER_STOP;
        SEND_SUBMIT(0 /* message */, 0 /* rv */, RAFT_NOCONNECTION /* status */);
        SEND_SUBMIT(1 /* message */, 0 /* rv */, RAFT_CANCELED /* status */);
        SEND_SUBMIT(2 /* message */, 0 /* rv */, RAFT_CANCELED /* status */);
        SEND_SUBMIT(3 /* message */, 0 /* rv */, RAFT_CANCELED /* status */);
        SEND_WAIT(0);
        TEAR_DOWN_UV;
        return MUNIT_OK;
}

/* After the connection is established the peer dies and then comes back a
 * little bit later. */
TEST(send, reconnectAfterWriteError, setUp, tearDown, 0, NULL)
{
        struct fixture *f = data;
        int socket;
        SEND(0);
        socket = TcpServerAccept(&f->server);
        close(socket);
        SEND_FAILURE(0, RAFT_IOERR, "");
        SEND(0);
        return MUNIT_OK;
}

/* After the connection is established the peer dies and then comes back a
 * little bit later. At the time the peer died there were several writes
 * pending. */
TEST(send, reconnectAfterMultipleWriteErrors, setUp, tearDown, 0, NULL)
{
        struct fixture *f = data;
        int socket;
        signal(SIGPIPE, SIG_IGN);
        SEND(0);
        socket = TcpServerAccept(&f->server);
        close(socket);
        SEND_SUBMIT(1 /* message */, 0 /* rv */, RAFT_IOERR /* status */);
        SEND_SUBMIT(2 /* message */, 0 /* rv */, RAFT_IOERR /* status */);
        SEND_WAIT(1);
        SEND_WAIT(2);
        SEND(3);
        return MUNIT_OK;
}

static char *oomHeapFaultDelay[] = {"0", "1", "2", "3", "4", NULL};
static char *oomHeapFaultRepeat[] = {"1", NULL};

static MunitParameterEnum oomParams[] = {
    {TEST_HEAP_FAULT_DELAY, oomHeapFaultDelay},
    {TEST_HEAP_FAULT_REPEAT, oomHeapFaultRepeat},
    {NULL, NULL},
};

/* Out of memory conditions. */
TEST(send, oom, setUp, tearDown, 0, oomParams)
{
        struct fixture *f = data;
        HEAP_FAULT_ENABLE;
        SEND_ERROR(0, RAFT_NOMEM, "");
        return MUNIT_OK;
}

static char *oomAsyncHeapFaultDelay[] = {"2", NULL};
static char *oomAsyncHeapFaultRepeat[] = {"1", NULL};

static MunitParameterEnum oomAsyncParams[] = {
    {TEST_HEAP_FAULT_DELAY, oomAsyncHeapFaultDelay},
    {TEST_HEAP_FAULT_REPEAT, oomAsyncHeapFaultRepeat},
    {NULL, NULL},
};

/* Transient out of memory error happening after send() has returned. */
TEST(send, oomAsync, setUp, tearDown, 0, oomAsyncParams)
{
        struct fixture *f = data;
        SEND(0);
        return MUNIT_OK;
}

/* The backend gets closed while there is a pending write. */
TEST(send, closeDuringWrite, setUp, tearDownDeps, 0, NULL)
{
        struct fixture *f = data;
        struct raft_entry entry;

        /* Set a very large message that is likely to fill the socket buffer.
         * TODO: figure out a more deterministic way to choose the value.
*/ entry.buf.len = 1024 * 1024 * 8; entry.buf.base = raft_malloc(entry.buf.len); MESSAGE(0)->type = RAFT_IO_APPEND_ENTRIES; MESSAGE(0)->append_entries.entries = &entry; MESSAGE(0)->append_entries.n_entries = 1; SEND_SUBMIT(0 /* message */, 0 /* rv */, RAFT_CANCELED /* status */); TEAR_DOWN_UV; raft_free(entry.buf.base); return MUNIT_OK; } /* The backend gets closed while there is a pending connect request. */ TEST(send, closeDuringConnection, setUp, tearDownDeps, 0, NULL) { struct fixture *f = data; SEND_SUBMIT(0 /* message */, 0 /* rv */, RAFT_CANCELED /* status */); TEAR_DOWN_UV; return MUNIT_OK; } dqlite-1.16.7/test/raft/integration/test_uv_set_term.c000066400000000000000000000220231465252713400230750ustar00rootroot00000000000000#include "../../../src/raft.h" #include "../../../src/raft/byte.h" #include "../../../src/raft/uv_encoding.h" #include "../lib/runner.h" #include "../lib/uv.h" /****************************************************************************** * * Fixture with a libuv-based raft_io instance. * *****************************************************************************/ struct fixture { FIXTURE_UV_DEPS; FIXTURE_UV; bool closed; }; /****************************************************************************** * * Helper macros * *****************************************************************************/ static void closeCb(struct raft_io *io) { struct fixture *f = io->data; f->closed = true; } /* Invoke raft_uv_init() and assert that no error occurs. */ #define INIT \ do { \ int _rv; \ _rv = raft_uv_init(&f->io, &f->loop, f->dir, &f->transport); \ munit_assert_int(_rv, ==, 0); \ _rv = f->io.init(&f->io, 1, "1"); \ munit_assert_int(_rv, ==, 0); \ } while (0) /* Invoke raft_io->close(). */ #define CLOSE \ do { \ f->io.close(&f->io, closeCb); \ LOOP_RUN_UNTIL(&f->closed); \ raft_uv_close(&f->io); \ } while (0) /* Invoke f->io->set_term() and assert that no error occurs. */ #define SET_TERM(TERM) \ do { \ int _rv; \ _rv = f->io.set_term(&f->io, TERM); \ munit_assert_int(_rv, ==, 0); \ } while (0) /* Invoke f->io->set_term() and assert that the given error code is returned and * the given error message set. */ #define SET_TERM_ERROR(TERM, RV, ERRMSG) \ do { \ int _rv; \ _rv = f->io.set_term(&f->io, TERM); \ munit_assert_int(_rv, ==, RV); \ munit_assert_string_equal(f->io.errmsg_(&f->io), ERRMSG); \ } while (0) /* Write either the metadata1 or metadata2 file, filling it with the given * values. */ #define WRITE_METADATA_FILE(N, FORMAT, VERSION, TERM, VOTED_FOR) \ { \ uint8_t buf[8 * 4]; \ void *cursor = buf; \ char filename[strlen("metadataN") + 1]; \ sprintf(filename, "metadata%d", N); \ bytePut64(&cursor, FORMAT); \ bytePut64(&cursor, VERSION); \ bytePut64(&cursor, TERM); \ bytePut64(&cursor, VOTED_FOR); \ DirWriteFile(f->dir, filename, buf, sizeof buf); \ } /* Assert that the content of either the metadata1 or metadata2 file match the * given values. */ #define ASSERT_METADATA_FILE(N, VERSION, TERM, VOTED_FOR) \ { \ uint8_t buf2[8 * 4]; \ const void *cursor = buf2; \ char filename[strlen("metadataN") + 1]; \ sprintf(filename, "metadata%d", N); \ DirReadFile(f->dir, filename, buf2, sizeof buf2); \ munit_assert_int(byteGet64(&cursor), ==, UV__DISK_FORMAT); \ munit_assert_int(byteGet64(&cursor), ==, VERSION); \ munit_assert_int(byteGet64(&cursor), ==, TERM); \ munit_assert_int(byteGet64(&cursor), ==, VOTED_FOR); \ } /****************************************************************************** * * Set up and tear down. 
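 *
 *****************************************************************************/

/* A sketch (not used by the suite) of reading back the four little-endian
 * words that WRITE_METADATA_FILE and ASSERT_METADATA_FILE above deal with,
 * using the same byteGet64 cursor helper the assertions rely on. */
struct sketchMetadata {
        uint64_t format;
        uint64_t version;
        uint64_t term;
        uint64_t voted_for;
};

static inline void sketchDecodeMetadata(const uint8_t buf[8 * 4],
                                        struct sketchMetadata *m)
{
        const void *cursor = buf;
        m->format = byteGet64(&cursor);
        m->version = byteGet64(&cursor);
        m->term = byteGet64(&cursor);
        m->voted_for = byteGet64(&cursor);
}

/******************************************************************************
 *
 * Set up and tear down.
 *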
 *****************************************************************************/

static void *setUpDeps(const MunitParameter params[], void *user_data)
{
        struct fixture *f = munit_malloc(sizeof *f);
        SETUP_UV_DEPS;
        f->io.data = f;
        f->closed = false;
        return f;
}

static void *setUp(const MunitParameter params[], void *user_data)
{
        struct fixture *f = setUpDeps(params, user_data);
        INIT;
        return f;
}

static void tearDown(void *data)
{
        struct fixture *f = data;
        CLOSE;
        TEAR_DOWN_UV_DEPS;
        free(f);
}

/******************************************************************************
 *
 * raft_io->set_term()
 *
 *****************************************************************************/

SUITE(set_term)

/* The very first time set_term() is called, the metadata1 file gets written. */
TEST(set_term, first, setUp, tearDown, 0, NULL)
{
        struct fixture *f = data;
        SET_TERM(1);
        ASSERT_METADATA_FILE(1, 1, 1, 0);
        munit_assert_false(DirHasFile(f->dir, "metadata2"));
        return MUNIT_OK;
}

/* The second time set_term() is called, the metadata2 file gets written. */
TEST(set_term, second, setUp, tearDown, 0, NULL)
{
        struct fixture *f = data;
        SET_TERM(1);
        SET_TERM(2);
        ASSERT_METADATA_FILE(1, 1, 1, 0);
        ASSERT_METADATA_FILE(2, 2, 2, 0);
        return MUNIT_OK;
}

/* The third time set_term() is called, the metadata1 file gets overwritten. */
TEST(set_term, third, setUp, tearDown, 0, NULL)
{
        struct fixture *f = data;
        SET_TERM(1);
        SET_TERM(2);
        SET_TERM(3);
        ASSERT_METADATA_FILE(1, 3, 3, 0);
        ASSERT_METADATA_FILE(2, 2, 2, 0);
        return MUNIT_OK;
}

/* The fourth time set_term() is called, the metadata2 file gets overwritten. */
TEST(set_term, fourth, setUp, tearDown, 0, NULL)
{
        struct fixture *f = data;
        SET_TERM(1);
        SET_TERM(2);
        SET_TERM(3);
        SET_TERM(4);
        ASSERT_METADATA_FILE(1, 3, 3, 0);
        ASSERT_METADATA_FILE(2, 4, 4, 0);
        return MUNIT_OK;
}

/* If the data directory has a single metadata1 file, the first time set_term()
 * is called, the second metadata file gets created. */
TEST(set_term, metadataOneExists, setUpDeps, tearDown, 0, NULL)
{
        struct fixture *f = data;
        WRITE_METADATA_FILE(1,               /* Metadata file index */
                            UV__DISK_FORMAT, /* Format */
                            1,               /* Version */
                            1,               /* Term */
                            0 /* Voted for */);
        INIT;
        SET_TERM(2);
        ASSERT_METADATA_FILE(1, 1, 1, 0);
        ASSERT_METADATA_FILE(2, 2, 2, 0);
        return MUNIT_OK;
}

/* The data directory has both metadata files, but metadata1 is greater. */
TEST(set_term, metadataOneIsGreater, setUpDeps, tearDown, 0, NULL)
{
        struct fixture *f = data;
        WRITE_METADATA_FILE(1,               /* Metadata file index */
                            UV__DISK_FORMAT, /* Format */
                            3,               /* Version */
                            3,               /* Term */
                            0 /* Voted for */);
        WRITE_METADATA_FILE(2,               /* Metadata file index */
                            UV__DISK_FORMAT, /* Format */
                            2,               /* Version */
                            2,               /* Term */
                            0 /* Voted for */);
        INIT;
        SET_TERM(4);
        ASSERT_METADATA_FILE(1 /* n */, 3 /* version */, 3 /* term */,
                             0 /* voted for */);
        ASSERT_METADATA_FILE(2 /* n */, 4 /* version */, 4 /* term */,
                             0 /* voted for */);
        return MUNIT_OK;
}
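/* A sketch of the alternation that the tests above pin down: every
 * set_term() call bumps the version and writes the metadata slot matching
 * the new version's parity, so the other file always survives intact.
 * Illustrative only, inferred from the assertions in this suite. */
static inline int sketchMetadataSlot(uint64_t version)
{
        return version % 2 == 1 ? 1 : 2; /* metadata1 for odd versions */
}

/* The data directory has both metadata files, but metadata2 is greater.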
 */
TEST(set_term, metadataTwoIsGreater, setUpDeps, tearDown, 0, NULL)
{
        struct fixture *f = data;
        WRITE_METADATA_FILE(1,               /* Metadata file index */
                            UV__DISK_FORMAT, /* Format */
                            1,               /* Version */
                            1,               /* Term */
                            0 /* Voted for */);
        WRITE_METADATA_FILE(2,               /* Metadata file index */
                            UV__DISK_FORMAT, /* Format */
                            2,               /* Version */
                            2,               /* Term */
                            0 /* Voted for */);
        INIT;
        SET_TERM(2);
        ASSERT_METADATA_FILE(1 /* n */, 3 /* version */, 2 /* term */,
                             0 /* voted for */);
        ASSERT_METADATA_FILE(2 /* n */, 2 /* version */, 2 /* term */,
                             0 /* voted for */);
        return MUNIT_OK;
}
dqlite-1.16.7/test/raft/integration/test_uv_snapshot_put.c000066400000000000000000000235421465252713400240110ustar00rootroot00000000000000#include <unistd.h>

#include "../lib/runner.h"
#include "../lib/tcp.h"
#include "../lib/uv.h"
#include "append_helpers.h"

/******************************************************************************
 *
 * Fixture with a libuv-based raft_io instance.
 *
 *****************************************************************************/

struct fixture {
        FIXTURE_UV_DEPS;
        FIXTURE_UV;
        bool closed;
        int count;
};

/******************************************************************************
 *
 * Helper macros
 *
 *****************************************************************************/

struct snapshot {
        raft_term term;
        raft_index index;
        uint64_t data;
        bool done;
};

static void snapshotPutCbAssertResult(struct raft_io_snapshot_put *req,
                                      int status)
{
        struct result *result = req->data;
        munit_assert_int(status, ==, result->status);
        result->done = true;
}

static void snapshotGetCbAssertResult(struct raft_io_snapshot_get *req,
                                      struct raft_snapshot *snapshot,
                                      int status)
{
        struct snapshot *expect = req->data;
        munit_assert_int(status, ==, 0);
        munit_assert_ptr_not_null(snapshot);
        munit_assert_int(snapshot->term, ==, expect->term);
        munit_assert_int(snapshot->index, ==, expect->index);
        expect->done = true;
        raft_configuration_close(&snapshot->configuration);
        raft_free(snapshot->bufs[0].base);
        raft_free(snapshot->bufs);
        raft_free(snapshot);
}

/* Submit a request to truncate the log at N */
#define TRUNCATE(N)                                  \
        {                                            \
                int _rv;                             \
                _rv = f->io.truncate(&f->io, N);     \
                munit_assert_int(_rv, ==, 0);        \
        }

#define SNAPSHOT_PUT_REQ(TRAILING, INDEX, RV, STATUS)                       \
        struct raft_snapshot _snapshot;                                     \
        struct raft_buffer _snapshot_buf;                                   \
        uint64_t _snapshot_data;                                            \
        struct raft_io_snapshot_put _req;                                   \
        struct result _result = {STATUS, false, NULL};                      \
        int _rv;                                                            \
        _snapshot.term = 1;                                                 \
        _snapshot.index = INDEX;                                            \
        raft_configuration_init(&_snapshot.configuration);                  \
        _rv = raft_configuration_add(&_snapshot.configuration, 1, "1",      \
                                     RAFT_STANDBY);                         \
        munit_assert_int(_rv, ==, 0);                                       \
        _snapshot.bufs = &_snapshot_buf;                                    \
        _snapshot.n_bufs = 1;                                               \
        _snapshot_buf.base = &_snapshot_data;                               \
        _snapshot_buf.len = sizeof _snapshot_data;                          \
        _req.data = &_result;                                               \
        _rv = f->io.snapshot_put(&f->io, TRAILING, &_req, &_snapshot,       \
                                 snapshotPutCbAssertResult);                \
        munit_assert_int(_rv, ==, RV)

/* Submit a snapshot put request for the given snapshot and wait for the
 * operation to successfully complete. */
#define SNAPSHOT_PUT(TRAILING, INDEX)                                       \
        do {                                                                \
                SNAPSHOT_PUT_REQ(TRAILING, INDEX, 0 /* rv */,               \
                                 0 /* status */);                           \
                LOOP_RUN_UNTIL(&_result.done);                              \
                raft_configuration_close(&_snapshot.configuration);         \
        } while (0)
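/* A sketch (illustrative only) of the retention rule that the
 * entriesLessThanTrailing/entriesMoreThanTrailing tests below pin down: a
 * closed segment may be deleted only once every entry in it falls behind
 * the snapshot index minus the trailing amount. The exact boundary is
 * inferred from the assertions, not from the implementation. */
static inline bool sketchSegmentSurvives(uint64_t last_index,
                                         uint64_t snapshot_index,
                                         uint64_t trailing)
{
        if (snapshot_index < trailing) {
                return true; /* Nothing is old enough to delete yet. */
        }
        return last_index >= snapshot_index - trailing;
}

/* Submit a snapshot put request and assert that it fails synchronously with
 * the given error code and message.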
*/ #define SNAPSHOT_PUT_ERROR(SNAPSHOT, TRAILING, RV, ERRMSG) \ do { \ SNAPSHOT_PUT_REQ(SNAPSHOT, TRAILING, RV, 0 /* status */); \ /* munit_assert_string_equal(f->transport.errmsg, ERRMSG);*/ \ } while (0) /* Submit a snapshot put request and wait for the operation to fail with the * given code and message. */ #define SNAPSHOT_PUT_FAILURE(STATUS, ERRMSG) \ do { \ SNAPSHOT_PUT_REQ(0 /* rv */, STATUS); \ LOOP_RUN_UNTIL(&_result.done); \ /*munit_assert_string_equal(f->transport.errmsg, ERRMSG);*/ \ } while (0) /* Use raft_io->snapshot_get to load the last snapshot and compare it with the * given parameters. */ #define ASSERT_SNAPSHOT(TERM, INDEX, DATA) \ do { \ struct raft_io_snapshot_get _req; \ struct snapshot _expect = {TERM, INDEX, DATA, false}; \ int _rv; \ _req.data = &_expect; \ _rv = f->io.snapshot_get(&f->io, &_req, snapshotGetCbAssertResult); \ munit_assert_int(_rv, ==, 0); \ LOOP_RUN_UNTIL(&_expect.done); \ } while (0) /****************************************************************************** * * Set up and tear down. * *****************************************************************************/ static void *setUpDeps(const MunitParameter params[], void *user_data) { struct fixture *f = munit_malloc(sizeof *f); SETUP_UV_DEPS; f->io.data = f; f->closed = false; return f; } static void tearDownDeps(void *data) { struct fixture *f = data; TEAR_DOWN_UV_DEPS; free(f); } static void *setUp(const MunitParameter params[], void *user_data) { struct fixture *f = setUpDeps(params, user_data); SETUP_UV; return f; } static void tearDown(void *data) { struct fixture *f = data; TEAR_DOWN_UV; tearDownDeps(f); } /****************************************************************************** * * raft_io->snapshot_put * *****************************************************************************/ SUITE(snapshot_put) /* Put the first snapshot. */ TEST(snapshot_put, first, setUp, tearDown, 0, NULL) { struct fixture *f = data; SNAPSHOT_PUT(10, /* trailing */ 1 /* index */ ); ASSERT_SNAPSHOT(1, 1, 1); return MUNIT_OK; } /* If the number of closed entries is less than the given trailing amount, no * segment is deleted. */ TEST(snapshot_put, entriesLessThanTrailing, setUp, tearDown, 0, NULL) { struct fixture *f = data; unsigned i; raft_uv_set_segment_size( &f->io, 4096); /* Lower the number of block to force finalizing */ for (i = 0; i < 40; i++) { APPEND(10, 8); } SNAPSHOT_PUT(128, /* trailing */ 100 /* index */ ); munit_assert_true(DirHasFile(f->dir, "0000000000000001-0000000000000150")); munit_assert_true(DirHasFile(f->dir, "0000000000000151-0000000000000300")); return MUNIT_OK; } /* If the number of closed entries is greater than the given trailing amount, * closed segments that are fully past the trailing amount get deleted. */ TEST(snapshot_put, entriesMoreThanTrailing, setUp, tearDown, 0, NULL) { struct fixture *f = data; unsigned i; raft_uv_set_segment_size( &f->io, 4096); /* Lower the number of block to force finalizing */ for (i = 0; i < 40; i++) { APPEND(10, 8); } SNAPSHOT_PUT(128, /* trailing */ 280 /* index */ ); munit_assert_false(DirHasFile(f->dir, "0000000000000001-0000000000000150")); munit_assert_true(DirHasFile(f->dir, "0000000000000151-0000000000000300")); return MUNIT_OK; } /* Request to install a snapshot. */ TEST(snapshot_put, install, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND(4, 8); SNAPSHOT_PUT(0, /* trailing */ 1 /* index */ ); return MUNIT_OK; } /* Request to install a snapshot without compression. 
*/ TEST(snapshot_put, installNoCompression, setUp, tearDown, 0, NULL) { struct fixture *f = data; raft_uv_set_snapshot_compression(&f->io, false); APPEND(4, 8); SNAPSHOT_PUT(0, /* trailing */ 1 /* index */ ); return MUNIT_OK; } /* Request to install a snapshot, no previous entry is present. */ TEST(snapshot_put, installWithoutPreviousEntries, setUp, tearDown, 0, NULL) { struct fixture *f = data; SNAPSHOT_PUT(0, /* trailing */ 1 /* index */ ); return MUNIT_OK; } /* Request to install a couple of snapshots in a row, no previous entry is * present. */ TEST(snapshot_put, installMultipleWithoutPreviousEntries, setUp, tearDown, 0, NULL) { struct fixture *f = data; SNAPSHOT_PUT(0, /* trailing */ 1 /* index */ ); SNAPSHOT_PUT(0, /* trailing */ 3 /* index */ ); SNAPSHOT_PUT(0, /* trailing */ 1337 /* index */ ); return MUNIT_OK; } /* Request to install a couple of snapshots in a row, AppendEntries Requests * happen before, meanwhile and after */ TEST(snapshot_put, installMultipleAppendEntriesInBetween, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND_SUBMIT(0, 256, 8); APPEND_SUBMIT(1, 256, 8); SNAPSHOT_PUT(0, /* trailing */ 1 /* index */ ); APPEND_WAIT(0); APPEND_WAIT(1); APPEND_SUBMIT(2, 256, 8); APPEND_SUBMIT(3, 256, 8); SNAPSHOT_PUT(0, /* trailing */ 100 /* index */ ); APPEND_WAIT(2); APPEND_WAIT(3); APPEND_SUBMIT(4, 256, 8); APPEND_SUBMIT(5, 256, 8); APPEND_WAIT(4); APPEND_WAIT(5); return MUNIT_OK; } dqlite-1.16.7/test/raft/integration/test_uv_tcp_connect.c000066400000000000000000000264101465252713400235560ustar00rootroot00000000000000#include "../../../src/raft.h" #include "../../../src/raft.h" #include "../lib/addrinfo.h" #include "../lib/heap.h" #include "../lib/loop.h" #include "../lib/runner.h" #include "../lib/tcp.h" /****************************************************************************** * * Fixture with a TCP-based raft_uv_transport. * *****************************************************************************/ struct fixture { FIXTURE_HEAP; FIXTURE_LOOP; FIXTURE_TCP_SERVER; struct raft_uv_transport transport; bool closed; }; /****************************************************************************** * * Helper macros * *****************************************************************************/ struct result { int status; bool done; }; static void closeCb(struct raft_uv_transport *transport) { struct fixture *f = transport->data; f->closed = true; } static void connectCbAssertResult(struct raft_uv_connect *req, struct uv_stream_s *stream, int status) { struct result *result = req->data; munit_assert_int(status, ==, result->status); if (status == 0) { uv_close((struct uv_handle_s *)stream, (uv_close_cb)raft_free); } result->done = true; } #define INIT \ do { \ int _rv; \ _rv = f->transport.init(&f->transport, 1, "127.0.0.1:9000"); \ munit_assert_int(_rv, ==, 0); \ f->transport.data = f; \ f->closed = false; \ } while (0) #define CLOSE_SUBMIT \ munit_assert_false(f->closed); \ f->transport.close(&f->transport, closeCb); #define CLOSE_WAIT LOOP_RUN_UNTIL(&f->closed) #define CLOSE \ CLOSE_SUBMIT; \ CLOSE_WAIT #define CONNECT_REQ(ID, ADDRESS, RV, STATUS) \ struct raft_uv_connect _req; \ struct result _result = {STATUS, false}; \ int _rv; \ _req.data = &_result; \ _rv = f->transport.connect(&f->transport, &_req, ID, ADDRESS, \ connectCbAssertResult); \ munit_assert_int(_rv, ==, RV) /* Try to submit a connect request and assert that the given error code and * message are returned. 
 */
#define CONNECT_ERROR(ID, ADDRESS, RV, ERRMSG)                              \
        {                                                                   \
                CONNECT_REQ(ID, ADDRESS, RV /* rv */, 0 /* status */);      \
                munit_assert_string_equal(f->transport.errmsg, ERRMSG);     \
        }

/* Submit a connect request with the given parameters and wait for the
 * operation to successfully complete. */
#define CONNECT(ID, ADDRESS)                                                \
        {                                                                   \
                CONNECT_REQ(ID, ADDRESS, 0 /* rv */, 0 /* status */);       \
                LOOP_RUN_UNTIL(&_result.done);                              \
        }

/* Submit a connect request with the given parameters and wait for the
 * operation to fail with the given code and message. */
#define CONNECT_FAILURE(ID, ADDRESS, STATUS, ERRMSG)                        \
        {                                                                   \
                CONNECT_REQ(ID, ADDRESS, 0 /* rv */, STATUS);               \
                LOOP_RUN_UNTIL(&_result.done);                              \
                munit_assert_string_equal(f->transport.errmsg, ERRMSG);     \
        }

/* Submit a connect request with the given parameters, close the transport
 * after N loop iterations and assert that the request got canceled. */
#define CONNECT_CLOSE(ID, ADDRESS, N)                                       \
        {                                                                   \
                CONNECT_REQ(ID, ADDRESS, 0 /* rv */, RAFT_CANCELED);        \
                LOOP_RUN(N);                                                \
                CLOSE_SUBMIT;                                               \
                munit_assert_false(_result.done);                           \
                LOOP_RUN_UNTIL(&_result.done);                              \
                CLOSE_WAIT;                                                 \
        }

/******************************************************************************
 *
 * Set up and tear down.
 *
 *****************************************************************************/

static void *setUpDeps(const MunitParameter params[],
                       MUNIT_UNUSED void *user_data)
{
        struct fixture *f = munit_malloc(sizeof *f);
        int rv;
        SET_UP_ADDRINFO;
        SET_UP_HEAP;
        SETUP_LOOP;
        SETUP_TCP_SERVER;
        f->transport.version = 1;
        rv = raft_uv_tcp_init(&f->transport, &f->loop);
        munit_assert_int(rv, ==, 0);
        return f;
}

static void tearDownDeps(void *data)
{
        struct fixture *f = data;
        LOOP_STOP;
        raft_uv_tcp_close(&f->transport);
        TEAR_DOWN_TCP_SERVER;
        TEAR_DOWN_LOOP;
        TEAR_DOWN_HEAP;
        TEAR_DOWN_ADDRINFO;
        free(f);
}

static void *setUp(const MunitParameter params[], void *user_data)
{
        struct fixture *f = setUpDeps(params, user_data);
        INIT;
        return f;
}

static void tearDown(void *data)
{
        struct fixture *f = data;
        CLOSE;
        tearDownDeps(f);
}

/******************************************************************************
 *
 * raft_uv_transport->connect()
 *
 *****************************************************************************/

#define BOGUS_ADDRESS "127.0.0.1:6666"
#define INVALID_ADDRESS "500.0.0.1:6666"

SUITE(tcp_connect)

/* Successfully connect to the peer by IP */
TEST(tcp_connect, first, setUp, tearDown, 0, NULL)
{
        struct fixture *f = data;
        CONNECT(2, TCP_SERVER_ADDRESS);
        return MUNIT_OK;
}

/* Successfully connect to the peer by hostname */
TEST(tcp_connect, connectByName, setUp, tearDown, 0, NULL)
{
        struct fixture *f = data;
        char host_address[256];
        sprintf(host_address, "localhost:%d", TCP_SERVER_PORT);
        CONNECT(2, host_address);
        return MUNIT_OK;
}

/* Successfully connect to the peer by first IP */
TEST(tcp_connect, firstIP, setUp, tearDown, 0, NULL)
{
        struct fixture *f = data;
        const struct AddrinfoResult results[] = {{"127.0.0.1", TCP_SERVER_PORT},
                                                 {"192.0.2.0", 6666}};
        AddrinfoInjectSetResponse(0, 2, results);
        CONNECT(2, "any-host");
        return MUNIT_OK;
}

/* Successfully connect to the peer by second IP */
TEST(tcp_connect, secondIP, setUp, tearDown, 0, NULL)
{
        struct fixture *f = data;
        const struct AddrinfoResult results[] = {
            {"127.0.0.1", 6666}, {"127.0.0.1", TCP_SERVER_PORT}};
        AddrinfoInjectSetResponse(0, 2, results);
        CONNECT(2, "any-host");
        return MUNIT_OK;
}

/* The peer has shutdown */
TEST(tcp_connect, refused, setUp, tearDown, 0, NULL)
{
        struct fixture *f = data;
        TCP_SERVER_STOP;
        CONNECT_FAILURE(2, BOGUS_ADDRESS, RAFT_NOCONNECTION,
"uv_tcp_connect(): connection refused"); return MUNIT_OK; } static char *oomHeapFaultDelay[] = {"0", "1", "2", NULL}; static char *oomHeapFaultRepeat[] = {"1", NULL}; static MunitParameterEnum oomParams[] = { {TEST_HEAP_FAULT_DELAY, oomHeapFaultDelay}, {TEST_HEAP_FAULT_REPEAT, oomHeapFaultRepeat}, {NULL, NULL}, }; /* Out of memory conditions. */ TEST(tcp_connect, oom, setUp, tearDown, 0, oomParams) { struct fixture *f = data; HEAP_FAULT_ENABLE; CONNECT_ERROR(2, BOGUS_ADDRESS, RAFT_NOMEM, "out of memory"); return MUNIT_OK; } /* The transport is closed immediately after a connect request as been * submitted. The request's callback is invoked with RAFT_CANCELED. */ TEST(tcp_connect, closeImmediately, setUp, tearDownDeps, 0, NULL) { struct fixture *f = data; CONNECT_CLOSE(2, TCP_SERVER_ADDRESS, 0); return MUNIT_OK; } /* The transport gets closed during the dns lookup */ TEST(tcp_connect, closeDuringDnsLookup, setUp, tearDownDeps, 0, NULL) { struct fixture *f = data; CONNECT_CLOSE(2, TCP_SERVER_ADDRESS, 1); return MUNIT_OK; } /* The transport gets closed during the handshake. */ TEST(tcp_connect, closeDuringHandshake, setUp, tearDownDeps, 0, NULL) { struct fixture *f = data; /* This test fails for libuv version >= 1.44.2 due to changes in uv_run * whereby queueing and processing the write_cb happen in the same loop * iteration, not leaving us a chance to close without going through a lot * of hoops. * https://github.com/libuv/libuv/pull/3598 */ unsigned incompatible_uv = (1 << 16) | (44 << 8) | 2; if (uv_version() >= incompatible_uv) { CLOSE; return MUNIT_SKIP; } CONNECT_CLOSE(2, TCP_SERVER_ADDRESS, 2); return MUNIT_OK; } static void checkCb(struct uv_check_s *check) { struct fixture *f = check->data; CLOSE_SUBMIT; uv_close((struct uv_handle_s *)check, NULL); } /* The transport gets closed right after a dns lookup failure, while the * connection attempt is being aborted. */ TEST(tcp_connect, closeDuringDnsLookupAbort, setUp, tearDownDeps, 0, NULL) { struct fixture *f = data; struct uv_check_s check; int rv; /* Use a check handle in order to close the transport in the same loop * iteration where the dns failure lookup occurs */ rv = uv_check_init(&f->loop, &check); munit_assert_int(rv, ==, 0); check.data = f; uv_check_start(&check, checkCb); CONNECT_REQ(2, INVALID_ADDRESS, 0, RAFT_NOCONNECTION); LOOP_RUN(1); LOOP_RUN_UNTIL(&_result.done); CLOSE_WAIT; return MUNIT_OK; } /* The transport gets closed right after a connection failure, while the * connection attempt is being aborted. */ TEST(tcp_connect, closeDuringConnectAbort, setUp, tearDownDeps, 0, NULL) { struct fixture *f = data; struct uv_check_s check; int rv; /* Use a check handle in order to close the transport in the same loop * iteration where the connection failure occurs. */ rv = uv_check_init(&f->loop, &check); munit_assert_int(rv, ==, 0); check.data = f; CONNECT_REQ(2, BOGUS_ADDRESS, 0, RAFT_NOCONNECTION); /* Successfull DNS lookup will initiate async connect */ LOOP_RUN(1); uv_check_start(&check, checkCb); LOOP_RUN(1); LOOP_RUN_UNTIL(&_result.done); CLOSE_WAIT; return MUNIT_OK; } /* The transport gets closed right after the first connection attempt failed, * while doing a second connection attempt. 
/* The transport gets closed right after the first connection attempt failed,
 * while doing a second connection attempt. */
TEST(tcp_connect, closeDuringSecondConnect, setUp, tearDownDeps, 0, NULL)
{
        struct fixture *f = data;
        struct uv_check_s check;
        int rv;
        const struct AddrinfoResult results[] = {
            {"127.0.0.1", 6666}, {"127.0.0.1", TCP_SERVER_PORT}};
        AddrinfoInjectSetResponse(0, 2, results);

        /* Use a check handle in order to close the transport in the same
         * loop iteration where the second connection attempt occurs. */
        rv = uv_check_init(&f->loop, &check);
        munit_assert_int(rv, ==, 0);
        check.data = f;
        CONNECT_REQ(2, "any-host", 0, RAFT_CANCELED);
        /* Successful DNS lookup will initiate async connect */
        LOOP_RUN(1);
        uv_check_start(&check, checkCb);
        LOOP_RUN(1);
        LOOP_RUN_UNTIL(&_result.done);
        CLOSE_WAIT;
        return MUNIT_OK;
}
dqlite-1.16.7/test/raft/integration/test_uv_tcp_listen.c000066400000000000000000000314531465252713400234260ustar00rootroot00000000000000#include "../../../src/raft.h"
#include "../../../src/raft/byte.h"

#include "../lib/addrinfo.h"
#include "../lib/heap.h"
#include "../lib/loop.h"
#include "../lib/runner.h"
#include "../lib/tcp.h"

/******************************************************************************
 *
 * Fixture with a TCP-based raft_uv_transport.
 *
 *****************************************************************************/

struct fixture {
        FIXTURE_HEAP;
        FIXTURE_LOOP;
        FIXTURE_TCP;
        struct raft_uv_transport transport;
        bool accepted;
        bool closed;
        struct {
                uint8_t buf[sizeof(uint64_t) +   /* Protocol version */
                            sizeof(uint64_t) +   /* Server ID */
                            sizeof(uint64_t) +   /* Length of address */
                            sizeof(uint64_t) * 2 /* Address */];
                size_t offset;
        } handshake;
};

/******************************************************************************
 *
 * Helper macros
 *
 *****************************************************************************/

#define PEER_ID 2
#define PEER_ADDRESS "127.0.0.1:666"

static void closeCb(struct raft_uv_transport *transport)
{
        struct fixture *f = transport->data;
        f->closed = true;
}

static void acceptCb(struct raft_uv_transport *t,
                     raft_id id,
                     const char *address,
                     struct uv_stream_s *stream)
{
        struct fixture *f = t->data;
        munit_assert_int(id, ==, PEER_ID);
        munit_assert_string_equal(address, PEER_ADDRESS);
        f->accepted = true;
        uv_close((struct uv_handle_s *)stream, (uv_close_cb)raft_free);
}

#define INIT                                                                   \
        do {                                                                   \
                int _rv;                                                       \
                f->transport.version = 1;                                      \
                _rv = raft_uv_tcp_init(&f->transport, &f->loop);               \
                munit_assert_int(_rv, ==, 0);                                  \
                const char *bind_addr =                                        \
                    munit_parameters_get(params, "bind-address");              \
                if (bind_addr && strlen(bind_addr)) {                          \
                        _rv = raft_uv_tcp_set_bind_address(&f->transport,      \
                                                           bind_addr);         \
                        munit_assert_int(_rv, ==, 0);                          \
                }                                                              \
                const char *address =                                          \
                    munit_parameters_get(params, "address");                   \
                if (!address) {                                                \
                        address = "127.0.0.1:9000";                            \
                }                                                              \
                _rv = f->transport.init(&f->transport, 1, address);            \
                munit_assert_int(_rv, ==, 0);                                  \
                f->transport.data = f;                                         \
                f->closed = false;                                             \
        } while (0)

#define CLOSE                                                                  \
        do {                                                                   \
                f->transport.close(&f->transport, closeCb);                    \
                LOOP_RUN_UNTIL(&f->closed);                                    \
                raft_uv_tcp_close(&f->transport);                              \
        } while (0)
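/* A sketch (hypothetical addresses, not part of the suite) of the split the
 * INIT macro above supports: the transport can advertise one address to
 * peers while binding a different local one -- for instance a wildcard --
 * via raft_uv_tcp_set_bind_address(). */
static inline int sketchInitTransport(struct raft_uv_transport *transport,
                                      struct uv_loop_s *loop)
{
        int rv;
        transport->version = 1;
        rv = raft_uv_tcp_init(transport, loop);
        if (rv != 0) {
                return rv;
        }
        /* Listen on all interfaces... */
        rv = raft_uv_tcp_set_bind_address(transport, "0.0.0.0:9000");
        if (rv != 0) {
                return rv;
        }
        /* ...but advertise a concrete address to other servers. */
        return transport->init(transport, 1, "127.0.0.1:9000");
}

/******************************************************************************
 *
 * Set up and tear down.
 *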
* *****************************************************************************/ static void *setUpDeps(const MunitParameter params[], MUNIT_UNUSED void *user_data) { struct fixture *f = munit_malloc(sizeof *f); SET_UP_ADDRINFO; SET_UP_HEAP; SETUP_LOOP; SETUP_TCP; return f; } static void tearDownDeps(void *data) { struct fixture *f = data; TEAR_DOWN_TCP; TEAR_DOWN_LOOP; TEAR_DOWN_HEAP; TEAR_DOWN_ADDRINFO; free(f); } static void *setUp(const MunitParameter params[], void *user_data) { struct fixture *f = setUpDeps(params, user_data); void *cursor; /* test_tcp_listen(&f->tcp); */ INIT; f->accepted = false; f->handshake.offset = 0; cursor = f->handshake.buf; bytePut64(&cursor, 1); bytePut64(&cursor, PEER_ID); bytePut64(&cursor, 16); strcpy(cursor, PEER_ADDRESS); return f; } static void tearDown(void *data) { struct fixture *f = data; CLOSE; tearDownDeps(f); } /****************************************************************************** * * Helper macros * *****************************************************************************/ #define LISTEN(EXPECTED_RV) \ do { \ int rv; \ rv = f->transport.listen(&f->transport, acceptCb); \ munit_assert_int(rv, ==, EXPECTED_RV); \ } while (false) /* Connect to the listening socket of the transport, creating a new connection * that is waiting to be accepted. */ #define PEER_CONNECT TCP_CLIENT_CONNECT(9000) /* Make the peer close the connection. */ #define PEER_CLOSE TCP_CLIENT_CLOSE /* Make the connected client send handshake data. */ #define PEER_HANDSHAKE \ do { \ size_t n = sizeof f->handshake.buf; \ TCP_CLIENT_SEND(f->handshake.buf, n); \ } while (0) /* Make the connected client send partial handshake data: only N bytes will be * sent, starting from the offset of the last call. */ #define PEER_HANDSHAKE_PARTIAL(N) \ do { \ TCP_CLIENT_SEND(f->handshake.buf + f->handshake.offset, N); \ } while (0) /* After a PEER_CONNECT() call, spin the event loop until the connected * callback of the listening TCP handle gets called. */ #define LOOP_RUN_UNTIL_CONNECTED LOOP_RUN(1); /* After a PEER_HANDSHAKE_PARTIAL() call, spin the event loop until the read * callback gets called. */ #define LOOP_RUN_UNTIL_READ LOOP_RUN(1); /* Spin the event loop until the accept callback gets eventually invoked. */ #define ACCEPT LOOP_RUN_UNTIL(&f->accepted); /****************************************************************************** * * Success scenarios. * *****************************************************************************/ SUITE(tcp_listen) /* Parameters for listen address */ static char *validAddresses[] = {"127.0.0.1:9000", "localhost:9000", NULL}; static char *validBindAddresses[] = { "", "127.0.0.1:9000", "localhost:9000", ":9000", "0.0.0.0:9000", NULL}; static MunitParameterEnum validListenParams[] = { {"address", validAddresses}, {"bind-address", validBindAddresses}, {NULL, NULL}, }; /* If the handshake is successful, the accept callback is invoked. 
 */
TEST(tcp_listen, success, setUp, tearDown, 0, validListenParams)
{
        struct fixture *f = data;
        LISTEN(0);
        PEER_CONNECT;
        PEER_HANDSHAKE;
        ACCEPT;
        return MUNIT_OK;
}

/* Parameters for invalid listen addresses */
static char *invalidAddresses[] = {"500.1.2.3:9000", "not-existing:9000",
                                   "192.0.2.0:9000", NULL};

static char *invalidBindAddresses[] = {
    "", "500.1.2.3:9000", "not-existing:9000", "192.0.2.0:9000", NULL};

static MunitParameterEnum invalidTcpListenParams[] = {
    {"address", invalidAddresses},
    {"bind-address", invalidBindAddresses},
    {NULL, NULL},
};

/* Check error on invalid hostname specified */
TEST(tcp_listen, invalidAddress, setUp, tearDown, 0, invalidTcpListenParams)
{
        struct fixture *f = data;
        LISTEN(RAFT_IOERR);
        return MUNIT_OK;
}

/* Check success when addrinfo resolves to multiple IPs and the first one is
 * used to connect */
TEST(tcp_listen, firstOfTwo, setUp, tearDown, 0, NULL)
{
        const struct AddrinfoResult results[] = {{"127.0.0.1", 9000},
                                                 {"127.0.0.2", 9000}};
        struct fixture *f = data;
        AddrinfoInjectSetResponse(0, 2, results);
        LISTEN(0);
        PEER_CONNECT;
        PEER_HANDSHAKE;
        ACCEPT;
        return MUNIT_OK;
}

/* Check success when addrinfo resolves to multiple IPs and the second one is
 * used to connect */
TEST(tcp_listen, secondOfTwo, setUp, tearDown, 0, NULL)
{
        const struct AddrinfoResult results[] = {{"127.0.0.2", 9000},
                                                 {"127.0.0.1", 9000}};
        struct fixture *f = data;
        AddrinfoInjectSetResponse(0, 2, results);
        LISTEN(0);
        PEER_CONNECT;
        PEER_HANDSHAKE;
        ACCEPT;
        return MUNIT_OK;
}

/* Simulate a port-already-in-use error by having the addrinfo response
 * contain the same IP twice */
TEST(tcp_listen, alreadyBound, setUp, tearDown, 0, NULL)
{
        /* We need to use the same endpoint three times as a simple duplicate
         * will be skipped due to a strange glibc behavior
         * https://bugzilla.redhat.com/show_bug.cgi?id=496300 */
        const struct AddrinfoResult results[] = {
            {"127.0.0.1", 9000}, {"127.0.0.1", 9000}, {"127.0.0.1", 9000}};
        struct fixture *f = data;
        AddrinfoInjectSetResponse(0, 3, results);
        LISTEN(RAFT_IOERR);
        return MUNIT_OK;
}

/* Error binding the first IP address */
TEST(tcp_listen, cannotBindFirst, setUp, tearDown, 0, NULL)
{
        const struct AddrinfoResult results[] = {{"192.0.2.0", 9000},
                                                 {"127.0.0.1", 9000}};
        struct fixture *f = data;
        AddrinfoInjectSetResponse(0, 2, results);
        LISTEN(RAFT_IOERR);
        return MUNIT_OK;
}

/* Error binding the second IP address */
TEST(tcp_listen, cannotBindSecond, setUp, tearDown, 0, NULL)
{
        const struct AddrinfoResult results[] = {{"127.0.0.1", 9000},
                                                 {"192.0.2.0", 9000}};
        struct fixture *f = data;
        AddrinfoInjectSetResponse(0, 2, results);
        LISTEN(RAFT_IOERR);
        return MUNIT_OK;
}

/* Check error on general dns server failure */
TEST(tcp_listen, resolveFailure, setUp, tearDown, 0, NULL)
{
        struct fixture *f = data;
        AddrinfoInjectSetResponse(EAI_FAIL, 0, NULL);
        LISTEN(RAFT_IOERR);
        return MUNIT_OK;
}

/* The client sends us a bad protocol version */
TEST(tcp_listen, badProtocol, setUp, tearDown, 0, NULL)
{
        struct fixture *f = data;
        LISTEN(0);
        memset(f->handshake.buf, 999, sizeof(uint64_t));
        PEER_CONNECT;
        PEER_HANDSHAKE;
        LOOP_RUN_UNTIL_CONNECTED;
        LOOP_RUN_UNTIL_READ;
        return MUNIT_OK;
}

/* Parameters for sending a partial handshake */
static char *partialHandshakeN[] = {"8", "16", "24", "32", NULL};

static MunitParameterEnum peerAbortParams[] = {
    {"n", partialHandshakeN},
    {NULL, NULL},
};
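/* Why "n" below takes the values 8, 16, 24 and 32: each is a word boundary
 * of the 40-byte handshake built in setUp above, so the listener is cut off
 * after the protocol version, the server ID, the address length, or the
 * first address word respectively. (Sketch for illustration only.) */
enum {
        SKETCH_HANDSHAKE_PROTOCOL_END = 8,
        SKETCH_HANDSHAKE_SERVER_ID_END = 16,
        SKETCH_HANDSHAKE_ADDRESS_LEN_END = 24,
        SKETCH_HANDSHAKE_ADDRESS_WORD_END = 32,
        SKETCH_HANDSHAKE_SIZE = 40
};

/* The peer closes the connection after having sent a partial handshake.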
*/ TEST(tcp_listen, peerAbort, setUp, tearDown, 0, peerAbortParams) { struct fixture *f = data; LISTEN(0); const char *n = munit_parameters_get(params, "n"); PEER_CONNECT; PEER_HANDSHAKE_PARTIAL(atoi(n)); LOOP_RUN_UNTIL_CONNECTED; LOOP_RUN_UNTIL_READ; PEER_CLOSE; return MUNIT_OK; } /* TODO: skip "2" because it makes libuv crash, as it calls abort(). See also * https://github.com/libuv/libuv/issues/1948 */ static char *oomHeapFaultDelay[] = {"0", "1", "3", NULL}; static char *oomHeapFaultRepeat[] = {"1", NULL}; static MunitParameterEnum oomParams[] = { {TEST_HEAP_FAULT_DELAY, oomHeapFaultDelay}, {TEST_HEAP_FAULT_REPEAT, oomHeapFaultRepeat}, {NULL, NULL}, }; /* Out of memory conditions */ TEST(tcp_listen, oom, setUp, tearDown, 0, oomParams) { struct fixture *f = data; LISTEN(0); PEER_CONNECT; PEER_HANDSHAKE; HEAP_FAULT_ENABLE; /* Run as much as possible. */ uv_run(&f->loop, UV_RUN_NOWAIT); uv_run(&f->loop, UV_RUN_NOWAIT); uv_run(&f->loop, UV_RUN_NOWAIT); return MUNIT_OK; } /* Close the transport right after an incoming connection becomes pending, but * it hasn't been accepted yet. */ TEST(tcp_listen, pending, setUp, tearDown, 0, NULL) { struct fixture *f = data; LISTEN(0); PEER_CONNECT; return MUNIT_OK; } /* Close the transport right after an incoming connection gets accepted, and the * peer hasn't sent handshake data yet. */ TEST(tcp_listen, closeBeforeHandshake, setUp, tearDown, 0, NULL) { struct fixture *f = data; LISTEN(0); PEER_CONNECT; LOOP_RUN_UNTIL_CONNECTED; return MUNIT_OK; } static MunitParameterEnum closeDuringHandshake[] = { {"n", partialHandshakeN}, {NULL, NULL}, }; /* Close the transport right after the peer has started to send handshake data, * but isn't done with it yet. */ TEST(tcp_listen, handshake, setUp, tearDown, 0, closeDuringHandshake) { struct fixture *f = data; LISTEN(0); const char *n_param = munit_parameters_get(params, "n"); PEER_CONNECT; PEER_HANDSHAKE_PARTIAL(atoi(n_param)); LOOP_RUN_UNTIL_CONNECTED; LOOP_RUN_UNTIL_READ; return MUNIT_OK; } dqlite-1.16.7/test/raft/integration/test_uv_truncate.c000066400000000000000000000271501465252713400231060ustar00rootroot00000000000000#include "../lib/runner.h" #include "../lib/uv.h" /****************************************************************************** * * Fixture * *****************************************************************************/ struct fixture { FIXTURE_UV_DEPS; FIXTURE_UV; int count; /* To generate deterministic entry data */ }; /****************************************************************************** * * Helper macros * *****************************************************************************/ struct result { int status; bool done; }; static void appendCbAssertResult(struct raft_io_append *req, int status) { struct result *result = req->data; munit_assert_int(status, ==, result->status); result->done = true; } /* Declare and fill the entries array for the append request identified by * I. 
The array will have N entries, and each entry will have a data buffer of * SIZE bytes.*/ #define ENTRIES(I, N, SIZE) \ struct raft_entry _entries##I[N]; \ uint8_t _entries_data##I[N * SIZE]; \ do { \ int _i; \ for (_i = 0; _i < N; _i++) { \ struct raft_entry *entry = &_entries##I[_i]; \ entry->term = 1; \ entry->type = RAFT_COMMAND; \ entry->buf.base = &_entries_data##I[_i * SIZE]; \ entry->buf.len = SIZE; \ entry->batch = NULL; \ munit_assert_ptr_not_null(entry->buf.base); \ memset(entry->buf.base, 0, entry->buf.len); \ f->count++; \ *(uint64_t *)entry->buf.base = f->count; \ } \ } while (0) /* Submit an append request identified by I, with N_ENTRIES entries, each one of * size ENTRY_SIZE. */ #define APPEND_SUBMIT(I, N_ENTRIES, ENTRY_SIZE) \ struct raft_io_append _req##I; \ struct result _result##I = {0, false}; \ int _rv##I; \ ENTRIES(I, N_ENTRIES, ENTRY_SIZE); \ _req##I.data = &_result##I; \ _rv##I = f->io.append(&f->io, &_req##I, _entries##I, N_ENTRIES, \ appendCbAssertResult); \ munit_assert_int(_rv##I, ==, 0) /* Wait for the append request identified by I to complete. */ #define APPEND_WAIT(I) LOOP_RUN_UNTIL(&_result##I.done) #define APPEND_EXPECT(I, STATUS) _result##I.status = STATUS /* Submit an append request and wait for it to complete successfully. */ #define APPEND(N) \ do { \ APPEND_SUBMIT(9999, N, 8); \ APPEND_WAIT(9999); \ } while (0) #define TRUNCATE(N) \ do { \ int rv_; \ rv_ = f->io.truncate(&f->io, N); \ munit_assert_int(rv_, ==, 0); \ } while (0) /****************************************************************************** * * Set up and tear down. * *****************************************************************************/ static void *setUp(const MunitParameter params[], void *user_data) { struct fixture *f = munit_malloc(sizeof *f); SETUP_UV_DEPS; SETUP_UV; f->count = 0; return f; } static void tearDownDeps(void *data) { struct fixture *f = data; TEAR_DOWN_UV_DEPS; free(f); } /****************************************************************************** * * Assertions * *****************************************************************************/ /* Shutdown the fixture's raft_io instance, then load all entries on disk using * a new raft_io instance, and assert that there are N entries with data * matching the DATA array. */ #define ASSERT_ENTRIES(N, ...)
\ TEAR_DOWN_UV; \ do { \ struct uv_loop_s _loop; \ struct raft_uv_transport _transport; \ struct raft_io _io; \ raft_term _term; \ raft_id _voted_for; \ struct raft_snapshot *_snapshot; \ raft_index _start_index; \ struct raft_entry *_entries; \ size_t _i; \ size_t _n; \ void *_batch = NULL; \ unsigned _data[N] = {__VA_ARGS__}; \ int _rv; \ \ _rv = uv_loop_init(&_loop); \ munit_assert_int(_rv, ==, 0); \ _transport.version = 1; \ _rv = raft_uv_tcp_init(&_transport, &_loop); \ munit_assert_int(_rv, ==, 0); \ _rv = raft_uv_init(&_io, &_loop, f->dir, &_transport); \ munit_assert_int(_rv, ==, 0); \ _rv = _io.init(&_io, 1, "1"); \ munit_assert_int(_rv, ==, 0); \ _rv = _io.load(&_io, &_term, &_voted_for, &_snapshot, &_start_index, \ &_entries, &_n); \ munit_assert_int(_rv, ==, 0); \ _io.close(&_io, NULL); \ uv_run(&_loop, UV_RUN_NOWAIT); \ raft_uv_close(&_io); \ raft_uv_tcp_close(&_transport); \ uv_loop_close(&_loop); \ \ munit_assert_ptr_null(_snapshot); \ munit_assert_int(_n, ==, N); \ for (_i = 0; _i < _n; _i++) { \ struct raft_entry *_entry = &_entries[_i]; \ uint64_t _value = *(uint64_t *)_entry->buf.base; \ munit_assert_int(_entry->term, ==, 1); \ munit_assert_int(_entry->type, ==, RAFT_COMMAND); \ munit_assert_int(_value, ==, _data[_i]); \ munit_assert_ptr_not_null(_entry->batch); \ } \ for (_i = 0; _i < _n; _i++) { \ struct raft_entry *_entry = &_entries[_i]; \ if (_entry->batch != _batch) { \ _batch = _entry->batch; \ raft_free(_batch); \ } \ } \ raft_free(_entries); \ } while (0); /****************************************************************************** * * raft_io->truncate() * *****************************************************************************/ SUITE(truncate) /* If the index to truncate is at the start of a segment, that segment and all * subsequent ones are removed. */ TEST(truncate, wholeSegment, setUp, tearDownDeps, 0, NULL) { struct fixture *f = data; APPEND(3); TRUNCATE(1); APPEND(1); ASSERT_ENTRIES(1 /* n entries */, 4 /* entries data */); return MUNIT_OK; } /* The index to truncate is the same as the last appended entry. */ TEST(truncate, sameAsLastIndex, setUp, tearDownDeps, 0, NULL) { struct fixture *f = data; APPEND(3); TRUNCATE(3); APPEND(1); ASSERT_ENTRIES(3 /* n entries */, 1, 2, 4 /* entries data */); return MUNIT_OK; } /* If the index to truncate is not at the start of a segment, that segment gets * truncated. */ TEST(truncate, partialSegment, setUp, tearDownDeps, 0, NULL) { struct fixture *f = data; APPEND(3); APPEND(1); TRUNCATE(2); APPEND(1); ASSERT_ENTRIES(2, /* n entries */ 1, 5 /* entries data */ ); return MUNIT_OK; } /* The truncate request is issued while an append request is still pending. */ TEST(truncate, pendingAppend, setUp, tearDownDeps, 0, NULL) { struct fixture *f = data; APPEND_SUBMIT(0, /* request ID */ 3, /* n entries */ 8 /* entry size */ ); TRUNCATE(2 /* truncation index */); APPEND(1); ASSERT_ENTRIES(2, /* n entries */ 1, 4 /* entries data */ ); return MUNIT_OK; } /* Multiple truncate requests pending at the same time. */ TEST(truncate, multiplePending, setUp, tearDownDeps, 0, NULL) { struct fixture *f = data; APPEND_SUBMIT(0, /* request ID */ 3, /* n entries */ 8 /* entry size */ ); TRUNCATE(2 /* truncation index */); APPEND_SUBMIT(1, /* request ID */ 2, /* n entries */ 8 /* entry size */ ); TRUNCATE(3 /* truncation index */); APPEND(1); ASSERT_ENTRIES(3, /* n entries */ 1, 4, 6 /* entries data */ ); return MUNIT_OK; } /* The truncate request gets canceled because we're closing. 
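 * The append request submitted before the truncation is expected to fail * with RAFT_CANCELED (see APPEND_EXPECT below).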
*/ TEST(truncate, closing, setUp, tearDownDeps, 0, NULL) { struct fixture *f = data; APPEND_SUBMIT(0, /* request ID */ 3, /* n entries */ 8 /* entry size */ ); TRUNCATE(2 /* truncation index */); APPEND_EXPECT(0, /* request ID */ RAFT_CANCELED /* status */ ); TEAR_DOWN_UV; return MUNIT_OK; } /* Multiple truncate requests get canceled because we're closing. */ TEST(truncate, closingMultiple, setUp, tearDownDeps, 0, NULL) { struct fixture *f = data; APPEND_SUBMIT(0, /* request ID */ 3, /* n entries */ 8 /* entry size */ ); TRUNCATE(2 /* truncation index */); APPEND_SUBMIT(1, /* request ID */ 2, /* n entries */ 8 /* entry size */ ); TRUNCATE(3 /* truncation index */); APPEND_EXPECT(0, /* request ID */ RAFT_CANCELED /* status */ ); APPEND_EXPECT(1, /* request ID */ RAFT_CANCELED /* status */ ); TEAR_DOWN_UV; return MUNIT_OK; } dqlite-1.16.7/test/raft/integration/test_uv_truncate_snapshot.c000066400000000000000000000263311465252713400250250ustar00rootroot00000000000000#include "../lib/runner.h" #include "../lib/uv.h" /****************************************************************************** * * Fixture * *****************************************************************************/ struct fixture { FIXTURE_UV_DEPS; FIXTURE_UV; int count; /* To generate deterministic entry data */ }; /****************************************************************************** * * Helper macros * *****************************************************************************/ /* Maximum number of blocks a segment can have */ #define MAX_SEGMENT_BLOCKS 4 /* This block size should work fine for all file systems. */ #define SEGMENT_BLOCK_SIZE 4096 /* Default segment size */ #define SEGMENT_SIZE (4096 * MAX_SEGMENT_BLOCKS) struct result { int status; bool done; void *data; }; static void appendCbAssertResult(struct raft_io_append *req, int status) { struct result *result = req->data; munit_assert_int(status, ==, result->status); result->done = true; } static void snapshotPutCbAssertResult(struct raft_io_snapshot_put *req, int status) { struct result *result = req->data; munit_assert_int(status, ==, result->status); result->done = true; } /* Declare and fill the entries array for the append request identified by * I. The array will have N entries, and each entry will have a data buffer of * SIZE bytes.*/ #define ENTRIES(I, N, SIZE) \ struct raft_entry _entries##I[N]; \ uint8_t _entries_data##I[N * SIZE]; \ do { \ int _i; \ for (_i = 0; _i < N; _i++) { \ struct raft_entry *entry = &_entries##I[_i]; \ entry->term = 1; \ entry->type = RAFT_COMMAND; \ entry->buf.base = &_entries_data##I[_i * SIZE]; \ entry->buf.len = SIZE; \ entry->batch = NULL; \ munit_assert_ptr_not_null(entry->buf.base); \ memset(entry->buf.base, 0, entry->buf.len); \ f->count++; \ *(uint64_t *)entry->buf.base = f->count; \ } \ } while (0) /* Submit an append request identified by I, with N_ENTRIES entries, each one of * size ENTRY_SIZE. */ #define APPEND_SUBMIT(I, N_ENTRIES, ENTRY_SIZE) \ struct raft_io_append _req##I; \ struct result _result##I = {0, false, NULL}; \ int _rv##I; \ ENTRIES(I, N_ENTRIES, ENTRY_SIZE); \ _req##I.data = &_result##I; \ _rv##I = f->io.append(&f->io, &_req##I, _entries##I, N_ENTRIES, \ appendCbAssertResult); \ munit_assert_int(_rv##I, ==, 0) #define TRUNCATE(N) \ do { \ int rv_; \ rv_ = f->io.truncate(&f->io, N); \ munit_assert_int(rv_, ==, 0); \ } while (0) /****************************************************************************** * * Set up and tear down.
* *****************************************************************************/ static void *setUp(const MunitParameter params[], void *user_data) { struct fixture *f = munit_malloc(sizeof *f); SETUP_UV_DEPS; SETUP_UV; raft_uv_set_block_size(&f->io, SEGMENT_BLOCK_SIZE); raft_uv_set_segment_size(&f->io, SEGMENT_SIZE); f->count = 0; return f; } static void tearDownDeps(void *data) { struct fixture *f = data; TEAR_DOWN_UV_DEPS; free(f); } /****************************************************************************** * * Assertions * *****************************************************************************/ /* Shutdown the fixture's raft_io instance, then load all entries on disk using * a new raft_io instance, and assert that there are N entries with data * matching the DATA array. */ #define ASSERT_ENTRIES(N, ...) \ TEAR_DOWN_UV; \ do { \ struct uv_loop_s _loop; \ struct raft_uv_transport _transport; \ struct raft_io _io; \ raft_term _term; \ raft_id _voted_for; \ struct raft_snapshot *_snap; \ raft_index _start_index; \ struct raft_entry *_entries; \ size_t _i; \ size_t _n; \ void *_batch = NULL; \ unsigned _data[N] = {__VA_ARGS__}; \ int _ret; \ \ _ret = uv_loop_init(&_loop); \ munit_assert_int(_ret, ==, 0); \ _transport.version = 1; \ _ret = raft_uv_tcp_init(&_transport, &_loop); \ munit_assert_int(_ret, ==, 0); \ _ret = raft_uv_init(&_io, &_loop, f->dir, &_transport); \ munit_assert_int(_ret, ==, 0); \ _ret = _io.init(&_io, 1, "1"); \ munit_assert_int(_ret, ==, 0); \ _ret = _io.load(&_io, &_term, &_voted_for, &_snap, &_start_index, \ &_entries, &_n); \ munit_assert_int(_ret, ==, 0); \ _io.close(&_io, NULL); \ uv_run(&_loop, UV_RUN_NOWAIT); \ raft_uv_close(&_io); \ raft_uv_tcp_close(&_transport); \ uv_loop_close(&_loop); \ \ munit_assert_size(_n, ==, N); \ for (_i = 0; _i < _n; _i++) { \ struct raft_entry *_entry = &_entries[_i]; \ uint64_t _value = *(uint64_t *)_entry->buf.base; \ munit_assert_int(_entry->term, ==, 1); \ munit_assert_int(_entry->type, ==, RAFT_COMMAND); \ munit_assert_int(_value, ==, _data[_i]); \ munit_assert_ptr_not_null(_entry->batch); \ } \ for (_i = 0; _i < _n; _i++) { \ struct raft_entry *_entry = &_entries[_i]; \ if (_entry->batch != _batch) { \ _batch = _entry->batch; \ raft_free(_batch); \ } \ } \ raft_free(_entries); \ if (_snap != NULL) { \ raft_configuration_close(&_snap->configuration); \ munit_assert_int(_snap->n_bufs, ==, 1); \ raft_free(_snap->bufs[0].base); \ raft_free(_snap->bufs); \ raft_free(_snap); \ } \ } while (0); #define SNAPSHOT_PUT_REQ(TRAILING, INDEX, RV, STATUS) \ struct raft_snapshot _snapshot; \ struct raft_buffer _snapshot_buf; \ uint64_t _snapshot_data; \ struct raft_io_snapshot_put _req; \ struct result _result = {STATUS, false, NULL}; \ int _rv; \ _snapshot.term = 1; \ _snapshot.index = INDEX; \ raft_configuration_init(&_snapshot.configuration); \ _rv = raft_configuration_add(&_snapshot.configuration, 1, "1", \ RAFT_STANDBY); \ munit_assert_int(_rv, ==, 0); \ _snapshot.bufs = &_snapshot_buf; \ _snapshot.n_bufs = 1; \ _snapshot_buf.base = &_snapshot_data; \ _snapshot_buf.len = sizeof _snapshot_data; \ _req.data = &_result; \ _rv = f->io.snapshot_put(&f->io, TRAILING, &_req, &_snapshot, \ snapshotPutCbAssertResult); \ munit_assert_int(_rv, ==, RV) #define SNAPSHOT_CLEANUP() raft_configuration_close(&_snapshot.configuration) /****************************************************************************** * * test interaction of raft_io->snapshot_put and raft_io->truncate() * 
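* Both snapshot_put and truncate are serialized through a barrier, so the * truncation only runs once the snapshot has been taken (see the uv_barrier * comments in the test below). *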
*****************************************************************************/ SUITE(snapshot_truncate) /* Fill up 3 segments' worth of data, then take a snapshot. * While the snapshot is taken, start a truncate request. */ TEST(snapshot_truncate, snapshotThenTruncate, setUp, tearDownDeps, 0, NULL) { struct fixture *f = data; APPEND_SUBMIT(0, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE); APPEND_SUBMIT(1, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE); APPEND_SUBMIT(2, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE); /* Take a snapshot; this will use a uv_barrier. */ SNAPSHOT_PUT_REQ(8192, 6, 0, 0); /* Truncate; this will use a uv_barrier too. */ TRUNCATE(8); /* There's no truncate callback to wait for, so loop for a while. */ LOOP_RUN(1000); /* Check that truncate has done its job. */ ASSERT_ENTRIES(7, 1, 2, 3, 4, 5, 6, 7); SNAPSHOT_CLEANUP(); return MUNIT_OK; } dqlite-1.16.7/test/raft/integration/test_uv_work.c000066400000000000000000000044271465252713400222450ustar00rootroot00000000000000#include <unistd.h> #include "../../../src/raft/uv.h" #include "../lib/dir.h" #include "../lib/loop.h" #include "../lib/runner.h" #include "../lib/uv.h" /****************************************************************************** * * Fixture * *****************************************************************************/ struct fixture { FIXTURE_UV_DEPS; FIXTURE_UV; }; struct result { int rv; /* Indicate success or failure of the work */ int counter; /* Proof that work was performed */ bool done; /* To check test termination */ }; /****************************************************************************** * * Set up and tear down. * *****************************************************************************/ static void *setUp(const MunitParameter params[], void *user_data) { struct fixture *f = munit_malloc(sizeof *f); SETUP_UV_DEPS; SETUP_UV; return f; } static void tearDownDeps(void *data) { struct fixture *f = data; if (f == NULL) { return; } TEAR_DOWN_UV_DEPS; free(f); } static void tearDown(void *data) { struct fixture *f = data; if (f == NULL) { return; } TEAR_DOWN_UV; tearDownDeps(f); } /****************************************************************************** * * UvAsyncWork * *****************************************************************************/ static void asyncWorkCbAssertResult(struct raft_io_async_work *req, int status) { struct result *r = req->data; munit_assert_int(status, ==, r->rv); munit_assert_int(r->counter, ==, 1); r->done = true; } static int asyncWorkFn(struct raft_io_async_work *req) { struct result *r = req->data; sleep(1); r->counter = 1; return r->rv; } SUITE(UvAsyncWork) static char *rvs[] = {"-1", "0", "1", "37", NULL}; static MunitParameterEnum rvs_params[] = { {"rv", rvs}, {NULL, NULL}, }; TEST(UvAsyncWork, work, setUp, tearDown, 0, rvs_params) { struct fixture *f = data; struct result res = {0}; struct raft_io_async_work req = {0}; res.rv = (int)strtol(munit_parameters_get(params, "rv"), NULL, 0); req.data = &res; req.work = asyncWorkFn; UvAsyncWork(&f->io, &req, asyncWorkCbAssertResult); LOOP_RUN_UNTIL(&res.done); return MUNIT_OK; } dqlite-1.16.7/test/raft/integration/test_voter_contacts.c000066400000000000000000000053761465252713400236060ustar00rootroot00000000000000#include "../lib/cluster.h" #include "../lib/runner.h" #define N_SERVERS 3 /****************************************************************************** * * Fixture with a test raft cluster.
* *****************************************************************************/ struct fixture { FIXTURE_CLUSTER; }; /****************************************************************************** * * Helper macros * *****************************************************************************/ #define STEP_N(N) raft_fixture_step_n(&f->cluster, N) /****************************************************************************** * * Set up a cluster with three servers. * *****************************************************************************/ static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) { struct fixture *f = munit_malloc(sizeof *f); SETUP_CLUSTER(N_SERVERS); CLUSTER_BOOTSTRAP; CLUSTER_START; CLUSTER_ELECT(0); return f; } static void tearDown(void *data) { struct fixture *f = data; TEAR_DOWN_CLUSTER; free(f); } /****************************************************************************** * * raft_voter_contacts * *****************************************************************************/ SUITE(raft_voter_contacts) TEST(raft_voter_contacts, upToDate, setUp, tearDown, 0, NULL) { struct fixture *f = data; CLUSTER_STEP_UNTIL_HAS_LEADER(1000); CLUSTER_STEP_N(1000); /* N node cluster with leader */ for (unsigned int i = 0; i < N_SERVERS; i++) { int count = raft_voter_contacts(CLUSTER_RAFT(i)); if (i == CLUSTER_LEADER) { munit_assert_int(count, ==, N_SERVERS); } else { munit_assert_int(count, ==, -1); } } /* Kill the cluster leader, so a new leader is elected and the number of * contacted voters decreases */ unsigned int leader = CLUSTER_LEADER; CLUSTER_KILL(leader); CLUSTER_STEP_UNTIL_HAS_LEADER(1000); CLUSTER_STEP_N(1000); for (unsigned int i = 0; i < N_SERVERS; i++) { if (i == leader) { continue; } int count = raft_voter_contacts(CLUSTER_RAFT(i)); if (i == CLUSTER_LEADER) { munit_assert_int(count, ==, N_SERVERS - 1); } else { munit_assert_int(count, ==, -1); } } /* Revive the old leader, so the count should go back up */ CLUSTER_REVIVE(leader); CLUSTER_STEP_N(1000); for (unsigned int i = 0; i < N_SERVERS; i++) { int count = raft_voter_contacts(CLUSTER_RAFT(i)); if (i == CLUSTER_LEADER) { munit_assert_int(count, ==, N_SERVERS); } else { munit_assert_int(count, ==, -1); } } return MUNIT_OK; } dqlite-1.16.7/test/raft/lib/000077500000000000000000000000001465252713400155625ustar00rootroot00000000000000dqlite-1.16.7/test/raft/lib/addrinfo.c000066400000000000000000000130351465252713400175160ustar00rootroot00000000000000#include "addrinfo.h" #include <dlfcn.h> #include <netdb.h> #include <stdlib.h> #include <sys/socket.h> #include <uv.h> bool addrinfo_mock_enabled = false; enum addrinfo_mock_state { MockResultSet, MockResultReturned, SystemResult }; struct addrinfo_mock_data { enum addrinfo_mock_state state; int rv; struct addrinfo *result; struct addrinfo_mock_data *next; }; static struct addrinfo_mock_data *addrinfo_data; void AddrinfoInjectSetUp(MUNIT_UNUSED const MunitParameter params[]) { munit_assert_int(addrinfo_mock_enabled, ==, false); munit_assert_ptr((void *)addrinfo_data, ==, NULL); addrinfo_mock_enabled = true; } void AddrinfoInjectTearDown(void) { munit_assert_int(addrinfo_mock_enabled, ==, true); // If data was not freed here, freeaddrinfo was never invoked.
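// In other words, some getaddrinfo result was leaked: every injected or // recorded response must be released with freeaddrinfo, exactly as // production code would do.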
munit_assert_ptr((void *)addrinfo_data, ==, NULL); addrinfo_mock_enabled = false; } void AddrinfoInjectSetResponse(int rv, int num_results, const struct AddrinfoResult *results) { munit_assert_int(addrinfo_mock_enabled, ==, true); munit_assert(!addrinfo_data || addrinfo_data->state == MockResultReturned); munit_assert(rv || (num_results && results)); struct addrinfo_mock_data *response = malloc(sizeof(struct addrinfo_mock_data)); munit_assert_ptr((void *)response, !=, NULL); response->state = MockResultSet; response->rv = rv; response->result = NULL; for (int i = num_results - 1; i >= 0; --i) { struct sockaddr_in *addr_in = malloc(sizeof(struct sockaddr_in)); munit_assert_ptr((void *)addr_in, !=, NULL); munit_assert_int(uv_ip4_addr(results[i].ip, results[i].port, addr_in), ==, 0); struct addrinfo *ai = malloc(sizeof(struct addrinfo)); munit_assert_ptr((void *)ai, !=, NULL); ai->ai_flags = 0; ai->ai_family = AF_INET; ai->ai_socktype = SOCK_STREAM; ai->ai_protocol = IPPROTO_TCP; ai->ai_addrlen = sizeof(struct sockaddr_in); ai->ai_addr = (struct sockaddr *)addr_in; ai->ai_canonname = NULL; ai->ai_next = response->result; response->result = ai; } response->next = addrinfo_data; addrinfo_data = response; } static int invoke_system_getaddrinfo(const char *node, const char *service, const struct addrinfo *hints, struct addrinfo **res) { int (*system_getaddrinfo)(const char *node, const char *service, const struct addrinfo *hints, struct addrinfo **res); *(void **)(&system_getaddrinfo) = dlsym(RTLD_NEXT, "getaddrinfo"); munit_assert_ptr(*(void **)&system_getaddrinfo, !=, NULL); return (*system_getaddrinfo)(node, service, hints, res); } int getaddrinfo(const char *node, const char *service, const struct addrinfo *hints, struct addrinfo **res) { int rv; if (!addrinfo_mock_enabled) { return invoke_system_getaddrinfo(node, service, hints, res); } if (!addrinfo_data || addrinfo_data->state == SystemResult) { /* We have not injected response, invoke system function */ rv = invoke_system_getaddrinfo(node, service, hints, res); if (!rv) { /* Store result for check on freeaddrinfo */ struct addrinfo_mock_data *response = malloc(sizeof(struct addrinfo_mock_data)); munit_assert_ptr((void *)response, !=, NULL); response->state = SystemResult; response->rv = rv; response->result = *res; response->next = addrinfo_data; addrinfo_data = response; } return rv; } if (addrinfo_data) { munit_assert_int(addrinfo_data->state, ==, MockResultSet); addrinfo_data->state = MockResultReturned; rv = addrinfo_data->rv; if (!rv) { *res = addrinfo_data->result; } else { *res = NULL; struct addrinfo_mock_data *response = addrinfo_data; munit_assert_ptr((void *)response->result, ==, NULL); addrinfo_data = response->next; free(response); } return rv; } return EAI_FAIL; } static void invoke_system_freeaddrinfo(struct addrinfo *res) { int (*system_freeaddrinfo)(struct addrinfo * res); *(void **)(&system_freeaddrinfo) = dlsym(RTLD_NEXT, "freeaddrinfo"); munit_assert_ptr(*(void **)&system_freeaddrinfo, !=, NULL); (*system_freeaddrinfo)(res); } void freeaddrinfo(struct addrinfo *res) { struct addrinfo_mock_data **ptr; struct addrinfo_mock_data *response; // freeaddrinfo should not be invoked with a NULL pointer munit_assert_ptr((void *)res, !=, NULL); if (!addrinfo_mock_enabled) { invoke_system_freeaddrinfo(res); return; } for (ptr = &addrinfo_data; *ptr; ptr = &((*ptr)->next)) { if ((*ptr)->result == res) { break; } } response = *ptr; munit_assert_ptr((void *)response, !=, NULL); *ptr = response->next; if (response->state == 
SystemResult) { invoke_system_freeaddrinfo(response->result); } else { munit_assert_int(response->state, ==, MockResultReturned); res = response->result; while (res) { struct addrinfo *next = res->ai_next; free(res->ai_addr); free(res); res = next; } } free(response); } dqlite-1.16.7/test/raft/lib/addrinfo.h000066400000000000000000000023771465252713400175270ustar00rootroot00000000000000/* Support for getaddrinfo injection for test purposes * * Provide a locally bound version to capture the getaddrinfo/freeaddrinfo * invocations. The helper may operate in three different modes: a) Transparently * forward calls to the system getaddrinfo/freeaddrinfo functions, if * SET_UP_ADDRINFO/TEAR_DOWN_ADDRINFO are not added to the test case setup and * teardown. b) Check that all results requested via getaddrinfo are freed using * freeaddrinfo. Activated by adding the SET_UP_ADDRINFO/TEAR_DOWN_ADDRINFO * macros to the test fixture. c) In addition to b), inject artificial responses * into the getaddrinfo requests for test purposes by using * AddrinfoInjectSetResponse before triggering the getaddrinfo calls. */ #ifndef TEST_ADDRINFO_H #define TEST_ADDRINFO_H #include "munit.h" #define SET_UP_ADDRINFO AddrinfoInjectSetUp(params) #define TEAR_DOWN_ADDRINFO AddrinfoInjectTearDown() typedef struct AddrinfoResult { const char *ip; const int port; } AddrinfoResult_t; void AddrinfoInjectSetResponse(int rv, int num_results, const struct AddrinfoResult *results); void AddrinfoInjectSetUp(const MunitParameter params[]); void AddrinfoInjectTearDown(void); #endif // #ifndef TEST_ADDRINFO_H dqlite-1.16.7/test/raft/lib/aio.c000066400000000000000000000027141465252713400165020ustar00rootroot00000000000000#include "aio.h" #include <errno.h> #include <fcntl.h> #include <sys/syscall.h> #include <unistd.h> #include "munit.h" int AioFill(aio_context_t *ctx, unsigned n) { char buf[256]; int fd; int rv; int limit; int used; /* Figure out how many events are available. */ fd = open("/proc/sys/fs/aio-max-nr", O_RDONLY); munit_assert_int(fd, !=, -1); rv = read(fd, buf, sizeof buf); munit_assert_int(rv, !=, -1); close(fd); limit = atoi(buf); munit_assert_int(limit, >, 0); /* Figure out how many events are in use. */ fd = open("/proc/sys/fs/aio-nr", O_RDONLY); munit_assert_int(fd, !=, -1); rv = read(fd, buf, sizeof buf); munit_assert_int(rv, !=, -1); close(fd); used = atoi(buf); munit_assert_int(used, >=, 0); /* Best-effort check that no other process is using AIO. Our own unit test * cases use up to 2 event slots at the time this function is called, so we * don't consider those. */ if (used > 2) { return -1; } rv = syscall(__NR_io_setup, limit - used - n, ctx); if (rv != 0) { /* The `limit - used - n` calculation is racy and io_setup can fail with * EAGAIN if in the meantime another process has reserved some events */ munit_assert_int(errno, ==, EAGAIN); return -1; } return 0; } void AioDestroy(aio_context_t ctx) { int rv; rv = syscall(__NR_io_destroy, ctx); munit_assert_int(rv, ==, 0); } dqlite-1.16.7/test/raft/lib/aio.h000066400000000000000000000012011465252713400164750ustar00rootroot00000000000000/* Utilities around the Kernel AIO sub-system. */ #ifndef TEST_AIO_H #define TEST_AIO_H #include <linux/aio_abi.h> /* Fill the AIO subsystem resources by allocating a lot of events to the given * context, and leaving only @n events available for subsequent calls to * @io_setup. * * Return -1 if it looks like there is another process already using the AIO * subsystem, which would most probably make the calling test flaky because * there won't be exactly @n events available anymore.
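 * * A typical use in a test would be (hypothetical sketch, not taken from this * code base): * * aio_context_t ctx = 0; * if (AioFill(&ctx, 0) != 0) { * return MUNIT_SKIP; * } * // ... exercise code whose own io_setup() call must now fail ... * AioDestroy(ctx); *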
*/ int AioFill(aio_context_t *ctx, unsigned n); /* Destroy the given AIO context. */ void AioDestroy(aio_context_t ctx); #endif /* TEST_AIO_H */ dqlite-1.16.7/test/raft/lib/cluster.c000066400000000000000000000027571465252713400174130ustar00rootroot00000000000000#include "cluster.h" static void randomize(struct raft_fixture *f, unsigned i, int what) { struct raft *raft = raft_fixture_get(f, i); switch (what) { case RAFT_FIXTURE_TICK: /* TODO: provide an API to inspect how much time has elapsed since * the last election timer reset */ if (raft->election_timer_start == raft->io->time(raft->io)) { raft_fixture_set_randomized_election_timeout( f, i, munit_rand_int_range(raft->election_timeout, raft->election_timeout * 2)); } break; case RAFT_FIXTURE_DISK: raft_fixture_set_disk_latency(f, i, munit_rand_int_range(10, 25)); break; case RAFT_FIXTURE_NETWORK: raft_fixture_set_network_latency(f, i, munit_rand_int_range(25, 50)); break; default: munit_assert(0); break; } } void cluster_randomize_init(struct raft_fixture *f) { unsigned i; for (i = 0; i < raft_fixture_n(f); i++) { randomize(f, i, RAFT_FIXTURE_TICK); randomize(f, i, RAFT_FIXTURE_DISK); randomize(f, i, RAFT_FIXTURE_NETWORK); } } void cluster_randomize(struct raft_fixture *f, struct raft_fixture_event *event) { unsigned index = raft_fixture_event_server_index(event); int type = raft_fixture_event_type(event); randomize(f, index, type); } dqlite-1.16.7/test/raft/lib/cluster.h000066400000000000000000000535301465252713400174220ustar00rootroot00000000000000/* Setup and drive a test raft cluster. */ #ifndef TEST_CLUSTER_H #define TEST_CLUSTER_H #include <stdlib.h> #include "../../../src/raft.h" #include "fsm.h" #include "heap.h" #include "munit.h" #include "snapshot.h" #define FIXTURE_CLUSTER \ FIXTURE_HEAP; \ struct raft_fsm fsms[RAFT_FIXTURE_MAX_SERVERS]; \ struct raft_fixture cluster /* N is the default number of servers, but can be tweaked with the cluster-n * parameter.
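 * * For instance, a test that wants a five-server cluster could declare * (illustrative sketch only, the names are arbitrary): * * static char *cluster_n[] = {"5", NULL}; * static MunitParameterEnum params[] = { * {CLUSTER_N_PARAM, cluster_n}, * {NULL, NULL}, * }; *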
*/ #define SETUP_CLUSTER(DEFAULT_N) \ SET_UP_HEAP; \ do { \ unsigned _n = DEFAULT_N; \ bool _pre_vote = false; \ bool _ss_async = false; \ int _fsm_version = 3; \ unsigned _hb = 0; \ unsigned _i; \ int _rv; \ if (munit_parameters_get(params, CLUSTER_N_PARAM) != NULL) { \ _n = atoi(munit_parameters_get(params, CLUSTER_N_PARAM)); \ } \ if (munit_parameters_get(params, CLUSTER_PRE_VOTE_PARAM) != NULL) { \ _pre_vote = \ atoi(munit_parameters_get(params, CLUSTER_PRE_VOTE_PARAM)); \ } \ if (munit_parameters_get(params, CLUSTER_HEARTBEAT_PARAM) != NULL) { \ _hb = atoi(munit_parameters_get(params, CLUSTER_HEARTBEAT_PARAM)); \ } \ if (munit_parameters_get(params, CLUSTER_SS_ASYNC_PARAM) != NULL) { \ _ss_async = \ atoi(munit_parameters_get(params, CLUSTER_SS_ASYNC_PARAM)); \ } \ if (munit_parameters_get(params, CLUSTER_FSM_VERSION_PARAM) != NULL) { \ _fsm_version = \ atoi(munit_parameters_get(params, CLUSTER_FSM_VERSION_PARAM)); \ } \ munit_assert_int(_n, >, 0); \ _rv = raft_fixture_init(&f->cluster); \ munit_assert_int(_rv, ==, 0); \ for (_i = 0; _i < _n; _i++) { \ if (!_ss_async || _fsm_version < 3) { \ FsmInit(&f->fsms[_i], _fsm_version); \ } else { \ FsmInitAsync(&f->fsms[_i], _fsm_version); \ } \ _rv = raft_fixture_grow(&f->cluster, &f->fsms[_i]); \ munit_assert_int(_rv, ==, 0); \ } \ for (_i = 0; _i < _n; _i++) { \ raft_set_pre_vote(raft_fixture_get(&f->cluster, _i), _pre_vote); \ if (_hb) { \ raft_set_heartbeat_timeout(raft_fixture_get(&f->cluster, _i), \ _hb); \ } \ } \ } while (0) #define TEAR_DOWN_CLUSTER \ do { \ unsigned i; \ raft_fixture_close(&f->cluster); \ for (i = 0; i < CLUSTER_N; i++) { \ FsmClose(&f->fsms[i]); \ } \ } while (0); \ TEAR_DOWN_HEAP; /* Munit parameter for setting the number of servers */ #define CLUSTER_N_PARAM "cluster-n" /* Munit parameter for setting the number of voting servers */ #define CLUSTER_N_VOTING_PARAM "cluster-n-voting" /* Munit parameter for enabling pre-vote */ #define CLUSTER_PRE_VOTE_PARAM "cluster-pre-vote" /* Munit parameter for setting HeartBeat timeout */ #define CLUSTER_HEARTBEAT_PARAM "cluster-heartbeat" /* Munit parameter for setting snapshot behaviour */ #define CLUSTER_SS_ASYNC_PARAM "cluster-snapshot-async" /* Munit parameter for setting fsm version */ #define CLUSTER_FSM_VERSION_PARAM "fsm-version" /* Get the number of servers in the cluster. */ #define CLUSTER_N raft_fixture_n(&f->cluster) /* Get the cluster time. */ #define CLUSTER_TIME raft_fixture_time(&f->cluster) /* Index of the current leader, or CLUSTER_N if there's no leader. */ #define CLUSTER_LEADER raft_fixture_leader_index(&f->cluster) /* True if the cluster has a leader. */ #define CLUSTER_HAS_LEADER CLUSTER_LEADER < CLUSTER_N /* Get the struct raft object of the I'th server. */ #define CLUSTER_RAFT(I) raft_fixture_get(&f->cluster, I) /* Get the state of the I'th server. */ #define CLUSTER_STATE(I) raft_state(raft_fixture_get(&f->cluster, I)) /* Get the current term of the I'th server. */ #define CLUSTER_TERM(I) raft_fixture_get(&f->cluster, I)->current_term /* Get the struct fsm object of the I'th server. */ #define CLUSTER_FSM(I) &f->fsms[I] /* Return the last applied index on the I'th server. */ #define CLUSTER_LAST_APPLIED(I) \ raft_last_applied(raft_fixture_get(&f->cluster, I)) /* Return the ID of the server the I'th server has voted for. */ #define CLUSTER_VOTED_FOR(I) raft_fixture_voted_for(&f->cluster, I) /* Return a description of the last error occurred on the I'th server. 
*/ #define CLUSTER_ERRMSG(I) raft_errmsg(CLUSTER_RAFT(I)) /* Populate the given configuration with all servers in the fixture. All servers * will be voting. */ #define CLUSTER_CONFIGURATION(CONF) \ { \ int rv_; \ rv_ = raft_fixture_configuration(&f->cluster, CLUSTER_N, CONF); \ munit_assert_int(rv_, ==, 0); \ } /* Bootstrap all servers in the cluster. All servers will be voting, unless the * cluster-n-voting parameter is used. */ #define CLUSTER_BOOTSTRAP \ { \ unsigned n_ = CLUSTER_N; \ int rv_; \ struct raft_configuration configuration; \ if (munit_parameters_get(params, CLUSTER_N_VOTING_PARAM) != NULL) { \ n_ = atoi(munit_parameters_get(params, CLUSTER_N_VOTING_PARAM)); \ } \ rv_ = raft_fixture_configuration(&f->cluster, n_, &configuration); \ munit_assert_int(rv_, ==, 0); \ rv_ = raft_fixture_bootstrap(&f->cluster, &configuration); \ munit_assert_int(rv_, ==, 0); \ raft_configuration_close(&configuration); \ } /* Bootstrap all servers in the cluster. Only the first N servers will be * voting. */ #define CLUSTER_BOOTSTRAP_N_VOTING(N) \ { \ int rv_; \ struct raft_configuration configuration_; \ rv_ = raft_fixture_configuration(&f->cluster, N, &configuration_); \ munit_assert_int(rv_, ==, 0); \ rv_ = raft_fixture_bootstrap(&f->cluster, &configuration_); \ munit_assert_int(rv_, ==, 0); \ raft_configuration_close(&configuration_); \ } /* Start all servers in the test cluster. */ #define CLUSTER_START \ { \ int rc; \ rc = raft_fixture_start(&f->cluster); \ munit_assert_int(rc, ==, 0); \ } /* Step the cluster. */ #define CLUSTER_STEP raft_fixture_step(&f->cluster); /* Step the cluster N times. */ #define CLUSTER_STEP_N(N) \ { \ unsigned i_; \ for (i_ = 0; i_ < N; i_++) { \ raft_fixture_step(&f->cluster); \ } \ } /* Step until the given function becomes true. */ #define CLUSTER_STEP_UNTIL(FUNC, ARG, MSECS) \ { \ bool done_; \ done_ = raft_fixture_step_until(&f->cluster, FUNC, ARG, MSECS); \ munit_assert_true(done_); \ } /* Step the cluster until a leader is elected or #MAX_MSECS have elapsed. */ #define CLUSTER_STEP_UNTIL_ELAPSED(MSECS) \ raft_fixture_step_until_elapsed(&f->cluster, MSECS) /* Step the cluster until a leader is elected or #MAX_MSECS have elapsed. */ #define CLUSTER_STEP_UNTIL_HAS_LEADER(MAX_MSECS) \ { \ bool done; \ done = raft_fixture_step_until_has_leader(&f->cluster, MAX_MSECS); \ munit_assert_true(done); \ munit_assert_true(CLUSTER_HAS_LEADER); \ } /* Step the cluster until there's no leader or #MAX_MSECS have elapsed. */ #define CLUSTER_STEP_UNTIL_HAS_NO_LEADER(MAX_MSECS) \ { \ bool done; \ done = raft_fixture_step_until_has_no_leader(&f->cluster, MAX_MSECS); \ munit_assert_true(done); \ munit_assert_false(CLUSTER_HAS_LEADER); \ } /* Step the cluster until the given index was applied by the given server (or * all if N) or #MAX_MSECS have elapsed. */ #define CLUSTER_STEP_UNTIL_APPLIED(I, INDEX, MAX_MSECS) \ { \ bool done; \ done = \ raft_fixture_step_until_applied(&f->cluster, I, INDEX, MAX_MSECS); \ munit_assert_true(done); \ } /* Step the cluster until the state of the server with the given index matches * the given value, or #MAX_MSECS have elapsed. */ #define CLUSTER_STEP_UNTIL_STATE_IS(I, STATE, MAX_MSECS) \ { \ bool done; \ done = raft_fixture_step_until_state_is(&f->cluster, I, STATE, \ MAX_MSECS); \ munit_assert_true(done); \ } /* Step the cluster until the term of the server with the given index matches * the given value, or #MAX_MSECS have elapsed. 
*/ #define CLUSTER_STEP_UNTIL_TERM_IS(I, TERM, MAX_MSECS) \ { \ bool done; \ done = \ raft_fixture_step_until_term_is(&f->cluster, I, TERM, MAX_MSECS); \ munit_assert_true(done); \ } /* Step the cluster until server I has voted for server J, or #MAX_MSECS have * elapsed. */ #define CLUSTER_STEP_UNTIL_VOTED_FOR(I, J, MAX_MSECS) \ { \ bool done; \ done = \ raft_fixture_step_until_voted_for(&f->cluster, I, J, MAX_MSECS); \ munit_assert_true(done); \ } /* Step the cluster until all messages from server I to server J have been * delivered, or #MAX_MSECS elapse. */ #define CLUSTER_STEP_UNTIL_DELIVERED(I, J, MAX_MSECS) \ { \ bool done; \ done = \ raft_fixture_step_until_delivered(&f->cluster, I, J, MAX_MSECS); \ munit_assert_true(done); \ } /* Request to apply an FSM command to add the given value to x. */ #define CLUSTER_APPLY_ADD_X(I, REQ, VALUE, CB) \ { \ struct raft_buffer buf_; \ struct raft *raft_; \ int rv_; \ FsmEncodeAddX(VALUE, &buf_); \ raft_ = raft_fixture_get(&f->cluster, I); \ rv_ = raft_apply(raft_, REQ, &buf_, NULL, 1, CB); \ munit_assert_int(rv_, ==, 0); \ } /* Kill the I'th server. */ #define CLUSTER_KILL(I) raft_fixture_kill(&f->cluster, I); /* Revive the I'th server */ #define CLUSTER_REVIVE(I) raft_fixture_revive(&f->cluster, I); /* Kill the leader. */ #define CLUSTER_KILL_LEADER CLUSTER_KILL(CLUSTER_LEADER) /* Kill a majority of servers, except the leader (if there is one). */ #define CLUSTER_KILL_MAJORITY \ { \ size_t i2; \ size_t n; \ for (i2 = 0, n = 0; n < (CLUSTER_N / 2) + 1; i2++) { \ if (i2 == CLUSTER_LEADER) { \ continue; \ } \ CLUSTER_KILL(i2) \ n++; \ } \ } /* Grow the cluster adding one server. */ #define CLUSTER_GROW \ { \ int rv_; \ FsmInit(&f->fsms[CLUSTER_N], 2); \ rv_ = raft_fixture_grow(&f->cluster, &f->fsms[CLUSTER_N]); \ munit_assert_int(rv_, ==, 0); \ } /* Add a new pristine server to the cluster, connected to all others. Then * submit a request to add it to the configuration as an idle server. */ #define CLUSTER_ADD(REQ) \ { \ int rc; \ struct raft *new_raft; \ CLUSTER_GROW; \ rc = raft_start(CLUSTER_RAFT(CLUSTER_N - 1)); \ munit_assert_int(rc, ==, 0); \ new_raft = CLUSTER_RAFT(CLUSTER_N - 1); \ rc = raft_add(CLUSTER_RAFT(CLUSTER_LEADER), REQ, new_raft->id, \ new_raft->address, NULL); \ munit_assert_int(rc, ==, 0); \ } /* Assign the given role to the server that was added last. */ #define CLUSTER_ASSIGN(REQ, ROLE) \ do { \ unsigned _id; \ int _rv; \ _id = CLUSTER_N; /* Last server that was added. */ \ _rv = raft_assign(CLUSTER_RAFT(CLUSTER_LEADER), REQ, _id, ROLE, NULL); \ munit_assert_int(_rv, ==, 0); \ } while (0) /* Ensure that the cluster can make progress from the current state. * * - If no leader is present, wait for one to be elected. * - Submit a request to apply a new FSM command and wait for it to complete. */ #define CLUSTER_MAKE_PROGRESS \ { \ struct raft_apply *req_ = munit_malloc(sizeof *req_); \ if (!(CLUSTER_HAS_LEADER)) { \ CLUSTER_STEP_UNTIL_HAS_LEADER(10000); \ } \ CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, req_, 1, NULL); \ CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_LEADER, req_->index, 3000); \ free(req_); \ } /* Elect the I'th server. */ #define CLUSTER_ELECT(I) raft_fixture_elect(&f->cluster, I) /* Start to elect the I'th server. */ #define CLUSTER_START_ELECT(I) raft_fixture_start_elect(&f->cluster, I) /* Depose the current leader */ #define CLUSTER_DEPOSE raft_fixture_depose(&f->cluster) /* Disconnect I from J. */ #define CLUSTER_DISCONNECT(I, J) raft_fixture_disconnect(&f->cluster, I, J) /* Reconnect I to J. 
*/ #define CLUSTER_RECONNECT(I, J) raft_fixture_reconnect(&f->cluster, I, J) /* Saturate the connection from I to J. */ #define CLUSTER_SATURATE(I, J) raft_fixture_saturate(&f->cluster, I, J) /* Saturate the connection between I and J, in both directions. */ #define CLUSTER_SATURATE_BOTHWAYS(I, J) \ CLUSTER_SATURATE(I, J); \ CLUSTER_SATURATE(J, I) /* Desaturate the connection from I to J, making messages flow again. */ #define CLUSTER_DESATURATE(I, J) raft_fixture_desaturate(&f->cluster, I, J) /* Desaturate the connection between I and J in both directions, making * messages flow again. */ #define CLUSTER_DESATURATE_BOTHWAYS(I, J) \ CLUSTER_DESATURATE(I, J); \ CLUSTER_DESATURATE(J, I) /* Set the network latency of outgoing messages of server I. */ #define CLUSTER_SET_NETWORK_LATENCY(I, MSECS) \ raft_fixture_set_network_latency(&f->cluster, I, MSECS) /* Set the disk I/O latency of server I. */ #define CLUSTER_SET_DISK_LATENCY(I, MSECS) \ raft_fixture_set_disk_latency(&f->cluster, I, MSECS) /* Set the term persisted on the I'th server. This must be called before * starting the cluster. */ #define CLUSTER_SET_TERM(I, TERM) raft_fixture_set_term(&f->cluster, I, TERM) /* Set the snapshot persisted on the I'th server. This must be called before * starting the cluster. */ #define CLUSTER_SET_SNAPSHOT(I, LAST_INDEX, LAST_TERM, CONF_INDEX, X, Y) \ { \ struct raft_configuration configuration_; \ struct raft_snapshot *snapshot_; \ CLUSTER_CONFIGURATION(&configuration_); \ CREATE_SNAPSHOT(snapshot_, LAST_INDEX, LAST_TERM, configuration_, \ CONF_INDEX, X, Y); \ raft_fixture_set_snapshot(&f->cluster, I, snapshot_); \ } /* Add an entry to the ones persisted on the I'th server. This must be called * before starting the cluster. */ #define CLUSTER_ADD_ENTRY(I, ENTRY) \ raft_fixture_add_entry(&f->cluster, I, ENTRY) /* Return the number of messages sent by the given server. */ #define CLUSTER_N_SEND(I, TYPE) raft_fixture_n_send(&f->cluster, I, TYPE) /* Return the number of messages received by the given server. */ #define CLUSTER_N_RECV(I, TYPE) raft_fixture_n_recv(&f->cluster, I, TYPE) /* Set a fixture hook that randomizes election timeouts, disk latency and * network latency. */ #define CLUSTER_RANDOMIZE \ cluster_randomize_init(&f->cluster); \ raft_fixture_hook(&f->cluster, cluster_randomize) void cluster_randomize_init(struct raft_fixture *f); void cluster_randomize(struct raft_fixture *f, struct raft_fixture_event *event); #endif /* TEST_CLUSTER_H */ dqlite-1.16.7/test/raft/lib/dir.c000066400000000000000000000220661465252713400165120ustar00rootroot00000000000000#include "dir.h" #include <errno.h> #include <fcntl.h> #include <ftw.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/stat.h> #include <sys/statvfs.h> #include <sys/types.h> #include <unistd.h> #define SEP "/" #define TEMPLATE "raft-test-XXXXXX" #define TEST_DIR_TEMPLATE "./tmp/%s/raft-test-XXXXXX" static char *dirAll[] = {"tmpfs", "ext4", "btrfs", "xfs", "zfs", NULL}; static char *dirTmpfs[] = {"tmpfs", NULL}; static char *dirAio[] = {"btrfs", "ext4", "xfs", NULL}; static char *dirNoAio[] = {"tmpfs", "zfs", NULL}; MunitParameterEnum DirTmpfsParams[] = { {DIR_FS_PARAM, dirTmpfs}, {NULL, NULL}, }; MunitParameterEnum DirAllParams[] = { {DIR_FS_PARAM, dirAll}, {NULL, NULL}, }; MunitParameterEnum DirAioParams[] = { {DIR_FS_PARAM, dirAio}, {NULL, NULL}, }; MunitParameterEnum DirNoAioParams[] = { {DIR_FS_PARAM, dirNoAio}, {NULL, NULL}, }; /* Create a temporary directory in the given parent directory.
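 * The directory name is built from the raft-test-XXXXXX template defined * above and created with mkdtemp(3); the test aborts if creation fails.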
*/ static char *dirMakeTemp(const char *parent) { char *dir; if (parent == NULL) { return NULL; } dir = munit_malloc(strlen(parent) + strlen(SEP) + strlen(TEMPLATE) + 1); sprintf(dir, "%s%s%s", parent, SEP, TEMPLATE); if (mkdtemp(dir) == NULL) { munit_error(strerror(errno)); } return dir; } void *DirSetUp(MUNIT_UNUSED const MunitParameter params[], MUNIT_UNUSED void *user_data) { const char *fs = munit_parameters_get(params, DIR_FS_PARAM); if (fs == NULL) { return dirMakeTemp("/tmp"); } else if (strcmp(fs, "tmpfs") == 0) { return DirTmpfsSetUp(params, user_data); } else if (strcmp(fs, "ext4") == 0) { return DirExt4SetUp(params, user_data); } else if (strcmp(fs, "btrfs") == 0) { return DirBtrfsSetUp(params, user_data); } else if (strcmp(fs, "zfs") == 0) { return DirZfsSetUp(params, user_data); } else if (strcmp(fs, "xfs") == 0) { return DirXfsSetUp(params, user_data); } munit_errorf("Unsupported file system %s", fs); return NULL; } void *DirTmpfsSetUp(MUNIT_UNUSED const MunitParameter params[], MUNIT_UNUSED void *user_data) { return dirMakeTemp(getenv("RAFT_TMP_TMPFS")); } void *DirExt4SetUp(MUNIT_UNUSED const MunitParameter params[], MUNIT_UNUSED void *user_data) { return dirMakeTemp(getenv("RAFT_TMP_EXT4")); } void *DirBtrfsSetUp(MUNIT_UNUSED const MunitParameter params[], MUNIT_UNUSED void *user_data) { return dirMakeTemp(getenv("RAFT_TMP_BTRFS")); } void *DirZfsSetUp(MUNIT_UNUSED const MunitParameter params[], MUNIT_UNUSED void *user_data) { return dirMakeTemp(getenv("RAFT_TMP_ZFS")); } void *DirXfsSetUp(MUNIT_UNUSED const MunitParameter params[], MUNIT_UNUSED void *user_data) { return dirMakeTemp(getenv("RAFT_TMP_XFS")); } /* Wrapper around remove(), compatible with nftw(). */ static int dirRemoveFn(const char *path, MUNIT_UNUSED const struct stat *sbuf, MUNIT_UNUSED int type, MUNIT_UNUSED struct FTW *ftwb) { return remove(path); } static void dirRemove(char *dir) { int rv; rv = chmod(dir, 0755); munit_assert_int(rv, ==, 0); rv = nftw(dir, dirRemoveFn, 10, FTW_DEPTH | FTW_MOUNT | FTW_PHYS); munit_assert_int(rv, ==, 0); } static bool dirExists(const char *dir) { struct stat sb; int rv; rv = stat(dir, &sb); if (rv == -1) { munit_assert_int(errno, ==, ENOENT); return false; } return true; } void DirTearDown(void *data) { char *dir = data; if (dir == NULL) { return; } if (dirExists(dir)) { dirRemove(dir); } free(dir); } /* Join the given @dir and @filename into @path.
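 * The destination buffer is assumed to be large enough; callers in this file * all use fixed 256-byte path buffers.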
*/ static void joinPath(const char *dir, const char *filename, char *path) { strcpy(path, dir); strcat(path, "/"); strcat(path, filename); } void DirWriteFile(const char *dir, const char *filename, const void *buf, const size_t n) { char path[256]; int fd; int rv; joinPath(dir, filename, path); fd = open(path, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR); munit_assert_int(fd, !=, -1); rv = write(fd, buf, n); munit_assert_int(rv, ==, n); close(fd); } void DirWriteFileWithZeros(const char *dir, const char *filename, const size_t n) { void *buf = munit_malloc(n); DirWriteFile(dir, filename, buf, n); free(buf); } void DirOverwriteFile(const char *dir, const char *filename, const void *buf, const size_t n, const off_t whence) { char path[256]; int fd; int rv; off_t size; joinPath(dir, filename, path); fd = open(path, O_RDWR, S_IRUSR | S_IWUSR); munit_assert_int(fd, !=, -1); /* Get the size of the file */ size = lseek(fd, 0, SEEK_END); if (whence == 0) { munit_assert_int(size, >=, n); lseek(fd, 0, SEEK_SET); } else if (whence > 0) { munit_assert_int(whence, <=, size); munit_assert_int(size - whence, >=, n); lseek(fd, whence, SEEK_SET); } else { munit_assert_int(-whence, <=, size); munit_assert_int(-whence, >=, n); lseek(fd, whence, SEEK_END); } rv = write(fd, buf, n); munit_assert_int(rv, ==, n); close(fd); } void DirTruncateFile(const char *dir, const char *filename, const size_t n) { char path[256]; int fd; int rv; joinPath(dir, filename, path); fd = open(path, O_RDWR, S_IRUSR | S_IWUSR); munit_assert_int(fd, !=, -1); rv = ftruncate(fd, n); munit_assert_int(rv, ==, 0); rv = close(fd); munit_assert_int(rv, ==, 0); } void DirGrowFile(const char *dir, const char *filename, const size_t n) { char path[256]; int fd; struct stat sb; void *buf; size_t size; int rv; joinPath(dir, filename, path); fd = open(path, O_RDWR, S_IRUSR | S_IWUSR); munit_assert_int(fd, !=, -1); rv = fstat(fd, &sb); munit_assert_int(rv, ==, 0); munit_assert_int(sb.st_size, <=, n); /* Fill with zeros. 
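 * The file is extended from its current size up to @n bytes.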
*/ lseek(fd, sb.st_size, SEEK_SET); size = n - sb.st_size; buf = munit_malloc(size); rv = write(fd, buf, size); munit_assert_int(rv, ==, size); free(buf); rv = close(fd); munit_assert_int(rv, ==, 0); } void DirRenameFile(const char *dir, const char *filename1, const char *filename2) { char path1[256]; char path2[256]; int rv; joinPath(dir, filename1, path1); joinPath(dir, filename2, path2); rv = rename(path1, path2); munit_assert_int(rv, ==, 0); } void DirRemoveFile(const char *dir, const char *filename) { char path[256]; int rv; joinPath(dir, filename, path); rv = unlink(path); munit_assert_int(rv, ==, 0); } void DirReadFile(const char *dir, const char *filename, void *buf, const size_t n) { char path[256]; int fd; int rv; joinPath(dir, filename, path); fd = open(path, O_RDONLY); if (fd == -1) { munit_logf(MUNIT_LOG_ERROR, "read file '%s': %s", path, strerror(errno)); } rv = read(fd, buf, n); munit_assert_int(rv, ==, n); close(fd); } void DirMakeUnexecutable(const char *dir) { int rv; rv = chmod(dir, 0); munit_assert_int(rv, ==, 0); } void DirMakeUnwritable(const char *dir) { int rv; rv = chmod(dir, 0500); munit_assert_int(rv, ==, 0); } void DirMakeFileUnreadable(const char *dir, const char *filename) { char path[256]; int rv; joinPath(dir, filename, path); rv = chmod(path, 0); munit_assert_int(rv, ==, 0); } bool DirHasFile(const char *dir, const char *filename) { char path[256]; int fd; joinPath(dir, filename, path); fd = open(path, O_RDONLY); if (fd == -1) { munit_assert_true(errno == ENOENT || errno == EACCES); return false; } close(fd); return true; } void DirFill(const char *dir, const size_t n) { char path[256]; const char *filename = ".fill"; struct statvfs fs; size_t size; int fd; int rv; rv = statvfs(dir, &fs); munit_assert_int(rv, ==, 0); size = fs.f_bsize * fs.f_bavail; if (n > 0) { munit_assert_int(size, >=, n); } joinPath(dir, filename, path); fd = open(path, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR); munit_assert_int(fd, !=, -1); rv = posix_fallocate(fd, 0, size - n); munit_assert_int(rv, ==, 0); /* If n is zero, make sure any further write fails with ENOSPC */ if (n == 0) { char buf[4096]; int i; rv = lseek(fd, 0, SEEK_END); munit_assert_int(rv, !=, -1); for (i = 0; i < 40; i++) { rv = write(fd, buf, sizeof buf); if (rv < 0) { break; } } munit_assert_int(rv, ==, -1); munit_assert_int(errno, ==, ENOSPC); } close(fd); } dqlite-1.16.7/test/raft/lib/dir.h000066400000000000000000000121471465252713400165160ustar00rootroot00000000000000/* Test directory utilities. * * This module sports helpers to create temporary directories backed by various * file systems, read/write files in them, check for the presence of files * etc. */ #ifndef TEST_DIR_H #define TEST_DIR_H #include #include "munit.h" /* Munit parameter defining the file system type backing the temporary directory * created by test_dir_setup(). * * The various file systems must have been previously setup with the fs.sh * script. */ #define DIR_FS_PARAM "dir-fs" #define FIXTURE_DIR char *dir #define SET_UP_DIR \ f->dir = DirSetUp(params, user_data); \ if (f->dir == NULL) { /* Fs not available, test must be skipped. */ \ free(f); \ return NULL; \ } #define TEAR_DOWN_DIR DirTearDown(f->dir) /* Contain a single DIR_FS_PARAM parameter set to all supported file system * types. */ extern MunitParameterEnum DirAllParams[]; /* Contain a single DIR_FS_PARAM parameter set to tmpfs. */ extern MunitParameterEnum DirTmpfsParams[]; /* Contain a single DIR_FS_PARAM parameter set to all file systems with * proper AIO support (i.e. NOWAIT works). 
*/ extern MunitParameterEnum DirAioParams[]; /* Contain a single DIR_FS_PARAM parameter set to all file systems without * proper AIO support (i.e. NOWAIT does not work). */ extern MunitParameterEnum DirNoAioParams[]; /* Create a temporary test directory. * * Return a pointer to the path of the created directory. */ void *DirSetUp(const MunitParameter params[], void *user_data); /* Create a temporary test directory backed by tmpfs. * * Return a pointer to the path of the created directory, or NULL if no tmpfs * file system is available. */ void *DirTmpfsSetUp(const MunitParameter params[], void *user_data); /* Create a temporary test directory backed by ext4. * * Return a pointer to the path of the created directory, or NULL if no ext4 * file system is available. */ void *DirExt4SetUp(const MunitParameter params[], void *user_data); /* Create a temporary test directory backed by btrfs. * * Return a pointer to the path of the created directory, or NULL if no btrfs * file system is available. */ void *DirBtrfsSetUp(const MunitParameter params[], void *user_data); /* Create a temporary test directory backed by zfs. * * Return a pointer to the path of the created directory, or NULL if no zfs * file system is available. */ void *DirZfsSetUp(const MunitParameter params[], void *user_data); /* Create a temporary test directory backed by xfs. * * Return a pointer to the path of the created directory, or NULL if no xfs * file system is available. */ void *DirXfsSetUp(const MunitParameter params[], void *user_data); /* Recursively remove a temporary directory. */ void DirTearDown(void *data); /* Write the given @buf to the given @filename in the given @dir. */ void DirWriteFile(const char *dir, const char *filename, const void *buf, const size_t n); /* Create the given @filename and fill it with zeros. */ void DirWriteFileWithZeros(const char *dir, const char *filename, const size_t n); /* Overwrite @n bytes of the given file with the given @buf data. * * If @whence is zero, overwrite the first @n bytes of the file. If @whence is * positive overwrite the @n bytes starting at offset @whence. If @whence is * negative overwrite @n bytes starting at @whence bytes from the end of the * file. */ void DirOverwriteFile(const char *dir, const char *filename, const void *buf, const size_t n, const off_t whence); /* Truncate the given file, leaving only the first @n bytes. */ void DirTruncateFile(const char *dir, const char *filename, const size_t n); /* Grow the given file to the given size, filling the new bytes with zeros. */ void DirGrowFile(const char *dir, const char *filename, const size_t n); /* Rename a file in the given directory from filename1 to filename2. */ void DirRenameFile(const char *dir, const char *filename1, const char *filename2); /* Remove a file. */ void DirRemoveFile(const char *dir, const char *filename); /* Read into @buf the content of the given @filename in the given @dir. */ void DirReadFile(const char *dir, const char *filename, void *buf, const size_t n); /* Make the given directory not executable, so files can't be opened. */ void DirMakeUnexecutable(const char *dir); /* Make the given directory not writable. */ void DirMakeUnwritable(const char *dir); /* Make the given file not readable. */ void DirMakeFileUnreadable(const char *dir, const char *filename); /* Check if the given directory has the given file. */ bool DirHasFile(const char *dir, const char *filename); /* Fill the underlying file system of the given dir, leaving only n bytes free.
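 * If n is zero, the function additionally verifies that a subsequent write * fails with ENOSPC (see the implementation in dir.c).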
*/ void DirFill(const char *dir, const size_t n); #endif /* TEST_DIR_H */ dqlite-1.16.7/test/raft/lib/fault.c000066400000000000000000000024501465252713400170420ustar00rootroot00000000000000#include "fault.h" #include "munit.h" void FaultInit(struct Fault *f) { f->countdown = -1; f->n = -1; f->paused = false; } bool FaultTick(struct Fault *f) { if (MUNIT_UNLIKELY(f->paused)) { return false; } /* If the initial delay parameter was set to -1, then never fail. This is * the most common case. */ if (MUNIT_LIKELY(f->countdown < 0)) { return false; } /* If we did not yet reach 'delay' ticks, then just decrease the countdown. */ if (f->countdown > 0) { f->countdown--; return false; } munit_assert_int(f->countdown, ==, 0); /* We reached 'delay' ticks, let's see how many times we have to trigger the * fault, if any. */ if (f->n < 0) { /* Trigger the fault forever. */ return true; } if (f->n > 0) { /* Trigger the fault at least this time. */ f->n--; return true; } munit_assert_int(f->n, ==, 0); /* We reached 'repeat' ticks, let's stop triggering the fault. */ f->countdown--; return false; } void FaultConfig(struct Fault *f, int delay, int repeat) { f->countdown = delay; f->n = repeat; } void FaultPause(struct Fault *f) { f->paused = true; } void FaultResume(struct Fault *f) { f->paused = false; } dqlite-1.16.7/test/raft/lib/fault.h000066400000000000000000000016041465252713400170470ustar00rootroot00000000000000/* Helper for test components supporting fault injection. */ #ifndef TEST_FAULT_H #define TEST_FAULT_H #include /* Information about a fault that should occur in a component. */ struct Fault { int countdown; /* Trigger the fault when this counter gets to zero. */ int n; /* Repeat the fault this many times. Default is -1. */ bool paused; /* Pause fault triggering. */ }; /* Initialize a fault. */ void FaultInit(struct Fault *f); /* Advance the counters of the fault. Return true if the fault should be * triggered, false otherwise. */ bool FaultTick(struct Fault *f); /* Configure the fault with the given values. */ void FaultConfig(struct Fault *f, int delay, int repeat); /* Pause triggering configured faults. */ void FaultPause(struct Fault *f); /* Resume triggering configured faults. */ void FaultResume(struct Fault *f); #endif /* TESTFAULT_H */ dqlite-1.16.7/test/raft/lib/fs.sh000077500000000000000000000044341465252713400165360ustar00rootroot00000000000000#!/bin/sh -e # Setup loopback disk devices to test the raft I/O implementation against # various file systems. usage() { echo "usage: $0 setup|teardown [types]" } if [ "${#}" -lt 1 ]; then usage exit 1 fi cmd="${1}" shift types="tmpfs" # Check if loop devices are available, we might be running inside an # unprivileged container if sudo losetup -f > /dev/null 2>&1; then types="$types ext4" if [ "$(which mkfs.btrfs)" != "" ]; then types="$types btrfs" fi if [ "$(which mkfs.xfs)" != "" ]; then types="$types xfs" fi if [ "$(which zfs)" != "" ]; then types="$types zfs" fi if [ "${#}" -gt 0 ]; then types="${@}" fi fi if [ "${cmd}" = "detect" ]; then vars="" for type in $types; do vars="${vars}RAFT_TMP_$(echo ${type} | tr [a-z] [A-Z])=./tmp/${type} " done echo $vars exit 0 fi if [ "${cmd}" = "setup" ]; then mkdir ./tmp for type in $types; do echo -n "Creating $type loop device mount..." # Create the fs mount point mkdir "./tmp/${type}" if [ "$type" = "tmpfs" ]; then # For tmpfs we don't need a loopback disk device. 
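# A plain size-capped mount is all that is needed (see size=32m below).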
sudo mount -t tmpfs -o size=32m tmpfs ./tmp/tmpfs else # Create a loopback disk device dd if=/dev/zero of="./tmp/.${type}" bs=4096 count=28672 > /dev/null 2>&1 loop=$(sudo losetup -f) sudo losetup "${loop}" "./tmp/.${type}" # Initialize the file system if [ "$type" = "zfs" ]; then sudo zpool create raft "${loop}" sudo zfs create -o mountpoint=$(pwd)/tmp/zfs raft/zfs else sudo mkfs.${type} "${loop}" > /dev/null 2>&1 sudo mount "${loop}" "./tmp/${type}" fi fi sudo chown $USER "./tmp/${type}" echo " done" done exit 0 fi if [ "${cmd}" = "teardown" ]; then for type in $types; do echo -n "Deleting $type loop device mount..." sudo umount "./tmp/${type}" rm -rf "./tmp/${type}" if [ "$type" != "tmpfs" ]; then # For zfs we need to destroy the pool if [ "$type" = "zfs" ]; then sudo zpool destroy raft fi # For regular file systems, remove the loopback disk device. loop=$(sudo losetup -a | grep ".${type}" | cut -f 1 -d :) sudo losetup -d "${loop}" rm "./tmp/.${type}" fi echo " done" done rmdir ./tmp exit 0 fi usage exit 1 dqlite-1.16.7/test/raft/lib/fsm.c000066400000000000000000000146121465252713400165170ustar00rootroot00000000000000#include "fsm.h" #include "../../../src/raft/byte.h" #include "munit.h" /* In-memory implementation of the raft_fsm interface. */ struct fsm { int x; int y; int lock; void *data; }; /* Command codes */ enum { SET_X = 1, SET_Y, ADD_X, ADD_Y }; static int fsmApply(struct raft_fsm *fsm, const struct raft_buffer *buf, void **result) { struct fsm *f = fsm->data; const void *cursor = buf->base; unsigned command; int value; if (buf->len != 16) { return -1; } command = (unsigned)byteGet64(&cursor); value = (int)byteGet64(&cursor); switch (command) { case SET_X: f->x = value; break; case SET_Y: f->y = value; break; case ADD_X: f->x += value; break; case ADD_Y: f->y += value; break; default: return -1; } *result = NULL; return 0; } static int fsmRestore(struct raft_fsm *fsm, struct raft_buffer *buf) { struct fsm *f = fsm->data; const void *cursor = buf->base; munit_assert_int(buf->len, ==, sizeof(uint64_t) * 2); f->x = byteGet64(&cursor); f->y = byteGet64(&cursor); raft_free(buf->base); return 0; } static int fsmEncodeSnapshot(int x, int y, struct raft_buffer *bufs[], unsigned *n_bufs) { struct raft_buffer *buf; void *cursor; *n_bufs = 1; *bufs = raft_malloc(sizeof **bufs); if (*bufs == NULL) { return RAFT_NOMEM; } buf = &(*bufs)[0]; buf->len = sizeof(uint64_t) * 2; buf->base = raft_malloc(buf->len); if (buf->base == NULL) { return RAFT_NOMEM; } cursor = (*bufs)[0].base; bytePut64(&cursor, x); bytePut64(&cursor, y); return 0; } /* For use with fsm->version 1 */ static int fsmSnapshot_v1(struct raft_fsm *fsm, struct raft_buffer *bufs[], unsigned *n_bufs) { struct fsm *f = fsm->data; return fsmEncodeSnapshot(f->x, f->y, bufs, n_bufs); } /* For use with fsmSnapshotFinalize and fsm->version >= 2 */ static int fsmSnapshot_v2(struct raft_fsm *fsm, struct raft_buffer *bufs[], unsigned *n_bufs) { struct fsm *f = fsm->data; munit_assert_int(f->lock, ==, 0); f->lock = 1; f->data = raft_malloc(8); /* Detect proper cleanup in finalize */ munit_assert_ptr_not_null(f->data); return fsmEncodeSnapshot(f->x, f->y, bufs, n_bufs); } static int fsmSnapshotInitialize(struct raft_fsm *fsm, struct raft_buffer *bufs[], unsigned *n_bufs) { (void)bufs; (void)n_bufs; struct fsm *f = fsm->data; munit_assert_int(f->lock, ==, 0); f->lock = 1; munit_assert_ptr_null(f->data); f->data = raft_malloc(8); /* Detect proper cleanup in finalize */ munit_assert_ptr_not_null(f->data); return 0; } static int 
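/* Registered as fsm->snapshot_async by FsmInitAsync below: it runs after
 * fsmSnapshotInitialize has taken the lock, and performs the actual snapshot
 * encoding. */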
fsmSnapshotAsync(struct raft_fsm *fsm,
		 struct raft_buffer *bufs[],
		 unsigned *n_bufs)
{
	struct fsm *f = fsm->data;
	return fsmEncodeSnapshot(f->x, f->y, bufs, n_bufs);
}

static int fsmSnapshotFinalize(struct raft_fsm *fsm,
			       struct raft_buffer *bufs[],
			       unsigned *n_bufs)
{
	struct fsm *f = fsm->data;

	if (*bufs != NULL) {
		for (unsigned i = 0; i < *n_bufs; ++i) {
			raft_free((*bufs)[i].base);
		}
		raft_free(*bufs);
	}
	*bufs = NULL;
	*n_bufs = 0;
	munit_assert_int(f->lock, ==, 1);
	f->lock = 0;
	munit_assert_ptr_not_null(f->data);
	raft_free(f->data);
	f->data = NULL;
	return 0;
}

void FsmInit(struct raft_fsm *fsm, int version)
{
	struct fsm *f = munit_malloc(sizeof *f);
	memset(fsm, 'x', sizeof(*fsm)); /* Fill with garbage */

	f->x = 0;
	f->y = 0;
	f->lock = 0;
	f->data = NULL;

	fsm->version = version;
	fsm->data = f;
	fsm->apply = fsmApply;
	fsm->snapshot = fsmSnapshot_v1;
	fsm->restore = fsmRestore;

	if (version > 1) {
		fsm->snapshot = fsmSnapshot_v2;
		fsm->snapshot_finalize = fsmSnapshotFinalize;
		fsm->snapshot_async = NULL;
	}
}

void FsmInitAsync(struct raft_fsm *fsm, int version)
{
	munit_assert_int(version, >, 2);
	struct fsm *f = munit_malloc(sizeof *f);
	memset(fsm, 'x', sizeof(*fsm)); /* Fill with garbage */

	f->x = 0;
	f->y = 0;
	f->lock = 0;
	f->data = NULL;

	fsm->version = version;
	fsm->data = f;
	fsm->apply = fsmApply;
	fsm->snapshot = fsmSnapshotInitialize;
	fsm->snapshot_async = fsmSnapshotAsync;
	fsm->snapshot_finalize = fsmSnapshotFinalize;
	fsm->restore = fsmRestore;
}

void FsmClose(struct raft_fsm *fsm)
{
	struct fsm *f = fsm->data;
	free(f);
}

void FsmEncodeSetX(const int value, struct raft_buffer *buf)
{
	void *cursor;

	buf->base = raft_malloc(16);
	buf->len = 16;
	munit_assert_ptr_not_null(buf->base);

	cursor = buf->base;
	bytePut64(&cursor, SET_X);
	bytePut64(&cursor, value);
}

void FsmEncodeAddX(const int value, struct raft_buffer *buf)
{
	void *cursor;

	buf->base = raft_malloc(16);
	buf->len = 16;
	munit_assert_ptr_not_null(buf->base);

	cursor = buf->base;
	bytePut64(&cursor, ADD_X);
	bytePut64(&cursor, value);
}

void FsmEncodeSetY(const int value, struct raft_buffer *buf)
{
	void *cursor;

	buf->base = raft_malloc(16);
	buf->len = 16;
	munit_assert_ptr_not_null(buf->base);

	cursor = buf->base;
	bytePut64(&cursor, SET_Y);
	bytePut64(&cursor, value);
}

void FsmEncodeAddY(const int value, struct raft_buffer *buf)
{
	void *cursor;

	buf->base = raft_malloc(16);
	buf->len = 16;
	munit_assert_ptr_not_null(buf->base);

	cursor = buf->base;
	bytePut64(&cursor, ADD_Y);
	bytePut64(&cursor, value);
}

void FsmEncodeSnapshot(int x,
		       int y,
		       struct raft_buffer *bufs[],
		       unsigned *n_bufs)
{
	int rc;
	rc = fsmEncodeSnapshot(x, y, bufs, n_bufs);
	munit_assert_int(rc, ==, 0);
}

int FsmGetX(struct raft_fsm *fsm)
{
	struct fsm *f = fsm->data;
	return f->x;
}

int FsmGetY(struct raft_fsm *fsm)
{
	struct fsm *f = fsm->data;
	return f->y;
}
dqlite-1.16.7/test/raft/lib/fsm.h000066400000000000000000000022741465252713400165250ustar00rootroot00000000000000/* Test implementation of the raft_fsm interface, with fault injection.
 *
 * The test FSM supports four commands: setting x, setting y, adding to x and
 * adding to y. */

#ifndef TEST_FSM_H
#define TEST_FSM_H

#include "../../../src/raft.h"

void FsmInit(struct raft_fsm *fsm, int version);

/* Same as FsmInit but with asynchronous snapshots */
void FsmInitAsync(struct raft_fsm *fsm, int version);

void FsmClose(struct raft_fsm *fsm);

/* Encode a command to set x to the given value. */
void FsmEncodeSetX(int value, struct raft_buffer *buf);

/* Encode a command to add the given value to x.
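 */

/* A sketch of the resulting wire format (this helper is illustrative, not
 * part of the original header): each encoded command is 16 bytes, a 64-bit
 * command code followed by a 64-bit value, written with bytePut64 and decoded
 * by fsmApply in fsm.c. */
static inline void fsmEncodeExample(void)
{
	struct raft_buffer buf;
	FsmEncodeSetX(42, &buf); /* buf.len == 16, buf.base heap-allocated */
	raft_free(buf.base);
}

/* FsmEncodeAddX, declared below, produces the "add to x" command.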
 */
void FsmEncodeAddX(int value, struct raft_buffer *buf);

/* Encode a command to set y to the given value. */
void FsmEncodeSetY(int value, struct raft_buffer *buf);

/* Encode a command to add the given value to y. */
void FsmEncodeAddY(int value, struct raft_buffer *buf);

/* Encode a snapshot of an FSM with the given values for x and y. */
void FsmEncodeSnapshot(int x,
		       int y,
		       struct raft_buffer *bufs[],
		       unsigned *n_bufs);

/* Return the current value of x or y. */
int FsmGetX(struct raft_fsm *fsm);
int FsmGetY(struct raft_fsm *fsm);

#endif /* TEST_FSM_H */
dqlite-1.16.7/test/raft/lib/heap.c000066400000000000000000000052201465252713400166420ustar00rootroot00000000000000#include "heap.h"

#include <stdlib.h>

#include "fault.h"
#include "munit.h"

struct heap {
	size_t alignment;   /* Value of last aligned alloc */
	struct Fault fault; /* Fault trigger. */
};

static void heapInit(struct heap *h)
{
	h->alignment = 0;
	FaultInit(&h->fault);
}

static void *heapMalloc(void *data, size_t size)
{
	struct heap *h = data;
	if (FaultTick(&h->fault)) {
		return NULL;
	}
	return munit_malloc(size);
}

static void heapFree(void *data, void *ptr)
{
	(void)data;
	free(ptr);
}

static void *heapCalloc(void *data, size_t nmemb, size_t size)
{
	struct heap *h = data;
	if (FaultTick(&h->fault)) {
		return NULL;
	}
	return munit_calloc(nmemb, size);
}

static void *heapRealloc(void *data, void *ptr, size_t size)
{
	struct heap *h = data;
	if (FaultTick(&h->fault)) {
		return NULL;
	}
	ptr = realloc(ptr, size);
	if (size == 0) {
		munit_assert_ptr_null(ptr);
	} else {
		munit_assert_ptr_not_null(ptr);
	}
	return ptr;
}

static void *heapAlignedAlloc(void *data, size_t alignment, size_t size)
{
	struct heap *h = data;
	void *p;
	if (FaultTick(&h->fault)) {
		return NULL;
	}
	p = aligned_alloc(alignment, size);
	munit_assert_ptr_not_null(p);
	h->alignment = alignment;
	return p;
}

static void heapAlignedFree(void *data, size_t alignment, void *ptr)
{
	struct heap *h = data;
	munit_assert_ulong(alignment, ==, h->alignment);
	heapFree(data, ptr);
}

static int getIntParam(const MunitParameter params[], const char *name)
{
	const char *value = munit_parameters_get(params, name);
	return value != NULL ? atoi(value) : 0;
}

void HeapSetUp(const MunitParameter params[], struct raft_heap *h)
{
	struct heap *heap = munit_malloc(sizeof *heap);
	int delay = getIntParam(params, TEST_HEAP_FAULT_DELAY);
	int repeat = getIntParam(params, TEST_HEAP_FAULT_REPEAT);
	munit_assert_ptr_not_null(h);
	heapInit(heap);
	FaultConfig(&heap->fault, delay, repeat);
	h->data = heap;
	h->malloc = heapMalloc;
	h->free = heapFree;
	h->calloc = heapCalloc;
	h->realloc = heapRealloc;
	h->aligned_alloc = heapAlignedAlloc;
	h->aligned_free = heapAlignedFree;
	raft_heap_set(h);
	FaultPause(&heap->fault);
}

void HeapTearDown(struct raft_heap *h)
{
	struct heap *heap = h->data;
	free(heap);
	raft_heap_set_default();
}

void HeapFaultConfig(struct raft_heap *h, int delay, int repeat)
{
	struct heap *heap = h->data;
	FaultConfig(&heap->fault, delay, repeat);
}

void HeapFaultEnable(struct raft_heap *h)
{
	struct heap *heap = h->data;
	FaultResume(&heap->fault);
}
dqlite-1.16.7/test/raft/lib/heap.h000066400000000000000000000023161465252713400166520ustar00rootroot00000000000000/* Add support for fault injection and leak detection to stdlib's malloc()
 * family. */

#ifndef TEST_HEAP_H
#define TEST_HEAP_H

#include "../../../src/raft.h"
#include "munit.h"

/* Munit parameter defining after how many API calls the test raft_heap
 * implementation should start failing and return errors. The default is -1,
 * meaning that no failure will ever occur.
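 *
 * Note that HeapSetUp arms the fault but leaves it paused: the delay only
 * starts counting once the test resumes it via HEAP_FAULT_ENABLE (see
 * HeapFaultEnable above).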
 */
#define TEST_HEAP_FAULT_DELAY "heap-fault-delay"

/* Munit parameter defining how many consecutive times API calls against the
 * test raft_heap implementation should keep failing after they started
 * failing. This parameter has an effect only if 'heap-fault-delay' is 0 or
 * greater. The default is 1, and -1 means "keep failing forever". */
#define TEST_HEAP_FAULT_REPEAT "heap-fault-repeat"

/* Macro helpers. */
#define FIXTURE_HEAP struct raft_heap heap

#define SET_UP_HEAP HeapSetUp(params, &f->heap)

#define TEAR_DOWN_HEAP HeapTearDown(&f->heap)

#define HEAP_FAULT_ENABLE HeapFaultEnable(&f->heap)

void HeapSetUp(const MunitParameter params[], struct raft_heap *h);

void HeapTearDown(struct raft_heap *h);

void HeapFaultConfig(struct raft_heap *h, int delay, int repeat);

void HeapFaultEnable(struct raft_heap *h);

#endif /* TEST_HEAP_H */
dqlite-1.16.7/test/raft/lib/loop.c000066400000000000000000000002301465252713400166720ustar00rootroot00000000000000#include "loop.h"

void test_loop_walk_cb(uv_handle_t *handle, void *arg)
{
	(void)arg;
	munit_logf(MUNIT_LOG_INFO, "handle %d", handle->type);
}
dqlite-1.16.7/test/raft/lib/loop.h000066400000000000000000000136021465252713400167060ustar00rootroot00000000000000/* Add support for using the libuv loop in tests. */

#ifndef TEST_LOOP_H
#define TEST_LOOP_H

#include <uv.h>

#include "../../../src/raft.h"
#include "munit.h"

/* Max n. of loop iterations run by a single function call */
#define LOOP_MAX_RUN 20

#define FIXTURE_LOOP struct uv_loop_s loop

/* Older libuv versions might try to free() memory that was not allocated. */
#if HAVE_DECL_UV_FS_O_CREAT
#define LOOP_REPLACE_ALLOCATOR                                             \
	_rv = uv_replace_allocator(raft_malloc, raft_realloc, raft_calloc, \
				   raft_free);                             \
	munit_assert_int(_rv, ==, 0)
#else
#define LOOP_REPLACE_ALLOCATOR
#endif

#define SETUP_LOOP                                \
	{                                         \
		int _rv;                          \
		LOOP_REPLACE_ALLOCATOR;           \
		_rv = uv_loop_init(&f->loop);     \
		munit_assert_int(_rv, ==, 0);     \
	}

#define TEAR_DOWN_LOOP                                                     \
	{                                                                  \
		int rv_;                                                   \
		int alive_ = uv_loop_alive(&f->loop);                      \
		if (alive_ != 0) {                                         \
			LOOP_STOP;                                         \
		}                                                          \
		rv_ = uv_loop_close(&f->loop);                             \
		if (rv_ != 0) {                                            \
			uv_walk(&f->loop, test_loop_walk_cb, NULL);        \
			munit_errorf("uv_loop_close: %s (%d)",             \
				     uv_strerror(rv_), rv_);               \
		}                                                          \
		rv_ = uv_replace_allocator(malloc, realloc, calloc, free); \
		munit_assert_int(rv_, ==, 0);                              \
	}

/* Run the loop until there are no pending active handles or the given number
 * of iterations is reached. */
#define LOOP_RUN(N)                                                    \
	{                                                              \
		unsigned i__;                                          \
		int rv__;                                              \
		for (i__ = 0; i__ < N; i__++) {                        \
			rv__ = uv_run(&f->loop, UV_RUN_ONCE);          \
			if (rv__ < 0) {                                \
				munit_errorf("uv_run: %s (%d)",        \
					     uv_strerror(rv__), rv__); \
			}                                              \
			if (rv__ == 0) {                               \
				break;                                 \
			}                                              \
		}                                                      \
	}

/* Run the loop until the value stored through the given boolean pointer is
 * true.
 *
 * If the loop exhausts all active handles or if #LOOP_MAX_RUN is reached, the
 * test fails. */
#define LOOP_RUN_UNTIL(CONDITION)                                           \
	{                                                                   \
		unsigned __i;                                               \
		int __rv;                                                   \
		for (__i = 0; __i < LOOP_MAX_RUN; __i++) {                  \
			if (*(CONDITION)) {                                 \
				break;                                      \
			}                                                   \
			__rv = uv_run(&f->loop, UV_RUN_ONCE);               \
			if (__rv < 0) {                                     \
				munit_errorf("uv_run: %s (%d)",             \
					     uv_strerror(__rv), __rv);      \
			}                                                   \
			if (__rv == 0) {                                    \
				if (*(CONDITION)) {                         \
					break;                              \
				}                                           \
				munit_errorf(                               \
				    "uv_run: stopped after %u iterations",  \
				    __i + 1);                               \
			}                                                   \
		}                                                           \
		if (!*(CONDITION)) {                                        \
			munit_errorf(                                       \
			    "uv_run: condition not met in %d iterations",   \
			    LOOP_MAX_RUN);                                  \
		}                                                           \
	}

/* Run the loop until there are no pending active handles.
 *
 * If there are still pending active handles after LOOP_MAX_RUN iterations, the
 * test will fail.
 *
 * This is meant to be used in tear down functions.
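 *
 * A fixture that uses these macros is expected to look roughly like this
 * (sketch; the struct name is invented) and to be reachable through a local
 * variable named `f`:
 *
 *     struct fixture
 *     {
 *             FIXTURE_LOOP; // expands to: struct uv_loop_s loop
 *             bool done;
 *     };
 *
 * with SETUP_LOOP in the munit setup hook, LOOP_RUN_UNTIL(&f->done) in the
 * test body and TEAR_DOWN_LOOP in the tear down hook.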
*/ #define LOOP_STOP \ { \ int alive__; \ LOOP_RUN(LOOP_MAX_RUN); \ alive__ = uv_loop_alive(&f->loop); \ if (alive__ != 0) { \ munit_error("loop has still pending active handles"); \ } \ } void test_loop_walk_cb(uv_handle_t *handle, void *arg); #endif /* TEST_LOOP_H */ dqlite-1.16.7/test/raft/lib/macros.h000066400000000000000000000004771465252713400172270ustar00rootroot00000000000000/** * Miscellaneous test macros. */ #ifndef TEST_MACROS_H_ #define TEST_MACROS_H_ #define GET_2ND_ARG(arg1, arg2, ...) arg2 #define GET_3RD_ARG(arg1, arg2, arg3, ...) arg3 #define GET_4TH_ARG(arg1, arg2, arg3, arg4, ...) arg4 #define GET_5TH_ARG(arg1, arg2, arg3, arg4, arg5, ...) arg5 #endif /* TEST_MACROS_H_ */ dqlite-1.16.7/test/raft/lib/munit.c000066400000000000000000002056461465252713400170770ustar00rootroot00000000000000/* Copyright (c) 2013-2018 Evan Nemerson * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, copy, * modify, merge, publish, distribute, sublicense, and/or sell copies * of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /*** Configuration ***/ /* This is just where the output from the test goes. It's really just * meant to let you choose stdout or stderr, but if anyone really want * to direct it to a file let me know, it would be fairly easy to * support. */ #if !defined(MUNIT_OUTPUT_FILE) # define MUNIT_OUTPUT_FILE stdout #endif /* This is a bit more useful; it tells µnit how to format the seconds in * timed tests. If your tests run for longer you might want to reduce * it, and if your computer is really fast and your tests are tiny you * can increase it. */ #if !defined(MUNIT_TEST_TIME_FORMAT) # define MUNIT_TEST_TIME_FORMAT "0.8f" #endif /* If you have long test names you might want to consider bumping * this. The result information takes 43 characters. */ #if !defined(MUNIT_TEST_NAME_LEN) # define MUNIT_TEST_NAME_LEN 37 #endif /* If you don't like the timing information, you can disable it by * defining MUNIT_DISABLE_TIMING. */ #if !defined(MUNIT_DISABLE_TIMING) # define MUNIT_ENABLE_TIMING #endif /*** End configuration ***/ #if defined(_POSIX_C_SOURCE) && (_POSIX_C_SOURCE < 200809L) # undef _POSIX_C_SOURCE #endif #if !defined(_POSIX_C_SOURCE) # define _POSIX_C_SOURCE 200809L #endif /* Solaris freaks out if you try to use a POSIX or SUS standard without * the "right" C standard. */ #if defined(_XOPEN_SOURCE) # undef _XOPEN_SOURCE #endif #if defined(__STDC_VERSION__) # if __STDC_VERSION__ >= 201112L # define _XOPEN_SOURCE 700 # elif __STDC_VERSION__ >= 199901L # define _XOPEN_SOURCE 600 # endif #endif /* Because, according to Microsoft, POSIX is deprecated. You've got * to appreciate the chutzpah. 
*/ #if defined(_MSC_VER) && !defined(_CRT_NONSTDC_NO_DEPRECATE) # define _CRT_NONSTDC_NO_DEPRECATE #endif #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) # include #elif defined(_WIN32) /* https://msdn.microsoft.com/en-us/library/tf4dy80a.aspx */ #endif #include #include #include #include #include #include #include #include #if !defined(MUNIT_NO_NL_LANGINFO) && !defined(_WIN32) #define MUNIT_NL_LANGINFO #include #include #include #endif #if !defined(_WIN32) # include # include # include #else # include # include # include # if !defined(STDERR_FILENO) # define STDERR_FILENO _fileno(stderr) # endif #endif #include "munit.h" #define MUNIT_STRINGIFY(x) #x #define MUNIT_XSTRINGIFY(x) MUNIT_STRINGIFY(x) #if defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_CC) || defined(__IBMCPP__) # define MUNIT_THREAD_LOCAL __thread #elif (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201102L)) || defined(_Thread_local) # define MUNIT_THREAD_LOCAL _Thread_local #elif defined(_WIN32) # define MUNIT_THREAD_LOCAL __declspec(thread) #endif /* MSVC 12.0 will emit a warning at /W4 for code like 'do { ... } * while (0)', or 'do { ... } while (true)'. I'm pretty sure nobody * at Microsoft compiles with /W4. */ #if defined(_MSC_VER) && (_MSC_VER <= 1800) #pragma warning(disable: 4127) #endif #if defined(_WIN32) || defined(__EMSCRIPTEN__) # define MUNIT_NO_FORK #endif #if defined(__EMSCRIPTEN__) # define MUNIT_NO_BUFFER #endif /*** Logging ***/ static MunitLogLevel munit_log_level_visible = MUNIT_LOG_INFO; static MunitLogLevel munit_log_level_fatal = MUNIT_LOG_ERROR; #if defined(MUNIT_THREAD_LOCAL) static MUNIT_THREAD_LOCAL bool munit_error_jmp_buf_valid = false; static MUNIT_THREAD_LOCAL jmp_buf munit_error_jmp_buf; #endif #if defined(MUNIT_THREAD_LOCAL) && defined(MUNIT_ALWAYS_TEAR_DOWN) static MUNIT_THREAD_LOCAL bool munit_tear_down_jmp_buf_valid = false; static MUNIT_THREAD_LOCAL jmp_buf munit_tear_down_jmp_buf; #endif /* At certain warning levels, mingw will trigger warnings about * suggesting the format attribute, which we've explicitly *not* set * because it will then choke on our attempts to use the MS-specific * I64 modifier for size_t (which we have to use since MSVC doesn't * support the C99 z modifier). */ #if defined(__MINGW32__) || defined(__MINGW64__) # pragma GCC diagnostic push # pragma GCC diagnostic ignored "-Wsuggest-attribute=format" #endif MUNIT_PRINTF(5,0) static void munit_logf_exv(MunitLogLevel level, FILE* fp, const char* filename, int line, const char* format, va_list ap) { if (level < munit_log_level_visible) return; switch (level) { case MUNIT_LOG_DEBUG: fputs("Debug", fp); break; case MUNIT_LOG_INFO: fputs("Info", fp); break; case MUNIT_LOG_WARNING: fputs("Warning", fp); break; case MUNIT_LOG_ERROR: fputs("Error", fp); break; default: munit_logf_ex(MUNIT_LOG_ERROR, filename, line, "Invalid log level (%d)", level); return; } fputs(": ", fp); if (filename != NULL) fprintf(fp, "%s:%d: ", filename, line); vfprintf(fp, format, ap); fputc('\n', fp); } MUNIT_PRINTF(3,4) static void munit_logf_internal(MunitLogLevel level, FILE* fp, const char* format, ...) { va_list ap; va_start(ap, format); munit_logf_exv(level, fp, NULL, 0, format, ap); va_end(ap); } static void munit_log_internal(MunitLogLevel level, FILE* fp, const char* message) { munit_logf_internal(level, fp, "%s", message); } void munit_logf_ex(MunitLogLevel level, const char* filename, int line, const char* format, ...) 
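/* Logs the formatted message at the given level; at or above the fatal level
 * it unwinds via longjmp() when a jump buffer is armed, and abort()s
 * otherwise (see the body below). */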
{ va_list ap; va_start(ap, format); munit_logf_exv(level, stderr, filename, line, format, ap); va_end(ap); if (level >= munit_log_level_fatal) { #if defined(MUNIT_THREAD_LOCAL) if (munit_error_jmp_buf_valid) longjmp(munit_error_jmp_buf, 1); #endif abort(); } } void munit_errorf_ex(const char* filename, int line, const char* format, ...) { va_list ap; va_start(ap, format); munit_logf_exv(MUNIT_LOG_ERROR, stderr, filename, line, format, ap); va_end(ap); #if defined(MUNIT_THREAD_LOCAL) && defined(MUNIT_ALWAYS_TEAR_DOWN) if (munit_tear_down_jmp_buf_valid) longjmp(munit_tear_down_jmp_buf, 1); #endif #if defined(MUNIT_THREAD_LOCAL) if (munit_error_jmp_buf_valid) longjmp(munit_error_jmp_buf, 1); #endif abort(); } #if defined(__MINGW32__) || defined(__MINGW64__) #pragma GCC diagnostic pop #endif #if !defined(MUNIT_STRERROR_LEN) # define MUNIT_STRERROR_LEN 80 #endif static void munit_log_errno(MunitLogLevel level, FILE* fp, const char* msg) { #if defined(MUNIT_NO_STRERROR_R) || (defined(__MINGW32__) && !defined(MINGW_HAS_SECURE_API)) munit_logf_internal(level, fp, "%s: %s (%d)", msg, strerror(errno), errno); #else char munit_error_str[MUNIT_STRERROR_LEN]; munit_error_str[0] = '\0'; #if !defined(_WIN32) strerror_r(errno, munit_error_str, MUNIT_STRERROR_LEN); #else strerror_s(munit_error_str, MUNIT_STRERROR_LEN, errno); #endif munit_logf_internal(level, fp, "%s: %s (%d)", msg, munit_error_str, errno); #endif } /*** Memory allocation ***/ void* munit_malloc_ex(const char* filename, int line, size_t size) { void* ptr; if (size == 0) return NULL; ptr = calloc(1, size); if (MUNIT_UNLIKELY(ptr == NULL)) { munit_logf_ex(MUNIT_LOG_ERROR, filename, line, "Failed to allocate %" MUNIT_SIZE_MODIFIER "u bytes.", size); } return ptr; } /*** Timer code ***/ #if defined(MUNIT_ENABLE_TIMING) #define psnip_uint64_t munit_uint64_t #define psnip_uint32_t munit_uint32_t /* Code copied from portable-snippets * . If you need to * change something, please do it there so we can keep the code in * sync. */ /* Clocks (v1) * Portable Snippets - https://github.com/nemequ/portable-snippets * Created by Evan Nemerson * * To the extent possible under law, the authors have waived all * copyright and related or neighboring rights to this code. For * details, see the Creative Commons Zero 1.0 Universal license at * https://creativecommons.org/publicdomain/zero/1.0/ */ #if !defined(PSNIP_CLOCK_H) #define PSNIP_CLOCK_H #if !defined(psnip_uint64_t) # include "../exact-int/exact-int.h" #endif #if !defined(PSNIP_CLOCK_STATIC_INLINE) # if defined(__GNUC__) # define PSNIP_CLOCK__COMPILER_ATTRIBUTES __attribute__((__unused__)) # else # define PSNIP_CLOCK__COMPILER_ATTRIBUTES # endif # define PSNIP_CLOCK__FUNCTION PSNIP_CLOCK__COMPILER_ATTRIBUTES static #endif enum PsnipClockType { /* This clock provides the current time, in units since 1970-01-01 * 00:00:00 UTC not including leap seconds. In other words, UNIX * time. Keep in mind that this clock doesn't account for leap * seconds, and can go backwards (think NTP adjustments). */ PSNIP_CLOCK_TYPE_WALL = 1, /* The CPU time is a clock which increases only when the current * process is active (i.e., it doesn't increment while blocking on * I/O). */ PSNIP_CLOCK_TYPE_CPU = 2, /* Monotonic time is always running (unlike CPU time), but it only ever moves forward unless you reboot the system. Things like NTP adjustments have no effect on this clock. 
*/ PSNIP_CLOCK_TYPE_MONOTONIC = 3 }; struct PsnipClockTimespec { psnip_uint64_t seconds; psnip_uint64_t nanoseconds; }; /* Methods we support: */ #define PSNIP_CLOCK_METHOD_CLOCK_GETTIME 1 #define PSNIP_CLOCK_METHOD_TIME 2 #define PSNIP_CLOCK_METHOD_GETTIMEOFDAY 3 #define PSNIP_CLOCK_METHOD_QUERYPERFORMANCECOUNTER 4 #define PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME 5 #define PSNIP_CLOCK_METHOD_CLOCK 6 #define PSNIP_CLOCK_METHOD_GETPROCESSTIMES 7 #define PSNIP_CLOCK_METHOD_GETRUSAGE 8 #define PSNIP_CLOCK_METHOD_GETSYSTEMTIMEPRECISEASFILETIME 9 #define PSNIP_CLOCK_METHOD_GETTICKCOUNT64 10 #include #if defined(HEDLEY_UNREACHABLE) # define PSNIP_CLOCK_UNREACHABLE() HEDLEY_UNREACHABLE() #else # define PSNIP_CLOCK_UNREACHABLE() assert(0) #endif /* Choose an implementation */ /* #undef PSNIP_CLOCK_WALL_METHOD */ /* #undef PSNIP_CLOCK_CPU_METHOD */ /* #undef PSNIP_CLOCK_MONOTONIC_METHOD */ /* We want to be able to detect the libc implementation, so we include ( isn't available everywhere). */ #if defined(__unix__) || defined(__unix) || defined(__linux__) # include # include #endif #if defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0) /* These are known to work without librt. If you know of others * please let us know so we can add them. */ # if \ (defined(__GLIBC__) && (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 17))) || \ (defined(__FreeBSD__)) # define PSNIP_CLOCK_HAVE_CLOCK_GETTIME # elif !defined(PSNIP_CLOCK_NO_LIBRT) # define PSNIP_CLOCK_HAVE_CLOCK_GETTIME # endif #endif #if defined(_WIN32) # if !defined(PSNIP_CLOCK_CPU_METHOD) # define PSNIP_CLOCK_CPU_METHOD PSNIP_CLOCK_METHOD_GETPROCESSTIMES # endif # if !defined(PSNIP_CLOCK_MONOTONIC_METHOD) # define PSNIP_CLOCK_MONOTONIC_METHOD PSNIP_CLOCK_METHOD_QUERYPERFORMANCECOUNTER # endif #endif #if defined(__MACH__) && !defined(__gnu_hurd__) # if !defined(PSNIP_CLOCK_MONOTONIC_METHOD) # define PSNIP_CLOCK_MONOTONIC_METHOD PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME # endif #endif #if defined(PSNIP_CLOCK_HAVE_CLOCK_GETTIME) # include # if !defined(PSNIP_CLOCK_WALL_METHOD) # if defined(CLOCK_REALTIME_PRECISE) # define PSNIP_CLOCK_WALL_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME # define PSNIP_CLOCK_CLOCK_GETTIME_WALL CLOCK_REALTIME_PRECISE # elif !defined(__sun) # define PSNIP_CLOCK_WALL_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME # define PSNIP_CLOCK_CLOCK_GETTIME_WALL CLOCK_REALTIME # endif # endif # if !defined(PSNIP_CLOCK_CPU_METHOD) # if defined(_POSIX_CPUTIME) || defined(CLOCK_PROCESS_CPUTIME_ID) # define PSNIP_CLOCK_CPU_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME # define PSNIP_CLOCK_CLOCK_GETTIME_CPU CLOCK_PROCESS_CPUTIME_ID # elif defined(CLOCK_VIRTUAL) # define PSNIP_CLOCK_CPU_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME # define PSNIP_CLOCK_CLOCK_GETTIME_CPU CLOCK_VIRTUAL # endif # endif # if !defined(PSNIP_CLOCK_MONOTONIC_METHOD) # if defined(CLOCK_MONOTONIC_RAW) # define PSNIP_CLOCK_MONOTONIC_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME # define PSNIP_CLOCK_CLOCK_GETTIME_MONOTONIC CLOCK_MONOTONIC # elif defined(CLOCK_MONOTONIC_PRECISE) # define PSNIP_CLOCK_MONOTONIC_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME # define PSNIP_CLOCK_CLOCK_GETTIME_MONOTONIC CLOCK_MONOTONIC_PRECISE # elif defined(_POSIX_MONOTONIC_CLOCK) || defined(CLOCK_MONOTONIC) # define PSNIP_CLOCK_MONOTONIC_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME # define PSNIP_CLOCK_CLOCK_GETTIME_MONOTONIC CLOCK_MONOTONIC # endif # endif #endif #if defined(_POSIX_VERSION) && (_POSIX_VERSION >= 200112L) # if !defined(PSNIP_CLOCK_WALL_METHOD) # define PSNIP_CLOCK_WALL_METHOD PSNIP_CLOCK_METHOD_GETTIMEOFDAY # endif 
#endif #if !defined(PSNIP_CLOCK_WALL_METHOD) # define PSNIP_CLOCK_WALL_METHOD PSNIP_CLOCK_METHOD_TIME #endif #if !defined(PSNIP_CLOCK_CPU_METHOD) # define PSNIP_CLOCK_CPU_METHOD PSNIP_CLOCK_METHOD_CLOCK #endif /* Primarily here for testing. */ #if !defined(PSNIP_CLOCK_MONOTONIC_METHOD) && defined(PSNIP_CLOCK_REQUIRE_MONOTONIC) # error No monotonic clock found. #endif /* Implementations */ #if \ (defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) || \ (defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) || \ (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) || \ (defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK)) || \ (defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_CLOCK)) || \ (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_CLOCK)) || \ (defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_TIME)) || \ (defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_TIME)) || \ (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_TIME)) # include #endif #if \ (defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETTIMEOFDAY)) || \ (defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETTIMEOFDAY)) || \ (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETTIMEOFDAY)) # include #endif #if \ (defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETPROCESSTIMES)) || \ (defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETPROCESSTIMES)) || \ (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETPROCESSTIMES)) || \ (defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETTICKCOUNT64)) || \ (defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETTICKCOUNT64)) || \ (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETTICKCOUNT64)) # include #endif #if \ (defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETRUSAGE)) || \ (defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETRUSAGE)) || \ (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETRUSAGE)) # include # include #endif #if \ (defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME)) || \ (defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME)) || \ (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME)) # include # include # include #endif /*** Implementations ***/ #define PSNIP_CLOCK_NSEC_PER_SEC ((psnip_uint32_t) (1000000000ULL)) #if \ (defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) || \ (defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) || \ (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) PSNIP_CLOCK__FUNCTION psnip_uint32_t psnip_clock__clock_getres (clockid_t clk_id) 
{ struct timespec res; int r; r = clock_getres(clk_id, &res); if (r != 0) return 0; return (psnip_uint32_t) (PSNIP_CLOCK_NSEC_PER_SEC / res.tv_nsec); } PSNIP_CLOCK__FUNCTION int psnip_clock__clock_gettime (clockid_t clk_id, struct PsnipClockTimespec* res) { struct timespec ts; if (clock_gettime(clk_id, &ts) != 0) return -10; res->seconds = (psnip_uint64_t) (ts.tv_sec); res->nanoseconds = (psnip_uint64_t) (ts.tv_nsec); return 0; } #endif PSNIP_CLOCK__FUNCTION psnip_uint32_t psnip_clock_wall_get_precision (void) { #if !defined(PSNIP_CLOCK_WALL_METHOD) return 0; #elif defined(PSNIP_CLOCK_WALL_METHOD) && PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME return psnip_clock__clock_getres(PSNIP_CLOCK_CLOCK_GETTIME_WALL); #elif defined(PSNIP_CLOCK_WALL_METHOD) && PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETTIMEOFDAY return 1000000; #elif defined(PSNIP_CLOCK_WALL_METHOD) && PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_TIME return 1; #else return 0; #endif } PSNIP_CLOCK__FUNCTION int psnip_clock_wall_get_time (struct PsnipClockTimespec* res) { (void) res; #if !defined(PSNIP_CLOCK_WALL_METHOD) return -2; #elif defined(PSNIP_CLOCK_WALL_METHOD) && PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME return psnip_clock__clock_gettime(PSNIP_CLOCK_CLOCK_GETTIME_WALL, res); #elif defined(PSNIP_CLOCK_WALL_METHOD) && PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_TIME res->seconds = time(NULL); res->nanoseconds = 0; #elif defined(PSNIP_CLOCK_WALL_METHOD) && PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETTIMEOFDAY struct timeval tv; if (gettimeofday(&tv, NULL) != 0) return -6; res->seconds = tv.tv_sec; res->nanoseconds = tv.tv_usec * 1000; #else return -2; #endif return 0; } PSNIP_CLOCK__FUNCTION psnip_uint32_t psnip_clock_cpu_get_precision (void) { #if !defined(PSNIP_CLOCK_CPU_METHOD) return 0; #elif defined(PSNIP_CLOCK_CPU_METHOD) && PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME return psnip_clock__clock_getres(PSNIP_CLOCK_CLOCK_GETTIME_CPU); #elif defined(PSNIP_CLOCK_CPU_METHOD) && PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK return CLOCKS_PER_SEC; #elif defined(PSNIP_CLOCK_CPU_METHOD) && PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETPROCESSTIMES return PSNIP_CLOCK_NSEC_PER_SEC / 100; #else return 0; #endif } PSNIP_CLOCK__FUNCTION int psnip_clock_cpu_get_time (struct PsnipClockTimespec* res) { #if !defined(PSNIP_CLOCK_CPU_METHOD) (void) res; return -2; #elif defined(PSNIP_CLOCK_CPU_METHOD) && PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME return psnip_clock__clock_gettime(PSNIP_CLOCK_CLOCK_GETTIME_CPU, res); #elif defined(PSNIP_CLOCK_CPU_METHOD) && PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK clock_t t = clock(); if (t == ((clock_t) -1)) return -5; res->seconds = t / CLOCKS_PER_SEC; res->nanoseconds = (t % CLOCKS_PER_SEC) * (PSNIP_CLOCK_NSEC_PER_SEC / CLOCKS_PER_SEC); #elif defined(PSNIP_CLOCK_CPU_METHOD) && PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETPROCESSTIMES FILETIME CreationTime, ExitTime, KernelTime, UserTime; LARGE_INTEGER date, adjust; if (!GetProcessTimes(GetCurrentProcess(), &CreationTime, &ExitTime, &KernelTime, &UserTime)) return -7; /* http://www.frenk.com/2009/12/convert-filetime-to-unix-timestamp/ */ date.HighPart = UserTime.dwHighDateTime; date.LowPart = UserTime.dwLowDateTime; adjust.QuadPart = 11644473600000 * 10000; date.QuadPart -= adjust.QuadPart; res->seconds = date.QuadPart / 10000000; res->nanoseconds = (date.QuadPart % 10000000) * (PSNIP_CLOCK_NSEC_PER_SEC / 100); #elif PSNIP_CLOCK_CPU_METHOD == 
PSNIP_CLOCK_METHOD_GETRUSAGE
  struct rusage usage;
  if (getrusage(RUSAGE_SELF, &usage) != 0)
    return -8;
  res->seconds = usage.ru_utime.tv_sec;
  res->nanoseconds = usage.ru_utime.tv_usec * 1000;
#else
  (void) res;
  return -2;
#endif

  return 0;
}

PSNIP_CLOCK__FUNCTION psnip_uint32_t
psnip_clock_monotonic_get_precision (void) {
#if !defined(PSNIP_CLOCK_MONOTONIC_METHOD)
  return 0;
#elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME
  return psnip_clock__clock_getres(PSNIP_CLOCK_CLOCK_GETTIME_MONOTONIC);
#elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME
  static mach_timebase_info_data_t tbi = { 0, };
  if (tbi.denom == 0)
    mach_timebase_info(&tbi);
  return (psnip_uint32_t) (tbi.numer / tbi.denom);
#elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETTICKCOUNT64
  return 1000;
#elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_QUERYPERFORMANCECOUNTER
  LARGE_INTEGER Frequency;
  QueryPerformanceFrequency(&Frequency);
  return (psnip_uint32_t) ((Frequency.QuadPart > PSNIP_CLOCK_NSEC_PER_SEC) ? PSNIP_CLOCK_NSEC_PER_SEC : Frequency.QuadPart);
#else
  return 0;
#endif
}

PSNIP_CLOCK__FUNCTION int
psnip_clock_monotonic_get_time (struct PsnipClockTimespec* res) {
#if !defined(PSNIP_CLOCK_MONOTONIC_METHOD)
  (void) res;
  return -2;
#elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME
  return psnip_clock__clock_gettime(PSNIP_CLOCK_CLOCK_GETTIME_MONOTONIC, res);
#elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME
  psnip_uint64_t nsec = mach_absolute_time();
  static mach_timebase_info_data_t tbi = { 0, };
  if (tbi.denom == 0)
    mach_timebase_info(&tbi);
  nsec *= ((psnip_uint64_t) tbi.numer) / ((psnip_uint64_t) tbi.denom);
  res->seconds = nsec / PSNIP_CLOCK_NSEC_PER_SEC;
  res->nanoseconds = nsec % PSNIP_CLOCK_NSEC_PER_SEC;
#elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_QUERYPERFORMANCECOUNTER
  LARGE_INTEGER t, f;
  if (QueryPerformanceCounter(&t) == 0)
    return -12;
  QueryPerformanceFrequency(&f);
  res->seconds = t.QuadPart / f.QuadPart;
  res->nanoseconds = t.QuadPart % f.QuadPart;
  if (f.QuadPart > PSNIP_CLOCK_NSEC_PER_SEC)
    res->nanoseconds /= f.QuadPart / PSNIP_CLOCK_NSEC_PER_SEC;
  else
    res->nanoseconds *= PSNIP_CLOCK_NSEC_PER_SEC / f.QuadPart;
#elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETTICKCOUNT64
  const ULONGLONG msec = GetTickCount64();
  res->seconds = msec / 1000;
  res->nanoseconds = (msec % 1000) * (PSNIP_CLOCK_NSEC_PER_SEC / 1000);
#else
  return -2;
#endif

  return 0;
}

/* Returns the number of ticks per second for the specified clock.
 * For example, a clock with millisecond precision would return 1000,
 * and a clock with 1 second resolution (such as the time() function)
 * would return 1.
 *
 * If the requested clock isn't available, it will return 0.
 * Hopefully this will be rare, but if it happens to you please let us
 * know so we can work on finding a way to support your system.
 *
 * Note that different clocks on the same system often have
 * different precisions.
*/ PSNIP_CLOCK__FUNCTION psnip_uint32_t psnip_clock_get_precision (enum PsnipClockType clock_type) { switch (clock_type) { case PSNIP_CLOCK_TYPE_MONOTONIC: return psnip_clock_monotonic_get_precision (); case PSNIP_CLOCK_TYPE_CPU: return psnip_clock_cpu_get_precision (); case PSNIP_CLOCK_TYPE_WALL: return psnip_clock_wall_get_precision (); } PSNIP_CLOCK_UNREACHABLE(); return 0; } /* Set the provided timespec to the requested time. Returns 0 on * success, or a negative value on failure. */ PSNIP_CLOCK__FUNCTION int psnip_clock_get_time (enum PsnipClockType clock_type, struct PsnipClockTimespec* res) { assert(res != NULL); switch (clock_type) { case PSNIP_CLOCK_TYPE_MONOTONIC: return psnip_clock_monotonic_get_time (res); case PSNIP_CLOCK_TYPE_CPU: return psnip_clock_cpu_get_time (res); case PSNIP_CLOCK_TYPE_WALL: return psnip_clock_wall_get_time (res); } return -1; } #endif /* !defined(PSNIP_CLOCK_H) */ static psnip_uint64_t munit_clock_get_elapsed(struct PsnipClockTimespec* start, struct PsnipClockTimespec* end) { psnip_uint64_t r = (end->seconds - start->seconds) * PSNIP_CLOCK_NSEC_PER_SEC; if (end->nanoseconds < start->nanoseconds) { r -= (start->nanoseconds - end->nanoseconds); } else { r += (end->nanoseconds - start->nanoseconds); } return r; } #else # include #endif /* defined(MUNIT_ENABLE_TIMING) */ /*** PRNG stuff ***/ /* This is (unless I screwed up, which is entirely possible) the * version of PCG with 32-bit state. It was chosen because it has a * small enough state that we should reliably be able to use CAS * instead of requiring a lock for thread-safety. * * If I did screw up, I probably will not bother changing it unless * there is a significant bias. It's really not important this be * particularly strong, as long as it is fairly random it's much more * important that it be reproducible, so bug reports have a better * chance of being reproducible. 
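 */

/* A reproducibility sketch (added illustration; the seed literal is
 * arbitrary): re-seeding the PRNG with the same value replays the same
 * sequence, which is what lets a failing run be reproduced from the seed
 * munit prints. */
static void munit_example_reseed_check(void) {
  munit_uint32_t a;
  munit_rand_seed(0xdeadbeef);
  a = munit_rand_uint32();
  munit_rand_seed(0xdeadbeef);
  munit_assert_uint32(a, ==, munit_rand_uint32());
}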
*/ #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__) && !defined(__EMSCRIPTEN__) && (!defined(__GNUC_MINOR__) || (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ > 8)) # define HAVE_STDATOMIC #elif defined(__clang__) # if __has_extension(c_atomic) # define HAVE_CLANG_ATOMICS # endif #endif /* Workaround for http://llvm.org/bugs/show_bug.cgi?id=26911 */ #if defined(__clang__) && defined(_WIN32) # undef HAVE_STDATOMIC # if defined(__c2__) # undef HAVE_CLANG_ATOMICS # endif #endif #if defined(_OPENMP) # define ATOMIC_UINT32_T uint32_t # define ATOMIC_UINT32_INIT(x) (x) #elif defined(HAVE_STDATOMIC) # include # define ATOMIC_UINT32_T _Atomic uint32_t # define ATOMIC_UINT32_INIT(x) ATOMIC_VAR_INIT(x) #elif defined(HAVE_CLANG_ATOMICS) # define ATOMIC_UINT32_T _Atomic uint32_t # define ATOMIC_UINT32_INIT(x) (x) #elif defined(_WIN32) # define ATOMIC_UINT32_T volatile LONG # define ATOMIC_UINT32_INIT(x) (x) #else # define ATOMIC_UINT32_T volatile uint32_t # define ATOMIC_UINT32_INIT(x) (x) #endif static ATOMIC_UINT32_T munit_rand_state = ATOMIC_UINT32_INIT(42); #if defined(_OPENMP) static inline void munit_atomic_store(ATOMIC_UINT32_T* dest, ATOMIC_UINT32_T value) { #pragma omp critical (munit_atomics) *dest = value; } static inline uint32_t munit_atomic_load(ATOMIC_UINT32_T* src) { int ret; #pragma omp critical (munit_atomics) ret = *src; return ret; } static inline uint32_t munit_atomic_cas(ATOMIC_UINT32_T* dest, ATOMIC_UINT32_T* expected, ATOMIC_UINT32_T desired) { bool ret; #pragma omp critical (munit_atomics) { if (*dest == *expected) { *dest = desired; ret = true; } else { ret = false; } } return ret; } #elif defined(HAVE_STDATOMIC) # define munit_atomic_store(dest, value) atomic_store(dest, value) # define munit_atomic_load(src) atomic_load(src) # define munit_atomic_cas(dest, expected, value) atomic_compare_exchange_weak(dest, expected, value) #elif defined(HAVE_CLANG_ATOMICS) # define munit_atomic_store(dest, value) __c11_atomic_store(dest, value, __ATOMIC_SEQ_CST) # define munit_atomic_load(src) __c11_atomic_load(src, __ATOMIC_SEQ_CST) # define munit_atomic_cas(dest, expected, value) __c11_atomic_compare_exchange_weak(dest, expected, value, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) #elif defined(__GNUC__) && (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7) # define munit_atomic_store(dest, value) __atomic_store_n(dest, value, __ATOMIC_SEQ_CST) # define munit_atomic_load(src) __atomic_load_n(src, __ATOMIC_SEQ_CST) # define munit_atomic_cas(dest, expected, value) __atomic_compare_exchange_n(dest, expected, value, true, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) #elif defined(__GNUC__) && (__GNUC__ >= 4) # define munit_atomic_store(dest,value) do { *(dest) = (value); } while (0) # define munit_atomic_load(src) (*(src)) # define munit_atomic_cas(dest, expected, value) __sync_bool_compare_and_swap(dest, *expected, value) #elif defined(_WIN32) /* Untested */ # define munit_atomic_store(dest,value) do { *(dest) = (value); } while (0) # define munit_atomic_load(src) (*(src)) # define munit_atomic_cas(dest, expected, value) InterlockedCompareExchange((dest), (value), *(expected)) #else # warning No atomic implementation, PRNG will not be thread-safe # define munit_atomic_store(dest, value) do { *(dest) = (value); } while (0) # define munit_atomic_load(src) (*(src)) static inline bool munit_atomic_cas(ATOMIC_UINT32_T* dest, ATOMIC_UINT32_T* expected, ATOMIC_UINT32_T desired) { if (*dest == *expected) { *dest = desired; return true; } else { return false; } 
} #endif #define MUNIT_PRNG_MULTIPLIER (747796405U) #define MUNIT_PRNG_INCREMENT (1729U) static munit_uint32_t munit_rand_next_state(munit_uint32_t state) { return state * MUNIT_PRNG_MULTIPLIER + MUNIT_PRNG_INCREMENT; } static munit_uint32_t munit_rand_from_state(munit_uint32_t state) { munit_uint32_t res = ((state >> ((state >> 28) + 4)) ^ state) * (277803737U); res ^= res >> 22; return res; } void munit_rand_seed(munit_uint32_t seed) { munit_uint32_t state = munit_rand_next_state(seed + MUNIT_PRNG_INCREMENT); munit_atomic_store(&munit_rand_state, state); } static munit_uint32_t munit_rand_generate_seed(void) { munit_uint32_t seed, state; #if defined(MUNIT_ENABLE_TIMING) struct PsnipClockTimespec wc = { 0, 0 }; psnip_clock_get_time(PSNIP_CLOCK_TYPE_WALL, &wc); seed = (munit_uint32_t) wc.nanoseconds; #else seed = (munit_uint32_t) time(NULL); #endif state = munit_rand_next_state(seed + MUNIT_PRNG_INCREMENT); return munit_rand_from_state(state); } static munit_uint32_t munit_rand_state_uint32(munit_uint32_t* state) { const munit_uint32_t old = *state; *state = munit_rand_next_state(old); return munit_rand_from_state(old); } munit_uint32_t munit_rand_uint32(void) { munit_uint32_t old, state; do { old = munit_atomic_load(&munit_rand_state); state = munit_rand_next_state(old); } while (!munit_atomic_cas(&munit_rand_state, &old, state)); return munit_rand_from_state(old); } static void munit_rand_state_memory(munit_uint32_t* state, size_t size, munit_uint8_t data[MUNIT_ARRAY_PARAM(size)]) { size_t members_remaining = size / sizeof(munit_uint32_t); size_t bytes_remaining = size % sizeof(munit_uint32_t); munit_uint8_t* b = data; munit_uint32_t rv; while (members_remaining-- > 0) { rv = munit_rand_state_uint32(state); memcpy(b, &rv, sizeof(munit_uint32_t)); b += sizeof(munit_uint32_t); } if (bytes_remaining != 0) { rv = munit_rand_state_uint32(state); memcpy(b, &rv, bytes_remaining); } } void munit_rand_memory(size_t size, munit_uint8_t data[MUNIT_ARRAY_PARAM(size)]) { munit_uint32_t old, state; do { state = old = munit_atomic_load(&munit_rand_state); munit_rand_state_memory(&state, size, data); } while (!munit_atomic_cas(&munit_rand_state, &old, state)); } static munit_uint32_t munit_rand_state_at_most(munit_uint32_t* state, munit_uint32_t salt, munit_uint32_t max) { /* We want (UINT32_MAX + 1) % max, which in unsigned arithmetic is the same * as (UINT32_MAX + 1 - max) % max = -max % max. We compute -max using not * to avoid compiler warnings. 
*/ const munit_uint32_t min = (~max + 1U) % max; munit_uint32_t x; if (max == (~((munit_uint32_t) 0U))) return munit_rand_state_uint32(state) ^ salt; max++; do { x = munit_rand_state_uint32(state) ^ salt; } while (x < min); return x % max; } static munit_uint32_t munit_rand_at_most(munit_uint32_t salt, munit_uint32_t max) { munit_uint32_t old, state; munit_uint32_t retval; do { state = old = munit_atomic_load(&munit_rand_state); retval = munit_rand_state_at_most(&state, salt, max); } while (!munit_atomic_cas(&munit_rand_state, &old, state)); return retval; } int munit_rand_int_range(int min, int max) { munit_uint64_t range = (munit_uint64_t) max - (munit_uint64_t) min; if (min > max) return munit_rand_int_range(max, min); if (range > (~((munit_uint32_t) 0U))) range = (~((munit_uint32_t) 0U)); return min + munit_rand_at_most(0, (munit_uint32_t) range); } double munit_rand_double(void) { munit_uint32_t old, state; double retval = 0.0; do { state = old = munit_atomic_load(&munit_rand_state); /* See http://mumble.net/~campbell/tmp/random_real.c for how to do * this right. Patches welcome if you feel that this is too * biased. */ retval = munit_rand_state_uint32(&state) / ((~((munit_uint32_t) 0U)) + 1.0); } while (!munit_atomic_cas(&munit_rand_state, &old, state)); return retval; } /*** Test suite handling ***/ typedef struct { unsigned int successful; unsigned int skipped; unsigned int failed; unsigned int errored; #if defined(MUNIT_ENABLE_TIMING) munit_uint64_t cpu_clock; munit_uint64_t wall_clock; #endif } MunitReport; typedef struct { const char* prefix; const MunitSuite* suite; const char** tests; munit_uint32_t seed; unsigned int iterations; MunitParameter* parameters; bool single_parameter_mode; void* user_data; MunitReport report; bool colorize; bool fork; bool show_stderr; bool fatal_failures; } MunitTestRunner; const char* munit_parameters_get(const MunitParameter params[], const char* key) { const MunitParameter* param; for (param = params ; param != NULL && param->name != NULL ; param++) if (strcmp(param->name, key) == 0) return param->value; return NULL; } #if defined(MUNIT_ENABLE_TIMING) static void munit_print_time(FILE* fp, munit_uint64_t nanoseconds) { fprintf(fp, "%" MUNIT_TEST_TIME_FORMAT, ((double) nanoseconds) / ((double) PSNIP_CLOCK_NSEC_PER_SEC)); } #endif /* Add a parameter to an array of parameters. */ static MunitResult munit_parameters_add(size_t* params_size, MunitParameter* params[MUNIT_ARRAY_PARAM(*params_size)], char* name, char* value) { *params = realloc(*params, sizeof(MunitParameter) * (*params_size + 2)); if (*params == NULL) return MUNIT_ERROR; (*params)[*params_size].name = name; (*params)[*params_size].value = value; (*params_size)++; (*params)[*params_size].name = NULL; (*params)[*params_size].value = NULL; return MUNIT_OK; } /* Concatenate two strings, but just return one of the components * unaltered if the other is NULL or "". */ static char* munit_maybe_concat(size_t* len, char* prefix, char* suffix) { char* res; size_t res_l; const size_t prefix_l = prefix != NULL ? strlen(prefix) : 0; const size_t suffix_l = suffix != NULL ? 
strlen(suffix) : 0; if (prefix_l == 0 && suffix_l == 0) { res = NULL; res_l = 0; } else if (prefix_l == 0 && suffix_l != 0) { res = suffix; res_l = suffix_l; } else if (prefix_l != 0 && suffix_l == 0) { res = prefix; res_l = prefix_l; } else { res_l = prefix_l + suffix_l; res = malloc(res_l + 1); memcpy(res, prefix, prefix_l); memcpy(res + prefix_l, suffix, suffix_l); res[res_l] = 0; } if (len != NULL) *len = res_l; return res; } /* Possibly free a string returned by munit_maybe_concat. */ static void munit_maybe_free_concat(char* s, const char* prefix, const char* suffix) { if (prefix != s && suffix != s) free(s); } /* Cheap string hash function, just used to salt the PRNG. */ static munit_uint32_t munit_str_hash(const char* name) { const char *p; munit_uint32_t h = 5381U; for (p = name; *p != '\0'; p++) h = (h << 5) + h + *p; return h; } static void munit_splice(int from, int to) { munit_uint8_t buf[1024]; #if !defined(_WIN32) ssize_t len; ssize_t bytes_written; ssize_t write_res; #else int len; int bytes_written; int write_res; #endif do { len = read(from, buf, sizeof(buf)); if (len > 0) { bytes_written = 0; do { write_res = write(to, buf + bytes_written, len - bytes_written); if (write_res < 0) break; bytes_written += write_res; } while (bytes_written < len); } else break; } while (true); } /* This is the part that should be handled in the child process */ static MunitResult munit_test_runner_exec(MunitTestRunner* runner, const MunitTest* test, const MunitParameter params[], MunitReport* report) { unsigned int iterations = runner->iterations; MunitResult result = MUNIT_FAIL; #if defined(MUNIT_ENABLE_TIMING) struct PsnipClockTimespec wall_clock_begin = { 0, 0 }, wall_clock_end = { 0, 0 }; struct PsnipClockTimespec cpu_clock_begin = { 0, 0 }, cpu_clock_end = { 0, 0 }; #endif unsigned int i = 0; if ((test->options & MUNIT_TEST_OPTION_SINGLE_ITERATION) == MUNIT_TEST_OPTION_SINGLE_ITERATION) iterations = 1; else if (iterations == 0) iterations = runner->suite->iterations; munit_rand_seed(runner->seed); do { void* data = (test->setup == NULL) ? 
runner->user_data : test->setup(params, runner->user_data); #if defined(MUNIT_ENABLE_TIMING) psnip_clock_get_time(PSNIP_CLOCK_TYPE_WALL, &wall_clock_begin); psnip_clock_get_time(PSNIP_CLOCK_TYPE_CPU, &cpu_clock_begin); #endif #if defined(MUNIT_THREAD_LOCAL) && defined(MUNIT_ALWAYS_TEAR_DOWN) if (test->tear_down != NULL) { if (MUNIT_UNLIKELY(setjmp(munit_tear_down_jmp_buf) != 0)) { test->tear_down(data); longjmp(munit_error_jmp_buf, 1); } else { munit_tear_down_jmp_buf_valid = true; } } #endif result = test->test(params, data); #if defined(MUNIT_ENABLE_TIMING) psnip_clock_get_time(PSNIP_CLOCK_TYPE_WALL, &wall_clock_end); psnip_clock_get_time(PSNIP_CLOCK_TYPE_CPU, &cpu_clock_end); #endif if (test->tear_down != NULL) test->tear_down(data); if (MUNIT_LIKELY(result == MUNIT_OK)) { report->successful++; #if defined(MUNIT_ENABLE_TIMING) report->wall_clock += munit_clock_get_elapsed(&wall_clock_begin, &wall_clock_end); report->cpu_clock += munit_clock_get_elapsed(&cpu_clock_begin, &cpu_clock_end); #endif } else { switch ((int) result) { case MUNIT_SKIP: report->skipped++; break; case MUNIT_FAIL: report->failed++; break; case MUNIT_ERROR: report->errored++; break; default: break; } break; } } while (++i < iterations); return result; } #if defined(MUNIT_EMOTICON) # define MUNIT_RESULT_STRING_OK ":)" # define MUNIT_RESULT_STRING_SKIP ":|" # define MUNIT_RESULT_STRING_FAIL ":(" # define MUNIT_RESULT_STRING_ERROR ":o" # define MUNIT_RESULT_STRING_TODO ":/" #else # define MUNIT_RESULT_STRING_OK "OK " # define MUNIT_RESULT_STRING_SKIP "SKIP " # define MUNIT_RESULT_STRING_FAIL "FAIL " # define MUNIT_RESULT_STRING_ERROR "ERROR" # define MUNIT_RESULT_STRING_TODO "TODO " #endif static void munit_test_runner_print_color(const MunitTestRunner* runner, const char* string, char color) { if (runner->colorize) fprintf(MUNIT_OUTPUT_FILE, "\x1b[3%cm%s\x1b[39m", color, string); else fputs(string, MUNIT_OUTPUT_FILE); } #if !defined(MUNIT_NO_BUFFER) static int munit_replace_stderr(FILE* stderr_buf) { if (stderr_buf != NULL) { const int orig_stderr = dup(STDERR_FILENO); int errfd = fileno(stderr_buf); if (MUNIT_UNLIKELY(errfd == -1)) { exit(EXIT_FAILURE); } dup2(errfd, STDERR_FILENO); return orig_stderr; } return -1; } static void munit_restore_stderr(int orig_stderr) { if (orig_stderr != -1) { dup2(orig_stderr, STDERR_FILENO); close(orig_stderr); } } #endif /* !defined(MUNIT_NO_BUFFER) */ /* Run a test with the specified parameters. 
*/ static void munit_test_runner_run_test_with_params(MunitTestRunner* runner, const MunitTest* test, const MunitParameter params[]) { MunitResult result = MUNIT_OK; MunitReport report = { 0, 0, 0, 0, #if defined(MUNIT_ENABLE_TIMING) 0, 0 #endif }; unsigned int output_l; bool first; const MunitParameter* param; FILE* stderr_buf; #if !defined(MUNIT_NO_FORK) int pipefd[2]; pid_t fork_pid; ssize_t bytes_written = 0; ssize_t write_res; ssize_t bytes_read = 0; ssize_t read_res; int status = 0; pid_t changed_pid; #endif if (params != NULL) { output_l = 2; fputs(" ", MUNIT_OUTPUT_FILE); first = true; for (param = params ; param != NULL && param->name != NULL ; param++) { if (!first) { fputs(", ", MUNIT_OUTPUT_FILE); output_l += 2; } else { first = false; } output_l += fprintf(MUNIT_OUTPUT_FILE, "%s=%s", param->name, param->value); } while (output_l++ < MUNIT_TEST_NAME_LEN) { fputc(' ', MUNIT_OUTPUT_FILE); } } fflush(MUNIT_OUTPUT_FILE); stderr_buf = NULL; #if !defined(_WIN32) || defined(__MINGW32__) stderr_buf = tmpfile(); #else tmpfile_s(&stderr_buf); #endif if (stderr_buf == NULL) { munit_log_errno(MUNIT_LOG_ERROR, stderr, "unable to create buffer for stderr"); result = MUNIT_ERROR; goto print_result; } #if !defined(MUNIT_NO_FORK) if (runner->fork) { pipefd[0] = -1; pipefd[1] = -1; if (pipe(pipefd) != 0) { munit_log_errno(MUNIT_LOG_ERROR, stderr, "unable to create pipe"); result = MUNIT_ERROR; goto print_result; } fork_pid = fork(); if (fork_pid == 0) { int orig_stderr; close(pipefd[0]); orig_stderr = munit_replace_stderr(stderr_buf); munit_test_runner_exec(runner, test, params, &report); /* Note that we don't restore stderr. This is so we can buffer * things written to stderr later on (such as by * asan/tsan/ubsan, valgrind, etc.) */ close(orig_stderr); do { write_res = write(pipefd[1], ((munit_uint8_t*) (&report)) + bytes_written, sizeof(report) - bytes_written); if (write_res < 0) { if (stderr_buf != NULL) { munit_log_errno(MUNIT_LOG_ERROR, stderr, "unable to write to pipe"); } exit(EXIT_FAILURE); } bytes_written += write_res; } while ((size_t) bytes_written < sizeof(report)); if (stderr_buf != NULL) fclose(stderr_buf); close(pipefd[1]); exit(EXIT_SUCCESS); } else if (fork_pid == -1) { close(pipefd[0]); close(pipefd[1]); if (stderr_buf != NULL) { munit_log_errno(MUNIT_LOG_ERROR, stderr, "unable to fork"); } report.errored++; result = MUNIT_ERROR; } else { close(pipefd[1]); do { read_res = read(pipefd[0], ((munit_uint8_t*) (&report)) + bytes_read, sizeof(report) - bytes_read); if (read_res < 1) break; bytes_read += read_res; } while (bytes_read < (ssize_t) sizeof(report)); changed_pid = waitpid(fork_pid, &status, 0); if (MUNIT_LIKELY(changed_pid == fork_pid) && MUNIT_LIKELY(WIFEXITED(status))) { if (bytes_read != sizeof(report)) { munit_logf_internal(MUNIT_LOG_ERROR, stderr_buf, "child exited unexpectedly with status %d", WEXITSTATUS(status)); report.errored++; } else if (WEXITSTATUS(status) != EXIT_SUCCESS) { munit_logf_internal(MUNIT_LOG_ERROR, stderr_buf, "child exited with status %d", WEXITSTATUS(status)); report.errored++; } } else { if (WIFSIGNALED(status)) { #if defined(_XOPEN_VERSION) && (_XOPEN_VERSION >= 700) munit_logf_internal(MUNIT_LOG_ERROR, stderr_buf, "child killed by signal %d (%s)", WTERMSIG(status), strsignal(WTERMSIG(status))); #else munit_logf_internal(MUNIT_LOG_ERROR, stderr_buf, "child killed by signal %d", WTERMSIG(status)); #endif } else if (WIFSTOPPED(status)) { munit_logf_internal(MUNIT_LOG_ERROR, stderr_buf, "child stopped by signal %d", WSTOPSIG(status)); } 
report.errored++; } close(pipefd[0]); waitpid(fork_pid, NULL, 0); } } else #endif { #if !defined(MUNIT_NO_BUFFER) const volatile int orig_stderr = munit_replace_stderr(stderr_buf); #endif #if defined(MUNIT_THREAD_LOCAL) if (MUNIT_UNLIKELY(setjmp(munit_error_jmp_buf) != 0)) { result = MUNIT_FAIL; report.failed++; } else { munit_error_jmp_buf_valid = true; result = munit_test_runner_exec(runner, test, params, &report); } #else result = munit_test_runner_exec(runner, test, params, &report); #endif #if !defined(MUNIT_NO_BUFFER) munit_restore_stderr(orig_stderr); #endif /* Here just so that the label is used on Windows and we don't get * a warning */ goto print_result; } print_result: fputs("[ ", MUNIT_OUTPUT_FILE); if ((test->options & MUNIT_TEST_OPTION_TODO) == MUNIT_TEST_OPTION_TODO) { if (report.failed != 0 || report.errored != 0 || report.skipped != 0) { munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_TODO, '3'); result = MUNIT_OK; } else { munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_ERROR, '1'); if (MUNIT_LIKELY(stderr_buf != NULL)) munit_log_internal(MUNIT_LOG_ERROR, stderr_buf, "Test marked TODO, but was successful."); runner->report.failed++; result = MUNIT_ERROR; } } else if (report.failed > 0) { munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_FAIL, '1'); runner->report.failed++; result = MUNIT_FAIL; } else if (report.errored > 0) { munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_ERROR, '1'); runner->report.errored++; result = MUNIT_ERROR; } else if (report.skipped > 0) { munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_SKIP, '3'); runner->report.skipped++; result = MUNIT_SKIP; } else if (report.successful > 1) { munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_OK, '2'); #if defined(MUNIT_ENABLE_TIMING) fputs(" ] [ ", MUNIT_OUTPUT_FILE); munit_print_time(MUNIT_OUTPUT_FILE, report.wall_clock / report.successful); fputs(" / ", MUNIT_OUTPUT_FILE); munit_print_time(MUNIT_OUTPUT_FILE, report.cpu_clock / report.successful); fprintf(MUNIT_OUTPUT_FILE, " CPU ]\n %-" MUNIT_XSTRINGIFY(MUNIT_TEST_NAME_LEN) "s Total: [ ", ""); munit_print_time(MUNIT_OUTPUT_FILE, report.wall_clock); fputs(" / ", MUNIT_OUTPUT_FILE); munit_print_time(MUNIT_OUTPUT_FILE, report.cpu_clock); fputs(" CPU", MUNIT_OUTPUT_FILE); #endif runner->report.successful++; result = MUNIT_OK; } else if (report.successful > 0) { munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_OK, '2'); #if defined(MUNIT_ENABLE_TIMING) fputs(" ] [ ", MUNIT_OUTPUT_FILE); munit_print_time(MUNIT_OUTPUT_FILE, report.wall_clock); fputs(" / ", MUNIT_OUTPUT_FILE); munit_print_time(MUNIT_OUTPUT_FILE, report.cpu_clock); fputs(" CPU", MUNIT_OUTPUT_FILE); #endif runner->report.successful++; result = MUNIT_OK; } fputs(" ]\n", MUNIT_OUTPUT_FILE); if (stderr_buf != NULL) { if (result == MUNIT_FAIL || result == MUNIT_ERROR || runner->show_stderr) { fflush(MUNIT_OUTPUT_FILE); rewind(stderr_buf); munit_splice(fileno(stderr_buf), STDERR_FILENO); fflush(stderr); } fclose(stderr_buf); } } static void munit_test_runner_run_test_wild(MunitTestRunner* runner, const MunitTest* test, const char* test_name, MunitParameter* params, MunitParameter* p) { const MunitParameterEnum* pe; char** values; MunitParameter* next; for (pe = test->parameters ; pe != NULL && pe->name != NULL ; pe++) { if (p->name == pe->name) break; } if (pe == NULL) return; for (values = pe->values ; *values != NULL ; values++) { next = p + 1; p->value = *values; if (next->name == NULL) { munit_test_runner_run_test_with_params(runner, 
test, params); } else { munit_test_runner_run_test_wild(runner, test, test_name, params, next); } if (runner->fatal_failures && (runner->report.failed != 0 || runner->report.errored != 0)) break; } }
/* Run a single test, with every combination of parameters * requested. */ static void munit_test_runner_run_test(MunitTestRunner* runner, const MunitTest* test, const char* prefix) { char* test_name = munit_maybe_concat(NULL, (char*) prefix, (char*) test->name); /* The array of parameters to pass to * munit_test_runner_run_test_with_params */ MunitParameter* params = NULL; size_t params_l = 0; /* Wildcard parameters are parameters which have possible values * specified in the test, but no specific value was passed to the * CLI. That means we want to run the test once for every * possible combination of parameter values or, if --single was * passed to the CLI, a single time with a random set of * parameters. */ MunitParameter* wild_params = NULL; size_t wild_params_l = 0; const MunitParameterEnum* pe; const MunitParameter* cli_p; bool filled; unsigned int possible; char** vals; size_t first_wild; const MunitParameter* wp; int pidx;
munit_rand_seed(runner->seed); fprintf(MUNIT_OUTPUT_FILE, "%-" MUNIT_XSTRINGIFY(MUNIT_TEST_NAME_LEN) "s", test_name);
if (test->parameters == NULL) { /* No parameters. Simple, nice. */ munit_test_runner_run_test_with_params(runner, test, NULL); } else { fputc('\n', MUNIT_OUTPUT_FILE); for (pe = test->parameters ; pe != NULL && pe->name != NULL ; pe++) { /* Did we receive a value for this parameter from the CLI? */ filled = false; for (cli_p = runner->parameters ; cli_p != NULL && cli_p->name != NULL ; cli_p++) { if (strcmp(cli_p->name, pe->name) == 0) { if (MUNIT_UNLIKELY(munit_parameters_add(&params_l, &params, pe->name, cli_p->value) != MUNIT_OK)) goto cleanup; filled = true; break; } } if (filled) continue; /* Nothing from CLI, is the enum NULL/empty? We're not a * fuzzer… */ if (pe->values == NULL || pe->values[0] == NULL) continue; /* If --single was passed to the CLI, choose a value from the * list of possibilities randomly. */ if (runner->single_parameter_mode) { possible = 0; for (vals = pe->values ; *vals != NULL ; vals++) possible++; /* We want the tests to be reproducible, even if you're only * running a single test, but we don't want every test with * the same number of parameters to choose the same parameter * number, so use the test name as a primitive salt. */ pidx = munit_rand_at_most(munit_str_hash(test_name), possible - 1); if (MUNIT_UNLIKELY(munit_parameters_add(&params_l, &params, pe->name, pe->values[pidx]) != MUNIT_OK)) goto cleanup; } else { /* We want to try every permutation. Put in a placeholder * entry, we'll iterate through them later. */ if (MUNIT_UNLIKELY(munit_parameters_add(&wild_params_l, &wild_params, pe->name, NULL) != MUNIT_OK)) goto cleanup; } }
if (wild_params_l != 0) { first_wild = params_l; for (wp = wild_params ; wp != NULL && wp->name != NULL ; wp++) { for (pe = test->parameters ; pe != NULL && pe->name != NULL && pe->values != NULL ; pe++) { if (strcmp(wp->name, pe->name) == 0) { if (MUNIT_UNLIKELY(munit_parameters_add(&params_l, &params, pe->name, pe->values[0]) != MUNIT_OK)) goto cleanup; } } } munit_test_runner_run_test_wild(runner, test, test_name, params, params + first_wild); } else { munit_test_runner_run_test_with_params(runner, test, params); }
cleanup: free(params); free(wild_params); } munit_maybe_free_concat(test_name, prefix, test->name); }
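/* ------------------------------------------------------------------
 * [Editorial illustration, not part of µnit] A sketch of a
 * parameterized test as consumed by the machinery above: with no
 * --param on the CLI the runner executes the test once per listed
 * value; with --single it picks one value pseudo-randomly. All names
 * here are hypothetical:
 * ------------------------------------------------------------------ */
#if 0
static char* color_values[] = { "red", "green", "blue", NULL };

static MunitParameterEnum color_params[] = {
  { "color", color_values },
  { NULL, NULL },
};

static MunitResult color_test(const MunitParameter params[], void* data) {
  (void) data;
  const char* color = munit_parameters_get(params, "color");
  munit_assert_not_null(color); /* one of "red", "green" or "blue" */
  return MUNIT_OK;
}
#endif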
/* Recurse through the suite and run all the tests. If a list of * tests to run was provided on the command line, run only those * tests. */ static void munit_test_runner_run_suite(MunitTestRunner* runner, const MunitSuite* suite, const char* prefix) { size_t pre_l; char* pre = munit_maybe_concat(&pre_l, (char*) prefix, (char*) suite->prefix); const MunitTest* test; const char** test_name; const MunitSuite* child_suite;
/* Run the tests. */ for (test = suite->tests ; test != NULL && test->test != NULL ; test++) { if (runner->tests != NULL) { /* Specific tests were requested on the CLI */ for (test_name = runner->tests ; test_name != NULL && *test_name != NULL ; test_name++) { if ((pre_l == 0 || strncmp(pre, *test_name, pre_l) == 0) && strncmp(test->name, *test_name + pre_l, strlen(*test_name + pre_l)) == 0) { munit_test_runner_run_test(runner, test, pre); if (runner->fatal_failures && (runner->report.failed != 0 || runner->report.errored != 0)) goto cleanup; } } } else { /* Run all tests */ munit_test_runner_run_test(runner, test, pre); } }
if (runner->fatal_failures && (runner->report.failed != 0 || runner->report.errored != 0)) goto cleanup;
/* Run any child suites. */ for (child_suite = suite->suites ; child_suite != NULL && child_suite->prefix != NULL ; child_suite++) { munit_test_runner_run_suite(runner, child_suite, pre); }
cleanup: munit_maybe_free_concat(pre, prefix, suite->prefix); }
static void munit_test_runner_run(MunitTestRunner* runner) { munit_test_runner_run_suite(runner, runner->suite, NULL); }
static void munit_print_help(int argc, char* const argv[MUNIT_ARRAY_PARAM(argc + 1)], void* user_data, const MunitArgument arguments[]) { const MunitArgument* arg; (void) argc; printf("USAGE: %s [OPTIONS...] [TEST...]\n\n", argv[0]); puts(" --seed SEED\n" " Value used to seed the PRNG. Must be a 32-bit integer in decimal\n" " notation with no separators (commas, decimals, spaces, etc.), or\n" " hexadecimal prefixed by \"0x\".\n" " --iterations N\n" " Run each test N times. 0 means the default number.\n" " --param name value\n" " A parameter key/value pair which will be passed to any test that\n" " takes a parameter of that name. If not provided, the test will be\n" " run once for each possible parameter value.\n" " --list Write a list of all available tests.\n" " --list-params\n" " Write a list of all available tests and their possible parameters.\n" " --single Run each parameterized test in a single configuration instead of\n" " every possible combination\n" " --log-visible debug|info|warning|error\n" " --log-fatal debug|info|warning|error\n" " Set the level at which messages of different severities are visible,\n" " or cause the test to terminate.\n" #if !defined(MUNIT_NO_FORK) " --no-fork Do not execute tests in a child process. If this option is supplied\n" " and a test crashes (including by failing an assertion), no further\n" " tests will be performed.\n" #endif " --fatal-failures\n" " Stop executing tests as soon as a failure is found.\n" " --show-stderr\n" " Show data written to stderr by the tests, even if the test succeeds.\n" " --color auto|always|never\n" " Colorize (or don't) the output.\n" /* 12345678901234567890123456789012345678901234567890123456789012345678901234567890 */ " --help Print this help message and exit.\n"); #if defined(MUNIT_NL_LANGINFO) setlocale(LC_ALL, ""); fputs((strcasecmp("UTF-8", nl_langinfo(CODESET)) == 0) ?
"µnit" : "munit", stdout); #else puts("munit"); #endif printf(" %d.%d.%d\n" "Full documentation at: https://nemequ.github.io/munit/\n", (MUNIT_CURRENT_VERSION >> 16) & 0xff, (MUNIT_CURRENT_VERSION >> 8) & 0xff, (MUNIT_CURRENT_VERSION >> 0) & 0xff); for (arg = arguments ; arg != NULL && arg->name != NULL ; arg++) arg->write_help(arg, user_data); } static const MunitArgument* munit_arguments_find(const MunitArgument arguments[], const char* name) { const MunitArgument* arg; for (arg = arguments ; arg != NULL && arg->name != NULL ; arg++) if (strcmp(arg->name, name) == 0) return arg; return NULL; } static void munit_suite_list_tests(const MunitSuite* suite, bool show_params, const char* prefix) { size_t pre_l; char* pre = munit_maybe_concat(&pre_l, (char*) prefix, (char*) suite->prefix); const MunitTest* test; const MunitParameterEnum* params; bool first; char** val; const MunitSuite* child_suite; for (test = suite->tests ; test != NULL && test->name != NULL ; test++) { if (pre != NULL) fputs(pre, stdout); puts(test->name); if (show_params) { for (params = test->parameters ; params != NULL && params->name != NULL ; params++) { fprintf(stdout, " - %s: ", params->name); if (params->values == NULL) { puts("Any"); } else { first = true; for (val = params->values ; *val != NULL ; val++ ) { if(!first) { fputs(", ", stdout); } else { first = false; } fputs(*val, stdout); } putc('\n', stdout); } } } } for (child_suite = suite->suites ; child_suite != NULL && child_suite->prefix != NULL ; child_suite++) { munit_suite_list_tests(child_suite, show_params, pre); } munit_maybe_free_concat(pre, prefix, suite->prefix); } static bool munit_stream_supports_ansi(FILE *stream) { #if !defined(_WIN32) return isatty(fileno(stream)); #else #if !defined(__MINGW32__) size_t ansicon_size = 0; #endif if (isatty(fileno(stream))) { #if !defined(__MINGW32__) getenv_s(&ansicon_size, NULL, 0, "ANSICON"); return ansicon_size != 0; #else return getenv("ANSICON") != NULL; #endif } return false; #endif } int munit_suite_main_custom(const MunitSuite* suite, void* user_data, int argc, char* const argv[MUNIT_ARRAY_PARAM(argc)], const MunitArgument arguments[]) { int result = EXIT_FAILURE; MunitTestRunner runner; size_t parameters_size = 0; size_t tests_size = 0; int arg; char* envptr; unsigned long ts; char* endptr; unsigned long long iterations; MunitLogLevel level; const MunitArgument* argument; const char** runner_tests; unsigned int tests_run; unsigned int tests_total; runner.prefix = NULL; runner.suite = NULL; runner.tests = NULL; runner.seed = 0; runner.iterations = 0; runner.parameters = NULL; runner.single_parameter_mode = false; runner.user_data = NULL; runner.report.successful = 0; runner.report.skipped = 0; runner.report.failed = 0; runner.report.errored = 0; #if defined(MUNIT_ENABLE_TIMING) runner.report.cpu_clock = 0; runner.report.wall_clock = 0; #endif runner.colorize = false; #if !defined(_WIN32) runner.fork = true; #else runner.fork = false; #endif runner.show_stderr = false; runner.fatal_failures = false; runner.suite = suite; runner.user_data = user_data; runner.seed = munit_rand_generate_seed(); runner.colorize = munit_stream_supports_ansi(MUNIT_OUTPUT_FILE); for (arg = 1 ; arg < argc ; arg++) { if (strncmp("--", argv[arg], 2) == 0) { if (strcmp("seed", argv[arg] + 2) == 0) { if (arg + 1 >= argc) { munit_logf_internal(MUNIT_LOG_ERROR, stderr, "%s requires an argument", argv[arg]); goto cleanup; } envptr = argv[arg + 1]; ts = strtoul(argv[arg + 1], &envptr, 0); if (*envptr != '\0' || ts > (~((munit_uint32_t) 
0U))) { munit_logf_internal(MUNIT_LOG_ERROR, stderr, "invalid value ('%s') passed to %s", argv[arg + 1], argv[arg]); goto cleanup; } runner.seed = (munit_uint32_t) ts; arg++; } else if (strcmp("iterations", argv[arg] + 2) == 0) { if (arg + 1 >= argc) { munit_logf_internal(MUNIT_LOG_ERROR, stderr, "%s requires an argument", argv[arg]); goto cleanup; } endptr = argv[arg + 1]; iterations = strtoul(argv[arg + 1], &endptr, 0); if (*endptr != '\0' || iterations > UINT_MAX) { munit_logf_internal(MUNIT_LOG_ERROR, stderr, "invalid value ('%s') passed to %s", argv[arg + 1], argv[arg]); goto cleanup; } runner.iterations = (unsigned int) iterations; arg++; } else if (strcmp("param", argv[arg] + 2) == 0) { if (arg + 2 >= argc) { munit_logf_internal(MUNIT_LOG_ERROR, stderr, "%s requires two arguments", argv[arg]); goto cleanup; } runner.parameters = realloc(runner.parameters, sizeof(MunitParameter) * (parameters_size + 2)); if (runner.parameters == NULL) { munit_log_internal(MUNIT_LOG_ERROR, stderr, "failed to allocate memory"); goto cleanup; } runner.parameters[parameters_size].name = (char*) argv[arg + 1]; runner.parameters[parameters_size].value = (char*) argv[arg + 2]; parameters_size++; runner.parameters[parameters_size].name = NULL; runner.parameters[parameters_size].value = NULL; arg += 2; } else if (strcmp("color", argv[arg] + 2) == 0) { if (arg + 1 >= argc) { munit_logf_internal(MUNIT_LOG_ERROR, stderr, "%s requires an argument", argv[arg]); goto cleanup; } if (strcmp(argv[arg + 1], "always") == 0) runner.colorize = true; else if (strcmp(argv[arg + 1], "never") == 0) runner.colorize = false; else if (strcmp(argv[arg + 1], "auto") == 0) runner.colorize = munit_stream_supports_ansi(MUNIT_OUTPUT_FILE); else { munit_logf_internal(MUNIT_LOG_ERROR, stderr, "invalid value ('%s') passed to %s", argv[arg + 1], argv[arg]); goto cleanup; } arg++; } else if (strcmp("help", argv[arg] + 2) == 0) { munit_print_help(argc, argv, user_data, arguments); result = EXIT_SUCCESS; goto cleanup; } else if (strcmp("single", argv[arg] + 2) == 0) { runner.single_parameter_mode = true; } else if (strcmp("show-stderr", argv[arg] + 2) == 0) { runner.show_stderr = true; #if !defined(_WIN32) } else if (strcmp("no-fork", argv[arg] + 2) == 0) { runner.fork = false; #endif } else if (strcmp("fatal-failures", argv[arg] + 2) == 0) { runner.fatal_failures = true; } else if (strcmp("log-visible", argv[arg] + 2) == 0 || strcmp("log-fatal", argv[arg] + 2) == 0) { if (arg + 1 >= argc) { munit_logf_internal(MUNIT_LOG_ERROR, stderr, "%s requires an argument", argv[arg]); goto cleanup; } if (strcmp(argv[arg + 1], "debug") == 0) level = MUNIT_LOG_DEBUG; else if (strcmp(argv[arg + 1], "info") == 0) level = MUNIT_LOG_INFO; else if (strcmp(argv[arg + 1], "warning") == 0) level = MUNIT_LOG_WARNING; else if (strcmp(argv[arg + 1], "error") == 0) level = MUNIT_LOG_ERROR; else { munit_logf_internal(MUNIT_LOG_ERROR, stderr, "invalid value ('%s') passed to %s", argv[arg + 1], argv[arg]); goto cleanup; } if (strcmp("log-visible", argv[arg] + 2) == 0) munit_log_level_visible = level; else munit_log_level_fatal = level; arg++; } else if (strcmp("list", argv[arg] + 2) == 0) { munit_suite_list_tests(suite, false, NULL); result = EXIT_SUCCESS; goto cleanup; } else if (strcmp("list-params", argv[arg] + 2) == 0) { munit_suite_list_tests(suite, true, NULL); result = EXIT_SUCCESS; goto cleanup; } else { argument = munit_arguments_find(arguments, argv[arg] + 2); if (argument == NULL) { munit_logf_internal(MUNIT_LOG_ERROR, stderr, "unknown argument ('%s')", 
argv[arg]); goto cleanup; } if (!argument->parse_argument(suite, user_data, &arg, argc, argv)) goto cleanup; } } else { runner_tests = realloc((void*) runner.tests, sizeof(char*) * (tests_size + 2)); if (runner_tests == NULL) { munit_log_internal(MUNIT_LOG_ERROR, stderr, "failed to allocate memory"); goto cleanup; } runner.tests = runner_tests; runner.tests[tests_size++] = argv[arg]; runner.tests[tests_size] = NULL; } }
fflush(stderr); fprintf(MUNIT_OUTPUT_FILE, "Running test suite with seed 0x%08" PRIx32 "...\n", runner.seed);
munit_test_runner_run(&runner);
tests_run = runner.report.successful + runner.report.failed + runner.report.errored; tests_total = tests_run + runner.report.skipped; if (tests_run == 0) { fprintf(stderr, "No tests run, %d (100%%) skipped.\n", runner.report.skipped); } else { fprintf(MUNIT_OUTPUT_FILE, "%d of %d (%0.0f%%) tests successful, %d (%0.0f%%) tests skipped.\n", runner.report.successful, tests_run, (((double) runner.report.successful) / ((double) tests_run)) * 100.0, runner.report.skipped, (((double) runner.report.skipped) / ((double) tests_total)) * 100.0); }
if (runner.report.failed == 0 && runner.report.errored == 0) { result = EXIT_SUCCESS; }
cleanup: free(runner.parameters); free((void*) runner.tests); return result; }
int munit_suite_main(const MunitSuite* suite, void* user_data, int argc, char* const argv[MUNIT_ARRAY_PARAM(argc)]) { return munit_suite_main_custom(suite, user_data, argc, argv, NULL); }
dqlite-1.16.7/test/raft/lib/munit.h000066400000000000000000000422131465252713400170710ustar00rootroot00000000000000/* µnit Testing Framework * Copyright (c) 2013-2017 Evan Nemerson * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, copy, * modify, merge, publish, distribute, sublicense, and/or sell copies * of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
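 *
 * [Editorial note] Example invocations of a munit-based test binary,
 * exercising the command-line options parsed by munit_suite_main_custom()
 * above (the binary name is hypothetical):
 *
 *   ./unit-test --seed 0xdeadbeef --iterations 10
 *   ./unit-test --param color red --log-visible debug --no-fork
 *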
*/ #if !defined(MUNIT_H) #define MUNIT_H #include #include #define MUNIT_VERSION(major, minor, revision) \ (((major) << 16) | ((minor) << 8) | (revision)) #define MUNIT_CURRENT_VERSION MUNIT_VERSION(0, 4, 1) #if defined(_MSC_VER) && (_MSC_VER < 1600) # define munit_int8_t __int8 # define munit_uint8_t unsigned __int8 # define munit_int16_t __int16 # define munit_uint16_t unsigned __int16 # define munit_int32_t __int32 # define munit_uint32_t unsigned __int32 # define munit_int64_t __int64 # define munit_uint64_t unsigned __int64 #else # include # define munit_int8_t int8_t # define munit_uint8_t uint8_t # define munit_int16_t int16_t # define munit_uint16_t uint16_t # define munit_int32_t int32_t # define munit_uint32_t uint32_t # define munit_int64_t int64_t # define munit_uint64_t uint64_t #endif #if defined(_MSC_VER) && (_MSC_VER < 1800) # if !defined(PRIi8) # define PRIi8 "i" # endif # if !defined(PRIi16) # define PRIi16 "i" # endif # if !defined(PRIi32) # define PRIi32 "i" # endif # if !defined(PRIi64) # define PRIi64 "I64i" # endif # if !defined(PRId8) # define PRId8 "d" # endif # if !defined(PRId16) # define PRId16 "d" # endif # if !defined(PRId32) # define PRId32 "d" # endif # if !defined(PRId64) # define PRId64 "I64d" # endif # if !defined(PRIx8) # define PRIx8 "x" # endif # if !defined(PRIx16) # define PRIx16 "x" # endif # if !defined(PRIx32) # define PRIx32 "x" # endif # if !defined(PRIx64) # define PRIx64 "I64x" # endif # if !defined(PRIu8) # define PRIu8 "u" # endif # if !defined(PRIu16) # define PRIu16 "u" # endif # if !defined(PRIu32) # define PRIu32 "u" # endif # if !defined(PRIu64) # define PRIu64 "I64u" # endif # if !defined(bool) # define bool int # endif # if !defined(true) # define true (!0) # endif # if !defined(false) # define false (!!0) # endif #else # include # include #endif #if defined(__cplusplus) extern "C" { #endif #if defined(__GNUC__) # define MUNIT_LIKELY(expr) (__builtin_expect ((expr), 1)) # define MUNIT_UNLIKELY(expr) (__builtin_expect ((expr), 0)) # define MUNIT_UNUSED __attribute__((__unused__)) #else # define MUNIT_LIKELY(expr) (expr) # define MUNIT_UNLIKELY(expr) (expr) # define MUNIT_UNUSED #endif #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && !defined(__PGI) # define MUNIT_ARRAY_PARAM(name) name #else # define MUNIT_ARRAY_PARAM(name) #endif #if !defined(_WIN32) # define MUNIT_SIZE_MODIFIER "z" # define MUNIT_CHAR_MODIFIER "hh" # define MUNIT_SHORT_MODIFIER "h" #else # if defined(_M_X64) || defined(__amd64__) # define MUNIT_SIZE_MODIFIER "I64" # else # define MUNIT_SIZE_MODIFIER "" # endif # define MUNIT_CHAR_MODIFIER "" # define MUNIT_SHORT_MODIFIER "" #endif #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L # define MUNIT_NO_RETURN _Noreturn #elif defined(__GNUC__) # define MUNIT_NO_RETURN __attribute__((__noreturn__)) #elif defined(_MSC_VER) # define MUNIT_NO_RETURN __declspec(noreturn) #else # define MUNIT_NO_RETURN #endif #if defined(_MSC_VER) && (_MSC_VER >= 1500) # define MUNIT__PUSH_DISABLE_MSVC_C4127 __pragma(warning(push)) __pragma(warning(disable:4127)) # define MUNIT__POP_DISABLE_MSVC_C4127 __pragma(warning(pop)) #else # define MUNIT__PUSH_DISABLE_MSVC_C4127 # define MUNIT__POP_DISABLE_MSVC_C4127 #endif typedef enum { MUNIT_LOG_DEBUG, MUNIT_LOG_INFO, MUNIT_LOG_WARNING, MUNIT_LOG_ERROR } MunitLogLevel; #if defined(__GNUC__) && !defined(__MINGW32__) # define MUNIT_PRINTF(string_index, first_to_check) __attribute__((format (printf, string_index, first_to_check))) #else # define MUNIT_PRINTF(string_index, 
first_to_check) #endif MUNIT_PRINTF(4, 5) void munit_logf_ex(MunitLogLevel level, const char* filename, int line, const char* format, ...); #define munit_logf(level, format, ...) \ munit_logf_ex(level, __FILE__, __LINE__, format, __VA_ARGS__) #define munit_log(level, msg) \ munit_logf(level, "%s", msg) MUNIT_NO_RETURN MUNIT_PRINTF(3, 4) void munit_errorf_ex(const char* filename, int line, const char* format, ...); #define munit_errorf(format, ...) \ munit_errorf_ex(__FILE__, __LINE__, format, __VA_ARGS__) #define munit_error(msg) \ munit_errorf("%s", msg) #define munit_assert(expr) \ do { \ if (!MUNIT_LIKELY(expr)) { \ munit_error("assertion failed: " #expr); \ } \ MUNIT__PUSH_DISABLE_MSVC_C4127 \ } while (0) \ MUNIT__POP_DISABLE_MSVC_C4127 #define munit_assert_true(expr) \ do { \ if (!MUNIT_LIKELY(expr)) { \ munit_error("assertion failed: " #expr " is not true"); \ } \ MUNIT__PUSH_DISABLE_MSVC_C4127 \ } while (0) \ MUNIT__POP_DISABLE_MSVC_C4127 #define munit_assert_false(expr) \ do { \ if (!MUNIT_LIKELY(!(expr))) { \ munit_error("assertion failed: " #expr " is not false"); \ } \ MUNIT__PUSH_DISABLE_MSVC_C4127 \ } while (0) \ MUNIT__POP_DISABLE_MSVC_C4127 #define munit_assert_type_full(prefix, suffix, T, fmt, a, op, b) \ do { \ T munit_tmp_a_ = (a); \ T munit_tmp_b_ = (b); \ if (!(munit_tmp_a_ op munit_tmp_b_)) { \ munit_errorf("assertion failed: %s %s %s (" prefix "%" fmt suffix " %s " prefix "%" fmt suffix ")", \ #a, #op, #b, munit_tmp_a_, #op, munit_tmp_b_); \ } \ MUNIT__PUSH_DISABLE_MSVC_C4127 \ } while (0) \ MUNIT__POP_DISABLE_MSVC_C4127 #define munit_assert_type(T, fmt, a, op, b) \ munit_assert_type_full("", "", T, fmt, a, op, b) #define munit_assert_char(a, op, b) \ munit_assert_type_full("'\\x", "'", char, "02" MUNIT_CHAR_MODIFIER "x", a, op, b) #define munit_assert_uchar(a, op, b) \ munit_assert_type_full("'\\x", "'", unsigned char, "02" MUNIT_CHAR_MODIFIER "x", a, op, b) #define munit_assert_short(a, op, b) \ munit_assert_type(short, MUNIT_SHORT_MODIFIER "d", a, op, b) #define munit_assert_ushort(a, op, b) \ munit_assert_type(unsigned short, MUNIT_SHORT_MODIFIER "u", a, op, b) #define munit_assert_int(a, op, b) \ munit_assert_type(int, "d", a, op, b) #define munit_assert_uint(a, op, b) \ munit_assert_type(unsigned int, "u", a, op, b) #define munit_assert_long(a, op, b) \ munit_assert_type(long int, "ld", a, op, b) #define munit_assert_ulong(a, op, b) \ munit_assert_type(unsigned long int, "lu", a, op, b) #define munit_assert_llong(a, op, b) \ munit_assert_type(long long int, "lld", a, op, b) #define munit_assert_ullong(a, op, b) \ munit_assert_type(unsigned long long int, "llu", a, op, b) #define munit_assert_size(a, op, b) \ munit_assert_type(size_t, MUNIT_SIZE_MODIFIER "u", a, op, b) #define munit_assert_float(a, op, b) \ munit_assert_type(float, "f", a, op, b) #define munit_assert_double(a, op, b) \ munit_assert_type(double, "g", a, op, b) #define munit_assert_ptr(a, op, b) \ munit_assert_type(const void*, "p", a, op, b) #define munit_assert_int8(a, op, b) \ munit_assert_type(munit_int8_t, PRIi8, a, op, b) #define munit_assert_uint8(a, op, b) \ munit_assert_type(munit_uint8_t, PRIu8, a, op, b) #define munit_assert_int16(a, op, b) \ munit_assert_type(munit_int16_t, PRIi16, a, op, b) #define munit_assert_uint16(a, op, b) \ munit_assert_type(munit_uint16_t, PRIu16, a, op, b) #define munit_assert_int32(a, op, b) \ munit_assert_type(munit_int32_t, PRIi32, a, op, b) #define munit_assert_uint32(a, op, b) \ munit_assert_type(munit_uint32_t, PRIu32, a, op, b) #define 
munit_assert_int64(a, op, b) \ munit_assert_type(munit_int64_t, PRIi64, a, op, b) #define munit_assert_uint64(a, op, b) \ munit_assert_type(munit_uint64_t, PRIu64, a, op, b) #define munit_assert_double_equal(a, b, precision) \ do { \ const double munit_tmp_a_ = (a); \ const double munit_tmp_b_ = (b); \ const double munit_tmp_diff_ = ((munit_tmp_a_ - munit_tmp_b_) < 0) ? \ -(munit_tmp_a_ - munit_tmp_b_) : \ (munit_tmp_a_ - munit_tmp_b_); \ if (MUNIT_UNLIKELY(munit_tmp_diff_ > 1e-##precision)) { \ munit_errorf("assertion failed: %s == %s (%0." #precision "g == %0." #precision "g)", \ #a, #b, munit_tmp_a_, munit_tmp_b_); \ } \ MUNIT__PUSH_DISABLE_MSVC_C4127 \ } while (0) \ MUNIT__POP_DISABLE_MSVC_C4127 #include #define munit_assert_string_equal(a, b) \ do { \ const char* munit_tmp_a_ = a; \ const char* munit_tmp_b_ = b; \ if (MUNIT_UNLIKELY(strcmp(munit_tmp_a_, munit_tmp_b_) != 0)) { \ munit_errorf("assertion failed: string %s == %s (\"%s\" == \"%s\")", \ #a, #b, munit_tmp_a_, munit_tmp_b_); \ } \ MUNIT__PUSH_DISABLE_MSVC_C4127 \ } while (0) \ MUNIT__POP_DISABLE_MSVC_C4127 #define munit_assert_string_not_equal(a, b) \ do { \ const char* munit_tmp_a_ = a; \ const char* munit_tmp_b_ = b; \ if (MUNIT_UNLIKELY(strcmp(munit_tmp_a_, munit_tmp_b_) == 0)) { \ munit_errorf("assertion failed: string %s != %s (\"%s\" == \"%s\")", \ #a, #b, munit_tmp_a_, munit_tmp_b_); \ } \ MUNIT__PUSH_DISABLE_MSVC_C4127 \ } while (0) \ MUNIT__POP_DISABLE_MSVC_C4127 #define munit_assert_memory_equal(size, a, b) \ do { \ const unsigned char* munit_tmp_a_ = (const unsigned char*) (a); \ const unsigned char* munit_tmp_b_ = (const unsigned char*) (b); \ const size_t munit_tmp_size_ = (size); \ if (MUNIT_UNLIKELY(memcmp(munit_tmp_a_, munit_tmp_b_, munit_tmp_size_)) != 0) { \ size_t munit_tmp_pos_; \ for (munit_tmp_pos_ = 0 ; munit_tmp_pos_ < munit_tmp_size_ ; munit_tmp_pos_++) { \ if (munit_tmp_a_[munit_tmp_pos_] != munit_tmp_b_[munit_tmp_pos_]) { \ munit_errorf("assertion failed: memory %s == %s, at offset %" MUNIT_SIZE_MODIFIER "u", \ #a, #b, munit_tmp_pos_); \ break; \ } \ } \ } \ MUNIT__PUSH_DISABLE_MSVC_C4127 \ } while (0) \ MUNIT__POP_DISABLE_MSVC_C4127 #define munit_assert_memory_not_equal(size, a, b) \ do { \ const unsigned char* munit_tmp_a_ = (const unsigned char*) (a); \ const unsigned char* munit_tmp_b_ = (const unsigned char*) (b); \ const size_t munit_tmp_size_ = (size); \ if (MUNIT_UNLIKELY(memcmp(munit_tmp_a_, munit_tmp_b_, munit_tmp_size_)) == 0) { \ munit_errorf("assertion failed: memory %s != %s (%zu bytes)", \ #a, #b, munit_tmp_size_); \ } \ MUNIT__PUSH_DISABLE_MSVC_C4127 \ } while (0) \ MUNIT__POP_DISABLE_MSVC_C4127 #define munit_assert_ptr_equal(a, b) \ munit_assert_ptr(a, ==, b) #define munit_assert_ptr_not_equal(a, b) \ munit_assert_ptr(a, !=, b) #define munit_assert_null(ptr) \ munit_assert_ptr(ptr, ==, NULL) #define munit_assert_not_null(ptr) \ munit_assert_ptr(ptr, !=, NULL) #define munit_assert_ptr_null(ptr) \ munit_assert_ptr(ptr, ==, NULL) #define munit_assert_ptr_not_null(ptr) \ munit_assert_ptr(ptr, !=, NULL) /*** Memory allocation ***/ void* munit_malloc_ex(const char* filename, int line, size_t size); #define munit_malloc(size) \ munit_malloc_ex(__FILE__, __LINE__, (size)) #define munit_new(type) \ ((type*) munit_malloc(sizeof(type))) #define munit_calloc(nmemb, size) \ munit_malloc((nmemb) * (size)) #define munit_newa(type, nmemb) \ ((type*) munit_calloc((nmemb), sizeof(type))) /*** Random number generation ***/ void munit_rand_seed(munit_uint32_t seed); munit_uint32_t 
munit_rand_uint32(void); int munit_rand_int_range(int min, int max); double munit_rand_double(void); void munit_rand_memory(size_t size, munit_uint8_t buffer[MUNIT_ARRAY_PARAM(size)]); /*** Tests and Suites ***/ typedef enum { /* Test successful */ MUNIT_OK, /* Test failed */ MUNIT_FAIL, /* Test was skipped */ MUNIT_SKIP, /* Test failed due to circumstances not intended to be tested * (things like network errors, invalid parameter value, failure to * allocate memory in the test harness, etc.). */ MUNIT_ERROR } MunitResult; typedef struct { char* name; char** values; } MunitParameterEnum; typedef struct { char* name; char* value; } MunitParameter; const char* munit_parameters_get(const MunitParameter params[], const char* key); typedef enum { MUNIT_TEST_OPTION_NONE = 0, MUNIT_TEST_OPTION_SINGLE_ITERATION = 1 << 0, MUNIT_TEST_OPTION_TODO = 1 << 1 } MunitTestOptions; typedef MunitResult (* MunitTestFunc)(const MunitParameter params[], void* user_data_or_fixture); typedef void* (* MunitTestSetup)(const MunitParameter params[], void* user_data); typedef void (* MunitTestTearDown)(void* fixture); typedef struct { char* name; MunitTestFunc test; MunitTestSetup setup; MunitTestTearDown tear_down; MunitTestOptions options; MunitParameterEnum* parameters; } MunitTest; typedef enum { MUNIT_SUITE_OPTION_NONE = 0 } MunitSuiteOptions; typedef struct MunitSuite_ MunitSuite; struct MunitSuite_ { char* prefix; MunitTest* tests; MunitSuite* suites; unsigned int iterations; MunitSuiteOptions options; }; int munit_suite_main(const MunitSuite* suite, void* user_data, int argc, char* const argv[MUNIT_ARRAY_PARAM(argc)]); /* Note: I'm not very happy with this API; it's likely to change if I * figure out something better. Suggestions welcome. */ typedef struct MunitArgument_ MunitArgument; struct MunitArgument_ { char* name; bool (* parse_argument)(const MunitSuite* suite, void* user_data, int* arg, int argc, char* const argv[MUNIT_ARRAY_PARAM(argc)]); void (* write_help)(const MunitArgument* argument, void* user_data); }; int munit_suite_main_custom(const MunitSuite* suite, void* user_data, int argc, char* const argv[MUNIT_ARRAY_PARAM(argc)], const MunitArgument arguments[]); #if defined(MUNIT_ENABLE_ASSERT_ALIASES) #define assert_true(expr) munit_assert_true(expr) #define assert_false(expr) munit_assert_false(expr) #define assert_char(a, op, b) munit_assert_char(a, op, b) #define assert_uchar(a, op, b) munit_assert_uchar(a, op, b) #define assert_short(a, op, b) munit_assert_short(a, op, b) #define assert_ushort(a, op, b) munit_assert_ushort(a, op, b) #define assert_int(a, op, b) munit_assert_int(a, op, b) #define assert_uint(a, op, b) munit_assert_uint(a, op, b) #define assert_long(a, op, b) munit_assert_long(a, op, b) #define assert_ulong(a, op, b) munit_assert_ulong(a, op, b) #define assert_llong(a, op, b) munit_assert_llong(a, op, b) #define assert_ullong(a, op, b) munit_assert_ullong(a, op, b) #define assert_size(a, op, b) munit_assert_size(a, op, b) #define assert_float(a, op, b) munit_assert_float(a, op, b) #define assert_double(a, op, b) munit_assert_double(a, op, b) #define assert_ptr(a, op, b) munit_assert_ptr(a, op, b) #define assert_int8(a, op, b) munit_assert_int8(a, op, b) #define assert_uint8(a, op, b) munit_assert_uint8(a, op, b) #define assert_int16(a, op, b) munit_assert_int16(a, op, b) #define assert_uint16(a, op, b) munit_assert_uint16(a, op, b) #define assert_int32(a, op, b) munit_assert_int32(a, op, b) #define assert_uint32(a, op, b) munit_assert_uint32(a, op, b) #define assert_int64(a, op, 
b) munit_assert_int64(a, op, b) #define assert_uint64(a, op, b) munit_assert_uint64(a, op, b) #define assert_double_equal(a, b, precision) munit_assert_double_equal(a, b, precision) #define assert_string_equal(a, b) munit_assert_string_equal(a, b) #define assert_string_not_equal(a, b) munit_assert_string_not_equal(a, b) #define assert_memory_equal(size, a, b) munit_assert_memory_equal(size, a, b) #define assert_memory_not_equal(size, a, b) munit_assert_memory_not_equal(size, a, b) #define assert_ptr_equal(a, b) munit_assert_ptr_equal(a, b) #define assert_ptr_not_equal(a, b) munit_assert_ptr_not_equal(a, b) #define assert_ptr_null(ptr) munit_assert_ptr_null(ptr) #define assert_ptr_not_null(ptr) munit_assert_ptr_not_null(ptr) #define assert_null(ptr) munit_assert_null(ptr) #define assert_not_null(ptr) munit_assert_not_null(ptr) #endif /* defined(MUNIT_ENABLE_ASSERT_ALIASES) */
#if defined(__cplusplus) } #endif
#endif /* !defined(MUNIT_H) */
#if defined(MUNIT_ENABLE_ASSERT_ALIASES) # if defined(assert) # undef assert # endif # define assert(expr) munit_assert(expr) #endif
dqlite-1.16.7/test/raft/lib/runner.h000066400000000000000000000136061465252713400172520ustar00rootroot00000000000000/* Convenience macros to reduce munit boilerplate. */ #ifndef TEST_RUNNER_H_ #define TEST_RUNNER_H_ #include <signal.h> #include "munit.h" #include "../../../src/tracing.h"
/* Top-level suites array declaration. * * These top-level suites hold all module-level child suites and must be defined * and then set as child suites of a root suite created at runtime by the test * runner's main(). This can be done using the RUNNER macro. */ extern MunitSuite _main_suites[]; extern int _main_suites_n;
/* Maximum number of test cases for each suite */ #define SUITE__CAP 128
/* Define the top-level suites array and the main() function of the test. */ #define RUNNER(NAME) \ MunitSuite _main_suites[SUITE__CAP]; \ int _main_suites_n = 0; \ \ int main(int argc, char *argv[MUNIT_ARRAY_PARAM(argc)]) \ { \ signal(SIGPIPE, SIG_IGN); \ dqliteTracingMaybeEnable(true); \ MunitSuite suite = {(char *)"", NULL, _main_suites, 1, 0}; \ return munit_suite_main(&suite, (void *)NAME, argc, argv); \ }
/* Declare and register a new test suite #S belonging to the file's test module. * * A test suite is a pair of static variables: * * static MunitTest _##S##_suites[SUITE__CAP] * static MunitTest _##S##_tests[SUITE__CAP] * * The tests and suites attributes of the next available MunitSuite slot in the * _module_suites array will be set to the suite's tests and suites arrays, and * the prefix attribute of the slot will be set to /S. */ #define SUITE(S) \ SUITE__DECLARE(S) \ SUITE__ADD_CHILD(main, #S, S)
/* Declare and register a new test. */ #define TEST(S, C, SETUP, TEAR_DOWN, OPTIONS, PARAMS) \ static MunitResult test_##S##_##C(const MunitParameter params[], \ void *data); \ TEST__ADD_TO_SUITE(S, C, SETUP, TEAR_DOWN, OPTIONS, PARAMS) \ static MunitResult test_##S##_##C( \ MUNIT_UNUSED const MunitParameter params[], MUNIT_UNUSED void *data)
#define SKIP_IF_NO_FIXTURE \ if (f == NULL) { \ return MUNIT_SKIP; \ }
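/* ------------------------------------------------------------------
 * [Editorial illustration] How the macros above fit together in this
 * test tree: RUNNER() lives in a main_*.c file (see main_core.c below)
 * while SUITE()/TEST() live in test_*.c files; they are combined into a
 * single sketch here only for brevity. The suite and test names are
 * hypothetical:
 * ------------------------------------------------------------------ */
#if 0
#include "../lib/runner.h"

RUNNER("example") /* defines main() and the root suite */

SUITE(math) /* registers the "/math" child suite */

TEST(math, addition, NULL, NULL, 0, NULL)
{
	munit_assert_int(1 + 1, ==, 2);
	return MUNIT_OK;
}
#endif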
/* Declare the MunitSuite[] and the MunitTest[] arrays that compose the test * suite identified by S. */ #define SUITE__DECLARE(S) \ static MunitSuite _##S##_suites[SUITE__CAP]; \ static MunitTest _##S##_tests[SUITE__CAP]; \ static MunitTestSetup _##S##_setup = NULL; \ static MunitTestTearDown _##S##_tear_down = NULL; \ static int _##S##_suites_n = 0; \ static int _##S##_tests_n = 0; \ __attribute__((constructor(101))) static void _##S##_init(void) \ { \ memset(_##S##_suites, 0, sizeof(_##S##_suites)); \ memset(_##S##_tests, 0, sizeof(_##S##_tests)); \ (void)_##S##_suites_n; \ (void)_##S##_tests_n; \ (void)_##S##_setup; \ (void)_##S##_tear_down; \ }
/* Set the tests and suites attributes of the next available slot of the * MunitSuite[] array of S1 to the MunitTest[] and MunitSuite[] arrays of S2, * using the given PREFIX. */ #define SUITE__ADD_CHILD(S1, PREFIX, S2) \ __attribute__((constructor(102))) static void _##S1##_##S2##_init(void) \ { \ int n = _##S1##_suites_n; \ _##S1##_suites[n].prefix = PREFIX; \ _##S1##_suites[n].tests = _##S2##_tests; \ _##S1##_suites[n].suites = _##S2##_suites; \ _##S1##_suites[n].iterations = 0; \ _##S1##_suites[n].options = 0; \ _##S1##_suites_n = n + 1; \ }
/* Add a test case to the MunitTest[] array of suite S. */ #define TEST__ADD_TO_SUITE(S, C, SETUP, TEAR_DOWN, OPTIONS, PARAMS) \ __attribute__((constructor(103))) static void _##S##_tests_##C##_init( \ void) \ { \ MunitTest *tests = _##S##_tests; \ int n = _##S##_tests_n; \ TEST__SET_IN_ARRAY(tests, n, "/" #C, test_##S##_##C, SETUP, TEAR_DOWN, \ OPTIONS, PARAMS); \ _##S##_tests_n = n + 1; \ }
/* Set the values of the I'th test case slot in the given test array */ #define TEST__SET_IN_ARRAY(TESTS, I, NAME, FUNC, SETUP, TEAR_DOWN, OPTIONS, \ PARAMS) \ TESTS[I].name = NAME; \ TESTS[I].test = FUNC; \ TESTS[I].setup = SETUP; \ TESTS[I].tear_down = TEAR_DOWN; \ TESTS[I].options = OPTIONS; \ TESTS[I].parameters = PARAMS
#endif /* TEST_RUNNER_H_ */
dqlite-1.16.7/test/raft/lib/snapshot.h000066400000000000000000000020521465252713400175710ustar00rootroot00000000000000/** * Raft snapshot test helpers. */ #ifndef TEST_SNAPSHOT_H #define TEST_SNAPSHOT_H
#include "../../../src/raft.h" #include "../../../src/raft/configuration.h"
/** * Allocate and create the given snapshot, using the given @LAST_INDEX, * @LAST_TERM, the given @CONF, and generating an FSM snapshot using @X and @Y. */ #define CREATE_SNAPSHOT(SNAPSHOT, LAST_INDEX, LAST_TERM, CONF, CONF_INDEX, X, \ Y) \ SNAPSHOT = raft_malloc(sizeof *SNAPSHOT); \ munit_assert_ptr_not_null(SNAPSHOT); \ SNAPSHOT->index = LAST_INDEX; \ SNAPSHOT->term = LAST_TERM; \ SNAPSHOT->configuration = CONF; \ SNAPSHOT->configuration_index = CONF_INDEX; \ FsmEncodeSnapshot(X, Y, &SNAPSHOT->bufs, &SNAPSHOT->n_bufs)
#endif /* TEST_SNAPSHOT_H */
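/* ------------------------------------------------------------------
 * [Editorial illustration] A sketch of how CREATE_SNAPSHOT above might
 * be used from a test, assuming a previously populated configuration;
 * the concrete index/term/FSM values are hypothetical:
 * ------------------------------------------------------------------ */
#if 0
struct raft_snapshot *snapshot;
struct raft_configuration configuration;
/* ... populate `configuration` with servers first ... */
CREATE_SNAPSHOT(snapshot, 8 /* last index */, 3 /* last term */,
		configuration, 1 /* conf index */, 123 /* x */, 456 /* y */);
#endif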
dqlite-1.16.7/test/raft/lib/tcp.c000066400000000000000000000130201465252713400165100ustar00rootroot00000000000000#include "tcp.h" #include <arpa/inet.h> #include <errno.h> #include <string.h> #include <unistd.h>
void TcpServerInit(struct TcpServer *s) { struct sockaddr_in addr; socklen_t size = sizeof addr; int rv; /* Initialize the socket address structure. */ memset(&addr, 0, size); addr.sin_family = AF_INET; addr.sin_addr.s_addr = inet_addr("127.0.0.1"); addr.sin_port = 0; /* Get a random free port */ /* Create the server socket. */ s->socket = socket(AF_INET, SOCK_STREAM, 0); if (s->socket == -1) { munit_errorf("tcp server: socket(): %s", strerror(errno)); } /* Bind the socket. */ rv = bind(s->socket, (struct sockaddr *)&addr, size); if (rv == -1) { munit_errorf("tcp server: bind(): %s", strerror(errno)); } /* Start listening. */ rv = listen(s->socket, 1); if (rv == -1) { munit_errorf("tcp server: listen(): %s", strerror(errno)); } /* Get the actual address assigned by the kernel and save it back in the * relevant field. */ rv = getsockname(s->socket, (struct sockaddr *)&addr, &size); if (rv != 0) { munit_errorf("tcp: getsockname(): %s", strerror(errno)); } s->port = htons(addr.sin_port); sprintf(s->address, "127.0.0.1:%d", s->port); }
void TcpServerClose(struct TcpServer *s) { int rv; if (s->socket == -1) { return; } rv = close(s->socket); if (rv == -1) { munit_errorf("tcp server: close(): %s", strerror(errno)); } }
int TcpServerAccept(struct TcpServer *s) { int socket; struct sockaddr_in address; socklen_t size; size = sizeof(address); socket = accept(s->socket, (struct sockaddr *)&address, &size); if (socket < 0) { munit_errorf("tcp server: accept(): %s", strerror(errno)); } return socket; }
void TcpServerStop(struct TcpServer *s) { int rv; rv = close(s->socket); if (rv == -1) { munit_errorf("tcp server: close(): %s", strerror(errno)); } s->socket = -1; }
void test_tcp_setup(const MunitParameter params[], struct test_tcp *t) { (void)params; t->server.socket = -1; t->client.socket = -1; }
void test_tcp_tear_down(struct test_tcp *t) { int rv; if (t->server.socket != -1) { rv = close(t->server.socket); if (rv == -1) { munit_errorf("tcp: close(): %s", strerror(errno)); } } if (t->client.socket != -1) { rv = close(t->client.socket); if (rv == -1) { munit_errorf("tcp: close(): %s", strerror(errno)); } } }
void test_tcp_listen(struct test_tcp *t) { struct sockaddr_in addr; socklen_t size = sizeof addr; int rv; /* Initialize the socket address structure. */ memset(&addr, 0, size); addr.sin_family = AF_INET; addr.sin_addr.s_addr = inet_addr("127.0.0.1"); addr.sin_port = 0; /* Get a random free port */ /* Create the server socket. */ t->server.socket = socket(AF_INET, SOCK_STREAM, 0); if (t->server.socket == -1) { munit_errorf("tcp: socket(): %s", strerror(errno)); } /* Bind the socket. */ rv = bind(t->server.socket, (struct sockaddr *)&addr, size); if (rv == -1) { munit_errorf("tcp: bind(): %s", strerror(errno)); } /* Start listening. */ rv = listen(t->server.socket, 1); if (rv == -1) { munit_errorf("tcp: listen(): %s", strerror(errno)); } /* Get the actual address assigned by the kernel and save it back in * the relevant test_socket__server field (pointed to by address). */ rv = getsockname(t->server.socket, (struct sockaddr *)&addr, &size); if (rv != 0) { munit_errorf("tcp: getsockname(): %s", strerror(errno)); } sprintf(t->server.address, "127.0.0.1:%d", htons(addr.sin_port)); }
const char *test_tcp_address(struct test_tcp *t) { return t->server.address; }
void test_tcp_connect(struct test_tcp *t, int port) { struct sockaddr_in addr; int rv; /* Create the client socket. */ t->client.socket = socket(AF_INET, SOCK_STREAM, 0); if (t->client.socket == -1) { munit_errorf("tcp: socket(): %s", strerror(errno)); } /* Initialize the socket address structure.
*/ memset(&addr, 0, sizeof addr); addr.sin_family = AF_INET; addr.sin_addr.s_addr = inet_addr("127.0.0.1"); addr.sin_port = htons(port); /* Connect */ rv = connect(t->client.socket, (struct sockaddr *)&addr, sizeof addr); if (rv == -1) { munit_errorf("tcp: connect(): %s", strerror(errno)); } } void test_tcp_close(struct test_tcp *t) { int rv; rv = close(t->client.socket); if (rv == -1) { munit_errorf("tcp: close(): %s", strerror(errno)); } t->client.socket = -1; } void test_tcp_stop(struct test_tcp *t) { int rv; rv = close(t->server.socket); if (rv == -1) { munit_errorf("tcp: close(): %s", strerror(errno)); } t->server.socket = -1; } void test_tcp_send(struct test_tcp *t, const void *buf, int len) { int rv; rv = write(t->client.socket, buf, len); if (rv == -1) { munit_errorf("tcp: write(): %s", strerror(errno)); } if (rv != len) { munit_errorf("tcp: write(): only %d bytes written", rv); } } int test_tcp_accept(struct test_tcp *t) { int socket; struct sockaddr_in address; socklen_t size; size = sizeof(address); socket = accept(t->server.socket, (struct sockaddr *)&address, &size); if (socket < 0) { munit_errorf("tcp: accept(): %s", strerror(errno)); } return socket; } dqlite-1.16.7/test/raft/lib/tcp.h000066400000000000000000000053441465252713400165270ustar00rootroot00000000000000/* Test TCP utilities. * * This module sports helpers to create server or client sockets, and * send/receive data through them. */ #ifndef TEST_TCP_H #define TEST_TCP_H #include "munit.h" /* Macro helpers. */ #define FIXTURE_TCP_SERVER struct TcpServer server #define SETUP_TCP_SERVER TcpServerInit(&f->server) #define TEAR_DOWN_TCP_SERVER TcpServerClose(&f->server) #define TCP_SERVER_STOP TcpServerStop(&f->server) #define TCP_SERVER_PORT f->server.port #define TCP_SERVER_ADDRESS f->server.address #define FIXTURE_TCP struct test_tcp tcp #define SETUP_TCP test_tcp_setup(params, &f->tcp) #define TEAR_DOWN_TCP test_tcp_tear_down(&f->tcp) #define TCP_CLIENT_CONNECT(PORT) test_tcp_connect(&f->tcp, PORT) #define TCP_CLIENT_SEND(BUF, N) test_tcp_send(&f->tcp, BUF, N) #define TCP_CLIENT_CLOSE test_tcp_close(&f->tcp) struct TcpServer { int socket; /* Socket listening to incoming connections */ int port; char address[128]; /* IPv4 address of the server, with port */ }; void TcpServerInit(struct TcpServer *s); void TcpServerClose(struct TcpServer *s); /* Accept inbound client connection and return the relevant socket. */ int TcpServerAccept(struct TcpServer *s); /* Close the server socket. */ void TcpServerStop(struct TcpServer *s); struct TcpClient { int socket; /* Socket connected to a server. */ }; void TcpClientInit(struct TcpClient *s); void TcpClientClose(struct TcpClient *s); /* Object that can be used to setup and control a TCP server and/or client. */ struct test_tcp { struct { int socket; /* Socket listening to incoming connections */ char address[128]; /* IPv4 address of the server, with port */ } server; struct { int socket; /* Socket connected to another host */ } client; }; /** * Bind the server socket of the given test TCP host to localhost and start * listening to it. */ void test_tcp_setup(const MunitParameter params[], struct test_tcp *t); void test_tcp_tear_down(struct test_tcp *t); /** * Start listening to a random free port on localhost. */ void test_tcp_listen(struct test_tcp *t); /** * Return the address of the server socket created with @test_tcp_listen. */ const char *test_tcp_address(struct test_tcp *t); /** * Connect the client socket to the given port on localhost. 
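 *
 * [Editorial note] Taken together, a typical client/server exchange with
 * these helpers looks like: test_tcp_setup() + test_tcp_listen(), then
 * test_tcp_connect() with the port taken from test_tcp_address(), then
 * test_tcp_send()/test_tcp_accept(), and finally test_tcp_close() and
 * test_tcp_tear_down().
 *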
*/ void test_tcp_connect(struct test_tcp *t, int port); /** * Close the client socket. */ void test_tcp_close(struct test_tcp *t); /** * Send data using the client socket. */ void test_tcp_send(struct test_tcp *t, const void *buf, int len); /** * Accept inbound client connection and return the relevant socket. */ int test_tcp_accept(struct test_tcp *t); /** * Close the server socket. */ void test_tcp_stop(struct test_tcp *t); #endif /* TEST_TCP_H */ dqlite-1.16.7/test/raft/lib/uv.h000066400000000000000000000041511465252713400163660ustar00rootroot00000000000000/* Helpers around the libuv-based implementation of the raft_io interface. */ #ifndef TEST_UV_H #define TEST_UV_H #include "../../../src/raft.h" #include "dir.h" #include "heap.h" #include "loop.h" #define FIXTURE_UV_TRANSPORT struct raft_uv_transport transport #define SETUP_UV_TRANSPORT \ do { \ int rv_; \ f->transport.version = 1; \ rv_ = raft_uv_tcp_init(&f->transport, &f->loop); \ munit_assert_int(rv_, ==, 0); \ } while (0) #define TEAR_DOWN_UV_TRANSPORT raft_uv_tcp_close(&f->transport) #define FIXTURE_UV_DEPS \ FIXTURE_DIR; \ FIXTURE_HEAP; \ FIXTURE_LOOP; \ FIXTURE_UV_TRANSPORT #define SETUP_UV_DEPS \ SET_UP_DIR; \ SET_UP_HEAP; \ SETUP_LOOP; \ SETUP_UV_TRANSPORT #define TEAR_DOWN_UV_DEPS \ TEAR_DOWN_UV_TRANSPORT; \ TEAR_DOWN_LOOP; \ TEAR_DOWN_HEAP; \ TEAR_DOWN_DIR #define FIXTURE_UV struct raft_io io #define SETUP_UV \ do { \ int rv_; \ rv_ = raft_uv_init(&f->io, &f->loop, f->dir, &f->transport); \ munit_assert_int(rv_, ==, 0); \ raft_uv_set_auto_recovery(&f->io, false); \ rv_ = f->io.init(&f->io, 1, "127.0.0.1:9001"); \ munit_assert_int(rv_, ==, 0); \ } while (0) MUNIT_UNUSED static void uvCloseCb(struct raft_io *io) { bool *closed = io->data; *closed = true; } #define TEAR_DOWN_UV \ do { \ bool _closed = false; \ f->io.data = &_closed; \ f->io.close(&f->io, uvCloseCb); \ LOOP_RUN_UNTIL(&_closed); \ raft_uv_close(&f->io); \ } while (0) #endif /* TEST_UV_H */ dqlite-1.16.7/test/raft/unit/000077500000000000000000000000001465252713400157735ustar00rootroot00000000000000dqlite-1.16.7/test/raft/unit/main_core.c000066400000000000000000000000531465252713400200710ustar00rootroot00000000000000#include "../lib/runner.h" RUNNER("core") dqlite-1.16.7/test/raft/unit/main_uv.c000066400000000000000000000000511465252713400175710ustar00rootroot00000000000000#include "../lib/runner.h" RUNNER("uv") dqlite-1.16.7/test/raft/unit/test_byte.c000066400000000000000000000121631465252713400201440ustar00rootroot00000000000000#include #include #include "../../../src/raft/byte.h" #include "../lib/runner.h" /****************************************************************************** * * Helper macros * *****************************************************************************/ #define CRC32(VALUE) byteCrc32(&(VALUE), sizeof VALUE, 0) /****************************************************************************** * * byteCrc32 * *****************************************************************************/ SUITE(byteCrc32) /* The same data produces the same sum. */ TEST(byteCrc32, valid, NULL, NULL, 0, NULL) { uint64_t value1 = 123456789; uint64_t value2 = 123456789; munit_assert_int(CRC32(value1), ==, CRC32(value2)); return MUNIT_OK; } /* Different data produces a different sum. 
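 *
 * [Editorial note] CRC32() above expands to byteCrc32(&value, sizeof value,
 * 0); the last argument is the initial checksum value, so a checksum can be
 * chained across several buffers, e.g. crc = byteCrc32(&b, sizeof b,
 * byteCrc32(&a, sizeof a, 0)). This is an illustrative sketch, not a call
 * made by these tests.
 *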
*/ TEST(byteCrc32, invalid, NULL, NULL, 0, NULL) { uint64_t value1 = 123456789; uint64_t value2 = 123466789; munit_assert_int(CRC32(value1), !=, CRC32(value2)); return MUNIT_OK; } /****************************************************************************** * * Convert to little endian representation (least significant byte first). * *****************************************************************************/ SUITE(byteFlip) /* Convert a 32-bit number. */ TEST(byteFlip, 32, NULL, NULL, 0, NULL) { uint32_t value; unsigned i; value = byteFlip32(0x03020100); for (i = 0; i < 4; i++) { munit_assert_int(*((uint8_t *)&value + i), ==, i); } return MUNIT_OK; } /* Convert a 64-bit number. */ TEST(byteFlip, 64, NULL, NULL, 0, NULL) { uint64_t value; unsigned i; value = byteFlip64(0x0706050403020100); for (i = 0; i < 8; i++) { munit_assert_int(*((uint8_t *)&value + i), ==, i); } return MUNIT_OK; } /****************************************************************************** * * byteGetString * *****************************************************************************/ SUITE(byteGetString) TEST(byteGetString, success, NULL, NULL, 0, NULL) { uint8_t buf[] = {'h', 'e', 'l', 'l', 'o', 0}; const void *cursor = buf; munit_assert_string_equal(byteGetString(&cursor, sizeof buf), "hello"); munit_assert_ptr_equal(cursor, buf + sizeof buf); return MUNIT_OK; } TEST(byteGetString, malformed, NULL, NULL, 0, NULL) { uint8_t buf[] = {'h', 'e', 'l', 'l', 'o', 'w'}; const void *cursor = buf; munit_assert_ptr_equal(byteGetString(&cursor, sizeof buf), NULL); munit_assert_ptr_equal(cursor, buf); return MUNIT_OK; } /****************************************************************************** * * byteGet64 * *****************************************************************************/ SUITE(byteGet64) TEST(byteGet64, success, NULL, NULL, 0, NULL) { uint8_t *buf = munit_malloc(sizeof(uint64_t) * 2); void *cursor1 = buf + 1; const void *cursor2 = buf + 1; bytePut64(&cursor1, 1); munit_assert_int(byteGet64(&cursor2), ==, 1); free(buf); return MUNIT_OK; } /****************************************************************************** * * byteSha1 * *****************************************************************************/ /* Assert that the 20 bytes contained in VALUE match the given DIGEST * hexadecimal representation. 
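 *
 * [Editorial note] The byteSha1 helpers exercised below follow the usual
 * incremental-hash pattern: byteSha1Init(&sha), any number of
 * byteSha1Update(&sha, buf, len) calls, then byteSha1Digest(&sha, value).
 * A zero-length update leaves the digest unchanged (see abcWithZeroLen),
 * and updating with several buffers in sequence yields the same digest as
 * hashing their concatenation, which is what the compression tests'
 * sha1() helper relies on.
 *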
*/ #define ASSERT_SHA1(VALUE, DIGEST) \ do { \ char _digest[41]; \ unsigned _i; \ for (_i = 0; _i < 20; _i++) { \ unsigned _j = _i * 2; \ sprintf(&_digest[_j], "%.2x", value[_i]); \ _digest[_j] = toupper(_digest[_j]); \ _digest[_j + 1] = toupper(_digest[_j + 1]); \ } \ _digest[40] = '\0'; \ munit_assert_string_equal(_digest, DIGEST); \ } while (0) SUITE(byteSha1) TEST(byteSha1, abc, NULL, NULL, 0, NULL) { struct byteSha1 sha1; uint8_t text[] = "abc"; uint8_t value[20]; byteSha1Init(&sha1); byteSha1Update(&sha1, text, sizeof text - 1); byteSha1Digest(&sha1, value); ASSERT_SHA1(value, "A9993E364706816ABA3E25717850C26C9CD0D89D"); return MUNIT_OK; } TEST(byteSha1, abcWithZeroLen, NULL, NULL, 0, NULL) { struct byteSha1 sha1; uint8_t text[] = "abc"; uint8_t garbage[] = "garbage"; uint8_t value[20]; byteSha1Init(&sha1); byteSha1Update(&sha1, text, sizeof text - 1); /* Update with 0 length buffer doesn't change digest */ byteSha1Update(&sha1, garbage, 0); byteSha1Digest(&sha1, value); ASSERT_SHA1(value, "A9993E364706816ABA3E25717850C26C9CD0D89D"); return MUNIT_OK; } TEST(byteSha1, abcbd, NULL, NULL, 0, NULL) { struct byteSha1 sha1; uint8_t text[] = "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"; uint8_t value[20]; byteSha1Init(&sha1); byteSha1Update(&sha1, text, sizeof text - 1); byteSha1Digest(&sha1, value); ASSERT_SHA1(value, "84983E441C3BD26EBAAE4AA1F95129E5E54670F1"); return MUNIT_OK; } dqlite-1.16.7/test/raft/unit/test_compress.c000066400000000000000000000237571465252713400210470ustar00rootroot00000000000000#include "../../../src/raft/byte.h" #include "../../../src/raft/compress.h" #include "../lib/munit.h" #include "../lib/runner.h" #include #ifdef LZ4_AVAILABLE #include #endif SUITE(Compress) struct raft_buffer getBufWithRandom(size_t len) { struct raft_buffer buf = {0}; buf.len = len; buf.base = munit_malloc(buf.len); if (len != 0) { munit_assert_ptr_not_null(buf.base); } size_t offset = 0; /* Write as many random ints in buf as possible */ for (size_t n = buf.len / sizeof(int); n > 0; n--) { *((int *)(buf.base) + offset) = rand(); offset += 1; } /* Fill the remaining bytes */ size_t rem = buf.len % sizeof(int); /* Offset will now be used in char* arithmetic */ offset *= sizeof(int); if (rem) { int r_int = rand(); for (unsigned i = 0; i < rem; i++) { *((char *)buf.base + offset) = *((char *)&r_int + i); offset++; } } munit_assert_ulong(offset, ==, buf.len); return buf; } struct raft_buffer getBufWithNonRandom(size_t len) { struct raft_buffer buf = {0}; buf.len = len; buf.base = munit_malloc(buf.len); if (len != 0) { munit_assert_ptr_not_null(buf.base); } memset(buf.base, 0xAC, buf.len); return buf; } #ifdef LZ4_AVAILABLE static void sha1(struct raft_buffer bufs[], unsigned n_bufs, uint8_t value[20]) { struct byteSha1 sha; byteSha1Init(&sha); for (unsigned i = 0; i < n_bufs; i++) { byteSha1Update(&sha, (const uint8_t *)bufs[i].base, (uint32_t)bufs[i].len); } byteSha1Digest(&sha, value); } TEST(Compress, compressDecompressZeroLength, NULL, NULL, 0, NULL) { char errmsg[RAFT_ERRMSG_BUF_SIZE] = {0}; struct raft_buffer bufs1[2] = {{NULL, 0}, {(void *)0xDEADBEEF, 0}}; /* 0 length */ struct raft_buffer bufs2[2] = {{(void *)0xDEADBEEF, 0}, {NULL, 0}}; /* 0 length */ struct raft_buffer compressed = {0}; munit_assert_int(Compress(&bufs1[0], 1, &compressed, errmsg), ==, RAFT_INVALID); munit_assert_int(Compress(&bufs1[1], 1, &compressed, errmsg), ==, RAFT_INVALID); munit_assert_int(Compress(bufs1, 2, &compressed, errmsg), ==, RAFT_INVALID); munit_assert_int(Compress(bufs2, 2, &compressed, 
errmsg), ==, RAFT_INVALID); return MUNIT_OK; } static char *len_one_params[] = { /* 16B 1KB 64KB 4MB 128MB */ "16", "1024", "65536", "4194304", "134217728", /* Around Blocksize*/ "65516", "65517", "65518", "65521", "65535", "65537", "65551", "65555", "65556", /* Ugly lengths */ "0", "1", "9", "123450", "1337", "6655111", NULL}; static MunitParameterEnum random_one_params[] = { {"len_one", len_one_params}, {NULL, NULL}, }; TEST(Compress, compressDecompressRandomOne, NULL, NULL, 0, random_one_params) { char errmsg[RAFT_ERRMSG_BUF_SIZE] = {0}; struct raft_buffer compressed = {0}; struct raft_buffer decompressed = {0}; uint8_t sha1_virgin[20] = {0}; uint8_t sha1_decompressed[20] = {1}; /* Fill a buffer with random data */ size_t len = strtoul(munit_parameters_get(params, "len_one"), NULL, 0); if (len == 0) { return MUNIT_SKIP; } struct raft_buffer buf = getBufWithRandom(len); /* Assert that after compression and decompression the data is unchanged */ sha1(&buf, 1, sha1_virgin); munit_assert_int(Compress(&buf, 1, &compressed, errmsg), ==, 0); free(buf.base); munit_assert_true(IsCompressed(compressed.base, compressed.len)); munit_assert_int(Decompress(compressed, &decompressed, errmsg), ==, 0); munit_assert_ulong(decompressed.len, ==, len); sha1(&decompressed, 1, sha1_decompressed); munit_assert_int(memcmp(sha1_virgin, sha1_decompressed, 20), ==, 0); raft_free(compressed.base); raft_free(decompressed.base); return MUNIT_OK; } static char *len_nonrandom_one_params[] = { #if !defined(__LP64__) && \ (defined(__arm__) || defined(__i386__) || defined(__mips__)) /* 4KB 64KB 4MB 1GB INT_MAX (larger allocations fail on 32-bit archs */ "4096", "65536", "4194304", "1073741824", "2147483647", #else /* 4KB 64KB 4MB 1GB 2GB + 200MB */ "4096", "65536", "4194304", "1073741824", "2357198848", #endif /* Around Blocksize*/ "65516", "65517", "65518", "65521", "65535", "65537", "65551", "65555", "65556", /* Ugly lengths */ "0", "993450", "31337", "83883825", NULL}; static MunitParameterEnum nonrandom_one_params[] = { {"len_one", len_nonrandom_one_params}, {NULL, NULL}, }; TEST(Compress, compressDecompressNonRandomOne, NULL, NULL, 0, nonrandom_one_params) { char errmsg[RAFT_ERRMSG_BUF_SIZE] = {0}; struct raft_buffer compressed = {0}; struct raft_buffer decompressed = {0}; uint8_t sha1_virgin[20] = {0}; uint8_t sha1_decompressed[20] = {1}; /* Fill a buffer with non-random data */ size_t len = strtoul(munit_parameters_get(params, "len_one"), NULL, 0); if (len == 0) { return MUNIT_SKIP; } struct raft_buffer buf = getBufWithNonRandom(len); /* Assert that after compression and decompression the data is unchanged and * that the compressed data is actually smaller */ sha1(&buf, 1, sha1_virgin); munit_assert_int(Compress(&buf, 1, &compressed, errmsg), ==, 0); free(buf.base); munit_assert_true(IsCompressed(compressed.base, compressed.len)); if (len > 0) { munit_assert_ulong(compressed.len, <, buf.len); } munit_assert_int(Decompress(compressed, &decompressed, errmsg), ==, 0); munit_assert_ulong(decompressed.len, ==, len); sha1(&decompressed, 1, sha1_decompressed); munit_assert_int(memcmp(sha1_virgin, sha1_decompressed, 20), ==, 0); raft_free(compressed.base); raft_free(decompressed.base); return MUNIT_OK; } static char *len_two_params[] = {"4194304", "13373", "66", "0", NULL}; static MunitParameterEnum random_two_params[] = { {"len_one", len_one_params}, {"len_two", len_two_params}, {NULL, NULL}, }; TEST(Compress, compressDecompressRandomTwo, NULL, NULL, 0, random_two_params) { char errmsg[RAFT_ERRMSG_BUF_SIZE] = {0}; struct 
raft_buffer compressed = {0}; struct raft_buffer decompressed = {0}; uint8_t sha1_virgin[20] = {0}; uint8_t sha1_single[20] = {0}; uint8_t sha1_decompressed[20] = {1}; /* Fill two buffers with random data */ size_t len1 = strtoul(munit_parameters_get(params, "len_one"), NULL, 0); size_t len2 = strtoul(munit_parameters_get(params, "len_two"), NULL, 0); if (len1 + len2 == 0) { return MUNIT_SKIP; } struct raft_buffer buf1 = getBufWithRandom(len1); struct raft_buffer buf2 = getBufWithRandom(len2); struct raft_buffer bufs[2] = {buf1, buf2}; /* If one of the buffers is empty ensure data is identical to single buffer * case. */ if (len1 == 0) { sha1(&buf2, 1, sha1_single); } else if (len2 == 0) { sha1(&buf1, 1, sha1_single); } /* Assert that after compression and decompression the data is unchanged */ sha1(bufs, 2, sha1_virgin); munit_assert_int(Compress(bufs, 2, &compressed, errmsg), ==, 0); free(buf1.base); free(buf2.base); munit_assert_true(IsCompressed(compressed.base, compressed.len)); munit_assert_int(Decompress(compressed, &decompressed, errmsg), ==, 0); munit_assert_ulong(decompressed.len, ==, buf1.len + buf2.len); sha1(&decompressed, 1, sha1_decompressed); munit_assert_int(memcmp(sha1_virgin, sha1_decompressed, 20), ==, 0); if (len1 == 0 || len2 == 0) { munit_assert_int(memcmp(sha1_single, sha1_virgin, 20), ==, 0); munit_assert_int(memcmp(sha1_single, sha1_decompressed, 20), ==, 0); } raft_free(compressed.base); raft_free(decompressed.base); return MUNIT_OK; } TEST(Compress, compressDecompressCorruption, NULL, NULL, 0, NULL) { char errmsg[RAFT_ERRMSG_BUF_SIZE] = {0}; struct raft_buffer compressed = {0}; struct raft_buffer decompressed = {0}; /* Fill a buffer with random data */ size_t len = 2048; struct raft_buffer buf = getBufWithRandom(len); munit_assert_int(Compress(&buf, 1, &compressed, errmsg), ==, 0); munit_assert_true(IsCompressed(compressed.base, compressed.len)); /* Corrupt a data byte after the header */ munit_assert_ulong(LZ4F_HEADER_SIZE_MAX_RAFT, <, compressed.len); ((char *)compressed.base)[LZ4F_HEADER_SIZE_MAX_RAFT] += 1; munit_assert_int(Decompress(compressed, &decompressed, errmsg), !=, 0); munit_assert_string_equal(errmsg, "LZ4F_decompress ERROR_contentChecksum_invalid"); munit_assert_ptr_null(decompressed.base); raft_free(compressed.base); free(buf.base); return MUNIT_OK; } #else TEST(Compress, lz4Disabled, NULL, NULL, 0, NULL) { char errmsg[RAFT_ERRMSG_BUF_SIZE] = {0}; struct raft_buffer compressed = {0}; /* Fill a buffer with random data */ size_t len = 2048; struct raft_buffer buf = getBufWithRandom(len); munit_assert_int(Compress(&buf, 1, &compressed, errmsg), ==, RAFT_INVALID); munit_assert_ptr_null(compressed.base); free(buf.base); return MUNIT_OK; } #endif /* LZ4_AVAILABLE */ static const char LZ4_MAGIC[4] = {0x04, 0x22, 0x4d, 0x18}; TEST(Compress, isCompressedTooSmall, NULL, NULL, 0, NULL) { munit_assert_false(IsCompressed(&LZ4_MAGIC[1], sizeof(LZ4_MAGIC) - 1)); return MUNIT_OK; } TEST(Compress, isCompressedNull, NULL, NULL, 0, NULL) { munit_assert_false(IsCompressed(NULL, sizeof(LZ4_MAGIC))); return MUNIT_OK; } TEST(Compress, isCompressed, NULL, NULL, 0, NULL) { munit_assert_true(IsCompressed(LZ4_MAGIC, sizeof(LZ4_MAGIC))); return MUNIT_OK; } TEST(Compress, notCompressed, NULL, NULL, 0, NULL) { char not_compressed[4] = {0x18, 0x4d, 0x22, 0x04}; munit_assert_false(IsCompressed(not_compressed, sizeof(not_compressed))); return MUNIT_OK; }
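/* Illustrative sketch, not part of the original suite: the basic
 * Compress/Decompress round-trip that the tests above exercise. The helper
 * name is hypothetical; it assumes LZ4_AVAILABLE is defined and that `data`
 * points to `len` readable bytes. */
#ifdef LZ4_AVAILABLE
MUNIT_UNUSED static void exampleCompressRoundTrip(const void *data, size_t len)
{
	char errmsg[RAFT_ERRMSG_BUF_SIZE] = {0};
	struct raft_buffer in = {(void *)data, len};
	struct raft_buffer compressed = {0};
	struct raft_buffer decompressed = {0};

	/* Compress takes an array of input buffers, here just one. */
	munit_assert_int(Compress(&in, 1, &compressed, errmsg), ==, 0);

	/* The output starts with the LZ4 frame magic bytes, so IsCompressed
	 * recognizes it. */
	munit_assert_true(IsCompressed(compressed.base, compressed.len));

	/* Decompress restores the original byte stream into one buffer. */
	munit_assert_int(Decompress(compressed, &decompressed, errmsg), ==, 0);
	munit_assert_ulong(decompressed.len, ==, len);
	munit_assert_int(memcmp(decompressed.base, data, len), ==, 0);

	raft_free(compressed.base);
	raft_free(decompressed.base);
}
#endif /* LZ4_AVAILABLE */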
dqlite-1.16.7/test/raft/unit/test_configuration.c000066400000000000000000000507741465252713400220620ustar00rootroot00000000000000#include "../../../src/raft/byte.h" #include "../../../src/raft/configuration.h" #include "../lib/heap.h" #include "../lib/runner.h" /****************************************************************************** * * Fixture * *****************************************************************************/ struct fixture { FIXTURE_HEAP; struct raft_configuration configuration; }; static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) { struct fixture *f = munit_malloc(sizeof *f); SET_UP_HEAP; configurationInit(&f->configuration); return f; } static void tearDownNoClose(void *data) { struct fixture *f = data; TEAR_DOWN_HEAP; free(f); } static void tearDown(void *data) { struct fixture *f = data; configurationClose(&f->configuration); tearDownNoClose(data); } /****************************************************************************** * * Helper macros * *****************************************************************************/ /* Accessors */ #define VOTER_COUNT configurationVoterCount(&f->configuration) #define INDEX_OF(ID) configurationIndexOf(&f->configuration, ID) #define INDEX_OF_VOTER(ID) configurationIndexOfVoter(&f->configuration, ID) #define GET(ID) configurationGet(&f->configuration, ID) /* Add a server to the fixture's configuration. */ #define ADD_RV(ID, ADDRESS, ROLE) \ configurationAdd(&f->configuration, ID, ADDRESS, ROLE) #define ADD(...) munit_assert_int(ADD_RV(__VA_ARGS__), ==, 0) #define ADD_ERROR(RV, ...) munit_assert_int(ADD_RV(__VA_ARGS__), ==, RV) /* Remove a server from the fixture's configuration */ #define REMOVE_RV(ID) configurationRemove(&f->configuration, ID) #define REMOVE(...) munit_assert_int(REMOVE_RV(__VA_ARGS__), ==, 0) #define REMOVE_ERROR(RV, ...) munit_assert_int(REMOVE_RV(__VA_ARGS__), ==, RV) /* Copy the fixture's configuration into the given one. */ #define COPY_RV(CONF) configurationCopy(&f->configuration, CONF) #define COPY(...) munit_assert_int(COPY_RV(__VA_ARGS__), ==, 0) #define COPY_ERROR(RV, ...) munit_assert_int(COPY_RV(__VA_ARGS__), ==, RV) /* Encode the fixture's configuration into the given buffer. */ #define ENCODE_RV(BUF) configurationEncode(&f->configuration, BUF) #define ENCODE(...) munit_assert_int(ENCODE_RV(__VA_ARGS__), ==, 0) #define ENCODE_ERROR(RV, ...) munit_assert_int(ENCODE_RV(__VA_ARGS__), ==, RV) /* Decode the given buffer into the fixture's configuration. */ #define DECODE_RV(BUF) configurationDecode(BUF, &f->configuration) #define DECODE(...) munit_assert_int(DECODE_RV(__VA_ARGS__), ==, 0) #define DECODE_ERROR(RV, ...) munit_assert_int(DECODE_RV(__VA_ARGS__), ==, RV) /****************************************************************************** * * Assertions * *****************************************************************************/ /* Assert that the fixture's configuration has n servers. */ #define ASSERT_N(N) \ { \ munit_assert_int(f->configuration.n, ==, N); \ if (N == 0) { \ munit_assert_ptr_null(f->configuration.servers); \ } else { \ munit_assert_ptr_not_null(f->configuration.servers); \ } \ } /* Assert that the attributes of the I'th server in the fixture's configuration * match the given values. 
*/ #define ASSERT_SERVER(I, ID, ADDRESS, ROLE) \ { \ struct raft_server *server; \ munit_assert_int(I, <, f->configuration.n); \ server = &f->configuration.servers[I]; \ munit_assert_int(server->id, ==, ID); \ munit_assert_string_equal(server->address, ADDRESS); \ munit_assert_int(server->role, ==, ROLE); \ } /****************************************************************************** * * configurationVoterCount * *****************************************************************************/ SUITE(configurationVoterCount) /* All servers are voting. */ TEST(configurationVoterCount, all_voters, setUp, tearDown, 0, NULL) { struct fixture *f = data; ADD(1, "192.168.1.1:666", RAFT_VOTER); ADD(2, "192.168.1.2:666", RAFT_VOTER); munit_assert_int(VOTER_COUNT, ==, 2); return MUNIT_OK; } /* Return only voting servers. */ TEST(configurationVoterCount, filter, setUp, tearDown, 0, NULL) { struct fixture *f = data; ADD(1, "192.168.1.1:666", RAFT_VOTER); ADD(2, "192.168.1.2:666", RAFT_STANDBY); munit_assert_int(VOTER_COUNT, ==, 1); return MUNIT_OK; } /****************************************************************************** * * configurationIndexOf * *****************************************************************************/ SUITE(configurationIndexOf) /* If a matching server is found, its index is returned. */ TEST(configurationIndexOf, match, setUp, tearDown, 0, NULL) { struct fixture *f = data; ADD(1, "192.168.1.1:666", RAFT_VOTER); ADD(2, "192.168.1.2:666", RAFT_STANDBY); munit_assert_int(INDEX_OF(2), ==, 1); return MUNIT_OK; } /* If no matching server is found, the length of the configuration is * returned. */ TEST(configurationIndexOf, no_match, setUp, tearDown, 0, NULL) { struct fixture *f = data; ADD(1, "127.0.0.1:666", RAFT_VOTER); munit_assert_int(INDEX_OF(3), ==, f->configuration.n); return MUNIT_OK; } /****************************************************************************** * * configurationIndexOfVoter * *****************************************************************************/ SUITE(configurationIndexOfVoter) /* The index of the matching voting server (counting voting servers only) is * returned. */ TEST(configurationIndexOfVoter, match, setUp, tearDown, 0, NULL) { struct fixture *f = data; ADD(1, "192.168.1.1:666", RAFT_STANDBY); ADD(2, "192.168.1.2:666", RAFT_VOTER); ADD(3, "192.168.1.3:666", RAFT_VOTER); munit_assert_int(INDEX_OF_VOTER(3), ==, 1); return MUNIT_OK; } /* If no matching server is found, the length of the configuration is * returned. */ TEST(configurationIndexOfVoter, no_match, setUp, tearDown, 0, NULL) { struct fixture *f = data; ADD(1, "192.168.1.1:666", RAFT_VOTER); munit_assert_int(INDEX_OF_VOTER(3), ==, 1); return MUNIT_OK; } /* If the server exists but is non-voting, the length of the configuration is * returned. */ TEST(configurationIndexOfVoter, non_voting, setUp, tearDown, 0, NULL) { struct fixture *f = data; ADD(1, "192.168.1.1:666", RAFT_STANDBY); munit_assert_int(INDEX_OF_VOTER(1), ==, 1); return MUNIT_OK; } /****************************************************************************** * * configurationGet * *****************************************************************************/ SUITE(configurationGet) /* If a matching server is found, it's returned.
*/ TEST(configurationGet, match, setUp, tearDown, 0, NULL) { struct fixture *f = data; const struct raft_server *server; ADD(1, "192.168.1.1:666", RAFT_VOTER); ADD(2, "192.168.1.2:666", RAFT_STANDBY); server = GET(2); munit_assert_ptr_not_null(server); munit_assert_int(server->id, ==, 2); munit_assert_string_equal(server->address, "192.168.1.2:666"); return MUNIT_OK; } /* If no matching server is found, NULL is returned. */ TEST(configurationGet, no_match, setUp, tearDown, 0, NULL) { struct fixture *f = data; ADD(1, "127.0.0.1:666", RAFT_VOTER); munit_assert_ptr_null(GET(3)); return MUNIT_OK; } /****************************************************************************** * * configurationCopy * *****************************************************************************/ SUITE(configurationCopy) /* Copy a configuration containing two servers */ TEST(configurationCopy, two, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_configuration configuration; ADD(1, "192.168.1.1:666", RAFT_STANDBY); ADD(2, "192.168.1.2:666", RAFT_VOTER); COPY(&configuration); munit_assert_int(configuration.n, ==, 2); munit_assert_int(configuration.servers[0].id, ==, 1); munit_assert_int(configuration.servers[1].id, ==, 2); configurationClose(&configuration); return MUNIT_OK; } static char *copy_oom_heap_fault_delay[] = {"0", "1", "2", NULL}; static char *copy_oom_heap_fault_repeat[] = {"1", NULL}; static MunitParameterEnum copy_oom_params[] = { {TEST_HEAP_FAULT_DELAY, copy_oom_heap_fault_delay}, {TEST_HEAP_FAULT_REPEAT, copy_oom_heap_fault_repeat}, {NULL, NULL}, }; /* Out of memory */ TEST(configurationCopy, oom, setUp, tearDown, 0, copy_oom_params) { struct fixture *f = data; struct raft_configuration configuration; ADD(1, "192.168.1.1:666", RAFT_STANDBY); ADD(2, "192.168.1.2:666", RAFT_VOTER); HEAP_FAULT_ENABLE; COPY_ERROR(RAFT_NOMEM, &configuration); return MUNIT_OK; } /****************************************************************************** * * raft_configuration_add * *****************************************************************************/ SUITE(configurationAdd) /* Add a server to the configuration. */ TEST(configurationAdd, one, setUp, tearDown, 0, NULL) { struct fixture *f = data; ADD(1, "127.0.0.1:666", RAFT_VOTER); ASSERT_N(1); ASSERT_SERVER(0, 1, "127.0.0.1:666", RAFT_VOTER); return MUNIT_OK; } /* Add two servers to the configuration. */ TEST(configurationAdd, two, setUp, tearDown, 0, NULL) { struct fixture *f = data; ADD(1, "127.0.0.1:666", RAFT_VOTER); ADD(2, "192.168.1.1:666", RAFT_STANDBY); ASSERT_N(2); ASSERT_SERVER(0, 1, "127.0.0.1:666", RAFT_VOTER); ASSERT_SERVER(1, 2, "192.168.1.1:666", RAFT_STANDBY); return MUNIT_OK; } /* Add a server with an ID which is already in use. */ TEST(configurationAdd, duplicateId, setUp, tearDown, 0, NULL) { struct fixture *f = data; ADD(1, "127.0.0.1:666", RAFT_VOTER); ADD_ERROR(RAFT_DUPLICATEID, 1, "192.168.1.1:666", RAFT_STANDBY); return MUNIT_OK; } /* Add a server with an address which is already in use. */ TEST(configurationAdd, duplicateAddress, setUp, tearDown, 0, NULL) { struct fixture *f = data; ADD(1, "127.0.0.1:666", RAFT_VOTER); ADD_ERROR(RAFT_DUPLICATEADDRESS, 2, "127.0.0.1:666", RAFT_STANDBY); return MUNIT_OK; } /* Add a server with an invalid role. 
*/ TEST(configurationAdd, invalidRole, setUp, tearDown, 0, NULL) { struct fixture *f = data; ADD_ERROR(RAFT_BADROLE, 2, "127.0.0.1:666", 666); return MUNIT_OK; } static char *add_oom_heap_fault_delay[] = {"0", "1", NULL}; static char *add_oom_heap_fault_repeat[] = {"1", NULL}; static MunitParameterEnum add_oom_params[] = { {TEST_HEAP_FAULT_DELAY, add_oom_heap_fault_delay}, {TEST_HEAP_FAULT_REPEAT, add_oom_heap_fault_repeat}, {NULL, NULL}, }; /* Out of memory. */ TEST(configurationAdd, oom, setUp, tearDown, 0, add_oom_params) { struct fixture *f = data; HeapFaultEnable(&f->heap); ADD_ERROR(RAFT_NOMEM, 1, "127.0.0.1:666", RAFT_VOTER); munit_assert_null(f->configuration.servers); return MUNIT_OK; } /****************************************************************************** * * configurationRemove * *****************************************************************************/ SUITE(configurationRemove) /* Remove the last and only server. */ TEST(configurationRemove, last, setUp, tearDown, 0, NULL) { struct fixture *f = data; ADD(1, "127.0.0.1:666", RAFT_VOTER); REMOVE(1); ASSERT_N(0); return MUNIT_OK; } /* Remove the first server. */ TEST(configurationRemove, first, setUp, tearDown, 0, NULL) { struct fixture *f = data; ADD(1, "127.0.0.1:666", RAFT_VOTER); ADD(2, "192.168.1.1:666", RAFT_STANDBY); REMOVE(1); ASSERT_N(1); ASSERT_SERVER(0, 2, "192.168.1.1:666", RAFT_STANDBY); return MUNIT_OK; } /* Remove a server in the middle. */ TEST(configurationRemove, middle, setUp, tearDown, 0, NULL) { struct fixture *f = data; ADD(1, "127.0.0.1:666", RAFT_VOTER); ADD(2, "192.168.1.1:666", RAFT_STANDBY); ADD(3, "10.0.1.1:666", RAFT_VOTER); REMOVE(2); ASSERT_N(2); ASSERT_SERVER(0, 1, "127.0.0.1:666", RAFT_VOTER); ASSERT_SERVER(1, 3, "10.0.1.1:666", RAFT_VOTER); return MUNIT_OK; } /* Attempts to remove a server with an unknown ID result in an error. */ TEST(configurationRemove, unknownId, setUp, tearDown, 0, NULL) { struct fixture *f = data; REMOVE_ERROR(RAFT_BADID, 1); return MUNIT_OK; } /* Out of memory. */ TEST(configurationRemove, oom, setUp, tearDown, 0, NULL) { struct fixture *f = data; ADD(1, "127.0.0.1:666", RAFT_VOTER); ADD(2, "192.168.1.1:666", RAFT_STANDBY); HeapFaultConfig(&f->heap, 0, 1); HeapFaultEnable(&f->heap); REMOVE_ERROR(RAFT_NOMEM, 1); return MUNIT_OK; } /****************************************************************************** * * configurationEncode * *****************************************************************************/ SUITE(configurationEncode) /* Encode a configuration with one server. */ TEST(configurationEncode, one_server, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_buffer buf; size_t len; const void *cursor; const char *address = "127.0.0.1:666"; ADD(1, address, RAFT_VOTER); ENCODE(&buf); len = 1 + 8 + /* Version and n of servers */ 8 + strlen(address) + 1; /* Server */ len = bytePad64(len); munit_assert_int(buf.len, ==, len); cursor = buf.base; munit_assert_int(byteGet8(&cursor), ==, 1); munit_assert_int(byteGet64(&cursor), ==, 1); munit_assert_int(byteGet64(&cursor), ==, 1); munit_assert_string_equal(byteGetString(&cursor, strlen(address) + 1), address); munit_assert_int(byteGet8(&cursor), ==, RAFT_VOTER); raft_free(buf.base); return MUNIT_OK; } /* Encode a configuration with two servers. 
*/ TEST(configurationEncode, two_servers, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_buffer buf; size_t len; const void *cursor; const char *address1 = "127.0.0.1:666"; const char *address2 = "192.168.1.1:666"; ADD(1, address1, RAFT_STANDBY); ADD(2, address2, RAFT_VOTER); ENCODE(&buf); len = 1 + 8 + /* Version and n of servers */ 8 + strlen(address1) + 1 + 1 + /* Server 1 */ 8 + strlen(address2) + 1 + 1; /* Server 2 */ len = bytePad64(len); munit_assert_int(buf.len, ==, len); cursor = buf.base; munit_assert_int(byteGet8(&cursor), ==, 1); munit_assert_int(byteGet64(&cursor), ==, 2); munit_assert_int(byteGet64(&cursor), ==, 1); munit_assert_string_equal(byteGetString(&cursor, strlen(address1) + 1), address1); munit_assert_int(byteGet8(&cursor), ==, RAFT_STANDBY); munit_assert_int(byteGet64(&cursor), ==, 2); munit_assert_string_equal(byteGetString(&cursor, strlen(address2) + 1), address2); munit_assert_int(byteGet8(&cursor), ==, RAFT_VOTER); raft_free(buf.base); return MUNIT_OK; } /* Out of memory. */ TEST(configurationEncode, oom, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_buffer buf; HeapFaultConfig(&f->heap, 2, 1); HeapFaultEnable(&f->heap); ADD(1, "127.0.0.1:666", RAFT_VOTER); ENCODE_ERROR(RAFT_NOMEM, &buf); return MUNIT_OK; } /****************************************************************************** * * configurationDecode * *****************************************************************************/ SUITE(configurationDecode) /* Decode a payload encoding a configuration with one server. */ TEST(configurationDecode, one_server, setUp, tearDown, 0, NULL) { struct fixture *f = data; uint8_t bytes[] = {1, /* Version */ 1, 0, 0, 0, 0, 0, 0, 0, /* Number of servers */ 5, 0, 0, 0, 0, 0, 0, 0, /* Server ID */ 'x', '.', 'y', 0, /* Server address */ 1}; /* Role code */ struct raft_buffer buf; buf.base = bytes; buf.len = sizeof bytes; DECODE(&buf); ASSERT_N(1); ASSERT_SERVER(0, 5, "x.y", RAFT_VOTER); return MUNIT_OK; } /* Decode a payload encoding a configuration with two servers; the decoded * size is the size of a raft_server array plus the length of the addresses. */ TEST(configurationDecode, two_servers, setUp, tearDown, 0, NULL) { struct fixture *f = data; uint8_t bytes[] = {1, /* Version */ 2, 0, 0, 0, 0, 0, 0, 0, /* Number of servers */ 5, 0, 0, 0, 0, 0, 0, 0, /* Server ID */ 'x', '.', 'y', 0, /* Server address */ 1, /* Role code */ 3, 0, 0, 0, 0, 0, 0, 0, /* Server ID */ '1', '9', '2', '.', '2', 0, /* Server address */ 0}; /* Role code */ struct raft_buffer buf; buf.base = bytes; buf.len = sizeof bytes; DECODE(&buf); ASSERT_N(2); ASSERT_SERVER(0, 5, "x.y", RAFT_VOTER); ASSERT_SERVER(1, 3, "192.2", RAFT_STANDBY); return MUNIT_OK; } static char *decode_oom_heap_fault_delay[] = {"0", "1", "2", "3", NULL}; static char *decode_oom_heap_fault_repeat[] = {"1", NULL}; static MunitParameterEnum decode_oom_params[] = { {TEST_HEAP_FAULT_DELAY, decode_oom_heap_fault_delay}, {TEST_HEAP_FAULT_REPEAT, decode_oom_heap_fault_repeat}, {NULL, NULL}, }; /* Not enough memory for creating the decoded configuration object.
*/ TEST(configurationDecode, oom, setUp, tearDownNoClose, 0, decode_oom_params) { struct fixture *f = data; uint8_t bytes[] = {1, /* Version */ 2, 0, 0, 0, 0, 0, 0, 0, /* Number of servers */ 5, 0, 0, 0, 0, 0, 0, 0, /* Server ID */ 'x', '.', 'y', 0, /* Server address */ 1, /* Role code */ 3, 0, 0, 0, 0, 0, 0, 0, /* Server ID */ 'z', '.', 'w', 0, /* Server address */ 0}; /* Role code */ struct raft_buffer buf; HEAP_FAULT_ENABLE; buf.base = bytes; buf.len = sizeof bytes; DECODE_ERROR(RAFT_NOMEM, &buf); return MUNIT_OK; } /* If the encoding version is wrong, an error is returned. */ TEST(configurationDecode, badVersion, setUp, tearDownNoClose, 0, NULL) { struct fixture *f = data; uint8_t bytes = 127; struct raft_buffer buf; buf.base = &bytes; buf.len = 1; DECODE_ERROR(RAFT_MALFORMED, &buf); return MUNIT_OK; } /* The address of a server is not a nul-terminated string. */ TEST(configurationDecode, badAddress, setUp, tearDownNoClose, 0, NULL) { struct fixture *f = data; uint8_t bytes[] = {1, /* Version */ 1, 0, 0, 0, 0, 0, 0, 0, /* Number of servers */ 5, 0, 0, 0, 0, 0, 0, 0, /* Server ID */ 'x', '.', 'y', /* Server address */ 1}; /* Voting flag */ struct raft_buffer buf; buf.base = bytes; buf.len = sizeof bytes; DECODE_ERROR(RAFT_MALFORMED, &buf); return MUNIT_OK; } /* The encoded configuration is invalid because it has a duplicated server * ID. In that case RAFT_MALFORMED is returned. */ TEST(configurationDecode, duplicatedID, setUp, tearDownNoClose, 0, NULL) { struct fixture *f = data; uint8_t bytes[] = {1, /* Version */ 2, 0, 0, 0, 0, 0, 0, 0, /* Number of servers */ 5, 0, 0, 0, 0, 0, 0, 0, /* Server ID */ 'x', '.', 'y', 0, /* Server address */ 1, /* Role code */ 5, 0, 0, 0, 0, 0, 0, 0, /* Server ID */ 'z', '.', 'w', 0, /* Server address */ 0}; /* Role code */ struct raft_buffer buf; buf.base = bytes; buf.len = sizeof bytes; DECODE_ERROR(RAFT_MALFORMED, &buf); return MUNIT_OK; } dqlite-1.16.7/test/raft/unit/test_err.c000066400000000000000000000051411465252713400177670ustar00rootroot00000000000000#include #include #include "../../../src/raft/err.h" #include "../lib/heap.h" #include "../lib/runner.h" /* An error message which is 249 characters. */ #define LONG_ERRMSG \ "boom boom boom boom boom boom boom boom boom boom boom boom boom boom " \ "boom boom boom boom boom boom boom boom boom boom boom boom boom boom " \ "boom boom boom boom boom boom boom boom boom boom boom boom boom boom " \ "boom boom boom boom boom boom boom boom" /****************************************************************************** * * ErrMsgPrintf * *****************************************************************************/ SUITE(ErrMsgPrintf) /* The format string has no parameters. */ TEST(ErrMsgPrintf, noParams, NULL, NULL, 0, NULL) { char errmsg[RAFT_ERRMSG_BUF_SIZE]; ErrMsgPrintf(errmsg, "boom"); munit_assert_string_equal(errmsg, "boom"); return MUNIT_OK; } /* The format string has parameters. */ TEST(ErrMsgPrintf, params, NULL, NULL, 0, NULL) { char errmsg[RAFT_ERRMSG_BUF_SIZE]; ErrMsgPrintf(errmsg, "boom %d", 123); munit_assert_string_equal(errmsg, "boom 123"); return MUNIT_OK; } /****************************************************************************** * * ErrMsgWrapf * *****************************************************************************/ SUITE(ErrMsgWrapf) /* The wrapping format string has no parameters.
*/ TEST(ErrMsgWrapf, noParams, NULL, NULL, 0, NULL) { char errmsg[RAFT_ERRMSG_BUF_SIZE]; ErrMsgPrintf(errmsg, "boom"); ErrMsgWrapf(errmsg, "no luck"); munit_assert_string_equal(errmsg, "no luck: boom"); return MUNIT_OK; } /* The wrapping format string has parameters. */ TEST(ErrMsgWrapf, params, NULL, NULL, 0, NULL) { char errmsg[RAFT_ERRMSG_BUF_SIZE]; ErrMsgPrintf(errmsg, "boom"); ErrMsgWrapf(errmsg, "no luck, %s", "joe"); munit_assert_string_equal(errmsg, "no luck, joe: boom"); return MUNIT_OK; } /* The wrapped error message gets partially truncated. */ TEST(ErrMsgWrapf, partialTruncate, NULL, NULL, 0, NULL) { char errmsg[RAFT_ERRMSG_BUF_SIZE]; ErrMsgPrintf(errmsg, "no luck"); ErrMsgWrapf(errmsg, LONG_ERRMSG); munit_assert_string_equal(errmsg, LONG_ERRMSG ": no l"); return MUNIT_OK; } /* The wrapped error message gets entirely truncated. */ TEST(ErrMsgWrapf, fullTruncate, NULL, NULL, 0, NULL) { char errmsg[RAFT_ERRMSG_BUF_SIZE]; ErrMsgPrintf(errmsg, "no luck"); ErrMsgWrapf(errmsg, LONG_ERRMSG " boom"); munit_assert_string_equal(errmsg, LONG_ERRMSG " boom"); return MUNIT_OK; } dqlite-1.16.7/test/raft/unit/test_flags.c000066400000000000000000000055611465252713400203010ustar00rootroot00000000000000#include "../../../src/raft/flags.h" #include "../lib/runner.h" /****************************************************************************** * * flags * *****************************************************************************/ SUITE(flags) TEST(flags, empty, NULL, NULL, 0, NULL) { raft_flags flags = 0; for (int i = 0; i < 64; i++) { munit_assert_false(flagsIsSet(flags, ((raft_flags)1) << i)); } return MUNIT_OK; } TEST(flags, setClear, NULL, NULL, 0, NULL) { raft_flags flags = 0; raft_flags flag = 0; for (int i = 0; i < 64; i++) { flag = ((raft_flags)1) << i; flags = flagsSet(flags, flag); munit_assert_true(flagsIsSet(flags, flag)); flags = flagsClear(flags, flag); munit_assert_false(flagsIsSet(flags, flag)); munit_assert_true(flags == 0); } return MUNIT_OK; } TEST(flags, setMultipleClearMultiple, NULL, NULL, 0, NULL) { raft_flags in = 0; raft_flags out; raft_flags flags = (raft_flags)(1 | 1 << 4 | 1 << 13 | (raft_flags)1 << 40 | (raft_flags)1 << 63); out = flagsSet(in, flags); /* clang-format off */ int positions[64] = { 1, 0, 0, 0, 1, 0, 0, 0, // 0th and 4th 0, 0, 0, 0, 0, 1, 0, 0, // 13th 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, // 40th 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, // 63th }; /* clang-format on */ for (unsigned i = 0; i < 64; i++) { if (positions[i]) { munit_assert_true(flagsIsSet(out, (raft_flags)1 << i)); } else { munit_assert_false(flagsIsSet(out, (raft_flags)1 << i)); } } out = flagsClear(out, flags); munit_assert_true(out == 0); return MUNIT_OK; } TEST(flags, setMultipleClearSingle, NULL, NULL, 0, NULL) { raft_flags in = 0; raft_flags out; raft_flags flags = (raft_flags)(1 << 3 | 1 << 5 | 1 << 18 | (raft_flags)1 << 32 | (raft_flags)1 << 35); out = flagsSet(in, flags); /* clang-format off */ int positions[64] = { 0, 0, 0, 1, 0, 1, 0, 0, // 3rd and 5th 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, // 18th 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, // 32rd 35th 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; /* clang-format on */ for (unsigned i = 0; i < 64; i++) { if (positions[i]) { munit_assert_true(flagsIsSet(out, (raft_flags)1 << i)); } else { munit_assert_false(flagsIsSet(out, (raft_flags)1 << i)); } } out = flagsClear(out, (raft_flags)1 << 32); munit_assert_true( out == 
(raft_flags)(1 << 3 | 1 << 5 | 1 << 18 | (raft_flags)1 << 35)); return MUNIT_OK; } dqlite-1.16.7/test/raft/unit/test_log.c000066400000000000000000001175431465252713400177720ustar00rootroot00000000000000#include "../../../src/raft/configuration.h" #include "../../../src/raft/log.h" #include "../lib/heap.h" #include "../lib/runner.h" /****************************************************************************** * * Fixture * *****************************************************************************/ struct fixture { FIXTURE_HEAP; struct raft_log *log; }; /****************************************************************************** * * Helper macros * *****************************************************************************/ /* Accessors */ #define NUM_ENTRIES logNumEntries(f->log) #define LAST_INDEX logLastIndex(f->log) #define TERM_OF(INDEX) logTermOf(f->log, INDEX) #define LAST_TERM logLastTerm(f->log) #define GET(INDEX) logGet(f->log, INDEX) /* Append one command entry with the given term and a hard-coded payload. */ #define APPEND(TERM) \ { \ struct raft_buffer buf_; \ int rv_; \ buf_.base = raft_malloc(8); \ buf_.len = 8; \ strcpy(buf_.base, "hello"); \ rv_ = logAppend(f->log, TERM, RAFT_COMMAND, buf_, (struct raft_entry_local_data){}, true, NULL); \ munit_assert_int(rv_, ==, 0); \ } /* Same as APPEND, but repeated N times. */ #define APPEND_MANY(TERM, N) \ { \ int i_; \ for (i_ = 0; i_ < N; i_++) { \ APPEND(TERM); \ } \ } /* Invoke append and assert that it returns the given error. */ #define APPEND_ERROR(TERM, RV) \ { \ struct raft_buffer buf_; \ int rv_; \ buf_.base = raft_malloc(8); \ buf_.len = 8; \ rv_ = logAppend(f->log, TERM, RAFT_COMMAND, buf_, (struct raft_entry_local_data){}, true, NULL); \ munit_assert_int(rv_, ==, RV); \ raft_free(buf_.base); \ } /* Append N entries all belonging to the same batch. Each entry will have 64-bit * payload set to i * 1000, where i is the index of the entry in the batch. */ #define APPEND_BATCH(N) \ { \ void *batch; \ size_t offset; \ int i; \ batch = raft_malloc(8 * N); \ munit_assert_ptr_not_null(batch); \ offset = 0; \ for (i = 0; i < N; i++) { \ struct raft_buffer buf; \ int rv; \ buf.base = (uint8_t *)batch + offset; \ buf.len = 8; \ *(uint64_t *)buf.base = i * 1000; \ rv = logAppend(f->log, 1, RAFT_COMMAND, buf, (struct raft_entry_local_data){}, true, batch); \ munit_assert_int(rv, ==, 0); \ offset += 8; \ } \ } #define ACQUIRE(INDEX) \ { \ int rv2; \ rv2 = logAcquire(f->log, INDEX, &entries, &n); \ munit_assert_int(rv2, ==, 0); \ } #define RELEASE(INDEX) logRelease(f->log, INDEX, entries, n); #define TRUNCATE(N) logTruncate(f->log, N) #define SNAPSHOT(INDEX, TRAILING) logSnapshot(f->log, INDEX, TRAILING) #define RESTORE(INDEX, TERM) logRestore(f->log, INDEX, TERM) /****************************************************************************** * * Set up an empty configuration. 
* *****************************************************************************/ static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) { struct fixture *f = munit_malloc(sizeof *f); SET_UP_HEAP; f->log = logInit(); if (f->log == NULL) { munit_assert_true(false); } return f; } static void tearDown(void *data) { struct fixture *f = data; logClose(f->log); TEAR_DOWN_HEAP; free(f); } /****************************************************************************** * * Assertions * *****************************************************************************/ /* Assert the state of the fixture's log in terms of size, front/back indexes, * offset and number of entries. */ #define ASSERT(SIZE, FRONT, BACK, OFFSET, N) \ munit_assert_int(f->log->size, ==, SIZE); \ munit_assert_int(f->log->front, ==, FRONT); \ munit_assert_int(f->log->back, ==, BACK); \ munit_assert_int(f->log->offset, ==, OFFSET); \ munit_assert_int(logNumEntries(f->log), ==, N) /* Assert the last index and term of the most recent snapshot. */ #define ASSERT_SNAPSHOT(INDEX, TERM) \ munit_assert_int(f->log->snapshot.last_index, ==, INDEX); \ munit_assert_int(f->log->snapshot.last_term, ==, TERM) /* Assert that the term of entry at INDEX equals TERM. */ #define ASSERT_TERM_OF(INDEX, TERM) \ { \ const struct raft_entry *entry; \ entry = logGet(f->log, INDEX); \ munit_assert_ptr_not_null(entry); \ munit_assert_int(entry->term, ==, TERM); \ } /* Assert that the number of outstanding references for the entry at INDEX * equals COUNT. */ #define ASSERT_REFCOUNT(INDEX, COUNT) \ { \ size_t i; \ munit_assert_ptr_not_null(f->log->refs); \ for (i = 0; i < f->log->refs_size; i++) { \ if (f->log->refs[i].index == INDEX) { \ munit_assert_int(f->log->refs[i].count, ==, COUNT); \ break; \ } \ } \ if (i == f->log->refs_size) { \ munit_errorf("no refcount found for entry with index %d", \ (int)INDEX); \ } \ } /****************************************************************************** * * logNumEntries * *****************************************************************************/ SUITE(logNumEntries) /* If the log is empty, the return value is zero. */ TEST(logNumEntries, empty, setUp, tearDown, 0, NULL) { struct fixture *f = data; munit_assert_int(NUM_ENTRIES, ==, 0); return MUNIT_OK; } /* The log is not wrapped. */ TEST(logNumEntries, not_wrapped, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND(1 /* term */); munit_assert_int(NUM_ENTRIES, ==, 1); return MUNIT_OK; } /* The log is wrapped. */ TEST(logNumEntries, wrapped, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND_MANY(1 /* term */, 5 /* n entries */); SNAPSHOT(4 /* last_index */, 1 /* trailing */); APPEND_MANY(1 /* term */, 2 /* n entries */); munit_assert_int(NUM_ENTRIES, ==, 4); return MUNIT_OK; } /* The log has an offset and is empty. */ TEST(logNumEntries, offset, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND_MANY(1 /* term */, 5 /* n entries */); SNAPSHOT(5 /* last index */, 0 /* trailing */); munit_assert_int(NUM_ENTRIES, ==, 0); return MUNIT_OK; } /* The log has an offset and is not empty. 
*/ TEST(logNumEntries, offsetNotEmpty, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND_MANY(1 /* term */, 5 /* n entries */); SNAPSHOT(4 /* last index */, 2 /* trailing */); munit_assert_int(NUM_ENTRIES, ==, 3); return MUNIT_OK; } /****************************************************************************** * * logLastIndex * *****************************************************************************/ SUITE(logLastIndex) /* If the log is empty, last index is 0. */ TEST(logLastIndex, empty, setUp, tearDown, 0, NULL) { struct fixture *f = data; munit_assert_int(LAST_INDEX, ==, 0); return MUNIT_OK; } /* If the log is empty and has an offset, last index is calculated accordingly. */ TEST(logLastIndex, emptyWithOffset, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND(1); SNAPSHOT(1, 0); munit_assert_int(LAST_INDEX, ==, 1); return MUNIT_OK; } /* The log has one entry. */ TEST(logLastIndex, one, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND(1 /* term */); munit_assert_int(LAST_INDEX, ==, 1); return MUNIT_OK; } /* The log has two entries. */ TEST(logLastIndex, two, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND_MANY(1 /* term */, 2 /* n */); munit_assert_int(LAST_INDEX, ==, 2); return MUNIT_OK; } /* If the log starts at a certain offset, the last index is bumped * accordingly. */ TEST(logLastIndex, twoWithOffset, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND_MANY(1 /* term */, 5 /* n */); SNAPSHOT(5 /* last index */, 2 /* trailing */); munit_assert_int(LAST_INDEX, ==, 5); return MUNIT_OK; } /****************************************************************************** * * logLastTerm * *****************************************************************************/ SUITE(logLastTerm) /* If the log is empty, return zero. */ TEST(logLastTerm, empty, setUp, tearDown, 0, NULL) { struct fixture *f = data; munit_assert_int(LAST_TERM, ==, 0); return MUNIT_OK; } /* If the log has a snapshot and no outstanding entries, return the last term of * the snapshot. */ TEST(logLastTerm, snapshot, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND(1 /* term */); SNAPSHOT(1 /* last index */, 0 /* trailing */); munit_assert_int(LAST_TERM, ==, 1); return MUNIT_OK; } /****************************************************************************** * * logTermOf * *****************************************************************************/ SUITE(logTermOf) /* If the given index is beyond the last index, return 0. */ TEST(logTermOf, beyondLast, setUp, tearDown, 0, NULL) { struct fixture *f = data; munit_assert_int(TERM_OF(2), ==, 0); munit_assert_int(TERM_OF(10), ==, 0); return MUNIT_OK; } /* If the log is empty but has a snapshot, and the given index matches the last * index of the snapshot, return the snapshot last term. */ TEST(logTermOf, snapshotLastIndex, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND_MANY(1 /* term */, 5 /* n entries */); SNAPSHOT(5 /* last entry */, 0 /* trailing */); munit_assert_int(TERM_OF(5), ==, 1); return MUNIT_OK; } /* The log has one entry. */ TEST(logTermOf, one, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND(3 /* term */); munit_assert_int(TERM_OF(1), ==, 3); return MUNIT_OK; } /* The log has two entries. */ TEST(logTermOf, two, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND_MANY(4 /* term */, 2 /* n */); munit_assert_int(TERM_OF(1), ==, 4); munit_assert_int(TERM_OF(2), ==, 4); return MUNIT_OK; } /* The log has a snapshot and hence has an offset. 
*/ TEST(logTermOf, withSnapshot, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND_MANY(1 /* term */, 5 /* n entries */); SNAPSHOT(3 /* last index */, 0 /* trailing */); munit_assert_int(TERM_OF(1), ==, 0); munit_assert_int(TERM_OF(2), ==, 0); munit_assert_int(TERM_OF(3), ==, 1); munit_assert_int(TERM_OF(4), ==, 1); munit_assert_int(TERM_OF(5), ==, 1); return MUNIT_OK; } /* The log has a snapshot with trailing entries. */ TEST(logTermOf, snapshotTrailing, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND_MANY(1 /* term */, 5 /* n entries */); SNAPSHOT(3 /* last index */, 2 /* trailing */); munit_assert_int(TERM_OF(1), ==, 0); munit_assert_int(TERM_OF(2), ==, 1); munit_assert_int(TERM_OF(3), ==, 1); munit_assert_int(TERM_OF(4), ==, 1); munit_assert_int(TERM_OF(5), ==, 1); return MUNIT_OK; } /****************************************************************************** * * logGet * *****************************************************************************/ SUITE(logGet) /* The log is empty. */ TEST(logGet, empty_log, setUp, tearDown, 0, NULL) { struct fixture *f = data; munit_assert_ptr_null(GET(1)); return MUNIT_OK; } /* The log is empty but has an offset. */ TEST(logGet, emptyWithOffset, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND_MANY(4 /* term */, 10 /* n */); SNAPSHOT(10 /* last index */, 0 /* trailing */); munit_assert_ptr_null(GET(1)); munit_assert_ptr_null(GET(10)); munit_assert_ptr_null(GET(11)); return MUNIT_OK; } /* The log has one entry. */ TEST(logGet, one, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND(3 /* term */); munit_assert_int(GET(1)->term, ==, 3); munit_assert_ptr_null(GET(2)); return MUNIT_OK; } /* The log has two entries. */ TEST(logGet, two, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND_MANY(4 /* term */, 2 /* n */); munit_assert_int(GET(1)->term, ==, 4); munit_assert_int(GET(2)->term, ==, 4); munit_assert_ptr_null(GET(3)); return MUNIT_OK; } /* The log starts at a certain offset. */ TEST(logGet, twoWithOffset, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND_MANY(1 /* term */, 3 /* n */); APPEND(2 /* term */); APPEND(3 /* term */); SNAPSHOT(4 /* last index */, 1 /* trailing */); munit_assert_ptr_null(GET(1)); munit_assert_ptr_null(GET(2)); munit_assert_ptr_null(GET(3)); munit_assert_int(GET(4)->term, ==, 2); munit_assert_int(GET(5)->term, ==, 3); return MUNIT_OK; } /****************************************************************************** * * logAppend * *****************************************************************************/ SUITE(logAppend) /* Append one entry to an empty log. */ TEST(logAppend, one, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND(1 /* term */); ASSERT(2 /* size */, 0 /* front */, 1 /* back */, 0 /* offset */, 1 /* n */); ASSERT_TERM_OF(1 /* entry index */, 1 /* term */); ASSERT_REFCOUNT(1 /* entry index */, 1 /* count */); return MUNIT_OK; } /* Append two entries to an empty log. */ TEST(logAppend, two, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND(1 /* term */); APPEND(1 /* term */); ASSERT(6 /* size */, 0 /* front */, 2 /* back */, 0 /* offset */, 2 /* n */); ASSERT_TERM_OF(1 /* entry index */, 1 /* term */); ASSERT_TERM_OF(2 /* entry index */, 1 /* term */); ASSERT_REFCOUNT(1 /* entry index */, 1 /* count */); ASSERT_REFCOUNT(2 /* entry index */, 1 /* count */); return MUNIT_OK; } /* Append three entries in sequence.
*/ TEST(logAppend, three, setUp, tearDown, 0, NULL) { struct fixture *f = data; /* One -> [e1, NULL] */ APPEND(1 /* term */); /* Two -> [e1, e2, NULL, NULL, NULL, NULL] */ APPEND(1 /* term */); /* Three -> [e1, e2, e3, NULL, NULL, NULL] */ APPEND(1 /* term */); ASSERT(6 /* size */, 0 /* front */, 3 /* back */, 0 /* offset */, 3 /* n */); ASSERT_TERM_OF(1 /* entry index */, 1 /* term */); ASSERT_TERM_OF(2 /* entry index */, 1 /* term */); ASSERT_TERM_OF(3 /* entry index */, 1 /* term */); ASSERT_REFCOUNT(1 /* entry index */, 1 /* count */); ASSERT_REFCOUNT(2 /* entry index */, 1 /* count */); ASSERT_REFCOUNT(3 /* entry index */, 1 /* count */); return MUNIT_OK; } /* Append enough entries to force the reference count hash table to be * resized. */ TEST(logAppend, many, setUp, tearDown, 0, NULL) { struct fixture *f = data; int i; for (i = 0; i < 3000; i++) { APPEND(1 /* term */); } munit_assert_int(f->log->refs_size, ==, 4096); return MUNIT_OK; } /* Append to wrapped log that needs to be grown. */ TEST(logAppend, wrap, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND_MANY(1 /* term */, 5 /* n */); /* Now the log is [e1, e2, e3, e4, e5, NULL] */ ASSERT(6 /* size */, 0 /* front */, 5 /* back */, 0 /* offset */, 5 /* n */); /* Delete the first 4 entries. */ SNAPSHOT(4 /* last entry */, 0 /* trailing */); /* Now the log is [NULL, NULL, NULL, NULL, e5, NULL] */ ASSERT(6 /* size */, 4 /* front */, 5 /* back */, 4 /* offset */, 1 /* n */); /* Append another 3 entries. */ APPEND_MANY(1 /* term */, 3 /* n */); /* Now the log is [e7, e8, NULL, NULL, e5, e6] */ ASSERT(6 /* size */, 4 /* front */, 2 /* back */, 4 /* offset */, 4 /* n */); /* Append another 3 entries. */ APPEND_MANY(1 /* term */, 3 /* n */); /* Now the log is [e5, ..., e11, NULL, ..., NULL] */ ASSERT(14 /* size */, 0 /* front */, 7 /* back */, 4 /* offset */, 7 /* n */); return MUNIT_OK; } /* Append a batch of entries to an empty log. */ TEST(logAppend, batch, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND_BATCH(3); ASSERT(6 /* size */, 0 /* front */, 3 /* back */, 0 /* offset */, 3 /* n */); return MUNIT_OK; } static char *logAppendOomHeapFaultDelay[] = {"0", "1", NULL}; static char *logAppendOomHeapFaultRepeat[] = {"1", NULL}; static MunitParameterEnum logAppendOom[] = { {TEST_HEAP_FAULT_DELAY, logAppendOomHeapFaultDelay}, {TEST_HEAP_FAULT_REPEAT, logAppendOomHeapFaultRepeat}, {NULL, NULL}, }; /* Out of memory. */ TEST(logAppend, oom, setUp, tearDown, 0, logAppendOom) { struct fixture *f = data; struct raft_buffer buf; int rv; buf.base = NULL; buf.len = 0; HeapFaultEnable(&f->heap); rv = logAppend(f->log, 1, RAFT_COMMAND, buf, (struct raft_entry_local_data){}, true, NULL); munit_assert_int(rv, ==, RAFT_NOMEM); return MUNIT_OK; } /* Out of memory when trying to grow the refs count table. 
*/ TEST(logAppend, oomRefs, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND_MANY(1, LOG__REFS_INITIAL_SIZE); HeapFaultConfig(&f->heap, 1, 1); HeapFaultEnable(&f->heap); APPEND_ERROR(1, RAFT_NOMEM); return MUNIT_OK; } /****************************************************************************** * * logAppendConfiguration * *****************************************************************************/ SUITE(logAppendConfiguration) static char *logAppendConfigurationOomHeapFaultDelay[] = {"0", "1", NULL}; static char *logAppendConfigurationOomHeapFaultRepeat[] = {"1", NULL}; static MunitParameterEnum logAppendConfigurationOom[] = { {TEST_HEAP_FAULT_DELAY, logAppendConfigurationOomHeapFaultDelay}, {TEST_HEAP_FAULT_REPEAT, logAppendConfigurationOomHeapFaultRepeat}, {NULL, NULL}, }; /* Out of memory. */ TEST(logAppendConfiguration, oom, setUp, tearDown, 0, logAppendConfigurationOom) { struct fixture *f = data; struct raft_configuration configuration; int rv; configurationInit(&configuration); rv = configurationAdd(&configuration, 1, "1", RAFT_VOTER); munit_assert_int(rv, ==, 0); HeapFaultEnable(&f->heap); rv = logAppendConfiguration(f->log, 1, &configuration); munit_assert_int(rv, ==, RAFT_NOMEM); configurationClose(&configuration); return MUNIT_OK; } /****************************************************************************** * * logAcquire * *****************************************************************************/ SUITE(logAcquire) /* Acquire a single log entry. */ TEST(logAcquire, one, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_entry *entries; unsigned n; APPEND(1 /* term */); ACQUIRE(1 /* index */); munit_assert_ptr_not_null(entries); munit_assert_int(n, ==, 1); munit_assert_int(entries[0].type, ==, RAFT_COMMAND); ASSERT_REFCOUNT(1 /* index */, 2 /* count */); RELEASE(1 /* index */); ASSERT_REFCOUNT(1 /* index */, 1 /* count */); return MUNIT_OK; } /* Acquire two log entries. */ TEST(logAcquire, two, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_entry *entries; unsigned n; APPEND(1 /* term */); APPEND(1 /* term */); ACQUIRE(1 /* index */); munit_assert_ptr_not_null(entries); munit_assert_int(n, ==, 2); munit_assert_int(entries[0].type, ==, RAFT_COMMAND); munit_assert_int(entries[1].type, ==, RAFT_COMMAND); ASSERT_REFCOUNT(1 /* index */, 2 /* count */); ASSERT_REFCOUNT(2 /* index */, 2 /* count */); RELEASE(1 /* index */); ASSERT_REFCOUNT(1 /* index */, 1 /* count */); ASSERT_REFCOUNT(2 /* index */, 1 /* count */); return MUNIT_OK; } /* Acquire two log entries in a wrapped log. */ TEST(logAcquire, wrap, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_entry *entries; unsigned n; APPEND_MANY(1 /* term */, 5 /* n */); /* Now the log is [e1, e2, e3, e4, e5, NULL] */ ASSERT(6 /* size */, 0 /* front */, 5 /* back */, 0 /* offset */, 5 /* n */); /* Delete the first 4 entries. */ SNAPSHOT(4 /* last index */, 0 /* trailing */); /* Now the log is [NULL, NULL, NULL, NULL, e5, NULL] */ ASSERT(6 /* size */, 4 /* front */, 5 /* back */, 4 /* offset */, 1 /* n */); /* Append another 3 entries. */ APPEND_MANY(1 /* term */, 3 /* n */); /* Now the log is [e7, e8, NULL, NULL, e5, e6] */ ASSERT(6 /* size */, 4 /* front */, 2 /* back */, 4 /* offset */, 4 /* n */); ACQUIRE(6 /* index */); munit_assert_int(n, ==, 3); RELEASE(6 /* index */); return MUNIT_OK; } /* Acquire several entries some of which belong to batches. 
*/ TEST(logAcquire, batch, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_entry *entries; unsigned n; APPEND(1 /* term */); APPEND_BATCH(2 /* n entries */); APPEND(1 /* term */); APPEND_BATCH(3 /* n entries */); ACQUIRE(2 /* index */); munit_assert_ptr_not_null(entries); munit_assert_int(n, ==, 6); ASSERT_REFCOUNT(2 /* index */, 2 /* count */); /* Truncate the last 5 entries, so the only references left for the second * batch are the ones in the acquired entries. */ TRUNCATE(3 /* index */); RELEASE(2 /* index */); ASSERT_REFCOUNT(2 /* index */, 1 /* count */); return MUNIT_OK; } /* Trying to acquire entries out of range results in a NULL pointer. */ TEST(logAcquire, outOfRange, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_entry *entries; unsigned n; APPEND(1 /* term */); APPEND(1 /* term */); SNAPSHOT(1 /* index */, 0 /* trailing */); ACQUIRE(1 /* index */); munit_assert_ptr_null(entries); ACQUIRE(3 /* index */); munit_assert_ptr_null(entries); return MUNIT_OK; } /* Out of memory. */ TEST(logAcquire, oom, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_entry *entries; unsigned n; int rv; APPEND(1 /* term */); HeapFaultConfig(&f->heap, 0, 1); HeapFaultEnable(&f->heap); rv = logAcquire(f->log, 1, &entries, &n); munit_assert_int(rv, ==, RAFT_NOMEM); return MUNIT_OK; } /****************************************************************************** * * logTruncate * *****************************************************************************/ SUITE(logTruncate) /* Truncate the last entry of a log with a single entry. */ TEST(logTruncate, lastOfOne, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND(1 /* term */); TRUNCATE(1 /* index */); ASSERT(0 /* size */, 0 /* front */, 0 /* back */, 0 /* offset */, 0 /* n */); return MUNIT_OK; } /* Truncate the last entry of a log with a two entries. */ TEST(logTruncate, lastOfTwo, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND(1 /* term */); APPEND(1 /* term */); TRUNCATE(2 /* index */); ASSERT(6 /* size */, 0 /* front */, 1 /* back */, 0 /* offset */, 1 /* n */); ASSERT_TERM_OF(1 /* entry index */, 1 /* term */); return MUNIT_OK; } /* Truncate from an entry which makes the log wrap. */ TEST(logTruncate, wrap, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND_MANY(1 /* term */, 5 /* n entries */); /* Now the log is [e1, e2, e3, e4, e5, NULL] */ ASSERT(6 /* size */, 0 /* front */, 5 /* back */, 0 /* offset */, 5 /* n */); /* Delete the first 4 entries. */ SNAPSHOT(4 /* last index */, 0 /* trailing */); /* Now the log is [NULL, NULL, NULL, NULL, e5, NULL] */ ASSERT(6 /* size */, 4 /* front */, 5 /* back */, 4 /* offset */, 1 /* n */); /* Append another 3 entries. */ APPEND_MANY(1 /* term */, 3 /* n entries */); /* Now the log is [e7, e8, NULL, NULL, e5, e6] */ ASSERT(6 /* size */, 4 /* front */, 2 /* back */, 4 /* offset */, 4 /* n */); /* Truncate from e6 onward (wrapping) */ TRUNCATE(6 /* index */); /* Now the log is [NULL, NULL, NULL, NULL, e5, NULL] */ ASSERT(6 /* size */, 4 /* front */, 5 /* back */, 4 /* offset */, 1 /* n */); return MUNIT_OK; } /* Truncate the last entry of a log with a single entry, which still has an * outstanding reference created by a call to logAcquire(). 
*/ TEST(logTruncate, referenced, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_entry *entries; unsigned n; APPEND(1 /* term */); ACQUIRE(1 /* index */); TRUNCATE(1 /* index */); ASSERT(0 /* size */, 0 /* front */, 0 /* back */, 0 /* offset */, 0 /* n */); /* The entry has still an outstanding reference. */ ASSERT_REFCOUNT(1 /* index */, 1 /* count */); munit_assert_string_equal((const char *)entries[0].buf.base, "hello"); RELEASE(1 /* index */); ASSERT_REFCOUNT(1 /* index */, 0 /* count */); return MUNIT_OK; } /* Truncate all entries belonging to a batch. */ TEST(logTruncate, batch, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND_BATCH(3 /* n entries */); TRUNCATE(1 /* index */); munit_assert_int(f->log->size, ==, 0); return MUNIT_OK; } /* Acquire entries at a certain index. Truncate the log at that index. The * truncated entries are still referenced. Then append a new entry, which will * have the same index but different term. */ TEST(logTruncate, acquired, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_entry *entries; unsigned n; APPEND(1 /* term */); APPEND(1 /* term */); ACQUIRE(2 /* index */); munit_assert_int(n, ==, 1); TRUNCATE(2 /* index */); APPEND(2 /* term */); RELEASE(2 /*index */); return MUNIT_OK; } /* Acquire some entries, truncate the log and then append new ones forcing the log to be grown and the reference count hash table to be re-built. */ TEST(logTruncate, acquireAppend, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct raft_entry *entries; unsigned n; size_t i; APPEND(1 /* term */); APPEND(1 /* term */); ACQUIRE(2); munit_assert_int(n, ==, 1); TRUNCATE(2); for (i = 0; i < LOG__REFS_INITIAL_SIZE; i++) { APPEND(2 /* term */); } RELEASE(2); return MUNIT_OK; } static char *logTruncateAcquiredHeapFaultDelay[] = {"0", NULL}; static char *logTruncateAcquiredFaultRepeat[] = {"1", NULL}; static MunitParameterEnum logTruncateAcquiredOom[] = { {TEST_HEAP_FAULT_DELAY, logTruncateAcquiredHeapFaultDelay}, {TEST_HEAP_FAULT_REPEAT, logTruncateAcquiredFaultRepeat}, {NULL, NULL}, }; /* Acquire entries at a certain index. Truncate the log at that index. The * truncated entries are still referenced. Then append a new entry, which fails * to be appended due to OOM. */ TEST(logTruncate, acquiredOom, setUp, tearDown, 0, logTruncateAcquiredOom) { struct fixture *f = data; struct raft_entry *entries; unsigned n; struct raft_buffer buf; int rv; APPEND(1 /* term */); APPEND(1 /* term */); ACQUIRE(2); munit_assert_int(n, ==, 1); TRUNCATE(2); buf.base = NULL; buf.len = 0; HeapFaultEnable(&f->heap); rv = logAppend(f->log, 2, RAFT_COMMAND, buf, (struct raft_entry_local_data){}, true, NULL); munit_assert_int(rv, ==, RAFT_NOMEM); RELEASE(2); return MUNIT_OK; } /****************************************************************************** * * logSnapshot * *****************************************************************************/ SUITE(logSnapshot) /* Take a snapshot at entry 3, keeping 2 trailing entries. 
*/ TEST(logSnapshot, trailing, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND(1 /* term */); APPEND(2 /* term */); APPEND(2 /* term */); SNAPSHOT(3 /* last index */, 2 /* trailing */); ASSERT(6 /* size */, 1 /* front */, 3 /* back */, 1 /* offset */, 2 /* n */); ASSERT_SNAPSHOT(3 /* index */, 2 /* term */); munit_assert_int(NUM_ENTRIES, ==, 2); munit_assert_int(LAST_INDEX, ==, 3); return MUNIT_OK; } /* Take a snapshot when the number of outstanding entries is lower than the * desired trail (so no entry will be deleted). */ TEST(logSnapshot, trailingHigherThanNumEntries, setUp, tearDown, 0, NULL) { struct fixture *f = data; /* Take a snapshot leaving just one entry in the log. */ APPEND_MANY(1 /* term */, 3 /* n entries */); SNAPSHOT(3 /* last index */, 1 /* trailing */); /* Take another snapshot, trying to leave 3 entries, but only 2 are * available at all. */ APPEND(2 /* term */); SNAPSHOT(4 /* last index */, 3 /* trailing */); ASSERT(6 /* size */, 2 /* front */, 4 /* back */, 2 /* offset */, 2 /* n */); ASSERT_SNAPSHOT(4 /* index */, 2 /* term */); munit_assert_int(NUM_ENTRIES, ==, 2); munit_assert_int(LAST_INDEX, ==, 4); return MUNIT_OK; } /* Take a snapshot when the number of outstanding entries is exactly equal to * the desired trail (so no entry will be deleted). */ TEST(logSnapshot, trailingMatchesOutstanding, setUp, tearDown, 0, NULL) { struct fixture *f = data; /* Take a snapshot leaving just one entry in the log. */ APPEND_MANY(1 /* term */, 3 /* n entries */); SNAPSHOT(3 /* last index */, 1 /* trailing */); /* Take another snapshot, leaving 2 entries, which are the ones we have. */ APPEND(2 /* term */); SNAPSHOT(4 /* last index */, 2 /* trailing */); ASSERT(6 /* size */, 2 /* front */, 4 /* back */, 2 /* offset */, 2 /* n */); ASSERT_SNAPSHOT(4 /* index */, 2 /* term */); munit_assert_int(NUM_ENTRIES, ==, 2); munit_assert_int(LAST_INDEX, ==, 4); return MUNIT_OK; } /* Take a snapshot at an index which is not the last one. */ TEST(logSnapshot, lessThanHighestIndex, setUp, tearDown, 0, NULL) { struct fixture *f = data; /* Take a snapshot leaving three entries in the log. */ APPEND_MANY(1 /* term */, 5 /* n entries */); SNAPSHOT(4 /* last index */, 2 /* trailing */); ASSERT(6 /* size */, 2 /* front */, 5 /* back */, 2 /* offset */, 3 /* n */); ASSERT_SNAPSHOT(4 /* index */, 1 /* term */); munit_assert_int(NUM_ENTRIES, ==, 3); munit_assert_int(LAST_INDEX, ==, 5); return MUNIT_OK; } /* Take a snapshot at a point where the log needs to wrap. */ TEST(logSnapshot, wrap, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND_MANY(1 /* term */, 5 /* n entries */); /* Now the log is [e1, e2, e3, e4, e5, NULL] */ ASSERT(6 /* size */, 0 /* front */, 5 /* back */, 0 /* offset */, 5 /* n */); /* Take a snapshot at e5, keeping just e5 itself. */ SNAPSHOT(5 /* last index */, 1 /* trailing */); /* Now the log is [NULL, NULL, NULL, NULL, e5, NULL] */ ASSERT(6 /* size */, 4 /* front */, 5 /* back */, 4 /* offset */, 1 /* n */); ASSERT_SNAPSHOT(5 /* index */, 1 /* term */); /* Append another 4 entries. 
*/ APPEND_MANY(1 /* term */, 4 /* n */); /* Now the log is [e7, e8, e9, NULL, e5, e6] */ ASSERT(6 /* size */, 4 /* front */, 3 /* back */, 4 /* offset */, 5 /* n */); /* Take a snapshot at e8 keeping only e8 itself (wrapping) */ SNAPSHOT(8 /* last index */, 1 /* trailing */); /* Now the log is [NULL, e8, e9, NULL, NULL, NULL] */ ASSERT(6 /* size */, 1 /* front */, 3 /* back */, 7 /* offset */, 2 /* n */); ASSERT_SNAPSHOT(8 /* index */, 1 /* term */); return MUNIT_OK; } /****************************************************************************** * * logRestore * *****************************************************************************/ SUITE(logRestore) /* Mimic the initial restore of a snapshot after loading state from disk, when * there are no outstanding entries. */ TEST(logRestore, initial, setUp, tearDown, 0, NULL) { struct fixture *f = data; RESTORE(2 /* last index */, 3 /* last term */); ASSERT_SNAPSHOT(2 /* index */, 3 /* term */); munit_assert_int(LAST_INDEX, ==, 2); return MUNIT_OK; } /* If there are existing entries they are wiped out. */ TEST(logRestore, wipe, setUp, tearDown, 0, NULL) { struct fixture *f = data; APPEND_MANY(1 /* term */, 5 /* n entries */); RESTORE(2 /* last index */, 3 /* last term */); ASSERT_SNAPSHOT(2 /* index */, 3 /* term */); munit_assert_int(LAST_INDEX, ==, 2); return MUNIT_OK; } dqlite-1.16.7/test/raft/unit/test_queue.c000066400000000000000000000145471465252713400203350ustar00rootroot00000000000000#include "../../../src/lib/queue.h" #include "../lib/runner.h" /****************************************************************************** * * Fixture with a single queue and a few test items that can be added to it. * *****************************************************************************/ struct item { int value; queue queue; }; struct fixture { queue queue; struct item items[3]; }; static void *setUp(MUNIT_UNUSED const MunitParameter params[], MUNIT_UNUSED void *user_data) { struct fixture *f = munit_malloc(sizeof *f); queue_init(&f->queue); return f; } static void tearDown(void *data) { struct fixture *f = data; free(f); } /****************************************************************************** * * Helper macros * *****************************************************************************/ /* Initialize and push the given number of fixture items to the fixture's * queue. Each item will have a value equal to its index plus one. */ #define PUSH(N) \ { \ int i_; \ for (i_ = 0; i_ < N; i_++) { \ struct item *item_ = &f->items[i_]; \ item_->value = i_ + 1; \ queue_insert_tail(&f->queue, &item_->queue); \ } \ } /* Remove the i'th fixture item from the fixture queue. */ #define REMOVE(I) queue_remove(&f->items[I].queue) /****************************************************************************** * * Assertions * *****************************************************************************/ /* Assert that the item at the head of the fixture's queue has the given * value. */ #define ASSERT_HEAD(VALUE) \ { \ queue *head_ = queue_head(&f->queue); \ struct item *item_; \ item_ = QUEUE_DATA(head_, struct item, queue); \ munit_assert_int(item_->value, ==, VALUE); \ } /* Assert that the item at the tail of the queue has the given value. */ #define ASSERT_TAIL(VALUE) \ { \ queue *tail_ = queue_tail(&f->queue); \ struct item *item_; \ item_ = QUEUE_DATA(tail_, struct item, queue); \ munit_assert_int(item_->value, ==, VALUE); \ } /* Assert that the fixture's queue is empty. 
*/ #define ASSERT_EMPTY munit_assert_true(queue_empty(&f->queue)) /* Assert that the fixture's queue is not empty. */ #define ASSERT_NOT_EMPTY munit_assert_false(queue_empty(&f->queue)) /****************************************************************************** * * queue_empty * *****************************************************************************/ SUITE(queue_empty) TEST(queue_empty, yes, setUp, tearDown, 0, NULL) { struct fixture *f = data; ASSERT_EMPTY; return MUNIT_OK; } TEST(queue_empty, no, setUp, tearDown, 0, NULL) { struct fixture *f = data; PUSH(1); ASSERT_NOT_EMPTY; return MUNIT_OK; } /****************************************************************************** * * queue_insert_tail * *****************************************************************************/ SUITE(queue_insert_tail) TEST(queue_insert_tail, one, setUp, tearDown, 0, NULL) { struct fixture *f = data; PUSH(1); ASSERT_HEAD(1); return MUNIT_OK; } TEST(queue_insert_tail, two, setUp, tearDown, 0, NULL) { struct fixture *f = data; int i; PUSH(2); for (i = 0; i < 2; i++) { ASSERT_HEAD(i + 1); REMOVE(i); } ASSERT_EMPTY; return MUNIT_OK; } /****************************************************************************** * * queue_remove * *****************************************************************************/ SUITE(queue_remove) TEST(queue_remove, first, setUp, tearDown, 0, NULL) { struct fixture *f = data; PUSH(3); REMOVE(0); ASSERT_HEAD(2); return MUNIT_OK; } TEST(queue_remove, second, setUp, tearDown, 0, NULL) { struct fixture *f = data; PUSH(3); REMOVE(1); ASSERT_HEAD(1); return MUNIT_OK; } TEST(queue_remove, success, setUp, tearDown, 0, NULL) { struct fixture *f = data; PUSH(3); REMOVE(2); ASSERT_HEAD(1); return MUNIT_OK; } /****************************************************************************** * * queue_tail * *****************************************************************************/ SUITE(queue_tail) TEST(queue_tail, one, setUp, tearDown, 0, NULL) { struct fixture *f = data; PUSH(1); ASSERT_TAIL(1); return MUNIT_OK; } TEST(queue_tail, two, setUp, tearDown, 0, NULL) { struct fixture *f = data; PUSH(2); ASSERT_TAIL(2); return MUNIT_OK; } TEST(queue_tail, three, setUp, tearDown, 0, NULL) { struct fixture *f = data; PUSH(3); ASSERT_TAIL(3); return MUNIT_OK; } /****************************************************************************** * * QUEUE_FOREACH * *****************************************************************************/ SUITE(QUEUE_FOREACH) /* Loop through a queue of zero items. */ TEST(QUEUE_FOREACH, zero, setUp, tearDown, 0, NULL) { struct fixture *f = data; queue *head; int count = 0; QUEUE_FOREACH (head, &f->queue) { count++; } munit_assert_int(count, ==, 0); return MUNIT_OK; } /* Loop through a queue of one item. */ TEST(QUEUE_FOREACH, one, setUp, tearDown, 0, NULL) { struct fixture *f = data; queue *head; int count = 0; PUSH(1); QUEUE_FOREACH (head, &f->queue) { count++; } munit_assert_int(count, ==, 1); return MUNIT_OK; } /* Loop through a queue of two items. The order of the loop is from the head to * the tail. 
*/ TEST(QUEUE_FOREACH, two, setUp, tearDown, 0, NULL) { struct fixture *f = data; queue *head; int values[2] = {0, 0}; int i = 0; PUSH(2); QUEUE_FOREACH (head, &f->queue) { struct item *item; item = QUEUE_DATA(head, struct item, queue); values[i] = item->value; i++; } munit_assert_int(values[0], ==, 1); munit_assert_int(values[1], ==, 2); return MUNIT_OK; } dqlite-1.16.7/test/raft/unit/test_snapshot.c000066400000000000000000000267261465252713400210520ustar00rootroot00000000000000#include #include #include #include #include #include "../lib/runner.h" #include "../../../src/lib/sm.h" #include "../../../src/raft.h" #include "../../../src/raft/recv_install_snapshot.h" #include "../../../src/utils.h" struct fixture { }; static void *set_up(MUNIT_UNUSED const MunitParameter params[], MUNIT_UNUSED void *user_data) { struct fixture *f = munit_malloc(sizeof *f); return f; } static void tear_down(void *data) { free(data); } SUITE(snapshot_leader) SUITE(snapshot_follower) static void ut_leader_message_received(struct leader *leader, const struct raft_message *incoming) { leader_tick(leader, incoming); } static void ut_follower_message_received(struct follower *follower, const struct raft_message *incoming) { follower_tick(follower, incoming); } static void ut_ht_create_op(struct work *w) { (void)w; } static void ut_fill_ht_op(struct work *w) { (void)w; } static void ut_write_chunk_op(struct work *w) { (void)w; } static void ut_read_sig_op(struct work *w) { (void)w; } static void ut_disk_io(struct work *work) { work->work_cb(work); } static void ut_disk_io_done(struct work *work) { work->after_cb(work); } static void ut_to_expired(struct leader *leader) { leader->timeout.cb(&leader->timeout.handle); } static void ut_rpc_sent(struct rpc *rpc) { rpc->sender.cb(&rpc->sender, 0); } static void ut_rpc_to_expired(struct rpc *rpc) { rpc->timeout.cb(&rpc->timeout.handle); } static const struct raft_message *append_entries(void) { static struct raft_message append_entries = { .type = RAFT_IO_APPEND_ENTRIES, }; return &append_entries; } static const struct raft_message *ut_install_snapshot(void) { static struct raft_message ut_install_snapshot = { .type = RAFT_IO_INSTALL_SNAPSHOT, }; return &ut_install_snapshot; } static const struct raft_message *ut_install_snapshot_result(void) { static struct raft_message ut_install_snapshot_result = { .type = RAFT_IO_INSTALL_SNAPSHOT_RESULT, }; return &ut_install_snapshot_result; } static const struct raft_message *ut_sign(void) { static struct raft_message ut_sign = { .type = RAFT_IO_SIGNATURE, }; return &ut_sign; } static const struct raft_message *ut_sign_result(void) { static struct raft_message ut_sign_result = { .type = RAFT_IO_SIGNATURE_RESULT, }; return &ut_sign_result; } static const struct raft_message *ut_page(void) { static struct raft_message ut_page = { .type = RAFT_IO_INSTALL_SNAPSHOT_CP, }; return &ut_page; } static const struct raft_message *ut_page_result(void) { static struct raft_message ut_page_result = { .type = RAFT_IO_INSTALL_SNAPSHOT_CP_RESULT, }; return &ut_page_result; } static void ut_work_queue_op(struct work *w, work_op work_cb, work_op after_cb) { w->work_cb = work_cb; w->after_cb = after_cb; } static void ut_to_init_op(struct timeout *to) { (void)to; } static void ut_to_start_op(struct timeout *to, unsigned delay, to_cb_op cb) { (void)delay; to->cb = cb; } static void ut_to_stop_op(struct timeout *to) { (void)to; } int ut_sender_send_op(struct sender *s, struct raft_message *payload, sender_cb_op cb) { (void)payload; s->cb = cb; return 0; } 
TEST(snapshot_follower, basic, set_up, tear_down, 0, NULL) { struct follower_ops ops = { .ht_create = ut_ht_create_op, .work_queue = ut_work_queue_op, .sender_send = ut_sender_send_op, .read_sig = ut_read_sig_op, .write_chunk = ut_write_chunk_op, .fill_ht = ut_fill_ht_op, }; struct follower follower = { .ops = &ops, }; sm_init(&follower.sm, follower_sm_invariant, NULL, follower_sm_conf, "follower", FS_NORMAL); PRE(sm_state(&follower.sm) == FS_NORMAL); ut_follower_message_received(&follower, ut_install_snapshot()); ut_rpc_sent(&follower.rpc); ut_disk_io(&follower.work); PRE(sm_state(&follower.sm) == FS_HT_WAIT); ut_disk_io_done(&follower.work); PRE(sm_state(&follower.sm) == FS_SIGS_CALC_LOOP); ut_follower_message_received(&follower, ut_sign()); ut_rpc_sent(&follower.rpc); PRE(sm_state(&follower.sm) == FS_SIGS_CALC_LOOP); ut_disk_io(&follower.work); follower.sigs_calculated = true; ut_disk_io_done(&follower.work); PRE(sm_state(&follower.sm) == FS_SIGS_CALC_LOOP); ut_follower_message_received(&follower, ut_sign()); ut_rpc_sent(&follower.rpc); PRE(sm_state(&follower.sm) == FS_SIG_RECEIVING); ut_follower_message_received(&follower, ut_sign()); PRE(sm_state(&follower.sm) == FS_SIG_PROCESSED); ut_disk_io(&follower.work); ut_disk_io_done(&follower.work); PRE(sm_state(&follower.sm) == FS_SIG_READ); ut_rpc_sent(&follower.rpc); PRE(sm_state(&follower.sm) == FS_CHUNCK_RECEIVING); ut_follower_message_received(&follower, ut_page()); ut_disk_io(&follower.work); ut_disk_io_done(&follower.work); PRE(sm_state(&follower.sm) == FS_CHUNCK_APPLIED); ut_rpc_sent(&follower.rpc); sm_fini(&follower.sm); return MUNIT_OK; } TEST(snapshot_leader, basic, set_up, tear_down, 0, NULL) { struct leader_ops ops = { .to_init = ut_to_init_op, .to_stop = ut_to_stop_op, .to_start = ut_to_start_op, .ht_create = ut_ht_create_op, .work_queue = ut_work_queue_op, .sender_send = ut_sender_send_op, }; struct leader leader = { .ops = &ops, .sigs_more = false, .pages_more = false, .sigs_calculated = false, }; sm_init(&leader.sm, leader_sm_invariant, NULL, leader_sm_conf, "leader", LS_F_ONLINE); PRE(sm_state(&leader.sm) == LS_F_ONLINE); ut_leader_message_received(&leader, append_entries()); PRE(sm_state(&leader.sm) == LS_HT_WAIT); ut_disk_io(&leader.work); ut_disk_io_done(&leader.work); PRE(sm_state(&leader.sm) == LS_F_NEEDS_SNAP); ut_rpc_sent(&leader.rpc); ut_leader_message_received(&leader, ut_install_snapshot_result()); PRE(sm_state(&leader.sm) == LS_CHECK_F_HAS_SIGS); ut_rpc_sent(&leader.rpc); ut_leader_message_received(&leader, ut_sign_result()); ut_to_expired(&leader); leader.sigs_calculated = true; ut_rpc_sent(&leader.rpc); ut_leader_message_received(&leader, ut_sign_result()); PRE(sm_state(&leader.sm) == LS_REQ_SIG_LOOP); ut_rpc_sent(&leader.rpc); PRE(sm_state(&leader.sm) == LS_REQ_SIG_LOOP); ut_leader_message_received(&leader, ut_sign_result()); ut_disk_io(&leader.work); ut_disk_io_done(&leader.work); ut_disk_io(&leader.work); ut_disk_io_done(&leader.work); PRE(sm_state(&leader.sm) == LS_PAGE_READ); ut_rpc_sent(&leader.rpc); ut_leader_message_received(&leader, ut_page_result()); PRE(sm_state(&leader.sm) == LS_SNAP_DONE); ut_rpc_sent(&leader.rpc); ut_leader_message_received(&leader, ut_install_snapshot_result()); sm_fini(&leader.sm); return MUNIT_OK; } TEST(snapshot_leader, timeouts, set_up, tear_down, 0, NULL) { struct leader_ops ops = { .to_init = ut_to_init_op, .to_stop = ut_to_stop_op, .to_start = ut_to_start_op, .ht_create = ut_ht_create_op, .work_queue = ut_work_queue_op, .sender_send = ut_sender_send_op, }; struct leader 
leader = { .ops = &ops, .sigs_more = false, .pages_more = false, .sigs_calculated = false, }; sm_init(&leader.sm, leader_sm_invariant, NULL, leader_sm_conf, "leader", LS_F_ONLINE); PRE(sm_state(&leader.sm) == LS_F_ONLINE); ut_leader_message_received(&leader, append_entries()); PRE(sm_state(&leader.sm) == LS_HT_WAIT); ut_disk_io(&leader.work); ut_disk_io_done(&leader.work); PRE(sm_state(&leader.sm) == LS_F_NEEDS_SNAP); ut_rpc_sent(&leader.rpc); ut_rpc_to_expired(&leader.rpc); PRE(sm_state(&leader.sm) == LS_F_NEEDS_SNAP); ut_rpc_sent(&leader.rpc); ut_leader_message_received(&leader, ut_install_snapshot_result()); PRE(sm_state(&leader.sm) == LS_CHECK_F_HAS_SIGS); ut_rpc_sent(&leader.rpc); ut_leader_message_received(&leader, ut_sign_result()); ut_to_expired(&leader); PRE(sm_state(&leader.sm) == LS_CHECK_F_HAS_SIGS); ut_rpc_sent(&leader.rpc); ut_rpc_to_expired(&leader.rpc); PRE(sm_state(&leader.sm) == LS_CHECK_F_HAS_SIGS); leader.sigs_calculated = true; ut_rpc_sent(&leader.rpc); ut_leader_message_received(&leader, ut_sign_result()); PRE(sm_state(&leader.sm) == LS_REQ_SIG_LOOP); ut_rpc_sent(&leader.rpc); PRE(sm_state(&leader.sm) == LS_REQ_SIG_LOOP); ut_leader_message_received(&leader, ut_sign_result()); ut_disk_io(&leader.work); ut_disk_io_done(&leader.work); ut_disk_io(&leader.work); ut_disk_io_done(&leader.work); PRE(sm_state(&leader.sm) == LS_PAGE_READ); ut_rpc_sent(&leader.rpc); ut_rpc_to_expired(&leader.rpc); PRE(sm_state(&leader.sm) == LS_PAGE_READ); ut_rpc_sent(&leader.rpc); ut_leader_message_received(&leader, ut_page_result()); PRE(sm_state(&leader.sm) == LS_SNAP_DONE); ut_rpc_sent(&leader.rpc); ut_leader_message_received(&leader, ut_install_snapshot_result()); sm_fini(&leader.sm); return MUNIT_OK; } static void progress(void) { for (unsigned i = 0; i < 100; i++) { uv_run(uv_default_loop(), UV_RUN_NOWAIT); } } static void pool_to_start_op(struct timeout *to, unsigned delay, to_cb_op cb) { uv_timer_start(&to->handle, cb, delay, 0); to->cb = cb; } static void pool_to_stop_op(struct timeout *to) { uv_timer_stop(&to->handle); } static void pool_to_init_op(struct timeout *to) { uv_timer_init(uv_default_loop(), &to->handle); } static void pool_to_expired(struct leader *leader) { uv_timer_start(&leader->timeout.handle, leader->timeout.cb, 0, 0); progress(); } static void pool_rpc_to_expired(struct rpc *rpc) { uv_timer_start(&rpc->timeout.handle, rpc->timeout.cb, 0, 0); progress(); } /* TODO(alberto): combine them with tests above once the rest is in place. * Dispatch to one of two implementations ut or pool in general functions. 
*/ TEST(snapshot_leader, pool_timeouts, set_up, tear_down, 0, NULL) { struct leader_ops ops = { .to_init = pool_to_init_op, .to_stop = pool_to_stop_op, .to_start = pool_to_start_op, .ht_create = ut_ht_create_op, .work_queue = ut_work_queue_op, .sender_send = ut_sender_send_op, }; struct leader leader = { .ops = &ops, .sigs_more = false, .pages_more = false, .sigs_calculated = false, }; sm_init(&leader.sm, leader_sm_invariant, NULL, leader_sm_conf, "leader", LS_F_ONLINE); PRE(sm_state(&leader.sm) == LS_F_ONLINE); ut_leader_message_received(&leader, append_entries()); PRE(sm_state(&leader.sm) == LS_HT_WAIT); ut_disk_io(&leader.work); ut_disk_io_done(&leader.work); PRE(sm_state(&leader.sm) == LS_F_NEEDS_SNAP); ut_rpc_sent(&leader.rpc); pool_rpc_to_expired(&leader.rpc); PRE(sm_state(&leader.sm) == LS_F_NEEDS_SNAP); ut_rpc_sent(&leader.rpc); ut_leader_message_received(&leader, ut_install_snapshot_result()); PRE(sm_state(&leader.sm) == LS_CHECK_F_HAS_SIGS); ut_rpc_sent(&leader.rpc); ut_leader_message_received(&leader, ut_sign_result()); pool_to_expired(&leader); PRE(sm_state(&leader.sm) == LS_CHECK_F_HAS_SIGS); ut_rpc_sent(&leader.rpc); pool_rpc_to_expired(&leader.rpc); PRE(sm_state(&leader.sm) == LS_CHECK_F_HAS_SIGS); leader.sigs_calculated = true; ut_rpc_sent(&leader.rpc); ut_leader_message_received(&leader, ut_sign_result()); PRE(sm_state(&leader.sm) == LS_REQ_SIG_LOOP); ut_rpc_sent(&leader.rpc); PRE(sm_state(&leader.sm) == LS_REQ_SIG_LOOP); ut_leader_message_received(&leader, ut_sign_result()); ut_disk_io(&leader.work); ut_disk_io_done(&leader.work); ut_disk_io(&leader.work); ut_disk_io_done(&leader.work); PRE(sm_state(&leader.sm) == LS_PAGE_READ); ut_rpc_sent(&leader.rpc); pool_rpc_to_expired(&leader.rpc); PRE(sm_state(&leader.sm) == LS_PAGE_READ); ut_rpc_sent(&leader.rpc); ut_leader_message_received(&leader, ut_page_result()); PRE(sm_state(&leader.sm) == LS_SNAP_DONE); ut_rpc_sent(&leader.rpc); ut_leader_message_received(&leader, ut_install_snapshot_result()); sm_fini(&leader.sm); return MUNIT_OK; } dqlite-1.16.7/test/raft/unit/test_uv_fs.c000066400000000000000000000421411465252713400203220ustar00rootroot00000000000000#include #include "../../../src/raft/uv_fs.h" #include "../../../src/raft/uv_os.h" #include "../lib/aio.h" #include "../lib/dir.h" #include "../lib/runner.h" /****************************************************************************** * * UvFsCheckDir * *****************************************************************************/ /* Invoke UvFsCheckDir passing it the given dir. */ #define CHECK_DIR(DIR) \ { \ int _rv; \ char _errmsg[RAFT_ERRMSG_BUF_SIZE]; \ _rv = UvFsCheckDir(DIR, _errmsg); \ munit_assert_int(_rv, ==, 0); \ } /* Invoke UvFsCheckDir passing it the given dir and check that the given error * occurs. */ #define CHECK_DIR_ERROR(DIR, RV, ERRMSG) \ { \ int _rv; \ char _errmsg[RAFT_ERRMSG_BUF_SIZE]; \ _rv = UvFsCheckDir(DIR, _errmsg); \ munit_assert_int(_rv, ==, RV); \ munit_assert_string_equal(_errmsg, ERRMSG); \ } SUITE(UvFsCheckDir) /* If the directory exists, the function succeeds. */ TEST(UvFsCheckDir, exists, DirSetUp, DirTearDown, 0, NULL) { const char *dir = data; CHECK_DIR(dir); return MUNIT_OK; } /* If the directory doesn't exist, it an error is returned. 
*/ TEST(UvFsCheckDir, doesNotExist, DirSetUp, DirTearDown, 0, NULL) { const char *parent = data; char errmsg[RAFT_ERRMSG_BUF_SIZE]; char dir[128]; sprintf(errmsg, "%s/sub", parent); sprintf(errmsg, "directory '%s' does not exist", dir); CHECK_DIR_ERROR(dir, RAFT_NOTFOUND, errmsg); return MUNIT_OK; } /* If the process can't access the directory, an error is returned. */ TEST(UvFsCheckDir, permissionDenied, NULL, NULL, 0, NULL) { bool has_access = DirHasFile("/proc/1", "root"); /* Skip the test is the process actually has access to /proc/1/root. */ if (has_access) { return MUNIT_SKIP; } CHECK_DIR_ERROR("/proc/1/root", RAFT_UNAUTHORIZED, "can't access directory '/proc/1/root'"); return MUNIT_OK; } /* If the given path contains a non-directory prefix, an error is returned. */ TEST(UvFsCheckDir, notDirPrefix, NULL, NULL, 0, NULL) { CHECK_DIR_ERROR("/dev/null/foo", RAFT_INVALID, "path '/dev/null/foo' is not a directory"); return MUNIT_OK; } /* If the given path is not a directory, an error is returned. */ TEST(UvFsCheckDir, notDir, NULL, NULL, 0, NULL) { CHECK_DIR_ERROR("/dev/null", RAFT_INVALID, "path '/dev/null' is not a directory"); return MUNIT_OK; } /* If the given directory is not writable, an error is returned. */ TEST(UvFsCheckDir, notWritable, DirSetUp, DirTearDown, 0, NULL) { const char *dir = data; char errmsg[RAFT_ERRMSG_BUF_SIZE]; sprintf(errmsg, "directory '%s' is not writable", dir); DirMakeUnwritable(dir); CHECK_DIR_ERROR(dir, RAFT_INVALID, errmsg); return MUNIT_OK; } /****************************************************************************** * * UvFsSyncDir * *****************************************************************************/ /* Invoke UvFsSyncDir passing it the given dir. */ #define SYNC_DIR_ERROR(DIR, RV, ERRMSG) \ { \ char _errmsg[RAFT_ERRMSG_BUF_SIZE]; \ munit_assert_int(UvFsSyncDir(DIR, _errmsg), ==, RV); \ munit_assert_string_equal(_errmsg, ERRMSG); \ } SUITE(UvFsSyncDir) /* If the directory doesn't exist, an error is returned. */ TEST(UvFsSyncDir, noExists, NULL, NULL, 0, NULL) { SYNC_DIR_ERROR("/abcdef", RAFT_IOERR, "open directory: no such file or directory"); return MUNIT_OK; } /****************************************************************************** * * UvFsOpenFileForReading * *****************************************************************************/ /* Open a file in the given dir. */ #define OPEN_FILE_FOR_READING_ERROR(DIR, FILENAME, RV, ERRMSG) \ { \ uv_file fd_; \ char errmsg_[RAFT_ERRMSG_BUF_SIZE]; \ int rv_ = UvFsOpenFileForReading(DIR, FILENAME, &fd_, errmsg_); \ munit_assert_int(rv_, ==, RV); \ munit_assert_string_equal(errmsg_, ERRMSG); \ } SUITE(UvFsOpenFileForReading) /* If the directory doesn't exist, an error is returned. */ TEST(UvFsOpenFileForReading, noExists, DirSetUp, DirTearDown, 0, NULL) { const char *dir = data; OPEN_FILE_FOR_READING_ERROR(dir, "foo", RAFT_IOERR, "open: no such file or directory"); return MUNIT_OK; } /****************************************************************************** * * UvFsAllocateFile * *****************************************************************************/ #define FALLOCATE_PARAM "fallocate" static char *fallocate_params[] = {"1", "0", NULL}; MunitParameterEnum fallocateParams[] = { {FALLOCATE_PARAM, fallocate_params}, {NULL, NULL}, }; /* Allocate a file with the given parameters and assert that no error occurred. 
*/ #define ALLOCATE_FILE(DIR, FILENAME, SIZE) \ { \ uv_file fd_; \ char errmsg_; \ int rv_; \ bool fallocate_ = true; \ const char *f = munit_parameters_get(params, FALLOCATE_PARAM); \ if (f != NULL) { \ fallocate_ = atoi(f); \ } \ rv_ = \ UvFsAllocateFile(DIR, FILENAME, SIZE, &fd_, fallocate_, &errmsg_); \ munit_assert_int(rv_, ==, 0); \ munit_assert_int(UvOsClose(fd_), ==, 0); \ } /* Assert that creating a file with the given parameters fails with the given * code and error message. */ #define ALLOCATE_FILE_ERROR(DIR, FILENAME, SIZE, RV, ERRMSG) \ { \ uv_file fd_; \ char errmsg_[RAFT_ERRMSG_BUF_SIZE]; \ int rv_; \ bool fallocate_ = true; \ const char *f = munit_parameters_get(params, FALLOCATE_PARAM); \ if (f != NULL) { \ fallocate_ = atoi(f); \ } \ rv_ = \ UvFsAllocateFile(DIR, FILENAME, SIZE, &fd_, fallocate_, errmsg_); \ munit_assert_int(rv_, ==, RV); \ munit_assert_string_equal(errmsg_, ERRMSG); \ } SUITE(UvFsAllocateFile) /* If the given path is valid, the file gets created. */ TEST(UvFsAllocateFile, success, DirSetUp, DirTearDown, 0, fallocateParams) { const char *dir = data; ALLOCATE_FILE(dir, /* dir */ "foo", /* filename */ 4096 /* size */); munit_assert_true(DirHasFile(dir, "foo")); return MUNIT_OK; } /* The directory of given path does not exist, an error is returned. */ TEST(UvFsAllocateFile, dirNoExists, NULL, NULL, 0, fallocateParams) { ALLOCATE_FILE_ERROR("/non/existing/dir", /* dir */ "foo", /* filename */ 64, /* size */ RAFT_IOERR, /* status */ "open: no such file or directory"); return MUNIT_OK; } /* If the given path already exists, an error is returned. */ TEST(UvFsAllocateFile, fileAlreadyExists, DirSetUp, DirTearDown, 0, fallocateParams) { const char *dir = data; char buf[8] = {0}; DirWriteFile(dir, "foo", buf, sizeof buf); ALLOCATE_FILE_ERROR(dir, /* dir */ "foo", /* filename */ 64, /* size */ RAFT_IOERR, /* status */ "open: file already exists"); return MUNIT_OK; } static char *dirTmpfs_params[] = {"tmpfs", NULL}; MunitParameterEnum noSpaceParams[] = { {DIR_FS_PARAM, dirTmpfs_params}, {"fallocate", fallocate_params}, {NULL, NULL}, }; /* The file system has run out of space. */ TEST(UvFsAllocateFile, noSpace, DirSetUp, DirTearDown, 0, noSpaceParams) { const char *dir = data; if (dir == NULL) { return MUNIT_SKIP; } ALLOCATE_FILE_ERROR(dir, /* dir */ "foo", /* filename */ 4096 * 32768, /* size */ RAFT_NOSPACE, /* status */ "not enough space to allocate 134217728 bytes"); munit_assert_false(DirHasFile(dir, "foo")); return MUNIT_OK; } /****************************************************************************** * * UvFsProbeCapabilities * *****************************************************************************/ /* Invoke UvFsProbeCapabilities against the given dir and assert that it returns * the given values for direct I/O and async I/O. */ #define PROBE_CAPABILITIES(DIR, DIRECT_IO, ASYNC_IO, FALLOCATE) \ { \ size_t direct_io_; \ bool async_io_; \ bool fallocate_; \ char errmsg_[RAFT_ERRMSG_BUF_SIZE]; \ int rv_; \ rv_ = UvFsProbeCapabilities(DIR, &direct_io_, &async_io_, &fallocate_, \ errmsg_); \ munit_assert_int(rv_, ==, 0); \ munit_assert_size(direct_io_, ==, DIRECT_IO); \ munit_assert_int(fallocate_, ==, FALLOCATE); \ if (ASYNC_IO) { \ munit_assert_true(async_io_); \ } else { \ munit_assert_false(async_io_); \ } \ } /* Invoke UvFsProbeCapabilities and check that the given error occurs. 
*/ #define PROBE_CAPABILITIES_ERROR(DIR, RV, ERRMSG) \ { \ size_t direct_io_; \ bool async_io_; \ bool fallocate_; \ char errmsg_[RAFT_ERRMSG_BUF_SIZE]; \ int rv_; \ rv_ = UvFsProbeCapabilities(DIR, &direct_io_, &async_io_, &fallocate_, \ errmsg_); \ munit_assert_int(rv_, ==, RV); \ munit_assert_string_equal(errmsg_, ERRMSG); \ } SUITE(UvFsProbeCapabilities) TEST(UvFsProbeCapabilities, tmpfs, DirTmpfsSetUp, DirTearDown, 0, NULL) { const char *dir = data; if (dir == NULL) { return MUNIT_SKIP; } PROBE_CAPABILITIES(dir, 0, false, true); return MUNIT_OK; } /* ZFS 0.8 reports that it supports direct I/O, but does not support fully * support asynchronous kernel AIO. */ TEST(UvFsProbeCapabilities, zfsDirectIO, DirZfsSetUp, DirTearDown, 0, NULL) { const char *dir = data; size_t direct_io = 0; #if defined(RAFT_HAVE_ZFS_WITH_DIRECT_IO) direct_io = 4096; #endif if (dir == NULL) { return MUNIT_SKIP; } PROBE_CAPABILITIES(dir, direct_io, false, true); return MUNIT_OK; } /* File systems that fully support DIO. */ TEST(UvFsProbeCapabilities, aio, DirSetUp, DirTearDown, 0, DirAioParams) { const char *dir = data; if (dir == NULL) { return MUNIT_SKIP; } /* FIXME: btrfs doesn't like that we perform a first write to the probe file * to detect the direct I/O buffer size. */ if (strcmp(munit_parameters_get(params, DIR_FS_PARAM), "btrfs") == 0) { return MUNIT_SKIP; } PROBE_CAPABILITIES(dir, 4096, true, true); return MUNIT_OK; } /* If the given path is not executable, the block size of the underlying file * system can't be determined and an error is returned. */ TEST(UvFsProbeCapabilities, noAccess, DirSetUp, DirTearDown, 0, NULL) { const char *dir = data; /* Skip the test when running as root, since EACCES would not be triggered * in that case. */ if (getuid() == 0) { return MUNIT_SKIP; } DirMakeUnexecutable(dir); PROBE_CAPABILITIES_ERROR( dir, RAFT_IOERR, "create I/O capabilities probe file: open: permission denied"); return MUNIT_OK; } /* No space is left on the target device. */ TEST(UvFsProbeCapabilities, noSpace, DirTmpfsSetUp, DirTearDown, 0, NULL) { const char *dir = data; if (dir == NULL) { return MUNIT_SKIP; } DirFill(dir, 0); PROBE_CAPABILITIES_ERROR(dir, RAFT_NOSPACE, "create I/O capabilities probe file: not enough " "space to allocate 4096 bytes"); return MUNIT_OK; } /* The uvIoSetup() call fails with EAGAIN. */ TEST(UvFsProbeCapabilities, noResources, DirBtrfsSetUp, DirTearDown, 0, NULL) { const char *dir = data; aio_context_t ctx = 0; int rv; if (dir == NULL) { return MUNIT_SKIP; } rv = AioFill(&ctx, 0); if (rv != 0) { return MUNIT_SKIP; } PROBE_CAPABILITIES_ERROR( dir, RAFT_IOERR, "probe Async I/O: io_setup: resource temporarily unavailable"); AioDestroy(ctx); return MUNIT_OK; } /****************************************************************************** * * UvFsMakeFile * *****************************************************************************/ SUITE(UvFsMakeFile) /* If the file does not exist, the function succeeds. */ TEST(UvFsMakeFile, notExists, DirSetUp, DirTearDown, 0, NULL) { const char *dir = data; int rv; char errmsg[RAFT_ERRMSG_BUF_SIZE]; struct raft_buffer bufs[2] = {{0}, {0}}; rv = UvFsMakeFile(dir, "foo", bufs, 2, errmsg); munit_assert_int(rv, ==, 0); return MUNIT_OK; } /* If the file exists, the function does not succeed. 
*/ TEST(UvFsMakeFile, exists, DirSetUp, DirTearDown, 0, NULL) { const char *dir = data; int rv; char errmsg[RAFT_ERRMSG_BUF_SIZE]; struct raft_buffer bufs[2] = {{0}, {0}}; rv = UvFsMakeFile(dir, "foo", bufs, 2, errmsg); munit_assert_int(rv, ==, 0); rv = UvFsMakeFile(dir, "foo", bufs, 2, errmsg); munit_assert_int(rv, !=, 0); return MUNIT_OK; } /****************************************************************************** * * UvFsRenameFile * *****************************************************************************/ SUITE(UvFsRenameFile) TEST(UvFsRenameFile, rename, DirSetUp, DirTearDown, 0, NULL) { const char *dir = data; int rv; char errmsg[RAFT_ERRMSG_BUF_SIZE]; struct raft_buffer bufs[2] = {{0}, {0}}; rv = UvFsMakeFile(dir, "foo", bufs, 2, errmsg); munit_assert_int(rv, ==, 0); rv = UvFsRenameFile(dir, "foo", "bar", errmsg); munit_assert_int(rv, ==, 0); munit_assert_false(DirHasFile(dir, "foo")); munit_assert_true(DirHasFile(dir, "bar")); return MUNIT_OK; } /* rename to same name */ TEST(UvFsRenameFile, same, DirSetUp, DirTearDown, 0, NULL) { const char *dir = data; int rv; char errmsg[RAFT_ERRMSG_BUF_SIZE]; struct raft_buffer bufs[2] = {{0}, {0}}; rv = UvFsMakeFile(dir, "foo", bufs, 2, errmsg); munit_assert_int(rv, ==, 0); rv = UvFsRenameFile(dir, "foo", "foo", errmsg); munit_assert_int(rv, ==, 0); munit_assert_true(DirHasFile(dir, "foo")); return MUNIT_OK; } dqlite-1.16.7/test/raft/unit/test_uv_os.c000066400000000000000000000045761465252713400203450ustar00rootroot00000000000000#include "../../../src/raft/uv_os.h" #include "../lib/runner.h" SUITE(UvOsJoin) /* dir and filename have sensible lengths */ TEST(UvOsJoin, basic, NULL, NULL, 0, NULL) { int rv; const char *dir = "/home"; const char *filename = "testfile"; char path[UV__PATH_SZ]; rv = UvOsJoin(dir, filename, path); munit_assert_int(rv, ==, 0); munit_assert_string_equal(path, "/home/testfile"); return MUNIT_OK; } TEST(UvOsJoin, dirTooLong, NULL, NULL, 0, NULL) { int rv; char path[UV__PATH_SZ]; char dir[UV__DIR_LEN + 2]; /* Room for '\0' and then 1 char over limit. */ memset((char *)dir, '/', sizeof(dir)); dir[sizeof(dir) - 1] = '\0'; const char *filename = "testfile"; rv = UvOsJoin(dir, filename, path); munit_assert_int(rv, !=, 0); return MUNIT_OK; } TEST(UvOsJoin, filenameTooLong, NULL, NULL, 0, NULL) { int rv; char path[UV__PATH_SZ]; const char *dir = "testdir"; char filename[UV__FILENAME_LEN + 2]; memset((char *)filename, 'a', sizeof(filename)); filename[sizeof(filename) - 1] = '\0'; rv = UvOsJoin(dir, filename, path); munit_assert_int(rv, !=, 0); return MUNIT_OK; } TEST(UvOsJoin, dirAndFilenameTooLong, NULL, NULL, 0, NULL) { int rv; /* +2 to silence compilers that complain that dir & filename would overflow * path, but it's strictly not needed and doesn't influence the test. 
*/ char path[UV__PATH_SZ + 2]; char dir[UV__DIR_LEN + 2]; memset((char *)dir, '/', sizeof(dir)); dir[sizeof(dir) - 1] = '\0'; char filename[UV__FILENAME_LEN + 2]; memset((char *)filename, 'a', sizeof(filename)); filename[sizeof(filename) - 1] = '\0'; rv = UvOsJoin(dir, filename, path); munit_assert_int(rv, !=, 0); return MUNIT_OK; } TEST(UvOsJoin, dirAndFilenameMax, NULL, NULL, 0, NULL) { int rv; char path[UV__PATH_SZ]; char dir[UV__DIR_LEN + 1]; memset((char *)dir, '/', sizeof(dir)); dir[sizeof(dir) - 1] = '\0'; char filename[UV__FILENAME_LEN + 1]; memset((char *)filename, 'a', sizeof(filename)); filename[sizeof(filename) - 1] = '\0'; rv = UvOsJoin(dir, filename, path); munit_assert_int(rv, ==, 0); char cmp_path[UV__DIR_LEN + UV__FILENAME_LEN + 1 + 1]; snprintf(cmp_path, UV__DIR_LEN + UV__FILENAME_LEN + 1 + 1, "%s/%s", dir, filename); munit_assert_string_equal(path, cmp_path); return MUNIT_OK; } dqlite-1.16.7/test/raft/unit/test_uv_writer.c000066400000000000000000000323051465252713400212270ustar00rootroot00000000000000#include "../../../src/raft/uv_fs.h" #include "../../../src/raft/uv_writer.h" #include "../lib/aio.h" #include "../lib/dir.h" #include "../lib/loop.h" #include "../lib/runner.h" /****************************************************************************** * * Fixture with a UvWriter and an open file ready for writing. * *****************************************************************************/ struct fixture { FIXTURE_DIR; FIXTURE_LOOP; int fd; size_t block_size; size_t direct_io; bool fallocate; bool async_io; char errmsg[256]; struct UvWriter writer; bool closed; }; /****************************************************************************** * * Helper macros. * *****************************************************************************/ struct result { int status; bool done; }; static void closeCb(struct UvWriter *writer) { struct fixture *f = writer->data; f->closed = true; } static void submitCbAssertResult(struct UvWriterReq *req, int status) { struct result *result = req->data; munit_assert_int(status, ==, result->status); result->done = true; } /* Initialize the fixture's writer. */ #define INIT(MAX_WRITES) \ do { \ int _rv; \ _rv = UvWriterInit(&f->writer, &f->loop, f->fd, f->direct_io != 0, \ f->async_io, MAX_WRITES, f->errmsg); \ munit_assert_int(_rv, ==, 0); \ f->writer.data = f; \ f->closed = false; \ } while (0) /* Try to initialize the fixture's writer and check that the given error is * returned. */ #define INIT_ERROR(RV, ERRMSG) \ do { \ int _rv; \ _rv = UvWriterInit(&f->writer, &f->loop, f->fd, f->direct_io != 0, \ f->async_io, 1, f->errmsg); \ munit_assert_int(_rv, ==, RV); \ munit_assert_string_equal(f->errmsg, ERRMSG); \ } while (0) /* Close helper. 
*/ #define CLOSE_SUBMIT \ munit_assert_false(f->closed); \ UvWriterClose(&f->writer, closeCb); \ munit_assert_false(f->closed) #define CLOSE_WAIT LOOP_RUN_UNTIL(&f->closed) #define CLOSE \ CLOSE_SUBMIT; \ CLOSE_WAIT #define MAKE_BUFS(BUFS, N_BUFS, CONTENT) \ { \ int __i; \ BUFS = munit_malloc(sizeof *BUFS * N_BUFS); \ for (__i = 0; __i < N_BUFS; __i++) { \ uv_buf_t *__buf = &BUFS[__i]; \ __buf->len = f->block_size; \ __buf->base = aligned_alloc(f->block_size, f->block_size); \ munit_assert_ptr_not_null(__buf->base); \ memset(__buf->base, CONTENT + __i, __buf->len); \ } \ } #define DESTROY_BUFS(BUFS, N_BUFS) \ { \ int __i; \ for (__i = 0; __i < N_BUFS; __i++) { \ free(BUFS[__i].base); \ } \ free(BUFS); \ } #define WRITE_REQ(N_BUFS, CONTENT, OFFSET, RV, STATUS) \ struct uv_buf_t *_bufs; \ struct UvWriterReq _req; \ struct result _result = {STATUS, false}; \ int _rv; \ MAKE_BUFS(_bufs, N_BUFS, CONTENT); \ _req.data = &_result; \ _rv = UvWriterSubmit(&f->writer, &_req, _bufs, N_BUFS, OFFSET, \ submitCbAssertResult); \ munit_assert_int(_rv, ==, RV); /* Submit a write request with the given parameters and wait for the operation * to successfully complete. Deallocate BUFS when done. * * N_BUFS is the number of buffers to allocate and write, each of them will have * f->block_size bytes. * * CONTENT must be an unsigned byte value: all bytes of the first buffer will be * filled with that value, all bytes of the second buffer will be filled will * that value plus one, etc. * * OFFSET is the offset at which to write the buffers. */ #define WRITE(N_BUFS, CONTENT, OFFSET) \ do { \ WRITE_REQ(N_BUFS, CONTENT, OFFSET, 0 /* rv */, 0 /* status */); \ LOOP_RUN_UNTIL(&_result.done); \ DESTROY_BUFS(_bufs, N_BUFS); \ } while (0) /* Submit a write request with the given parameters and wait for the operation * to fail with the given code and message. */ #define WRITE_FAILURE(N_BUFS, CONTENT, OFFSET, STATUS, ERRMSG) \ do { \ WRITE_REQ(N_BUFS, CONTENT, OFFSET, 0 /* rv */, STATUS); \ LOOP_RUN_UNTIL(&_result.done); \ munit_assert_string_equal(f->writer.errmsg, ERRMSG); \ DESTROY_BUFS(_bufs, N_BUFS); \ } while (0) /* Submit a write request with the given parameters, close the writer right * after and assert that the request got canceled. */ #define WRITE_CLOSE(N_BUFS, CONTENT, OFFSET, STATUS) \ do { \ WRITE_REQ(N_BUFS, CONTENT, OFFSET, 0 /* rv */, STATUS); \ CLOSE_SUBMIT; \ munit_assert_false(_result.done); \ LOOP_RUN_UNTIL(&_result.done); \ DESTROY_BUFS(_bufs, N_BUFS); \ CLOSE_WAIT; \ } while (0) /* Assert that the content of the test file has the given number of blocks, each * filled with progressive numbers. */ #define ASSERT_CONTENT(N) \ do { \ size_t _size = N * f->block_size; \ void *_buf = munit_malloc(_size); \ unsigned _i; \ unsigned _j; \ \ DirReadFile(f->dir, "foo", _buf, _size); \ \ for (_i = 0; _i < N; _i++) { \ char *cursor = (char *)_buf + _i * f->block_size; \ for (_j = 0; _j < f->block_size; _j++) { \ munit_assert_int(cursor[_j], ==, _i + 1); \ } \ } \ \ free(_buf); \ } while (0) #define N_BLOCKS 5 /****************************************************************************** * * Set up and tear down. 
* *****************************************************************************/ static void *setUpDeps(const MunitParameter params[], void *user_data) { struct fixture *f = munit_malloc(sizeof *f); char path[UV__PATH_SZ]; char errmsg[256]; int rv; SET_UP_DIR; SETUP_LOOP; rv = UvFsProbeCapabilities(f->dir, &f->direct_io, &f->async_io, &f->fallocate, errmsg); munit_assert_int(rv, ==, 0); f->block_size = f->direct_io != 0 ? f->direct_io : 4096; rv = UvOsJoin(f->dir, "foo", path); munit_assert_int(rv, ==, 0); rv = UvOsOpen(path, O_WRONLY | O_CREAT, S_IRUSR | S_IWUSR, &f->fd); munit_assert_int(rv, ==, 0); rv = UvOsFallocate(f->fd, 0, f->block_size * N_BLOCKS); munit_assert_int(rv, ==, 0); return f; } static void tearDownDeps(void *data) { struct fixture *f = data; if (f == NULL) { return; /* Was skipped. */ } UvOsClose(f->fd); TEAR_DOWN_LOOP; TEAR_DOWN_DIR; free(f); } static void *setUp(const MunitParameter params[], void *user_data) { struct fixture *f = setUpDeps(params, user_data); if (f == NULL) { return NULL; } INIT(1); return f; } static void tearDown(void *data) { struct fixture *f = data; if (f == NULL) { return; /* Was skipped. */ } CLOSE; tearDownDeps(f); } /****************************************************************************** * * UvWriterInit * *****************************************************************************/ SUITE(UvWriterInit) /* The kernel has ran out of available AIO events. */ TEST(UvWriterInit, noResources, setUpDeps, tearDownDeps, 0, NULL) { struct fixture *f = data; aio_context_t ctx = 0; int rv; rv = AioFill(&ctx, 0); if (rv != 0) { return MUNIT_SKIP; } INIT_ERROR(RAFT_TOOMANY, "AIO events user limit exceeded"); AioDestroy(ctx); return MUNIT_OK; } /****************************************************************************** * * UvWriterSubmit * *****************************************************************************/ SUITE(UvWriterSubmit) TEST(UvWriterSubmit, one, setUp, tearDown, 0, DirAllParams) { struct fixture *f = data; SKIP_IF_NO_FIXTURE; WRITE(1 /* n bufs */, 1 /* content */, 0 /* offset */); ASSERT_CONTENT(1); return MUNIT_OK; } /* Write two buffers, one after the other. */ TEST(UvWriterSubmit, two, setUp, tearDown, 0, DirAllParams) { struct fixture *f = data; SKIP_IF_NO_FIXTURE; WRITE(1 /* n bufs */, 1 /* content */, 0 /* offset */); WRITE(1 /* n bufs */, 2 /* content */, f->block_size /* offset */); ASSERT_CONTENT(2); return MUNIT_OK; } /* Write the same block twice. */ TEST(UvWriterSubmit, twice, setUp, tearDown, 0, DirAllParams) { struct fixture *f = data; SKIP_IF_NO_FIXTURE; WRITE(1 /* n bufs */, 0 /* content */, 0 /* offset */); WRITE(1 /* n bufs */, 1 /* content */, 0 /* offset */); ASSERT_CONTENT(1); return MUNIT_OK; } /* Write a vector of buffers. */ TEST(UvWriterSubmit, vec, setUp, tearDown, 0, DirAllParams) { struct fixture *f = data; SKIP_IF_NO_FIXTURE; WRITE(2 /* n bufs */, 1 /* content */, 0 /* offset */); ASSERT_CONTENT(1); return MUNIT_OK; } /* Write a vector of buffers twice. */ TEST(UvWriterSubmit, vecTwice, setUp, tearDown, 0, DirAllParams) { struct fixture *f = data; SKIP_IF_NO_FIXTURE; WRITE(2 /* n bufs */, 1 /* content */, 0 /* offset */); WRITE(2 /* n bufs */, 1 /* content */, 0 /* offset */); ASSERT_CONTENT(2); return MUNIT_OK; } /* Write past the allocated space. 
*/ TEST(UvWriterSubmit, beyondEOF, setUp, tearDown, 0, DirAllParams) { struct fixture *f = data; int i; SKIP_IF_NO_FIXTURE; for (i = 0; i < N_BLOCKS + 1; i++) { WRITE(1 /* n bufs */, i + 1 /* content */, i * f->block_size /* offset */); } ASSERT_CONTENT((N_BLOCKS + 1)); return MUNIT_OK; } /* Write two different blocks concurrently. */ TEST(UvWriterSubmit, concurrent, NULL, NULL, 0, DirAllParams) { return MUNIT_SKIP; /* TODO: tests stop responding */ } /* Write the same block concurrently. */ TEST(UvWriterSubmit, concurrentSame, NULL, NULL, 0, DirAllParams) { return MUNIT_SKIP; /* TODO: tests stop responding */ } /* There are not enough resources to create an AIO context to perform the * write. */ TEST(UvWriterSubmit, noResources, setUpDeps, tearDown, 0, DirNoAioParams) { struct fixture *f = data; aio_context_t ctx = 0; int rv; SKIP_IF_NO_FIXTURE; INIT(2); rv = AioFill(&ctx, 0); if (rv != 0) { return MUNIT_SKIP; } WRITE_FAILURE(1, 0, 0, RAFT_TOOMANY, "AIO events user limit exceeded"); AioDestroy(ctx); return MUNIT_OK; } /****************************************************************************** * * UvWriterSubmit * *****************************************************************************/ SUITE(UvWriterClose) /* Close with an inflight write running in the threadpool. */ TEST(UvWriterClose, threadpool, setUp, tearDownDeps, 0, DirNoAioParams) { struct fixture *f = data; SKIP_IF_NO_FIXTURE; WRITE_CLOSE(1, 0, 0, 0); return MUNIT_OK; } /* Close with an inflight AIO write . */ TEST(UvWriterClose, aio, setUp, tearDownDeps, 0, DirAioParams) { struct fixture *f = data; SKIP_IF_NO_FIXTURE; WRITE_CLOSE(1, 0, 0, RAFT_CANCELED); return MUNIT_OK; } dqlite-1.16.7/test/test_error.c000066400000000000000000000154661465252713400164300ustar00rootroot00000000000000#include #include "../include/dqlite.h" #include "../src/error.h" #include "./lib/heap.h" #include "./lib/runner.h" #include "./lib/sqlite.h" TEST_MODULE(error); /****************************************************************************** * * Setup and tear down * ******************************************************************************/ static void *setup(const MunitParameter params[], void *user_data) { dqlite__error *error; test_heap_setup(params, user_data); test_sqlite_setup(params); error = (dqlite__error *)munit_malloc(sizeof(*error)); dqlite__error_init(error); return error; } static void tear_down(void *data) { dqlite__error *error = data; dqlite__error_close(error); test_sqlite_tear_down(); test_heap_tear_down(data); free(error); } /****************************************************************************** * * dqlite__error_printf * ******************************************************************************/ TEST_SUITE(printf); TEST_SETUP(printf, setup); TEST_TEAR_DOWN(printf, tear_down); TEST_CASE(printf, success, NULL) { dqlite__error *error = data; (void)params; munit_assert_true(dqlite__error_is_null(error)); dqlite__error_printf(error, "hello %s", "world"); munit_assert_string_equal(*error, "hello world"); return MUNIT_OK; } TEST_CASE(printf, override, NULL) { dqlite__error *error = data; (void)params; dqlite__error_printf(error, "hello %s", "world"); dqlite__error_printf(error, "I'm %s!", "here"); munit_assert_string_equal(*error, "I'm here!"); return MUNIT_OK; } TEST_CASE(printf, oom, NULL) { dqlite__error *error = data; (void)params; test_heap_fault_config(0, 1); test_heap_fault_enable(); dqlite__error_printf(error, "hello %s", "world"); munit_assert_string_equal(*error, "error message unavailable (out of 
memory)"); return MUNIT_OK; } /****************************************************************************** * * dqlite__error_wrapf * ******************************************************************************/ TEST_SUITE(wrapf); TEST_SETUP(wrapf, setup); TEST_TEAR_DOWN(wrapf, tear_down); TEST_CASE(wrapf, success, NULL) { dqlite__error *error = data; dqlite__error cause; (void)params; dqlite__error_init(&cause); dqlite__error_printf(&cause, "hello %s", "world"); dqlite__error_wrapf(error, &cause, "boom"); dqlite__error_close(&cause); munit_assert_string_equal(*error, "boom: hello world"); return MUNIT_OK; } TEST_CASE(wrapf, null_cause, NULL) { dqlite__error *error = data; dqlite__error cause; (void)params; dqlite__error_init(&cause); dqlite__error_wrapf(error, &cause, "boom"); dqlite__error_close(&cause); munit_assert_string_equal(*error, "boom: (null)"); return MUNIT_OK; } TEST_CASE(wrapf, itself, NULL) { dqlite__error *error = data; (void)params; dqlite__error_printf(error, "I'm %s!", "here"); dqlite__error_wrapf(error, error, "boom"); munit_assert_string_equal(*error, "boom: I'm here!"); return MUNIT_OK; } /****************************************************************************** * * dqlite__error_oom * ******************************************************************************/ TEST_SUITE(oom); TEST_SETUP(oom, setup); TEST_TEAR_DOWN(oom, tear_down); TEST_CASE(oom, success, NULL) { dqlite__error *error = data; (void)params; dqlite__error_oom(error, "boom"); munit_assert_string_equal(*error, "boom: out of memory"); return MUNIT_OK; } TEST_CASE(oom, vargs, NULL) { dqlite__error *error = data; (void)params; dqlite__error_oom(error, "boom %d", 123); munit_assert_string_equal(*error, "boom 123: out of memory"); return MUNIT_OK; } /****************************************************************************** * * dqlite__error_sys * ******************************************************************************/ TEST_SUITE(sys); TEST_SETUP(sys, setup); TEST_TEAR_DOWN(sys, tear_down); TEST_CASE(sys, success, NULL) { dqlite__error *error = data; (void)params; open("/foo/bar/egg/baz", 0); dqlite__error_sys(error, "boom"); munit_assert_string_equal(*error, "boom: No such file or directory"); return MUNIT_OK; } /****************************************************************************** * * dqlite__error_uv * ******************************************************************************/ TEST_SUITE(uv); TEST_SETUP(uv, setup); TEST_TEAR_DOWN(uv, tear_down); TEST_CASE(uv, success, NULL) { dqlite__error *error = data; (void)params; dqlite__error_uv(error, UV_EBUSY, "boom"); munit_assert_string_equal(*error, "boom: resource busy or locked (EBUSY)"); return MUNIT_OK; } /****************************************************************************** * * dqlite__error_copy * ******************************************************************************/ TEST_SUITE(copy); TEST_SETUP(copy, setup); TEST_TEAR_DOWN(copy, tear_down); TEST_CASE(copy, success, NULL) { dqlite__error *error = data; int err; char *msg; (void)params; dqlite__error_printf(error, "hello %s", "world"); err = dqlite__error_copy(error, &msg); munit_assert_int(err, ==, 0); munit_assert_string_equal(msg, "hello world"); sqlite3_free(msg); return MUNIT_OK; } TEST_CASE(copy, null, NULL) { dqlite__error *error = data; int err; char *msg; (void)params; err = dqlite__error_copy(error, &msg); munit_assert_int(err, ==, DQLITE_ERROR); munit_assert_ptr_equal(msg, NULL); return MUNIT_OK; } TEST_CASE(copy, oom, NULL) { dqlite__error 
*error = data; int err; char *msg; (void)params; return MUNIT_SKIP; test_heap_fault_config(2, 1); test_heap_fault_enable(); dqlite__error_printf(error, "hello"); err = dqlite__error_copy(error, &msg); munit_assert_int(err, ==, DQLITE_NOMEM); munit_assert_ptr_equal(msg, NULL); return MUNIT_OK; } /****************************************************************************** * * dqlite__error_is_disconnect * ******************************************************************************/ TEST_SUITE(is_disconnect); TEST_SETUP(is_disconnect, setup); TEST_TEAR_DOWN(is_disconnect, tear_down); TEST_CASE(is_disconnect, eof, NULL) { dqlite__error *error = data; (void)params; dqlite__error_uv(error, UV_EOF, "boom"); munit_assert_true(dqlite__error_is_disconnect(error)); return MUNIT_OK; } TEST_CASE(is_disconnect, econnreset, NULL) { dqlite__error *error = data; (void)params; dqlite__error_uv(error, UV_ECONNRESET, "boom"); munit_assert_true(dqlite__error_is_disconnect(error)); return MUNIT_OK; } TEST_CASE(is_disconnect, other, NULL) { dqlite__error *error = data; (void)params; dqlite__error_printf(error, "boom"); munit_assert_true(!dqlite__error_is_disconnect(error)); return MUNIT_OK; } TEST_CASE(is_disconnect, null, NULL) { dqlite__error *error = data; (void)params; munit_assert_true(!dqlite__error_is_disconnect(error)); return MUNIT_OK; } dqlite-1.16.7/test/test_integration.c000066400000000000000000000210721465252713400176100ustar00rootroot00000000000000#include #include #include "../include/dqlite.h" #include "./lib/runner.h" TEST_MODULE(integration); #if 0 /****************************************************************************** * * Helpers * ******************************************************************************/ /* A worker that keeps inserting rows into a test table and fetching them back, * checking that they have been all inserted. */ struct worker { struct test_client *client; /* A connected client */ int i; /* Worker index */ int a; /* Start inserting from this number */ int n; /* Number of insertions to perform */ pthread_t thread; /* System thread we run in */ }; static void *__worker_run(void *arg) { struct worker *w; char *leader; uint64_t heartbeat; uint32_t db_id; int b; int i; munit_assert_ptr_not_null(arg); w = (struct worker *)arg; /* Initialize the connection and open a database. */ test_client_handshake(w->client); test_client_leader(w->client, &leader); test_client_client(w->client, &heartbeat); test_client_open(w->client, "test.db", &db_id); b = w->a + w->n; for (i = w->a; i < b; i++) { uint32_t stmt_id; char sql[128]; struct test_client_result result; struct test_client_rows rows; struct test_client_row *row; int j; /* Insert a row in the test table. */ sprintf(sql, "INSERT INTO test(n) VALUES(%d)", i); test_client_prepare(w->client, db_id, sql, &stmt_id); test_client_exec(w->client, db_id, stmt_id, &result); munit_assert_int(result.rows_affected, ==, 1); test_client_finalize(w->client, db_id, stmt_id); /* Fetch all rows within our own working range. 
*/ sprintf(sql, "SELECT n FROM test WHERE n >= %d AND n < %d", w->a, b); test_client_prepare(w->client, db_id, sql, &stmt_id); test_client_query(w->client, db_id, stmt_id, &rows); munit_assert_int(rows.column_count, ==, 1); munit_assert_string_equal(rows.column_names[0], "n"); row = rows.next; for (j = w->a; j <= i; j++) { munit_assert_ptr_not_null(row); munit_assert_int(row->types[0], ==, SQLITE_INTEGER); munit_assert_int(*(int64_t *)row->values[0], ==, j); row = row->next; } test_client_rows_close(&rows); test_client_finalize(w->client, db_id, stmt_id); } return 0; } static void __worker_start(struct worker *w, struct test_server *server, int i, int a, int n) { int err; w->i = i; w->a = a; w->n = n; test_server_connect(server, &w->client); err = pthread_create(&w->thread, 0, &__worker_run, (void *)w); if (err) { munit_errorf("failed to spawn test worker thread: %s", strerror(errno)); } } static void __worker_wait(struct worker *w) { int err; void *retval; err = pthread_join(w->thread, &retval); if (err) { munit_errorf("failed to wait test worker thread: %s", strerror(errno)); } test_client_close(w->client); free(w->client); } /****************************************************************************** * * Setup and tear down * ******************************************************************************/ static void *setup(const MunitParameter params[], void *user_data) { struct test_server *server; const char *errmsg; int err; (void)user_data; (void)params; err = dqlite_init(&errmsg); munit_assert_int(err, ==, 0); server = test_server_start("unix", params); return server; } static void tear_down(void *data) { struct test_server *server = data; int rc; test_server_stop(server); rc = sqlite3_shutdown(); munit_assert_int(rc, ==, 0); } /****************************************************************************** * * Tests * ******************************************************************************/ TEST_SUITE(exec); TEST_SETUP(exec, setup); TEST_TEAR_DOWN(exec, tear_down); #include TEST_CASE(exec, single_query, NULL) { struct test_server *server = data; struct test_client *client; char *leader; uint64_t heartbeat; uint32_t db_id; uint32_t stmt_id; struct test_client_result result; struct test_client_rows rows; (void)params; test_server_connect(server, &client); /* Initialize the connection and open a database. */ test_client_handshake(client); test_client_leader(client, &leader); test_client_client(client, &heartbeat); test_client_open(client, "test.db", &db_id); munit_assert_int(db_id, ==, 0); /* Create a test table. */ test_client_prepare(client, db_id, "CREATE TABLE test (n INT)", &stmt_id); test_client_exec(client, db_id, stmt_id, &result); test_client_finalize(client, db_id, stmt_id); /* Insert a row in the test table. */ test_client_prepare(client, db_id, "INSERT INTO test VALUES(123)", &stmt_id); munit_assert_int(stmt_id, ==, 0); test_client_exec(client, db_id, stmt_id, &result); munit_assert_int(result.last_insert_id, ==, 1); munit_assert_int(result.rows_affected, ==, 1); test_client_finalize(client, db_id, stmt_id); /* Select rows from the test table. 
*/ test_client_prepare(client, db_id, "SELECT n FROM test", &stmt_id); munit_assert_int(stmt_id, ==, 0); test_client_query(client, db_id, stmt_id, &rows); munit_assert_int(rows.column_count, ==, 1); munit_assert_string_equal(rows.column_names[0], "n"); munit_assert_ptr_not_null(rows.next); munit_assert_int(rows.next->types[0], ==, SQLITE_INTEGER); munit_assert_int(*(int64_t *)rows.next->values[0], ==, 123); test_client_rows_close(&rows); test_client_finalize(client, db_id, stmt_id); test_client_close(client); free(client); return MUNIT_OK; } TEST_CASE(exec, large_query, NULL) { struct test_server *server = data; struct test_client *client; char *leader; uint64_t heartbeat; uint32_t db_id; uint32_t stmt_id; struct test_client_result result; struct test_client_rows rows; int i; (void)params; test_server_connect(server, &client); /* Initialize the connection and open a database. */ test_client_handshake(client); test_client_leader(client, &leader); test_client_client(client, &heartbeat); test_client_open(client, "test.db", &db_id); munit_assert_int(db_id, ==, 0); /* Create a test table. */ test_client_prepare(client, db_id, "CREATE TABLE test (n INT)", &stmt_id); test_client_exec(client, db_id, stmt_id, &result); test_client_finalize(client, db_id, stmt_id); test_client_prepare(client, db_id, "BEGIN", &stmt_id); test_client_exec(client, db_id, stmt_id, &result); test_client_finalize(client, db_id, stmt_id); /* Insert lots of rows in the test table. */ test_client_prepare(client, db_id, "INSERT INTO test VALUES(123456789)", &stmt_id); for (i = 0; i < 256; i++) { munit_assert_int(stmt_id, ==, 0); test_client_exec(client, db_id, stmt_id, &result); munit_assert_int(result.rows_affected, ==, 1); } test_client_finalize(client, db_id, stmt_id); test_client_prepare(client, db_id, "COMMIT", &stmt_id); test_client_exec(client, db_id, stmt_id, &result); test_client_finalize(client, db_id, stmt_id); /* Select all rows from the test table. */ test_client_prepare(client, db_id, "SELECT n FROM test", &stmt_id); munit_assert_int(stmt_id, ==, 0); test_client_query(client, db_id, stmt_id, &rows); munit_assert_int(rows.column_count, ==, 1); munit_assert_string_equal(rows.column_names[0], "n"); munit_assert_ptr_not_null(rows.next); munit_assert_int(rows.next->types[0], ==, SQLITE_INTEGER); munit_assert_int(*(int64_t *)rows.next->values[0], ==, 123456789); test_client_rows_close(&rows); test_client_finalize(client, db_id, stmt_id); test_client_close(client); free(client); return MUNIT_OK; } TEST_CASE(exec, multi_thread, NULL) { struct test_server *server = data; struct worker *workers; struct test_client *client; struct test_client_result result; char *leader; uint64_t heartbeat; uint32_t db_id; uint32_t stmt_id; (void)params; int n = 2; int i; test_server_connect(server, &client); /* Initialize the connection and open a database. */ test_client_handshake(client); test_client_leader(client, &leader); test_client_client(client, &heartbeat); test_client_open(client, "test.db", &db_id); munit_assert_int(db_id, ==, 0); /* Create a test table and close this client. */ test_client_prepare(client, db_id, "CREATE TABLE test (n INT)", &stmt_id); test_client_exec(client, db_id, stmt_id, &result); test_client_finalize(client, db_id, stmt_id); test_client_close(client); /* Spawn the workers. */ workers = munit_malloc(n * sizeof *workers); for (i = 0; i < n; i++) { __worker_start(&(workers[i]), server, i, i * 100000, 4); } /* Wait for the workers. 
*/ for (i = 0; i < n; i++) { __worker_wait(&(workers[i])); } free(client); free(workers); return MUNIT_OK; } #endif dqlite-1.16.7/test/unit/000077500000000000000000000000001465252713400150375ustar00rootroot00000000000000dqlite-1.16.7/test/unit/ext/000077500000000000000000000000001465252713400156375ustar00rootroot00000000000000dqlite-1.16.7/test/unit/ext/test_uv.c000066400000000000000000000117401465252713400174770ustar00rootroot00000000000000#include #include #include "../../../src/lib/transport.h" #include "../../../src/raft.h" #include "../../lib/endpoint.h" #include "../../lib/runner.h" #include "../../lib/uv.h" TEST_MODULE(ext_uv); /****************************************************************************** * * Helpers * ******************************************************************************/ struct fixture { struct uv_loop_s loop; struct uv_stream_s *listener; struct test_endpoint endpoint; int client; union { uv_tcp_t tcp; uv_pipe_t pipe; uv_stream_t stream; }; }; /* Return a buffer of size TEST_SOCKET_MIN_BUF_SIZE */ static uv_buf_t *buf_malloc(void) { uv_buf_t *buf = munit_malloc(sizeof *buf); buf->base = munit_malloc(TEST_SOCKET_MIN_BUF_SIZE); buf->len = TEST_SOCKET_MIN_BUF_SIZE; return buf; } /* Free the buffer returned by buf_malloc() */ static void buf_free(uv_buf_t *buf) { free(buf->base); free(buf); } /****************************************************************************** * * Parameters * ******************************************************************************/ /* Run the tests using both TCP and Unix sockets. */ static MunitParameterEnum endpointParams[] = { {TEST_ENDPOINT_FAMILY, test_endpoint_family_values}, {NULL, NULL}, }; /****************************************************************************** * * Setup and tear down * ******************************************************************************/ static void listenCb(uv_stream_t *listener, int status) { struct fixture *f = listener->data; int rv; munit_assert_int(status, ==, 0); switch (listener->type) { case UV_TCP: rv = uv_tcp_init(&f->loop, &f->tcp); munit_assert_int(rv, ==, 0); break; case UV_NAMED_PIPE: rv = uv_pipe_init(&f->loop, &f->pipe, 0); munit_assert_int(rv, ==, 0); break; default: munit_assert(0); } rv = uv_accept(listener, &f->stream); munit_assert_int(rv, ==, 0); } static void *setup(const MunitParameter params[], void *user_data) { struct fixture *f = munit_malloc(sizeof *f); int rv; (void)user_data; test_uv_setup(params, &f->loop); test_endpoint_setup(&f->endpoint, params); rv = transport__stream(&f->loop, f->endpoint.fd, &f->listener); munit_assert_int(rv, ==, 0); f->listener->data = f; rv = uv_listen(f->listener, 128, listenCb); munit_assert_int(rv, ==, 0); f->client = test_endpoint_connect(&f->endpoint); test_uv_run(&f->loop, 1); return f; } static void tear_down(void *data) { struct fixture *f = data; int rv; rv = close(f->client); munit_assert_int(rv, ==, 0); uv_close((struct uv_handle_s *)f->listener, (uv_close_cb)raft_free); test_endpoint_tear_down(&f->endpoint); uv_close((uv_handle_t *)(&f->stream), NULL); test_uv_stop(&f->loop); test_uv_tear_down(&f->loop); free(f); } /****************************************************************************** * * uv_write * ******************************************************************************/ TEST_SUITE(write); TEST_SETUP(write, setup); TEST_TEAR_DOWN(write, tear_down); /* Writing an amount of data below the buffer size makes that data immediately * available for reading. 
*/ TEST_CASE(write, sync, endpointParams) { struct fixture *f = data; uv_write_t req; uv_buf_t *buf1 = buf_malloc(); uv_buf_t *buf2 = buf_malloc(); int rv; (void)params; rv = uv_write(&req, &f->stream, buf1, 1, NULL); munit_assert_int(rv, ==, 0); rv = read(f->client, buf2->base, buf2->len); munit_assert_int(rv, ==, buf2->len); test_uv_run(&f->loop, 1); buf_free(buf1); buf_free(buf2); return MUNIT_OK; } /****************************************************************************** * * uv_read * ******************************************************************************/ TEST_SUITE(read); TEST_SETUP(read, setup); TEST_TEAR_DOWN(read, tear_down); static void test_read_sync__alloc_cb(uv_handle_t *stream, size_t _, uv_buf_t *buf) { (void)stream; (void)_; buf->len = TEST_SOCKET_MIN_BUF_SIZE; buf->base = munit_malloc(TEST_SOCKET_MIN_BUF_SIZE); } static void test_read_sync__read_cb(uv_stream_t *stream, ssize_t nread, const uv_buf_t *buf) { bool *read_cb_called; /* Apparently there's an empty read before the actual one. */ if (nread == 0) { free(buf->base); return; } munit_assert_int(nread, ==, TEST_SOCKET_MIN_BUF_SIZE); munit_assert_int(buf->len, ==, TEST_SOCKET_MIN_BUF_SIZE); read_cb_called = stream->data; *read_cb_called = true; free(buf->base); } /* Reading an amount of data below the buffer size happens synchronously. */ TEST_CASE(read, sync, endpointParams) { struct fixture *f = data; uv_buf_t *buf = buf_malloc(); int rv; bool read_cb_called; (void)params; f->stream.data = &read_cb_called; rv = uv_read_start(&f->stream, test_read_sync__alloc_cb, test_read_sync__read_cb); munit_assert_int(rv, ==, 0); rv = write(f->client, buf->base, buf->len); munit_assert_int(rv, ==, buf->len); test_uv_run(&f->loop, 1); munit_assert_true(read_cb_called); buf_free(buf); return MUNIT_OK; } dqlite-1.16.7/test/unit/ext/test_uv_pool.c000066400000000000000000000041321465252713400205250ustar00rootroot00000000000000#include "../../../src/lib/threadpool.h" #include "../../../src/utils.h" #include "../../lib/runner.h" #include "../../lib/uv.h" TEST_MODULE(ext_uv_pool); /****************************************************************************** * * threadpool * ******************************************************************************/ enum { WORK_ITEMS_NR = 50000 }; struct fixture { pool_work_t w; uv_loop_t loop; pool_t pool; }; static void loop_setup(struct fixture *f) { int rc; rc = uv_loop_init(&f->loop); munit_assert_int(rc, ==, 0); rc = pool_init(&f->pool, &f->loop, 4, POOL_QOS_PRIO_FAIR); munit_assert_int(rc, ==, 0); } static void bottom_work_cb(pool_work_t *w) { (void)w; } static void bottom_after_work_cb(pool_work_t *w) { static int count = 0; if (count == WORK_ITEMS_NR) pool_close(w->pool); count++; assert(w->type != WT_BAR); free(w); } static void after_work_cb(pool_work_t *w) { enum pool_work_type pwt; pool_work_t *work; unsigned int wt; unsigned int i; for (i = 0; i <= WORK_ITEMS_NR + 1 /* +WT_BAR */; i++) { work = calloc(1, sizeof(*work)); if (i < WORK_ITEMS_NR / 2) wt = WT_ORD1; else if (i == WORK_ITEMS_NR / 2) wt = WT_BAR; else wt = WT_ORD2; pwt = i % 2 == 0 ?
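/* Even-indexed items keep their ordered type, odd ones are queued unordered. */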
wt : WT_UNORD; pool_queue_work(w->pool, work, i, pwt, bottom_work_cb, bottom_after_work_cb); } } static void work_cb(pool_work_t *w) { (void)w; } static void threadpool_tear_down(void *data) { int rc; struct fixture *f = data; pool_fini(&f->pool); rc = uv_loop_close(&f->loop); munit_assert_int(rc, ==, 0); free(f); } static void *threadpool_setup(const MunitParameter params[], void *user_data) { (void)params; (void)user_data; struct fixture *f = calloc(1, sizeof *f); loop_setup(f); return f; } TEST_SUITE(threadpool); TEST_SETUP(threadpool, threadpool_setup); TEST_TEAR_DOWN(threadpool, threadpool_tear_down); TEST_CASE(threadpool, sync, NULL) { (void)params; struct fixture *f = data; int rc; pool_queue_work(&f->pool, &f->w, 0, WT_UNORD, work_cb, after_work_cb); rc = uv_run(&f->loop, UV_RUN_DEFAULT); munit_assert_int(rc, ==, 0); return MUNIT_OK; } dqlite-1.16.7/test/unit/lib/000077500000000000000000000000001465252713400156055ustar00rootroot00000000000000dqlite-1.16.7/test/unit/lib/test_addr.c000066400000000000000000000034211465252713400177220ustar00rootroot00000000000000#include #include #include "../../../src/lib/addr.h" #include "../../lib/runner.h" TEST_MODULE(lib_addr); struct fixture { struct sockaddr_un addr_un; }; static void *setup(const MunitParameter params[], void *user_data) { struct fixture *f = munit_malloc(sizeof(*f)); (void)params; (void)user_data; return f; } static void tear_down(void *data) { struct fixture *f = data; free(f); } #define ASSERT_PARSE(ADDR, STATUS, FAMILY) \ socklen_t addr_len = sizeof(f->addr_un); \ int rv; \ rv = AddrParse(ADDR, (struct sockaddr *)&f->addr_un, &addr_len, \ "8080", DQLITE_ADDR_PARSE_UNIX); \ munit_assert_int(rv, ==, STATUS); \ munit_assert_int(f->addr_un.sun_family, ==, FAMILY) TEST_SUITE(parse); TEST_SETUP(parse, setup); TEST_TEAR_DOWN(parse, tear_down); TEST_CASE(parse, ipv4_no_port, NULL) { struct fixture *f = data; (void)params; ASSERT_PARSE("1.2.3.4", 0, AF_INET); return MUNIT_OK; } TEST_CASE(parse, ipv4_with_port, NULL) { struct fixture *f = data; (void)params; ASSERT_PARSE("127.0.0.1:9001", 0, AF_INET); return MUNIT_OK; } TEST_CASE(parse, ipv6_no_port, NULL) { struct fixture *f = data; (void)params; ASSERT_PARSE("::1", 0, AF_INET6); return MUNIT_OK; } TEST_CASE(parse, ipv6_with_port, NULL) { struct fixture *f = data; (void)params; ASSERT_PARSE("[2001:4860:4860::8888]:9001", 0, AF_INET6); return MUNIT_OK; } TEST_CASE(parse, unix, NULL) { struct fixture *f = data; (void)params; ASSERT_PARSE("@xyz", 0, AF_UNIX); return MUNIT_OK; } TEST_CASE(parse, unix_auto, NULL) { struct fixture *f = data; (void)params; ASSERT_PARSE("@", 0, AF_UNIX); return MUNIT_OK; } dqlite-1.16.7/test/unit/lib/test_buffer.c000066400000000000000000000053061465252713400202650ustar00rootroot00000000000000#include "../../../src/lib/buffer.h" #include "../../lib/runner.h" TEST_MODULE(lib_buffer); /****************************************************************************** * * Fixture * ******************************************************************************/ struct fixture { struct buffer buffer; }; static void *setup(const MunitParameter params[], void *user_data) { struct fixture *f = munit_malloc(sizeof *f); int rc; (void)params; (void)user_data; rc = buffer__init(&f->buffer); munit_assert_int(rc, ==, 0); return f; } static void tear_down(void *data) { struct fixture *f = data; buffer__close(&f->buffer); free(f); } /****************************************************************************** * * Helper macros. 
* ******************************************************************************/ #define ADVANCE(SIZE) \ { \ cursor = buffer__advance(&f->buffer, SIZE); \ munit_assert_ptr_not_null(cursor); \ } /****************************************************************************** * * Assertions. * ******************************************************************************/ #define ASSERT_N_PAGES(N) munit_assert_int(f->buffer.n_pages, ==, N) /****************************************************************************** * * buffer__init * ******************************************************************************/ TEST_SUITE(init); TEST_SETUP(init, setup); TEST_TEAR_DOWN(init, tear_down); /* A newly initialized buffer has a single page, whose size is the system page * size. */ TEST_CASE(init, n_pages, NULL) { struct fixture *f = data; (void)params; ASSERT_N_PAGES(1); munit_assert_long(f->buffer.page_size, ==, sysconf(_SC_PAGESIZE)); return MUNIT_OK; } /****************************************************************************** * * buffer__advance * ******************************************************************************/ TEST_SUITE(advance); TEST_SETUP(advance, setup); TEST_TEAR_DOWN(advance, tear_down); /* The buffer already has enough capacity. */ TEST_CASE(advance, enough, NULL) { struct fixture *f = data; void *cursor; (void)params; ADVANCE(16); ASSERT_N_PAGES(1); return MUNIT_OK; } /* The buffer needs to double its size once. */ TEST_CASE(advance, double, NULL) { struct fixture *f = data; void *cursor; (void)params; ADVANCE(16 + f->buffer.page_size); ASSERT_N_PAGES(2); return MUNIT_OK; } /* The buffer needs to double its size twice. */ TEST_CASE(advance, double_twice, NULL) { struct fixture *f = data; void *cursor; (void)params; ADVANCE(16 + 3 * f->buffer.page_size); ASSERT_N_PAGES(4); return MUNIT_OK; } dqlite-1.16.7/test/unit/lib/test_byte.c000066400000000000000000000041321465252713400177530ustar00rootroot00000000000000#include "../../../src/lib/byte.h" #include "../../lib/runner.h" TEST_MODULE(lib_byte); TEST_SUITE(endian); static uint16_t vfsFlip16(uint16_t v) { #if defined(DQLITE_BIG_ENDIAN) return v; #elif defined(DQLITE_LITTLE_ENDIAN) && defined(DQLITE_HAVE_BSWAP) return __builtin_bswap16(v); #else union { uint16_t u; uint8_t v[2]; } s; s.v[0] = (uint8_t)(v >> 8); s.v[1] = (uint8_t)v; return s.u; #endif } static uint32_t vfsFlip32(uint32_t v) { #if defined(DQLITE_BIG_ENDIAN) return v; #elif defined(DQLITE_LITTLE_ENDIAN) && defined(DQLITE_HAVE_BSWAP) return __builtin_bswap32(v); #else union { uint32_t u; uint8_t v[4]; } s; s.v[0] = (uint8_t)(v >> 24); s.v[1] = (uint8_t)(v >> 16); s.v[2] = (uint8_t)(v >> 8); s.v[3] = (uint8_t)v; return s.u; #endif } static uint16_t vfsGet16(const uint8_t *buf) { union { uint16_t u; uint8_t v[2]; } s; s.v[0] = buf[0]; s.v[1] = buf[1]; return vfsFlip16(s.u); } static uint32_t vfsGet32(const uint8_t *buf) { union { uint32_t u; uint8_t v[4]; } s; s.v[0] = buf[0]; s.v[1] = buf[1]; s.v[2] = buf[2]; s.v[3] = buf[3]; return vfsFlip32(s.u); } static void vfsPut32(uint32_t v, uint8_t *buf) { uint32_t u = vfsFlip32(v); memcpy(buf, &u, sizeof u); } TEST_CASE(endian, get16, NULL) { (void)params; (void)data; uint16_t x, y; uint8_t buf[2]; for (x = 0; x < 1 << 8; x++) { for (y = 0; y < 1 << 8; y++) { buf[0] = (uint8_t)x; buf[1] = (uint8_t)y; munit_assert_uint16(ByteGetBe16(buf), ==, vfsGet16(buf)); } } return MUNIT_OK; } TEST_CASE(endian, get32, NULL) { (void)params; (void)data; uint8_t buf[4];
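/* Cross-check ByteGetBe32 against the reference big-endian decoder on random buffers. */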
uint32_t i; for (i = 0; i < 1 << 16; i++) { munit_rand_memory(4, buf); munit_assert_uint32(ByteGetBe32(buf), ==, vfsGet32(buf)); } return MUNIT_OK; } TEST_CASE(endian, put32, NULL) { (void)params; (void)data; uint32_t v; uint8_t buf[4], vfs_buf[4]; uint32_t i; for (i = 0; i < (1 << 16); i++) { v = munit_rand_uint32(); BytePutBe32(v, buf); vfsPut32(v, vfs_buf); munit_assert_memory_equal(4, buf, vfs_buf); } return MUNIT_OK; } dqlite-1.16.7/test/unit/lib/test_registry.c000066400000000000000000000200101465252713400206510ustar00rootroot00000000000000#include #include "../../../src/lib/registry.h" #include "../../lib/runner.h" TEST_MODULE(lib_registry); struct test_item { size_t id; int *ptr; }; static void test_item_init(struct test_item *i) { munit_assert(i != NULL); i->ptr = (int *)sqlite3_malloc(sizeof(*(i->ptr))); *i->ptr = 123; } static void test_item_close(struct test_item *i) { munit_assert(i != NULL); munit_assert(i->ptr != NULL); sqlite3_free(i->ptr); } static const char *test_item_hash(struct test_item *i) { munit_assert(i != NULL); return "x"; } REGISTRY(test_registry, test_item); REGISTRY_METHODS(test_registry, test_item); static void *setup(const MunitParameter params[], void *user_data) { struct test_registry *registry; (void)params; (void)user_data; registry = (struct test_registry *)munit_malloc(sizeof(*registry)); test_registry_init(registry); return registry; } static void tear_down(void *data) { struct test_registry *registry = data; test_registry_close(registry); free(registry); } TEST_SUITE(add); TEST_SETUP(add, setup); TEST_TEAR_DOWN(add, tear_down); static char *test_add_n[] = {"1", "2", "3", "5", "6", "7", "8", "9", "10", NULL}; static MunitParameterEnum test_add_params[] = { {"n", test_add_n}, {NULL, NULL}, }; /* Add N items. */ TEST_CASE(add, basic, test_add_params) { struct test_registry *registry = data; int err; struct test_item *item; int n; int i; n = atoi(munit_parameters_get(params, "n")); munit_assert_int(n, >, 0); for (i = 0; i < n; i++) { err = test_registry_add(registry, &item); munit_assert_int(err, ==, 0); munit_assert_ptr_not_equal(item, NULL); munit_assert_ptr_not_equal(item->ptr, NULL); munit_assert_int(123, ==, *item->ptr); } return MUNIT_OK; } /* Add three items, delete the second, and then add another one. The original ID * of the deleted item gets reused. */ TEST_CASE(add, del_add, NULL) { struct test_registry *registry = data; int err; struct test_item *item1; struct test_item *item2; struct test_item *item3; struct test_item *item4; int item2_id; (void)params; err = test_registry_add(registry, &item1); munit_assert_int(err, ==, 0); err = test_registry_add(registry, &item2); munit_assert_int(err, ==, 0); item2_id = item2->id; err = test_registry_add(registry, &item3); munit_assert_int(err, ==, 0); err = test_registry_del(registry, item2); munit_assert_int(err, ==, 0); err = test_registry_add(registry, &item4); munit_assert_int(err, ==, 0); munit_assert_int(item4->id, ==, item2_id); return MUNIT_OK; } /* Add N items and then delete them all. 
*/ TEST_CASE(add, and_del, test_add_params) { struct test_registry *registry = data; int err; struct test_item **items; int n; int i; n = atoi(munit_parameters_get(params, "n")); munit_assert_int(n, >, 0); items = munit_malloc(n * sizeof(*items)); for (i = 0; i < n; i++) { err = test_registry_add(registry, &items[i]); munit_assert_int(err, ==, 0); } for (i = 0; i < n; i++) { err = test_registry_del(registry, items[i]); munit_assert_int(err, ==, 0); } free(items); return MUNIT_OK; } TEST_SUITE(get); TEST_SETUP(get, setup); TEST_TEAR_DOWN(get, tear_down); /* Retrieve a previously added item. */ TEST_CASE(get, basic, NULL) { struct test_registry *registry = data; int err; struct test_item *item; (void)params; err = test_registry_add(registry, &item); munit_assert_int(err, ==, 0); munit_assert_ptr_equal(test_registry_get(registry, item->id), item); return MUNIT_OK; } /* An item gets added and then deleted. Trying to fetch the item using its * former ID results in a NULL pointer. */ TEST_CASE(get, deleted, NULL) { struct test_registry *registry = data; int err; struct test_item *item; size_t id; (void)params; err = test_registry_add(registry, &item); munit_assert_int(err, ==, 0); id = item->id; err = test_registry_del(registry, item); munit_assert_int(err, ==, 0); munit_assert_ptr_equal(test_registry_get(registry, id), NULL); return MUNIT_OK; } /* Retrieve an item with an ID bigger than the current registry's length. */ TEST_CASE(get, out_of_bound, NULL) { struct test_registry *registry = data; struct test_item *item = test_registry_get(registry, 123); (void)params; munit_assert_ptr_equal(item, NULL); return MUNIT_OK; } TEST_SUITE(idx); TEST_SETUP(idx, setup); TEST_TEAR_DOWN(idx, tear_down); /* Find the index of a matching item. */ TEST_CASE(idx, found, NULL) { struct test_registry *registry = data; struct test_item *item; size_t i; int err; (void)params; err = test_registry_add(registry, &item); munit_assert_int(err, ==, 0); err = test_registry_idx(registry, "x", &i); munit_assert_int(err, ==, 0); munit_assert_int(i, ==, item->id); return MUNIT_OK; } /* No matching item. */ TEST_CASE(idx, not_found, NULL) { struct test_registry *registry = data; struct test_item *item1; struct test_item *item2; size_t i; int err; (void)params; err = test_registry_add(registry, &item1); munit_assert_int(err, ==, 0); err = test_registry_add(registry, &item2); munit_assert_int(err, ==, 0); err = test_registry_del(registry, item1); munit_assert_int(err, ==, 0); err = test_registry_idx(registry, "y", &i); munit_assert_int(err, ==, DQLITE_NOTFOUND); return MUNIT_OK; } TEST_SUITE(del); TEST_SETUP(del, setup); TEST_TEAR_DOWN(del, tear_down); /* Delete an item from the registry. */ TEST_CASE(del, basic, NULL) { struct test_registry *registry = data; int err; struct test_item *item; (void)params; err = test_registry_add(registry, &item); munit_assert_int(err, ==, 0); err = test_registry_del(registry, item); munit_assert_int(err, ==, 0); return MUNIT_OK; } /* Deleting an item twice results in an error. 
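 * The first deletion releases the item's slot, so the second attempt fails with DQLITE_NOTFOUND.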
*/ TEST_CASE(del, twice, NULL) { struct test_registry *registry = data; int err; struct test_item *item; struct test_item item_clone; (void)params; err = test_registry_add(registry, &item); munit_assert_int(err, ==, 0); item_clone.id = item->id; err = test_registry_del(registry, item); munit_assert_int(err, ==, 0); err = test_registry_del(registry, &item_clone); munit_assert_int(err, ==, DQLITE_NOTFOUND); return MUNIT_OK; } /* Deleting an item twice results in an error, even if the item being deleted * again has an ID lower than the highest one. */ TEST_CASE(del, twice_middle, NULL) { struct test_registry *registry = data; int err; struct test_item *item1; struct test_item *item2; struct test_item item1_clone; (void)params; err = test_registry_add(registry, &item1); munit_assert_int(err, ==, 0); item1_clone.id = item1->id; err = test_registry_add(registry, &item2); munit_assert_int(err, ==, 0); err = test_registry_del(registry, item1); munit_assert_int(err, ==, 0); err = test_registry_del(registry, &item1_clone); munit_assert_int(err, ==, DQLITE_NOTFOUND); return MUNIT_OK; } /* Deleting an item with an unknown ID results in an error. */ TEST_CASE(del, out_of_bounds, NULL) { struct test_registry *registry = data; struct test_item item; int err; (void)params; item.id = 123; err = test_registry_del(registry, &item); munit_assert_int(err, ==, DQLITE_NOTFOUND); return MUNIT_OK; } /* Add several items and then delete them. */ TEST_CASE(del, many, NULL) { struct test_registry *registry = data; int err; struct test_item *item1; struct test_item *item2; struct test_item *item3; (void)params; err = test_registry_add(registry, &item1); munit_assert_int(err, ==, 0); munit_assert_int(item1->id, ==, 0); err = test_registry_add(registry, &item2); munit_assert_int(err, ==, 0); munit_assert_int(item2->id, ==, 1); err = test_registry_add(registry, &item3); munit_assert_int(err, ==, 0); munit_assert_int(item3->id, ==, 2); munit_assert_int(3, ==, registry->len); munit_assert_int(4, ==, registry->cap); err = test_registry_del(registry, item3); munit_assert_int(err, ==, 0); munit_assert_int(2, ==, registry->len); munit_assert_int(4, ==, registry->cap); err = test_registry_del(registry, item2); munit_assert_int(err, ==, 0); munit_assert_int(1, ==, registry->len); munit_assert_int(2, ==, registry->cap); return MUNIT_OK; } dqlite-1.16.7/test/unit/lib/test_serialize.c000066400000000000000000000231161465252713400210020ustar00rootroot00000000000000#include "../../../src/lib/serialize.h" #include "../../lib/runner.h" TEST_MODULE(lib_serialize); /****************************************************************************** * * Simple schema with stock fields. * ******************************************************************************/ #define PERSON(X, ...) \ X(text, name, ##__VA_ARGS__) \ X(uint64, age, ##__VA_ARGS__) SERIALIZE__DEFINE(person, PERSON); SERIALIZE__IMPLEMENT(person, PERSON); /****************************************************************************** * * Complex schema with a custom field.
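 * A custom field type named foo must supply foo__sizeof(), foo__encode() and foo__decode(); the pages type below implements that contract by hand.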
* ******************************************************************************/ struct pages { uint16_t n; /* Number of pages */ uint16_t size; /* Size of each page */ uint32_t __unused__; void **bufs; /* Array of page buffers */ }; static void create_pages(unsigned n, unsigned size, struct pages *pages) { unsigned i; pages->n = n; pages->size = size; pages->bufs = munit_malloc(n * sizeof *pages->bufs); for (i = 0; i < pages->n; i++) { pages->bufs[i] = munit_malloc(size); } } static void destroy_pages(struct pages *pages) { unsigned i; for (i = 0; i < pages->n; i++) { free(pages->bufs[i]); } free(pages->bufs); } /* Opaque pointer to a struct pages object. */ typedef struct pages pages_t; typedef struct person person_t; static size_t pages__sizeof(const pages_t *pages) { return uint16__sizeof(&pages->n) + uint16__sizeof(&pages->size) + uint32__sizeof(&pages->__unused__) + pages->size * pages->n /* bufs */; } static void pages__encode(const pages_t *pages, char **cursor) { unsigned i; uint16__encode(&pages->n, cursor); uint16__encode(&pages->size, cursor); uint32__encode(&pages->__unused__, cursor); for (i = 0; i < pages->n; i++) { memcpy(*cursor, pages->bufs[i], pages->size); *cursor += pages->size; } } static int pages__decode(struct cursor *cursor, pages_t *pages) { unsigned i; uint16__decode(cursor, &pages->n); uint16__decode(cursor, &pages->size); uint32__decode(cursor, &pages->__unused__); pages->bufs = munit_malloc(pages->n * sizeof *pages->bufs); for (i = 0; i < pages->n; i++) { pages->bufs[i] = (void *)cursor->p; cursor->p += pages->size; cursor->cap -= pages->size; } return 0; } #define BOOK(X, ...) \ X(text, title, ##__VA_ARGS__) \ X(person, author, ##__VA_ARGS__) \ X(pages, pages, ##__VA_ARGS__) SERIALIZE__DEFINE(book, BOOK); SERIALIZE__IMPLEMENT(book, BOOK); /****************************************************************************** * * Fixture * ******************************************************************************/ struct fixture { struct person person; struct book book; }; static void *setup(const MunitParameter params[], void *user_data) { struct fixture *f = munit_malloc(sizeof *f); (void)params; (void)user_data; return f; } static void tear_down(void *data) { free(data); } /****************************************************************************** * * Fields definition. * ******************************************************************************/ TEST_SUITE(fields); TEST_SETUP(fields, setup); TEST_TEAR_DOWN(fields, tear_down); /* The expected fields are defined on the struct. */ TEST_CASE(fields, define, NULL) { struct fixture *f = data; (void)params; f->person.name = "John Doh"; f->person.age = 40; return MUNIT_OK; } /****************************************************************************** * * Sizeof method. * ******************************************************************************/ TEST_SUITE(sizeof); TEST_SETUP(sizeof, setup); TEST_TEAR_DOWN(sizeof, tear_down); /* Padding is added if needed. */ TEST_CASE(sizeof, padding, NULL) { struct fixture *f = data; size_t size; (void)params; f->person.name = "John Doh"; f->person.age = 40; size = person__sizeof(&f->person); munit_assert_int(size, ==, 16 /* name */ + 8 /* age */); return MUNIT_OK; } /* Padding is not added if a string ends exactly at word boundary. 
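 * ("Joe Doh" is seven characters plus the NUL terminator, i.e. exactly one 8-byte word.)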
*/ TEST_CASE(sizeof, no_padding, NULL) { struct fixture *f = data; size_t size; (void)params; f->person.name = "Joe Doh"; f->person.age = 40; size = person__sizeof(&f->person); munit_assert_int(size, ==, 8 /* name */ + 8 /* age */); return MUNIT_OK; } /****************************************************************************** * * Encode method. * ******************************************************************************/ TEST_SUITE(encode); TEST_SETUP(encode, setup); TEST_TEAR_DOWN(encode, tear_down); /* Padding is added if needed. */ TEST_CASE(encode, padding, NULL) { struct fixture *f = data; size_t size; void *buf; char *cursor; (void)params; f->person.name = "John Doh"; f->person.age = 40; size = person__sizeof(&f->person); buf = munit_malloc(size); cursor = buf; person__encode(&f->person, &cursor); munit_assert_string_equal(buf, "John Doh"); munit_assert_int(ByteFlipLe64(*(uint64_t *)(buf + 16)), ==, 40); free(buf); return MUNIT_OK; } /* Padding is not added if a string ends exactly at word boundary. */ TEST_CASE(encode, no_padding, NULL) { struct fixture *f = data; size_t size; void *buf; char *cursor; (void)params; f->person.name = "Joe Doh"; f->person.age = 40; size = person__sizeof(&f->person); buf = munit_malloc(size); cursor = buf; person__encode(&f->person, &cursor); munit_assert_string_equal(buf, "Joe Doh"); munit_assert_int(ByteFlipLe64(*(uint64_t *)(buf + 8)), ==, 40); free(buf); return MUNIT_OK; } /* Encode a custom complex field. */ TEST_CASE(encode, custom, NULL) { struct fixture *f = data; size_t size; void *buf; char *cursor; (void)params; f->book.title = "Les miserables"; f->book.author.name = "Victor Hugo"; f->book.author.age = 40; create_pages(2, 8, &f->book.pages); strcpy(f->book.pages.bufs[0], "Fantine"); strcpy(f->book.pages.bufs[1], "Cosette"); size = book__sizeof(&f->book); munit_assert_int(size, ==, 16 + /* title */ 16 + /* author name */ 8 + /* author age */ 2 + /* n pages */ 2 + /* page size */ 4 + /* unused */ 8 * 2 /* page buffers */); buf = munit_malloc(size); cursor = buf; book__encode(&f->book, &cursor); cursor = buf; munit_assert_string_equal(cursor, "Les miserables"); cursor += 16; munit_assert_string_equal(cursor, "Victor Hugo"); cursor += 16; uint64_t x; memcpy(&x, cursor, sizeof(x)); munit_assert_uint64(ByteFlipLe64(x), ==, 40); cursor += 8; uint16_t y; memcpy(&y, cursor, sizeof(y)); munit_assert_uint16(ByteFlipLe16(y), ==, 2); cursor += 2; uint16_t z; memcpy(&z, cursor, sizeof(z)); munit_assert_uint16(ByteFlipLe16(z), ==, 8); cursor += 2; cursor += 4; /* Unused */ munit_assert_string_equal(cursor, "Fantine"); cursor += 8; munit_assert_string_equal(cursor, "Cosette"); free(buf); destroy_pages(&f->book.pages); return MUNIT_OK; } /****************************************************************************** * * Decode method. * ******************************************************************************/ TEST_SUITE(decode); TEST_SETUP(decode, setup); TEST_TEAR_DOWN(decode, tear_down); /* Padding is added if needed. */ TEST_CASE(decode, padding, NULL) { struct fixture *f = data; void *buf = munit_malloc(16 + 8); struct cursor cursor = {buf, 16 + 8}; (void)params; strcpy(buf, "John Doh"); *(uint64_t *)(buf + 16) = ByteFlipLe64(40); person__decode(&cursor, &f->person); munit_assert_string_equal(f->person.name, "John Doh"); munit_assert_int(f->person.age, ==, 40); free(buf); return MUNIT_OK; } /* Padding is not added if a string ends exactly at word boundary. 
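 * (The age is therefore decoded from offset 8 rather than 16.)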
*/ TEST_CASE(decode, no_padding, NULL) { struct fixture *f = data; void *buf = munit_malloc(16 + 8); struct cursor cursor = {buf, 16 + 8}; (void)params; strcpy(buf, "Joe Doh"); *(uint64_t *)(buf + 8) = ByteFlipLe64(40); person__decode(&cursor, &f->person); munit_assert_string_equal(f->person.name, "Joe Doh"); munit_assert_int(f->person.age, ==, 40); free(buf); return MUNIT_OK; } /* The given buffer does not contain enough data. */ TEST_CASE(decode, short, NULL) { struct fixture *f = data; void *buf = munit_malloc(16); struct cursor cursor = {buf, 16}; int rc; (void)params; strcpy(buf, "John Doh"); rc = person__decode(&cursor, &f->person); munit_assert_int(rc, ==, DQLITE_PARSE); free(buf); return MUNIT_OK; } /* Decode a custom complex field. */ TEST_CASE(decode, custom, NULL) { struct fixture *f = data; size_t len = 16 + /* title */ 16 + /* author name */ 8 + /* author age */ 2 + /* n pages */ 2 + /* page size */ 4 + /* unused */ 8 * 2 /* page buffers */; void *buf = munit_malloc(len); void *p = buf; struct cursor cursor = {buf, len}; (void)params; strcpy(p, "Les miserables"); p += 16; strcpy(p, "Victor Hugo"); p += 16; *(uint64_t *)p = ByteFlipLe64(40); p += 8; *(uint16_t *)p = ByteFlipLe16(2); p += 2; *(uint16_t *)p = ByteFlipLe16(8); p += 2; p += 4; /* Unused */ strcpy(p, "Fantine"); p += 8; strcpy(p, "Cosette"); book__decode(&cursor, &f->book); munit_assert_string_equal(f->book.title, "Les miserables"); munit_assert_string_equal(f->book.author.name, "Victor Hugo"); munit_assert_int(f->book.author.age, ==, 40); munit_assert_int(f->book.pages.n, ==, 2); munit_assert_int(f->book.pages.size, ==, 8); munit_assert_string_equal(f->book.pages.bufs[0], "Fantine"); munit_assert_string_equal(f->book.pages.bufs[1], "Cosette"); free(f->book.pages.bufs); free(buf); return MUNIT_OK; } dqlite-1.16.7/test/unit/lib/test_transport.c000066400000000000000000000121521465252713400210450ustar00rootroot00000000000000#include <unistd.h> #include "../../../src/lib/transport.h" #include "../../lib/endpoint.h" #include "../../lib/runner.h" #include "../../lib/uv.h" TEST_MODULE(lib_transport); /****************************************************************************** * * Fixture * ******************************************************************************/ struct fixture { struct test_endpoint endpoint; struct uv_loop_s loop; struct transport transport; int client; struct { bool invoked; int status; } read; struct { bool invoked; int status; } write; }; static void read_cb(struct transport *transport, int status) { struct fixture *f = transport->data; f->read.invoked = true; f->read.status = status; } static void write_cb(struct transport *transport, int status) { struct fixture *f = transport->data; f->write.invoked = true; f->write.status = status; } static void *setup(const MunitParameter params[], void *user_data) { struct fixture *f = munit_malloc(sizeof *f); struct uv_stream_s *stream; int rv; int server; (void)user_data; test_endpoint_setup(&f->endpoint, params); rv = listen(f->endpoint.fd, 16); munit_assert_int(rv, ==, 0); test_endpoint_pair(&f->endpoint, &server, &f->client); test_uv_setup(params, &f->loop); rv = transport__stream(&f->loop, server, &stream); munit_assert_int(rv, ==, 0); rv = transport__init(&f->transport, stream); munit_assert_int(rv, ==, 0); f->transport.data = f; f->read.invoked = false; f->read.status = -1; f->write.invoked = false; f->write.status = -1; return f; } static void tear_down(void *data) { struct fixture *f = data; int rv; rv = close(f->client); munit_assert_int(rv, ==, 0);
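/* Close the transport first so its stream handle is released before the loop is torn down. */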
transport__close(&f->transport, NULL); test_uv_stop(&f->loop); test_uv_tear_down(&f->loop); test_endpoint_tear_down(&f->endpoint); free(data); } /****************************************************************************** * * Helper macros. * ******************************************************************************/ /* Allocate a libuv buffer with the given amount of bytes. */ #define BUF_ALLOC(N) {munit_malloc(N), N}; /* Start reading into the current buffer */ #define READ(BUF) \ { \ int rv2; \ rv2 = transport__read(&f->transport, BUF, read_cb); \ munit_assert_int(rv2, ==, 0); \ } /* Start writing the current buffer into the stream */ #define WRITE(BUF) \ { \ int rv2; \ rv2 = transport__write(&f->transport, BUF, write_cb); \ munit_assert_int(rv2, ==, 0); \ } /* Write N bytes into the client buffer. Each byte will contain a progressive * number starting from 1. */ #define CLIENT_WRITE(N) \ { \ uint8_t *buf_ = munit_malloc(N); \ unsigned i_; \ int rv_; \ for (i_ = 0; i_ < N; i_++) { \ buf_[i_] = i_ + 1; \ } \ rv_ = write(f->client, buf_, N); \ munit_assert_int(rv_, ==, N); \ free(buf_); \ } /****************************************************************************** * * Assertions. * ******************************************************************************/ /* Assert that the read callback was invoked with the given status. */ #define ASSERT_READ(STATUS) \ munit_assert_true(f->read.invoked); \ munit_assert_int(f->read.status, ==, STATUS); \ f->read.invoked = false; \ f->read.status = -1 /* Assert that the write callback was invoked with the given status. */ #define ASSERT_WRITE(STATUS) \ munit_assert_true(f->write.invoked); \ munit_assert_int(f->write.status, ==, STATUS); \ f->write.invoked = false; \ f->write.status = -1 /****************************************************************************** * * transport__read * ******************************************************************************/ TEST_SUITE(read); TEST_SETUP(read, setup); TEST_TEAR_DOWN(read, tear_down); TEST_CASE(read, success, NULL) { struct fixture *f = data; uv_buf_t buf = BUF_ALLOC(2); (void)params; CLIENT_WRITE(2); READ(&buf); test_uv_run(&f->loop, 1); ASSERT_READ(0); munit_assert_int(((uint8_t *)buf.base)[0], ==, 1); munit_assert_int(((uint8_t *)buf.base)[1], ==, 2); free(buf.base); return MUNIT_OK; } /****************************************************************************** * * transport__write * ******************************************************************************/ TEST_SUITE(write); TEST_SETUP(write, setup); TEST_TEAR_DOWN(write, tear_down); TEST_CASE(write, success, NULL) { struct fixture *f = data; uv_buf_t buf = BUF_ALLOC(2); (void)params; WRITE(&buf); test_uv_run(&f->loop, 1); ASSERT_WRITE(0); free(buf.base); return MUNIT_OK; } dqlite-1.16.7/test/unit/main.c000066400000000000000000000000541465252713400161260ustar00rootroot00000000000000#include "../lib/runner.h" RUNNER("unit"); dqlite-1.16.7/test/unit/test_command.c000066400000000000000000000020751465252713400176640ustar00rootroot00000000000000#include #include "../../src/command.h" #include "../lib/runner.h" TEST_MODULE(command); /****************************************************************************** * * Open. 
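 * (Encoding "test.db" below yields 16 bytes, which suggests an 8-byte command header followed by the NUL-padded filename; the exact header layout is an implementation detail of command__encode.)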
* ******************************************************************************/ TEST_SUITE(open); TEST_CASE(open, encode, NULL) { struct command_open c; struct raft_buffer buf; int rc; (void)data; (void)params; c.filename = "test.db"; rc = command__encode(COMMAND_OPEN, &c, &buf); munit_assert_int(rc, ==, 0); munit_assert_int(buf.len, ==, 16); raft_free(buf.base); return MUNIT_OK; } TEST_CASE(open, decode, NULL) { struct command_open c1; void *c2; int type; struct raft_buffer buf; int rc; (void)data; (void)params; c1.filename = "db"; rc = command__encode(COMMAND_OPEN, &c1, &buf); munit_assert_int(rc, ==, 0); rc = command__decode(&buf, &type, &c2); munit_assert_int(rc, ==, 0); munit_assert_int(type, ==, COMMAND_OPEN); munit_assert_string_equal(((struct command_open *)c2)->filename, "db"); raft_free(c2); raft_free(buf.base); return MUNIT_OK; } dqlite-1.16.7/test/unit/test_concurrency.c000066400000000000000000000254171465252713400206050ustar00rootroot00000000000000#include "../lib/cluster.h" #include "../lib/runner.h" #include "../../src/gateway.h" #include "../../src/protocol.h" #include "../../src/request.h" #include "../../src/response.h" TEST_MODULE(concurrency); /****************************************************************************** * * Fixture. * ******************************************************************************/ #define N_GATEWAYS 2 /* Context for a gateway handle request */ struct context { bool invoked; int status; uint8_t type; uint8_t schema; }; /* Standalone leader database connection */ struct connection { struct gateway gateway; struct buffer request; /* Request payload */ struct buffer response; /* Response payload */ struct handle handle; /* Async handle request */ struct context context; }; #define FIXTURE \ FIXTURE_CLUSTER; \ struct connection connections[N_GATEWAYS] #define SETUP \ unsigned i; \ int rc; \ pool_ut_fallback()->flags |= POOL_FOR_UT_NOT_ASYNC; \ pool_ut_fallback()->flags |= POOL_FOR_UT; \ SETUP_CLUSTER(V2); \ CLUSTER_ELECT(0); \ for (i = 0; i < N_GATEWAYS; i++) { \ struct connection *c = &f->connections[i]; \ struct request_open open; \ struct response_db db; \ struct id_state seed = { { 1 } }; \ gateway__init(&c->gateway, CLUSTER_CONFIG(0), \ CLUSTER_REGISTRY(0), CLUSTER_RAFT(0), seed); \ c->handle.data = &c->context; \ rc = buffer__init(&c->request); \ munit_assert_int(rc, ==, 0); \ rc = buffer__init(&c->response); \ munit_assert_int(rc, ==, 0); \ open.filename = "test"; \ open.vfs = ""; \ ENCODE(c, &open, open); \ HANDLE(c, OPEN); \ ASSERT_CALLBACK(c, 0, DB); \ DECODE(c, &db, db); \ munit_assert_int(db.id, ==, 0); \ } #define TEAR_DOWN \ unsigned i; \ for (i = 0; i < N_GATEWAYS; i++) { \ struct connection *c = &f->connections[i]; \ buffer__close(&c->request); \ buffer__close(&c->response); \ gateway__close(&c->gateway); \ } \ TEAR_DOWN_CLUSTER; static void fixture_handle_cb(struct handle *req, int status, uint8_t type, uint8_t schema) { struct context *c = req->data; c->invoked = true; c->status = status; c->type = type; c->schema = schema; } /****************************************************************************** * * Helper macros. * ******************************************************************************/ /* Reset the request buffer of the given connection and encode a request of the * given lower case name. 
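 * For example, ENCODE(c, &open, open) serializes a struct request_open into c->request.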
*/ #define ENCODE(C, REQUEST, LOWER) \ { \ size_t n2 = request_##LOWER##__sizeof(REQUEST); \ char *cursor; \ buffer__reset(&C->request); \ cursor = buffer__advance(&C->request, n2); \ munit_assert_ptr_not_null(cursor); \ request_##LOWER##__encode(REQUEST, &cursor); \ } /* Decode a response of the given lower/upper case name using the response * buffer of the given connection. */ #define DECODE(C, RESPONSE, LOWER) \ { \ struct cursor cursor; \ int rc2; \ cursor.p = buffer__cursor(&C->response, 0); \ cursor.cap = buffer__offset(&C->response); \ rc2 = response_##LOWER##__decode(&cursor, RESPONSE); \ munit_assert_int(rc2, ==, 0); \ } /* Submit a request of the given type to the given connection and check that no * error occurs. */ #define HANDLE(C, TYPE) \ { \ int rc2; \ C->handle.cursor.p = buffer__cursor(&C->request, 0); \ C->handle.cursor.cap = buffer__offset(&C->request); \ buffer__reset(&C->response); \ rc2 = gateway__handle(&C->gateway, &C->handle, \ DQLITE_REQUEST_##TYPE, 0, &C->response, \ fixture_handle_cb); \ munit_assert_int(rc2, ==, 0); \ } /* Prepare a statement on the given connection. The ID will be saved in * the STMT_ID pointer. */ #define PREPARE(C, SQL, STMT_ID) \ { \ struct request_prepare prepare; \ struct response_stmt stmt; \ prepare.db_id = 0; \ prepare.sql = SQL; \ ENCODE(C, &prepare, prepare); \ HANDLE(C, PREPARE); \ WAIT(C); \ ASSERT_CALLBACK(C, 0, STMT); \ DECODE(C, &stmt, stmt); \ *(STMT_ID) = stmt.id; \ } /* Submit a request to exec a statement. */ #define EXEC(C, STMT_ID) \ { \ struct request_exec exec; \ exec.db_id = 0; \ exec.stmt_id = STMT_ID; \ ENCODE(C, &exec, exec); \ HANDLE(C, EXEC); \ } /* Submit a query request. */ #define QUERY(C, STMT_ID) \ { \ struct request_query query; \ query.db_id = 0; \ query.stmt_id = STMT_ID; \ ENCODE(C, &query, query); \ HANDLE(C, QUERY); \ } /* Wait for the gateway of the given connection to finish handling a request. */ #define WAIT(C) \ { \ unsigned _i; \ for (_i = 0; _i < 50; _i++) { \ CLUSTER_STEP; \ if (C->context.invoked) { \ break; \ } \ } \ munit_assert_true(C->context.invoked); \ } /****************************************************************************** * * Assertions. * ******************************************************************************/ /* Assert that the handle callback of the given connection has been invoked with * the given status and response type. */ #define ASSERT_CALLBACK(C, STATUS, UPPER) \ munit_assert_true(C->context.invoked); \ munit_assert_int(C->context.status, ==, STATUS); \ munit_assert_int(C->context.type, ==, DQLITE_RESPONSE_##UPPER); \ C->context.invoked = false /* Assert that the failure response generated by the gateway of the given * connection matches the given details.
*/ #define ASSERT_FAILURE(C, CODE, MESSAGE) \ { \ struct response_failure failure; \ DECODE(C, &failure, failure); \ munit_assert_int(failure.code, ==, CODE); \ munit_assert_string_equal(failure.message, MESSAGE); \ } /****************************************************************************** * * Concurrent exec requests * ******************************************************************************/ struct exec_fixture { FIXTURE; struct connection *c1; struct connection *c2; unsigned stmt_id1; unsigned stmt_id2; }; TEST_SUITE(exec); TEST_SETUP(exec) { struct exec_fixture *f = munit_malloc(sizeof *f); SETUP; f->c1 = &f->connections[0]; f->c2 = &f->connections[1]; return f; } TEST_TEAR_DOWN(exec) { struct exec_fixture *f = data; TEAR_DOWN; free(f); } /* If another leader connection has submitted an Open request and is waiting for * it to complete, SQLITE_BUSY is returned. */ TEST_CASE(exec, open, NULL) { struct exec_fixture *f = data; (void)params; PREPARE(f->c1, "CREATE TABLE test1 (n INT)", &f->stmt_id1); PREPARE(f->c2, "CREATE TABLE test2 (n INT)", &f->stmt_id2); EXEC(f->c1, f->stmt_id1); EXEC(f->c2, f->stmt_id2); WAIT(f->c2); ASSERT_CALLBACK(f->c2, 0, FAILURE); ASSERT_FAILURE(f->c2, SQLITE_BUSY, "database is locked"); WAIT(f->c1); ASSERT_CALLBACK(f->c1, 0, RESULT); return MUNIT_OK; } /* If an exec request is already in progress on another leader connection, * SQLITE_BUSY is returned. */ TEST_CASE(exec, tx, NULL) { struct exec_fixture *f = data; (void)params; /* Create a test table using connection 0 */ PREPARE(f->c1, "CREATE TABLE test (n INT)", &f->stmt_id1); EXEC(f->c1, f->stmt_id1); WAIT(f->c1); ASSERT_CALLBACK(f->c1, 0, RESULT); PREPARE(f->c1, "INSERT INTO test(n) VALUES(1)", &f->stmt_id1); PREPARE(f->c2, "INSERT INTO test(n) VALUES(1)", &f->stmt_id2); EXEC(f->c1, f->stmt_id1); EXEC(f->c2, f->stmt_id2); WAIT(f->c2); ASSERT_CALLBACK(f->c2, 0, FAILURE); ASSERT_FAILURE(f->c2, SQLITE_BUSY, "database is locked"); WAIT(f->c1); ASSERT_CALLBACK(f->c1, 0, RESULT); return MUNIT_OK; } /****************************************************************************** * * Concurrent query requests * ******************************************************************************/ struct query_fixture { FIXTURE; struct connection *c1; struct connection *c2; unsigned stmt_id1; unsigned stmt_id2; }; TEST_SUITE(query); TEST_SETUP(query) { struct query_fixture *f = munit_malloc(sizeof *f); SETUP; f->c1 = &f->connections[0]; f->c2 = &f->connections[1]; PREPARE(f->c1, "CREATE TABLE test (n INT)", &f->stmt_id1); EXEC(f->c1, f->stmt_id1); WAIT(f->c1); return f; } TEST_TEAR_DOWN(query) { struct query_fixture *f = data; TEAR_DOWN; free(f); } /* Handle a query request while there is a transaction in progress.
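 * Both requests are expected to succeed: the INSERT on the first connection and the SELECT on the second complete independently.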
*/ TEST_CASE(query, tx, NULL) { struct query_fixture *f = data; (void)params; PREPARE(f->c1, "INSERT INTO test VALUES(1)", &f->stmt_id1); PREPARE(f->c2, "SELECT n FROM test", &f->stmt_id2); EXEC(f->c1, f->stmt_id1); QUERY(f->c2, f->stmt_id2); WAIT(f->c1); WAIT(f->c2); ASSERT_CALLBACK(f->c1, 0, RESULT); ASSERT_CALLBACK(f->c2, 0, ROWS); return MUNIT_OK; } dqlite-1.16.7/test/unit/test_conn.c000066400000000000000000000300761465252713400172050ustar00rootroot00000000000000#include "../lib/client.h" #include "../lib/config.h" #include "../lib/heap.h" #include "../lib/logger.h" #include "../lib/raft.h" #include "../lib/registry.h" #include "../lib/runner.h" #include "../lib/sqlite.h" #include "../lib/vfs.h" #include "../../src/client/protocol.h" #include "../../src/conn.h" #include "../../src/gateway.h" #include "../../src/lib/threadpool.h" #include "../../src/lib/transport.h" #include "../../src/raft.h" #include "../../src/transport.h" TEST_MODULE(conn); /****************************************************************************** * * Fixture * ******************************************************************************/ struct conn_test { struct conn conn; bool closed; }; static void connCloseCb(struct conn *conn) { struct conn_test *conn_test = CONTAINER_OF(conn, struct conn_test, conn); conn_test->closed = true; } #define FIXTURE \ FIXTURE_LOGGER; \ FIXTURE_VFS; \ FIXTURE_CONFIG; \ FIXTURE_REGISTRY; \ FIXTURE_RAFT; \ FIXTURE_CLIENT; \ struct conn_test conn_test; #define SETUP \ struct uv_stream_s *stream; \ struct id_state seed = { { 1 } }; \ int rv; \ SETUP_HEAP; \ SETUP_SQLITE; \ SETUP_LOGGER; \ SETUP_VFS; \ SETUP_CONFIG; \ SETUP_REGISTRY; \ SETUP_RAFT; \ rv = pool_init(pool_ut_fallback(), &f->loop, 4, POOL_QOS_PRIO_FAIR); \ pool_ut_fallback()->flags |= POOL_FOR_UT; \ munit_assert_int(rv, ==, 0); \ SETUP_CLIENT; \ RAFT_BOOTSTRAP; \ RAFT_START; \ rv = transport__stream(&f->loop, f->server, &stream); \ munit_assert_int(rv, ==, 0); \ f->conn_test.closed = false; \ rv = conn__start(&f->conn_test.conn, &f->config, &f->loop, \ &f->registry, &f->raft, stream, &f->raft_transport, \ seed, connCloseCb); \ munit_assert_int(rv, ==, 0) #define TEAR_DOWN \ pool_close(pool_ut_fallback()); \ pool_fini(pool_ut_fallback()); \ conn__stop(&f->conn_test.conn); \ while (!f->conn_test.closed) { \ test_uv_run(&f->loop, 1); \ }; \ TEAR_DOWN_RAFT; \ TEAR_DOWN_CLIENT; \ TEAR_DOWN_REGISTRY; \ TEAR_DOWN_CONFIG; \ TEAR_DOWN_VFS; \ TEAR_DOWN_LOGGER; \ TEAR_DOWN_SQLITE; \ TEAR_DOWN_HEAP /****************************************************************************** * * Helper macros. * ******************************************************************************/ /* Send the initial client handshake. */ #define HANDSHAKE_CONN \ { \ int rv2; \ rv2 = clientSendHandshake(&f->client, NULL); \ munit_assert_int(rv2, ==, 0); \ test_uv_run(&f->loop, 1); \ } /* Open a test database. */ #define OPEN_CONN \ { \ int rv2; \ rv2 = clientSendOpen(&f->client, "test", NULL); \ munit_assert_int(rv2, ==, 0); \ test_uv_run(&f->loop, 2); \ rv2 = clientRecvDb(&f->client, NULL); \ munit_assert_int(rv2, ==, 0); \ } /* Prepare a statement. */ #define PREPARE_CONN(SQL, STMT_ID) \ { \ int rv2; \ rv2 = clientSendPrepare(&f->client, SQL, NULL); \ munit_assert_int(rv2, ==, 0); \ test_uv_run(&f->loop, 1); \ rv2 = clientRecvStmt(&f->client, STMT_ID, NULL, NULL, NULL); \ munit_assert_int(rv2, ==, 0); \ } /* Execute a statement.
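 * The LOOP argument gives the number of event-loop iterations to run before the result response is expected to arrive.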
*/ #define EXEC_CONN(STMT_ID, LAST_INSERT_ID, ROWS_AFFECTED, LOOP) \ { \ int rv2; \ rv2 = clientSendExec(&f->client, STMT_ID, NULL, 0, NULL); \ munit_assert_int(rv2, ==, 0); \ test_uv_run(&f->loop, LOOP); \ rv2 = clientRecvResult(&f->client, LAST_INSERT_ID, \ ROWS_AFFECTED, NULL); \ munit_assert_int(rv2, ==, 0); \ } /* Execute a non-prepared statement. */ #define EXEC_SQL_CONN(SQL, LAST_INSERT_ID, ROWS_AFFECTED, LOOP) \ { \ int rv2; \ rv2 = clientSendExecSQL(&f->client, SQL, NULL, 0, NULL); \ munit_assert_int(rv2, ==, 0); \ test_uv_run(&f->loop, LOOP); \ rv2 = clientRecvResult(&f->client, LAST_INSERT_ID, \ ROWS_AFFECTED, NULL); \ munit_assert_int(rv2, ==, 0); \ } /* Perform a query. */ #define QUERY_CONN(STMT_ID, ROWS) \ { \ int rv2; \ rv2 = clientSendQuery(&f->client, STMT_ID, NULL, 0, NULL); \ munit_assert_int(rv2, ==, 0); \ test_uv_run(&f->loop, 2); \ rv2 = clientRecvRows(&f->client, ROWS, NULL, NULL); \ munit_assert_int(rv2, ==, 0); \ } /* Perform a non-prepared query. */ #define QUERY_SQL_CONN(SQL, ROWS) \ { \ int rv2; \ rv2 = clientSendQuerySql(&f->client, SQL, NULL, 0, NULL); \ munit_assert_int(rv2, ==, 0); \ test_uv_run(&f->loop, 2); \ rv2 = clientRecvRows(&f->client, ROWS, NULL, NULL); \ munit_assert_int(rv2, ==, 0); \ } /****************************************************************************** * * Handle the handshake * ******************************************************************************/ TEST_SUITE(handshake); struct handshake_fixture { FIXTURE; }; TEST_SETUP(handshake) { struct handshake_fixture *f = munit_malloc(sizeof *f); SETUP; return f; } TEST_TEAR_DOWN(handshake) { struct handshake_fixture *f = data; TEAR_DOWN; free(f); } TEST_CASE(handshake, success, NULL) { struct handshake_fixture *f = data; (void)params; HANDSHAKE_CONN; return MUNIT_OK; } /****************************************************************************** * * Handle an open request * ******************************************************************************/ TEST_SUITE(open); struct open_fixture { FIXTURE; }; TEST_SETUP(open) { struct open_fixture *f = munit_malloc(sizeof *f); SETUP; HANDSHAKE_CONN; return f; } TEST_TEAR_DOWN(open) { struct open_fixture *f = data; TEAR_DOWN; free(f); } TEST_CASE(open, success, NULL) { struct open_fixture *f = data; (void)params; OPEN_CONN; return MUNIT_OK; } /****************************************************************************** * * Handle a prepare request * ******************************************************************************/ TEST_SUITE(prepare); struct prepare_fixture { FIXTURE; }; TEST_SETUP(prepare) { struct prepare_fixture *f = munit_malloc(sizeof *f); SETUP; HANDSHAKE_CONN; OPEN_CONN; return f; } TEST_TEAR_DOWN(prepare) { struct prepare_fixture *f = data; TEAR_DOWN; free(f); } TEST_CASE(prepare, success, NULL) { struct prepare_fixture *f = data; unsigned stmt_id; (void)params; PREPARE_CONN("CREATE TABLE test (n INT)", &stmt_id); munit_assert_int(stmt_id, ==, 0); return MUNIT_OK; } /****************************************************************************** * * Handle an exec * ******************************************************************************/ TEST_SUITE(exec); struct exec_fixture { FIXTURE; unsigned stmt_id; }; TEST_SETUP(exec) { struct exec_fixture *f = munit_malloc(sizeof *f); SETUP; HANDSHAKE_CONN; OPEN_CONN; return f; } TEST_TEAR_DOWN(exec) { struct exec_fixture *f = data; TEAR_DOWN; free(f); } TEST_CASE(exec, success, NULL) { struct exec_fixture *f = data; uint64_t last_insert_id; uint64_t rows_affected;
(void)params; PREPARE_CONN("CREATE TABLE test (n INT)", &f->stmt_id); EXEC_CONN(f->stmt_id, &last_insert_id, &rows_affected, 8); munit_assert_int(last_insert_id, ==, 0); munit_assert_int(rows_affected, ==, 0); return MUNIT_OK; } TEST_CASE(exec, result, NULL) { struct exec_fixture *f = data; uint64_t last_insert_id; uint64_t rows_affected; (void)params; PREPARE_CONN("BEGIN", &f->stmt_id); EXEC_CONN(f->stmt_id, &last_insert_id, &rows_affected, 5); PREPARE_CONN("CREATE TABLE test (n INT)", &f->stmt_id); EXEC_CONN(f->stmt_id, &last_insert_id, &rows_affected, 8); PREPARE_CONN("INSERT INTO test (n) VALUES(123)", &f->stmt_id); EXEC_CONN(f->stmt_id, &last_insert_id, &rows_affected, 5); PREPARE_CONN("COMMIT", &f->stmt_id); EXEC_CONN(f->stmt_id, &last_insert_id, &rows_affected, 8); munit_assert_int(last_insert_id, ==, 1); munit_assert_int(rows_affected, ==, 1); return MUNIT_OK; } TEST_CASE(exec, close_while_in_flight, NULL) { struct exec_fixture *f = data; uint64_t last_insert_id; uint64_t rows_affected; int rv; (void)params; EXEC_SQL_CONN("CREATE TABLE test (n)", &last_insert_id, &rows_affected, 9); rv = clientSendExecSQL(&f->client, "INSERT INTO test(n) VALUES(1)", NULL, 0, NULL); munit_assert_int(rv, ==, 0); test_uv_run(&f->loop, 1); pool_ut_fallback()->flags |= POOL_FOR_UT_NON_CLEAN_FINI; return MUNIT_OK; } /****************************************************************************** * * Handle a query * ******************************************************************************/ TEST_SUITE(query); struct query_fixture { FIXTURE; uint32_t stmt_id; uint32_t insert_stmt_id; uint64_t last_insert_id; uint64_t rows_affected; struct rows rows; }; TEST_SETUP(query) { struct query_fixture *f = munit_malloc(sizeof *f); uint32_t stmt_id; SETUP; HANDSHAKE_CONN; OPEN_CONN; PREPARE_CONN("CREATE TABLE test (n INT)", &stmt_id); EXEC_CONN(stmt_id, &f->last_insert_id, &f->rows_affected, 7); PREPARE_CONN("INSERT INTO test(n) VALUES (123)", &f->insert_stmt_id); EXEC_CONN(f->insert_stmt_id, &f->last_insert_id, &f->rows_affected, 4); return f; } TEST_TEAR_DOWN(query) { struct query_fixture *f = data; clientCloseRows(&f->rows); TEAR_DOWN; free(f); } /* Perform a query yielding one row. */ TEST_CASE(query, one, NULL) { struct query_fixture *f = data; struct row *row; (void)params; PREPARE_CONN("SELECT n FROM test", &f->stmt_id); QUERY_CONN(f->stmt_id, &f->rows); munit_assert_int(f->rows.column_count, ==, 1); munit_assert_string_equal(f->rows.column_names[0], "n"); row = f->rows.next; munit_assert_ptr_not_null(row); munit_assert_ptr_null(row->next); munit_assert_int(row->values[0].type, ==, SQLITE_INTEGER); munit_assert_int(row->values[0].integer, ==, 123); return MUNIT_OK; } dqlite-1.16.7/test/unit/test_gateway.c000066400000000000000000001750101465252713400177070ustar00rootroot00000000000000#include "../../include/dqlite.h" #include "../../src/gateway.h" #include "../../src/lib/threadpool.h" #include "../../src/request.h" #include "../../src/response.h" #include "../../src/tuple.h" #include "../lib/cluster.h" #include "../lib/raft_heap.h" #include "../lib/runner.h" TEST_MODULE(gateway); /****************************************************************************** * * Fixture. * ******************************************************************************/ /* Context for a gateway handle request. */ struct context { bool invoked; int status; uint8_t type; uint8_t schema; }; /* Drive a single gateway. Each gateway is associated with a different raft * node. 
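 * Tests choose which gateway to drive at any given time via the SELECT macro below.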
*/ struct connection { struct gateway gateway; struct buffer buf1; /* Request payload */ struct buffer buf2; /* Response payload */ struct cursor cursor; /* Response read cursor */ struct handle handle; /* Async handle request */ struct context context; }; #define FIXTURE \ FIXTURE_CLUSTER; \ struct connection connections[N_SERVERS]; \ struct gateway *gateway; \ struct buffer *buf1; \ struct cursor *cursor; \ struct buffer *buf2; \ struct handle *handle; \ struct context *context; #define SETUP \ unsigned i; \ int rc; \ SETUP_CLUSTER(V2); \ for (i = 0; i < N_SERVERS; i++) { \ struct connection *c = &f->connections[i]; \ struct config *config; \ struct id_state seed = { { 1 } }; \ config = CLUSTER_CONFIG(i); \ config->page_size = 512; \ gateway__init(&c->gateway, config, CLUSTER_REGISTRY(i), \ CLUSTER_RAFT(i), seed); \ c->handle.data = &c->context; \ rc = buffer__init(&c->buf1); \ munit_assert_int(rc, ==, 0); \ rc = buffer__init(&c->buf2); \ munit_assert_int(rc, ==, 0); \ } \ test_raft_heap_setup(params, user_data); \ pool_ut_fallback()->flags |= POOL_FOR_UT_NOT_ASYNC; \ pool_ut_fallback()->flags |= POOL_FOR_UT; \ SELECT(0) #define TEAR_DOWN \ unsigned i; \ test_raft_heap_tear_down(data); \ for (i = 0; i < N_SERVERS; i++) { \ struct connection *c = &f->connections[i]; \ gateway__close(&c->gateway); \ buffer__close(&c->buf1); \ buffer__close(&c->buf2); \ } \ TEAR_DOWN_CLUSTER; static void handleCb(struct handle *req, int status, uint8_t type, uint8_t schema) { struct context *c = req->data; c->invoked = true; c->status = status; c->type = type; c->schema = schema; } /****************************************************************************** * * Helper macros. * ******************************************************************************/ /* Select which gateway to use for performing requests. */ #define SELECT(I) \ f->gateway = &f->connections[I].gateway; \ f->buf1 = &f->connections[I].buf1; \ f->buf2 = &f->connections[I].buf2; \ f->cursor = &f->connections[I].cursor; \ f->context = &f->connections[I].context; \ f->handle = &f->connections[I].handle /* Allocate the payload buffer, encode a request of the given lower case name * and initialize the fixture cursor. */ #define ENCODE(REQUEST, LOWER) \ { \ size_t n2 = request_##LOWER##__sizeof(REQUEST); \ char *cursor; \ buffer__reset(f->buf1); \ cursor = buffer__advance(f->buf1, n2); \ munit_assert_ptr_not_null(cursor); \ request_##LOWER##__encode(REQUEST, &cursor); \ } /* Encode N parameters with the given values in the given format */ #define ENCODE_PARAMS(N, VALUES, FORMAT) \ { \ struct tuple_encoder encoder; \ unsigned long i2; \ int rc2; \ rc2 = tuple_encoder__init(&encoder, N, FORMAT, f->buf1); \ munit_assert_int(rc2, ==, 0); \ for (i2 = 0; i2 < N; i2++) { \ rc2 = tuple_encoder__next(&encoder, &((VALUES)[i2])); \ munit_assert_int(rc2, ==, 0); \ } \ } /* Decode a response of the given lower/upper case name using the buffer that * was written by the gateway. */ #define DECODE(RESPONSE, LOWER) \ { \ int rc2; \ rc2 = response_##LOWER##__decode(f->cursor, RESPONSE); \ munit_assert_int(rc2, ==, 0); \ } /* Decode a row with N columns filling the given values. 
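 * A typical (hypothetical) use: struct value row[2]; DECODE_ROW(2, row);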
*/ #define DECODE_ROW(N, VALUES) \ { \ struct tuple_decoder decoder; \ int i2; \ int rc2; \ rc2 = tuple_decoder__init(&decoder, N, TUPLE__ROW, f->cursor); \ munit_assert_int(rc2, ==, 0); \ for (i2 = 0; i2 < N; i2++) { \ rc2 = tuple_decoder__next(&decoder, &((VALUES)[i2])); \ munit_assert_int(rc2, ==, 0); \ } \ } #define HANDLE_SCHEMA_STATUS(TYPE, SCHEMA, RC) \ { \ int rc2; \ f->handle->cursor.p = buffer__cursor(f->buf1, 0); \ f->handle->cursor.cap = buffer__offset(f->buf1); \ buffer__reset(f->buf2); \ f->context->invoked = false; \ f->context->status = -1; \ f->context->type = -1; \ rc2 = gateway__handle(f->gateway, f->handle, TYPE, SCHEMA, \ f->buf2, handleCb); \ munit_assert_int(rc2, ==, RC); \ } /* Handle a request of the given type and check for the given return code. */ #define HANDLE_STATUS(TYPE, RC) HANDLE_SCHEMA_STATUS(TYPE, 0, RC) /* Handle a request of the given type and check that no error occurs. */ #define HANDLE(TYPE) HANDLE_STATUS(DQLITE_REQUEST_##TYPE, 0) /* Open a leader connection against the "test" database */ #define OPEN \ { \ struct request_open open; \ open.filename = "test"; \ open.vfs = ""; \ ENCODE(&open, open); \ HANDLE(OPEN); \ ASSERT_CALLBACK(0, DB); \ } /* Prepare a statement. The ID will be saved in stmt_id. */ #define PREPARE(SQL) \ { \ struct request_prepare prepare; \ struct response_stmt stmt; \ prepare.db_id = 0; \ prepare.sql = SQL; \ ENCODE(&prepare, prepare); \ HANDLE(PREPARE); \ WAIT; \ ASSERT_CALLBACK(0, STMT); \ DECODE(&stmt, stmt); \ stmt_id = stmt.id; \ } /* Finalize the statement with the given ID. */ #define FINALIZE(STMT_ID) \ { \ struct request_finalize finalize; \ finalize.db_id = 0; \ finalize.stmt_id = STMT_ID; \ ENCODE(&finalize, finalize); \ HANDLE(FINALIZE); \ ASSERT_CALLBACK(0, EMPTY); \ } /* Submit a request to execute the given statement. */ #define EXEC_SUBMIT(STMT_ID) \ { \ struct request_exec exec; \ exec.db_id = 0; \ exec.stmt_id = STMT_ID; \ ENCODE(&exec, exec); \ HANDLE(EXEC); \ } /* Submit a request to execute the given SQL text. */ #define EXEC_SQL_SUBMIT(SQL) \ { \ struct request_exec_sql exec_sql; \ exec_sql.db_id = 0; \ exec_sql.sql = SQL; \ ENCODE(&exec_sql, exec_sql); \ HANDLE(EXEC_SQL); \ } /* Submit a request to query with the given SQL text. */ #define QUERY_SQL_SUBMIT(SQL) \ { \ struct request_query_sql query_sql; \ query_sql.db_id = 0; \ query_sql.sql = SQL; \ ENCODE(&query_sql, query_sql); \ HANDLE(QUERY_SQL); \ } /* Wait for the last request to complete */ #define WAIT \ { \ unsigned _i; \ for (_i = 0; _i < 60; _i++) { \ CLUSTER_STEP; \ if (f->context->invoked) { \ break; \ } \ } \ munit_assert_true(f->context->invoked); \ } /* Prepare and exec a statement. */ #define EXEC(SQL) \ { \ uint64_t _stmt_id; \ struct request_prepare prepare; \ struct response_stmt stmt; \ prepare.db_id = 0; \ prepare.sql = SQL; \ ENCODE(&prepare, prepare); \ HANDLE(PREPARE); \ WAIT; \ ASSERT_CALLBACK(0, STMT); \ DECODE(&stmt, stmt); \ _stmt_id = stmt.id; \ EXEC_SUBMIT(_stmt_id); \ WAIT; \ ASSERT_CALLBACK(0, RESULT); \ FINALIZE(_stmt_id); \ } /* Execute a pragma statement that lowers SQLite's page cache size, in order to * force it to write uncommitted dirty pages to the WAL and hence trigger calls * to the xFrames hook with non-commit batches. */ #define LOWER_CACHE_SIZE EXEC("PRAGMA cache_size = 1") /****************************************************************************** * * Assertions.
* ******************************************************************************/ /* Assert that the handle callback has been invoked with the given status and * response type. Also, initialize the fixture's cursor to read the response * buffer. */ #define ASSERT_CALLBACK(STATUS, UPPER) \ munit_assert_true(f->context->invoked); \ munit_assert_int(f->context->status, ==, STATUS); \ munit_assert_int(f->context->type, ==, DQLITE_RESPONSE_##UPPER); \ f->cursor->p = buffer__cursor(f->buf2, 0); \ f->cursor->cap = buffer__offset(f->buf2); \ buffer__reset(f->buf2); \ f->context->invoked = false; /* Assert that the failure response generated by the gateway matches the given * details. */ #define ASSERT_FAILURE(CODE, MESSAGE) \ { \ struct response_failure failure; \ int rc2; \ rc2 = response_failure__decode(f->cursor, &failure); \ munit_assert_int(rc2, ==, 0); \ munit_assert_int(failure.code, ==, CODE); \ munit_assert_string_equal(failure.message, MESSAGE); \ } /****************************************************************************** * * leader * ******************************************************************************/ struct leader_fixture { FIXTURE; struct request_leader request; struct response_server response; }; TEST_SUITE(leader); TEST_SETUP(leader) { struct leader_fixture *f = munit_malloc(sizeof *f); SETUP; return f; } TEST_TEAR_DOWN(leader) { struct leader_fixture *f = data; TEAR_DOWN; free(f); } /* If the leader is not available, an empty string is returned. */ TEST_CASE(leader, not_available, NULL) { struct leader_fixture *f = data; (void)params; ENCODE(&f->request, leader); HANDLE(LEADER); ASSERT_CALLBACK(0, SERVER); DECODE(&f->response, server); munit_assert_int(f->response.id, ==, 0); munit_assert_string_equal(f->response.address, ""); return MUNIT_OK; } /* The leader is the same node serving the request. */ TEST_CASE(leader, same_node, NULL) { struct leader_fixture *f = data; (void)params; CLUSTER_ELECT(0); ENCODE(&f->request, leader); HANDLE(LEADER); ASSERT_CALLBACK(0, SERVER); DECODE(&f->response, server); munit_assert_string_equal(f->response.address, "1"); return MUNIT_OK; } /* The leader is a different node than the one serving the request. */ TEST_CASE(leader, other_node, NULL) { struct leader_fixture *f = data; (void)params; CLUSTER_ELECT(1); ENCODE(&f->request, leader); HANDLE(LEADER); ASSERT_CALLBACK(0, SERVER); DECODE(&f->response, server); munit_assert_string_equal(f->response.address, "2"); return MUNIT_OK; } /****************************************************************************** * * open * ******************************************************************************/ struct open_fixture { FIXTURE; struct request_open request; struct response_db response; }; TEST_SUITE(open); TEST_SETUP(open) { struct open_fixture *f = munit_malloc(sizeof *f); SETUP; return f; } TEST_TEAR_DOWN(open) { struct open_fixture *f = data; TEAR_DOWN; free(f); } /* Successfully open a database connection. */ TEST_CASE(open, success, NULL) { struct open_fixture *f = data; (void)params; f->request.filename = "test"; f->request.vfs = ""; ENCODE(&f->request, open); HANDLE(OPEN); ASSERT_CALLBACK(0, DB); DECODE(&f->response, db); munit_assert_int(f->response.id, ==, 0); return MUNIT_OK; } TEST_GROUP(open, error); /* Attempting to open two databases on the same gateway results in an error. 
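 * The gateway supports a single database per connection, so the second OPEN * below is expected to fail with SQLITE_BUSY.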
*/ TEST_CASE(open, error, twice, NULL) { struct open_fixture *f = data; (void)params; f->request.filename = "test"; f->request.vfs = ""; ENCODE(&f->request, open); HANDLE(OPEN); ASSERT_CALLBACK(0, DB); ENCODE(&f->request, open); HANDLE(OPEN); ASSERT_CALLBACK(0, FAILURE); ASSERT_FAILURE(SQLITE_BUSY, "a database for this connection is already open"); return MUNIT_OK; } /****************************************************************************** * * prepare * ******************************************************************************/ struct prepare_fixture { FIXTURE; struct request_prepare request; struct response_stmt response; }; TEST_SUITE(prepare); TEST_SETUP(prepare) { struct prepare_fixture *f = munit_malloc(sizeof *f); SETUP; OPEN; return f; } TEST_TEAR_DOWN(prepare) { struct prepare_fixture *f = data; TEAR_DOWN; free(f); } /* Successfully prepare a statement. */ TEST_CASE(prepare, success, NULL) { struct prepare_fixture *f = data; (void)params; f->request.db_id = 0; f->request.sql = "CREATE TABLE test (n INT)"; CLUSTER_ELECT(0); ENCODE(&f->request, prepare); HANDLE(PREPARE); WAIT; ASSERT_CALLBACK(0, STMT); DECODE(&f->response, stmt); munit_assert_int(f->response.id, ==, 0); return MUNIT_OK; } /* Prepare an empty statement. */ TEST_CASE(prepare, empty1, NULL) { struct prepare_fixture *f = data; (void)params; f->request.db_id = 0; f->request.sql = ""; CLUSTER_ELECT(0); ENCODE(&f->request, prepare); HANDLE(PREPARE); WAIT; ASSERT_CALLBACK(0, FAILURE); ASSERT_FAILURE(0, "empty statement"); munit_assert_int(f->response.id, ==, 0); return MUNIT_OK; } /* Prepare an empty statement. */ TEST_CASE(prepare, empty2, NULL) { struct prepare_fixture *f = data; (void)params; f->request.db_id = 0; f->request.sql = " -- This is a comment"; CLUSTER_ELECT(0); ENCODE(&f->request, prepare); HANDLE(PREPARE); WAIT; ASSERT_CALLBACK(0, FAILURE); ASSERT_FAILURE(0, "empty statement"); munit_assert_int(f->response.id, ==, 0); return MUNIT_OK; } /* Prepare an invalid statement. */ TEST_CASE(prepare, invalid, NULL) { struct prepare_fixture *f = data; (void)params; f->request.db_id = 0; f->request.sql = "NOT SQL"; CLUSTER_ELECT(0); ENCODE(&f->request, prepare); HANDLE(PREPARE); WAIT; ASSERT_CALLBACK(0, FAILURE); ASSERT_FAILURE(SQLITE_ERROR, "near \"NOT\": syntax error"); munit_assert_int(f->response.id, ==, 0); return MUNIT_OK; } /* Prepare a statement and close the gateway early. */ TEST_CASE(prepare, closing, NULL) { struct prepare_fixture *f = data; (void)params; f->request.db_id = 0; f->request.sql = "CREATE TABLE test (n INT)"; ENCODE(&f->request, prepare); CLUSTER_ELECT(0); HANDLE(PREPARE); return MUNIT_OK; } /* Submit a prepare request that triggers a failed barrier operation. */ TEST_CASE(prepare, barrier_error, NULL) { struct prepare_fixture *f = data; uint64_t stmt_id; (void)params; /* Set up an uncommitted exec operation */ CLUSTER_ELECT(0); PREPARE("CREATE TABLE test (n INT)"); EXEC_SUBMIT(stmt_id); CLUSTER_DEPOSE; ASSERT_CALLBACK(0, FAILURE); /* Submit a prepare request, forcing a barrier, which fails */ CLUSTER_ELECT(0); f->request.db_id = 0; f->request.sql = "SELECT n FROM test"; ENCODE(&f->request, prepare); /* We rely on leader__barrier (called by handle_prepare) attempting * an allocation using raft_malloc. */ test_raft_heap_fault_config(0, 1); test_raft_heap_fault_enable(); HANDLE_STATUS(DQLITE_REQUEST_PREPARE, RAFT_NOMEM); return MUNIT_OK; } /* Submit a prepare request to a non-leader node. 
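 * The gateway detects that the selected node is a follower and fails the * request with SQLITE_IOERR_NOT_LEADER.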
*/ TEST_CASE(prepare, non_leader, NULL) { struct prepare_fixture *f = data; (void)params; CLUSTER_ELECT(0); SELECT(1); f->request.db_id = 0; f->request.sql = "CREATE TABLE test (n INT)"; ENCODE(&f->request, prepare); HANDLE(PREPARE); WAIT; ASSERT_CALLBACK(0, FAILURE); ASSERT_FAILURE(SQLITE_IOERR_NOT_LEADER, "not leader"); return MUNIT_OK; } /* Try to prepare a string containing more than one statement. */ TEST_CASE(prepare, nonempty_tail, NULL) { struct prepare_fixture *f = data; (void)params; f->request.db_id = 0; f->request.sql = "CREATE TABLE test (n INT); SELECT * FROM test"; CLUSTER_ELECT(0); ENCODE(&f->request, prepare); HANDLE(PREPARE); WAIT; ASSERT_CALLBACK(0, FAILURE); ASSERT_FAILURE(SQLITE_ERROR, "nonempty statement tail"); return MUNIT_OK; } /* Try to prepare a string containing a comment after the statement; this * should succeed. */ TEST_CASE(prepare, comment_in_tail, NULL) { struct prepare_fixture *f = data; (void)params; f->request.db_id = 0; f->request.sql = "CREATE TABLE test (n INT); /* comment */"; CLUSTER_ELECT(0); ENCODE(&f->request, prepare); HANDLE(PREPARE); WAIT; ASSERT_CALLBACK(0, STMT); return MUNIT_OK; } /* Prepare a string containing more than one statement using schema version 1; * this succeeds and reports the offset of the statement tail. */ TEST_CASE(prepare, nonempty_tail_v1, NULL) { struct prepare_fixture *f = data; struct response_stmt_with_offset response = { 0 }; struct request_exec exec = { 0 }; uint64_t offset; int rc; (void)params; f->request.db_id = 0; f->request.sql = "CREATE TABLE test (n INT); SELECT * FROM test"; CLUSTER_ELECT(0); ENCODE(&f->request, prepare); HANDLE_SCHEMA_STATUS(DQLITE_REQUEST_PREPARE, 1, 0); WAIT; ASSERT_CALLBACK(0, STMT_WITH_OFFSET); DECODE(&response, stmt_with_offset); munit_assert_int(response.id, ==, 0); munit_assert_uint64(response.offset, ==, 26); offset = response.offset; ENCODE(&exec, exec); f->handle->cursor.p = buffer__cursor(f->buf1, 0); f->handle->cursor.cap = buffer__offset(f->buf1); buffer__reset(f->buf2); f->context->invoked = false; f->context->status = -1; f->context->type = -1; rc = gateway__handle(f->gateway, f->handle, DQLITE_REQUEST_EXEC, DQLITE_REQUEST_PARAMS_SCHEMA_V0, f->buf2, handleCb); munit_assert_int(rc, ==, 0); WAIT; ASSERT_CALLBACK(0, RESULT); f->request.sql += offset; ENCODE(&f->request, prepare); HANDLE_SCHEMA_STATUS(DQLITE_REQUEST_PREPARE, 1, 0); WAIT; ASSERT_CALLBACK(0, STMT_WITH_OFFSET); DECODE(&response, stmt_with_offset); munit_assert_int(response.id, ==, 1); munit_assert_uint64(response.offset, ==, 19); return MUNIT_OK; } /****************************************************************************** * * exec * ******************************************************************************/ struct exec_fixture { FIXTURE; struct request_exec request; struct response_result response; }; TEST_SUITE(exec); TEST_SETUP(exec) { struct exec_fixture *f = munit_malloc(sizeof *f); SETUP; OPEN; return f; } TEST_TEAR_DOWN(exec) { struct exec_fixture *f = data; TEAR_DOWN; free(f); } /* Successfully execute a simple statement with no parameters. */ TEST_CASE(exec, simple, NULL) { struct exec_fixture *f = data; uint64_t stmt_id; (void)params; CLUSTER_ELECT(0); PREPARE("CREATE TABLE test (n INT)"); f->request.db_id = 0; f->request.stmt_id = stmt_id; ENCODE(&f->request, exec); HANDLE(EXEC); WAIT; ASSERT_CALLBACK(0, RESULT); DECODE(&f->response, result); munit_assert_int(f->response.last_insert_id, ==, 0); munit_assert_int(f->response.rows_affected, ==, 0); return MUNIT_OK; } /* Successfully execute a statement with one parameter. 
*/ TEST_CASE(exec, one_param, NULL) { struct exec_fixture *f = data; struct value value; uint64_t stmt_id; (void)params; CLUSTER_ELECT(0); /* Create the test table */ EXEC("CREATE TABLE test (n INT)"); /* Insert a row with one parameter */ PREPARE("INSERT INTO test VALUES (?)"); f->request.stmt_id = stmt_id; ENCODE(&f->request, exec); value.type = SQLITE_INTEGER; value.integer = 7; ENCODE_PARAMS(1, &value, TUPLE__PARAMS); HANDLE(EXEC); WAIT; ASSERT_CALLBACK(0, RESULT); DECODE(&f->response, result); munit_assert_int(f->response.last_insert_id, ==, 1); munit_assert_int(f->response.rows_affected, ==, 1); return MUNIT_OK; } /* Successfully execute a statement with a blob parameter. */ TEST_CASE(exec, blob, NULL) { struct exec_fixture *f = data; struct request_query query; struct value value; char buf[8] = { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h' }; uint64_t stmt_id; uint64_t n; const char *column; (void)params; CLUSTER_ELECT(0); /* Create the test table */ EXEC("CREATE TABLE test (data BLOB)"); /* Insert a row with one parameter */ PREPARE("INSERT INTO test VALUES (?)"); f->request.stmt_id = stmt_id; ENCODE(&f->request, exec); value.type = SQLITE_BLOB; value.blob.base = buf; value.blob.len = sizeof buf; ENCODE_PARAMS(1, &value, TUPLE__PARAMS); HANDLE(EXEC); WAIT; ASSERT_CALLBACK(0, RESULT); DECODE(&f->response, result); munit_assert_int(f->response.last_insert_id, ==, 1); munit_assert_int(f->response.rows_affected, ==, 1); PREPARE("SELECT data FROM test"); query.db_id = 0; query.stmt_id = stmt_id; ENCODE(&query, query); HANDLE(QUERY); ASSERT_CALLBACK(0, ROWS); uint64__decode(f->cursor, &n); munit_assert_int(n, ==, 1); text__decode(f->cursor, &column); munit_assert_string_equal(column, "data"); DECODE_ROW(1, &value); munit_assert_int(value.type, ==, SQLITE_BLOB); munit_assert_int(value.blob.len, ==, sizeof buf); munit_assert_int(value.blob.base[0], ==, 'a'); munit_assert_int(value.blob.base[7], ==, 'h'); return MUNIT_OK; } /* The server is not the leader anymore when the first frames hook for a * non-commit frames batch fires. The same leader gets re-elected. */ TEST_CASE(exec, frames_not_leader_1st_non_commit_re_elected, NULL) { struct exec_fixture *f = data; uint64_t stmt_id; unsigned i; (void)params; CLUSTER_ELECT(0); /* Accumulate enough dirty data to fill the page cache */ LOWER_CACHE_SIZE; EXEC("CREATE TABLE test (n INT)"); EXEC("BEGIN"); for (i = 0; i < 162; i++) { EXEC("INSERT INTO test(n) VALUES(1)"); } /* Trigger a page cache flush to the WAL, which fails because we are not * leader anymore */ PREPARE("INSERT INTO test(n) VALUES(1)"); CLUSTER_DEPOSE; EXEC_SUBMIT(stmt_id); ASSERT_CALLBACK(0, FAILURE); ASSERT_FAILURE(SQLITE_IOERR_NOT_LEADER, "not leader"); /* Re-elect ourselves and re-try */ CLUSTER_ELECT(0); EXEC("INSERT INTO test(n) VALUES(1)"); return MUNIT_OK; } /* The server is not the leader anymore when the first frames hook for a * non-commit frames batch fires. Another leader gets elected. 
*/ TEST_CASE(exec, frames_not_leader_1st_non_commit_other_elected, NULL) { struct exec_fixture *f = data; uint64_t stmt_id; unsigned i; (void)params; CLUSTER_ELECT(0); /* Accumulate enough dirty data to fill the page cache */ LOWER_CACHE_SIZE; EXEC("CREATE TABLE test (n INT)"); EXEC("BEGIN"); for (i = 0; i < 162; i++) { EXEC("INSERT INTO test(n) VALUES(1)"); } /* Trigger a page cache flush to the WAL, which fails because we are not * leader anymore */ PREPARE("INSERT INTO test(n) VALUES(1)"); CLUSTER_DEPOSE; EXEC_SUBMIT(stmt_id); ASSERT_CALLBACK(0, FAILURE); ASSERT_FAILURE(SQLITE_IOERR_NOT_LEADER, "not leader"); /* Elect another leader and re-try */ CLUSTER_ELECT(1); SELECT(1); OPEN; EXEC("INSERT INTO test(n) VALUES(1)"); return MUNIT_OK; } /* The server is not the leader anymore when the second frames hook for a * non-commit frames batch fires. The same leader gets re-elected. */ TEST_CASE(exec, frames_not_leader_2nd_non_commit_re_elected, NULL) { struct exec_fixture *f = data; uint64_t stmt_id; unsigned i; (void)params; CLUSTER_ELECT(0); /* Accumulate enough dirty data to fill the page cache a first time, * flush it and then fill it a second time. */ LOWER_CACHE_SIZE; EXEC("CREATE TABLE test (n INT)"); EXEC("BEGIN"); for (i = 0; i < 234; i++) { EXEC("INSERT INTO test(n) VALUES(1)"); } /* Trigger a second page cache flush to the WAL, which fails because we * are not leader anymore */ PREPARE("INSERT INTO test(n) VALUES(1)"); CLUSTER_DEPOSE; EXEC_SUBMIT(stmt_id); ASSERT_CALLBACK(0, FAILURE); ASSERT_FAILURE(SQLITE_IOERR_NOT_LEADER, "not leader"); /* Re-elect ourselves and re-try */ CLUSTER_ELECT(0); EXEC("INSERT INTO test(n) VALUES(1)"); return MUNIT_OK; } /* The gateway is closed while a raft commit is in flight. */ TEST_CASE(exec, close_while_in_flight, NULL) { struct exec_fixture *f = data; unsigned i; (void)params; CLUSTER_ELECT(0); /* Accumulate enough dirty data to fill the page cache and trigger * an apply request. */ LOWER_CACHE_SIZE; EXEC("CREATE TABLE test (n INT)"); EXEC("BEGIN"); for (i = 0; i < 162; i++) { EXEC("INSERT INTO test(n) VALUES(1)"); } /* Trigger a second page cache flush to the WAL, and abort before it's * done. */ EXEC_SQL_SUBMIT("INSERT INTO test(n) VALUES(1)"); return MUNIT_OK; } /* The server is not the leader anymore when the second frames hook for a * non-commit frames batch fires. Another leader gets elected. */ TEST_CASE(exec, frames_not_leader_2nd_non_commit_other_elected, NULL) { struct exec_fixture *f = data; uint64_t stmt_id; unsigned i; (void)params; CLUSTER_ELECT(0); /* Accumulate enough dirty data to fill the page cache a first time, * flush it and then fill it a second time. */ LOWER_CACHE_SIZE; EXEC("CREATE TABLE test (n INT)"); EXEC("BEGIN"); for (i = 0; i < 234; i++) { EXEC("INSERT INTO test(n) VALUES(1)"); } /* Trigger a second page cache flush to the WAL, which fails because we * are not leader anymore */ PREPARE("INSERT INTO test(n) VALUES(1)"); CLUSTER_DEPOSE; EXEC_SUBMIT(stmt_id); ASSERT_CALLBACK(0, FAILURE); /* Elect another leader and re-try */ CLUSTER_ELECT(1); SELECT(1); OPEN; EXEC("INSERT INTO test(n) VALUES(1)"); return MUNIT_OK; } /* The server loses leadership after trying to apply the first Frames command * for a non-commit frames batch. The same leader gets re-elected. 
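 * Unlike the not-leader cases above, the leader here is deposed only after * the Frames command has been submitted, so the failure surfaces as * SQLITE_IOERR_LEADERSHIP_LOST once the in-flight apply fails.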
*/ TEST_CASE(exec, frames_leadership_lost_1st_non_commit_re_elected, NULL) { struct exec_fixture *f = data; uint64_t stmt_id; unsigned i; (void)params; CLUSTER_ELECT(0); /* Accumulate enough dirty data to fill the page cache */ LOWER_CACHE_SIZE; EXEC("CREATE TABLE test (n INT)"); EXEC("BEGIN"); for (i = 0; i < 162; i++) { EXEC("INSERT INTO test(n) VALUES(1)"); } /* Trigger a page cache flush to the WAL */ EXEC("INSERT INTO test(n) VALUES(1)"); /* Try to commit */ PREPARE("COMMIT"); EXEC_SUBMIT(stmt_id); CLUSTER_DEPOSE; ASSERT_CALLBACK(0, FAILURE); ASSERT_FAILURE(SQLITE_IOERR_LEADERSHIP_LOST, "disk I/O error"); /* Re-elect ourselves and re-try */ CLUSTER_ELECT(0); EXEC("INSERT INTO test(n) VALUES(1)"); return MUNIT_OK; } /* The server is not the leader anymore when the undo hook for a writing * transaction fires. The same leader gets re-elected. */ TEST_CASE(exec, undo_not_leader_pending_re_elected, NULL) { struct exec_fixture *f = data; uint64_t stmt_id; unsigned i; (void)params; CLUSTER_ELECT(0); /* Accumulate enough dirty data to fill the page cache a first time */ LOWER_CACHE_SIZE; EXEC("CREATE TABLE test (n INT)"); EXEC("BEGIN"); for (i = 0; i < 163; i++) { EXEC("INSERT INTO test(n) VALUES(1)"); } /* Trying to rollback fails because we are not leader anymore */ PREPARE("ROLLBACK"); CLUSTER_DEPOSE; EXEC_SUBMIT(stmt_id); ASSERT_CALLBACK(0, FAILURE); ASSERT_FAILURE(SQLITE_IOERR_NOT_LEADER, "not leader"); /* Re-elect ourselves and re-try */ CLUSTER_ELECT(0); EXEC("INSERT INTO test(n) VALUES(1)"); return MUNIT_OK; } /* The server is not the leader anymore when the undo hook for a writing * transaction fires. Another leader gets elected. */ TEST_CASE(exec, undo_not_leader_pending_other_elected, NULL) { struct exec_fixture *f = data; uint64_t stmt_id; unsigned i; (void)params; CLUSTER_ELECT(0); /* Accumulate enough dirty data to fill the page cache a first time */ LOWER_CACHE_SIZE; EXEC("CREATE TABLE test (n INT)"); EXEC("BEGIN"); for (i = 0; i < 163; i++) { EXEC("INSERT INTO test(n) VALUES(1)"); } /* Trying to rollback fails because we are not leader anymore */ PREPARE("ROLLBACK"); CLUSTER_DEPOSE; EXEC_SUBMIT(stmt_id); ASSERT_CALLBACK(0, FAILURE); ASSERT_FAILURE(SQLITE_IOERR_NOT_LEADER, "not leader"); /* Elect another leader and re-try */ CLUSTER_ELECT(1); SELECT(1); OPEN; EXEC("INSERT INTO test(n) VALUES(1)"); return MUNIT_OK; } /* A follower remains behind and needs to restore state from a snapshot. */ TEST_CASE(exec, restore, NULL) { struct exec_fixture *f = data; uint64_t stmt_id; struct request_query request; struct response_rows response; struct value value; uint64_t n; const char *column; (void)params; CLUSTER_SNAPSHOT_THRESHOLD(0, 5); CLUSTER_SNAPSHOT_TRAILING(0, 2); CLUSTER_ELECT(0); CLUSTER_DISCONNECT(0, 1); EXEC("CREATE TABLE test (n INT)"); EXEC("INSERT INTO test(n) VALUES(1)"); EXEC("INSERT INTO test(n) VALUES(2)"); CLUSTER_RECONNECT(0, 1); CLUSTER_APPLIED(4); /* TODO: the query below fails because we can exec queries only against * the leader. */ return MUNIT_SKIP; /* The follower contains the expected rows. 
*/ SELECT(1); OPEN; PREPARE("SELECT n FROM test"); request.db_id = 0; request.stmt_id = stmt_id; ENCODE(&request, query); HANDLE(QUERY); ASSERT_CALLBACK(0, ROWS); uint64__decode(f->cursor, &n); munit_assert_int(n, ==, 1); text__decode(f->cursor, &column); munit_assert_string_equal(column, "n"); DECODE_ROW(1, &value); munit_assert_int(value.type, ==, SQLITE_INTEGER); munit_assert_int(value.integer, ==, 1); DECODE_ROW(1, &value); munit_assert_int(value.type, ==, SQLITE_INTEGER); munit_assert_int(value.integer, ==, 2); DECODE(&response, rows); munit_assert_ullong(response.eof, ==, DQLITE_RESPONSE_ROWS_DONE); return MUNIT_OK; } /* Close the gateway early while an exec barrier is in flight. */ TEST_CASE(exec, barrier_closing, NULL) { struct exec_fixture *f = data; uint64_t stmt_id, prev_stmt_id; (void)params; CLUSTER_ELECT(0); EXEC("CREATE TABLE test (n INT)"); /* Save this stmt to exec later */ PREPARE("INSERT INTO test(n) VALUES(2)"); prev_stmt_id = stmt_id; /* Submit exec request, then depose the leader before it commits */ PREPARE("INSERT INTO test(n) VALUES(1)"); EXEC_SUBMIT(stmt_id); CLUSTER_DEPOSE; ASSERT_CALLBACK(0, FAILURE); /* Now try to exec the other stmt (triggering a barrier) and close early */ CLUSTER_ELECT(0); EXEC_SUBMIT(prev_stmt_id); return MUNIT_OK; } /* Send an exec request in the new (schema version 1) format, which * supports larger numbers of parameters. */ TEST_CASE(exec, manyParams, NULL) { struct exec_fixture *f = data; uint64_t stmt_id; size_t len = 20000; char *sql = munit_malloc(len); size_t pos; size_t i; size_t num_exec_params = 999; struct value *values = munit_calloc(num_exec_params, sizeof(*values)); (void)params; pos = snprintf(sql, len, "DELETE FROM test WHERE n = ?"); for (i = 1; i < num_exec_params; i++) { pos += snprintf(sql + pos, len - pos, " OR n = ?"); } for (i = 0; i < num_exec_params; i++) { values[i].type = SQLITE_INTEGER; values[i].integer = i; } CLUSTER_ELECT(0); EXEC("CREATE TABLE test (n INT)"); PREPARE(sql); f->request.db_id = 0; f->request.stmt_id = stmt_id; ENCODE(&f->request, exec); ENCODE_PARAMS(num_exec_params, values, TUPLE__PARAMS32); HANDLE_SCHEMA_STATUS(DQLITE_REQUEST_EXEC, 1, 0); WAIT; ASSERT_CALLBACK(0, RESULT); FINALIZE(stmt_id); free(values); free(sql); return MUNIT_OK; } TEST_CASE(exec, unexpectedRow, NULL) { struct exec_fixture *f = data; uint64_t stmt_id; (void)params; CLUSTER_ELECT(0); EXEC("CREATE TABLE test (n INT)"); EXEC("INSERT INTO test (n) VALUES (1)"); PREPARE("SELECT n FROM test"); f->request.db_id = 0; f->request.stmt_id = stmt_id; ENCODE(&f->request, exec); HANDLE(EXEC); WAIT; ASSERT_CALLBACK(0, FAILURE); ASSERT_FAILURE(SQLITE_ROW, "rows yielded when none expected for EXEC request"); return MUNIT_OK; } /****************************************************************************** * * query * ******************************************************************************/ struct query_fixture { FIXTURE; struct request_query request; struct response_rows response; }; TEST_SUITE(query); TEST_SETUP(query) { struct query_fixture *f = munit_malloc(sizeof *f); SETUP; OPEN; CLUSTER_ELECT(0); EXEC("CREATE TABLE test (n INT, data BLOB)"); return f; } TEST_TEAR_DOWN(query) { struct query_fixture *f = data; TEAR_DOWN; free(f); } /* Successfully query a simple statement with no parameters and yielding no * rows. 
*/ TEST_CASE(query, simple, NULL) { struct query_fixture *f = data; uint64_t stmt_id; uint64_t n; const char *column; (void)params; PREPARE("SELECT n FROM test"); f->request.db_id = 0; f->request.stmt_id = stmt_id; ENCODE(&f->request, query); HANDLE(QUERY); ASSERT_CALLBACK(0, ROWS); uint64__decode(f->cursor, &n); munit_assert_int(n, ==, 1); text__decode(f->cursor, &column); munit_assert_string_equal(column, "n"); DECODE(&f->response, rows); munit_assert_ullong(f->response.eof, ==, DQLITE_RESPONSE_ROWS_DONE); return MUNIT_OK; } /* Successfully query a simple statement with no parameters yielding one row. */ TEST_CASE(query, one_row, NULL) { struct query_fixture *f = data; uint64_t stmt_id; uint64_t n; const char *column; struct value value; (void)params; EXEC("INSERT INTO test(n) VALUES(666)"); PREPARE("SELECT n FROM test"); f->request.db_id = 0; f->request.stmt_id = stmt_id; ENCODE(&f->request, query); HANDLE(QUERY); ASSERT_CALLBACK(0, ROWS); uint64__decode(f->cursor, &n); munit_assert_int(n, ==, 1); text__decode(f->cursor, &column); munit_assert_string_equal(column, "n"); DECODE_ROW(1, &value); munit_assert_int(value.type, ==, SQLITE_INTEGER); munit_assert_int(value.integer, ==, 666); DECODE(&f->response, rows); munit_assert_ullong(f->response.eof, ==, DQLITE_RESPONSE_ROWS_DONE); return MUNIT_OK; } /* Calculate the maximum number of rows that can fit in one response buffer. * A response buffer is _SC_PAGESIZE bytes in size. * A response consists of n tuples, each row_sz bytes in size, * plus an 8B EOF marker. */ static unsigned max_rows_buffer(unsigned tuple_row_sz) { unsigned buf_sz = sysconf(_SC_PAGESIZE); unsigned eof_sz = 8; return (buf_sz - eof_sz) / tuple_row_sz; } /* Successfully execute a query that yields a large number of rows that need * to be split into several responses. */ TEST_CASE(query, large, NULL) { struct query_fixture *f = data; unsigned i; uint64_t stmt_id; uint64_t n; const char *column; struct value value; bool finished; (void)params; EXEC("BEGIN"); /* 16 = 8B header + 8B value (int) */ unsigned n_rows_buffer = max_rows_buffer(16); /* Insert 1 less than 2 response buffers worth of rows, otherwise we * need 3 responses, of which the last one contains no rows. 
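 * (As a worked example, assuming a hypothetical 4096-byte response buffer, * max_rows_buffer(16) is (4096 - 8) / 16 = 255, so the loop below inserts * 2 * 255 - 1 = 509 rows and the result set spans exactly two responses.)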
*/ for (i = 0; i < ((2 * n_rows_buffer) - 1); i++) { EXEC("INSERT INTO test(n) VALUES(123)"); } EXEC("COMMIT"); PREPARE("SELECT n FROM test"); f->request.db_id = 0; f->request.stmt_id = stmt_id; ENCODE(&f->request, query); HANDLE(QUERY); ASSERT_CALLBACK(0, ROWS); uint64__decode(f->cursor, &n); munit_assert_int(n, ==, 1); text__decode(f->cursor, &column); munit_assert_string_equal(column, "n"); /* First response contains max amount of rows */ for (i = 0; i < n_rows_buffer; i++) { DECODE_ROW(1, &value); munit_assert_int(value.type, ==, SQLITE_INTEGER); munit_assert_int(value.integer, ==, 123); } DECODE(&f->response, rows); munit_assert_ullong(f->response.eof, ==, DQLITE_RESPONSE_ROWS_PART); gateway__resume(f->gateway, &finished); munit_assert_false(finished); ASSERT_CALLBACK(0, ROWS); uint64__decode(f->cursor, &n); munit_assert_int(n, ==, 1); text__decode(f->cursor, &column); munit_assert_string_equal(column, "n"); /* Second, and last, response contains 1 less than maximum amount */ for (i = 0; i < n_rows_buffer - 1; i++) { DECODE_ROW(1, &value); munit_assert_int(value.type, ==, SQLITE_INTEGER); munit_assert_int(value.integer, ==, 123); } DECODE(&f->response, rows); munit_assert_ullong(f->response.eof, ==, DQLITE_RESPONSE_ROWS_DONE); gateway__resume(f->gateway, &finished); munit_assert_true(finished); return MUNIT_OK; } /* Perform a query using a prepared statement with parameters */ TEST_CASE(query, params, NULL) { struct query_fixture *f = data; struct value values[2]; uint64_t stmt_id; (void)params; EXEC("BEGIN"); EXEC("INSERT INTO test(n) VALUES(1)"); EXEC("INSERT INTO test(n) VALUES(2)"); EXEC("INSERT INTO test(n) VALUES(3)"); EXEC("INSERT INTO test(n) VALUES(4)"); EXEC("COMMIT"); PREPARE("SELECT n FROM test WHERE n > ? AND n < ?"); f->request.db_id = 0; f->request.stmt_id = stmt_id; ENCODE(&f->request, query); values[0].type = SQLITE_INTEGER; values[0].integer = 1; values[1].type = SQLITE_INTEGER; values[1].integer = 4; ENCODE_PARAMS(2, values, TUPLE__PARAMS); HANDLE(QUERY); ASSERT_CALLBACK(0, ROWS); return MUNIT_OK; } /* Interrupt a large query. */ TEST_CASE(query, interrupt, NULL) { struct query_fixture *f = data; struct request_interrupt interrupt; unsigned i; uint64_t stmt_id; uint64_t n; const char *column; struct value value; (void)params; EXEC("BEGIN"); /* 16 = 8B header + 8B value (int) */ unsigned n_rows_buffer = max_rows_buffer(16); /* Insert 2 response buffers worth of rows */ for (i = 0; i < 2 * n_rows_buffer; i++) { EXEC("INSERT INTO test(n) VALUES(123)"); } EXEC("COMMIT"); PREPARE("SELECT n FROM test"); f->request.db_id = 0; f->request.stmt_id = stmt_id; ENCODE(&f->request, query); HANDLE(QUERY); ASSERT_CALLBACK(0, ROWS); uint64__decode(f->cursor, &n); munit_assert_int(n, ==, 1); text__decode(f->cursor, &column); munit_assert_string_equal(column, "n"); for (i = 0; i < n_rows_buffer; i++) { DECODE_ROW(1, &value); munit_assert_int(value.type, ==, SQLITE_INTEGER); munit_assert_int(value.integer, ==, 123); } DECODE(&f->response, rows); munit_assert_ullong(f->response.eof, ==, DQLITE_RESPONSE_ROWS_PART); ENCODE(&interrupt, interrupt); HANDLE(INTERRUPT); ASSERT_CALLBACK(0, EMPTY); return MUNIT_OK; } /* Interrupt without an active query. */ TEST_CASE(query, interruptInactive, NULL) { struct query_fixture *f = data; struct request_interrupt interrupt; (void)params; ENCODE(&interrupt, interrupt); HANDLE(INTERRUPT); ASSERT_CALLBACK(0, EMPTY); return MUNIT_OK; } /* Close the gateway during a large query. 
*/ TEST_CASE(query, largeClose, NULL) { struct query_fixture *f = data; unsigned i; uint64_t stmt_id; uint64_t n; const char *column; struct value value; (void)params; EXEC("BEGIN"); /* 16 = 8B header + 8B value (int) */ unsigned n_rows_buffer = max_rows_buffer(16); /* Insert 2 response buffers worth of rows */ for (i = 0; i < 2 * n_rows_buffer; i++) { EXEC("INSERT INTO test(n) VALUES(123)"); } EXEC("COMMIT"); PREPARE("SELECT n FROM test"); f->request.db_id = 0; f->request.stmt_id = stmt_id; ENCODE(&f->request, query); HANDLE(QUERY); ASSERT_CALLBACK(0, ROWS); uint64__decode(f->cursor, &n); munit_assert_int(n, ==, 1); text__decode(f->cursor, &column); munit_assert_string_equal(column, "n"); for (i = 0; i < n_rows_buffer; i++) { DECODE_ROW(1, &value); munit_assert_int(value.type, ==, SQLITE_INTEGER); munit_assert_int(value.integer, ==, 123); } DECODE(&f->response, rows); munit_assert_ullong(f->response.eof, ==, DQLITE_RESPONSE_ROWS_PART); return MUNIT_OK; } /* Submit a query request right after the server has been re-elected and needs * to catch up with logs. */ TEST_CASE(query, barrier, NULL) { struct query_fixture *f = data; uint64_t stmt_id; (void)params; PREPARE("INSERT INTO test(n) VALUES(1)"); EXEC_SUBMIT(stmt_id); CLUSTER_DEPOSE; ASSERT_CALLBACK(0, FAILURE); /* Re-elect ourselves and issue a query request */ CLUSTER_ELECT(0); PREPARE("SELECT n FROM test"); f->request.db_id = 0; f->request.stmt_id = stmt_id; ENCODE(&f->request, query); HANDLE(QUERY); WAIT; ASSERT_CALLBACK(0, ROWS); return MUNIT_OK; } /* Submit a query request right after the server has been re-elected and needs * to catch up with logs, then close the gateway early. */ TEST_CASE(query, barrierInFlightQuery, NULL) { struct query_fixture *f = data; uint64_t stmt_id; (void)params; PREPARE("INSERT INTO test(n) VALUES(1)"); EXEC_SUBMIT(stmt_id); CLUSTER_DEPOSE; ASSERT_CALLBACK(0, FAILURE); /* Re-elect ourselves and issue a query request */ CLUSTER_ELECT(0); PREPARE("SELECT n FROM test"); f->request.db_id = 0; f->request.stmt_id = stmt_id; ENCODE(&f->request, query); HANDLE(QUERY); return MUNIT_OK; } /* Submit a query sql request right after the server has been re-elected and * needs to catch up with logs, then close the gateway early. */ TEST_CASE(query, barrierInFlightQuerySql, NULL) { struct query_fixture *f = data; uint64_t stmt_id; (void)params; PREPARE("INSERT INTO test(n) VALUES(1)"); EXEC_SUBMIT(stmt_id); CLUSTER_DEPOSE; ASSERT_CALLBACK(0, FAILURE); /* Re-elect ourselves and issue a query request */ CLUSTER_ELECT(0); QUERY_SQL_SUBMIT("SELECT n FROM test"); return MUNIT_OK; } /* Submit an exec request right after the server has been re-elected and needs * to catch up with logs, then close the gateway early. */ TEST_CASE(query, barrierInFlightExec, NULL) { struct query_fixture *f = data; uint64_t stmt_id; (void)params; PREPARE("INSERT INTO test(n) VALUES(1)"); EXEC_SUBMIT(stmt_id); CLUSTER_DEPOSE; ASSERT_CALLBACK(0, FAILURE); /* Re-elect ourselves and issue an exec request */ CLUSTER_ELECT(0); PREPARE("INSERT INTO test(n) VALUES(2)"); EXEC_SUBMIT(stmt_id); return MUNIT_OK; } /* Send a QUERY request in the new (schema version 1) format, which * supports larger numbers of parameters. 
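 * (TUPLE__PARAMS32, as the name suggests, uses a wider parameter-count field * than the version 0 TUPLE__PARAMS encoding, so the 999 parameters bound by * this test can be represented.)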
*/ TEST_CASE(query, manyParams, NULL) { struct query_fixture *f = data; uint64_t stmt_id; size_t len = 20000; char *sql = munit_malloc(len); size_t pos; size_t i; size_t num_query_params = 999; struct value *values = munit_calloc(num_query_params, sizeof(*values)); (void)params; pos = snprintf(sql, len, "SELECT (n) FROM test WHERE n = ?"); for (i = 1; i < num_query_params; i++) { pos += snprintf(sql + pos, len - pos, " OR n = ?"); } for (i = 0; i < num_query_params; i++) { values[i].type = SQLITE_INTEGER; values[i].integer = i; } PREPARE(sql); f->request.db_id = 0; f->request.stmt_id = stmt_id; ENCODE(&f->request, query); ENCODE_PARAMS(num_query_params, values, TUPLE__PARAMS32); HANDLE_SCHEMA_STATUS(DQLITE_REQUEST_QUERY, 1, 0); WAIT; ASSERT_CALLBACK(0, ROWS); FINALIZE(stmt_id); free(values); free(sql); return MUNIT_OK; } /* Close the gateway while a query that spans multiple responses is in * flight, then resume it. */ TEST_CASE(query, close_while_in_flight, NULL) { struct query_fixture *f = data; unsigned i; uint64_t stmt_id; uint64_t n; const char *column; struct value value; bool finished; (void)params; EXEC("BEGIN"); /* 16 = 8B header + 8B value (int) */ unsigned n_rows_buffer = max_rows_buffer(16); /* Insert 1 less than 2 response buffers worth of rows, otherwise we * need 3 responses, of which the last one contains no rows. */ for (i = 0; i < ((2 * n_rows_buffer) - 1); i++) { EXEC("INSERT INTO test(n) VALUES(123)"); } EXEC("COMMIT"); PREPARE("SELECT n FROM test"); f->request.db_id = 0; f->request.stmt_id = stmt_id; ENCODE(&f->request, query); HANDLE(QUERY); ASSERT_CALLBACK(0, ROWS); uint64__decode(f->cursor, &n); munit_assert_int(n, ==, 1); text__decode(f->cursor, &column); munit_assert_string_equal(column, "n"); /* First response contains max amount of rows */ for (i = 0; i < n_rows_buffer; i++) { DECODE_ROW(1, &value); munit_assert_int(value.type, ==, SQLITE_INTEGER); munit_assert_int(value.integer, ==, 123); } /* Simulate a gateway close */ gateway__close(f->gateway); gateway__resume(f->gateway, &finished); return MUNIT_OK; } /****************************************************************************** * * finalize * ******************************************************************************/ struct finalize_fixture { FIXTURE; struct request_finalize request; struct response_empty response; }; TEST_SUITE(finalize); TEST_SETUP(finalize) { struct finalize_fixture *f = munit_malloc(sizeof *f); SETUP; OPEN; return f; } TEST_TEAR_DOWN(finalize) { struct finalize_fixture *f = data; TEAR_DOWN; free(f); } /* Finalize a prepared statement. */ TEST_CASE(finalize, success, NULL) { uint64_t stmt_id; struct finalize_fixture *f = data; (void)params; CLUSTER_ELECT(0); PREPARE("CREATE TABLE test (n INT)"); f->request.db_id = 0; f->request.stmt_id = stmt_id; ENCODE(&f->request, finalize); HANDLE(FINALIZE); ASSERT_CALLBACK(0, EMPTY); return MUNIT_OK; } /****************************************************************************** * * exec_sql * ******************************************************************************/ struct exec_sql_fixture { FIXTURE; struct request_exec_sql request; struct response_result response; }; TEST_SUITE(exec_sql); TEST_SETUP(exec_sql) { struct exec_sql_fixture *f = munit_malloc(sizeof *f); SETUP; CLUSTER_ELECT(0); OPEN; return f; } TEST_TEAR_DOWN(exec_sql) { struct exec_sql_fixture *f = data; TEAR_DOWN; free(f); } /* Exec a SQL text with a single query. 
*/ TEST_CASE(exec_sql, single, NULL) { struct exec_sql_fixture *f = data; (void)params; f->request.db_id = 0; f->request.sql = "CREATE TABLE test (n INT)"; ENCODE(&f->request, exec_sql); HANDLE(EXEC_SQL); CLUSTER_APPLIED(4); ASSERT_CALLBACK(0, RESULT); return MUNIT_OK; } /* Exec an empty SQL text. */ TEST_CASE(exec_sql, empty1, NULL) { struct exec_sql_fixture *f = data; (void)params; f->request.db_id = 0; f->request.sql = ""; ENCODE(&f->request, exec_sql); HANDLE(EXEC_SQL); WAIT; ASSERT_CALLBACK(0, RESULT); return MUNIT_OK; } /* Exec a SQL text containing only a comment. */ TEST_CASE(exec_sql, empty2, NULL) { struct exec_sql_fixture *f = data; (void)params; f->request.db_id = 0; f->request.sql = " -- Comment"; ENCODE(&f->request, exec_sql); HANDLE(EXEC_SQL); WAIT; ASSERT_CALLBACK(0, RESULT); return MUNIT_OK; } /* Exec an invalid SQL text. */ TEST_CASE(exec_sql, invalid, NULL) { struct exec_sql_fixture *f = data; (void)params; f->request.db_id = 0; f->request.sql = "NOT SQL"; ENCODE(&f->request, exec_sql); HANDLE(EXEC_SQL); WAIT; ASSERT_CALLBACK(0, FAILURE); ASSERT_FAILURE(SQLITE_ERROR, "near \"NOT\": syntax error"); return MUNIT_OK; } /* Exec a SQL text with multiple queries. */ TEST_CASE(exec_sql, multi, NULL) { struct exec_sql_fixture *f = data; (void)params; f->request.db_id = 0; f->request.sql = "CREATE TABLE test (n INT); INSERT INTO test VALUES(1)"; ENCODE(&f->request, exec_sql); HANDLE(EXEC_SQL); WAIT; ASSERT_CALLBACK(0, RESULT); return MUNIT_OK; } /* Exec an ATTACH DATABASE statement -- this should fail. */ TEST_CASE(exec_sql, attach, NULL) { struct exec_sql_fixture *f = data; (void)params; f->request.db_id = 0; f->request.sql = "ATTACH DATABASE foo AS foo"; ENCODE(&f->request, exec_sql); HANDLE(EXEC_SQL); WAIT; ASSERT_CALLBACK(0, FAILURE); ASSERT_FAILURE(SQLITE_ERROR, "too many attached databases - max 0"); return MUNIT_OK; } /* Exec an SQL text and close the gateway early. */ TEST_CASE(exec_sql, closing, NULL) { struct exec_sql_fixture *f = data; (void)params; f->request.db_id = 0; f->request.sql = "CREATE TABLE test (n INT)"; ENCODE(&f->request, exec_sql); HANDLE(EXEC_SQL); return MUNIT_OK; } /* Submit an EXEC_SQL request that triggers a failed barrier operation. */ TEST_CASE(exec_sql, barrier_error, NULL) { struct exec_sql_fixture *f = data; uint64_t stmt_id; (void)params; /* Set up an uncommitted exec operation */ PREPARE("CREATE TABLE test (n INT)"); EXEC_SUBMIT(stmt_id); CLUSTER_DEPOSE; ASSERT_CALLBACK(0, FAILURE); /* Submit an EXEC_SQL request, forcing a barrier, which fails */ CLUSTER_ELECT(0); f->request.db_id = 0; f->request.sql = "INSERT INTO test VALUES(123)"; ENCODE(&f->request, exec_sql); /* We rely on leader__barrier (called by handle_exec_sql) attempting * an allocation using raft_malloc. */ test_raft_heap_fault_config(0, 1); test_raft_heap_fault_enable(); HANDLE_STATUS(DQLITE_REQUEST_EXEC_SQL, RAFT_NOMEM); return MUNIT_OK; } /* Send an EXEC_SQL request in the new (schema version 1) format, which * supports larger numbers of parameters. 
*/ TEST_CASE(exec_sql, manyParams, NULL) { struct exec_sql_fixture *f = data; size_t len = 20000; char *sql = munit_malloc(len); size_t pos; size_t i; size_t num_exec_params = 999; struct value *values = munit_calloc(num_exec_params, sizeof(*values)); (void)params; pos = snprintf(sql, len, "DELETE FROM test WHERE n = ?"); for (i = 1; i < num_exec_params; i++) { pos += snprintf(sql + pos, len - pos, " OR n = ?"); } for (i = 0; i < num_exec_params; i++) { values[i].type = SQLITE_INTEGER; values[i].integer = i; } EXEC("CREATE TABLE test (n INT)"); f->request.db_id = 0; f->request.sql = sql; ENCODE(&f->request, exec_sql); ENCODE_PARAMS(num_exec_params, values, TUPLE__PARAMS32); HANDLE_SCHEMA_STATUS(DQLITE_REQUEST_EXEC_SQL, 1, 0); WAIT; ASSERT_CALLBACK(0, RESULT); free(values); free(sql); return MUNIT_OK; } /****************************************************************************** * * query_sql * ******************************************************************************/ struct query_sql_fixture { FIXTURE; struct request_query_sql request; struct response_rows response; }; TEST_SUITE(query_sql); TEST_SETUP(query_sql) { struct query_sql_fixture *f = munit_malloc(sizeof *f); SETUP; CLUSTER_ELECT(0); OPEN; EXEC("CREATE TABLE test (n INT)"); return f; } TEST_TEAR_DOWN(query_sql) { struct query_sql_fixture *f = data; TEAR_DOWN; free(f); } /* Exec a SQL query whose result set fits in a page. */ TEST_CASE(query_sql, small, NULL) { struct query_sql_fixture *f = data; (void)params; EXEC("INSERT INTO test VALUES(123)"); f->request.db_id = 0; f->request.sql = "SELECT n FROM test"; ENCODE(&f->request, query_sql); HANDLE(QUERY_SQL); ASSERT_CALLBACK(0, ROWS); return MUNIT_OK; } /* Exec an empty query sql. */ TEST_CASE(query_sql, empty1, NULL) { struct query_sql_fixture *f = data; (void)params; EXEC("INSERT INTO test VALUES(123)"); f->request.db_id = 0; f->request.sql = ""; ENCODE(&f->request, query_sql); HANDLE(QUERY_SQL); ASSERT_CALLBACK(0, FAILURE); ASSERT_FAILURE(0, "empty statement"); return MUNIT_OK; } /* Exec an empty query sql. */ TEST_CASE(query_sql, empty2, NULL) { struct query_sql_fixture *f = data; (void)params; EXEC("INSERT INTO test VALUES(123)"); f->request.db_id = 0; f->request.sql = " -- a comment"; ENCODE(&f->request, query_sql); HANDLE(QUERY_SQL); ASSERT_CALLBACK(0, FAILURE); ASSERT_FAILURE(0, "empty statement"); return MUNIT_OK; } /* Exec an invalid query sql. */ TEST_CASE(query_sql, invalid, NULL) { struct query_sql_fixture *f = data; (void)params; EXEC("INSERT INTO test VALUES(123)"); f->request.db_id = 0; f->request.sql = "NOT SQL"; ENCODE(&f->request, query_sql); HANDLE(QUERY_SQL); ASSERT_CALLBACK(0, FAILURE); ASSERT_FAILURE(SQLITE_ERROR, "near \"NOT\": syntax error"); return MUNIT_OK; } /* Exec a SQL query whose result set needs multiple pages. */ TEST_CASE(query_sql, large, NULL) { struct query_sql_fixture *f = data; (void)params; unsigned i; uint64_t n; const char *column; struct value value; bool finished; EXEC("BEGIN"); /* 16 = 8B header + 8B value (int) */ unsigned n_rows_buffer = max_rows_buffer(16); /* Insert 1 less than 2 response buffers worth of rows, otherwise we * need 3 responses, of which the last one contains no rows. 
*/ for (i = 0; i < ((2 * n_rows_buffer) - 1); i++) { EXEC("INSERT INTO test(n) VALUES(123)"); } EXEC("COMMIT"); f->request.db_id = 0; f->request.sql = "SELECT n FROM test"; ENCODE(&f->request, query_sql); HANDLE(QUERY_SQL); ASSERT_CALLBACK(0, ROWS); uint64__decode(f->cursor, &n); munit_assert_int(n, ==, 1); text__decode(f->cursor, &column); munit_assert_string_equal(column, "n"); /* First response contains max amount of rows */ for (i = 0; i < n_rows_buffer; i++) { DECODE_ROW(1, &value); munit_assert_int(value.type, ==, SQLITE_INTEGER); munit_assert_int(value.integer, ==, 123); } DECODE(&f->response, rows); munit_assert_ullong(f->response.eof, ==, DQLITE_RESPONSE_ROWS_PART); gateway__resume(f->gateway, &finished); munit_assert_false(finished); ASSERT_CALLBACK(0, ROWS); uint64__decode(f->cursor, &n); munit_assert_int(n, ==, 1); text__decode(f->cursor, &column); munit_assert_string_equal(column, "n"); /* Second, and last, response contains 1 less than maximum amount */ for (i = 0; i < n_rows_buffer - 1; i++) { DECODE_ROW(1, &value); munit_assert_int(value.type, ==, SQLITE_INTEGER); munit_assert_int(value.integer, ==, 123); } DECODE(&f->response, rows); munit_assert_ullong(f->response.eof, ==, DQLITE_RESPONSE_ROWS_DONE); gateway__resume(f->gateway, &finished); munit_assert_true(finished); return MUNIT_OK; } /* Exec a SQL query whose result set needs multiple pages and close before * receiving the full result set. */ TEST_CASE(query_sql, largeClose, NULL) { struct query_sql_fixture *f = data; (void)params; unsigned i; uint64_t n; const char *column; struct value value; EXEC("BEGIN"); /* 16 = 8B header + 8B value (int) */ unsigned n_rows_buffer = max_rows_buffer(16); /* Insert 1 less than 2 response buffers worth of rows, otherwise we * need 3 responses, of which the last one contains no rows. */ for (i = 0; i < ((2 * n_rows_buffer) - 1); i++) { EXEC("INSERT INTO test(n) VALUES(123)"); } EXEC("COMMIT"); f->request.db_id = 0; f->request.sql = "SELECT n FROM test"; ENCODE(&f->request, query_sql); HANDLE(QUERY_SQL); ASSERT_CALLBACK(0, ROWS); uint64__decode(f->cursor, &n); munit_assert_int(n, ==, 1); text__decode(f->cursor, &column); munit_assert_string_equal(column, "n"); /* First response contains max amount of rows */ for (i = 0; i < n_rows_buffer; i++) { DECODE_ROW(1, &value); munit_assert_int(value.type, ==, SQLITE_INTEGER); munit_assert_int(value.integer, ==, 123); } DECODE(&f->response, rows); munit_assert_ullong(f->response.eof, ==, DQLITE_RESPONSE_ROWS_PART); return MUNIT_OK; } /* Interrupt a large query sql. 
*/ TEST_CASE(query_sql, interrupt, NULL) { struct query_sql_fixture *f = data; struct request_interrupt interrupt; unsigned i; uint64_t n; const char *column; struct value value; (void)params; EXEC("BEGIN"); /* 16 = 8B header + 8B value (int) */ unsigned n_rows_buffer = max_rows_buffer(16); /* Insert 2 response buffers worth of rows */ for (i = 0; i < 2 * n_rows_buffer; i++) { EXEC("INSERT INTO test(n) VALUES(123)"); } EXEC("COMMIT"); f->request.db_id = 0; f->request.sql = "SELECT n FROM test"; ENCODE(&f->request, query_sql); HANDLE(QUERY_SQL); ASSERT_CALLBACK(0, ROWS); uint64__decode(f->cursor, &n); munit_assert_int(n, ==, 1); text__decode(f->cursor, &column); munit_assert_string_equal(column, "n"); for (i = 0; i < n_rows_buffer; i++) { DECODE_ROW(1, &value); munit_assert_int(value.type, ==, SQLITE_INTEGER); munit_assert_int(value.integer, ==, 123); } DECODE(&f->response, rows); munit_assert_ullong(f->response.eof, ==, DQLITE_RESPONSE_ROWS_PART); ENCODE(&interrupt, interrupt); HANDLE(INTERRUPT); ASSERT_CALLBACK(0, EMPTY); return MUNIT_OK; } /* Perform a query with parameters */ TEST_CASE(query_sql, params, NULL) { struct query_sql_fixture *f = data; struct value values[2]; (void)params; EXEC("BEGIN"); EXEC("INSERT INTO test(n) VALUES(1)"); EXEC("INSERT INTO test(n) VALUES(2)"); EXEC("INSERT INTO test(n) VALUES(3)"); EXEC("INSERT INTO test(n) VALUES(4)"); EXEC("COMMIT"); f->request.db_id = 0; f->request.sql = "SELECT n FROM test WHERE n > ? AND n < ?"; ENCODE(&f->request, query_sql); values[0].type = SQLITE_INTEGER; values[0].integer = 1; values[1].type = SQLITE_INTEGER; values[1].integer = 4; ENCODE_PARAMS(2, values, TUPLE__PARAMS); HANDLE(QUERY_SQL); ASSERT_CALLBACK(0, ROWS); return MUNIT_OK; } /* Perform a query and close the gateway early. */ TEST_CASE(query_sql, closing, NULL) { struct query_sql_fixture *f = data; (void)params; EXEC("INSERT INTO test VALUES(123)"); f->request.db_id = 0; f->request.sql = "SELECT n FROM test"; ENCODE(&f->request, query_sql); HANDLE(QUERY_SQL); return MUNIT_OK; } /* Perform a query yielding a lot of rows and close the gateway early. */ TEST_CASE(query_sql, manyClosing, NULL) { (void)params; struct query_sql_fixture *f = data; bool finished; int rv; /* Insert more than maximum amount of rows that can fit in a single * response. 16 = 8B header + 8B value (int) */ unsigned n_rows_buffer = max_rows_buffer(16); for (unsigned i = 0; i < n_rows_buffer + 32; i++) { EXEC("INSERT INTO test VALUES(123)"); } f->request.db_id = 0; f->request.sql = "SELECT n FROM test"; ENCODE(&f->request, query_sql); HANDLE(QUERY_SQL); gateway__close(f->gateway); rv = gateway__resume(f->gateway, &finished); munit_assert_int(rv, ==, 0); return MUNIT_OK; } /* Submit a QUERY_SQL request that triggers a failed barrier operation. */ TEST_CASE(query_sql, barrier_error, NULL) { struct query_sql_fixture *f = data; uint64_t stmt_id; (void)params; /* Set up an uncommitted exec operation */ PREPARE("INSERT INTO test VALUES(123)"); EXEC_SUBMIT(stmt_id); CLUSTER_DEPOSE; ASSERT_CALLBACK(0, FAILURE); /* Submit a QUERY_SQL request, forcing a barrier, which fails */ CLUSTER_ELECT(0); f->request.db_id = 0; f->request.sql = "SELECT n FROM test"; ENCODE(&f->request, query_sql); /* We rely on leader__barrier (called by handle_query_sql) attempting * an allocation using raft_malloc. 
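 * The fault injector below is armed so that the next raft heap allocation * fails, causing the barrier to report RAFT_NOMEM synchronously.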
*/ test_raft_heap_fault_config(0, 1); test_raft_heap_fault_enable(); HANDLE_STATUS(DQLITE_REQUEST_QUERY_SQL, RAFT_NOMEM); return MUNIT_OK; } /* Send a QUERY_SQL request in the new (schema version 1) format, which * supports larger numbers of parameters. */ TEST_CASE(query_sql, manyParams, NULL) { struct query_sql_fixture *f = data; size_t len = 20000; char *sql = munit_malloc(len); size_t pos; size_t i; size_t num_query_params = 999; struct value *values = munit_calloc(num_query_params, sizeof(*values)); (void)params; pos = snprintf(sql, len, "SELECT (n) FROM test WHERE n = ?"); for (i = 1; i < num_query_params; i++) { pos += snprintf(sql + pos, len - pos, " OR n = ?"); } for (i = 0; i < num_query_params; i++) { values[i].type = SQLITE_INTEGER; values[i].integer = i; } f->request.db_id = 0; f->request.sql = sql; ENCODE(&f->request, query_sql); ENCODE_PARAMS(num_query_params, values, TUPLE__PARAMS32); HANDLE_SCHEMA_STATUS(DQLITE_REQUEST_QUERY_SQL, 1, 0); WAIT; ASSERT_CALLBACK(0, ROWS); free(values); free(sql); return MUNIT_OK; } /* Send a QUERY_SQL request containing more than one statement. */ TEST_CASE(query_sql, nonemptyTail, NULL) { struct query_sql_fixture *f = data; (void)params; f->request.db_id = 0; f->request.sql = "SELECT * FROM test; SELECT (n) FROM test"; ENCODE(&f->request, query_sql); HANDLE(QUERY_SQL); ASSERT_CALLBACK(0, FAILURE); ASSERT_FAILURE(SQLITE_ERROR, "nonempty statement tail"); return MUNIT_OK; } /****************************************************************************** * * cluster * ******************************************************************************/ struct request_cluster_fixture { FIXTURE; struct request_cluster request; struct response_servers response; }; TEST_SUITE(request_cluster); TEST_SETUP(request_cluster) { struct request_cluster_fixture *f = munit_malloc(sizeof *f); SETUP; CLUSTER_ELECT(0); return f; } TEST_TEAR_DOWN(request_cluster) { struct request_cluster_fixture *f = data; TEAR_DOWN; free(f); } /* Submit a cluster request with an invalid format version. */ TEST_CASE(request_cluster, unrecognizedFormat, NULL) { struct request_cluster_fixture *f = data; (void)params; f->request.format = 2; ENCODE(&f->request, cluster); HANDLE(CLUSTER); ASSERT_CALLBACK(0, FAILURE); ASSERT_FAILURE(DQLITE_PARSE, "unrecognized cluster format"); return MUNIT_OK; } /****************************************************************************** * * invalid * ******************************************************************************/ struct invalid_fixture { FIXTURE; struct request_leader request; struct response_server response; }; TEST_SUITE(invalid); TEST_SETUP(invalid) { struct invalid_fixture *f = munit_malloc(sizeof *f); SETUP; CLUSTER_ELECT(0); return f; } TEST_TEAR_DOWN(invalid) { struct invalid_fixture *f = data; TEAR_DOWN; free(f); } /* Submit a request with an unrecognized type. 
*/ TEST_CASE(invalid, requestType, NULL) { struct invalid_fixture *f = data; (void)params; ENCODE(&f->request, leader); HANDLE_STATUS(123, 0); ASSERT_CALLBACK(0, FAILURE); ASSERT_FAILURE(DQLITE_PARSE, "unrecognized request type"); return MUNIT_OK; } dqlite-1.16.7/test/unit/test_registry.c000066400000000000000000000032341465252713400201140ustar00rootroot00000000000000#include "../lib/config.h" #include "../lib/heap.h" #include "../lib/logger.h" #include "../lib/registry.h" #include "../lib/runner.h" #include "../lib/sqlite.h" #include "../lib/vfs.h" TEST_MODULE(registry); #define FIXTURE \ FIXTURE_LOGGER; \ FIXTURE_VFS; \ FIXTURE_CONFIG; \ FIXTURE_REGISTRY; #define SETUP \ SETUP_HEAP; \ SETUP_SQLITE; \ SETUP_LOGGER; \ SETUP_VFS; \ SETUP_CONFIG; \ SETUP_REGISTRY; #define TEAR_DOWN \ TEAR_DOWN_REGISTRY; \ TEAR_DOWN_CONFIG; \ TEAR_DOWN_VFS; \ TEAR_DOWN_LOGGER; \ TEAR_DOWN_SQLITE; \ TEAR_DOWN_HEAP; /****************************************************************************** * * db-related APIs. * ******************************************************************************/ struct db_fixture { FIXTURE; }; TEST_SUITE(db); TEST_SETUP(db) { struct db_fixture *f = munit_malloc(sizeof *f); SETUP; return f; } TEST_TEAR_DOWN(db) { struct db_fixture *f = data; TEAR_DOWN; free(f); } /* Get a db that didn't exist before. */ TEST_CASE(db, get_new, NULL) { struct db_fixture *f = data; struct db *db; (void)params; int rc; rc = registry__db_get(&f->registry, "test.db", &db); munit_assert_int(rc, ==, 0); munit_assert_string_equal(db->filename, "test.db"); return MUNIT_OK; } /* Get a previously registered db. */ TEST_CASE(db, get_existing, NULL) { struct db_fixture *f = data; struct db *db1; struct db *db2; (void)params; int rc; rc = registry__db_get(&f->registry, "test.db", &db1); munit_assert_int(rc, ==, 0); rc = registry__db_get(&f->registry, "test.db", &db2); munit_assert_int(rc, ==, 0); munit_assert_ptr_equal(db1, db2); return MUNIT_OK; } dqlite-1.16.7/test/unit/test_replication.c000066400000000000000000000264011465252713400205560ustar00rootroot00000000000000#include "../lib/cluster.h" #include "../lib/runner.h" #include "../../src/format.h" #include "../../src/leader.h" TEST_MODULE(replication_v1); /****************************************************************************** * * Fixture * ******************************************************************************/ #define FIXTURE \ FIXTURE_CLUSTER; \ struct leader leaders[N_SERVERS]; \ sqlite3_stmt *stmt; #define SETUP \ unsigned i; \ pool_ut_fallback()->flags |= POOL_FOR_UT_NOT_ASYNC; \ pool_ut_fallback()->flags |= POOL_FOR_UT; \ SETUP_CLUSTER(V2) \ for (i = 0; i < N_SERVERS; i++) { \ SETUP_LEADER(i); \ } #define SETUP_LEADER(I) \ do { \ struct leader *leader = &f->leaders[I]; \ struct registry *registry = CLUSTER_REGISTRY(I); \ struct db *db; \ int rc2; \ rc2 = registry__db_get(registry, "test.db", &db); \ munit_assert_int(rc2, ==, 0); \ rc2 = leader__init(leader, db, CLUSTER_RAFT(I)); \ munit_assert_int(rc2, ==, 0); \ } while (0) #define TEAR_DOWN \ unsigned i; \ for (i = 0; i < N_SERVERS; i++) { \ TEAR_DOWN_LEADER(i); \ } \ TEAR_DOWN_CLUSTER #define TEAR_DOWN_LEADER(I) \ do { \ struct leader *leader = &f->leaders[I]; \ leader__close(leader); \ } while (0) /****************************************************************************** * * Helper macros. * ******************************************************************************/ /* Return the i'th leader object. 
*/ #define LEADER(I) &f->leaders[I] /* Return the SQLite connection of the i'th leader object */ #define CONN(I) (LEADER(I))->conn /* Prepare the fixture's statement using the connection of the I'th leader */ #define PREPARE(I, SQL) \ { \ int rc2; \ rc2 = sqlite3_prepare_v2(CONN(I), SQL, -1, &f->stmt, NULL); \ munit_assert_int(rc2, ==, 0); \ } /* Reset the fixture's statement, expecting the given return code. */ #define RESET(RC) \ { \ int rc2; \ rc2 = sqlite3_reset(f->stmt); \ munit_assert_int(rc2, ==, RC); \ } /* Finalize the fixture's statement */ #define FINALIZE \ { \ int rc2; \ rc2 = sqlite3_finalize(f->stmt); \ munit_assert_int(rc2, ==, 0); \ } /* Submit an exec request using the I'th leader. */ #define EXEC(I) \ { \ int rc2; \ rc2 = leader__exec(LEADER(I), &f->req, f->stmt, 0, \ fixture_exec_cb); \ munit_assert_int(rc2, ==, 0); \ } /* Convenience to prepare, execute and finalize a statement. */ #define EXEC_SQL(I, SQL) \ PREPARE(I, SQL); \ EXEC(I); \ CLUSTER_APPLIED(CLUSTER_LAST_INDEX(I)); \ FINALIZE /****************************************************************************** * * Assertions. * ******************************************************************************/ /* Assert the number of pages in the WAL file on the I'th node. */ #define ASSERT_WAL_PAGES(I, N) \ { \ struct leader *leader_ = &f->leaders[I]; \ sqlite3_file *file_; \ sqlite_int64 size_; \ int pages_; \ int rv_; \ rv_ = sqlite3_file_control(leader_->conn, "main", \ SQLITE_FCNTL_JOURNAL_POINTER, \ &file_); \ munit_assert_int(rv_, ==, 0); \ rv_ = file_->pMethods->xFileSize(file_, &size_); \ munit_assert_int(rv_, ==, 0); \ pages_ = formatWalCalcFramesNumber( \ leader_->db->config->page_size, size_); \ munit_assert_int(pages_, ==, N); \ } /****************************************************************************** * * leader__init * ******************************************************************************/ struct init_fixture { FIXTURE; }; TEST_SUITE(init); TEST_SETUP(init) { struct init_fixture *f = munit_malloc(sizeof *f); SETUP; return f; } TEST_TEAR_DOWN(init) { struct init_fixture *f = data; TEAR_DOWN; free(f); } /* The connection is open and can be used. */ TEST_CASE(init, conn, NULL) { struct init_fixture *f = data; sqlite3_stmt *stmt; int rc; (void)params; rc = sqlite3_prepare_v2(CONN(0), "SELECT 1", -1, &stmt, NULL); munit_assert_int(rc, ==, 0); sqlite3_finalize(stmt); return MUNIT_OK; } /****************************************************************************** * * leader__exec * ******************************************************************************/ struct exec_fixture { FIXTURE; struct exec req; bool invoked; int status; }; static void fixture_exec_cb(struct exec *req, int status) { struct exec_fixture *f = req->data; f->invoked = true; f->status = status; } TEST_SUITE(exec); TEST_SETUP(exec) { struct exec_fixture *f = munit_malloc(sizeof *f); SETUP; f->req.data = f; return f; } TEST_TEAR_DOWN(exec) { struct exec_fixture *f = data; TEAR_DOWN; free(f); } TEST_CASE(exec, success, NULL) { struct exec_fixture *f = data; (void)params; CLUSTER_ELECT(0); PREPARE(0, "CREATE TABLE test (a INT)"); EXEC(0); CLUSTER_APPLIED(4); munit_assert_true(f->invoked); munit_assert_int(f->status, ==, SQLITE_DONE); FINALIZE; return MUNIT_OK; } /* A snapshot is taken after applying an entry. 
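 * CLUSTER_SNAPSHOT_THRESHOLD(0, 4) configures node 0 to take a snapshot once * enough entries have been applied, so the INSERT below executes after a * snapshot has been taken.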
*/ TEST_CASE(exec, snapshot, NULL) { struct exec_fixture *f = data; (void)params; CLUSTER_SNAPSHOT_THRESHOLD(0, 4); CLUSTER_ELECT(0); PREPARE(0, "CREATE TABLE test (n INT)"); EXEC(0); CLUSTER_APPLIED(4); FINALIZE; PREPARE(0, "INSERT INTO test(n) VALUES(1)"); EXEC(0); CLUSTER_APPLIED(5); munit_assert_true(f->invoked); munit_assert_int(f->status, ==, SQLITE_DONE); FINALIZE; return MUNIT_OK; } /* If a transaction is in progress, no snapshot is taken. */ TEST_CASE(exec, snapshot_busy, NULL) { struct exec_fixture *f = data; (void)params; unsigned i; CLUSTER_SNAPSHOT_THRESHOLD(0, 4); CLUSTER_ELECT(0); EXEC_SQL(0, "PRAGMA cache_size = 1"); EXEC_SQL(0, "CREATE TABLE test (n INT)"); EXEC_SQL(0, "BEGIN"); /* Accumulate enough dirty data to fill the page cache */ for (i = 0; i < 163; i++) { EXEC_SQL(0, "INSERT INTO test(n) VALUES(1)"); } return MUNIT_OK; } /* If the WAL size grows beyond the configured threshold, checkpoint it. */ TEST_CASE(exec, checkpoint, NULL) { struct exec_fixture *f = data; struct config *config = CLUSTER_CONFIG(0); (void)params; config->checkpoint_threshold = 3; CLUSTER_ELECT(0); EXEC_SQL(0, "CREATE TABLE test (n INT)"); EXEC_SQL(0, "INSERT INTO test(n) VALUES(1)"); /* The WAL was truncated. */ ASSERT_WAL_PAGES(0, 0); return MUNIT_OK; } /* If a read transaction is in progress, no checkpoint is taken. */ TEST_CASE(exec, checkpoint_read_lock, NULL) { struct exec_fixture *f = data; struct config *config = CLUSTER_CONFIG(0); struct registry *registry = CLUSTER_REGISTRY(0); struct db *db; struct leader leader2; char *errmsg; int rv; (void)params; config->checkpoint_threshold = 3; CLUSTER_ELECT(0); EXEC_SQL(0, "CREATE TABLE test (n INT)"); /* Initialize another leader. */ rv = registry__db_get(registry, "test.db", &db); munit_assert_int(rv, ==, 0); leader__init(&leader2, db, CLUSTER_RAFT(0)); /* Start a read transaction in the other leader. */ rv = sqlite3_exec(leader2.conn, "BEGIN", NULL, NULL, &errmsg); munit_assert_int(rv, ==, 0); rv = sqlite3_exec(leader2.conn, "SELECT * FROM test", NULL, NULL, &errmsg); munit_assert_int(rv, ==, 0); EXEC_SQL(0, "INSERT INTO test(n) VALUES(1)"); /* The WAL was not truncated. 
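* The open read transaction in leader2 pins a WAL read mark, so the * checkpoint presumably cannot advance past it and the three frames are * expected to still be in place (see ASSERT_WAL_PAGES below).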
*/ ASSERT_WAL_PAGES(0, 3); leader__close(&leader2); return MUNIT_OK; } /****************************************************************************** * * Fixture * ******************************************************************************/ struct fixture { FIXTURE_CLUSTER; struct leader leaders[N_SERVERS]; sqlite3_stmt *stmt; struct exec req; bool invoked; int status; }; static void *setUp(const MunitParameter params[], void *user_data) { struct fixture *f = munit_malloc(sizeof *f); pool_ut_fallback()->flags |= POOL_FOR_UT_NOT_ASYNC; pool_ut_fallback()->flags |= POOL_FOR_UT; SETUP_CLUSTER(V2); SETUP_LEADER(0); f->req.data = f; return f; } static void tearDown(void *data) { struct fixture *f = data; TEAR_DOWN_LEADER(0); TEAR_DOWN_CLUSTER; free(f); } SUITE(replication) static void execCb(struct exec *req, int status) { struct fixture *f = req->data; f->invoked = true; f->status = status; } TEST(replication, exec, setUp, tearDown, 0, NULL) { struct fixture *f = data; int rv; CLUSTER_ELECT(0); PREPARE(0, "BEGIN"); rv = leader__exec(LEADER(0), &f->req, f->stmt, 0, execCb); CLUSTER_APPLIED(3); munit_assert_int(rv, ==, 0); munit_assert_true(f->invoked); munit_assert_int(f->status, ==, SQLITE_DONE); f->invoked = false; FINALIZE; PREPARE(0, "CREATE TABLE test (a INT)"); rv = leader__exec(LEADER(0), &f->req, f->stmt, 0, execCb); munit_assert_int(rv, ==, 0); munit_assert_true(f->invoked); munit_assert_int(f->status, ==, SQLITE_DONE); f->invoked = false; FINALIZE; PREPARE(0, "COMMIT"); rv = leader__exec(LEADER(0), &f->req, f->stmt, 0, execCb); munit_assert_int(rv, ==, 0); munit_assert_false(f->invoked); FINALIZE; CLUSTER_APPLIED(4); munit_assert_true(f->invoked); munit_assert_int(f->status, ==, SQLITE_DONE); PREPARE(0, "SELECT * FROM test"); FINALIZE; SETUP_LEADER(1); PREPARE(1, "SELECT * FROM test"); FINALIZE; TEAR_DOWN_LEADER(1); return MUNIT_OK; } /* If the WAL size grows beyond the configured threshold, checkpoint it. */ TEST(replication, checkpoint, setUp, tearDown, 0, NULL) { struct fixture *f = data; struct config *config = CLUSTER_CONFIG(0); int rv; config->checkpoint_threshold = 3; CLUSTER_ELECT(0); PREPARE(0, "CREATE TABLE test (n INT)"); rv = leader__exec(LEADER(0), &f->req, f->stmt, 0, execCb); munit_assert_int(rv, ==, 0); CLUSTER_APPLIED(4); FINALIZE; PREPARE(0, "INSERT INTO test(n) VALUES(1)"); rv = leader__exec(LEADER(0), &f->req, f->stmt, 0, execCb); munit_assert_int(rv, ==, 0); CLUSTER_APPLIED(6); FINALIZE; /* The WAL was truncated. 
*/ ASSERT_WAL_PAGES(0, 0); return MUNIT_OK; } dqlite-1.16.7/test/unit/test_request.c000066400000000000000000000054131465252713400177350ustar00rootroot00000000000000#include "../../src/request.h" #include "../lib/heap.h" #include "../lib/runner.h" TEST_MODULE(request); /****************************************************************************** * * Fixture * ******************************************************************************/ struct fixture { void *buf; }; static void *setup(const MunitParameter params[], void *user_data) { struct fixture *f; f = munit_malloc(sizeof *f); SETUP_HEAP; f->buf = NULL; return f; } static void tear_down(void *data) { struct fixture *f = data; free(f->buf); TEAR_DOWN_HEAP; free(f); } /****************************************************************************** * * Helper macros * ******************************************************************************/ #define ALLOC_BUF(N) f->buf = munit_malloc(N); /****************************************************************************** * * Serialize * ******************************************************************************/ TEST_SUITE(serialize); TEST_SETUP(serialize, setup); TEST_TEAR_DOWN(serialize, tear_down); TEST_CASE(serialize, leader, NULL) { struct fixture *f = data; struct request_leader request; char *cursor1; struct cursor cursor2; size_t n = request_leader__sizeof(&request); (void)params; ALLOC_BUF(n); cursor1 = f->buf; request_leader__encode(&request, &cursor1); cursor2.p = f->buf; cursor2.cap = n; request_leader__decode(&cursor2, &request); return MUNIT_OK; } /****************************************************************************** * * Decode * ******************************************************************************/ TEST_SUITE(decode); TEST_SETUP(decode, setup); TEST_TEAR_DOWN(decode, tear_down); TEST_CASE(decode, leader, NULL) { (void)data; (void)params; return MUNIT_OK; } #if 0 TEST_CASE(decode, client, NULL) { struct request *request = data; int err; (void)params; test_message_send_client(123, &request->message); err = request_decode(request); munit_assert_int(err, ==, 0); munit_assert_int(request->client.id, ==, 123); return MUNIT_OK; } TEST_CASE(decode, heartbeat, NULL) { struct request *request = data; int err; (void)params; test_message_send_heartbeat(666, &request->message); err = request_decode(request); munit_assert_int(err, ==, 0); munit_assert_int(request->heartbeat.timestamp, ==, 666); return MUNIT_OK; } TEST_CASE(decode, open, NULL) { struct request *request = data; int err; (void)params; test_message_send_open("test.db", 123, "volatile", &request->message); err = request_decode(request); munit_assert_int(err, ==, 0); munit_assert_string_equal(request->open.name, "test.db"); munit_assert_int(request->open.flags, ==, 123); munit_assert_string_equal(request->open.vfs, "volatile"); return MUNIT_OK; } #endif dqlite-1.16.7/test/unit/test_role_management.c000066400000000000000000000255361465252713400214120ustar00rootroot00000000000000#include "../../src/protocol.h" #include "../../src/roles.h" #include "../../src/tracing.h" #include "../lib/runner.h" /* DSL for writing declarative role-management tests */ #define VOTERS(x) x #define STANDBYS(x) x #define ONLINE true #define OFFLINE false #define FAILURE_DOMAIN(x) x #define WEIGHT(x) x #define TARGET(voters_, standbys_) \ do { \ struct adjust_fixture *f = data; \ f->voters = voters_; \ f->standbys = standbys_; \ f->n = 0; \ } while (0) #define BEFORE(id_, role_, online_, failure_domain_, weight_) \ do { \ struct 
adjust_fixture *f = data; \ munit_assert_uint(id_, >, 0); \ munit_assert_uint(id_, <=, 10); \ munit_assert_uint(id_, ==, f->n + 1); \ f->nodes[f->n].id = id_; \ f->nodes[f->n].role = role_; \ f->nodes[f->n].online = online_; \ f->nodes[f->n].failure_domain = failure_domain_; \ f->nodes[f->n].weight = weight_; \ f->n += 1; \ } while (0) #define COMPUTE(id_) \ do { \ struct adjust_fixture *f = data; \ RolesComputeChanges(f->voters, f->standbys, f->nodes, f->n, \ id_, applyChangeCb, f); \ } while (0) #define AFTER(id_, role_) \ do { \ unsigned i_; \ struct adjust_fixture *f = data; \ munit_assert_uint(id_, >, 0); \ munit_assert_uint(id_, <=, f->n); \ for (i_ = 0; i_ < f->n; i_ += 1) { \ if (f->nodes[i_].id == id_) { \ munit_assert_int(f->nodes[i_].role, ==, \ role_); \ break; \ } \ } \ if (i_ == f->n) { \ munit_assert(false); \ } \ } while (0) TEST_MODULE(role_management); TEST_SUITE(adjust); struct adjust_fixture { int voters; int standbys; unsigned n; struct all_node_info nodes[10]; }; static void applyChangeCb(uint64_t id, int role, void *arg) { (void)id; (void)role; (void)arg; } TEST_SETUP(adjust) { (void)params; (void)user_data; struct adjust_fixture *f = munit_malloc(sizeof *f); memset(f, 0, sizeof *f); return f; } TEST_TEAR_DOWN(adjust) { free(data); } /* A standby is promoted when there aren't enough voters. */ TEST_CASE(adjust, promote_voter, NULL) { (void)params; TARGET(VOTERS(3), STANDBYS(0)); BEFORE(1, DQLITE_VOTER, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); BEFORE(2, DQLITE_VOTER, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); BEFORE(3, DQLITE_STANDBY, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); COMPUTE(1); AFTER(1, DQLITE_VOTER); AFTER(2, DQLITE_VOTER); AFTER(3, DQLITE_VOTER); return MUNIT_OK; } /* A voter is demoted when there are too many voters. */ TEST_CASE(adjust, demote_voter, NULL) { (void)params; TARGET(VOTERS(1), STANDBYS(0)); BEFORE(1, DQLITE_VOTER, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); BEFORE(2, DQLITE_VOTER, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); COMPUTE(1); AFTER(1, DQLITE_VOTER); AFTER(2, DQLITE_SPARE); return MUNIT_OK; } /* A spare is promoted when there aren't enough standbys. */ TEST_CASE(adjust, promote_standby, NULL) { (void)params; TARGET(VOTERS(3), STANDBYS(1)); BEFORE(1, DQLITE_VOTER, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); BEFORE(2, DQLITE_VOTER, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); BEFORE(3, DQLITE_VOTER, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); BEFORE(4, DQLITE_SPARE, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); COMPUTE(1); AFTER(1, DQLITE_VOTER); AFTER(2, DQLITE_VOTER); AFTER(3, DQLITE_VOTER); AFTER(4, DQLITE_STANDBY); return MUNIT_OK; } /* A standby is demoted when there are too many standbys. */ TEST_CASE(adjust, demote_standby, NULL) { (void)params; TARGET(VOTERS(1), STANDBYS(0)); BEFORE(1, DQLITE_VOTER, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); BEFORE(2, DQLITE_STANDBY, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); COMPUTE(1); AFTER(1, DQLITE_VOTER); AFTER(2, DQLITE_SPARE); return MUNIT_OK; } /* An offline node is demoted, even when there's a shortage of voters and * standbys. */ TEST_CASE(adjust, demote_offline, NULL) { (void)params; TARGET(VOTERS(3), STANDBYS(1)); BEFORE(1, DQLITE_VOTER, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); BEFORE(2, DQLITE_VOTER, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); BEFORE(3, DQLITE_STANDBY, OFFLINE, FAILURE_DOMAIN(1), WEIGHT(1)); COMPUTE(1); AFTER(1, DQLITE_VOTER); AFTER(2, DQLITE_VOTER); AFTER(3, DQLITE_SPARE); return MUNIT_OK; } /* An offline voter is demoted and an online spare is promoted. 
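* The demotion and the promotion together keep the cluster at the target of * three voters.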
*/ TEST_CASE(adjust, voter_online_exchange, NULL) { (void)params; TARGET(VOTERS(3), STANDBYS(0)); BEFORE(1, DQLITE_VOTER, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); BEFORE(2, DQLITE_VOTER, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); BEFORE(3, DQLITE_VOTER, OFFLINE, FAILURE_DOMAIN(1), WEIGHT(1)); BEFORE(4, DQLITE_SPARE, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); COMPUTE(1); AFTER(1, DQLITE_VOTER); AFTER(2, DQLITE_VOTER); AFTER(3, DQLITE_SPARE); AFTER(4, DQLITE_VOTER); return MUNIT_OK; } /* An offline standby is demoted and an online spare is promoted. */ TEST_CASE(adjust, standby_online_exchange, NULL) { (void)params; TARGET(VOTERS(1), STANDBYS(1)); BEFORE(1, DQLITE_SPARE, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); BEFORE(2, DQLITE_STANDBY, OFFLINE, FAILURE_DOMAIN(1), WEIGHT(1)); BEFORE(3, DQLITE_VOTER, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); COMPUTE(3); AFTER(1, DQLITE_STANDBY); AFTER(2, DQLITE_SPARE); AFTER(3, DQLITE_VOTER); return MUNIT_OK; } /* A standby is promoted to voter, and a spare replaces it. */ TEST_CASE(adjust, voter_standby_promote_succession, NULL) { (void)params; TARGET(VOTERS(3), STANDBYS(1)); BEFORE(1, DQLITE_SPARE, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); BEFORE(2, DQLITE_STANDBY, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); BEFORE(3, DQLITE_VOTER, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); BEFORE(4, DQLITE_VOTER, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); COMPUTE(4); AFTER(1, DQLITE_STANDBY); AFTER(2, DQLITE_VOTER); AFTER(3, DQLITE_VOTER); AFTER(4, DQLITE_VOTER); return MUNIT_OK; } /* A standby with a distinctive failure domain is preferred for promotion. */ TEST_CASE(adjust, voter_failure_domains, NULL) { (void)params; TARGET(VOTERS(3), STANDBYS(1)); BEFORE(1, DQLITE_VOTER, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); BEFORE(2, DQLITE_VOTER, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); BEFORE(3, DQLITE_STANDBY, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); BEFORE(4, DQLITE_STANDBY, ONLINE, FAILURE_DOMAIN(2), WEIGHT(1)); COMPUTE(1); AFTER(1, DQLITE_VOTER); AFTER(2, DQLITE_VOTER); AFTER(3, DQLITE_STANDBY); AFTER(4, DQLITE_VOTER); return MUNIT_OK; } /* A spare with a distinctive failure domain is preferred for promotion. */ TEST_CASE(adjust, standby_failure_domains, NULL) { (void)params; TARGET(VOTERS(1), STANDBYS(1)); BEFORE(1, DQLITE_VOTER, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); BEFORE(2, DQLITE_SPARE, ONLINE, FAILURE_DOMAIN(2), WEIGHT(1)); BEFORE(3, DQLITE_SPARE, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); COMPUTE(1); AFTER(1, DQLITE_VOTER); AFTER(2, DQLITE_STANDBY); AFTER(3, DQLITE_SPARE); return MUNIT_OK; } /* An offline standby is demoted even when it has a distinctive failure domain. */ TEST_CASE(adjust, voter_failure_domains_vs_online, NULL) { (void)params; TARGET(VOTERS(3), STANDBYS(1)); BEFORE(1, DQLITE_VOTER, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); BEFORE(2, DQLITE_VOTER, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); BEFORE(3, DQLITE_STANDBY, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); BEFORE(4, DQLITE_STANDBY, OFFLINE, FAILURE_DOMAIN(2), WEIGHT(1)); COMPUTE(1); AFTER(1, DQLITE_VOTER); AFTER(2, DQLITE_VOTER); AFTER(3, DQLITE_VOTER); AFTER(4, DQLITE_SPARE); return MUNIT_OK; } /* An offline spare is not promoted even when it has a distinctive failure * domain. 
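* In other words, liveness beats failure-domain diversity: the online spare * in the same domain is picked instead.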
*/ TEST_CASE(adjust, standby_failure_domains_vs_online, NULL) { (void)params; TARGET(VOTERS(1), STANDBYS(1)); BEFORE(1, DQLITE_VOTER, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); BEFORE(2, DQLITE_SPARE, OFFLINE, FAILURE_DOMAIN(2), WEIGHT(1)); BEFORE(3, DQLITE_SPARE, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); COMPUTE(1); AFTER(1, DQLITE_VOTER); AFTER(2, DQLITE_SPARE); AFTER(3, DQLITE_STANDBY); return MUNIT_OK; } /* A standby with a lower weight is preferred for promotion. */ TEST_CASE(adjust, voter_weights, NULL) { (void)params; TARGET(VOTERS(3), STANDBYS(1)); BEFORE(1, DQLITE_VOTER, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); BEFORE(2, DQLITE_VOTER, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); BEFORE(3, DQLITE_STANDBY, ONLINE, FAILURE_DOMAIN(1), WEIGHT(2)); BEFORE(4, DQLITE_STANDBY, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); COMPUTE(1); AFTER(1, DQLITE_VOTER); AFTER(2, DQLITE_VOTER); AFTER(3, DQLITE_STANDBY); AFTER(4, DQLITE_VOTER); return MUNIT_OK; } /* A spare with a lower weight is preferred for promotion. */ TEST_CASE(adjust, standby_weights, NULL) { (void)params; TARGET(VOTERS(1), STANDBYS(1)); BEFORE(1, DQLITE_VOTER, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); BEFORE(2, DQLITE_SPARE, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); BEFORE(3, DQLITE_SPARE, ONLINE, FAILURE_DOMAIN(1), WEIGHT(2)); COMPUTE(1); AFTER(1, DQLITE_VOTER); AFTER(2, DQLITE_STANDBY); AFTER(3, DQLITE_SPARE); return MUNIT_OK; } /* A standby with a distinctive failure domain is preferred for promotion over * one with a low weight. */ TEST_CASE(adjust, voter_weights_vs_failure_domains, NULL) { (void)params; TARGET(VOTERS(3), STANDBYS(1)); BEFORE(1, DQLITE_VOTER, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); BEFORE(2, DQLITE_VOTER, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); BEFORE(3, DQLITE_STANDBY, ONLINE, FAILURE_DOMAIN(2), WEIGHT(2)); BEFORE(4, DQLITE_STANDBY, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); COMPUTE(1); AFTER(1, DQLITE_VOTER); AFTER(2, DQLITE_VOTER); AFTER(3, DQLITE_VOTER); AFTER(4, DQLITE_STANDBY); return MUNIT_OK; } /* A spare with a distinctive failure domain is preferred for promotion over one * with a low weight. */ TEST_CASE(adjust, standby_weights_vs_failure_domains, NULL) { (void)params; TARGET(VOTERS(1), STANDBYS(2)); BEFORE(1, DQLITE_VOTER, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); BEFORE(2, DQLITE_STANDBY, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); BEFORE(3, DQLITE_SPARE, ONLINE, FAILURE_DOMAIN(1), WEIGHT(1)); BEFORE(4, DQLITE_SPARE, ONLINE, FAILURE_DOMAIN(2), WEIGHT(2)); COMPUTE(1); AFTER(1, DQLITE_VOTER); AFTER(2, DQLITE_STANDBY); AFTER(3, DQLITE_SPARE); AFTER(4, DQLITE_STANDBY); return MUNIT_OK; } dqlite-1.16.7/test/unit/test_sm.c000066400000000000000000000043411465252713400166630ustar00rootroot00000000000000#include #include "../../src/lib/sm.h" #include "../lib/runner.h" TEST_MODULE(sm); /****************************************************************************** * * SM. * ******************************************************************************/ TEST_SUITE(sm); /** * An example of simple state machine. 
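* A service starts online, may crash into a transient failure state and be * restarted, or may be stopped for good: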
* * TRANSIENT * | ^ * restarted | | crashed * V | * ONLINE--------+ checked * | <-------+ * stopped | * V * OFFLINE */ enum states { S_ONLINE, S_OFFLINE, S_TRANSIENT, S_NR, }; static const struct sm_conf op_states[S_NR] = { [S_ONLINE] = { .flags = SM_INITIAL, .name = "online", .allowed = BITS(S_ONLINE) | BITS(S_TRANSIENT) | BITS(S_OFFLINE), }, [S_TRANSIENT] = {.flags = SM_FAILURE, .name = "transient", .allowed = BITS(S_ONLINE)}, [S_OFFLINE] = { .flags = SM_FINAL, .name = "offline", .allowed = 0, }, }; enum triggers { T_RESTARTED, T_CRASHED, T_CHECKED, T_STOPPED, }; struct op_states_sm { struct sm sm; enum triggers sm_trigger; }; static bool sm_invariant(const struct sm *m, int prev_state) { struct op_states_sm *sm = CONTAINER_OF(m, struct op_states_sm, sm); return ERGO(sm_state(m) == S_ONLINE, ERGO(prev_state == SM_PREV_NONE, sm->sm_trigger == 0)) && ERGO(sm_state(m) == S_ONLINE, ERGO(prev_state == S_ONLINE, sm->sm_trigger == BITS(T_CHECKED)) && ERGO(prev_state == S_TRANSIENT, sm->sm_trigger == BITS(T_RESTARTED))) && ERGO(sm_state(m) == S_TRANSIENT, sm->sm_trigger == BITS(T_CRASHED) && m->rc == -42) && ERGO(sm_state(m) == S_OFFLINE, sm->sm_trigger == BITS(T_STOPPED)); } TEST_CASE(sm, simple, NULL) { (void)data; (void)params; struct op_states_sm sm = {}; struct sm *m = &sm.sm; sm_init(&sm.sm, sm_invariant, NULL, op_states, "test", S_ONLINE); sm.sm_trigger = BITS(T_CHECKED); sm_move(m, S_ONLINE); sm_move(m, S_ONLINE); sm_move(m, S_ONLINE); sm.sm_trigger = BITS(T_CRASHED); sm_fail(m, S_TRANSIENT, -42 /* -rc */); sm.sm_trigger = BITS(T_RESTARTED); sm_move(m, S_ONLINE); sm.sm_trigger = BITS(T_STOPPED); sm_move(m, S_OFFLINE); sm_fini(m); return 0; } dqlite-1.16.7/test/unit/test_tuple.c000066400000000000000000000374001465252713400173770ustar00rootroot00000000000000#include #include "../../src/tuple.h" #include "../lib/runner.h" /* tinycc doesn't have this builtin, nor the warning that it's meant to silence. */ #ifdef __TINYC__ #define __builtin_assume_aligned(x, y) x #endif TEST_MODULE(tuple); /****************************************************************************** * * Helper macros. * ******************************************************************************/ #define DECODER_INIT(N) \ { \ int rc2; \ int format_; \ format_ = (N > 0) ? TUPLE__ROW : TUPLE__PARAMS; \ rc2 = tuple_decoder__init(&decoder, N, format_, &cursor); \ munit_assert_int(rc2, ==, 0); \ } #define DECODER_INIT_PARAMS32 \ { \ int rc2; \ rc2 = tuple_decoder__init(&decoder, 0, TUPLE__PARAMS32, \ &cursor); \ munit_assert_int(rc2, ==, 0); \ } #define DECODER_NEXT \ { \ int rc2; \ rc2 = tuple_decoder__next(&decoder, &value); \ munit_assert_int(rc2, ==, 0); \ } #define ENCODER_INIT(N, FORMAT) \ { \ int rc2; \ rc2 = tuple_encoder__init(&f->encoder, N, FORMAT, &f->buffer); \ munit_assert_int(rc2, ==, 0); \ } #define ENCODER_NEXT \ { \ int rc2; \ rc2 = tuple_encoder__next(&f->encoder, &value); \ munit_assert_int(rc2, ==, 0); \ } /****************************************************************************** * * Assertions. * ******************************************************************************/ #define ASSERT_VALUE_TYPE(TYPE) munit_assert_int(value.type, ==, TYPE) /****************************************************************************** * * Decoder. * ******************************************************************************/ TEST_SUITE(decoder); TEST_GROUP(decoder, init); /* If n is 0, then the parameters format is used to determine the number of * elements of the tuple. 
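* A rough sketch of the header word consumed below (layout inferred from the * buf arrays in these tests): for TUPLE__PARAMS byte 0 carries the element * count, while for TUPLE__PARAMS32 the count takes the first four bytes, * little-endian; the type slots follow.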
*/ TEST_CASE(decoder, init, param, NULL) { struct tuple_decoder decoder; char buf[] = {2, 0, 0, 0, 0, 0, 0, 0}; struct cursor cursor = {buf, sizeof buf}; (void)data; (void)params; DECODER_INIT(0); munit_assert_uint(decoder.n, ==, 2); munit_assert_uint(tuple_decoder__n(&decoder), ==, 2); return MUNIT_OK; } TEST_CASE(decoder, init, param32, NULL) { struct tuple_decoder decoder; char buf[] = {2, 0, 0, 0, 0, 0, 0, 0}; struct cursor cursor = {buf, sizeof buf}; (void)data; (void)params; DECODER_INIT_PARAMS32; munit_assert_uint(decoder.n, ==, 2); munit_assert_uint(tuple_decoder__n(&decoder), ==, 2); return MUNIT_OK; } /* If n is not 0, then it is the number of elements. */ TEST_CASE(decoder, init, row, NULL) { struct tuple_decoder decoder; char buf[] = {2, 0, 0, 0, 0, 0, 0, 0, 0}; struct cursor cursor = {buf, sizeof buf}; (void)data; (void)params; DECODER_INIT(3); munit_assert_uint(decoder.n, ==, 3); munit_assert_uint(tuple_decoder__n(&decoder), ==, 3); return MUNIT_OK; } TEST_GROUP(decoder, row); /* Decode a tuple with row format and only one value. */ TEST_CASE(decoder, row, one_value, NULL) { struct tuple_decoder decoder; char buf[][8] = { {SQLITE_INTEGER, 0, 0, 0, 0, 0, 0, 0}, {7, 0, 0, 0, 0, 0, 0, 0}, }; struct cursor cursor = {(const char *)buf, sizeof buf}; struct value value; (void)data; (void)params; DECODER_INIT(1); DECODER_NEXT; ASSERT_VALUE_TYPE(SQLITE_INTEGER); munit_assert_int64(value.integer, ==, 7); return MUNIT_OK; } /* Decode a tuple with row format and two values. */ TEST_CASE(decoder, row, two_values, NULL) { struct tuple_decoder decoder; char buf[][8] = { {SQLITE_INTEGER | SQLITE_TEXT << 4, 0, 0, 0, 0, 0, 0, 0}, {7, 0, 0, 0, 0, 0, 0, 0}, {'h', 'e', 'l', 'l', 'o', 0, 0, 0}, }; struct cursor cursor = {(const char *)buf, sizeof buf}; struct value value; (void)data; (void)params; DECODER_INIT(2); DECODER_NEXT; ASSERT_VALUE_TYPE(SQLITE_INTEGER); munit_assert_int64(value.integer, ==, 7); DECODER_NEXT; ASSERT_VALUE_TYPE(SQLITE_TEXT); munit_assert_string_equal(value.text, "hello"); return MUNIT_OK; } TEST_GROUP(decoder, params); /* Decode a tuple with params format and only one value. */ TEST_CASE(decoder, params, one_value, NULL) { struct tuple_decoder decoder; char buf[][8] = { {1, SQLITE_INTEGER, 0, 0, 0, 0, 0, 0}, {7, 0, 0, 0, 0, 0, 0, 0}, }; struct cursor cursor = {(const char *)buf, sizeof buf}; struct value value; (void)data; (void)params; DECODER_INIT(0); DECODER_NEXT; ASSERT_VALUE_TYPE(SQLITE_INTEGER); munit_assert_int64(value.integer, ==, 7); return MUNIT_OK; } /* Decode a tuple with params format and two values. */ TEST_CASE(decoder, params, two_values, NULL) { struct tuple_decoder decoder; char buf[][8] = { {2, SQLITE_INTEGER, SQLITE_TEXT, 0, 0, 0, 0, 0}, {7, 0, 0, 0, 0, 0, 0, 0}, {'h', 'e', 'l', 'l', 'o', 0, 0, 0}, }; struct cursor cursor = {(const char *)buf, sizeof buf}; struct value value; (void)data; (void)params; DECODER_INIT(0); DECODER_NEXT; ASSERT_VALUE_TYPE(SQLITE_INTEGER); munit_assert_int64(value.integer, ==, 7); DECODER_NEXT; ASSERT_VALUE_TYPE(SQLITE_TEXT); munit_assert_string_equal(value.text, "hello"); return MUNIT_OK; } TEST_GROUP(decoder, params32); /* Decode a tuple with params32 format and only one value. 
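* Here the count (1) occupies the first four bytes of the header and the * SQLITE_INTEGER type byte follows, matching the buf layout below.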
*/ TEST_CASE(decoder, params32, one_value, NULL) { struct tuple_decoder decoder; char buf[][8] = { {1, 0, 0, 0, SQLITE_INTEGER, 0, 0, 0}, {7, 0, 0, 0, 0, 0, 0, 0}, }; struct cursor cursor = {(const char *)buf, sizeof buf}; struct value value; (void)data; (void)params; DECODER_INIT_PARAMS32; DECODER_NEXT; ASSERT_VALUE_TYPE(SQLITE_INTEGER); munit_assert_int64(value.integer, ==, 7); return MUNIT_OK; } /* Decode a tuple with params32 format and two values. */ TEST_CASE(decoder, params32, two_values, NULL) { struct tuple_decoder decoder; char buf[][8] = { {2, 0, 0, 0, SQLITE_INTEGER, SQLITE_TEXT, 0, 0}, {7, 0, 0, 0, 0, 0, 0, 0}, {'h', 'e', 'l', 'l', 'o', 0, 0, 0}, }; struct cursor cursor = {(const char *)buf, sizeof buf}; struct value value; (void)data; (void)params; DECODER_INIT_PARAMS32; DECODER_NEXT; ASSERT_VALUE_TYPE(SQLITE_INTEGER); munit_assert_int64(value.integer, ==, 7); DECODER_NEXT; ASSERT_VALUE_TYPE(SQLITE_TEXT); munit_assert_string_equal(value.text, "hello"); return MUNIT_OK; } TEST_GROUP(decoder, type); /* Decode a floating point number. */ TEST_CASE(decoder, type, float, NULL) { struct tuple_decoder decoder; uint8_t buf[][8] __attribute__((aligned(sizeof(uint64_t)))) = { {SQLITE_FLOAT, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0}, }; struct cursor cursor = {(const char *)buf, sizeof buf}; struct value value; double pi = 3.1415; (void)data; (void)params; memcpy(buf[1], &pi, sizeof pi); uint64_t *buf_value = __builtin_assume_aligned(buf[1], sizeof(uint64_t)); *buf_value = ByteFlipLe64(*buf_value); DECODER_INIT(1); DECODER_NEXT; ASSERT_VALUE_TYPE(SQLITE_FLOAT); munit_assert_double(value.float_, ==, 3.1415); return MUNIT_OK; } /* Decode a null value. */ TEST_CASE(decoder, type, null, NULL) { struct tuple_decoder decoder; char buf[][8] __attribute__((aligned(sizeof(uint64_t)))) = { {SQLITE_NULL, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0}, }; struct cursor cursor = {(const char *)buf, sizeof buf}; struct value value; (void)data; (void)params; DECODER_INIT(1); DECODER_NEXT; ASSERT_VALUE_TYPE(SQLITE_NULL); return MUNIT_OK; } /* Decode a date string in ISO8601 format. */ TEST_CASE(decoder, type, iso8601, NULL) { struct tuple_decoder decoder; char buf[5][8] __attribute__((aligned(sizeof(uint64_t)))) = { {DQLITE_ISO8601, 0, 0, 0, 0, 0, 0, 0}, }; struct cursor cursor = {(const char *)buf, sizeof buf}; struct value value; (void)data; (void)params; strcpy((char *)buf[1], "2018-07-20 09:49:05+00:00"); DECODER_INIT(1); DECODER_NEXT; ASSERT_VALUE_TYPE(DQLITE_ISO8601); munit_assert_string_equal(value.iso8601, "2018-07-20 09:49:05+00:00"); return MUNIT_OK; } /* Decode a boolean. */ TEST_CASE(decoder, type, boolean, NULL) { struct tuple_decoder decoder; char buf[][8] __attribute__((aligned(sizeof(uint64_t)))) = { {DQLITE_BOOLEAN, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 0, 0}, }; struct cursor cursor = {(const char *)buf, sizeof buf}; struct value value; (void)data; (void)params; DECODER_INIT(1); DECODER_NEXT; ASSERT_VALUE_TYPE(DQLITE_BOOLEAN); munit_assert_uint64(value.boolean, ==, 1); return MUNIT_OK; } /****************************************************************************** * * Encoder. 
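* These cases mirror the decoder tests above: each one writes a header word * plus value words into f->buffer and then inspects the raw bytes.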
* ******************************************************************************/ struct encoder_fixture { struct buffer buffer; struct tuple_encoder encoder; }; TEST_SUITE(encoder); TEST_SETUP(encoder) { struct encoder_fixture *f = munit_malloc(sizeof *f); int rc; (void)params; (void)user_data; rc = buffer__init(&f->buffer); munit_assert_int(rc, ==, 0); return f; } TEST_TEAR_DOWN(encoder) { struct encoder_fixture *f = data; buffer__close(&f->buffer); free(data); } TEST_GROUP(encoder, row); /* Encode a tuple with row format and only one value. */ TEST_CASE(encoder, row, one_value, NULL) { struct encoder_fixture *f = data; struct value value; uint8_t(*buf)[8] = f->buffer.data; (void)params; ENCODER_INIT(1, TUPLE__ROW); value.type = SQLITE_INTEGER; value.integer = 7; ENCODER_NEXT; munit_assert_int(buf[0][0], ==, SQLITE_INTEGER); /* malloc'ed buffer is aligned suitably */ uint64_t *value_ptr = __builtin_assume_aligned(buf[1], sizeof(uint64_t)); munit_assert_uint64(*value_ptr, ==, ByteFlipLe64(7)); return MUNIT_OK; } /* Encode a tuple with row format and two values. */ TEST_CASE(encoder, row, two_values, NULL) { struct encoder_fixture *f = data; struct value value; uint8_t(*buf)[8] = f->buffer.data; (void)params; ENCODER_INIT(2, TUPLE__ROW); value.type = SQLITE_INTEGER; value.integer = 7; ENCODER_NEXT; value.type = SQLITE_TEXT; value.text = "hello"; ENCODER_NEXT; munit_assert_int(buf[0][0], ==, SQLITE_INTEGER | SQLITE_TEXT << 4); /* malloc'ed buffer is aligned suitably */ uint64_t *value_ptr = __builtin_assume_aligned(buf[1], sizeof(uint64_t)); munit_assert_uint64(*value_ptr, ==, ByteFlipLe64(7)); munit_assert_string_equal((const char *)buf[2], "hello"); return MUNIT_OK; } TEST_GROUP(encoder, params); /* Encode a tuple with params format and only one value. */ TEST_CASE(encoder, params, one_value, NULL) { struct encoder_fixture *f = data; struct value value; uint8_t(*buf)[8] = f->buffer.data; (void)params; ENCODER_INIT(1, TUPLE__PARAMS); value.type = SQLITE_INTEGER; value.integer = 7; ENCODER_NEXT; munit_assert_int(buf[0][0], ==, 1); munit_assert_int(buf[0][1], ==, SQLITE_INTEGER); uint64_t *value_ptr = __builtin_assume_aligned(buf[1], sizeof(uint64_t)); munit_assert_uint64(*value_ptr, ==, ByteFlipLe64(7)); return MUNIT_OK; } /* Encode a tuple with params format and two values. */ TEST_CASE(encoder, params, two_values, NULL) { struct encoder_fixture *f = data; struct value value; uint8_t(*buf)[8] = f->buffer.data; (void)params; ENCODER_INIT(2, TUPLE__PARAMS); value.type = SQLITE_INTEGER; value.integer = 7; ENCODER_NEXT; value.type = SQLITE_TEXT; value.text = "hello"; ENCODER_NEXT; munit_assert_int(buf[0][0], ==, 2); munit_assert_int(buf[0][1], ==, SQLITE_INTEGER); munit_assert_int(buf[0][2], ==, SQLITE_TEXT); uint64_t *value_ptr = __builtin_assume_aligned(buf[1], sizeof(uint64_t)); munit_assert_uint64(*value_ptr, ==, ByteFlipLe64(7)); munit_assert_string_equal((const char *)buf[2], "hello"); return MUNIT_OK; } TEST_GROUP(encoder, params32); /* Encode a tuple with params32 format and only one value. 
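* The expected output is a 4-byte little-endian count followed by the type * byte, the same layout the params32 decoder tests consume above.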
*/ TEST_CASE(encoder, params32, one_value, NULL) { struct encoder_fixture *f = data; struct value value; uint8_t(*buf)[8] = f->buffer.data; (void)params; ENCODER_INIT(1, TUPLE__PARAMS32); value.type = SQLITE_INTEGER; value.integer = 7; ENCODER_NEXT; munit_assert_int(buf[0][0], ==, 1); munit_assert_int(buf[0][1], ==, 0); munit_assert_int(buf[0][2], ==, 0); munit_assert_int(buf[0][3], ==, 0); munit_assert_int(buf[0][4], ==, SQLITE_INTEGER); uint64_t *value_ptr = __builtin_assume_aligned(buf[1], sizeof(uint64_t)); munit_assert_uint64(*value_ptr, ==, ByteFlipLe64(7)); return MUNIT_OK; } /* Encode a tuple with params32 format and two values. */ TEST_CASE(encoder, params32, two_values, NULL) { struct encoder_fixture *f = data; struct value value; uint8_t(*buf)[8] = f->buffer.data; (void)params; ENCODER_INIT(2, TUPLE__PARAMS32); value.type = SQLITE_INTEGER; value.integer = 7; ENCODER_NEXT; value.type = SQLITE_TEXT; value.text = "hello"; ENCODER_NEXT; munit_assert_int(buf[0][0], ==, 2); munit_assert_int(buf[0][1], ==, 0); munit_assert_int(buf[0][2], ==, 0); munit_assert_int(buf[0][3], ==, 0); munit_assert_int(buf[0][4], ==, SQLITE_INTEGER); munit_assert_int(buf[0][5], ==, SQLITE_TEXT); uint64_t *value_ptr = __builtin_assume_aligned(buf[1], sizeof(uint64_t)); munit_assert_uint64(*value_ptr, ==, ByteFlipLe64(7)); munit_assert_string_equal((const char *)buf[2], "hello"); return MUNIT_OK; } TEST_GROUP(encoder, type); /* Encode a float parameter. */ TEST_CASE(encoder, type, float, NULL) { struct encoder_fixture *f = data; struct value value; uint8_t(*buf)[8] = f->buffer.data; (void)params; ENCODER_INIT(1, TUPLE__ROW); value.type = SQLITE_FLOAT; value.float_ = 3.1415; ENCODER_NEXT; munit_assert_int(buf[0][0], ==, SQLITE_FLOAT); uint64_t *value_ptr = __builtin_assume_aligned(buf[1], sizeof(uint64_t)); munit_assert_uint64(*value_ptr, ==, ByteFlipLe64(*(uint64_t *)&value.float_)); return MUNIT_OK; } /* Encode a unix time parameter. */ TEST_CASE(encoder, type, unixtime, NULL) { struct encoder_fixture *f = data; struct value value; uint8_t(*buf)[8] = f->buffer.data; (void)params; ENCODER_INIT(1, TUPLE__ROW); value.type = DQLITE_UNIXTIME; value.unixtime = 12345; ENCODER_NEXT; munit_assert_int(buf[0][0], ==, DQLITE_UNIXTIME); uint64_t *value_ptr = __builtin_assume_aligned(buf[1], sizeof(uint64_t)); munit_assert_uint64(*value_ptr, ==, ByteFlipLe64((uint64_t)value.unixtime)); return MUNIT_OK; } /* Encode an ISO8601 date string time parameter. */ TEST_CASE(encoder, type, iso8601, NULL) { struct encoder_fixture *f = data; struct value value; uint8_t(*buf)[8] = f->buffer.data; (void)params; ENCODER_INIT(1, TUPLE__ROW); value.type = DQLITE_ISO8601; value.iso8601 = "2018-07-20 09:49:05+00:00"; ENCODER_NEXT; munit_assert_int(buf[0][0], ==, DQLITE_ISO8601); munit_assert_string_equal((char *)buf[1], "2018-07-20 09:49:05+00:00"); return MUNIT_OK; } /* Encode a boolean parameter. 
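* DQLITE_BOOLEAN is a dqlite extension type; the value is written as a full * 8-byte little-endian word, as the ByteFlipLe64 check below shows.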
*/ TEST_CASE(encoder, type, boolean, NULL) { struct encoder_fixture *f = data; struct value value; uint8_t(*buf)[8] = f->buffer.data; (void)params; ENCODER_INIT(1, TUPLE__ROW); value.type = DQLITE_BOOLEAN; value.boolean = 1; ENCODER_NEXT; munit_assert_int(buf[0][0], ==, DQLITE_BOOLEAN); uint64_t *value_ptr = __builtin_assume_aligned(buf[1], sizeof(uint64_t)); munit_assert_uint64(*value_ptr, ==, ByteFlipLe64(value.boolean)); return MUNIT_OK; } dqlite-1.16.7/test/unit/test_vfs.c000066400000000000000000001207401465252713400170440ustar00rootroot00000000000000#include #include #include #include "../../include/dqlite.h" #include "../lib/config.h" #include "../lib/fs.h" #include "../lib/heap.h" #include "../lib/runner.h" #include "../lib/sqlite.h" #include "../../src/format.h" #include "../../src/raft.h" #include "../../src/vfs.h" static char *bools[] = {"0", "1", NULL}; static MunitParameterEnum vfs_params[] = { {"disk_mode", bools}, {NULL, NULL}, }; /****************************************************************************** * * Fixture * ******************************************************************************/ #define VFS_PATH_SZ 512 struct fixture { struct sqlite3_vfs vfs; char *dir; char path[VFS_PATH_SZ]; }; static void vfsFillPath(struct fixture *f, char *filename) { int rv; const char *dir = f->dir; if (dir != NULL) { rv = snprintf(f->path, VFS_PATH_SZ, "%s/%s", dir, filename); } else { rv = snprintf(f->path, VFS_PATH_SZ, "%s", filename); } munit_assert_int(rv, >, 0); munit_assert_int(rv, <, VFS_PATH_SZ); } /* Sets the page_size in disk_mode. */ static void setPageSizeDisk(const MunitParameter params[], sqlite3_file *f, unsigned page_size, int rv) { int rc; bool disk_mode = false; char page_sz[32]; rc = snprintf(page_sz, sizeof(page_sz), "%u", page_size); munit_assert_int(rc, >, 0); munit_assert_int(rc, <, sizeof(page_sz)); char *fnctl[] = { "", "page_size", "512", "", }; const char *disk_mode_param = munit_parameters_get(params, "disk_mode"); if (disk_mode_param != NULL) { disk_mode = (bool)atoi(disk_mode_param); } if (disk_mode) { rc = f->pMethods->xFileControl(f, SQLITE_FCNTL_PRAGMA, fnctl); munit_assert_int(rc, ==, rv); } } static void *setUp(const MunitParameter params[], void *user_data) { struct fixture *f = munit_malloc(sizeof *f); int rv; bool disk_mode = false; SETUP_HEAP; SETUP_SQLITE; rv = VfsInit(&f->vfs, "dqlite"); munit_assert_int(rv, ==, 0); f->dir = NULL; const char *disk_mode_param = munit_parameters_get(params, "disk_mode"); if (disk_mode_param != NULL) { disk_mode = (bool)atoi(disk_mode_param); } if (disk_mode) { rv = VfsEnableDisk(&f->vfs); munit_assert_int(rv, ==, 0); f->dir = test_dir_setup(); } rv = sqlite3_vfs_register(&f->vfs, 0); munit_assert_int(rv, ==, 0); return f; } static void tearDown(void *data) { struct fixture *f = data; test_dir_tear_down(f->dir); sqlite3_vfs_unregister(&f->vfs); VfsClose(&f->vfs); TEAR_DOWN_SQLITE; TEAR_DOWN_HEAP; free(f); } /****************************************************************************** * * Helpers * ******************************************************************************/ /* Helper for creating a new file */ static sqlite3_file *__file_create(sqlite3_vfs *vfs, const char *name, int type_flag) { sqlite3_file *file = munit_malloc(vfs->szOsFile); int flags; int rc; flags = SQLITE_OPEN_EXCLUSIVE | SQLITE_OPEN_CREATE | SQLITE_OPEN_READWRITE | type_flag; rc = vfs->xOpen(vfs, name, file, flags, &flags); munit_assert_int(rc, ==, 0); return file; } /* Helper for creating a new database file */ static 
sqlite3_file *__file_create_main_db(struct fixture *f) { vfsFillPath(f, "test.db"); return __file_create(&f->vfs, f->path, SQLITE_OPEN_MAIN_DB); } /* Helper for allocating a buffer of 100 bytes containing a database header with * a page size field set to 512 bytes. */ static void *__buf_header_main_db(void) { char *buf = munit_malloc(100 * sizeof *buf); /* Set page size to 512. */ buf[16] = 2; buf[17] = 0; return buf; } /* Helper for allocating a buffer with the content of the first page, i.e. * the header and some other bytes. */ static void *__buf_page_1(void) { char *buf = munit_malloc(512 * sizeof *buf); /* Set page size to 512. */ buf[16] = 2; buf[17] = 0; /* Set some other bytes */ buf[101] = 1; buf[256] = 2; buf[511] = 3; return buf; } /* Helper for allocating a buffer with the content of the second page. */ static void *__buf_page_2(void) { char *buf = munit_malloc(512 * sizeof *buf); buf[0] = 4; buf[256] = 5; buf[511] = 6; return buf; } /* Helper to execute a SQL statement. */ static void __db_exec(sqlite3 *db, const char *sql) { int rc; rc = sqlite3_exec(db, sql, NULL, NULL, NULL); munit_assert_int(rc, ==, SQLITE_OK); } /* Helper to open and initialize a database, setting the page size and * WAL mode. */ static sqlite3 *__db_open(void) { sqlite3 *db; int flags = SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE; int rc; rc = sqlite3_open_v2("test.db", &db, flags, "dqlite"); munit_assert_int(rc, ==, SQLITE_OK); __db_exec(db, "PRAGMA page_size=512"); __db_exec(db, "PRAGMA synchronous=OFF"); __db_exec(db, "PRAGMA journal_mode=WAL"); return db; } /* Helper to close a database. */ static void __db_close(sqlite3 *db) { int rv; rv = sqlite3_close(db); munit_assert_int(rv, ==, SQLITE_OK); } /* Helper to get the mxFrame value of the WAL index object associated with the * given database. */ static uint32_t __wal_idx_mx_frame(sqlite3 *db) { sqlite3_file *file; volatile void *region; uint32_t mx_frame; int rc; rc = sqlite3_file_control(db, "main", SQLITE_FCNTL_FILE_POINTER, &file); munit_assert_int(rc, ==, SQLITE_OK); rc = file->pMethods->xShmMap(file, 0, 0, 0, &region); munit_assert_int(rc, ==, SQLITE_OK); /* The mxFrame number is stored at byte 16 of the WAL index header. See also * https://sqlite.org/walformat.html. */ mx_frame = ((uint32_t *)region)[4]; return mx_frame; } /* Helper to get the read mark array of the WAL index object associated with the * given database. */ static uint32_t *__wal_idx_read_marks(sqlite3 *db) { sqlite3_file *file; volatile void *region; uint32_t *idx; uint32_t *marks; int rc; marks = munit_malloc(FORMAT__WAL_NREADER * sizeof *marks); rc = sqlite3_file_control(db, "main", SQLITE_FCNTL_FILE_POINTER, &file); munit_assert_int(rc, ==, SQLITE_OK); rc = file->pMethods->xShmMap(file, 0, 0, 0, &region); munit_assert_int(rc, ==, SQLITE_OK); /* The read-mark array starts at the 100th byte of the WAL index * header. See also https://sqlite.org/walformat.html. */ idx = (uint32_t *)region; memcpy(marks, &idx[25], (sizeof *idx) * FORMAT__WAL_NREADER); return marks; } /* Helper that returns true if the i'th lock of the shared memory region * associated with the given database is currently held. */ static int __shm_shared_lock_held(sqlite3 *db, int i) { sqlite3_file *file; int flags; int locked; int rc; rc = sqlite3_file_control(db, "main", SQLITE_FCNTL_FILE_POINTER, &file); munit_assert_int(rc, ==, SQLITE_OK); /* Try to acquire an exclusive lock, which will fail if the shared lock * is held. 
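* This is only a probe: if the exclusive lock is granted, it is released * right away, so the helper leaves the lock state as it found it.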
*/ flags = SQLITE_SHM_LOCK | SQLITE_SHM_EXCLUSIVE; rc = file->pMethods->xShmLock(file, i, 1, flags); locked = rc == SQLITE_BUSY; if (rc == SQLITE_OK) { flags = SQLITE_SHM_UNLOCK | SQLITE_SHM_EXCLUSIVE; rc = file->pMethods->xShmLock(file, i, 1, flags); munit_assert_int(rc, ==, SQLITE_OK); } return locked; } /****************************************************************************** * * xOpen * ******************************************************************************/ SUITE(VfsOpen) /* If the EXCLUSIVE and CREATE flags are given, and the file already exists, an * error is returned. */ TEST(VfsOpen, exclusive, setUp, tearDown, 0, vfs_params) { struct fixture *f = data; sqlite3_file *file1 = munit_malloc(f->vfs.szOsFile); sqlite3_file *file2 = munit_malloc(f->vfs.szOsFile); int flags; int rc; (void)params; vfsFillPath(f, "test.db"); flags = SQLITE_OPEN_CREATE | SQLITE_OPEN_MAIN_DB; rc = f->vfs.xOpen(&f->vfs, f->path, file1, flags, &flags); munit_assert_int(rc, ==, SQLITE_OK); flags |= SQLITE_OPEN_EXCLUSIVE; rc = f->vfs.xOpen(&f->vfs, f->path, file2, flags, &flags); munit_assert_int(rc, ==, SQLITE_CANTOPEN); munit_assert_int(EEXIST, ==, f->vfs.xGetLastError(&f->vfs, 0, 0)); rc = file1->pMethods->xClose(file1); munit_assert_int(rc, ==, SQLITE_OK); free(file2); free(file1); return MUNIT_OK; } /* It's possible to open again a previously created file. In that case passing * SQLITE_OPEN_CREATE is not necessary. */ TEST(VfsOpen, again, setUp, tearDown, 0, vfs_params) { struct fixture *f = data; sqlite3_file *file = munit_malloc(f->vfs.szOsFile); int flags; int rc; (void)params; vfsFillPath(f, "test.db"); flags = SQLITE_OPEN_CREATE | SQLITE_OPEN_MAIN_DB; rc = f->vfs.xOpen(&f->vfs, f->path, file, flags, &flags); munit_assert_int(rc, ==, SQLITE_OK); rc = file->pMethods->xClose(file); munit_assert_int(rc, ==, SQLITE_OK); flags = SQLITE_OPEN_READWRITE | SQLITE_OPEN_MAIN_DB; rc = f->vfs.xOpen(&f->vfs, f->path, file, flags, &flags); munit_assert_int(rc, ==, 0); rc = file->pMethods->xClose(file); munit_assert_int(rc, ==, SQLITE_OK); free(file); return MUNIT_OK; } /* If the file does not exist and the SQLITE_OPEN_CREATE flag is not passed, an * error is returned. */ TEST(VfsOpen, noent, setUp, tearDown, 0, vfs_params) { struct fixture *f = data; sqlite3_file *file = munit_malloc(f->vfs.szOsFile); int flags; int rc; (void)params; vfsFillPath(f, "test.db"); rc = f->vfs.xOpen(&f->vfs, f->path, file, 0, &flags); munit_assert_int(rc, ==, SQLITE_CANTOPEN); munit_assert_int(ENOENT, ==, f->vfs.xGetLastError(&f->vfs, 0, 0)); free(file); return MUNIT_OK; } /* Trying to open a WAL file before its main database file results in an * error. */ TEST(VfsOpen, walBeforeDb, setUp, tearDown, 0, vfs_params) { struct fixture *f = data; sqlite3_file *file = munit_malloc(f->vfs.szOsFile); int flags; int rc; (void)params; vfsFillPath(f, "test.db"); flags = SQLITE_OPEN_CREATE | SQLITE_OPEN_WAL; rc = f->vfs.xOpen(&f->vfs, f->path, file, flags, &flags); munit_assert_int(rc, ==, SQLITE_CANTOPEN); free(file); return MUNIT_OK; } /* Trying to run queries against a database that hasn't turned off the * synchronous flag results in an error. 
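* The test below sets the page size but deliberately skips PRAGMA * synchronous=OFF, so the switch to WAL mode is refused with SQLITE_IOERR.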
*/ TEST(VfsOpen, synchronous, setUp, tearDown, 0, vfs_params) { struct fixture *f = data; sqlite3 *db; int flags = SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE; int rc; (void)params; vfsFillPath(f, "test.db"); rc = sqlite3_vfs_register(&f->vfs, 0); munit_assert_int(rc, ==, SQLITE_OK); rc = sqlite3_open_v2(f->path, &db, flags, f->vfs.zName); munit_assert_int(rc, ==, SQLITE_OK); __db_exec(db, "PRAGMA page_size=4092"); rc = sqlite3_exec(db, "PRAGMA journal_mode=WAL", NULL, NULL, NULL); munit_assert_int(rc, ==, SQLITE_IOERR); munit_assert_string_equal(sqlite3_errmsg(db), "disk I/O error"); __db_close(db); rc = sqlite3_vfs_unregister(&f->vfs); munit_assert_int(rc, ==, SQLITE_OK); return MUNIT_OK; } /* Out of memory when creating the content structure for a new file. */ TEST(VfsOpen, oom, setUp, tearDown, 0, vfs_params) { struct fixture *f = data; sqlite3_file *file = munit_malloc(f->vfs.szOsFile); int flags = SQLITE_OPEN_CREATE | SQLITE_OPEN_MAIN_DB; int rc; (void)params; vfsFillPath(f, "test.db"); test_heap_fault_config(0, 1); test_heap_fault_enable(); rc = f->vfs.xOpen(&f->vfs, f->path, file, flags, &flags); munit_assert_int(rc, ==, SQLITE_CANTOPEN); free(file); return MUNIT_OK; } /* Out of memory when internally copying the filename. */ TEST(VfsOpen, oomFilename, setUp, tearDown, 0, vfs_params) { struct fixture *f = data; sqlite3_file *file = munit_malloc(f->vfs.szOsFile); int flags = SQLITE_OPEN_CREATE | SQLITE_OPEN_MAIN_DB; int rc; (void)params; vfsFillPath(f, "test.db"); test_heap_fault_config(1, 1); test_heap_fault_enable(); rc = f->vfs.xOpen(&f->vfs, f->path, file, flags, &flags); munit_assert_int(rc, ==, SQLITE_CANTOPEN); free(file); return MUNIT_OK; } /* Open a temporary file. */ TEST(VfsOpen, tmp, setUp, tearDown, 0, vfs_params) { struct fixture *f = data; sqlite3_file *file = munit_malloc(f->vfs.szOsFile); int flags = 0; char buf[16]; int rc; (void)params; flags |= SQLITE_OPEN_CREATE; flags |= SQLITE_OPEN_READWRITE; flags |= SQLITE_OPEN_TEMP_JOURNAL; flags |= SQLITE_OPEN_DELETEONCLOSE; rc = f->vfs.xOpen(&f->vfs, NULL, file, flags, &flags); munit_assert_int(rc, ==, SQLITE_OK); rc = file->pMethods->xWrite(file, "hello", 5, 0); munit_assert_int(rc, ==, SQLITE_OK); memset(buf, 0, sizeof buf); rc = file->pMethods->xRead(file, buf, 5, 0); munit_assert_int(rc, ==, SQLITE_OK); munit_assert_string_equal(buf, "hello"); rc = file->pMethods->xClose(file); munit_assert_int(rc, ==, SQLITE_OK); free(file); return MUNIT_OK; } /****************************************************************************** * * xDelete * ******************************************************************************/ SUITE(VfsDelete) /* Delete a file. */ TEST(VfsDelete, success, setUp, tearDown, 0, vfs_params) { struct fixture *f = data; sqlite3_file *file = munit_malloc(f->vfs.szOsFile); int flags = SQLITE_OPEN_CREATE | SQLITE_OPEN_MAIN_DB; int rc; (void)params; vfsFillPath(f, "test.db"); rc = f->vfs.xOpen(&f->vfs, f->path, file, flags, &flags); munit_assert_int(rc, ==, 0); rc = file->pMethods->xClose(file); munit_assert_int(rc, ==, 0); rc = f->vfs.xDelete(&f->vfs, f->path, 0); munit_assert_int(rc, ==, 0); /* Trying to open the file again without the SQLITE_OPEN_CREATE flag * results in an error. */ rc = f->vfs.xOpen(&f->vfs, f->path, file, 0, &flags); munit_assert_int(rc, ==, SQLITE_CANTOPEN); free(file); return MUNIT_OK; } /* Trying to delete a non-existing file results in an error. 
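* The VFS is expected to return SQLITE_IOERR_DELETE_NOENT and to report * ENOENT through xGetLastError.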
*/ TEST(VfsDelete, enoent, setUp, tearDown, 0, vfs_params) { struct fixture *f = data; int rc; (void)params; vfsFillPath(f, "test.db"); rc = f->vfs.xDelete(&f->vfs, f->path, 0); munit_assert_int(rc, ==, SQLITE_IOERR_DELETE_NOENT); munit_assert_int(ENOENT, ==, f->vfs.xGetLastError(&f->vfs, 0, 0)); return MUNIT_OK; } /****************************************************************************** * * xAccess * ******************************************************************************/ SUITE(VfsAccess) /* Accessing an existing file returns true. */ TEST(VfsAccess, success, setUp, tearDown, 0, vfs_params) { struct fixture *f = data; sqlite3_file *file = munit_malloc(f->vfs.szOsFile); int flags = SQLITE_OPEN_CREATE | SQLITE_OPEN_MAIN_DB | SQLITE_OPEN_READWRITE; int rc; int exists; vfsFillPath(f, "test.db"); rc = f->vfs.xOpen(&f->vfs, f->path, file, flags, &flags); munit_assert_int(rc, ==, 0); setPageSizeDisk(params, file, 512, SQLITE_NOTFOUND); /* Write the first page, containing the header and some content. */ void *buf_page_1 = __buf_page_1(); rc = file->pMethods->xWrite(file, buf_page_1, 512, 0); munit_assert_int(rc, ==, 0); rc = file->pMethods->xClose(file); munit_assert_int(rc, ==, 0); rc = f->vfs.xAccess(&f->vfs, f->path, SQLITE_ACCESS_EXISTS, &exists); munit_assert_int(rc, ==, 0); munit_assert_true(exists); free(file); free(buf_page_1); return MUNIT_OK; } /* Trying to access a non-existing file returns false. */ TEST(VfsAccess, noent, setUp, tearDown, 0, vfs_params) { struct fixture *f = data; int rc; int exists; (void)params; vfsFillPath(f, "test.db"); rc = f->vfs.xAccess(&f->vfs, f->path, SQLITE_ACCESS_EXISTS, &exists); munit_assert_int(rc, ==, 0); munit_assert_false(exists); return MUNIT_OK; } /****************************************************************************** * * xFullPathname * ******************************************************************************/ SUITE(VfsFullPathname); /* The xFullPathname API returns the filename unchanged. */ TEST(VfsFullPathname, success, setUp, tearDown, 0, vfs_params) { struct fixture *f = data; int rc; char pathname[10]; (void)params; rc = f->vfs.xFullPathname(&f->vfs, "test.db", 10, pathname); munit_assert_int(rc, ==, 0); munit_assert_string_equal(pathname, "test.db"); return MUNIT_OK; } /****************************************************************************** * * xClose * ******************************************************************************/ SUITE(VfsClose) /* Closing a file decreases its refcount so it's possible to delete it. */ TEST(VfsClose, thenDelete, setUp, tearDown, 0, vfs_params) { struct fixture *f = data; sqlite3_file *file = munit_malloc(f->vfs.szOsFile); int flags = SQLITE_OPEN_CREATE | SQLITE_OPEN_MAIN_DB; int rc; (void)params; vfsFillPath(f, "test.db"); rc = f->vfs.xOpen(&f->vfs, f->path, file, flags, &flags); munit_assert_int(rc, ==, 0); rc = file->pMethods->xClose(file); munit_assert_int(rc, ==, 0); rc = f->vfs.xDelete(&f->vfs, f->path, 0); munit_assert_int(rc, ==, 0); free(file); return MUNIT_OK; } /****************************************************************************** * * xRead * ******************************************************************************/ SUITE(VfsRead) /* Trying to read a file that was not written yet results in an error. 
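* SQLITE_IOERR_SHORT_READ is returned and the destination buffer is * zero-filled, which is the short-read contract SQLite expects from xRead.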
*/ TEST(VfsRead, neverWritten, setUp, tearDown, 0, vfs_params) { struct fixture *f = data; sqlite3_file *file = __file_create_main_db(f); int rc; char buf[1] = {123}; (void)params; rc = file->pMethods->xRead(file, (void *)buf, 1, 0); munit_assert_int(rc, ==, SQLITE_IOERR_SHORT_READ); /* The buffer gets filled with zero */ munit_assert_int(buf[0], ==, 0); rc = file->pMethods->xClose(file); munit_assert_int(rc, ==, 0); free(file); return MUNIT_OK; } /****************************************************************************** * * xWrite * ******************************************************************************/ SUITE(VfsWrite) /* Write the header of the database file. */ TEST(VfsWrite, dbHeader, setUp, tearDown, 0, vfs_params) { struct fixture *f = data; sqlite3_file *file = __file_create_main_db(f); void *buf = __buf_header_main_db(); int rc; (void)params; rc = file->pMethods->xWrite(file, buf, 100, 0); munit_assert_int(rc, ==, 0); rc = file->pMethods->xClose(file); munit_assert_int(rc, ==, 0); free(file); free(buf); return MUNIT_OK; } /* Write the header of the database file, then the full first page and a second * page. */ TEST(VfsWrite, andReadPages, setUp, tearDown, 0, vfs_params) { struct fixture *f = data; sqlite3_file *file = __file_create_main_db(f); int rc; char buf[512]; void *buf_header_main = __buf_header_main_db(); void *buf_page_1 = __buf_page_1(); void *buf_page_2 = __buf_page_2(); (void)params; memset(buf, 0, 512); /* Write the header. */ rc = file->pMethods->xWrite(file, buf_header_main, 100, 0); munit_assert_int(rc, ==, 0); /* Write the first page, containing the header and some content. */ rc = file->pMethods->xWrite(file, buf_page_1, 512, 0); munit_assert_int(rc, ==, 0); /* Set the page_size in disk_mode */ setPageSizeDisk(params, file, 512, SQLITE_NOTFOUND); /* Write a second page. */ rc = file->pMethods->xWrite(file, buf_page_2, 512, 512); munit_assert_int(rc, ==, 0); /* Read the page header. */ rc = file->pMethods->xRead(file, (void *)buf, 512, 0); munit_assert_int(rc, ==, 0); munit_assert_int(buf[16], ==, 2); munit_assert_int(buf[17], ==, 0); munit_assert_int(buf[101], ==, 1); munit_assert_int(buf[256], ==, 2); munit_assert_int(buf[511], ==, 3); /* Read the second page. */ memset(buf, 0, 512); rc = file->pMethods->xRead(file, (void *)buf, 512, 512); munit_assert_int(rc, ==, 0); munit_assert_int(buf[0], ==, 4); munit_assert_int(buf[256], ==, 5); munit_assert_int(buf[511], ==, 6); free(buf_header_main); free(buf_page_1); free(buf_page_2); rc = file->pMethods->xClose(file); munit_assert_int(rc, ==, 0); free(file); return MUNIT_OK; } /* Out of memory when trying to create a new page. */ TEST(VfsWrite, oomPage, setUp, tearDown, 0, NULL) { struct fixture *f = data; sqlite3_file *file = __file_create_main_db(f); void *buf_header_main = __buf_header_main_db(); char buf[512]; int rc; test_heap_fault_config(0, 1); test_heap_fault_enable(); (void)params; memset(buf, 0, 512); /* Write the database header, which triggers creating the first page. */ rc = file->pMethods->xWrite(file, buf_header_main, 100, 0); munit_assert_int(rc, ==, SQLITE_NOMEM); free(buf_header_main); free(file); return MUNIT_OK; } /* Out of memory when trying to append a new page to the internal page array of * the content object. 
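* The fault injector is configured with delay 1 and repeat 1, so the second * heap allocation fails; here that is presumably the growth of the page * array.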
*/ TEST(VfsWrite, oomPageArray, setUp, tearDown, 0, NULL) { struct fixture *f = data; sqlite3_file *file = __file_create_main_db(f); void *buf_header_main = __buf_header_main_db(); char buf[512]; int rc; test_heap_fault_config(1, 1); test_heap_fault_enable(); (void)params; memset(buf, 0, 512); /* Write the database header, which triggers creating the first page. */ rc = file->pMethods->xWrite(file, buf_header_main, 100, 0); munit_assert_int(rc, ==, SQLITE_NOMEM); free(buf_header_main); free(file); return MUNIT_OK; } /* Out of memory when trying to create the content buffer of a new page. */ TEST(VfsWrite, oomPageBuf, setUp, tearDown, 0, NULL) { struct fixture *f = data; sqlite3_file *file = __file_create_main_db(f); void *buf_header_main = __buf_header_main_db(); char buf[512]; int rc; test_heap_fault_config(1, 1); test_heap_fault_enable(); (void)params; memset(buf, 0, 512); /* Write the database header, which triggers creating the first page. */ rc = file->pMethods->xWrite(file, buf_header_main, 100, 0); munit_assert_int(rc, ==, SQLITE_NOMEM); free(buf_header_main); free(file); return MUNIT_OK; } /* Trying to write two pages beyond the last one results in an error. */ TEST(VfsWrite, beyondLast, setUp, tearDown, 0, NULL) { struct fixture *f = data; sqlite3_file *file = __file_create_main_db(f); void *buf_page_1 = __buf_page_1(); void *buf_page_2 = __buf_page_2(); char buf[512]; int rc; (void)params; memset(buf, 0, 512); /* Write the first page. */ rc = file->pMethods->xWrite(file, buf_page_1, 512, 0); munit_assert_int(rc, ==, 0); /* Write the third page, without writing the second. */ rc = file->pMethods->xWrite(file, buf_page_2, 512, 1024); munit_assert_int(rc, ==, SQLITE_IOERR_WRITE); free(buf_page_1); free(buf_page_2); free(file); return MUNIT_OK; } /****************************************************************************** * * xTruncate * ******************************************************************************/ SUITE(VfsTruncate); /* Truncate the main database file. */ TEST(VfsTruncate, database, setUp, tearDown, 0, vfs_params) { struct fixture *f = data; sqlite3_file *file = __file_create_main_db(f); void *buf_page_1 = __buf_page_1(); void *buf_page_2 = __buf_page_2(); int rc; sqlite_int64 size; (void)params; /* Initial size is 0. */ rc = file->pMethods->xFileSize(file, &size); munit_assert_int(rc, ==, 0); munit_assert_int(size, ==, 0); /* Truncating an empty file is a no-op. */ rc = file->pMethods->xTruncate(file, 0); munit_assert_int(rc, ==, 0); /* The size is still 0. */ rc = file->pMethods->xFileSize(file, &size); munit_assert_int(rc, ==, 0); munit_assert_int(size, ==, 0); /* Set the page_size in disk_mode */ setPageSizeDisk(params, file, 512, SQLITE_NOTFOUND); /* Write the first page, containing the header. */ rc = file->pMethods->xWrite(file, buf_page_1, 512, 0); munit_assert_int(rc, ==, 0); /* Write a second page. */ rc = file->pMethods->xWrite(file, buf_page_2, 512, 512); munit_assert_int(rc, ==, 0); /* The size is 1024. */ rc = file->pMethods->xFileSize(file, &size); munit_assert_int(rc, ==, 0); munit_assert_int(size, ==, 1024); /* Truncate the second page. */ rc = file->pMethods->xTruncate(file, 512); munit_assert_int(rc, ==, 0); /* The size is 512. */ rc = file->pMethods->xFileSize(file, &size); munit_assert_int(rc, ==, 0); munit_assert_int(size, ==, 512); /* Truncate also the first. */ rc = file->pMethods->xTruncate(file, 0); munit_assert_int(rc, ==, 0); /* The size is 0. 
*/ rc = file->pMethods->xFileSize(file, &size); munit_assert_int(rc, ==, 0); munit_assert_int(size, ==, 0); free(buf_page_1); free(buf_page_2); rc = file->pMethods->xClose(file); munit_assert_int(rc, ==, 0); free(file); return MUNIT_OK; } /* Truncating a file which is not the main db file or the WAL file produces an * error. */ TEST(VfsTruncate, unexpected, setUp, tearDown, 0, vfs_params) { struct fixture *f = data; sqlite3_file *main_db = __file_create_main_db(f); sqlite3_file *file = munit_malloc(f->vfs.szOsFile); int flags = SQLITE_OPEN_CREATE | SQLITE_OPEN_MAIN_JOURNAL; char buf[32]; char journal[512]; int rc; (void)params; /* Open a journal file. */ rc = snprintf(journal, sizeof(journal), "%s-journal", f->path); munit_assert_int(rc, >, 0); munit_assert_int(rc, <, sizeof(journal)); rc = f->vfs.xOpen(&f->vfs, journal, file, flags, &flags); munit_assert_int(rc, ==, 0); /* Write some content. */ memset(buf, 0, sizeof buf); rc = file->pMethods->xWrite(file, buf, 32, 0); munit_assert_int(rc, ==, 0); /* Truncating produces an error. */ rc = file->pMethods->xTruncate(file, 0); munit_assert_int(rc, ==, SQLITE_IOERR_TRUNCATE); rc = main_db->pMethods->xClose(main_db); munit_assert_int(rc, ==, 0); free(main_db); rc = file->pMethods->xClose(file); munit_assert_int(rc, ==, 0); free(file); return MUNIT_OK; } /* Truncating an empty file is a no-op. */ TEST(VfsTruncate, empty, setUp, tearDown, 0, vfs_params) { struct fixture *f = data; sqlite3_file *file = __file_create_main_db(f); sqlite_int64 size; int rc; (void)params; /* Truncating an empty file is a no-op. */ rc = file->pMethods->xTruncate(file, 0); munit_assert_int(rc, ==, SQLITE_OK); /* Size is 0. */ rc = file->pMethods->xFileSize(file, &size); munit_assert_int(rc, ==, 0); munit_assert_int(size, ==, 0); rc = file->pMethods->xClose(file); munit_assert_int(rc, ==, 0); free(file); return MUNIT_OK; } /* Trying to grow an empty file produces an error. */ TEST(VfsTruncate, emptyGrow, setUp, tearDown, 0, NULL) { struct fixture *f = data; sqlite3_file *file = __file_create_main_db(f); int rc; (void)params; /* Growing the file beyond its current size produces an error. */ rc = file->pMethods->xTruncate(file, 512); munit_assert_int(rc, ==, SQLITE_IOERR_TRUNCATE); rc = file->pMethods->xClose(file); munit_assert_int(rc, ==, 0); free(file); return MUNIT_OK; } /* Trying to truncate a main database file to a size which is not a multiple of * the page size produces an error. */ TEST(VfsTruncate, misaligned, setUp, tearDown, 0, NULL) { struct fixture *f = data; sqlite3_file *file = __file_create_main_db(f); void *buf_page_1 = __buf_page_1(); int rc; (void)params; /* Write the first page, containing the header. */ rc = file->pMethods->xWrite(file, buf_page_1, 512, 0); munit_assert_int(rc, ==, 0); /* Truncating to a size which is not a page multiple produces an error. */ rc = file->pMethods->xTruncate(file, 400); munit_assert_int(rc, ==, SQLITE_IOERR_TRUNCATE); free(buf_page_1); rc = file->pMethods->xClose(file); munit_assert_int(rc, ==, 0); free(file); return MUNIT_OK; } /****************************************************************************** * * xShmMap * ******************************************************************************/ SUITE(VfsShmMap); static char *test_shm_map_oom_delay[] = {"0", "1", NULL}; static char *test_shm_map_oom_repeat[] = {"1", NULL}; static MunitParameterEnum test_shm_map_oom_params[] = { {TEST_HEAP_FAULT_DELAY, test_shm_map_oom_delay}, {TEST_HEAP_FAULT_REPEAT, test_shm_map_oom_repeat}, {NULL, NULL}, }; /* Out of memory when trying to initialize the internal VFS shm data struct.
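 * The delay parameter takes the values 0 and 1, so the failure is injected in * turn at each of the first two allocations performed while setting up the * shared memory mapping.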
*/ TEST(VfsShmMap, oom, setUp, tearDown, 0, test_shm_map_oom_params) { struct fixture *f = data; sqlite3_file *file = __file_create_main_db(f); volatile void *region; int rc; (void)params; (void)data; test_heap_fault_enable(); rc = file->pMethods->xShmMap(file, 0, 32768, 1, &region); munit_assert_int(rc, ==, SQLITE_NOMEM); free(file); return MUNIT_OK; } /****************************************************************************** * * xShmLock * ******************************************************************************/ SUITE(VfsShmLock) /* If an exclusive lock is in place, getting a shared lock on any index of its * range fails. */ TEST(VfsShmLock, sharedBusy, setUp, tearDown, 0, vfs_params) { struct fixture *f = data; sqlite3_file *file = munit_malloc(f->vfs.szOsFile); int flags = SQLITE_OPEN_CREATE | SQLITE_OPEN_MAIN_DB; volatile void *region; int rc; (void)params; (void)data; vfsFillPath(f, "test.db"); rc = f->vfs.xOpen(&f->vfs, f->path, file, flags, &flags); munit_assert_int(rc, ==, 0); rc = file->pMethods->xShmMap(file, 0, 32768, 1, &region); munit_assert_int(rc, ==, 0); /* Take an exclusive lock on a range. */ flags = SQLITE_SHM_LOCK | SQLITE_SHM_EXCLUSIVE; rc = file->pMethods->xShmLock(file, 2, 3, flags); munit_assert_int(rc, ==, 0); /* Attempting to get a shared lock on an index in that range fails. */ flags = SQLITE_SHM_LOCK | SQLITE_SHM_SHARED; rc = file->pMethods->xShmLock(file, 3, 1, flags); munit_assert_int(rc, ==, SQLITE_BUSY); rc = file->pMethods->xClose(file); munit_assert_int(rc, ==, 0); free(file); return MUNIT_OK; } /* If a shared lock is in place on any of the indexes of the requested range, * getting an exclusive lock fails. */ TEST(VfsShmLock, exclBusy, setUp, tearDown, 0, vfs_params) { struct fixture *f = data; sqlite3_file *file = munit_malloc(f->vfs.szOsFile); int flags = SQLITE_OPEN_CREATE | SQLITE_OPEN_MAIN_DB; volatile void *region; int rc; (void)params; (void)data; vfsFillPath(f, "test.db"); rc = f->vfs.xOpen(&f->vfs, f->path, file, flags, &flags); munit_assert_int(rc, ==, 0); rc = file->pMethods->xShmMap(file, 0, 32768, 1, &region); munit_assert_int(rc, ==, 0); /* Take a shared lock on index 3. */ flags = SQLITE_SHM_LOCK | SQLITE_SHM_SHARED; rc = file->pMethods->xShmLock(file, 3, 1, flags); munit_assert_int(rc, ==, 0); /* Attempting to get an exclusive lock on a range that contains index 3 * fails. */ flags = SQLITE_SHM_LOCK | SQLITE_SHM_EXCLUSIVE; rc = file->pMethods->xShmLock(file, 2, 3, flags); munit_assert_int(rc, ==, SQLITE_BUSY); rc = file->pMethods->xClose(file); munit_assert_int(rc, ==, 0); free(file); return MUNIT_OK; } /* The native unix VFS implementation from SQLite allows releasing a shared * memory lock without acquiring it first. */ TEST(VfsShmLock, releaseUnix, setUp, tearDown, 0, vfs_params) { (void)data; struct sqlite3_vfs *vfs = sqlite3_vfs_find("unix"); sqlite3_file *file = munit_malloc(vfs->szOsFile); int flags = SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE | SQLITE_OPEN_MAIN_DB; char *dir = test_dir_setup(); char buf[1024]; char *path; volatile void *region; int rc; (void)params; (void)data; /* The SQLite pager stores the Database filename, Journal filename, and * WAL filename consecutively in memory, in that order. The database * filename is prefixed by four zero bytes.
Emulate that behavior here, * since the internal SQLite code triggered by the xShmMap unix * implementation relies on that. */ memset(buf, 0, sizeof buf); path = buf + 4; sprintf(path, "%s/test.db", dir); rc = vfs->xOpen(vfs, path, file, flags, &flags); munit_assert_int(rc, ==, 0); rc = file->pMethods->xShmMap(file, 0, 32768, 1, &region); munit_assert_int(rc, ==, 0); flags = SQLITE_SHM_UNLOCK | SQLITE_SHM_EXCLUSIVE; rc = file->pMethods->xShmLock(file, 3, 1, flags); munit_assert_int(rc, ==, 0); flags = SQLITE_SHM_UNLOCK | SQLITE_SHM_SHARED; rc = file->pMethods->xShmLock(file, 2, 1, flags); munit_assert_int(rc, ==, 0); rc = file->pMethods->xShmUnmap(file, 1); munit_assert_int(rc, ==, 0); rc = file->pMethods->xClose(file); munit_assert_int(rc, ==, 0); test_dir_tear_down(dir); free(file); return MUNIT_OK; } /* The dqlite VFS implementation allows releasing a shared memory lock without * acquiring it first. This is important because at open time sometimes SQLite * will do just that (release before acquire). */ TEST(VfsShmLock, release, setUp, tearDown, 0, vfs_params) { struct fixture *f = data; sqlite3_file *file = munit_malloc(f->vfs.szOsFile); int flags = SQLITE_OPEN_CREATE | SQLITE_OPEN_MAIN_DB; volatile void *region; int rc; (void)params; (void)data; vfsFillPath(f, "test.db"); rc = f->vfs.xOpen(&f->vfs, f->path, file, flags, &flags); munit_assert_int(rc, ==, 0); rc = file->pMethods->xShmMap(file, 0, 32768, 1, &region); munit_assert_int(rc, ==, 0); flags = SQLITE_SHM_UNLOCK | SQLITE_SHM_SHARED; rc = file->pMethods->xShmLock(file, 3, 1, flags); munit_assert_int(rc, ==, 0); flags = SQLITE_SHM_UNLOCK | SQLITE_SHM_SHARED; rc = file->pMethods->xShmLock(file, 2, 1, flags); munit_assert_int(rc, ==, 0); rc = file->pMethods->xShmUnmap(file, 1); munit_assert_int(rc, ==, 0); rc = file->pMethods->xClose(file); munit_assert_int(rc, ==, 0); free(file); return MUNIT_OK; } /****************************************************************************** * * xFileControl * ******************************************************************************/ SUITE(VfsFileControl) /* Trying to set the journal mode to anything other than "wal" produces an * error. */ TEST(VfsFileControl, journal, setUp, tearDown, 0, vfs_params) { struct fixture *f = data; sqlite3_file *file = __file_create_main_db(f); char *fnctl[] = { "", "journal_mode", "memory", "", }; int rc; (void)params; (void)data; /* Attempting to switch the journal mode to "memory" fails with * SQLITE_IOERR. */ rc = file->pMethods->xFileControl(file, SQLITE_FCNTL_PRAGMA, fnctl); munit_assert_int(rc, ==, SQLITE_IOERR); rc = file->pMethods->xClose(file); munit_assert_int(rc, ==, 0); free(file); /* Free the error message the VFS allocated with sqlite3_mprintf(). */ sqlite3_free(fnctl[0]); return MUNIT_OK; } /****************************************************************************** * * xCurrentTime * ******************************************************************************/ SUITE(VfsCurrentTime) TEST(VfsCurrentTime, success, setUp, tearDown, 0, vfs_params) { struct fixture *f = data; double now; int rc; (void)params; rc = f->vfs.xCurrentTime(&f->vfs, &now); munit_assert_int(rc, ==, SQLITE_OK); munit_assert_double(now, >, 0); return MUNIT_OK; } /****************************************************************************** * * xSleep * ******************************************************************************/ SUITE(VfsSleep) /* The xSleep implementation is a no-op.
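 * It does not actually block; it simply reports the requested interval back * to the caller, as the assertion below verifies.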
*/ TEST(VfsSleep, success, setUp, tearDown, 0, vfs_params) { struct fixture *f = data; int microseconds; (void)params; microseconds = f->vfs.xSleep(&f->vfs, 123); munit_assert_int(microseconds, ==, 123); return MUNIT_OK; } /****************************************************************************** * * VfsInit * ******************************************************************************/ SUITE(VfsInit); static char *test_create_oom_delay[] = {"0", NULL}; static char *test_create_oom_repeat[] = {"1", NULL}; static MunitParameterEnum test_create_oom_params[] = { {TEST_HEAP_FAULT_DELAY, test_create_oom_delay}, {TEST_HEAP_FAULT_REPEAT, test_create_oom_repeat}, {NULL, NULL}, }; TEST(VfsInit, oom, setUp, tearDown, 0, test_create_oom_params) { struct sqlite3_vfs vfs; int rv; (void)params; (void)data; test_heap_fault_enable(); rv = VfsInit(&vfs, "dqlite"); munit_assert_int(rv, ==, DQLITE_NOMEM); return MUNIT_OK; } /****************************************************************************** * * Integration * ******************************************************************************/ SUITE(VfsIntegration) /* Test our expectations on the memory-mapped WAL index format. */ TEST(VfsIntegration, wal, setUp, tearDown, 0, vfs_params) { sqlite3 *db1; sqlite3 *db2; uint32_t *read_marks; int i; (void)data; (void)params; return MUNIT_SKIP; db1 = __db_open(); db2 = __db_open(); __db_exec(db1, "CREATE TABLE test (n INT)"); munit_assert_int(__wal_idx_mx_frame(db1), ==, 2); read_marks = __wal_idx_read_marks(db1); munit_assert_uint32(read_marks[0], ==, 0); munit_assert_uint32(read_marks[1], ==, 0); munit_assert_uint32(read_marks[2], ==, 0xffffffff); munit_assert_uint32(read_marks[3], ==, 0xffffffff); munit_assert_uint32(read_marks[4], ==, 0xffffffff); free(read_marks); /* Start a read transaction on db2 */ __db_exec(db2, "BEGIN"); __db_exec(db2, "SELECT * FROM test"); /* The max frame is set to 2, which is the current size of the WAL. */ munit_assert_int(__wal_idx_mx_frame(db2), ==, 2); /* The starting mx frame value has been saved in the read marks */ read_marks = __wal_idx_read_marks(db2); munit_assert_uint32(read_marks[0], ==, 0); munit_assert_uint32(read_marks[1], ==, 2); munit_assert_uint32(read_marks[2], ==, 0xffffffff); munit_assert_uint32(read_marks[3], ==, 0xffffffff); munit_assert_uint32(read_marks[4], ==, 0xffffffff); free(read_marks); /* A shared lock is held on the second read mark (read locks start at * 3). */ munit_assert_true(__shm_shared_lock_held(db2, 3 + 1)); /* Start a write transaction on db1 */ __db_exec(db1, "BEGIN"); for (i = 0; i < 100; i++) { __db_exec(db1, "INSERT INTO test(n) VALUES(1)"); } /* The mx frame is still 2 since the transaction is not committed. */ munit_assert_int(__wal_idx_mx_frame(db1), ==, 2); /* No extra read mark was taken. */ read_marks = __wal_idx_read_marks(db1); munit_assert_uint32(read_marks[0], ==, 0); munit_assert_uint32(read_marks[1], ==, 2); munit_assert_uint32(read_marks[2], ==, 0xffffffff); munit_assert_uint32(read_marks[3], ==, 0xffffffff); munit_assert_uint32(read_marks[4], ==, 0xffffffff); free(read_marks); __db_exec(db1, "COMMIT"); /* The mx frame is now 6. */ munit_assert_int(__wal_idx_mx_frame(db1), ==, 6); /* The old read lock is still in place. */ munit_assert_true(__shm_shared_lock_held(db2, 3 + 1)); /* Start a read transaction on db1 */ __db_exec(db1, "BEGIN"); __db_exec(db1, "SELECT * FROM test"); /* The mx frame is still unchanged. */ munit_assert_int(__wal_idx_mx_frame(db1), ==, 6); /* A new read mark was taken.
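 * Read mark 2 now records mx frame 6 and is protected by a shared lock on * slot 3 + 2, as the assertions below verify.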
*/ read_marks = __wal_idx_read_marks(db1); munit_assert_uint32(read_marks[0], ==, 0); munit_assert_uint32(read_marks[1], ==, 2); munit_assert_uint32(read_marks[2], ==, 6); munit_assert_uint32(read_marks[3], ==, 0xffffffff); munit_assert_uint32(read_marks[4], ==, 0xffffffff); free(read_marks); /* The old read lock is still in place. */ munit_assert_true(__shm_shared_lock_held(db2, 3 + 1)); /* The new read lock is in place as well. */ munit_assert_true(__shm_shared_lock_held(db2, 3 + 2)); __db_close(db1); __db_close(db2); return MUNIT_OK; } /* Full checkpoints are possible only when no read mark is set. */ TEST(VfsIntegration, checkpoint, setUp, tearDown, 0, vfs_params) { sqlite3 *db1; sqlite3 *db2; sqlite3_file *file1; /* main DB file */ sqlite3_file *file2; /* WAL file */ sqlite_int64 size; uint32_t *read_marks; unsigned mx_frame; char stmt[128]; int log, ckpt; int i; int rv; (void)data; (void)params; return MUNIT_SKIP; db1 = __db_open(); __db_exec(db1, "CREATE TABLE test (n INT)"); /* Insert a few rows so we grow the size of the WAL. */ __db_exec(db1, "BEGIN"); for (i = 0; i < 500; i++) { sprintf(stmt, "INSERT INTO test(n) VALUES(%d)", i); __db_exec(db1, stmt); } __db_exec(db1, "COMMIT"); /* Get the file objects for the main database and the WAL. */ rv = sqlite3_file_control(db1, "main", SQLITE_FCNTL_FILE_POINTER, &file1); munit_assert_int(rv, ==, 0); rv = sqlite3_file_control(db1, "main", SQLITE_FCNTL_JOURNAL_POINTER, &file2); munit_assert_int(rv, ==, 0); /* The WAL file now has 13 frames. */ rv = file2->pMethods->xFileSize(file2, &size); munit_assert_int(formatWalCalcFramesNumber(512, size), ==, 13); mx_frame = __wal_idx_mx_frame(db1); munit_assert_int(mx_frame, ==, 13); /* Start a read transaction on a different connection, acquiring a * shared lock on all WAL pages. */ db2 = __db_open(); __db_exec(db2, "BEGIN"); __db_exec(db2, "SELECT * FROM test"); read_marks = __wal_idx_read_marks(db1); munit_assert_int(read_marks[1], ==, 13); free(read_marks); rv = file1->pMethods->xShmLock(file1, 3 + 1, 1, SQLITE_SHM_LOCK | SQLITE_SHM_EXCLUSIVE); munit_assert_int(rv, ==, SQLITE_BUSY); munit_assert_true(__shm_shared_lock_held(db1, 3 + 1)); /* Execute a new write transaction, deleting some of the pages we * inserted and creating new ones. */ __db_exec(db1, "BEGIN"); __db_exec(db1, "DELETE FROM test WHERE n > 200"); for (i = 0; i < 1000; i++) { sprintf(stmt, "INSERT INTO test(n) VALUES(%d)", i); __db_exec(db1, stmt); } __db_exec(db1, "COMMIT"); /* Since there's a shared read lock, a full checkpoint will fail. */ rv = sqlite3_wal_checkpoint_v2(db1, "main", SQLITE_CHECKPOINT_TRUNCATE, &log, &ckpt); munit_assert_int(rv, !=, 0); /* If we complete the read transaction the shared lock is released and * the checkpoint succeeds.
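 * With SQLITE_CHECKPOINT_TRUNCATE the WAL is also reset to zero length once * all of its frames have been copied back into the database file.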
*/ __db_exec(db2, "COMMIT"); rv = sqlite3_wal_checkpoint_v2(db1, "main", SQLITE_CHECKPOINT_TRUNCATE, &log, &ckpt); munit_assert_int(rv, ==, 0); __db_close(db1); __db_close(db2); return MUNIT_OK; } dqlite-1.16.7/test/unit/test_vfs2.c000066400000000000000000000233501465252713400171250ustar00rootroot00000000000000#pragma GCC diagnostic ignored "-Wformat-truncation" // XXX #include "../../src/vfs2.h" #include "../../src/lib/byte.h" #include "../lib/fs.h" #include "../lib/runner.h" #include <errno.h> #include <fcntl.h> #include <limits.h> #include <stdio.h> #include <stdlib.h> #include <sys/stat.h> #include <unistd.h> #define PAGE_SIZE 512 #define PAGE_SIZE_STR "512" SUITE(vfs2); struct fixture { sqlite3_vfs *vfs; char *dir; }; static void *set_up(const MunitParameter params[], void *user_data) { (void)params; (void)user_data; struct fixture *f = munit_malloc(sizeof(*f)); f->dir = test_dir_setup(); f->vfs = vfs2_make(sqlite3_vfs_find("unix"), "dqlite-vfs2"); munit_assert_ptr_not_null(f->vfs); sqlite3_vfs_register(f->vfs, 1 /* make default */); return f; } static void tear_down(void *data) { struct fixture *f = data; sqlite3_vfs_unregister(f->vfs); vfs2_destroy(f->vfs); test_dir_tear_down(f->dir); free(f); } static void prepare_wals(const char *dbname, const unsigned char *wal1, size_t wal1_len, const unsigned char *wal2, size_t wal2_len) { char buf[PATH_MAX]; ssize_t n; int rv; if (wal1 != NULL) { snprintf(buf, sizeof(buf), "%s-xwal1", dbname); int fd1 = open(buf, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH); munit_assert_int(fd1, !=, -1); rv = ftruncate(fd1, 0); munit_assert_int(rv, ==, 0); n = write(fd1, wal1, wal1_len); munit_assert_llong(n, ==, wal1_len); close(fd1); } if (wal2 != NULL) { snprintf(buf, sizeof(buf), "%s-xwal2", dbname); int fd2 = open(buf, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH); munit_assert_int(fd2, !=, -1); rv = ftruncate(fd2, 0); munit_assert_int(rv, ==, 0); n = write(fd2, wal2, wal2_len); munit_assert_llong(n, ==, wal2_len); close(fd2); } } static void check_wals(const char *dbname, off_t wal1_len, off_t wal2_len) { char buf[PATH_MAX]; struct stat st; int rv; snprintf(buf, sizeof(buf), "%s-xwal1", dbname); rv = stat(buf, &st); munit_assert_true((rv == 0 && st.st_size == wal1_len) || (rv < 0 && errno == ENOENT && wal1_len == 0)); snprintf(buf, sizeof(buf), "%s-xwal2", dbname); rv = stat(buf, &st); munit_assert_true((rv == 0 && st.st_size == wal2_len) || (rv < 0 && errno == ENOENT && wal2_len == 0)); } TEST(vfs2, basic, set_up, tear_down, 0, NULL) { struct fixture *f = data; int rv; char buf[PATH_MAX]; snprintf(buf, PATH_MAX, "%s/%s", f->dir, "test.db"); sqlite3 *db; rv = sqlite3_open(buf, &db); munit_assert_int(rv, ==, SQLITE_OK); rv = sqlite3_exec(db, "PRAGMA page_size=" PAGE_SIZE_STR ";" "PRAGMA journal_mode=WAL;" "PRAGMA wal_autocheckpoint=0", NULL, NULL, NULL); munit_assert_int(rv, ==, SQLITE_OK); char *args[] = {NULL, "page_size", NULL}; rv = sqlite3_file_control(db, "main", SQLITE_FCNTL_PRAGMA, args); sqlite3_file *fp; sqlite3_file_control(db, "main", SQLITE_FCNTL_FILE_POINTER, &fp); rv = vfs2_commit_barrier(fp); munit_assert_int(rv, ==, 0); rv = sqlite3_exec(db, "CREATE TABLE foo (bar INTEGER)", NULL, NULL, NULL); munit_assert_int(rv, ==, SQLITE_OK); struct vfs2_wal_slice sl; rv = vfs2_poll(fp, NULL, NULL, &sl); munit_assert_int(rv, ==, 0); rv = vfs2_unhide(fp); munit_assert_int(rv, ==, 0); munit_assert_uint32(sl.start, ==, 0); munit_assert_uint32(sl.len, ==, 2); rv = sqlite3_exec(db, "INSERT INTO foo (bar) VALUES (17)", NULL, NULL, NULL); munit_assert_int(rv, ==, SQLITE_OK);
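/* The INSERT above has been written to the WAL but not yet polled; the * vfs2_abort() call below should discard it, so the value 17 never becomes * visible (the SELECT further down sees only 22). */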
tracef("aborting..."); rv = vfs2_abort(fp); munit_assert_int(rv, ==, 0); rv = sqlite3_exec(db, "INSERT INTO foo (bar) values (22)", NULL, NULL, NULL); munit_assert_int(rv, ==, 0); rv = vfs2_poll(fp, NULL, NULL, &sl); munit_assert_int(rv, ==, 0); munit_assert_uint32(sl.start, ==, 2); munit_assert_uint32(sl.len, ==, 1); rv = vfs2_unhide(fp); munit_assert_int(rv, ==, 0); sqlite3_stmt *stmt; rv = sqlite3_prepare_v2(db, "SELECT * FROM foo", -1, &stmt, NULL); munit_assert_int(rv, ==, SQLITE_OK); rv = sqlite3_step(stmt); munit_assert_int(rv, ==, SQLITE_ROW); munit_assert_int(sqlite3_column_count(stmt), ==, 1); munit_assert_int(sqlite3_column_int(stmt, 0), ==, 22); rv = sqlite3_step(stmt); munit_assert_int(rv, ==, SQLITE_DONE); int nlog; int nckpt; rv = sqlite3_wal_checkpoint_v2(db, "main", SQLITE_CHECKPOINT_PASSIVE, &nlog, &nckpt); munit_assert_int(rv, ==, SQLITE_OK); munit_assert_int(nlog, ==, 3); munit_assert_int(nckpt, ==, 3); rv = sqlite3_exec(db, "INSERT INTO foo (bar) VALUES (101)", NULL, NULL, NULL); munit_assert_int(rv, ==, SQLITE_OK); rv = sqlite3_reset(stmt); munit_assert_int(rv, ==, SQLITE_OK); rv = sqlite3_step(stmt); munit_assert_int(rv, ==, SQLITE_ROW); munit_assert_int(sqlite3_column_count(stmt), ==, 1); munit_assert_int(sqlite3_column_int(stmt, 0), ==, 22); /* Can't see the new row yet. */ rv = sqlite3_step(stmt); munit_assert_int(rv, ==, SQLITE_DONE); struct vfs2_wal_frame *frames; unsigned n; rv = vfs2_poll(fp, &frames, &n, &sl); munit_assert_int(rv, ==, 0); munit_assert_uint(n, ==, 1); munit_assert_not_null(frames); munit_assert_not_null(frames[0].page); sqlite3_free(frames[0].page); sqlite3_free(frames); rv = vfs2_unhide(fp); munit_assert_int(rv, ==, 0); rv = sqlite3_reset(stmt); munit_assert_int(rv, ==, SQLITE_OK); rv = sqlite3_step(stmt); munit_assert_int(rv, ==, SQLITE_ROW); munit_assert_int(sqlite3_column_count(stmt), ==, 1); munit_assert_int(sqlite3_column_int(stmt, 0), ==, 22); rv = sqlite3_step(stmt); munit_assert_int(rv, ==, SQLITE_ROW); munit_assert_int(sqlite3_column_int(stmt, 0), ==, 101); rv = sqlite3_step(stmt); munit_assert_int(rv, ==, SQLITE_DONE); rv = sqlite3_finalize(stmt); munit_assert_int(rv, ==, SQLITE_OK); sqlite3_close(db); return MUNIT_OK; } #define WAL_SIZE_FROM_FRAMES(n) (32 + (24 + PAGE_SIZE) * (n)) static void make_wal_hdr(uint8_t *buf, uint32_t ckpoint_seqno, uint32_t salt1, uint32_t salt2) { uint8_t *p = buf; /* checksum */ BytePutBe32(0x377f0683, p); p += 4; BytePutBe32(3007000, p); p += 4; BytePutBe32(PAGE_SIZE, p); p += 4; BytePutBe32(ckpoint_seqno, p); p += 4; BytePutBe32(salt1, p); p += 4; BytePutBe32(salt2, p); p += 4; uint32_t s0 = 0; uint32_t s1 = 0; size_t off = 0; s0 += ByteGetBe32(buf + off) + s1; s1 += ByteGetBe32(buf + off + 4) + s0; off += 8; s0 += ByteGetBe32(buf + off) + s1; s1 += ByteGetBe32(buf + off + 4) + s0; off += 8; s0 += ByteGetBe32(buf + off) + s1; s1 += ByteGetBe32(buf + off + 4) + s0; off += 8; BytePutBe32(s0, p); p += 4; BytePutBe32(s1, p); p += 4; } TEST(vfs2, startup_one_nonempty, set_up, tear_down, 0, NULL) { struct fixture *f = data; char buf[PATH_MAX]; snprintf(buf, PATH_MAX, "%s/%s", f->dir, "test.db"); check_wals(buf, 0, 0); uint8_t wal2_hdronly[WAL_SIZE_FROM_FRAMES(0)] = {0}; make_wal_hdr(wal2_hdronly, 0, 17, 103); prepare_wals(buf, NULL, 0, wal2_hdronly, sizeof(wal2_hdronly)); sqlite3 *db; tracef("opening..."); int rv = sqlite3_open(buf, &db); munit_assert_int(rv, ==, SQLITE_OK); tracef("setup..."); rv = sqlite3_exec(db, "PRAGMA page_size=" PAGE_SIZE_STR ";" "PRAGMA journal_mode=WAL;" "PRAGMA 
wal_autocheckpoint=0", NULL, NULL, NULL); munit_assert_int(rv, ==, SQLITE_OK); sqlite3_file *fp; sqlite3_file_control(db, "main", SQLITE_FCNTL_FILE_POINTER, &fp); tracef("barrier..."); rv = vfs2_commit_barrier(fp); munit_assert_int(rv, ==, 0); tracef("create table..."); rv = sqlite3_exec(db, "CREATE TABLE foo (n INTEGER)", NULL, NULL, NULL); munit_assert_int(rv, ==, SQLITE_OK); tracef("closing..."); rv = sqlite3_close(db); munit_assert_int(rv, ==, SQLITE_OK); check_wals(buf, WAL_SIZE_FROM_FRAMES(2), WAL_SIZE_FROM_FRAMES(0)); return MUNIT_OK; } TEST(vfs2, startup_both_nonempty, set_up, tear_down, 0, NULL) { struct fixture *f = data; char buf[PATH_MAX]; snprintf(buf, PATH_MAX, "%s/%s", f->dir, "test.db"); check_wals(buf, 0, 0); uint8_t wal1_hdronly[WAL_SIZE_FROM_FRAMES(0)] = {0}; make_wal_hdr(wal1_hdronly, 0, 18, 103); uint8_t wal2_hdronly[WAL_SIZE_FROM_FRAMES(0)] = {0}; make_wal_hdr(wal2_hdronly, 0, 17, 103); prepare_wals(buf, wal1_hdronly, sizeof(wal1_hdronly), wal2_hdronly, sizeof(wal2_hdronly)); sqlite3 *db; int rv = sqlite3_open(buf, &db); munit_assert_int(rv, ==, SQLITE_OK); rv = sqlite3_exec(db, "PRAGMA page_size=" PAGE_SIZE_STR ";" "PRAGMA journal_mode=WAL;" "PRAGMA wal_autocheckpoint=0", NULL, NULL, NULL); munit_assert_int(rv, ==, SQLITE_OK); rv = sqlite3_exec(db, "CREATE TABLE foo (n INTEGER)", NULL, NULL, NULL); munit_assert_int(rv, ==, SQLITE_OK); rv = sqlite3_close(db); munit_assert_int(rv, ==, SQLITE_OK); check_wals(buf, WAL_SIZE_FROM_FRAMES(0), WAL_SIZE_FROM_FRAMES(2)); return MUNIT_OK; } TEST(vfs2, rollback, set_up, tear_down, 0, NULL) { struct fixture *f = data; char buf[PATH_MAX]; snprintf(buf, PATH_MAX, "%s/%s", f->dir, "test.db"); sqlite3 *db; int rv = sqlite3_open(buf, &db); munit_assert_int(rv, ==, SQLITE_OK); rv = sqlite3_exec(db, "PRAGMA journal_mode=WAL;" "PRAGMA wal_autocheckpoint=0", NULL, NULL, NULL); munit_assert_int(rv, ==, SQLITE_OK); rv = sqlite3_exec(db, "CREATE TABLE foo (n INTEGER)", NULL, NULL, NULL); munit_assert_int(rv, ==, SQLITE_OK); sqlite3_file *fp; sqlite3_file_control(db, "main", SQLITE_FCNTL_FILE_POINTER, &fp); struct vfs2_wal_slice sl; rv = vfs2_poll(fp, NULL, NULL, &sl); munit_assert_int(rv, ==, 0); rv = vfs2_unhide(fp); rv = sqlite3_exec(db, "BEGIN", NULL, NULL, NULL); munit_assert_int(rv, ==, SQLITE_OK); char sql[100]; for (unsigned i = 0; i < 500; i++) { snprintf(sql, sizeof(sql), "INSERT INTO foo (n) VALUES (%d)", i); rv = sqlite3_exec(db, sql, NULL, NULL, NULL); munit_assert_int(rv, ==, SQLITE_OK); } rv = sqlite3_exec(db, "ROLLBACK", NULL, NULL, NULL); munit_assert_int(rv, ==, SQLITE_OK); rv = sqlite3_close(db); munit_assert_int(rv, ==, SQLITE_OK); return MUNIT_OK; }