==> dqlite-1.16.7/.clang-format <==
BasedOnStyle: Chromium
BreakBeforeBraces: Custom
BraceWrapping:
AfterFunction: true
AfterStruct: false
Cpp11BracedListStyle: false
IndentWidth: 8
UseTab: ForContinuationAndIndentation
PointerAlignment: Right
AllowAllParametersOfDeclarationOnNextLine: false

==> dqlite-1.16.7/.clang-tidy <==
Checks: '-*,readability-identifier-naming'
HeaderFilterRegex: '.*'
WarningsAsErrors: '*'
CheckOptions:
- key: readability-identifier-naming.StructCase
value: lower_case
- key: readability-identifier-naming.UnionCase
value: lower_case
- key: readability-identifier-naming.FunctionCase
value: lower_case
- key: readability-identifier-naming.TypedefCase
value: lower_case

==> dqlite-1.16.7/.dir-locals.el <==
((nil . ((fill-column . 80)))
(c-mode . ((c-file-style . "linux-tabs-only")
(flycheck-gcc-definitions . ("_GNU_SOURCE"))
(flycheck-clang-definitions . ("_GNU_SOURCE")))))

==> dqlite-1.16.7/.github/dependabot.yml <==
# Set update schedule for GitHub Actions
# for more info see: https://docs.github.com/en/code-security/dependabot/working-with-dependabot/keeping-your-actions-up-to-date-with-dependabot
version: 2
updates:
- package-ecosystem: "github-actions"
directory: "/" # checks for workflow files in .github/workflows
schedule:
interval: "weekly"

==> dqlite-1.16.7/.github/workflows/build-and-test.yml <==
name: CI Tests
on:
- push
- pull_request
jobs:
build-and-test:
strategy:
fail-fast: false
matrix:
os:
- ubuntu-20.04
- ubuntu-22.04
- ubuntu-24.04
compiler:
- gcc
- clang
dqlite-next:
- yes
- no
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
- name: Set up dependencies
run: |
sudo apt update
sudo apt install -y lcov libsqlite3-dev liblz4-dev libuv1-dev
- name: Build dqlite
env:
CC: ${{ matrix.compiler }}
run: |
autoreconf -i
./configure --enable-debug --enable-code-coverage --enable-sanitize \
--enable-build-raft --enable-dqlite-next=${{ matrix.dqlite-next }}
make -j4 unit-test integration-test \
raft-core-fuzzy-test \
raft-core-integration-test \
raft-core-unit-test \
raft-uv-integration-test \
raft-uv-unit-test
- name: Test
env:
CC: ${{ matrix.compiler }}
LIBDQLITE_TRACE: 1
run: |
make check || (cat ./test-suite.log && false)
- name: Coverage
env:
CC: ${{ matrix.compiler }}
if: ${{ matrix.os == 'ubuntu-22.04' && matrix.compiler == 'gcc' }}
run: |
make code-coverage-capture
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v4
with:
verbose: true

==> dqlite-1.16.7/.github/workflows/cla-check.yml <==
name: Canonical CLA
on:
- pull_request
jobs:
cla-check:
runs-on: ubuntu-20.04
steps:
- name: Check if CLA signed
uses: canonical/has-signed-canonical-cla@v1

==> dqlite-1.16.7/.github/workflows/coverity.yml <==
name: Coverity
on:
push:
branches:
- master
jobs:
test:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Download Coverity Build Tool
run: |
wget -q https://scan.coverity.com/download/cxx/linux64 --post-data "token=$TOKEN&project=canonical/dqlite" -O cov-analysis-linux64.tar.gz
mkdir cov-analysis-linux64
tar xzf cov-analysis-linux64.tar.gz --strip 1 -C cov-analysis-linux64
env:
TOKEN: ${{ secrets.COVERITY_SCAN_TOKEN }}
- name: Install dependencies
run: |
sudo apt-get update -qq
sudo apt-get install -qq gcc libsqlite3-dev liblz4-dev libuv1-dev
- name: Run coverity
run: |
export PATH="$(pwd)/cov-analysis-linux64/bin:${PATH}"
# Configure
autoreconf -i
mkdir build
cd build
../configure --enable-build-raft
# Build
cov-build --dir cov-int make -j4
tar czvf dqlite.tgz cov-int
# Submit the results
curl \
--form project=canonical/dqlite \
--form token=${TOKEN} \
--form email=mathieu.bordere@canonical.com \
--form file=@dqlite.tgz \
--form version=master \
--form description="${GITHUB_SHA}" \
https://scan.coverity.com/builds?project=canonical/dqlite
env:
TOKEN: ${{ secrets.COVERITY_SCAN_TOKEN }}

==> dqlite-1.16.7/.github/workflows/downstream.yml <==
name: Downstream checks
on:
issue_comment:
types: [created, edited]
jobs:
dqlite:
if: ${{ github.event.issue.pull_request && contains(github.event.comment.body, 'please test downstream') }}
runs-on: ubuntu-22.04
steps:
- name: Install apt deps
run: |
sudo apt-get update -qq
sudo apt-get install -qq automake libtool gcc make liblz4-dev libuv1-dev libsqlite3-dev
- name: Check out libbacktrace
uses: actions/checkout@v4
with:
repository: ianlancetaylor/libbacktrace
path: libbacktrace
- name: Install libbacktrace
run: |
cd libbacktrace
autoreconf -i
./configure
sudo make -j$(nproc) install
sudo ldconfig
- name: Check out dqlite
uses: actions/checkout@v4
with:
ref: refs/pull/${{ github.event.issue.number }}/merge
path: dqlite
- name: Install dqlite
run: |
cd dqlite
autoreconf -i
./configure --enable-debug --enable-sanitize --enable-backtrace --enable-build-raft
sudo make -j$(nproc)
sudo make install
sudo ldconfig
- name: Install Go
uses: actions/setup-go@v5
- name: Check out go-dqlite
uses: actions/checkout@v4
with:
repository: canonical/go-dqlite
path: go-dqlite
- name: Test go-dqlite
env:
GO_DQLITE_MULTITHREAD: '1'
run: |
cd go-dqlite
go get -tags libsqlite3 -t ./...
go test -asan -v ./...
VERBOSE=1 ASAN=-asan ./test/dqlite-demo.sh
VERBOSE=1 ASAN=-asan DISK=1 ./test/dqlite-demo.sh
VERBOSE=1 ASAN=-asan ./test/roles.sh
VERBOSE=1 ASAN=-asan DISK=1 ./test/roles.sh
VERBOSE=1 ASAN=-asan ./test/recover.sh
VERBOSE=1 ASAN=-asan DISK=1 ./test/recover.sh
jepsen:
if: ${{ github.event.issue.pull_request && contains(github.event.comment.body, 'please test downstream') }}
uses: canonical/jepsen.dqlite/.github/workflows/test-build-run.yml@master
with:
dqlite-ref: refs/pull/${{ github.event.issue.number }}/head
workloads: >
['append', 'bank', 'set']
nemeses: >
['none', 'partition', 'kill', 'stop', 'disk', 'member',
'partition,stop', 'partition,kill', 'partition,member',
'packet,stop', 'pause']
disk: >
['0']

==> dqlite-1.16.7/.github/workflows/external-raft.yml <==
name: CI Tests (external libraft)
on:
- push
- pull_request
jobs:
build-and-test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Setup dependencies
run: |
sudo apt update
sudo apt install -y libsqlite3-dev liblz4-dev libuv1-dev
- name: Build raft
run: |
git clone https://github.com/canonical/raft --depth 1
cd raft
autoreconf -i
./configure --enable-debug --enable-sanitize
make -j4
sudo make install
sudo ldconfig
- name: Build dqlite
run: |
autoreconf -i
./configure --enable-debug --enable-sanitize
make -j4
- name: Test
run: |
export LIBRAFT_TRACE=1 LIBDQLITE_TRACE=1
make -j4 check || (cat ./test-suite.log && false)

==> dqlite-1.16.7/.github/workflows/latest-deps.yml <==
name: CI Tests (latest deps)
on:
- push
- pull_request
jobs:
build-and-test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Raise aio-max-nr
run: |
sysctl fs.aio-max-nr
sudo sysctl -w fs.aio-max-nr=1000000
- name: Install latest libuv
run: |
version="$(curl -L 'https://dist.libuv.org/dist' | grep -o 'v[0-9]\.[0-9]\{1,2\}\.[0-9]\{1,2\}' | sort -V -r | head -n1)"
echo "Selected libuv $version"
curl -LO "https://dist.libuv.org/dist/$version/libuv-$version.tar.gz"
tar xzf "libuv-$version.tar.gz"
cd "libuv-$version"
sh autogen.sh
./configure
make -j4
sudo make install
- name: Install latest liblz4
run: |
mkdir lz4
cd lz4
git init
git remote add github 'https://github.com/lz4/lz4'
git fetch github 'refs/tags/*:refs/tags/*'
version="$(git tag | sort -V -r | head -n1)"
echo "Selected lz4 $version"
git checkout "$version"
make -j4
sudo make install
- name: ldconfig
run: |
sudo ldconfig
- name: Get latest SQLite
run: |
relative="$(curl -L 'https://sqlite.org/download.html' | grep '^PRODUCT' | grep 'amalgamation' | cut -d',' -f3)"
curl -LO "https://sqlite.org/$relative"
name="$(basename "$relative" .zip)"
echo "Selected $name"
unzip "$name.zip"
cd "$name"
cp sqlite3.{c,h} "$GITHUB_WORKSPACE"
- name: Build dqlite
run: |
autoreconf -i
./configure --enable-debug --enable-sanitize --enable-build-raft --enable-build-sqlite
make -j4 unit-test integration-test \
raft-core-fuzzy-test \
raft-core-integration-test \
raft-core-unit-test \
raft-uv-integration-test \
raft-uv-unit-test
ldd .libs/libdqlite.so
- name: Test
run: |
export LIBDQLITE_TRACE=1
make check || (cat ./test-suite.log && false)

==> dqlite-1.16.7/.github/workflows/linting.yml <==
name: Linting
on:
- push
- pull_request
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 2
- name: Install apt dependencies
run: |
sudo apt update
sudo apt install -y libsqlite3-dev liblz4-dev libuv1-dev bear
- uses: KyleMayes/install-llvm-action@master
with:
version: 17
- name: Run clang-format
run: |
find . \( -name '*.c' -or -name '*.h' \) -not -name 'munit.*' -path ./llvm -prune | xargs ./llvm/bin/clang-format --style=file --dry-run -Werror
- name: Run clang-tidy
run: |
shopt -s globstar
bear -- cc -D_GNU_SOURCE -DHAVE_LINUX_AIO_ABI_H -c {src,test}/**/*.c
git show -U0 --first-parent | ./clang-tidy-diff.py -p1 -config-file=.clang-tidy -clang-tidy-binary=./llvm/bin/clang-tidy -use-color

==> dqlite-1.16.7/.github/workflows/packages.yml <==
name: Build PPA source packages
on:
push:
branches:
- master
jobs:
build:
if: github.repository == 'canonical/dqlite'
strategy:
fail-fast: false
matrix:
target:
- focal
- jammy
- mantic
- noble
runs-on: ubuntu-20.04
environment:
name: ppa
steps:
- name: Clone the repositories
run: |
git clone https://github.com/canonical/dqlite
git clone https://github.com/canonical/dqlite-ppa -b dqlite --depth 1
- name: Setup dependencies
run: |
sudo apt-get update -qq
sudo apt-get install -qq debhelper devscripts gnupg
- name: Setup GPG signing key
env:
PPA_SECRET_KEY: ${{ secrets.PPA_SECRET_KEY }}
run: |
echo "$PPA_SECRET_KEY" > private-key.asc
gpg --import --batch private-key.asc
- name: Delete GPG signing key file
if: always()
run: |
rm -f private-key.asc
- name: Build source package
env:
DEBFULLNAME: "Github Actions"
DEBEMAIL: "dqlitebot@lists.canonical.com"
TARGET: ${{ matrix.target }}
run: |
cp -R dqlite-ppa/debian dqlite/
cd dqlite/
VERSION="$(git describe --tags | sed -e "s/^v//" -e "s/-/+git/")"
dch --create \
--distribution ${TARGET} \
--package dqlite \
--newversion ${VERSION}~${TARGET}1 \
"Automatic build from Github"
debuild -S -sa -d -k${{ vars.PPA_PUBLIC_KEY }}
- name: Upload to Launchpad
run: |
dput -U -u ppa:dqlite/dev *.changes

==> dqlite-1.16.7/.gitignore <==
*.a
*.gcda
*.gcno
*.la
*.lo
*.log
*.o
*.so
*.trs
.deps
.dirstamp
.libs
Makefile
Makefile.in
aclocal.m4
aminclude_static.am
autom4te*.cache
confdefs.h
config.status
configure
coverage/
coverage.info
unit-test
integration-test
dqlite.pc
libtool
stamp-h*
sqlite3.c
raft-core-fuzzy-test
raft-core-integration-test
raft-core-unit-test
raft-uv-integration-test
raft-uv-unit-test

==> dqlite-1.16.7/AUTHORS <==
Unless mentioned otherwise in a specific file's header, all code in this
project is released under the LGPL v3 license.
The list of authors and contributors can be retrieved from the git
commit history and in some cases, the file headers.

==> dqlite-1.16.7/CODE_OF_CONDUCT.md <==
dqlite has adopted the [Ubuntu Code of Conduct][coc].
[coc]: https://ubuntu.com/community/ethos/code-of-conduct

==> dqlite-1.16.7/CONTRIBUTING.md <==
# Contributing to dqlite
The dqlite team welcomes external contributions via GitHub pull
requests. To get your PR merged, you need to sign [Canonical's
contributor license agreement (CLA)][cla]. This is straightforward to
do once you have an account on [Launchpad][lp]; if you don't, you can
create one [here][signup].
[cla]: https://ubuntu.com/legal/contributors
[lp]: https://launchpad.net
[signup]: https://launchpad.net/+login

==> dqlite-1.16.7/Dockerfile <==
# FROM debian:buster-slim as dqlite-lib-builder
FROM ubuntu as dqlite-lib-builder
ARG DEBIAN_FRONTEND="noninteractive"
ENV TZ=Europe/London
ENV LD_LIBRARY_PATH=/usr/local/lib
ENV GOROOT=/usr/local/go
ENV GOPATH=/go
ENV PATH=$GOPATH/bin:$GOROOT/bin:$PATH
RUN apt-get update && apt-get install -y git build-essential dh-autoreconf pkg-config libuv1-dev libsqlite3-dev liblz4-dev tcl8.6 wget
WORKDIR /opt
RUN git clone https://github.com/canonical/raft.git && \
git clone https://github.com/canonical/go-dqlite.git && \
wget -c https://golang.org/dl/go1.15.2.linux-amd64.tar.gz -O - | tar -xzf - -C /usr/local
WORKDIR /opt/raft
RUN autoreconf -i && ./configure && make && make install
WORKDIR /opt/dqlite
COPY . .
RUN autoreconf -i && ./configure && make && make install
WORKDIR /opt/go-dqlite
RUN go get -d -v ./... && \
go install -tags libsqlite3 ./cmd/dqlite-demo && \
go install -tags libsqlite3 ./cmd/dqlite
# FROM debian:buster-slim
FROM ubuntu
ARG DEBIAN_FRONTEND="noninteractive"
ENV TZ=Europe/London
ENV LD_LIBRARY_PATH=/usr/local/lib
ENV PATH=/opt:$PATH
COPY --from=dqlite-lib-builder /go/bin /opt/
COPY --from=dqlite-lib-builder /usr/local/lib /usr/local/lib
COPY --from=dqlite-lib-builder \
/usr/lib/x86_64-linux-gnu/libuv.so \
/usr/lib/x86_64-linux-gnu/libuv.so.1\
/usr/lib/x86_64-linux-gnu/libuv.so.1.0.0\
/usr/lib/
COPY --from=dqlite-lib-builder \
/lib/x86_64-linux-gnu/libsqlite3.so \
/lib/x86_64-linux-gnu/libsqlite3.so.0 \
/usr/lib/x86_64-linux-gnu/

==> dqlite-1.16.7/LICENSE <==
All files in this repository are licensed as follows. If you contribute
to this repository, it is assumed that you license your contribution
under the same license unless you state otherwise.
All files Copyright (C) 2017-2019 Canonical Ltd. unless otherwise specified in
the file.
This software is licensed under the LGPLv3, included below.
As a special exception to the GNU Lesser General Public License version 3
("LGPL3"), the copyright holders of this Library give you permission to
convey to a third party a Combined Work that links statically or dynamically
to this Library without providing any Minimal Corresponding Source or
Minimal Application Code as set out in 4d or providing the installation
information set out in section 4e, provided that you comply with the other
provisions of LGPL3 and provided that you meet, for the Application the
terms and conditions of the license(s) which apply to the Application.
Except as stated in this special exception, the provisions of LGPL3 will
continue to comply in full to this Library. If you modify this Library, you
may apply this exception to your version of this Library, but you are not
obliged to do so. If you do not wish to do so, delete this exception
statement from your version. This exception does not (and cannot) modify any
license terms which apply to the Application, with which you must still
comply.
GNU LESSER GENERAL PUBLIC LICENSE
Version 3, 29 June 2007
Copyright (C) 2007 Free Software Foundation, Inc.
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
This version of the GNU Lesser General Public License incorporates
the terms and conditions of version 3 of the GNU General Public
License, supplemented by the additional permissions listed below.
0. Additional Definitions.
As used herein, "this License" refers to version 3 of the GNU Lesser
General Public License, and the "GNU GPL" refers to version 3 of the GNU
General Public License.
"The Library" refers to a covered work governed by this License,
other than an Application or a Combined Work as defined below.
An "Application" is any work that makes use of an interface provided
by the Library, but which is not otherwise based on the Library.
Defining a subclass of a class defined by the Library is deemed a mode
of using an interface provided by the Library.
A "Combined Work" is a work produced by combining or linking an
Application with the Library. The particular version of the Library
with which the Combined Work was made is also called the "Linked
Version".
The "Minimal Corresponding Source" for a Combined Work means the
Corresponding Source for the Combined Work, excluding any source code
for portions of the Combined Work that, considered in isolation, are
based on the Application, and not on the Linked Version.
The "Corresponding Application Code" for a Combined Work means the
object code and/or source code for the Application, including any data
and utility programs needed for reproducing the Combined Work from the
Application, but excluding the System Libraries of the Combined Work.
1. Exception to Section 3 of the GNU GPL.
You may convey a covered work under sections 3 and 4 of this License
without being bound by section 3 of the GNU GPL.
2. Conveying Modified Versions.
If you modify a copy of the Library, and, in your modifications, a
facility refers to a function or data to be supplied by an Application
that uses the facility (other than as an argument passed when the
facility is invoked), then you may convey a copy of the modified
version:
a) under this License, provided that you make a good faith effort to
ensure that, in the event an Application does not supply the
function or data, the facility still operates, and performs
whatever part of its purpose remains meaningful, or
b) under the GNU GPL, with none of the additional permissions of
this License applicable to that copy.
3. Object Code Incorporating Material from Library Header Files.
The object code form of an Application may incorporate material from
a header file that is part of the Library. You may convey such object
code under terms of your choice, provided that, if the incorporated
material is not limited to numerical parameters, data structure
layouts and accessors, or small macros, inline functions and templates
(ten or fewer lines in length), you do both of the following:
a) Give prominent notice with each copy of the object code that the
Library is used in it and that the Library and its use are
covered by this License.
b) Accompany the object code with a copy of the GNU GPL and this license
document.
4. Combined Works.
You may convey a Combined Work under terms of your choice that,
taken together, effectively do not restrict modification of the
portions of the Library contained in the Combined Work and reverse
engineering for debugging such modifications, if you also do each of
the following:
a) Give prominent notice with each copy of the Combined Work that
the Library is used in it and that the Library and its use are
covered by this License.
b) Accompany the Combined Work with a copy of the GNU GPL and this license
document.
c) For a Combined Work that displays copyright notices during
execution, include the copyright notice for the Library among
these notices, as well as a reference directing the user to the
copies of the GNU GPL and this license document.
d) Do one of the following:
0) Convey the Minimal Corresponding Source under the terms of this
License, and the Corresponding Application Code in a form
suitable for, and under terms that permit, the user to
recombine or relink the Application with a modified version of
the Linked Version to produce a modified Combined Work, in the
manner specified by section 6 of the GNU GPL for conveying
Corresponding Source.
1) Use a suitable shared library mechanism for linking with the
Library. A suitable mechanism is one that (a) uses at run time
a copy of the Library already present on the user's computer
system, and (b) will operate properly with a modified version
of the Library that is interface-compatible with the Linked
Version.
e) Provide Installation Information, but only if you would otherwise
be required to provide such information under section 6 of the
GNU GPL, and only to the extent that such information is
necessary to install and execute a modified version of the
Combined Work produced by recombining or relinking the
Application with a modified version of the Linked Version. (If
you use option 4d0, the Installation Information must accompany
the Minimal Corresponding Source and Corresponding Application
Code. If you use option 4d1, you must provide the Installation
Information in the manner specified by section 6 of the GNU GPL
for conveying Corresponding Source.)
5. Combined Libraries.
You may place library facilities that are a work based on the
Library side by side in a single library together with other library
facilities that are not Applications and are not covered by this
License, and convey such a combined library under terms of your
choice, if you do both of the following:
a) Accompany the combined library with a copy of the same work based
on the Library, uncombined with any other library facilities,
conveyed under the terms of this License.
b) Give prominent notice with the combined library that part of it
is a work based on the Library, and explaining where to find the
accompanying uncombined form of the same work.
6. Revised Versions of the GNU Lesser General Public License.
The Free Software Foundation may publish revised and/or new versions
of the GNU Lesser General Public License from time to time. Such new
versions will be similar in spirit to the present version, but may
differ in detail to address new problems or concerns.
Each version is given a distinguishing version number. If the
Library as you received it specifies that a certain numbered version
of the GNU Lesser General Public License "or any later version"
applies to it, you have the option of following the terms and
conditions either of that published version or of any later version
published by the Free Software Foundation. If the Library as you
received it does not specify a version number of the GNU Lesser
General Public License, you may choose any version of the GNU Lesser
General Public License ever published by the Free Software Foundation.
If the Library as you received it specifies that a proxy can decide
whether future versions of the GNU Lesser General Public License shall
apply, that proxy's public statement of acceptance of any version is
permanent authorization for you to choose that version for the
Library.

==> dqlite-1.16.7/Makefile.am <==
ACLOCAL_AMFLAGS = -I m4
AM_CFLAGS += $(CODE_COVERAGE_CFLAGS)
AM_CFLAGS += $(SQLITE_CFLAGS) $(UV_CFLAGS) $(PTHREAD_CFLAGS)
AM_LDFLAGS = $(UV_LIBS) $(PTHREAD_LIBS)
if DQLITE_NEXT_ENABLED
AM_CFLAGS += -DDQLITE_NEXT
endif
if !BUILD_RAFT_ENABLED
AM_CFLAGS += $(RAFT_CFLAGS) -DUSE_SYSTEM_RAFT
AM_LDFLAGS += $(RAFT_LIBS)
endif
if DEBUG_ENABLED
AM_CFLAGS += -O0
else
AM_CFLAGS += -O2
endif
if SANITIZE_ENABLED
AM_CFLAGS += -fsanitize=address
endif
if BACKTRACE_ENABLED
AM_CFLAGS += -DDQLITE_ASSERT_WITH_BACKTRACE -DRAFT_ASSERT_WITH_BACKTRACE
AM_LDFLAGS += -lbacktrace
endif
include_HEADERS = include/dqlite.h
basic_dqlite_sources = \
src/bind.c \
src/client/protocol.c \
src/command.c \
src/conn.c \
src/db.c \
src/dqlite.c \
src/error.c \
src/format.c \
src/fsm.c \
src/gateway.c \
src/id.c \
src/leader.c \
src/lib/addr.c \
src/lib/buffer.c \
src/lib/fs.c \
src/lib/sm.c \
src/lib/threadpool.c \
src/lib/transport.c \
src/logger.c \
src/message.c \
src/metrics.c \
src/config.c \
src/query.c \
src/registry.c \
src/request.c \
src/response.c \
src/roles.c \
src/server.c \
src/stmt.c \
src/tracing.c \
src/transport.c \
src/translate.c \
src/tuple.c \
src/vfs.c \
src/vfs2.c
lib_LTLIBRARIES = libdqlite.la
libdqlite_la_CFLAGS = $(AM_CFLAGS) -fvisibility=hidden -DRAFT_API=''
libdqlite_la_LDFLAGS = $(AM_LDFLAGS) -version-info 0:1:0
libdqlite_la_SOURCES = $(basic_dqlite_sources)
if BUILD_RAFT_ENABLED
libraft_la_SOURCES = \
src/raft/byte.c \
src/raft/callbacks.c \
src/raft/client.c \
src/raft/compress.c \
src/raft/configuration.c \
src/raft/convert.c \
src/raft/election.c \
src/raft/entry.c \
src/raft/err.c \
src/raft/fixture.c \
src/raft/flags.c \
src/raft/heap.c \
src/raft/lifecycle.c \
src/raft/log.c \
src/raft/membership.c \
src/raft/progress.c \
src/raft/raft.c \
src/raft/recv.c \
src/raft/recv_append_entries.c \
src/raft/recv_append_entries_result.c \
src/raft/recv_request_vote.c \
src/raft/recv_request_vote_result.c \
src/raft/recv_install_snapshot.c \
src/raft/recv_timeout_now.c \
src/raft/replication.c \
src/raft/snapshot.c \
src/raft/start.c \
src/raft/state.c \
src/raft/syscall.c \
src/raft/tick.c \
src/raft/uv.c \
src/raft/uv_append.c \
src/raft/uv_encoding.c \
src/raft/uv_finalize.c \
src/raft/uv_fs.c \
src/raft/uv_ip.c \
src/raft/uv_list.c \
src/raft/uv_metadata.c \
src/raft/uv_os.c \
src/raft/uv_prepare.c \
src/raft/uv_recv.c \
src/raft/uv_segment.c \
src/raft/uv_send.c \
src/raft/uv_snapshot.c \
src/raft/uv_tcp.c \
src/raft/uv_tcp_listen.c \
src/raft/uv_tcp_connect.c \
src/raft/uv_truncate.c \
src/raft/uv_work.c \
src/raft/uv_writer.c
libdqlite_la_SOURCES += $(libraft_la_SOURCES)
endif # BUILD_RAFT_ENABLED
check_PROGRAMS = unit-test integration-test
check_LTLIBRARIES = libtest.la
libtest_la_CFLAGS = $(AM_CFLAGS) -DMUNIT_TEST_NAME_LEN=60 -Wno-unknown-warning-option -Wno-unused-result -Wno-conversion -Wno-uninitialized -Wno-maybe-uninitialized -Wno-strict-prototypes -Wno-old-style-definition
libtest_la_SOURCES = \
test/lib/endpoint.c \
test/lib/fault.c \
test/lib/fs.c \
test/lib/heap.c \
test/lib/logger.c \
test/lib/munit.c \
test/lib/raft_heap.c \
test/lib/server.c \
test/lib/sqlite.c \
test/lib/uv.c
unit_test_SOURCES = $(basic_dqlite_sources)
unit_test_SOURCES += \
test/test_error.c \
test/test_integration.c \
test/unit/ext/test_uv.c \
test/unit/ext/test_uv_pool.c \
test/unit/lib/test_addr.c \
test/unit/lib/test_buffer.c \
test/unit/lib/test_byte.c \
test/unit/lib/test_registry.c \
test/unit/lib/test_serialize.c \
test/unit/lib/test_transport.c \
test/unit/test_command.c \
test/unit/test_conn.c \
test/unit/test_gateway.c \
test/unit/test_concurrency.c \
test/unit/test_registry.c \
test/unit/test_replication.c \
test/unit/test_request.c \
test/unit/test_role_management.c \
test/unit/test_sm.c \
test/unit/test_tuple.c \
test/unit/test_vfs.c \
test/unit/test_vfs2.c \
test/unit/main.c
unit_test_CFLAGS = $(AM_CFLAGS) -Wno-unknown-warning-option -Wno-uninitialized -Wno-maybe-uninitialized -Wno-float-equal -Wno-conversion
unit_test_LDFLAGS = $(AM_LDFLAGS)
unit_test_LDADD = libtest.la
if BUILD_RAFT_ENABLED
unit_test_LDADD += libraft.la
endif
integration_test_SOURCES = \
test/integration/test_client.c \
test/integration/test_cluster.c \
test/integration/test_fsm.c \
test/integration/test_membership.c \
test/integration/test_node.c \
test/integration/test_role_management.c \
test/integration/test_server.c \
test/integration/test_vfs.c \
test/integration/main.c
integration_test_CFLAGS = $(AM_CFLAGS) -Wno-conversion
integration_test_LDFLAGS = $(AM_LDFLAGS) -no-install
integration_test_LDADD = libtest.la libdqlite.la
if BUILD_RAFT_ENABLED
check_LTLIBRARIES += libraft.la
check_PROGRAMS += \
raft-core-unit-test \
raft-core-integration-test \
raft-uv-unit-test \
raft-uv-integration-test \
raft-core-fuzzy-test
libtest_la_SOURCES += \
test/raft/lib/addrinfo.c \
test/raft/lib/fault.c \
test/raft/lib/fsm.c \
test/raft/lib/heap.c \
test/raft/lib/munit.c \
test/raft/lib/tcp.c \
test/raft/lib/cluster.c \
test/raft/lib/aio.c \
test/raft/lib/dir.c \
test/raft/lib/tcp.c \
test/raft/lib/loop.c
libraft_la_CFLAGS = $(AM_CFLAGS)
libraft_la_LDFLAGS = $(UV_LIBS)
raft_core_unit_test_SOURCES = \
$(libraft_la_SOURCES) \
src/lib/sm.c \
src/tracing.c \
test/raft/unit/main_core.c \
test/raft/unit/test_byte.c \
test/raft/unit/test_compress.c \
test/raft/unit/test_configuration.c \
test/raft/unit/test_err.c \
test/raft/unit/test_flags.c \
test/raft/unit/test_log.c \
test/raft/unit/test_queue.c \
test/raft/unit/test_snapshot.c
raft_core_unit_test_CFLAGS = $(AM_CFLAGS) -Wno-conversion
raft_core_unit_test_LDADD = libtest.la
raft_core_integration_test_SOURCES = \
src/tracing.c \
src/lib/sm.c \
test/raft/integration/main_core.c \
test/raft/integration/test_apply.c \
test/raft/integration/test_assign.c \
test/raft/integration/test_barrier.c \
test/raft/integration/test_bootstrap.c \
test/raft/integration/test_digest.c \
test/raft/integration/test_election.c \
test/raft/integration/test_fixture.c \
test/raft/integration/test_heap.c \
test/raft/integration/test_init.c \
test/raft/integration/test_membership.c \
test/raft/integration/test_recover.c \
test/raft/integration/test_replication.c \
test/raft/integration/test_snapshot.c \
test/raft/integration/test_start.c \
test/raft/integration/test_strerror.c \
test/raft/integration/test_tick.c \
test/raft/integration/test_transfer.c \
test/raft/integration/test_voter_contacts.c
raft_core_integration_test_CFLAGS = $(AM_CFLAGS) -Wno-conversion
raft_core_integration_test_LDFLAGS = -no-install
raft_core_integration_test_LDADD = libtest.la libraft.la
raft_core_fuzzy_test_SOURCES = \
src/lib/sm.c \
src/tracing.c \
test/raft/fuzzy/main_core.c \
test/raft/fuzzy/test_election.c \
test/raft/fuzzy/test_liveness.c \
test/raft/fuzzy/test_membership.c \
test/raft/fuzzy/test_replication.c
raft_core_fuzzy_test_CFLAGS = $(AM_CFLAGS) -Wno-conversion
raft_core_fuzzy_test_LDFLAGS = -no-install
raft_core_fuzzy_test_LDADD = libtest.la libraft.la
raft_uv_unit_test_SOURCES = \
src/tracing.c \
src/raft/err.c \
src/raft/heap.c \
src/raft/syscall.c \
src/raft/uv_fs.c \
src/raft/uv_os.c \
src/raft/uv_writer.c \
test/raft/unit/main_uv.c \
test/raft/unit/test_uv_fs.c \
test/raft/unit/test_uv_os.c \
test/raft/unit/test_uv_writer.c
raft_uv_unit_test_CFLAGS = $(AM_CFLAGS) -Wno-conversion
raft_uv_unit_test_LDADD = libtest.la $(UV_LIBS)
# The integration/uv test is not linked to libraft, but built
# directly against the libraft sources in order to test some
# non-visible, non-API functions.
raft_uv_integration_test_SOURCES = \
$(libraft_la_SOURCES) \
src/tracing.c \
src/lib/sm.c \
test/raft/integration/main_uv.c \
test/raft/integration/test_uv_init.c \
test/raft/integration/test_uv_append.c \
test/raft/integration/test_uv_bootstrap.c \
test/raft/integration/test_uv_load.c \
test/raft/integration/test_uv_recover.c \
test/raft/integration/test_uv_recv.c \
test/raft/integration/test_uv_send.c \
test/raft/integration/test_uv_set_term.c \
test/raft/integration/test_uv_tcp_connect.c \
test/raft/integration/test_uv_tcp_listen.c \
test/raft/integration/test_uv_snapshot_put.c \
test/raft/integration/test_uv_truncate.c \
test/raft/integration/test_uv_truncate_snapshot.c \
test/raft/integration/test_uv_work.c
raft_uv_integration_test_CFLAGS = $(AM_CFLAGS) -Wno-type-limits -Wno-conversion
raft_uv_integration_test_LDFLAGS = -no-install
raft_uv_integration_test_LDADD = libtest.la $(UV_LIBS)
if LZ4_AVAILABLE
libdqlite_la_CFLAGS += -DLZ4_AVAILABLE $(LZ4_CFLAGS)
libdqlite_la_LDFLAGS += $(LZ4_LIBS)
raft_core_unit_test_CFLAGS += -DLZ4_AVAILABLE $(LZ4_CFLAGS)
raft_core_unit_test_LDFLAGS = $(LZ4_LIBS)
libraft_la_CFLAGS += -DLZ4_AVAILABLE $(LZ4_CFLAGS)
libraft_la_LDFLAGS += $(LZ4_LIBS)
raft_uv_integration_test_CFLAGS += -DLZ4_AVAILABLE
raft_uv_integration_test_LDFLAGS += $(LZ4_LIBS)
endif
if LZ4_ENABLED
libdqlite_la_CFLAGS += -DLZ4_ENABLED
raft_uv_integration_test_CFLAGS += -DLZ4_ENABLED
raft_core_unit_test_CFLAGS += -DLZ4_ENABLED
libraft_la_CFLAGS += -DLZ4_ENABLED
endif
endif # BUILD_RAFT_ENABLED
if BUILD_SQLITE_ENABLED
noinst_LTLIBRARIES = libsqlite3.la
libsqlite3_la_SOURCES = sqlite3.c
libsqlite3_la_CFLAGS = -g3
unit_test_LDADD += libsqlite3.la
libdqlite_la_LIBADD = libsqlite3.la
else
AM_LDFLAGS += $(SQLITE_LIBS)
endif
TESTS = $(check_PROGRAMS)
if CODE_COVERAGE_ENABLED
include $(top_srcdir)/aminclude_static.am
CODE_COVERAGE_DIRECTORY=./src
CODE_COVERAGE_OUTPUT_DIRECTORY=coverage
CODE_COVERAGE_OUTPUT_FILE=coverage.info
CODE_COVERAGE_IGNORE_PATTERN="/usr/include/*"
CODE_COVERAGE_BRANCH_COVERAGE=1
CODE_COVERAGE_LCOV_OPTIONS=$(CODE_COVERAGE_LCOV_OPTIONS_DEFAULT) --rc lcov_excl_br_line="assert\("
clean-local: code-coverage-clean
distclean-local: code-coverage-dist-clean
endif # CODE_COVERAGE_ENABLED
pkgconfigdir = $(libdir)/pkgconfig
pkgconfig_DATA = dqlite.pc

==> dqlite-1.16.7/README.md <==
dqlite [![CI Tests](https://github.com/canonical/dqlite/actions/workflows/build-and-test.yml/badge.svg)](https://github.com/canonical/dqlite/actions/workflows/build-and-test.yml) [![codecov](https://codecov.io/gh/canonical/dqlite/branch/master/graph/badge.svg)](https://codecov.io/gh/canonical/dqlite)
======
[English](./README.md)|[简体中文](./README_CH.md)
[dqlite](https://dqlite.io) is a C library that implements an embeddable and
replicated SQL database engine with high availability and automatic failover.
The acronym "dqlite" stands for "distributed SQLite", meaning that dqlite
extends [SQLite](https://sqlite.org/) with a network protocol that can connect
together various instances of your application and have them act as a
highly-available cluster, with no dependency on external databases.
Design highlights
----------------
* Asynchronous single-threaded implementation using [libuv](https://libuv.org/)
as event loop.
* Custom wire protocol optimized for SQLite primitives and data types.
* Data replication based on the [Raft](https://raft.github.io/) algorithm.
License
-------
The dqlite library is released under a slightly modified version of LGPLv3,
that includes a copyright exception allowing users to statically link the
library code in their project and release the final work under their own terms.
See the full [license](https://github.com/canonical/dqlite/blob/master/LICENSE)
text.
Compatibility
-------------
dqlite runs on Linux and requires a kernel with support for [native async
I/O](https://man7.org/linux/man-pages/man2/io_setup.2.html) (not to be confused
with [POSIX AIO](https://man7.org/linux/man-pages/man7/aio.7.html)).
Try it
-------
The simplest way to see dqlite in action is to use the demo program that comes
with the Go dqlite bindings. Please see the [relevant
documentation](https://github.com/canonical/go-dqlite#demo) in that project.
Media
-----
A talk about dqlite was given at FOSDEM 2020, you can watch it
[here](https://fosdem.org/2020/schedule/event/dqlite/).
[Here](https://gcore.com/blog/comparing-litestream-rqlite-dqlite/) is a blog
post from 2022 comparing dqlite with rqlite and Litestream, other replication
software for SQLite.
Wire protocol
-------------
If you wish to write a client, please refer to the [wire
protocol](https://dqlite.io/docs/protocol) documentation.
Install
-------
If you are on a Debian-based system, you can get the latest development release
from dqlite's [dev PPA](https://launchpad.net/~dqlite/+archive/ubuntu/dev):
```
sudo add-apt-repository ppa:dqlite/dev
sudo apt update
sudo apt install libdqlite-dev
```
Contributing
------------
See [CONTRIBUTING.md](./CONTRIBUTING.md).
Build
-----
To build libdqlite from source you'll need:
* Build dependencies: pkg-config and GNU Autoconf, Automake, libtool, and make
* A reasonably recent version of [libuv](https://libuv.org/) (v1.8.0 or later), with headers.
* A reasonably recent version of [SQLite](https://sqlite.org/) (v3.22.0 or later), with headers.
* Optionally, a reasonably recent version of [LZ4](https://lz4.org/) (v1.7.1 or later), with headers.
Your distribution should already provide you with these dependencies. For
example, on Debian-based distros:
```
sudo apt install pkg-config autoconf automake libtool make libuv1-dev libsqlite3-dev liblz4-dev
```
With these dependencies installed, you can build and install the dqlite shared
library and headers as follows:
```
$ autoreconf -i
$ ./configure --enable-build-raft
$ make
$ sudo make install
```
The default installation prefix is `/usr/local`; you may need to run
```
$ sudo ldconfig
```
to enable the linker to find `libdqlite.so`. To install to a different prefix,
replace the configure step with something like
```
$ ./configure --enable-build-raft --prefix=/usr
```
The `--enable-build-raft` option causes dqlite to use its bundled Raft
implementation instead of linking to an external libraft; the latter is a
legacy configuration that should not be used for new development.
Usage Notes
-----------
Detailed tracing will be enabled when the environment variable
`LIBDQLITE_TRACE` is set before startup. Its value can be in the `[0..5]`
range and represents a tracing level, where `0` means that no traces are
emitted, `5` enables minimum verbosity (FATAL records only), and `1` enables
maximum verbosity (all records: DEBUG, INFO, WARN, ERROR, FATAL).
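For example, a C program embedding dqlite can set the variable for its own
process before any dqlite code runs (a minimal sketch; the rest of the setup
is elided):

```
#include <stdlib.h>

int main(void)
{
        /* "1" requests maximum verbosity; this must happen before any
         * dqlite code starts running. */
        setenv("LIBDQLITE_TRACE", "1", 1);
        /* ... create and start the dqlite server here ... */
        return 0;
}
```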

==> dqlite-1.16.7/README_CH.md <==
# dqlite
[![CI Tests](https://github.com/canonical/dqlite/actions/workflows/build-and-test.yml/badge.svg)](https://github.com/canonical/dqlite/actions/workflows/build-and-test.yml) [![codecov](https://codecov.io/gh/canonical/dqlite/branch/master/graph/badge.svg)](https://codecov.io/gh/canonical/dqlite)
**Note**: the Chinese documentation may not always be updated promptly; please refer to the latest English [readme](./README.md).
[dqlite](https://dqlite.io) is an embeddable database engine, developed in C, that supports streaming replication and provides high availability and automatic failover.
"dqlite" is short for "distributed SQLite": dqlite extends SQLite with a network protocol that connects the instances of your application together, letting them act as a highly available cluster with no dependency on external databases.
## Design highlights
- Asynchronous single-threaded event loop implemented with [libuv](https://libuv.org/)
- Custom wire protocol optimized for SQLite primitive data types
- Data replication based on the [Raft](https://raft.github.io/) algorithm and its efficient [C-raft](https://github.com/canonical/raft) implementation
## License
The dqlite library is released under a slightly modified version of LGPLv3, which includes a copyright exception allowing users to statically link the library's code into their projects and release the final work under their own terms. See the full [license](https://github.com/canonical/dqlite/blob/master/LICENSE) file if needed.
## Compatibility
dqlite runs on Linux and, due to the implementation of C-raft's libuv backend, requires a kernel with support for [native async I/O](https://man7.org/linux/man-pages/man2/io_setup.2.html) (not to be confused with [POSIX AIO](https://man7.org/linux/man-pages/man7/aio.7.html)).
## Try it
The simplest way to see and learn about dqlite in action is to use the demo program that comes with the Go dqlite bindings; for its usage, see that project's [relevant documentation](https://github.com/canonical/go-dqlite#demo).
## Media
A talk about dqlite was given at FOSDEM 2020; you can watch it [here](https://fosdem.org/2020/schedule/event/dqlite/).
## Wire protocol
If you wish to write a client, please refer to the [wire protocol](https://dqlite.io/docs/protocol) documentation.
## Install
If you are on a Debian-based system, you can get the latest development release from dqlite's [dev PPA](https://launchpad.net/~dqlite/+archive/ubuntu/dev):
```bash
sudo add-apt-repository ppa:dqlite/dev
sudo apt-get update
sudo apt-get install libdqlite-dev
```
## Build from source
To build libdqlite you will need:
- A reasonably recent version of libuv (v1.18.0 or later)
- A reasonably recent version of sqlite3-dev
- A built [C-raft](https://github.com/canonical/raft) library
Your Linux distribution should already provide a prebuilt libuv shared library and libsqlite3-dev, in which case there is no need to download them; otherwise you will also need to install these two dependencies.
For Debian-based Linux distributions, you can install the build dependencies with:
```
sudo apt install autoconf libuv1-dev liblz4-dev libtool pkg-config build-essential libsqlite3-dev
```
To build the raft library, run the following commands:
```bash
git clone https://github.com/canonical/raft.git
cd raft
autoreconf -i
./configure
make
sudo make install
cd ..
```
Once all the dependency libraries are in place, build and install the dqlite library manually with the following commands:
```bash
autoreconf -i
./configure
make
sudo make install
```
## Usage Notes
Detailed tracing will be enabled when the environment variable LIBRAFT_TRACE is set at startup.

==> dqlite-1.16.7/VERSION <==
0.1.0

==> dqlite-1.16.7/ac/.gitignore <==
*
!.gitignore

==> dqlite-1.16.7/bt/request <==
#!/bin/sh
set -o errexit
libraft_path="${LIBRAFT_SO_PATH:-/usr/local/lib/libraft.so.2}"
exec bpftrace -I resources -I include $@ - <<EOF
struct request
{
void *data;
int type;
raft_index index;
queue queue;
};
uprobe:$libraft_path:lifecycleRequestStart
{
\$req = (struct request *)arg1;
@start_request[\$req->data, \$req->type, \$req->index] = nsecs;
}
uprobe:$libraft_path:lifecycleRequestEnd
{
\$req = (struct request *)arg1;
\$start = @start_request[\$req->data, \$req->type, \$req->index];
\$end = nsecs;
@full[\$req->data, \$req->type, \$req->index] = (\$start, \$end);
\$elapsed_msecs = (\$end - \$start) / 1000;
@hist = lhist(\$elapsed_msecs, 100, 1000, 10);
delete(@start_request[\$req->data, \$req->type, \$req->index]);
}
EOF

==> dqlite-1.16.7/clang-tidy-diff.py <==
#!/usr/bin/env python3
#
# ===- clang-tidy-diff.py - ClangTidy Diff Checker -----------*- python -*--===#
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===-----------------------------------------------------------------------===#
r"""
ClangTidy Diff Checker
======================
This script reads input from a unified diff, runs clang-tidy on all changed
files and outputs clang-tidy warnings in changed lines only. This is useful to
detect clang-tidy regressions in the lines touched by a specific patch.
Example usage for git/svn users:
git diff -U0 HEAD^ | clang-tidy-diff.py -p1
svn diff --diff-cmd=diff -x-U0 | \
clang-tidy-diff.py -fix -checks=-*,modernize-use-override
"""
import argparse
import glob
import json
import multiprocessing
import os
import re
import shutil
import subprocess
import sys
import tempfile
import threading
import traceback
try:
import yaml
except ImportError:
yaml = None
is_py2 = sys.version[0] == "2"
if is_py2:
import Queue as queue
else:
import queue as queue
def run_tidy(task_queue, lock, timeout, failed_files):
watchdog = None
while True:
command = task_queue.get()
try:
proc = subprocess.Popen(
command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
if timeout is not None:
watchdog = threading.Timer(timeout, proc.kill)
watchdog.start()
stdout, stderr = proc.communicate()
if proc.returncode != 0:
if proc.returncode < 0:
msg = "Terminated by signal %d : %s\n" % (
-proc.returncode,
" ".join(command),
)
stderr += msg.encode("utf-8")
failed_files.append(command)
with lock:
sys.stdout.write(stdout.decode("utf-8") + "\n")
sys.stdout.flush()
if stderr:
sys.stderr.write(stderr.decode("utf-8") + "\n")
sys.stderr.flush()
except Exception as e:
with lock:
sys.stderr.write("Failed: " + str(e) + ": ".join(command) + "\n")
finally:
with lock:
if not (timeout is None or watchdog is None):
if not watchdog.is_alive():
sys.stderr.write(
"Terminated by timeout: " + " ".join(command) + "\n"
)
watchdog.cancel()
task_queue.task_done()
def start_workers(max_tasks, tidy_caller, arguments):
for _ in range(max_tasks):
t = threading.Thread(target=tidy_caller, args=arguments)
t.daemon = True
t.start()
def merge_replacement_files(tmpdir, mergefile):
"""Merge all replacement files in a directory into a single file"""
# The fixes suggested by clang-tidy >= 4.0.0 are given under
# the top level key 'Diagnostics' in the output yaml files
mergekey = "Diagnostics"
merged = []
for replacefile in glob.iglob(os.path.join(tmpdir, "*.yaml")):
content = yaml.safe_load(open(replacefile, "r"))
if not content:
continue # Skip empty files.
merged.extend(content.get(mergekey, []))
if merged:
# MainSourceFile: The key is required by the definition inside
# include/clang/Tooling/ReplacementsYaml.h, but the value
# is actually never used inside clang-apply-replacements,
# so we set it to '' here.
output = {"MainSourceFile": "", mergekey: merged}
with open(mergefile, "w") as out:
yaml.safe_dump(output, out)
else:
# Empty the file:
open(mergefile, "w").close()
def main():
parser = argparse.ArgumentParser(
description="Run clang-tidy against changed files, and "
"output diagnostics only for modified "
"lines."
)
parser.add_argument(
"-clang-tidy-binary",
metavar="PATH",
default="clang-tidy",
help="path to clang-tidy binary",
)
parser.add_argument(
"-p",
metavar="NUM",
default=0,
help="strip the smallest prefix containing P slashes",
)
parser.add_argument(
"-regex",
metavar="PATTERN",
default=None,
help="custom pattern selecting file paths to check "
"(case sensitive, overrides -iregex)",
)
parser.add_argument(
"-iregex",
metavar="PATTERN",
default=r".*\.(cpp|cc|c\+\+|cxx|c|cl|h|hpp|m|mm|inc)",
help="custom pattern selecting file paths to check "
"(case insensitive, overridden by -regex)",
)
parser.add_argument(
"-j",
type=int,
default=1,
help="number of tidy instances to be run in parallel.",
)
parser.add_argument(
"-timeout", type=int, default=None, help="timeout per each file in seconds."
)
parser.add_argument(
"-fix", action="store_true", default=False, help="apply suggested fixes"
)
parser.add_argument(
"-checks",
help="checks filter, when not specified, use clang-tidy " "default",
default="",
)
parser.add_argument(
"-config-file",
dest="config_file",
help="Specify the path of .clang-tidy or custom config file",
default="",
)
parser.add_argument("-use-color", action="store_true", help="Use colors in output")
parser.add_argument(
"-path", dest="build_path", help="Path used to read a compile command database."
)
if yaml:
parser.add_argument(
"-export-fixes",
metavar="FILE_OR_DIRECTORY",
dest="export_fixes",
help="A directory or a yaml file to store suggested fixes in, "
"which can be applied with clang-apply-replacements. If the "
"parameter is a directory, the fixes of each compilation unit are "
"stored in individual yaml files in the directory.",
)
else:
parser.add_argument(
"-export-fixes",
metavar="DIRECTORY",
dest="export_fixes",
help="A directory to store suggested fixes in, which can be applied "
"with clang-apply-replacements. The fixes of each compilation unit are "
"stored in individual yaml files in the directory.",
)
parser.add_argument(
"-extra-arg",
dest="extra_arg",
action="append",
default=[],
help="Additional argument to append to the compiler " "command line.",
)
parser.add_argument(
"-extra-arg-before",
dest="extra_arg_before",
action="append",
default=[],
help="Additional argument to prepend to the compiler " "command line.",
)
parser.add_argument(
"-quiet",
action="store_true",
default=False,
help="Run clang-tidy in quiet mode",
)
parser.add_argument(
"-load",
dest="plugins",
action="append",
default=[],
help="Load the specified plugin in clang-tidy.",
)
clang_tidy_args = []
argv = sys.argv[1:]
if "--" in argv:
clang_tidy_args.extend(argv[argv.index("--") :])
argv = argv[: argv.index("--")]
args = parser.parse_args(argv)
# Extract changed lines for each file.
filename = None
lines_by_file = {}
for line in sys.stdin:
match = re.search('^\+\+\+\ "?(.*?/){%s}([^ \t\n"]*)' % args.p, line)
if match:
filename = match.group(2)
if filename is None:
continue
if args.regex is not None:
if not re.match("^%s$" % args.regex, filename):
continue
else:
if not re.match("^%s$" % args.iregex, filename, re.IGNORECASE):
continue
match = re.search("^@@.*\+(\d+)(,(\d+))?", line)
if match:
start_line = int(match.group(1))
line_count = 1
if match.group(3):
line_count = int(match.group(3))
if line_count == 0:
continue
end_line = start_line + line_count - 1
lines_by_file.setdefault(filename, []).append([start_line, end_line])
if not any(lines_by_file):
print("No relevant changes found.")
sys.exit(0)
max_task_count = args.j
if max_task_count == 0:
max_task_count = multiprocessing.cpu_count()
max_task_count = min(len(lines_by_file), max_task_count)
combine_fixes = False
export_fixes_dir = None
delete_fixes_dir = False
if args.export_fixes is not None:
# if a directory is given, create it if it does not exist
if args.export_fixes.endswith(os.path.sep) and not os.path.isdir(
args.export_fixes
):
os.makedirs(args.export_fixes)
if not os.path.isdir(args.export_fixes):
if not yaml:
raise RuntimeError(
"Cannot combine fixes in one yaml file. Either install PyYAML or specify an output directory."
)
combine_fixes = True
if os.path.isdir(args.export_fixes):
export_fixes_dir = args.export_fixes
if combine_fixes:
export_fixes_dir = tempfile.mkdtemp()
delete_fixes_dir = True
# Tasks for clang-tidy.
task_queue = queue.Queue(max_task_count)
# A lock for console output.
lock = threading.Lock()
# List of files with a non-zero return code.
failed_files = []
# Run a pool of clang-tidy workers.
start_workers(
max_task_count, run_tidy, (task_queue, lock, args.timeout, failed_files)
)
# Form the common args list.
common_clang_tidy_args = []
if args.fix:
common_clang_tidy_args.append("-fix")
if args.checks != "":
common_clang_tidy_args.append("-checks=" + args.checks)
if args.config_file != "":
common_clang_tidy_args.append("-config-file=" + args.config_file)
if args.quiet:
common_clang_tidy_args.append("-quiet")
if args.build_path is not None:
common_clang_tidy_args.append("-p=%s" % args.build_path)
if args.use_color:
common_clang_tidy_args.append("--use-color")
for arg in args.extra_arg:
common_clang_tidy_args.append("-extra-arg=%s" % arg)
for arg in args.extra_arg_before:
common_clang_tidy_args.append("-extra-arg-before=%s" % arg)
for plugin in args.plugins:
common_clang_tidy_args.append("-load=%s" % plugin)
for name in lines_by_file:
line_filter_json = json.dumps(
[{"name": name, "lines": lines_by_file[name]}], separators=(",", ":")
)
# Run clang-tidy on files containing changes.
command = [args.clang_tidy_binary]
command.append("-line-filter=" + line_filter_json)
if args.export_fixes is not None:
# Get a temporary file. We immediately close the handle so clang-tidy can
# overwrite it.
(handle, tmp_name) = tempfile.mkstemp(suffix=".yaml", dir=export_fixes_dir)
os.close(handle)
command.append("-export-fixes=" + tmp_name)
command.extend(common_clang_tidy_args)
command.append(name)
command.extend(clang_tidy_args)
task_queue.put(command)
# Application return code
return_code = 0
# Wait for all threads to be done.
task_queue.join()
if failed_files:
return_code = 1
if combine_fixes:
print("Writing fixes to " + args.export_fixes + " ...")
try:
merge_replacement_files(export_fixes_dir, args.export_fixes)
except:
sys.stderr.write("Error exporting fixes.\n")
traceback.print_exc()
return_code = 1
if delete_fixes_dir:
shutil.rmtree(export_fixes_dir)
sys.exit(return_code)
if __name__ == "__main__":
main()

==> dqlite-1.16.7/configure.ac <==
AC_PREREQ(2.60)
AC_INIT([libdqlite], [1.16.7], [https://github.com/canonical/dqlite])
AC_CONFIG_MACRO_DIR([m4])
AC_CONFIG_AUX_DIR([ac])
AM_INIT_AUTOMAKE([subdir-objects -Wall -Werror -Wno-portability foreign])
AM_SILENT_RULES([yes])
# Without this line, AC_PROG_CC boneheadedly adds `-g -O2` to our CFLAGS.
AC_SUBST(CFLAGS, "")
AC_PROG_CC
AC_USE_SYSTEM_EXTENSIONS
AX_PTHREAD
LT_INIT
# TODO: eventually enable this
# AX_CHECK_COMPILE_FLAG([-Weverything], AM_CFLAGS+=" -Weverything")
# Whether to enable debugging code.
AC_ARG_ENABLE(debug, AS_HELP_STRING([--enable-debug[=ARG]], [enable debugging [default=no]]))
AM_CONDITIONAL(DEBUG_ENABLED, test "x$enable_debug" = "xyes")
# Whether to enable memory sanitizer.
AC_ARG_ENABLE(sanitize, AS_HELP_STRING([--enable-sanitize[=ARG]], [enable code sanitizers [default=no]]))
AM_CONDITIONAL(SANITIZE_ENABLED, test x"$enable_sanitize" = x"yes")
AM_COND_IF(SANITIZE_ENABLED,
AX_CHECK_COMPILE_FLAG([-fsanitize=address],
[true],
[AC_MSG_ERROR([address sanitizer not supported])]))
AC_ARG_ENABLE(backtrace, AS_HELP_STRING([--enable-backtrace[=ARG]], [print backtrace on assertion failure [default=no]]))
AM_CONDITIONAL(BACKTRACE_ENABLED, test "x$enable_backtrace" = "xyes")
AC_ARG_ENABLE(build-sqlite, AS_HELP_STRING([--enable-build-sqlite[=ARG]], [build libsqlite3 from sqlite3.c in the build root [default=no]]))
AM_CONDITIONAL(BUILD_SQLITE_ENABLED, test "x$enable_build_sqlite" = "xyes")
AC_ARG_ENABLE(build-raft, AS_HELP_STRING([--enable-build-raft[=ARG]], [use the bundled raft sources instead of linking to libraft [default=no]]))
AM_CONDITIONAL(BUILD_RAFT_ENABLED, test "x$enable_build_raft" = "xyes")
AC_ARG_ENABLE(dqlite-next, AS_HELP_STRING([--enable-dqlite-next[=ARG]], [build with the experimental dqlite backend [default=no]]))
AM_CONDITIONAL(DQLITE_NEXT_ENABLED, test "x$enable_dqlite_next" = "xyes")
AS_IF([test "x$enable_build_raft" != "xyes" -a "x$enable_dqlite_next" = "xyes"], [AC_MSG_ERROR([dqlite-next requires bundled raft])], [])
# Whether to enable code coverage.
AX_CODE_COVERAGE
# Checks for header files.
AC_CHECK_HEADERS([linux/io_uring.h linux/aio_abi.h])
# Checks for library functions and definitions.
AC_CHECK_DECLS(RWF_NOWAIT, [], [AC_MSG_ERROR(Linux kernel >= 4.14 required.)], [#include <linux/aio_abi.h>])
# Enable large file support. This is mandatory in order to interoperate with
# libuv, which enables large file support by default, making the size of 'off_t'
# on 32-bit architecture be 8 bytes instead of the normal 4.
AC_SYS_LARGEFILE
# Checks for libraries
PKG_CHECK_MODULES(SQLITE, [sqlite3 >= 3.22.0], [], [])
PKG_CHECK_MODULES(UV, [libuv >= 1.34.0], [], [])
AS_IF([test "x$enable_build_raft" != "xyes"], [PKG_CHECK_MODULES(RAFT, [raft >= 0.18.1], [], [])], [])
# Allow not linking to liblz4 even if it's present.
AC_ARG_WITH([lz4], AS_HELP_STRING([--without-lz4], [never link to liblz4]))
AS_IF([test "x$enable_build_raft" = "xyes"],
# Building raft
[AS_IF([test "x$with_lz4" != "xno"],
[PKG_CHECK_MODULES(LZ4, [liblz4 >= 1.7.1], [have_lz4=yes], [have_lz4=no])],
[have_lz4=no])
AS_IF([test "x$with_lz4" != "xno" -a "x$have_lz4" = "xno"],
[AC_MSG_ERROR([liblz4 required but not found])],
[])],
# Not building raft
[AS_IF([test "x$with_lz4" = "xyes"],
[AC_MSG_ERROR([linking lz4 doesn't make sense unless building raft])],
[])
have_lz4=no])
AM_CONDITIONAL(LZ4_AVAILABLE, test "x$have_lz4" = "xyes")
AC_ARG_ENABLE(lz4, AS_HELP_STRING([--disable-lz4], [when building with lz4, do not compress snapshots by default]))
AS_IF([test "x$enable_lz4" != "x" -a "x$have_lz4" = "xno"],
[AC_MSG_ERROR([snapshot compression (either by default or not) requires liblz4])],
[])
AM_CONDITIONAL(LZ4_ENABLED, test "x$enable_lz4" != "xno" -a "x$have_lz4" = "xyes")
CC_CHECK_FLAGS_APPEND([AM_CFLAGS],[CFLAGS],[ \
-std=c11 \
-g3 \
--mcet \
-fcf-protection \
--param=ssp-buffer-size=4 \
-pipe \
-fno-strict-aliasing \
-fdiagnostics-color \
-fexceptions \
-fstack-clash-protection \
-fstack-protector-strong \
-fasynchronous-unwind-tables \
-fdiagnostics-show-option \
-Wall \
-Wextra \
-Wimplicit-fallthrough=5 \
-Wcast-align \
-Wstrict-prototypes \
-Wlogical-op \
-Wmissing-include-dirs \
-Wold-style-definition \
-Winit-self \
-Wfloat-equal \
-Wsuggest-attribute=noreturn \
-Wformat=2 \
-Wshadow \
-Wendif-labels \
-Wdate-time \
-Wnested-externs \
-Wconversion \
-Werror \
])
# To enable:
#
# -Wpedantic \
AC_SUBST(AM_CFLAGS)
AC_CONFIG_FILES([dqlite.pc Makefile])
AC_OUTPUT

==> dqlite-1.16.7/doc/faq.md <==
Moved to the [website project](https://dqlite.io/docs/faq).

==> dqlite-1.16.7/doc/index.md <==
Moved to the [website project](https://dqlite.io/docs).

==> dqlite-1.16.7/doc/protocol.md <==
Moved to the [website project](https://dqlite.io/docs/protocol).

==> dqlite-1.16.7/dqlite.pc.in <==
prefix=@prefix@
exec_prefix=@exec_prefix@
libdir=@libdir@
includedir=@includedir@
Name: dqlite
Description: Distributed SQLite engine
Version: @PACKAGE_VERSION@
Libs: -L${libdir} -ldqlite
Libs.private: @SQLITE_LIBS@ @UV_LIBS@ @RAFT_LIBS@
Cflags: -I${includedir}
dqlite-1.16.7/include/ 0000775 0000000 0000000 00000000000 14652527134 0014524 5 ustar 00root root 0000000 0000000 dqlite-1.16.7/include/dqlite.h 0000664 0000000 0000000 00000064337 14652527134 0016174 0 ustar 00root root 0000000 0000000 #ifndef DQLITE_H
#define DQLITE_H
#include
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <sqlite3.h>
#define DQLITE_API __attribute__((visibility("default")))
#endif
/**
* This "pseudo-attribute" marks declarations that are only a provisional part
* of the dqlite public API. These declarations may change or be removed
* entirely in minor or point releases of dqlite, without bumping the soversion
* of libdqlite.so. Consumers of dqlite who use these declarations are
* responsible for updating their code in response to such breaking changes.
*/
#define DQLITE_EXPERIMENTAL
#ifndef DQLITE_VISIBLE_TO_TESTS
#define DQLITE_VISIBLE_TO_TESTS DQLITE_API
#endif
/**
* Version.
*/
#define DQLITE_VERSION_MAJOR 1
#define DQLITE_VERSION_MINOR 16
#define DQLITE_VERSION_RELEASE 7
#define DQLITE_VERSION_NUMBER \
(DQLITE_VERSION_MAJOR * 100 * 100 + DQLITE_VERSION_MINOR * 100 + \
DQLITE_VERSION_RELEASE)
DQLITE_API int dqlite_version_number(void);
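/*
 * For example, with version 1.16.7 the formula above yields
 * 1 * 100 * 100 + 16 * 100 + 7 = 11607.
 */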
/**
 * Hold the value of a dqlite node ID. Guaranteed to be at least 64 bits wide.
*/
typedef unsigned long long dqlite_node_id;
DQLITE_EXPERIMENTAL typedef struct dqlite_server dqlite_server;
/**
* Signature of a custom callback used to establish network connections
* to dqlite servers.
*
* @arg is a user data parameter, copied from the third argument of
* dqlite_server_set_connect_func. @addr is a (borrowed) abstract address
* string, as passed to dqlite_server_create or dqlite_server_set_auto_join. @fd
* is an address where a socket representing the connection should be stored.
* The callback should return zero if a connection was established successfully
* or nonzero if the attempt failed.
*/
DQLITE_EXPERIMENTAL typedef int (*dqlite_connect_func)(void *arg,
const char *addr,
int *fd);
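/*
 * Illustrative sketch (not part of the dqlite API): a minimal custom connect
 * function for plain TCP. It assumes @addr always has the form
 * "<host>:<port>" and does not handle IPv6 bracket syntax. It could be
 * registered with dqlite_server_set_connect_func(server, my_connect, NULL).
 *
 *     #include <netdb.h>
 *     #include <string.h>
 *     #include <sys/socket.h>
 *     #include <unistd.h>
 *
 *     static int my_connect(void *arg, const char *addr, int *fd)
 *     {
 *             char host[256];
 *             const char *colon = strrchr(addr, ':');
 *             struct addrinfo hints, *res, *rp;
 *             int sock = -1;
 *             (void)arg; // user data, unused in this sketch
 *             if (colon == NULL || (size_t)(colon - addr) >= sizeof(host)) {
 *                     return 1;
 *             }
 *             memcpy(host, addr, (size_t)(colon - addr));
 *             host[colon - addr] = '\0';
 *             memset(&hints, 0, sizeof(hints));
 *             hints.ai_socktype = SOCK_STREAM;
 *             if (getaddrinfo(host, colon + 1, &hints, &res) != 0) {
 *                     return 1;
 *             }
 *             for (rp = res; rp != NULL; rp = rp->ai_next) {
 *                     sock = socket(rp->ai_family, rp->ai_socktype,
 *                                   rp->ai_protocol);
 *                     if (sock < 0) {
 *                             continue;
 *                     }
 *                     if (connect(sock, rp->ai_addr, rp->ai_addrlen) == 0) {
 *                             break;
 *                     }
 *                     close(sock);
 *                     sock = -1;
 *             }
 *             freeaddrinfo(res);
 *             if (sock < 0) {
 *                     return 1;
 *             }
 *             *fd = sock; // hand the connected socket back to dqlite
 *             return 0;
 *     }
 */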
/* The following dqlite_server functions return zero on success or nonzero on
* error. More specific error codes may be specified in the future. */
/**
* Start configuring a dqlite server.
*
* The server will not start running until dqlite_server_start is called. @path
* is the path to a directory where the server (and attached client) will store
* its persistent state; the directory must exist. A pointer to the new server
* object is stored in @server on success.
*
* Whether or not this function succeeds, you should call dqlite_server_destroy
* to release resources owned by the server object.
*
* No reference to @path is kept after this function returns.
*/
DQLITE_API DQLITE_EXPERIMENTAL int dqlite_server_create(const char *path,
dqlite_server **server);
/**
* Set the abstract address of this server.
*
* This function must be called when the server starts for the first time, and
* is a no-op when the server is restarting. The abstract address is recorded in
* the Raft log and passed to the connect function on each server (see
* dqlite_server_set_connect_func). The server will also bind to this address to
* listen for incoming connections from clients and other servers, unless
* dqlite_server_set_bind_address is used. For the address syntax accepted by
* the default connect function (and for binding/listening), see
* dqlite_server_set_bind_address.
*/
DQLITE_API DQLITE_EXPERIMENTAL int dqlite_server_set_address(
dqlite_server *server,
const char *address);
/**
* Turn on or off automatic bootstrap for this server.
*
* The bootstrap server should be the first to start up. It automatically
* becomes the leader in the first term, and is responsible for adding all other
* servers to the cluster configuration. There must be exactly one bootstrap
* server in each cluster. After the first startup, the bootstrap server is no
* longer special and this function is a no-op.
*/
DQLITE_API DQLITE_EXPERIMENTAL int dqlite_server_set_auto_bootstrap(
dqlite_server *server,
bool on);
/**
* Declare the addresses of existing servers in the cluster, which should
* already be running.
*
* The server addresses declared with this function will not be used unless
* @server is starting up for the first time; after the first startup, the list
* of servers stored on disk will be used instead. (It is harmless to call this
* function unconditionally.)
*/
DQLITE_API DQLITE_EXPERIMENTAL int dqlite_server_set_auto_join(
dqlite_server *server,
const char *const *addrs,
unsigned n);
/**
* Configure @server to listen on the address @addr for incoming connections
* (from clients and other servers).
*
* If no bind address is configured with this function, the abstract address
* passed to dqlite_server_create will be used. The point of this function is to
* support decoupling the abstract address from the networking implementation
* (for example, if a proxy is going to be used).
*
* @addr must use one of the following formats:
*
* 1. ""
* 2. ":"
* 3. "@"
*
* Where is a numeric IPv4/IPv6 address, is a port number, and
* is an abstract Unix socket path. The port number defaults to 8080 if
* not specified. In the second form, if is an IPv6 address, it must be
* enclosed in square brackets "[]". In the third form, if is empty, the
* implementation will automatically select an available abstract Unix socket
* path.
*
* If an abstract Unix socket is used, the server will accept only
* connections originating from the same process.
*/
DQLITE_API DQLITE_EXPERIMENTAL int dqlite_server_set_bind_address(
dqlite_server *server,
const char *addr);
/**
* Configure the function that this server will use to connect to other servers.
*
* The same function will be used by the server's attached client to establish
* connections to all servers in the cluster. @arg is a user data parameter that
* will be passed to all invocations of the connect function.
*/
DQLITE_API DQLITE_EXPERIMENTAL int dqlite_server_set_connect_func(
dqlite_server *server,
dqlite_connect_func f,
void *arg);
/**
* Start running the server.
*
* Once this function returns successfully, the server will be ready to accept
* client requests using the functions below.
*/
DQLITE_API DQLITE_EXPERIMENTAL int dqlite_server_start(dqlite_server *server);
/**
* Get the ID of the server.
*
* This will return 0 (an invalid ID) if the server has not been started.
*/
DQLITE_API DQLITE_EXPERIMENTAL dqlite_node_id
dqlite_server_get_id(dqlite_server *server);
/**
* Hand over the server's privileges to other servers.
*
* This is intended to be called before dqlite_server_stop. The server will try
* to surrender leadership and voting rights to other nodes in the cluster, if
* applicable. This avoids some disruptions that can result when a privileged
* server stops suddenly.
*/
DQLITE_API DQLITE_EXPERIMENTAL int dqlite_server_handover(
dqlite_server *server);
/**
* Stop the server.
*
* The server will stop processing requests from client or other servers. To
* smooth over some possible disruptions to the cluster, call
* dqlite_server_handover before this function. After this function returns
* (successfully or not), you should call dqlite_server_destroy to free
* resources owned by the server.
*/
DQLITE_API DQLITE_EXPERIMENTAL int dqlite_server_stop(dqlite_server *server);
/**
* Free resources owned by the server.
*
* You should always call this function to finalize a server created with
* dqlite_server_create, whether or not that function returned successfully.
* If the server has been successfully started with dqlite_server_start,
* then you must stop it with dqlite_server_stop before calling this function.
*/
DQLITE_API DQLITE_EXPERIMENTAL void dqlite_server_destroy(
dqlite_server *server);
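/*
 * Illustrative sketch (not part of the dqlite API docs): a typical first
 * startup of a bootstrap server using the functions declared above. The
 * address and data directory are placeholder values; a joining server would
 * instead call dqlite_server_set_auto_join with the addresses of existing
 * servers.
 *
 *     #include <dqlite.h>
 *
 *     int run_server(void)
 *     {
 *             dqlite_server *server;
 *             int rv = dqlite_server_create("/var/lib/myapp", &server);
 *             if (rv != 0) {
 *                     goto out;
 *             }
 *             rv = dqlite_server_set_address(server, "127.0.0.1:9001");
 *             if (rv != 0) {
 *                     goto out;
 *             }
 *             rv = dqlite_server_set_auto_bootstrap(server, true);
 *             if (rv != 0) {
 *                     goto out;
 *             }
 *             rv = dqlite_server_start(server);
 *             if (rv != 0) {
 *                     goto out;
 *             }
 *             // ... serve application traffic, then on shutdown:
 *             (void)dqlite_server_handover(server);
 *             rv = dqlite_server_stop(server);
 *     out:
 *             dqlite_server_destroy(server);
 *             return rv;
 *     }
 */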
/**
* Error codes.
*
* These are used only with the dqlite_node family of functions.
*/
enum {
DQLITE_ERROR = 1, /* Generic error */
DQLITE_MISUSE, /* Library used incorrectly */
DQLITE_NOMEM /* A malloc() failed */
};
/**
* Dqlite node handle.
*
* Opaque handle to a single dqlite node that can serve database requests from
* connected clients and exchanges data replication messages with other dqlite
* nodes.
*/
typedef struct dqlite_node dqlite_node;
/**
* Create a new dqlite node object.
*
 * The @id argument is a positive number that identifies this particular dqlite
* node in the cluster. Each dqlite node part of the same cluster must be
* created with a different ID. The very first node, used to bootstrap a new
* cluster, must have ID #1. Every time a node is started again, it must be
* passed the same ID.
* The @address argument is the network address that clients or other nodes in
* the cluster must use to connect to this dqlite node. If no custom connect
* function is going to be set using dqlite_node_set_connect_func(), then the
 * format of the string must be "<addr>" or "<addr>:<port>", where <addr> is a
 * numeric IPv4/IPv6 address and <port> is a port number. The port number
* defaults to 8080 if not specified. If a port number is specified with an
* IPv6 address, the address must be enclosed in square brackets "[]".
*
 * If a custom connect function is used, then the format of the string must be
* whatever the custom connect function accepts.
*
 * The @data_dir argument is the file system path where the node should store its
* durable data, such as Raft log entries containing WAL frames of the SQLite
* databases being replicated.
*
* No reference to the memory pointed to by @address and @data_dir is kept by
* the dqlite library, so any memory associated with them can be released after
* the function returns.
*
* Even if an error is returned, the caller should call dqlite_node_destroy()
* on the dqlite_node* value pointed to by @n, and calling dqlite_node_errmsg()
* with that value will return a valid error string. (In some cases *n will be
* set to NULL, but dqlite_node_destroy() and dqlite_node_errmsg() will handle
* this gracefully.)
*/
DQLITE_API int dqlite_node_create(dqlite_node_id id,
const char *address,
const char *data_dir,
dqlite_node **n);
/**
* Destroy a dqlite node object.
*
* This will release all memory that was allocated by the node. If
* dqlite_node_start() was successfully invoked, then dqlite_node_stop() must be
* invoked before destroying the node.
*/
DQLITE_API void dqlite_node_destroy(dqlite_node *n);
/**
* Instruct the dqlite node to bind a network address when starting, and
* listening for incoming client connections.
*
* The given address might match the one passed to @dqlite_node_create or be a
* different one (for example if the application wants to proxy it).
*
* The format of the @address argument must be one of
*
* 1. ""
* 2. ":"
* 3. "@"
*
* Where is a numeric IPv4/IPv6 address, is a port number, and
* is an abstract Unix socket path. The port number defaults to 8080 if
* not specified. In the second form, if is an IPv6 address, it must be
* enclosed in square brackets "[]". In the third form, if is empty, the
* implementation will automatically select an available abstract Unix socket
* path, which can then be retrieved with dqlite_node_get_bind_address().
*
* If an abstract Unix socket is used the dqlite node will accept only
* connections originating from the same process.
*
* No reference to the memory pointed to by @address is kept, so any memory
 * associated with it can be released after the function returns.
*
* This function must be called before calling dqlite_node_start().
*/
DQLITE_API int dqlite_node_set_bind_address(dqlite_node *n,
const char *address);
/**
* Get the network address that the dqlite node is using to accept incoming
* connections.
*/
DQLITE_API const char *dqlite_node_get_bind_address(dqlite_node *n);
/**
* Set a custom connect function.
*
* The function should block until a network connection with the dqlite node at
* the given @address is established, or an error occurs.
*
* In case of success, the file descriptor of the connected socket must be saved
 * into the location pointed to by the @fd argument. The socket must be either a
* TCP or a Unix socket.
*
* This function must be called before calling dqlite_node_start().
*/
DQLITE_API int dqlite_node_set_connect_func(dqlite_node *n,
int (*f)(void *arg,
const char *address,
int *fd),
void *arg);
/**
* DEPRECATED - USE `dqlite_node_set_network_latency_ms`
* Set the average one-way network latency, expressed in nanoseconds.
*
* This value is used internally by dqlite to decide how frequently the leader
* node should send heartbeats to other nodes in order to maintain its
* leadership, and how long other nodes should wait before deciding that the
* leader has died and initiate a failover.
*
* This function must be called before calling dqlite_node_start().
*/
DQLITE_API int dqlite_node_set_network_latency(dqlite_node *n,
unsigned long long nanoseconds);
/**
* Set the average one-way network latency, expressed in milliseconds.
*
* This value is used internally by dqlite to decide how frequently the leader
* node should send heartbeats to other nodes in order to maintain its
* leadership, and how long other nodes should wait before deciding that the
* leader has died and initiate a failover.
*
* This function must be called before calling dqlite_node_start().
*
* Latency should not be 0 or larger than 3600000 milliseconds.
*/
DQLITE_API int dqlite_node_set_network_latency_ms(dqlite_node *t,
unsigned milliseconds);
/**
* Set the failure domain associated with this node.
*
* This is effectively a tag applied to the node and that can be inspected later
* with the "Describe node" client request.
*/
DQLITE_API int dqlite_node_set_failure_domain(dqlite_node *n,
unsigned long long code);
/**
* Set the snapshot parameters for this node.
*
* This function determines how frequently a node will snapshot the state
* of the database and how many raft log entries will be kept around after
* a snapshot has been taken.
*
* `snapshot_threshold` : Determines the frequency of taking a snapshot, the
* lower the number, the higher the frequency.
*
* `snapshot_trailing` : Determines the amount of log entries kept around after
* taking a snapshot. Lowering this number decreases disk and memory footprint
* but increases the chance of having to send a full snapshot (instead of a
 * number of log entries) to a node that has fallen behind.
*
* This function must be called before calling dqlite_node_start().
*/
DQLITE_API int dqlite_node_set_snapshot_params(dqlite_node *n,
unsigned snapshot_threshold,
unsigned snapshot_trailing);
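/*
 * For example (the values below are purely illustrative, not recommended
 * defaults):
 *
 *     rv = dqlite_node_set_snapshot_params(node, 1024, 8192);
 *
 * would take a snapshot roughly every 1024 raft log entries and keep 8192
 * entries in the log after each snapshot.
 */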
/**
* Set the block size used for performing disk IO when writing raft log segments
* to disk. @size is limited to a list of preset values.
*
* This function must be called before calling dqlite_node_start().
*/
DQLITE_API int dqlite_node_set_block_size(dqlite_node *n, size_t size);
/**
* WARNING: This is an experimental API.
*
* By default dqlite holds the SQLite database file and WAL in memory. By
* enabling disk-mode, dqlite will hold the SQLite database file on-disk while
* keeping the WAL in memory. Has to be called after `dqlite_node_create` and
* before `dqlite_node_start`.
*/
DQLITE_API int dqlite_node_enable_disk_mode(dqlite_node *n);
/**
* Set the target number of voting nodes for the cluster.
*
* If automatic role management is enabled, the cluster leader will attempt to
* promote nodes to reach the target. If automatic role management is disabled,
* this has no effect.
*
* The default target is 3 voters.
*/
DQLITE_API int dqlite_node_set_target_voters(dqlite_node *n, int voters);
/**
* Set the target number of standby nodes for the cluster.
*
* If automatic role management is enabled, the cluster leader will attempt to
* promote nodes to reach the target. If automatic role management is disabled,
* this has no effect.
*
* The default target is 0 standbys.
*/
DQLITE_API int dqlite_node_set_target_standbys(dqlite_node *n, int standbys);
/**
* Set the target number of threads in the thread pool processing sqlite3 disk
* operations.
*
* The default pool thread count is 4.
*/
DQLITE_API int dqlite_node_set_pool_thread_count(dqlite_node *n,
unsigned thread_count);
/**
* Enable or disable auto-recovery for corrupted disk files.
*
* When auto-recovery is enabled, files in the data directory that are
* determined to be corrupt may be removed by dqlite at startup. This allows
* the node to start up successfully in more situations, but comes at the cost
* of possible data loss, and may mask bugs.
*
* This must be called before dqlite_node_start.
*
* Auto-recovery is enabled by default.
*/
DQLITE_API int dqlite_node_set_auto_recovery(dqlite_node *n, bool enabled);
/**
* Enable or disable raft snapshot compression.
*/
DQLITE_API int dqlite_node_set_snapshot_compression(dqlite_node *n,
bool enabled);
/**
* Enable automatic role management on the server side for this node.
*
* When automatic role management is enabled, servers in a dqlite cluster will
* autonomously (without client intervention) promote and demote each other
* to maintain a specified number of voters and standbys, taking into account
* the health, failure domain, and weight of each server.
*
* By default, no automatic role management is performed.
*/
DQLITE_API int dqlite_node_enable_role_management(dqlite_node *n);
/**
* Start a dqlite node.
*
* A background thread will be spawned which will run the node's main loop. If
* this function returns successfully, the dqlite node is ready to accept new
* connections.
*/
DQLITE_API int dqlite_node_start(dqlite_node *n);
/**
* Attempt to hand over this node's privileges to other nodes in preparation
* for a graceful shutdown.
*
* Specifically, if this node is the cluster leader, this will cause another
* voting node (if one exists) to be elected leader; then, if this node is a
* voter, another non-voting node (if one exists) will be promoted to voter, and
* then this node will be demoted to spare.
*
* This function returns 0 if all privileges were handed over successfully,
* and nonzero otherwise. Callers can continue to dqlite_node_stop immediately
* after this function returns (whether or not it succeeded), or include their
* own graceful shutdown logic before dqlite_node_stop.
*/
DQLITE_API int dqlite_node_handover(dqlite_node *n);
/**
* Stop a dqlite node.
*
* The background thread running the main loop will be notified and the node
* will not accept any new client connections. Once inflight requests are
* completed, open client connections get closed and then the thread exits.
*/
DQLITE_API int dqlite_node_stop(dqlite_node *n);
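/*
 * Illustrative sketch (not part of the dqlite API docs) of the dqlite_node
 * lifecycle described above; the ID, address and data directory are
 * placeholder values.
 *
 *     #include <dqlite.h>
 *     #include <stdio.h>
 *
 *     int run_node(void)
 *     {
 *             dqlite_node *node;
 *             int rv = dqlite_node_create(1, "127.0.0.1:9001",
 *                                         "/var/lib/myapp", &node);
 *             if (rv != 0) {
 *                     fprintf(stderr, "create: %s\n", dqlite_node_errmsg(node));
 *                     goto out;
 *             }
 *             rv = dqlite_node_set_bind_address(node, "127.0.0.1:9001");
 *             if (rv != 0) {
 *                     goto out;
 *             }
 *             rv = dqlite_node_start(node);
 *             if (rv != 0) {
 *                     goto out;
 *             }
 *             // ... node is serving; on shutdown:
 *             (void)dqlite_node_handover(node);
 *             rv = dqlite_node_stop(node);
 *     out:
 *             dqlite_node_destroy(node);
 *             return rv;
 *     }
 */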
struct dqlite_node_info
{
dqlite_node_id id;
const char *address;
};
typedef struct dqlite_node_info dqlite_node_info;
/* Defined to be an extensible struct; future additions to this struct should be
 * 64 bits wide, and 0 should not be used as a valid value. */
struct dqlite_node_info_ext
{
uint64_t size; /* The size of this struct */
uint64_t id; /* dqlite_node_id */
uint64_t address;
uint64_t dqlite_role;
};
typedef struct dqlite_node_info_ext dqlite_node_info_ext;
#define DQLITE_NODE_INFO_EXT_SZ_ORIG 32U /* (4 * 64) / 8 */
/**
* !!! Deprecated, use `dqlite_node_recover_ext` instead which also includes
* dqlite roles. !!!
*
* Force recovering a dqlite node which is part of a cluster whose majority of
* nodes have died, and therefore has become unavailable.
*
* In order for this operation to be safe you must follow these steps:
*
* 1. Make sure no dqlite node in the cluster is running.
*
* 2. Identify all dqlite nodes that have survived and that you want to be part
* of the recovered cluster.
*
 * 3. Among the surviving dqlite nodes, find the one with the most up-to-date
* raft term and log.
*
* 4. Invoke @dqlite_node_recover exactly one time, on the node you found in
* step 3, and pass it an array of #dqlite_node_info filled with the IDs and
 * addresses of the surviving nodes, including the one being recovered.
*
* 5. Copy the data directory of the node you ran @dqlite_node_recover on to all
* other non-dead nodes in the cluster, replacing their current data
* directory.
*
* 6. Restart all nodes.
*/
DQLITE_API int dqlite_node_recover(dqlite_node *n,
dqlite_node_info infos[],
int n_info);
/**
* Force recovering a dqlite node which is part of a cluster whose majority of
* nodes have died, and therefore has become unavailable.
*
* In order for this operation to be safe you must follow these steps:
*
* 1. Make sure no dqlite node in the cluster is running.
*
* 2. Identify all dqlite nodes that have survived and that you want to be part
* of the recovered cluster.
*
 * 3. Among the surviving dqlite nodes, find the one with the most up-to-date
* raft term and log.
*
* 4. Invoke @dqlite_node_recover_ext exactly one time, on the node you found in
 * step 3, and pass it an array of #dqlite_node_info_ext filled with the IDs,
 * addresses and roles of the surviving nodes, including the one being
* recovered.
*
* 5. Copy the data directory of the node you ran @dqlite_node_recover_ext on to
* all other non-dead nodes in the cluster, replacing their current data
* directory.
*
* 6. Restart all nodes.
*/
DQLITE_API int dqlite_node_recover_ext(dqlite_node *n,
dqlite_node_info_ext infos[],
int n_info);
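/*
 * Illustrative sketch of step 4 above for a two-node recovery. Two
 * assumptions are worth flagging: the address field is taken to carry a
 * pointer to the address string cast to uint64_t (matching the 64-bit
 * extensible-struct convention noted above), and the role encoding is
 * assumed to follow the dqlite client protocol (0 = voter, 1 = standby,
 * 2 = spare); this header defines no named constants for roles.
 *
 *     struct dqlite_node_info_ext infos[2] = {
 *             {
 *                     .size = DQLITE_NODE_INFO_EXT_SZ_ORIG,
 *                     .id = 1,
 *                     .address = (uint64_t)(uintptr_t)"10.0.0.1:9001",
 *                     .dqlite_role = 0, // voter (assumed encoding)
 *             },
 *             {
 *                     .size = DQLITE_NODE_INFO_EXT_SZ_ORIG,
 *                     .id = 2,
 *                     .address = (uint64_t)(uintptr_t)"10.0.0.2:9001",
 *                     .dqlite_role = 0, // voter (assumed encoding)
 *             },
 *     };
 *     int rv = dqlite_node_recover_ext(node, infos, 2);
 */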
/**
* Return a human-readable description of the last error occurred.
*/
DQLITE_API const char *dqlite_node_errmsg(dqlite_node *n);
/**
* Generate a unique ID for the given address.
*/
DQLITE_API dqlite_node_id dqlite_generate_node_id(const char *address);
/**
* This function is DEPRECATED and will be removed in a future major release.
*
* Initialize the given SQLite VFS interface object with dqlite's custom
* implementation, which can be used for replication.
*/
DQLITE_API int dqlite_vfs_init(sqlite3_vfs *vfs, const char *name);
DQLITE_API int dqlite_vfs_enable_disk(sqlite3_vfs *vfs);
/**
* This function is DEPRECATED and will be removed in a future major release.
*
* Release all memory used internally by a SQLite VFS object that was
 * initialized using @dqlite_vfs_init.
*/
DQLITE_API void dqlite_vfs_close(sqlite3_vfs *vfs);
/**
* This type is DEPRECATED and will be removed in a future major release.
*
* A single WAL frame to be replicated.
*/
struct dqlite_vfs_frame
{
unsigned long page_number; /* Database page number. */
void *data; /* Content of the database page. */
};
typedef struct dqlite_vfs_frame dqlite_vfs_frame;
/**
* This function is DEPRECATED and will be removed in a future major release.
*
* Check if the last call to sqlite3_step() has triggered a write transaction on
* the database with the given filename. In that case acquire a WAL write lock
* to prevent further write transactions, and return all new WAL frames
* generated by the transaction. These frames are meant to be replicated across
* nodes and then actually added to the WAL with dqlite_vfs_apply() once a
* quorum is reached. If a quorum is not reached within a given time, then
* dqlite_vfs_abort() can be used to abort and release the WAL write lock.
*/
DQLITE_API int dqlite_vfs_poll(sqlite3_vfs *vfs,
const char *filename,
dqlite_vfs_frame **frames,
unsigned *n);
/**
* This function is DEPRECATED and will be removed in a future major release.
*
* Add to the WAL all frames that were generated by a write transaction
* triggered by sqlite3_step() and that were obtained via dqlite_vfs_poll().
*
* This interface is designed to match the typical use case of a node receiving
* the frames by sequentially reading a byte stream from a network socket and
 * passing the data to this routine directly without any copy or further
 * allocation, except possibly for integer encoding/decoding.
*/
DQLITE_API int dqlite_vfs_apply(sqlite3_vfs *vfs,
const char *filename,
unsigned n,
unsigned long *page_numbers,
void *frames);
/**
* This function is DEPRECATED and will be removed in a future major release.
*
* Abort a pending write transaction that was triggered by sqlite3_step() and
* whose frames were obtained via dqlite_vfs_poll().
*
* This should be called if the transaction could not be safely replicated. In
* particular it will release the write lock acquired by dqlite_vfs_poll().
*/
DQLITE_API int dqlite_vfs_abort(sqlite3_vfs *vfs, const char *filename);
/**
* This function is DEPRECATED and will be removed in a future major release.
*
* Return a snapshot of the main database file and of the WAL file.
*/
DQLITE_API int dqlite_vfs_snapshot(sqlite3_vfs *vfs,
const char *filename,
void **data,
size_t *n);
/**
* This type is DEPRECATED and will be removed in a future major release.
*
* A data buffer.
*/
struct dqlite_buffer
{
void *base; /* Pointer to the buffer data. */
size_t len; /* Length of the buffer. */
};
/**
* This function is DEPRECATED and will be removed in a future major release.
*
* Return a shallow snapshot of the main database file and of the WAL file.
* Expects a bufs array of size x + 1, where x is obtained from
* `dqlite_vfs_num_pages`.
*/
DQLITE_API int dqlite_vfs_shallow_snapshot(sqlite3_vfs *vfs,
const char *filename,
struct dqlite_buffer bufs[],
unsigned n);
/**
* This function is DEPRECATED and will be removed in a future major release.
*/
DQLITE_API int dqlite_vfs_snapshot_disk(sqlite3_vfs *vfs,
const char *filename,
struct dqlite_buffer bufs[],
unsigned n);
/**
* This function is DEPRECATED and will be removed in a future major release.
*
* Return the number of database pages (excluding WAL).
*/
DQLITE_API int dqlite_vfs_num_pages(sqlite3_vfs *vfs,
const char *filename,
unsigned *n);
/**
* This function is DEPRECATED and will be removed in a future major release.
*
* Restore a snapshot of the main database file and of the WAL file.
*/
DQLITE_API int dqlite_vfs_restore(sqlite3_vfs *vfs,
const char *filename,
const void *data,
size_t n);
/**
* This function is DEPRECATED and will be removed in a future major release.
*
* Restore a snapshot of the main database file and of the WAL file.
*/
DQLITE_API int dqlite_vfs_restore_disk(sqlite3_vfs *vfs,
const char *filename,
const void *data,
size_t main_size,
size_t wal_size);
#endif /* DQLITE_H */
dqlite-1.16.7/m4/ 0000775 0000000 0000000 00000000000 14652527134 0013421 5 ustar 00root root 0000000 0000000 dqlite-1.16.7/m4/.gitignore 0000664 0000000 0000000 00000000377 14652527134 0015420 0 ustar 00root root 0000000 0000000 *.m4
!attributes.m4
!ax_ac_append_to_file.m4
!ax_ac_print_to_file.m4
!ax_add_am_macro_static.m4
!ax_am_macros_static.m4
!ax_check_compile_flag.m4
!ax_check_gnu_make.m4
!ax_code_coverage.m4
!ax_compare_version.m4
!ax_file_escapes.m4
!ax_pthread.m4
!pkg.m4
dqlite-1.16.7/m4/attributes.m4 0000664 0000000 0000000 00000024021 14652527134 0016050 0 ustar 00root root 0000000 0000000 dnl Macros to check the presence of generic (non-typed) symbols.
dnl Copyright (c) 2006-2008 Diego Pettenò
dnl Copyright (c) 2006-2008 xine project
dnl Copyright (c) 2012 Lucas De Marchi
dnl
dnl This program is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU General Public License as published by
dnl the Free Software Foundation; either version 2, or (at your option)
dnl any later version.
dnl
dnl This program is distributed in the hope that it will be useful,
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
dnl GNU General Public License for more details.
dnl
dnl You should have received a copy of the GNU General Public License
dnl along with this program; if not, write to the Free Software
dnl Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
dnl 02110-1301, USA.
dnl
dnl As a special exception, the copyright owners of the
dnl macro gives unlimited permission to copy, distribute and modify the
dnl configure scripts that are the output of Autoconf when processing the
dnl Macro. You need not follow the terms of the GNU General Public
dnl License when using or distributing such scripts, even though portions
dnl of the text of the Macro appear in them. The GNU General Public
dnl License (GPL) does govern all other use of the material that
dnl constitutes the Autoconf Macro.
dnl
dnl This special exception to the GPL applies to versions of the
dnl Autoconf Macro released by this project. When you make and
dnl distribute a modified version of the Autoconf Macro, you may extend
dnl this special exception to the GPL to apply to your modified version as
dnl well.
dnl Check if FLAG in ENV-VAR is supported by compiler and append it
dnl to WHERE-TO-APPEND variable. Note that we invert -Wno-* checks to
dnl -W* as gcc cannot test for negated warnings. If a C snippet is passed,
dnl use it, otherwise use a simple main() definition that just returns 0.
dnl CC_CHECK_FLAG_APPEND([WHERE-TO-APPEND], [ENV-VAR], [FLAG], [C-SNIPPET])
AC_DEFUN([CC_CHECK_FLAG_APPEND], [
AC_CACHE_CHECK([if $CC supports flag $3 in envvar $2],
AS_TR_SH([cc_cv_$2_$3]),
[eval "AS_TR_SH([cc_save_$2])='${$2}'"
eval "AS_TR_SH([$2])='${cc_save_$2} -Werror `echo "$3" | sed 's/^-Wno-/-W/'`'"
AC_LINK_IFELSE([AC_LANG_SOURCE(ifelse([$4], [],
[int main(void) { return 0; } ],
[$4]))],
[eval "AS_TR_SH([cc_cv_$2_$3])='yes'"],
[eval "AS_TR_SH([cc_cv_$2_$3])='no'"])
eval "AS_TR_SH([$2])='$cc_save_$2'"])
AS_IF([eval test x$]AS_TR_SH([cc_cv_$2_$3])[ = xyes],
[eval "$1='${$1} $3'"])
])
dnl CC_CHECK_FLAGS_APPEND([WHERE-TO-APPEND], [ENV-VAR], [FLAG1 FLAG2], [C-SNIPPET])
AC_DEFUN([CC_CHECK_FLAGS_APPEND], [
for flag in [$3]; do
CC_CHECK_FLAG_APPEND([$1], [$2], $flag, [$4])
done
])
dnl Check if the flag is supported by linker (cacheable)
dnl CC_CHECK_LDFLAGS([FLAG], [ACTION-IF-FOUND],[ACTION-IF-NOT-FOUND])
AC_DEFUN([CC_CHECK_LDFLAGS], [
AC_CACHE_CHECK([if $CC supports $1 flag],
AS_TR_SH([cc_cv_ldflags_$1]),
[ac_save_LDFLAGS="$LDFLAGS"
LDFLAGS="$LDFLAGS $1"
AC_LINK_IFELSE([int main() { return 1; }],
[eval "AS_TR_SH([cc_cv_ldflags_$1])='yes'"],
[eval "AS_TR_SH([cc_cv_ldflags_$1])="])
LDFLAGS="$ac_save_LDFLAGS"
])
AS_IF([eval test x$]AS_TR_SH([cc_cv_ldflags_$1])[ = xyes],
[$2], [$3])
])
dnl define the LDFLAGS_NOUNDEFINED variable with the correct value for
dnl the current linker to avoid undefined references in a shared object.
AC_DEFUN([CC_NOUNDEFINED], [
dnl We check $host for which systems to enable this for.
AC_REQUIRE([AC_CANONICAL_HOST])
case $host in
dnl FreeBSD (et al.) does not complete linking for shared objects when pthreads
dnl are requested, as different implementations are present; to avoid problems
dnl use -Wl,-z,defs only for those platform not behaving this way.
*-freebsd* | *-openbsd*) ;;
*)
dnl First of all check for the --no-undefined variant of GNU ld. This allows
dnl for a much more readable command line, so that people can understand what
dnl it does without going to look for what the heck -z defs does.
for possible_flags in "-Wl,--no-undefined" "-Wl,-z,defs"; do
CC_CHECK_LDFLAGS([$possible_flags], [LDFLAGS_NOUNDEFINED="$possible_flags"])
break
done
;;
esac
AC_SUBST([LDFLAGS_NOUNDEFINED])
])
dnl Check for a -Werror flag or equivalent. -Werror is the GCC
dnl and ICC flag that tells the compiler to treat all the warnings
dnl as fatal. We usually need this option to make sure that some
dnl constructs (like attributes) are not simply ignored.
dnl
dnl Other compilers don't support -Werror per se, but they support
dnl an equivalent flag:
dnl - Sun Studio compiler supports -errwarn=%all
AC_DEFUN([CC_CHECK_WERROR], [
AC_CACHE_CHECK(
[for $CC way to treat warnings as errors],
[cc_cv_werror],
[CC_CHECK_CFLAGS_SILENT([-Werror], [cc_cv_werror=-Werror],
[CC_CHECK_CFLAGS_SILENT([-errwarn=%all], [cc_cv_werror=-errwarn=%all])])
])
])
AC_DEFUN([CC_CHECK_ATTRIBUTE], [
AC_REQUIRE([CC_CHECK_WERROR])
AC_CACHE_CHECK([if $CC supports __attribute__(( ifelse([$2], , [$1], [$2]) ))],
AS_TR_SH([cc_cv_attribute_$1]),
[ac_save_CFLAGS="$CFLAGS"
CFLAGS="$CFLAGS $cc_cv_werror"
AC_COMPILE_IFELSE([AC_LANG_SOURCE([$3])],
[eval "AS_TR_SH([cc_cv_attribute_$1])='yes'"],
[eval "AS_TR_SH([cc_cv_attribute_$1])='no'"])
CFLAGS="$ac_save_CFLAGS"
])
AS_IF([eval test x$]AS_TR_SH([cc_cv_attribute_$1])[ = xyes],
[AC_DEFINE(
AS_TR_CPP([SUPPORT_ATTRIBUTE_$1]), 1,
[Define this if the compiler supports __attribute__(( ifelse([$2], , [$1], [$2]) ))]
)
$4],
[$5])
])
AC_DEFUN([CC_ATTRIBUTE_CONSTRUCTOR], [
CC_CHECK_ATTRIBUTE(
[constructor],,
[void __attribute__((constructor)) ctor() { int a; }],
[$1], [$2])
])
AC_DEFUN([CC_ATTRIBUTE_FORMAT], [
CC_CHECK_ATTRIBUTE(
[format], [format(printf, n, n)],
[void __attribute__((format(printf, 1, 2))) printflike(const char *fmt, ...) { fmt = (void *)0; }],
[$1], [$2])
])
AC_DEFUN([CC_ATTRIBUTE_FORMAT_ARG], [
CC_CHECK_ATTRIBUTE(
[format_arg], [format_arg(printf)],
[char *__attribute__((format_arg(1))) gettextlike(const char *fmt) { fmt = (void *)0; }],
[$1], [$2])
])
AC_DEFUN([CC_ATTRIBUTE_VISIBILITY], [
CC_CHECK_ATTRIBUTE(
[visibility_$1], [visibility("$1")],
[void __attribute__((visibility("$1"))) $1_function() { }],
[$2], [$3])
])
AC_DEFUN([CC_ATTRIBUTE_NONNULL], [
CC_CHECK_ATTRIBUTE(
[nonnull], [nonnull()],
[void __attribute__((nonnull())) some_function(void *foo, void *bar) { foo = (void*)0; bar = (void*)0; }],
[$1], [$2])
])
AC_DEFUN([CC_ATTRIBUTE_UNUSED], [
CC_CHECK_ATTRIBUTE(
[unused], ,
[void some_function(void *foo, __attribute__((unused)) void *bar);],
[$1], [$2])
])
AC_DEFUN([CC_ATTRIBUTE_SENTINEL], [
CC_CHECK_ATTRIBUTE(
[sentinel], ,
[void some_function(void *foo, ...) __attribute__((sentinel));],
[$1], [$2])
])
AC_DEFUN([CC_ATTRIBUTE_DEPRECATED], [
CC_CHECK_ATTRIBUTE(
[deprecated], ,
[void some_function(void *foo, ...) __attribute__((deprecated));],
[$1], [$2])
])
AC_DEFUN([CC_ATTRIBUTE_ALIAS], [
CC_CHECK_ATTRIBUTE(
[alias], [weak, alias],
[void other_function(void *foo) { }
void some_function(void *foo) __attribute__((weak, alias("other_function")));],
[$1], [$2])
])
AC_DEFUN([CC_ATTRIBUTE_MALLOC], [
CC_CHECK_ATTRIBUTE(
[malloc], ,
[void * __attribute__((malloc)) my_alloc(int n);],
[$1], [$2])
])
AC_DEFUN([CC_ATTRIBUTE_PACKED], [
CC_CHECK_ATTRIBUTE(
[packed], ,
[struct astructure { char a; int b; long c; void *d; } __attribute__((packed));],
[$1], [$2])
])
AC_DEFUN([CC_ATTRIBUTE_CONST], [
CC_CHECK_ATTRIBUTE(
[const], ,
[int __attribute__((const)) twopow(int n) { return 1 << n; } ],
[$1], [$2])
])
AC_DEFUN([CC_FLAG_VISIBILITY], [
AC_REQUIRE([CC_CHECK_WERROR])
AC_CACHE_CHECK([if $CC supports -fvisibility=hidden],
[cc_cv_flag_visibility],
[cc_flag_visibility_save_CFLAGS="$CFLAGS"
CFLAGS="$CFLAGS $cc_cv_werror"
CC_CHECK_CFLAGS_SILENT([-fvisibility=hidden],
cc_cv_flag_visibility='yes',
cc_cv_flag_visibility='no')
CFLAGS="$cc_flag_visibility_save_CFLAGS"])
AS_IF([test "x$cc_cv_flag_visibility" = "xyes"],
[AC_DEFINE([SUPPORT_FLAG_VISIBILITY], 1,
[Define this if the compiler supports the -fvisibility flag])
$1],
[$2])
])
AC_DEFUN([CC_FUNC_EXPECT], [
AC_REQUIRE([CC_CHECK_WERROR])
AC_CACHE_CHECK([if compiler has __builtin_expect function],
[cc_cv_func_expect],
[ac_save_CFLAGS="$CFLAGS"
CFLAGS="$CFLAGS $cc_cv_werror"
AC_COMPILE_IFELSE([AC_LANG_SOURCE(
[int some_function() {
int a = 3;
return (int)__builtin_expect(a, 3);
}])],
[cc_cv_func_expect=yes],
[cc_cv_func_expect=no])
CFLAGS="$ac_save_CFLAGS"
])
AS_IF([test "x$cc_cv_func_expect" = "xyes"],
[AC_DEFINE([SUPPORT__BUILTIN_EXPECT], 1,
[Define this if the compiler supports __builtin_expect() function])
$1],
[$2])
])
AC_DEFUN([CC_ATTRIBUTE_ALIGNED], [
AC_REQUIRE([CC_CHECK_WERROR])
AC_CACHE_CHECK([highest __attribute__ ((aligned ())) supported],
[cc_cv_attribute_aligned],
[ac_save_CFLAGS="$CFLAGS"
CFLAGS="$CFLAGS $cc_cv_werror"
for cc_attribute_align_try in 64 32 16 8 4 2; do
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
int main() {
static char c __attribute__ ((aligned($cc_attribute_align_try))) = 0;
return c;
}])], [cc_cv_attribute_aligned=$cc_attribute_align_try; break])
done
CFLAGS="$ac_save_CFLAGS"
])
if test "x$cc_cv_attribute_aligned" != "x"; then
AC_DEFINE_UNQUOTED([ATTRIBUTE_ALIGNED_MAX], [$cc_cv_attribute_aligned],
[Define the highest alignment supported])
fi
])
dqlite-1.16.7/m4/ax_ac_append_to_file.m4 0000664 0000000 0000000 00000001622 14652527134 0017767 0 ustar 00root root 0000000 0000000 # ===========================================================================
# https://www.gnu.org/software/autoconf-archive/ax_ac_append_to_file.html
# ===========================================================================
#
# SYNOPSIS
#
# AX_AC_APPEND_TO_FILE([FILE],[DATA])
#
# DESCRIPTION
#
# Appends the specified data to the specified file when Autoconf is run. If you want
# to append to a file when configure is run use AX_APPEND_TO_FILE instead.
#
# LICENSE
#
# Copyright (c) 2009 Allan Caffee
#
# Copying and distribution of this file, with or without modification, are
# permitted in any medium without royalty provided the copyright notice
# and this notice are preserved. This file is offered as-is, without any
# warranty.
#serial 10
AC_DEFUN([AX_AC_APPEND_TO_FILE],[
AC_REQUIRE([AX_FILE_ESCAPES])
m4_esyscmd(
AX_FILE_ESCAPES
[
printf "%s" "$2" >> "$1"
])
])
dqlite-1.16.7/m4/ax_ac_print_to_file.m4 0000664 0000000 0000000 00000001611 14652527134 0017652 0 ustar 00root root 0000000 0000000 # ===========================================================================
# https://www.gnu.org/software/autoconf-archive/ax_ac_print_to_file.html
# ===========================================================================
#
# SYNOPSIS
#
# AX_AC_PRINT_TO_FILE([FILE],[DATA])
#
# DESCRIPTION
#
# Writes the specified data to the specified file when Autoconf is run. If
# you want to print to a file when configure is run use AX_PRINT_TO_FILE
# instead.
#
# LICENSE
#
# Copyright (c) 2009 Allan Caffee
#
# Copying and distribution of this file, with or without modification, are
# permitted in any medium without royalty provided the copyright notice
# and this notice are preserved. This file is offered as-is, without any
# warranty.
#serial 10
AC_DEFUN([AX_AC_PRINT_TO_FILE],[
m4_esyscmd(
AC_REQUIRE([AX_FILE_ESCAPES])
[
printf "%s" "$2" > "$1"
])
])
dqlite-1.16.7/m4/ax_add_am_macro_static.m4 0000664 0000000 0000000 00000001525 14652527134 0020313 0 ustar 00root root 0000000 0000000 # ===========================================================================
# https://www.gnu.org/software/autoconf-archive/ax_add_am_macro_static.html
# ===========================================================================
#
# SYNOPSIS
#
# AX_ADD_AM_MACRO_STATIC([RULE])
#
# DESCRIPTION
#
# Adds the specified rule to $AMINCLUDE.
#
# LICENSE
#
# Copyright (c) 2009 Tom Howard
# Copyright (c) 2009 Allan Caffee
#
# Copying and distribution of this file, with or without modification, are
# permitted in any medium without royalty provided the copyright notice
# and this notice are preserved. This file is offered as-is, without any
# warranty.
#serial 8
AC_DEFUN([AX_ADD_AM_MACRO_STATIC],[
AC_REQUIRE([AX_AM_MACROS_STATIC])
AX_AC_APPEND_TO_FILE(AMINCLUDE_STATIC,[$1])
])
dqlite-1.16.7/m4/ax_am_macros_static.m4 0000664 0000000 0000000 00000002125 14652527134 0017663 0 ustar 00root root 0000000 0000000 # ===========================================================================
# https://www.gnu.org/software/autoconf-archive/ax_am_macros_static.html
# ===========================================================================
#
# SYNOPSIS
#
# AX_AM_MACROS_STATIC
#
# DESCRIPTION
#
# Adds support for macros that create Automake rules. You must manually
# add the following line
#
# include $(top_srcdir)/aminclude_static.am
#
# to your Makefile.am files.
#
# LICENSE
#
# Copyright (c) 2009 Tom Howard
# Copyright (c) 2009 Allan Caffee
#
# Copying and distribution of this file, with or without modification, are
# permitted in any medium without royalty provided the copyright notice
# and this notice are preserved. This file is offered as-is, without any
# warranty.
#serial 11
AC_DEFUN([AMINCLUDE_STATIC],[aminclude_static.am])
AC_DEFUN([AX_AM_MACROS_STATIC],
[
AX_AC_PRINT_TO_FILE(AMINCLUDE_STATIC,[
# ]AMINCLUDE_STATIC[ generated automatically by Autoconf
# from AX_AM_MACROS_STATIC on ]m4_esyscmd([LC_ALL=C date])[
])
])
dqlite-1.16.7/m4/ax_check_compile_flag.m4 0000664 0000000 0000000 00000004070 14652527134 0020132 0 ustar 00root root 0000000 0000000 # ===========================================================================
# https://www.gnu.org/software/autoconf-archive/ax_check_compile_flag.html
# ===========================================================================
#
# SYNOPSIS
#
# AX_CHECK_COMPILE_FLAG(FLAG, [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS], [INPUT])
#
# DESCRIPTION
#
# Check whether the given FLAG works with the current language's compiler
# or gives an error. (Warnings, however, are ignored)
#
# ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on
# success/failure.
#
# If EXTRA-FLAGS is defined, it is added to the current language's default
# flags (e.g. CFLAGS) when the check is done. The check is thus made with
# the flags: "CFLAGS EXTRA-FLAGS FLAG". This can for example be used to
# force the compiler to issue an error when a bad flag is given.
#
# INPUT gives an alternative input source to AC_COMPILE_IFELSE.
#
# NOTE: Implementation based on AX_CFLAGS_GCC_OPTION. Please keep this
# macro in sync with AX_CHECK_{PREPROC,LINK}_FLAG.
#
# LICENSE
#
# Copyright (c) 2008 Guido U. Draheim
# Copyright (c) 2011 Maarten Bosmans
#
# Copying and distribution of this file, with or without modification, are
# permitted in any medium without royalty provided the copyright notice
# and this notice are preserved. This file is offered as-is, without any
# warranty.
#serial 6
AC_DEFUN([AX_CHECK_COMPILE_FLAG],
[AC_PREREQ(2.64)dnl for _AC_LANG_PREFIX and AS_VAR_IF
AS_VAR_PUSHDEF([CACHEVAR],[ax_cv_check_[]_AC_LANG_ABBREV[]flags_$4_$1])dnl
AC_CACHE_CHECK([whether _AC_LANG compiler accepts $1], CACHEVAR, [
ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS
_AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $4 $1"
AC_COMPILE_IFELSE([m4_default([$5],[AC_LANG_PROGRAM()])],
[AS_VAR_SET(CACHEVAR,[yes])],
[AS_VAR_SET(CACHEVAR,[no])])
_AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags])
AS_VAR_IF(CACHEVAR,yes,
[m4_default([$2], :)],
[m4_default([$3], :)])
AS_VAR_POPDEF([CACHEVAR])dnl
])dnl AX_CHECK_COMPILE_FLAGS
dqlite-1.16.7/m4/ax_check_gnu_make.m4 0000664 0000000 0000000 00000007727 14652527134 0017313 0 ustar 00root root 0000000 0000000 # ===========================================================================
# https://www.gnu.org/software/autoconf-archive/ax_check_gnu_make.html
# ===========================================================================
#
# SYNOPSIS
#
# AX_CHECK_GNU_MAKE([run-if-true],[run-if-false])
#
# DESCRIPTION
#
# This macro searches for a GNU version of make. If a match is found:
#
# * The makefile variable `ifGNUmake' is set to the empty string, otherwise
# it is set to "#". This is useful for including special features in a
# Makefile, which cannot be handled by other versions of make.
# * The makefile variable `ifnGNUmake' is set to "#", otherwise
# it is set to the empty string. This is useful for including special
# features in a Makefile, which can be handled
# by other versions of make, or to specify an else-like clause.
# * The variable `_cv_gnu_make_command` is set to the command to invoke
# GNU make if it exists, the empty string otherwise.
# * The variable `ax_cv_gnu_make_command` is set to the command to invoke
# GNU make by copying `_cv_gnu_make_command`, otherwise it is unset.
# * If GNU Make is found, its version is extracted from the output of
# `make --version` as the last field of a record of space-separated
# columns and saved into the variable `ax_check_gnu_make_version`.
# * Additionally if GNU Make is found, run shell code run-if-true
# else run shell code run-if-false.
#
# Here is an example of its use:
#
# Makefile.in might contain:
#
# # A failsafe way of putting a dependency rule into a makefile
# $(DEPEND):
# $(CC) -MM $(srcdir)/*.c > $(DEPEND)
#
# @ifGNUmake@ ifeq ($(DEPEND),$(wildcard $(DEPEND)))
# @ifGNUmake@ include $(DEPEND)
# @ifGNUmake@ else
# fallback code
# @ifGNUmake@ endif
#
# Then configure.in would normally contain:
#
# AX_CHECK_GNU_MAKE()
# AC_OUTPUT(Makefile)
#
# Then perhaps to cause gnu make to override any other make, we could do
# something like this (note that GNU make always looks for GNUmakefile
# first):
#
# if ! test x$_cv_gnu_make_command = x ; then
# mv Makefile GNUmakefile
# echo .DEFAULT: > Makefile ;
# echo \ $_cv_gnu_make_command \$@ >> Makefile;
# fi
#
# Then, if any (well almost any) other make is called, and GNU make also
# exists, then the other make wraps the GNU make.
#
# LICENSE
#
# Copyright (c) 2008 John Darrington
# Copyright (c) 2015 Enrico M. Crisostomo
#
# Copying and distribution of this file, with or without modification, are
# permitted in any medium without royalty provided the copyright notice
# and this notice are preserved. This file is offered as-is, without any
# warranty.
#serial 12
AC_DEFUN([AX_CHECK_GNU_MAKE],dnl
[AC_PROG_AWK
AC_CACHE_CHECK([for GNU make],[_cv_gnu_make_command],[dnl
_cv_gnu_make_command="" ;
dnl Search all the common names for GNU make
for a in "$MAKE" make gmake gnumake ; do
if test -z "$a" ; then continue ; fi ;
if "$a" --version 2> /dev/null | grep GNU 2>&1 > /dev/null ; then
_cv_gnu_make_command=$a ;
AX_CHECK_GNU_MAKE_HEADLINE=$("$a" --version 2> /dev/null | grep "GNU Make")
ax_check_gnu_make_version=$(echo ${AX_CHECK_GNU_MAKE_HEADLINE} | ${AWK} -F " " '{ print $(NF); }')
break ;
fi
done ;])
dnl If there was a GNU version, then set @ifGNUmake@ to the empty string, '#' otherwise
AS_VAR_IF([_cv_gnu_make_command], [""], [AS_VAR_SET([ifGNUmake], ["#"])], [AS_VAR_SET([ifGNUmake], [""])])
AS_VAR_IF([_cv_gnu_make_command], [""], [AS_VAR_SET([ifnGNUmake], [""])], [AS_VAR_SET([ifnGNUmake], ["#"])])
AS_VAR_IF([_cv_gnu_make_command], [""], [AS_UNSET(ax_cv_gnu_make_command)], [AS_VAR_SET([ax_cv_gnu_make_command], [${_cv_gnu_make_command}])])
AS_VAR_IF([_cv_gnu_make_command], [""],[$2],[$1])
AC_SUBST([ifGNUmake])
AC_SUBST([ifnGNUmake])
])
dqlite-1.16.7/m4/ax_code_coverage.m4 0000664 0000000 0000000 00000027616 14652527134 0017154 0 ustar 00root root 0000000 0000000 # ===========================================================================
# https://www.gnu.org/software/autoconf-archive/ax_code_coverage.html
# ===========================================================================
#
# SYNOPSIS
#
# AX_CODE_COVERAGE()
#
# DESCRIPTION
#
# Defines CODE_COVERAGE_CPPFLAGS, CODE_COVERAGE_CFLAGS,
# CODE_COVERAGE_CXXFLAGS and CODE_COVERAGE_LIBS which should be included
# in the CPPFLAGS, CFLAGS CXXFLAGS and LIBS/LIBADD variables of every
# build target (program or library) which should be built with code
# coverage support. Also add rules using AX_ADD_AM_MACRO_STATIC; and
# $enable_code_coverage which can be used in subsequent configure output.
# CODE_COVERAGE_ENABLED is defined and substituted, and corresponds to the
# value of the --enable-code-coverage option, which defaults to being
# disabled.
#
# Test also for gcov program and create GCOV variable that could be
# substituted.
#
# Note that all optimization flags in CFLAGS must be disabled when code
# coverage is enabled.
#
# Usage example:
#
# configure.ac:
#
# AX_CODE_COVERAGE
#
# Makefile.am:
#
# include $(top_srcdir)/aminclude_static.am
#
# my_program_LIBS = ... $(CODE_COVERAGE_LIBS) ...
# my_program_CPPFLAGS = ... $(CODE_COVERAGE_CPPFLAGS) ...
# my_program_CFLAGS = ... $(CODE_COVERAGE_CFLAGS) ...
# my_program_CXXFLAGS = ... $(CODE_COVERAGE_CXXFLAGS) ...
#
# clean-local: code-coverage-clean
# distclean-local: code-coverage-dist-clean
#
# This results in a "check-code-coverage" rule being added to any
# Makefile.am which do "include $(top_srcdir)/aminclude_static.am"
# (assuming the module has been configured with --enable-code-coverage).
# Running `make check-code-coverage` in that directory will run the
# module's test suite (`make check`) and build a code coverage report
# detailing the code which was touched, then print the URI for the report.
#
# This code was derived from Makefile.decl in GLib, originally licensed
# under LGPLv2.1+.
#
# LICENSE
#
# Copyright (c) 2012, 2016 Philip Withnall
# Copyright (c) 2012 Xan Lopez
# Copyright (c) 2012 Christian Persch
# Copyright (c) 2012 Paolo Borelli
# Copyright (c) 2012 Dan Winship
# Copyright (c) 2015,2018 Bastien ROUCARIES
#
# This library is free software; you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 2.1 of the License, or (at
# your option) any later version.
#
# This library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
# General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#serial 34
m4_define(_AX_CODE_COVERAGE_RULES,[
AX_ADD_AM_MACRO_STATIC([
# Code coverage
#
# Optional:
# - CODE_COVERAGE_DIRECTORY: Top-level directory for code coverage reporting.
# Multiple directories may be specified, separated by whitespace.
# (Default: \$(top_builddir))
# - CODE_COVERAGE_OUTPUT_FILE: Filename and path for the .info file generated
# by lcov for code coverage. (Default:
# \$(PACKAGE_NAME)-\$(PACKAGE_VERSION)-coverage.info)
# - CODE_COVERAGE_OUTPUT_DIRECTORY: Directory for generated code coverage
# reports to be created. (Default:
# \$(PACKAGE_NAME)-\$(PACKAGE_VERSION)-coverage)
# - CODE_COVERAGE_BRANCH_COVERAGE: Set to 1 to enforce branch coverage,
# set to 0 to disable it and leave empty to stay with the default.
# (Default: empty)
# - CODE_COVERAGE_LCOV_SHOPTS_DEFAULT: Extra options shared between both lcov
# instances. (Default: based on $CODE_COVERAGE_BRANCH_COVERAGE)
# - CODE_COVERAGE_LCOV_SHOPTS: Extra options to shared between both lcov
# instances. (Default: $CODE_COVERAGE_LCOV_SHOPTS_DEFAULT)
# - CODE_COVERAGE_LCOV_OPTIONS_GCOVPATH: --gcov-tool pathtogcov
# - CODE_COVERAGE_LCOV_OPTIONS_DEFAULT: Extra options to pass to the
# collecting lcov instance. (Default: $CODE_COVERAGE_LCOV_OPTIONS_GCOVPATH)
# - CODE_COVERAGE_LCOV_OPTIONS: Extra options to pass to the collecting lcov
# instance. (Default: $CODE_COVERAGE_LCOV_OPTIONS_DEFAULT)
# - CODE_COVERAGE_LCOV_RMOPTS_DEFAULT: Extra options to pass to the filtering
# lcov instance. (Default: empty)
# - CODE_COVERAGE_LCOV_RMOPTS: Extra options to pass to the filtering lcov
# instance. (Default: $CODE_COVERAGE_LCOV_RMOPTS_DEFAULT)
# - CODE_COVERAGE_GENHTML_OPTIONS_DEFAULT: Extra options to pass to the
# genhtml instance. (Default: based on $CODE_COVERAGE_BRANCH_COVERAGE)
# - CODE_COVERAGE_GENHTML_OPTIONS: Extra options to pass to the genhtml
# instance. (Default: $CODE_COVERAGE_GENHTML_OPTIONS_DEFAULT)
# - CODE_COVERAGE_IGNORE_PATTERN: Extra glob pattern of files to ignore
#
# The generated report will be titled using the \$(PACKAGE_NAME) and
# \$(PACKAGE_VERSION). In order to add the current git hash to the title,
# use the git-version-gen script, available online.
# Optional variables
# run only on top dir
if CODE_COVERAGE_ENABLED
ifeq (\$(abs_builddir), \$(abs_top_builddir))
CODE_COVERAGE_DIRECTORY ?= \$(top_builddir)
CODE_COVERAGE_OUTPUT_FILE ?= \$(PACKAGE_NAME)-\$(PACKAGE_VERSION)-coverage.info
CODE_COVERAGE_OUTPUT_DIRECTORY ?= \$(PACKAGE_NAME)-\$(PACKAGE_VERSION)-coverage
CODE_COVERAGE_BRANCH_COVERAGE ?=
CODE_COVERAGE_LCOV_SHOPTS_DEFAULT ?= \$(if \$(CODE_COVERAGE_BRANCH_COVERAGE),\
--rc lcov_branch_coverage=\$(CODE_COVERAGE_BRANCH_COVERAGE))
CODE_COVERAGE_LCOV_SHOPTS ?= \$(CODE_COVERAGE_LCOV_SHOPTS_DEFAULT)
CODE_COVERAGE_LCOV_OPTIONS_GCOVPATH ?= --gcov-tool \"\$(GCOV)\"
CODE_COVERAGE_LCOV_OPTIONS_DEFAULT ?= \$(CODE_COVERAGE_LCOV_OPTIONS_GCOVPATH)
CODE_COVERAGE_LCOV_OPTIONS ?= \$(CODE_COVERAGE_LCOV_OPTIONS_DEFAULT)
CODE_COVERAGE_LCOV_RMOPTS_DEFAULT ?=
CODE_COVERAGE_LCOV_RMOPTS ?= \$(CODE_COVERAGE_LCOV_RMOPTS_DEFAULT)
CODE_COVERAGE_GENHTML_OPTIONS_DEFAULT ?=\
\$(if \$(CODE_COVERAGE_BRANCH_COVERAGE),\
--rc genhtml_branch_coverage=\$(CODE_COVERAGE_BRANCH_COVERAGE))
CODE_COVERAGE_GENHTML_OPTIONS ?= \$(CODE_COVERAGE_GENHTML_OPTIONS_DEFAULT)
CODE_COVERAGE_IGNORE_PATTERN ?=
GITIGNOREFILES := \$(GITIGNOREFILES) \$(CODE_COVERAGE_OUTPUT_FILE) \$(CODE_COVERAGE_OUTPUT_DIRECTORY)
code_coverage_v_lcov_cap = \$(code_coverage_v_lcov_cap_\$(V))
code_coverage_v_lcov_cap_ = \$(code_coverage_v_lcov_cap_\$(AM_DEFAULT_VERBOSITY))
code_coverage_v_lcov_cap_0 = @echo \" LCOV --capture\" \$(CODE_COVERAGE_OUTPUT_FILE);
code_coverage_v_lcov_ign = \$(code_coverage_v_lcov_ign_\$(V))
code_coverage_v_lcov_ign_ = \$(code_coverage_v_lcov_ign_\$(AM_DEFAULT_VERBOSITY))
code_coverage_v_lcov_ign_0 = @echo \" LCOV --remove /tmp/*\" \$(CODE_COVERAGE_IGNORE_PATTERN);
code_coverage_v_genhtml = \$(code_coverage_v_genhtml_\$(V))
code_coverage_v_genhtml_ = \$(code_coverage_v_genhtml_\$(AM_DEFAULT_VERBOSITY))
code_coverage_v_genhtml_0 = @echo \" GEN \" \"\$(CODE_COVERAGE_OUTPUT_DIRECTORY)\";
code_coverage_quiet = \$(code_coverage_quiet_\$(V))
code_coverage_quiet_ = \$(code_coverage_quiet_\$(AM_DEFAULT_VERBOSITY))
code_coverage_quiet_0 = --quiet
# sanitizes the test name: replaces dashes and dots with underscores
code_coverage_sanitize = \$(subst -,_,\$(subst .,_,\$(1)))
# Use recursive makes in order to ignore errors during check
check-code-coverage:
-\$(AM_V_at)\$(MAKE) \$(AM_MAKEFLAGS) -k check
\$(AM_V_at)\$(MAKE) \$(AM_MAKEFLAGS) code-coverage-capture
# Capture code coverage data
code-coverage-capture: code-coverage-capture-hook
\$(code_coverage_v_lcov_cap)\$(LCOV) \$(code_coverage_quiet) \$(addprefix --directory ,\$(CODE_COVERAGE_DIRECTORY)) --capture --output-file \"\$(CODE_COVERAGE_OUTPUT_FILE).tmp\" --test-name \"\$(call code_coverage_sanitize,\$(PACKAGE_NAME)-\$(PACKAGE_VERSION))\" --no-checksum --compat-libtool \$(CODE_COVERAGE_LCOV_SHOPTS) \$(CODE_COVERAGE_LCOV_OPTIONS)
\$(code_coverage_v_lcov_ign)\$(LCOV) \$(code_coverage_quiet) \$(addprefix --directory ,\$(CODE_COVERAGE_DIRECTORY)) --remove \"\$(CODE_COVERAGE_OUTPUT_FILE).tmp\" \"/tmp/*\" \$(CODE_COVERAGE_IGNORE_PATTERN) --output-file \"\$(CODE_COVERAGE_OUTPUT_FILE)\" \$(CODE_COVERAGE_LCOV_SHOPTS) \$(CODE_COVERAGE_LCOV_RMOPTS)
-@rm -f \"\$(CODE_COVERAGE_OUTPUT_FILE).tmp\"
\$(code_coverage_v_genhtml)LANG=C \$(GENHTML) \$(code_coverage_quiet) \$(addprefix --prefix ,\$(CODE_COVERAGE_DIRECTORY)) --output-directory \"\$(CODE_COVERAGE_OUTPUT_DIRECTORY)\" --title \"\$(PACKAGE_NAME)-\$(PACKAGE_VERSION) Code Coverage\" --legend --show-details \"\$(CODE_COVERAGE_OUTPUT_FILE)\" \$(CODE_COVERAGE_GENHTML_OPTIONS)
@echo \"file://\$(abs_builddir)/\$(CODE_COVERAGE_OUTPUT_DIRECTORY)/index.html\"
code-coverage-clean:
-\$(LCOV) --directory \$(top_builddir) -z
-rm -rf \"\$(CODE_COVERAGE_OUTPUT_FILE)\" \"\$(CODE_COVERAGE_OUTPUT_FILE).tmp\" \"\$(CODE_COVERAGE_OUTPUT_DIRECTORY)\"
-find . \\( -name \"*.gcda\" -o -name \"*.gcno\" -o -name \"*.gcov\" \\) -delete
code-coverage-dist-clean:
A][M_DISTCHECK_CONFIGURE_FLAGS := \$(A][M_DISTCHECK_CONFIGURE_FLAGS) --disable-code-coverage
else # ifneq (\$(abs_builddir), \$(abs_top_builddir))
check-code-coverage:
code-coverage-capture: code-coverage-capture-hook
code-coverage-clean:
code-coverage-dist-clean:
endif # ifeq (\$(abs_builddir), \$(abs_top_builddir))
else #! CODE_COVERAGE_ENABLED
# Use recursive makes in order to ignore errors during check
check-code-coverage:
@echo \"Need to reconfigure with --enable-code-coverage\"
# Capture code coverage data
code-coverage-capture: code-coverage-capture-hook
@echo \"Need to reconfigure with --enable-code-coverage\"
code-coverage-clean:
code-coverage-dist-clean:
endif #CODE_COVERAGE_ENABLED
# Hook rule executed before code-coverage-capture, overridable by the user
code-coverage-capture-hook:
.PHONY: check-code-coverage code-coverage-capture code-coverage-dist-clean code-coverage-clean code-coverage-capture-hook
])
])
AC_DEFUN([_AX_CODE_COVERAGE_ENABLED],[
AX_CHECK_GNU_MAKE([],[AC_MSG_ERROR([not using GNU make that is needed for coverage])])
AC_REQUIRE([AX_ADD_AM_MACRO_STATIC])
# check for gcov
AC_CHECK_TOOL([GCOV],
[$_AX_CODE_COVERAGE_GCOV_PROG_WITH],
[:])
AS_IF([test "X$GCOV" = "X:"],
[AC_MSG_ERROR([gcov is needed to do coverage])])
AC_SUBST([GCOV])
dnl Check if gcc is being used
AS_IF([ test "$GCC" = "no" ], [
AC_MSG_ERROR([not compiling with gcc, which is required for gcov code coverage])
])
AC_CHECK_PROG([LCOV], [lcov], [lcov])
AC_CHECK_PROG([GENHTML], [genhtml], [genhtml])
AS_IF([ test x"$LCOV" = x ], [
AC_MSG_ERROR([To enable code coverage reporting you must have lcov installed])
])
AS_IF([ test x"$GENHTML" = x ], [
AC_MSG_ERROR([Could not find genhtml from the lcov package])
])
dnl Build the code coverage flags
dnl Define CODE_COVERAGE_LDFLAGS for backwards compatibility
CODE_COVERAGE_CPPFLAGS="-DNDEBUG"
CODE_COVERAGE_CFLAGS="-O0 -g -fprofile-arcs -ftest-coverage"
CODE_COVERAGE_CXXFLAGS="-O0 -g -fprofile-arcs -ftest-coverage"
CODE_COVERAGE_LIBS="-lgcov"
AC_SUBST([CODE_COVERAGE_CPPFLAGS])
AC_SUBST([CODE_COVERAGE_CFLAGS])
AC_SUBST([CODE_COVERAGE_CXXFLAGS])
AC_SUBST([CODE_COVERAGE_LIBS])
])
AC_DEFUN([AX_CODE_COVERAGE],[
dnl Check for --enable-code-coverage
# allow to override gcov location
AC_ARG_WITH([gcov],
[AS_HELP_STRING([--with-gcov[=GCOV]], [use given GCOV for coverage (GCOV=gcov).])],
[_AX_CODE_COVERAGE_GCOV_PROG_WITH=$with_gcov],
[_AX_CODE_COVERAGE_GCOV_PROG_WITH=gcov])
AC_MSG_CHECKING([whether to build with code coverage support])
AC_ARG_ENABLE([code-coverage],
AS_HELP_STRING([--enable-code-coverage],
[Whether to enable code coverage support]),,
enable_code_coverage=no)
AM_CONDITIONAL([CODE_COVERAGE_ENABLED], [test "x$enable_code_coverage" = xyes])
AC_SUBST([CODE_COVERAGE_ENABLED], [$enable_code_coverage])
AC_MSG_RESULT($enable_code_coverage)
AS_IF([ test "x$enable_code_coverage" = xyes ], [
_AX_CODE_COVERAGE_ENABLED
])
_AX_CODE_COVERAGE_RULES
])
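dnl A hedged usage note (not part of the upstream macro text): after adding
dnl AX_CODE_COVERAGE to configure.ac, a typical workflow based on the rules
dnl above is
dnl
dnl   ./configure --enable-code-coverage
dnl   make check-code-coverage
dnl
dnl which runs the test suite and leaves an HTML report under
dnl $(CODE_COVERAGE_OUTPUT_DIRECTORY) (see the index.html echo above).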
dqlite-1.16.7/m4/ax_compare_version.m4 0000664 0000000 0000000 00000014653 14652527134 0017557 0 ustar 00root root 0000000 0000000 # ===========================================================================
# https://www.gnu.org/software/autoconf-archive/ax_compare_version.html
# ===========================================================================
#
# SYNOPSIS
#
# AX_COMPARE_VERSION(VERSION_A, OP, VERSION_B, [ACTION-IF-TRUE], [ACTION-IF-FALSE])
#
# DESCRIPTION
#
# This macro compares two version strings. Due to the various number of
# minor-version numbers that can exist, and the fact that string
# comparisons are not compatible with numeric comparisons, this is not
# necessarily trivial to do in an autoconf script. This macro makes doing
# these comparisons easy.
#
# The six basic comparisons are available, as well as checking equality
# limited to a certain number of minor-version levels.
#
# The operator OP determines what type of comparison to do, and can be one
# of:
#
# eq - equal (test A == B)
# ne - not equal (test A != B)
# le - less than or equal (test A <= B)
# ge - greater than or equal (test A >= B)
# lt - less than (test A < B)
# gt - greater than (test A > B)
#
# Additionally, the eq and ne operator can have a number after it to limit
# the test to that number of minor versions.
#
# eq0 - equal up to the length of the shorter version
# ne0 - not equal up to the length of the shorter version
# eqN - equal up to N sub-version levels
# neN - not equal up to N sub-version levels
#
# When the condition is true, shell commands ACTION-IF-TRUE are run,
# otherwise shell commands ACTION-IF-FALSE are run. The environment
# variable 'ax_compare_version' is always set to either 'true' or 'false'
# as well.
#
# Examples:
#
# AX_COMPARE_VERSION([3.15.7],[lt],[3.15.8])
# AX_COMPARE_VERSION([3.15],[lt],[3.15.8])
#
# would both be true.
#
# AX_COMPARE_VERSION([3.15.7],[eq],[3.15.8])
# AX_COMPARE_VERSION([3.15],[gt],[3.15.8])
#
# would both be false.
#
# AX_COMPARE_VERSION([3.15.7],[eq2],[3.15.8])
#
# would be true because it is only comparing two minor versions.
#
# AX_COMPARE_VERSION([3.15.7],[eq0],[3.15])
#
# would be true because it is only comparing the lesser number of minor
# versions of the two values.
#
# Note: The characters that separate the version numbers do not matter. An
# empty string is the same as version 0. OP is evaluated by autoconf, not
# configure, so must be a string, not a variable.
#
# The author would like to acknowledge Guido Draheim whose advice about
# the m4_case and m4_ifvaln functions makes this macro only include the
# portions necessary to perform the specific comparison specified by the
# OP argument in the final configure script.
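#
# A hedged usage sketch (not from the upstream text; $sqlite_version is an
# assumed shell variable for illustration only):
#
#   AX_COMPARE_VERSION([$sqlite_version],[ge],[3.22.0],
#     [AC_MSG_NOTICE([sqlite is recent enough])],
#     [AC_MSG_WARN([sqlite older than 3.22.0])])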
#
# LICENSE
#
# Copyright (c) 2008 Tim Toolan
#
# Copying and distribution of this file, with or without modification, are
# permitted in any medium without royalty provided the copyright notice
# and this notice are preserved. This file is offered as-is, without any
# warranty.
#serial 13
dnl #########################################################################
AC_DEFUN([AX_COMPARE_VERSION], [
AC_REQUIRE([AC_PROG_AWK])
# Used to indicate true or false condition
ax_compare_version=false
# Convert the two version strings to be compared into a format that
# allows a simple string comparison. The end result is that a version
# string of the form 1.12.5-r617 will be converted to the form
# 0001001200050617. In other words, each number is zero padded to four
# digits, and non digits are removed.
AS_VAR_PUSHDEF([A],[ax_compare_version_A])
A=`echo "$1" | sed -e 's/\([[0-9]]*\)/Z\1Z/g' \
-e 's/Z\([[0-9]]\)Z/Z0\1Z/g' \
-e 's/Z\([[0-9]][[0-9]]\)Z/Z0\1Z/g' \
-e 's/Z\([[0-9]][[0-9]][[0-9]]\)Z/Z0\1Z/g' \
-e 's/[[^0-9]]//g'`
AS_VAR_PUSHDEF([B],[ax_compare_version_B])
B=`echo "$3" | sed -e 's/\([[0-9]]*\)/Z\1Z/g' \
-e 's/Z\([[0-9]]\)Z/Z0\1Z/g' \
-e 's/Z\([[0-9]][[0-9]]\)Z/Z0\1Z/g' \
-e 's/Z\([[0-9]][[0-9]][[0-9]]\)Z/Z0\1Z/g' \
-e 's/[[^0-9]]//g'`
dnl # In the case of le, ge, lt, and gt, the strings are sorted as necessary
dnl # then the first line is used to determine if the condition is true.
dnl # The sed right after the echo is to remove any indented white space.
m4_case(m4_tolower($2),
[lt],[
ax_compare_version=`echo "x$A
x$B" | sed 's/^ *//' | sort -r | sed "s/x${A}/false/;s/x${B}/true/;1q"`
],
[gt],[
ax_compare_version=`echo "x$A
x$B" | sed 's/^ *//' | sort | sed "s/x${A}/false/;s/x${B}/true/;1q"`
],
[le],[
ax_compare_version=`echo "x$A
x$B" | sed 's/^ *//' | sort | sed "s/x${A}/true/;s/x${B}/false/;1q"`
],
[ge],[
ax_compare_version=`echo "x$A
x$B" | sed 's/^ *//' | sort -r | sed "s/x${A}/true/;s/x${B}/false/;1q"`
],[
dnl Split the operator from the subversion count if present.
m4_bmatch(m4_substr($2,2),
[0],[
# A count of zero means use the length of the shorter version.
# Determine the number of characters in A and B.
ax_compare_version_len_A=`echo "$A" | $AWK '{print(length)}'`
ax_compare_version_len_B=`echo "$B" | $AWK '{print(length)}'`
# Set A to no more than B's length and B to no more than A's length.
A=`echo "$A" | sed "s/\(.\{$ax_compare_version_len_B\}\).*/\1/"`
B=`echo "$B" | sed "s/\(.\{$ax_compare_version_len_A\}\).*/\1/"`
],
[[0-9]+],[
# A count greater than zero means use only that many subversions
A=`echo "$A" | sed "s/\(\([[0-9]]\{4\}\)\{m4_substr($2,2)\}\).*/\1/"`
B=`echo "$B" | sed "s/\(\([[0-9]]\{4\}\)\{m4_substr($2,2)\}\).*/\1/"`
],
[.+],[
AC_WARNING(
[invalid OP numeric parameter: $2])
],[])
# Pad zeros at end of numbers to make same length.
ax_compare_version_tmp_A="$A`echo $B | sed 's/./0/g'`"
B="$B`echo $A | sed 's/./0/g'`"
A="$ax_compare_version_tmp_A"
# Check for equality or inequality as necessary.
m4_case(m4_tolower(m4_substr($2,0,2)),
[eq],[
test "x$A" = "x$B" && ax_compare_version=true
],
[ne],[
test "x$A" != "x$B" && ax_compare_version=true
],[
AC_WARNING([invalid OP parameter: $2])
])
])
AS_VAR_POPDEF([A])dnl
AS_VAR_POPDEF([B])dnl
dnl # Execute ACTION-IF-TRUE / ACTION-IF-FALSE.
if test "$ax_compare_version" = "true" ; then
m4_ifvaln([$4],[$4],[:])dnl
m4_ifvaln([$5],[else $5])dnl
fi
]) dnl AX_COMPARE_VERSION
dqlite-1.16.7/m4/ax_file_escapes.m4 0000664 0000000 0000000 00000001373 14652527134 0017001 0 ustar 00root root 0000000 0000000 # ===========================================================================
# https://www.gnu.org/software/autoconf-archive/ax_file_escapes.html
# ===========================================================================
#
# SYNOPSIS
#
# AX_FILE_ESCAPES
#
# DESCRIPTION
#
# Defines shell variables (AX_DOLLAR, AX_SRB, AX_SLB, AX_BS, AX_DQ) holding
# escaped characters, for use when writing data out to files from a
# configure script.
#
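# A hedged illustration (not from the upstream text; the variable values
# can be read off the macro body below): after calling AX_FILE_ESCAPES, a
# configure script can emit shell syntax literally, e.g.
#
#   AX_FILE_ESCAPES
#   echo "HOME_REF=${AX_DOLLAR}HOME" > example.out  # writes: HOME_REF=$HOME
#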
# LICENSE
#
# Copyright (c) 2008 Tom Howard
#
# Copying and distribution of this file, with or without modification, are
# permitted in any medium without royalty provided the copyright notice
# and this notice are preserved. This file is offered as-is, without any
# warranty.
#serial 8
AC_DEFUN([AX_FILE_ESCAPES],[
AX_DOLLAR="\$"
AX_SRB="\\135"
AX_SLB="\\133"
AX_BS="\\\\"
AX_DQ="\""
])
dqlite-1.16.7/m4/ax_pthread.m4 0000664 0000000 0000000 00000054034 14652527134 0016010 0 ustar 00root root 0000000 0000000 # ===========================================================================
# https://www.gnu.org/software/autoconf-archive/ax_pthread.html
# ===========================================================================
#
# SYNOPSIS
#
# AX_PTHREAD([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]])
#
# DESCRIPTION
#
# This macro figures out how to build C programs using POSIX threads. It
# sets the PTHREAD_LIBS output variable to the threads library and linker
# flags, and the PTHREAD_CFLAGS output variable to any special C compiler
# flags that are needed. (The user can also force certain compiler
# flags/libs to be tested by setting these environment variables.)
#
# Also sets PTHREAD_CC and PTHREAD_CXX to any special C compiler that is
# needed for multi-threaded programs (defaults to the value of CC
# respectively CXX otherwise). (This is necessary on e.g. AIX to use the
# special cc_r/CC_r compiler alias.)
#
# NOTE: You are assumed to not only compile your program with these flags,
# but also to link with them as well. For example, you might link with
# $PTHREAD_CC $CFLAGS $PTHREAD_CFLAGS $LDFLAGS ... $PTHREAD_LIBS $LIBS
# $PTHREAD_CXX $CXXFLAGS $PTHREAD_CFLAGS $LDFLAGS ... $PTHREAD_LIBS $LIBS
#
# If you are only building threaded programs, you may wish to use these
# variables in your default LIBS, CFLAGS, and CC:
#
# LIBS="$PTHREAD_LIBS $LIBS"
# CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
# CXXFLAGS="$CXXFLAGS $PTHREAD_CFLAGS"
# CC="$PTHREAD_CC"
# CXX="$PTHREAD_CXX"
#
# In addition, if the PTHREAD_CREATE_JOINABLE thread-attribute constant
# has a nonstandard name, this macro defines PTHREAD_CREATE_JOINABLE to
# that name (e.g. PTHREAD_CREATE_UNDETACHED on AIX).
#
# Also HAVE_PTHREAD_PRIO_INHERIT is defined if pthread is found and the
# PTHREAD_PRIO_INHERIT symbol is defined when compiling with
# PTHREAD_CFLAGS.
#
# ACTION-IF-FOUND is a list of shell commands to run if a threads library
# is found, and ACTION-IF-NOT-FOUND is a list of commands to run if it
# is not found. If ACTION-IF-FOUND is not specified, the default action
# will define HAVE_PTHREAD.
#
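# A hedged usage sketch (illustrative only, not from the upstream text;
# it simply combines the variable assignments recommended above):
#
#   AX_PTHREAD([LIBS="$PTHREAD_LIBS $LIBS"
#               CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
#               CC="$PTHREAD_CC"],
#              [AC_MSG_ERROR([POSIX threads are required])])
#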
# Please let the authors know if this macro fails on any platform, or if
# you have any other suggestions or comments. This macro was based on work
# by SGJ on autoconf scripts for FFTW (http://www.fftw.org/) (with help
# from M. Frigo), as well as ac_pthread and hb_pthread macros posted by
# Alejandro Forero Cuervo to the autoconf macro repository. We are also
# grateful for the helpful feedback of numerous users.
#
# Updated for Autoconf 2.68 by Daniel Richard G.
#
# LICENSE
#
# Copyright (c) 2008 Steven G. Johnson
# Copyright (c) 2011 Daniel Richard G.
# Copyright (c) 2019 Marc Stevens
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
#
# As a special exception, the respective Autoconf Macro's copyright owner
# gives unlimited permission to copy, distribute and modify the configure
# scripts that are the output of Autoconf when processing the Macro. You
# need not follow the terms of the GNU General Public License when using
# or distributing such scripts, even though portions of the text of the
# Macro appear in them. The GNU General Public License (GPL) does govern
# all other use of the material that constitutes the Autoconf Macro.
#
# This special exception to the GPL applies to versions of the Autoconf
# Macro released by the Autoconf Archive. When you make and distribute a
# modified version of the Autoconf Macro, you may extend this special
# exception to the GPL to apply to your modified version as well.
#serial 31
AU_ALIAS([ACX_PTHREAD], [AX_PTHREAD])
AC_DEFUN([AX_PTHREAD], [
AC_REQUIRE([AC_CANONICAL_HOST])
AC_REQUIRE([AC_PROG_CC])
AC_REQUIRE([AC_PROG_SED])
AC_LANG_PUSH([C])
ax_pthread_ok=no
# We used to check for pthread.h first, but this fails if pthread.h
# requires special compiler flags (e.g. on Tru64 or Sequent).
# It gets checked for in the link test anyway.
# First of all, check if the user has set any of the PTHREAD_LIBS,
# etcetera environment variables, and if threads linking works using
# them:
if test "x$PTHREAD_CFLAGS$PTHREAD_LIBS" != "x"; then
ax_pthread_save_CC="$CC"
ax_pthread_save_CFLAGS="$CFLAGS"
ax_pthread_save_LIBS="$LIBS"
AS_IF([test "x$PTHREAD_CC" != "x"], [CC="$PTHREAD_CC"])
AS_IF([test "x$PTHREAD_CXX" != "x"], [CXX="$PTHREAD_CXX"])
CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
LIBS="$PTHREAD_LIBS $LIBS"
AC_MSG_CHECKING([for pthread_join using $CC $PTHREAD_CFLAGS $PTHREAD_LIBS])
AC_LINK_IFELSE([AC_LANG_CALL([], [pthread_join])], [ax_pthread_ok=yes])
AC_MSG_RESULT([$ax_pthread_ok])
if test "x$ax_pthread_ok" = "xno"; then
PTHREAD_LIBS=""
PTHREAD_CFLAGS=""
fi
CC="$ax_pthread_save_CC"
CFLAGS="$ax_pthread_save_CFLAGS"
LIBS="$ax_pthread_save_LIBS"
fi
# We must check for the threads library under a number of different
# names; the ordering is very important because some systems
# (e.g. DEC) have both -lpthread and -lpthreads, where one of the
# libraries is broken (non-POSIX).
# Create a list of thread flags to try. Items with a "," contain both
# C compiler flags (before ",") and linker flags (after ","). Other items
# starting with a "-" are C compiler flags, and remaining items are
# library names, except for "none" which indicates that we try without
# any flags at all, and "pthread-config" which is a program returning
# the flags for the Pth emulation library.
ax_pthread_flags="pthreads none -Kthread -pthread -pthreads -mthreads pthread --thread-safe -mt pthread-config"
# The ordering *is* (sometimes) important. Some notes on the
# individual items follow:
# pthreads: AIX (must check this before -lpthread)
# none: in case threads are in libc; should be tried before -Kthread and
# other compiler flags to prevent continual compiler warnings
# -Kthread: Sequent (threads in libc, but -Kthread needed for pthread.h)
# -pthread: Linux/gcc (kernel threads), BSD/gcc (userland threads), Tru64
# (Note: HP C rejects this with "bad form for `-t' option")
# -pthreads: Solaris/gcc (Note: HP C also rejects)
# -mt: Sun Workshop C (may only link SunOS threads [-lthread], but it
# doesn't hurt to check since this sometimes defines pthreads and
# -D_REENTRANT too), HP C (must be checked before -lpthread, which
# is present but should not be used directly; and before -mthreads,
# because the compiler interprets this as "-mt" + "-hreads")
# -mthreads: Mingw32/gcc, Lynx/gcc
# pthread: Linux, etcetera
# --thread-safe: KAI C++
# pthread-config: use pthread-config program (for GNU Pth library)
case $host_os in
freebsd*)
# -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able)
# lthread: LinuxThreads port on FreeBSD (also preferred to -pthread)
ax_pthread_flags="-kthread lthread $ax_pthread_flags"
;;
hpux*)
# From the cc(1) man page: "[-mt] Sets various -D flags to enable
# multi-threading and also sets -lpthread."
ax_pthread_flags="-mt -pthread pthread $ax_pthread_flags"
;;
openedition*)
# IBM z/OS requires a feature-test macro to be defined in order to
# enable POSIX threads at all, so give the user a hint if this is
# not set. (We don't define these ourselves, as they can affect
# other portions of the system API in unpredictable ways.)
AC_EGREP_CPP([AX_PTHREAD_ZOS_MISSING],
[
# if !defined(_OPEN_THREADS) && !defined(_UNIX03_THREADS)
AX_PTHREAD_ZOS_MISSING
# endif
],
[AC_MSG_WARN([IBM z/OS requires -D_OPEN_THREADS or -D_UNIX03_THREADS to enable pthreads support.])])
;;
solaris*)
# On Solaris (at least, for some versions), libc contains stubbed
# (non-functional) versions of the pthreads routines, so link-based
# tests will erroneously succeed. (N.B.: The stubs are missing
# pthread_cleanup_push, or rather a function called by this macro,
# so we could check for that, but who knows whether they'll stub
# that too in a future libc.) So we'll check first for the
# standard Solaris way of linking pthreads (-mt -lpthread).
ax_pthread_flags="-mt,-lpthread pthread $ax_pthread_flags"
;;
esac
# Are we compiling with Clang?
AC_CACHE_CHECK([whether $CC is Clang],
[ax_cv_PTHREAD_CLANG],
[ax_cv_PTHREAD_CLANG=no
# Note that Autoconf sets GCC=yes for Clang as well as GCC
if test "x$GCC" = "xyes"; then
AC_EGREP_CPP([AX_PTHREAD_CC_IS_CLANG],
[/* Note: Clang 2.7 lacks __clang_[a-z]+__ */
# if defined(__clang__) && defined(__llvm__)
AX_PTHREAD_CC_IS_CLANG
# endif
],
[ax_cv_PTHREAD_CLANG=yes])
fi
])
ax_pthread_clang="$ax_cv_PTHREAD_CLANG"
# GCC generally uses -pthread, or -pthreads on some platforms (e.g. SPARC)
# Note that for GCC and Clang -pthread generally implies -lpthread,
# except when -nostdlib is passed.
# This is problematic using libtool to build C++ shared libraries with pthread:
# [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=25460
# [2] https://bugzilla.redhat.com/show_bug.cgi?id=661333
# [3] https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=468555
# To solve this, first try -pthread together with -lpthread for GCC
AS_IF([test "x$GCC" = "xyes"],
[ax_pthread_flags="-pthread,-lpthread -pthread -pthreads $ax_pthread_flags"])
# Clang takes -pthread (never supported any other flag), but we'll try with -lpthread first
AS_IF([test "x$ax_pthread_clang" = "xyes"],
[ax_pthread_flags="-pthread,-lpthread -pthread"])
# The presence of a feature test macro requesting re-entrant function
# definitions is, on some systems, a strong hint that pthreads support is
# correctly enabled
case $host_os in
darwin* | hpux* | linux* | osf* | solaris*)
ax_pthread_check_macro="_REENTRANT"
;;
aix*)
ax_pthread_check_macro="_THREAD_SAFE"
;;
*)
ax_pthread_check_macro="--"
;;
esac
AS_IF([test "x$ax_pthread_check_macro" = "x--"],
[ax_pthread_check_cond=0],
[ax_pthread_check_cond="!defined($ax_pthread_check_macro)"])
if test "x$ax_pthread_ok" = "xno"; then
for ax_pthread_try_flag in $ax_pthread_flags; do
case $ax_pthread_try_flag in
none)
AC_MSG_CHECKING([whether pthreads work without any flags])
;;
*,*)
PTHREAD_CFLAGS=`echo $ax_pthread_try_flag | sed "s/^\(.*\),\(.*\)$/\1/"`
PTHREAD_LIBS=`echo $ax_pthread_try_flag | sed "s/^\(.*\),\(.*\)$/\2/"`
AC_MSG_CHECKING([whether pthreads work with "$PTHREAD_CFLAGS" and "$PTHREAD_LIBS"])
;;
-*)
AC_MSG_CHECKING([whether pthreads work with $ax_pthread_try_flag])
PTHREAD_CFLAGS="$ax_pthread_try_flag"
;;
pthread-config)
AC_CHECK_PROG([ax_pthread_config], [pthread-config], [yes], [no])
AS_IF([test "x$ax_pthread_config" = "xno"], [continue])
PTHREAD_CFLAGS="`pthread-config --cflags`"
PTHREAD_LIBS="`pthread-config --ldflags` `pthread-config --libs`"
;;
*)
AC_MSG_CHECKING([for the pthreads library -l$ax_pthread_try_flag])
PTHREAD_LIBS="-l$ax_pthread_try_flag"
;;
esac
ax_pthread_save_CFLAGS="$CFLAGS"
ax_pthread_save_LIBS="$LIBS"
CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
LIBS="$PTHREAD_LIBS $LIBS"
# Check for various functions. We must include pthread.h,
# since some functions may be macros. (On the Sequent, we
# need a special flag -Kthread to make this header compile.)
# We check for pthread_join because it is in -lpthread on IRIX
# while pthread_create is in libc. We check for pthread_attr_init
# due to DEC craziness with -lpthreads. We check for
# pthread_cleanup_push because it is one of the few pthread
# functions on Solaris that doesn't have a non-functional libc stub.
# We try pthread_create on general principles.
AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <pthread.h>
# if $ax_pthread_check_cond
# error "$ax_pthread_check_macro must be defined"
# endif
static void *some_global = NULL;
static void routine(void *a)
{
/* To avoid any unused-parameter or
unused-but-set-parameter warning. */
some_global = a;
}
static void *start_routine(void *a) { return a; }],
[pthread_t th; pthread_attr_t attr;
pthread_create(&th, 0, start_routine, 0);
pthread_join(th, 0);
pthread_attr_init(&attr);
pthread_cleanup_push(routine, 0);
pthread_cleanup_pop(0) /* ; */])],
[ax_pthread_ok=yes],
[])
CFLAGS="$ax_pthread_save_CFLAGS"
LIBS="$ax_pthread_save_LIBS"
AC_MSG_RESULT([$ax_pthread_ok])
AS_IF([test "x$ax_pthread_ok" = "xyes"], [break])
PTHREAD_LIBS=""
PTHREAD_CFLAGS=""
done
fi
# Clang needs special handling, because older versions handle the -pthread
# option in a rather... idiosyncratic way
if test "x$ax_pthread_clang" = "xyes"; then
# Clang takes -pthread; it has never supported any other flag
# (Note 1: This will need to be revisited if a system that Clang
# supports has POSIX threads in a separate library. This tends not
# to be the way of modern systems, but it's conceivable.)
# (Note 2: On some systems, notably Darwin, -pthread is not needed
# to get POSIX threads support; the API is always present and
# active. We could reasonably leave PTHREAD_CFLAGS empty. But
# -pthread does define _REENTRANT, and while the Darwin headers
# ignore this macro, third-party headers might not.)
# However, older versions of Clang make a point of warning the user
# that, in an invocation where only linking and no compilation is
# taking place, the -pthread option has no effect ("argument unused
# during compilation"). They expect -pthread to be passed in only
# when source code is being compiled.
#
# Problem is, this is at odds with the way Automake and most other
# C build frameworks function, which is that the same flags used in
# compilation (CFLAGS) are also used in linking. Many systems
# supported by AX_PTHREAD require exactly this for POSIX threads
# support, and in fact it is often not straightforward to specify a
# flag that is used only in the compilation phase and not in
# linking. Such a scenario is extremely rare in practice.
#
# Even though use of the -pthread flag in linking would only print
# a warning, this can be a nuisance for well-run software projects
# that build with -Werror. So if the active version of Clang has
# this misfeature, we search for an option to squash it.
AC_CACHE_CHECK([whether Clang needs flag to prevent "argument unused" warning when linking with -pthread],
[ax_cv_PTHREAD_CLANG_NO_WARN_FLAG],
[ax_cv_PTHREAD_CLANG_NO_WARN_FLAG=unknown
# Create an alternate version of $ac_link that compiles and
# links in two steps (.c -> .o, .o -> exe) instead of one
# (.c -> exe), because the warning occurs only in the second
# step
ax_pthread_save_ac_link="$ac_link"
ax_pthread_sed='s/conftest\.\$ac_ext/conftest.$ac_objext/g'
ax_pthread_link_step=`AS_ECHO(["$ac_link"]) | sed "$ax_pthread_sed"`
ax_pthread_2step_ac_link="($ac_compile) && (echo ==== >&5) && ($ax_pthread_link_step)"
ax_pthread_save_CFLAGS="$CFLAGS"
for ax_pthread_try in '' -Qunused-arguments -Wno-unused-command-line-argument unknown; do
AS_IF([test "x$ax_pthread_try" = "xunknown"], [break])
CFLAGS="-Werror -Wunknown-warning-option $ax_pthread_try -pthread $ax_pthread_save_CFLAGS"
ac_link="$ax_pthread_save_ac_link"
AC_LINK_IFELSE([AC_LANG_SOURCE([[int main(void){return 0;}]])],
[ac_link="$ax_pthread_2step_ac_link"
AC_LINK_IFELSE([AC_LANG_SOURCE([[int main(void){return 0;}]])],
[break])
])
done
ac_link="$ax_pthread_save_ac_link"
CFLAGS="$ax_pthread_save_CFLAGS"
AS_IF([test "x$ax_pthread_try" = "x"], [ax_pthread_try=no])
ax_cv_PTHREAD_CLANG_NO_WARN_FLAG="$ax_pthread_try"
])
case "$ax_cv_PTHREAD_CLANG_NO_WARN_FLAG" in
no | unknown) ;;
*) PTHREAD_CFLAGS="$ax_cv_PTHREAD_CLANG_NO_WARN_FLAG $PTHREAD_CFLAGS" ;;
esac
fi # $ax_pthread_clang = yes
# Various other checks:
if test "x$ax_pthread_ok" = "xyes"; then
ax_pthread_save_CFLAGS="$CFLAGS"
ax_pthread_save_LIBS="$LIBS"
CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
LIBS="$PTHREAD_LIBS $LIBS"
# Detect AIX lossage: JOINABLE attribute is called UNDETACHED.
AC_CACHE_CHECK([for joinable pthread attribute],
[ax_cv_PTHREAD_JOINABLE_ATTR],
[ax_cv_PTHREAD_JOINABLE_ATTR=unknown
for ax_pthread_attr in PTHREAD_CREATE_JOINABLE PTHREAD_CREATE_UNDETACHED; do
AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <pthread.h>],
[int attr = $ax_pthread_attr; return attr /* ; */])],
[ax_cv_PTHREAD_JOINABLE_ATTR=$ax_pthread_attr; break],
[])
done
])
AS_IF([test "x$ax_cv_PTHREAD_JOINABLE_ATTR" != "xunknown" && \
test "x$ax_cv_PTHREAD_JOINABLE_ATTR" != "xPTHREAD_CREATE_JOINABLE" && \
test "x$ax_pthread_joinable_attr_defined" != "xyes"],
[AC_DEFINE_UNQUOTED([PTHREAD_CREATE_JOINABLE],
[$ax_cv_PTHREAD_JOINABLE_ATTR],
[Define to necessary symbol if this constant
uses a non-standard name on your system.])
ax_pthread_joinable_attr_defined=yes
])
AC_CACHE_CHECK([whether more special flags are required for pthreads],
[ax_cv_PTHREAD_SPECIAL_FLAGS],
[ax_cv_PTHREAD_SPECIAL_FLAGS=no
case $host_os in
solaris*)
ax_cv_PTHREAD_SPECIAL_FLAGS="-D_POSIX_PTHREAD_SEMANTICS"
;;
esac
])
AS_IF([test "x$ax_cv_PTHREAD_SPECIAL_FLAGS" != "xno" && \
test "x$ax_pthread_special_flags_added" != "xyes"],
[PTHREAD_CFLAGS="$ax_cv_PTHREAD_SPECIAL_FLAGS $PTHREAD_CFLAGS"
ax_pthread_special_flags_added=yes])
AC_CACHE_CHECK([for PTHREAD_PRIO_INHERIT],
[ax_cv_PTHREAD_PRIO_INHERIT],
[AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <pthread.h>]],
[[int i = PTHREAD_PRIO_INHERIT;
return i;]])],
[ax_cv_PTHREAD_PRIO_INHERIT=yes],
[ax_cv_PTHREAD_PRIO_INHERIT=no])
])
AS_IF([test "x$ax_cv_PTHREAD_PRIO_INHERIT" = "xyes" && \
test "x$ax_pthread_prio_inherit_defined" != "xyes"],
[AC_DEFINE([HAVE_PTHREAD_PRIO_INHERIT], [1], [Have PTHREAD_PRIO_INHERIT.])
ax_pthread_prio_inherit_defined=yes
])
CFLAGS="$ax_pthread_save_CFLAGS"
LIBS="$ax_pthread_save_LIBS"
# More AIX lossage: compile with *_r variant
if test "x$GCC" != "xyes"; then
case $host_os in
aix*)
AS_CASE(["x/$CC"],
[x*/c89|x*/c89_128|x*/c99|x*/c99_128|x*/cc|x*/cc128|x*/xlc|x*/xlc_v6|x*/xlc128|x*/xlc128_v6],
[#handle absolute path differently from PATH based program lookup
AS_CASE(["x$CC"],
[x/*],
[
AS_IF([AS_EXECUTABLE_P([${CC}_r])],[PTHREAD_CC="${CC}_r"])
AS_IF([test "x${CXX}" != "x"], [AS_IF([AS_EXECUTABLE_P([${CXX}_r])],[PTHREAD_CXX="${CXX}_r"])])
],
[
AC_CHECK_PROGS([PTHREAD_CC],[${CC}_r],[$CC])
AS_IF([test "x${CXX}" != "x"], [AC_CHECK_PROGS([PTHREAD_CXX],[${CXX}_r],[$CXX])])
]
)
])
;;
esac
fi
fi
test -n "$PTHREAD_CC" || PTHREAD_CC="$CC"
test -n "$PTHREAD_CXX" || PTHREAD_CXX="$CXX"
AC_SUBST([PTHREAD_LIBS])
AC_SUBST([PTHREAD_CFLAGS])
AC_SUBST([PTHREAD_CC])
AC_SUBST([PTHREAD_CXX])
# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND:
if test "x$ax_pthread_ok" = "xyes"; then
ifelse([$1],,[AC_DEFINE([HAVE_PTHREAD],[1],[Define if you have POSIX threads libraries and header files.])],[$1])
:
else
ax_pthread_ok=no
$2
fi
AC_LANG_POP
])dnl AX_PTHREAD
dqlite-1.16.7/m4/pkg.m4 0000664 0000000 0000000 00000024011 14652527134 0014442 0 ustar 00root root 0000000 0000000 dnl pkg.m4 - Macros to locate and utilise pkg-config. -*- Autoconf -*-
dnl serial 11 (pkg-config-0.29.1)
dnl
dnl Copyright © 2004 Scott James Remnant .
dnl Copyright © 2012-2015 Dan Nicholson
dnl
dnl This program is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU General Public License as published by
dnl the Free Software Foundation; either version 2 of the License, or
dnl (at your option) any later version.
dnl
dnl This program is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
dnl General Public License for more details.
dnl
dnl You should have received a copy of the GNU General Public License
dnl along with this program; if not, write to the Free Software
dnl Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
dnl 02111-1307, USA.
dnl
dnl As a special exception to the GNU General Public License, if you
dnl distribute this file as part of a program that contains a
dnl configuration script generated by Autoconf, you may include it under
dnl the same distribution terms that you use for the rest of that
dnl program.
dnl PKG_PREREQ(MIN-VERSION)
dnl -----------------------
dnl Since: 0.29
dnl
dnl Verify that the version of the pkg-config macros are at least
dnl MIN-VERSION. Unlike PKG_PROG_PKG_CONFIG, which checks the user's
dnl installed version of pkg-config, this checks the developer's version
dnl of pkg.m4 when generating configure.
dnl
dnl To ensure that this macro is defined, also add:
dnl m4_ifndef([PKG_PREREQ],
dnl [m4_fatal([must install pkg-config 0.29 or later before running autoconf/autogen])])
dnl
dnl See the "Since" comment for each macro you use to see what version
dnl of the macros you require.
m4_defun([PKG_PREREQ],
[m4_define([PKG_MACROS_VERSION], [0.29.1])
m4_if(m4_version_compare(PKG_MACROS_VERSION, [$1]), -1,
[m4_fatal([pkg.m4 version $1 or higher is required but ]PKG_MACROS_VERSION[ found])])
])dnl PKG_PREREQ
dnl PKG_PROG_PKG_CONFIG([MIN-VERSION])
dnl ----------------------------------
dnl Since: 0.16
dnl
dnl Search for the pkg-config tool and set the PKG_CONFIG variable to
dnl first found in the path. Checks that the version of pkg-config found
dnl is at least MIN-VERSION. If MIN-VERSION is not specified, 0.9.0 is
dnl used since that's the first version where most current features of
dnl pkg-config existed.
AC_DEFUN([PKG_PROG_PKG_CONFIG],
[m4_pattern_forbid([^_?PKG_[A-Z_]+$])
m4_pattern_allow([^PKG_CONFIG(_(PATH|LIBDIR|SYSROOT_DIR|ALLOW_SYSTEM_(CFLAGS|LIBS)))?$])
m4_pattern_allow([^PKG_CONFIG_(DISABLE_UNINSTALLED|TOP_BUILD_DIR|DEBUG_SPEW)$])
AC_ARG_VAR([PKG_CONFIG], [path to pkg-config utility])
AC_ARG_VAR([PKG_CONFIG_PATH], [directories to add to pkg-config's search path])
AC_ARG_VAR([PKG_CONFIG_LIBDIR], [path overriding pkg-config's built-in search path])
if test "x$ac_cv_env_PKG_CONFIG_set" != "xset"; then
AC_PATH_TOOL([PKG_CONFIG], [pkg-config])
fi
if test -n "$PKG_CONFIG"; then
_pkg_min_version=m4_default([$1], [0.9.0])
AC_MSG_CHECKING([pkg-config is at least version $_pkg_min_version])
if $PKG_CONFIG --atleast-pkgconfig-version $_pkg_min_version; then
AC_MSG_RESULT([yes])
else
AC_MSG_RESULT([no])
PKG_CONFIG=""
fi
fi[]dnl
])dnl PKG_PROG_PKG_CONFIG
dnl PKG_CHECK_EXISTS(MODULES, [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
dnl -------------------------------------------------------------------
dnl Since: 0.18
dnl
dnl Check to see whether a particular set of modules exists. Similar to
dnl PKG_CHECK_MODULES(), but does not set variables or print errors.
dnl
dnl Please remember that m4 expands AC_REQUIRE([PKG_PROG_PKG_CONFIG])
dnl only at the first occurrence in configure.ac, so if the first place
dnl it's called might be skipped (such as if it is within an "if"), you
dnl have to call PKG_CHECK_EXISTS manually
AC_DEFUN([PKG_CHECK_EXISTS],
[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
if test -n "$PKG_CONFIG" && \
AC_RUN_LOG([$PKG_CONFIG --exists --print-errors "$1"]); then
m4_default([$2], [:])
m4_ifvaln([$3], [else
$3])dnl
fi])
dnl _PKG_CONFIG([VARIABLE], [COMMAND], [MODULES])
dnl ---------------------------------------------
dnl Internal wrapper calling pkg-config via PKG_CONFIG and setting
dnl pkg_failed based on the result.
m4_define([_PKG_CONFIG],
[if test -n "$$1"; then
pkg_cv_[]$1="$$1"
elif test -n "$PKG_CONFIG"; then
PKG_CHECK_EXISTS([$3],
[pkg_cv_[]$1=`$PKG_CONFIG --[]$2 "$3" 2>/dev/null`
test "x$?" != "x0" && pkg_failed=yes ],
[pkg_failed=yes])
else
pkg_failed=untried
fi[]dnl
])dnl _PKG_CONFIG
dnl _PKG_SHORT_ERRORS_SUPPORTED
dnl ---------------------------
dnl Internal check to see if pkg-config supports short errors.
AC_DEFUN([_PKG_SHORT_ERRORS_SUPPORTED],
[AC_REQUIRE([PKG_PROG_PKG_CONFIG])
if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then
_pkg_short_errors_supported=yes
else
_pkg_short_errors_supported=no
fi[]dnl
])dnl _PKG_SHORT_ERRORS_SUPPORTED
dnl PKG_CHECK_MODULES(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND],
dnl [ACTION-IF-NOT-FOUND])
dnl --------------------------------------------------------------
dnl Since: 0.4.0
dnl
dnl Note that if there is a possibility the first call to
dnl PKG_CHECK_MODULES might not happen, you should be sure to include an
dnl explicit call to PKG_PROG_PKG_CONFIG in your configure.ac
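dnl
dnl A hedged usage sketch (illustrative; the module name and version bound
dnl are assumptions, not taken from this file):
dnl
dnl   PKG_CHECK_MODULES([UV], [libuv >= 1.8.0])
dnl
dnl after which UV_CFLAGS and UV_LIBS are substituted for use in Makefile.am.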
AC_DEFUN([PKG_CHECK_MODULES],
[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
AC_ARG_VAR([$1][_CFLAGS], [C compiler flags for $1, overriding pkg-config])dnl
AC_ARG_VAR([$1][_LIBS], [linker flags for $1, overriding pkg-config])dnl
pkg_failed=no
AC_MSG_CHECKING([for $1])
_PKG_CONFIG([$1][_CFLAGS], [cflags], [$2])
_PKG_CONFIG([$1][_LIBS], [libs], [$2])
m4_define([_PKG_TEXT], [Alternatively, you may set the environment variables $1[]_CFLAGS
and $1[]_LIBS to avoid the need to call pkg-config.
See the pkg-config man page for more details.])
if test $pkg_failed = yes; then
AC_MSG_RESULT([no])
_PKG_SHORT_ERRORS_SUPPORTED
if test $_pkg_short_errors_supported = yes; then
$1[]_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "$2" 2>&1`
else
$1[]_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "$2" 2>&1`
fi
# Put the nasty error message in config.log where it belongs
echo "$$1[]_PKG_ERRORS" >&AS_MESSAGE_LOG_FD
m4_default([$4], [AC_MSG_ERROR(
[Package requirements ($2) were not met:
$$1_PKG_ERRORS
Consider adjusting the PKG_CONFIG_PATH environment variable if you
installed software in a non-standard prefix.
_PKG_TEXT])[]dnl
])
elif test $pkg_failed = untried; then
AC_MSG_RESULT([no])
m4_default([$4], [AC_MSG_FAILURE(
[The pkg-config script could not be found or is too old. Make sure it
is in your PATH or set the PKG_CONFIG environment variable to the full
path to pkg-config.
_PKG_TEXT
To get pkg-config, see .])[]dnl
])
else
$1[]_CFLAGS=$pkg_cv_[]$1[]_CFLAGS
$1[]_LIBS=$pkg_cv_[]$1[]_LIBS
AC_MSG_RESULT([yes])
$3
fi[]dnl
])dnl PKG_CHECK_MODULES
dnl PKG_CHECK_MODULES_STATIC(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND],
dnl [ACTION-IF-NOT-FOUND])
dnl ---------------------------------------------------------------------
dnl Since: 0.29
dnl
dnl Checks for existence of MODULES and gathers its build flags with
dnl static libraries enabled. Sets VARIABLE-PREFIX_CFLAGS from --cflags
dnl and VARIABLE-PREFIX_LIBS from --libs.
dnl
dnl Note that if there is a possibility the first call to
dnl PKG_CHECK_MODULES_STATIC might not happen, you should be sure to
dnl include an explicit call to PKG_PROG_PKG_CONFIG in your
dnl configure.ac.
AC_DEFUN([PKG_CHECK_MODULES_STATIC],
[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
_save_PKG_CONFIG=$PKG_CONFIG
PKG_CONFIG="$PKG_CONFIG --static"
PKG_CHECK_MODULES($@)
PKG_CONFIG=$_save_PKG_CONFIG[]dnl
])dnl PKG_CHECK_MODULES_STATIC
dnl PKG_INSTALLDIR([DIRECTORY])
dnl -------------------------
dnl Since: 0.27
dnl
dnl Substitutes the variable pkgconfigdir as the location where a module
dnl should install pkg-config .pc files. By default the directory is
dnl $libdir/pkgconfig, but the default can be changed by passing
dnl DIRECTORY. The user can override through the --with-pkgconfigdir
dnl parameter.
AC_DEFUN([PKG_INSTALLDIR],
[m4_pushdef([pkg_default], [m4_default([$1], ['${libdir}/pkgconfig'])])
m4_pushdef([pkg_description],
[pkg-config installation directory @<:@]pkg_default[@:>@])
AC_ARG_WITH([pkgconfigdir],
[AS_HELP_STRING([--with-pkgconfigdir], pkg_description)],,
[with_pkgconfigdir=]pkg_default)
AC_SUBST([pkgconfigdir], [$with_pkgconfigdir])
m4_popdef([pkg_default])
m4_popdef([pkg_description])
])dnl PKG_INSTALLDIR
dnl PKG_NOARCH_INSTALLDIR([DIRECTORY])
dnl --------------------------------
dnl Since: 0.27
dnl
dnl Substitutes the variable noarch_pkgconfigdir as the location where a
dnl module should install arch-independent pkg-config .pc files. By
dnl default the directory is $datadir/pkgconfig, but the default can be
dnl changed by passing DIRECTORY. The user can override through the
dnl --with-noarch-pkgconfigdir parameter.
AC_DEFUN([PKG_NOARCH_INSTALLDIR],
[m4_pushdef([pkg_default], [m4_default([$1], ['${datadir}/pkgconfig'])])
m4_pushdef([pkg_description],
[pkg-config arch-independent installation directory @<:@]pkg_default[@:>@])
AC_ARG_WITH([noarch-pkgconfigdir],
[AS_HELP_STRING([--with-noarch-pkgconfigdir], pkg_description)],,
[with_noarch_pkgconfigdir=]pkg_default)
AC_SUBST([noarch_pkgconfigdir], [$with_noarch_pkgconfigdir])
m4_popdef([pkg_default])
m4_popdef([pkg_description])
])dnl PKG_NOARCH_INSTALLDIR
dnl PKG_CHECK_VAR(VARIABLE, MODULE, CONFIG-VARIABLE,
dnl [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
dnl -------------------------------------------
dnl Since: 0.28
dnl
dnl Retrieves the value of the pkg-config variable for the given module.
AC_DEFUN([PKG_CHECK_VAR],
[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
AC_ARG_VAR([$1], [value of $3 for $2, overriding pkg-config])dnl
_PKG_CONFIG([$1], [variable="][$3]["], [$2])
AS_VAR_COPY([$1], [pkg_cv_][$1])
AS_VAR_IF([$1], [""], [$5], [$4])dnl
])dnl PKG_CHECK_VAR
dqlite-1.16.7/resources/ 0000775 0000000 0000000 00000000000 14652527134 0015113 5 ustar 00root root 0000000 0000000 dqlite-1.16.7/resources/stdbool.h 0000664 0000000 0000000 00000002046 14652527134 0016734 0 ustar 00root root 0000000 0000000 /*===---- stdbool.h - Standard header for booleans -------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __STDBOOL_H
#define __STDBOOL_H
#define __bool_true_false_are_defined 1
#if defined(__STDC_VERSION__) && __STDC_VERSION__ > 201710L
/* FIXME: We should be issuing a deprecation warning here, but cannot yet due
* to system headers which include this header file unconditionally.
*/
#elif !defined(__cplusplus)
#define bool _Bool
#define true 1
#define false 0
#elif defined(__GNUC__) && !defined(__STRICT_ANSI__)
/* Define _Bool as a GNU extension. */
#define _Bool bool
#if defined(__cplusplus) && __cplusplus < 201103L
/* For C++98, define bool, false, true as a GNU extension. */
#define bool bool
#define false false
#define true true
#endif
#endif
#endif /* __STDBOOL_H */
dqlite-1.16.7/src/ 0000775 0000000 0000000 00000000000 14652527134 0013670 5 ustar 00root root 0000000 0000000 dqlite-1.16.7/src/bind.c 0000664 0000000 0000000 00000004052 14652527134 0014751 0 ustar 00root root 0000000 0000000 #include "bind.h"
#include "tuple.h"
/* Bind a single parameter. */
static int bind_one(sqlite3_stmt *stmt, int n, struct value *value)
{
int rc;
/* TODO: the binding calls below currently use SQLITE_TRANSIENT when
* passing pointers to data (for TEXT or BLOB datatypes). This way
* SQLite makes its private copy of the data before the bind call
* returns, and we can reuse the message body buffer. The overhead of
* the copy is typically low, but if it becomes a concern, this could be
* optimized to make no copy and instead prevent the message body from
* being reused. */
switch (value->type) {
case SQLITE_INTEGER:
rc = sqlite3_bind_int64(stmt, n, value->integer);
break;
case SQLITE_FLOAT:
rc = sqlite3_bind_double(stmt, n, value->float_);
break;
case SQLITE_BLOB:
rc = sqlite3_bind_blob(stmt, n, value->blob.base,
(int)value->blob.len,
SQLITE_TRANSIENT);
break;
case SQLITE_NULL:
rc = sqlite3_bind_null(stmt, n);
break;
case SQLITE_TEXT:
rc = sqlite3_bind_text(stmt, n, value->text, -1,
SQLITE_TRANSIENT);
break;
case DQLITE_ISO8601:
rc = sqlite3_bind_text(stmt, n, value->text, -1,
SQLITE_TRANSIENT);
break;
case DQLITE_BOOLEAN:
rc = sqlite3_bind_int64(stmt, n,
value->boolean == 0 ? 0 : 1);
break;
default:
rc = DQLITE_PROTO;
break;
}
return rc;
}
int bind__params(sqlite3_stmt *stmt, struct cursor *cursor, int format)
{
struct tuple_decoder decoder;
unsigned long i;
int rc;
assert(format == TUPLE__PARAMS || format == TUPLE__PARAMS32);
sqlite3_reset(stmt);
/* If the payload has been fully consumed, it means there are no
* parameters to bind. */
if (cursor->cap == 0) {
return 0;
}
rc = tuple_decoder__init(&decoder, 0, format, cursor);
if (rc != 0) {
return rc;
}
for (i = 0; i < tuple_decoder__n(&decoder); i++) {
struct value value;
rc = tuple_decoder__next(&decoder, &value);
if (rc != 0) {
return rc;
}
rc = bind_one(stmt, (int)(i + 1), &value);
if (rc != 0) {
return rc;
}
}
return 0;
}
dqlite-1.16.7/src/bind.h 0000664 0000000 0000000 00000000531 14652527134 0014754 0 ustar 00root root 0000000 0000000 /**
* Bind statement parameters decoding them from a client request payload.
*/
#ifndef BIND_H_
#define BIND_H_
#include <sqlite3.h>
#include "lib/serialize.h"
/**
* Bind the parameters of the given statement by decoding the given payload.
*/
int bind__params(sqlite3_stmt *stmt, struct cursor *cursor, int format);
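/* A hedged usage sketch (illustrative only; `stmt`, `body` and `body_len`
 * are assumptions for the example, not part of this API):
 *
 *     struct cursor cursor = { .p = body, .cap = body_len };
 *     int rc = bind__params(stmt, &cursor, TUPLE__PARAMS32);
 *     if (rc == 0) {
 *             // parameters are bound; the statement can now be stepped
 *     }
 */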
#endif /* BIND_H_*/
dqlite-1.16.7/src/client/ 0000775 0000000 0000000 00000000000 14652527134 0015146 5 ustar 00root root 0000000 0000000 dqlite-1.16.7/src/client/protocol.c 0000664 0000000 0000000 00000063305 14652527134 0017162 0 ustar 00root root 0000000 0000000 #include
#include
#include
#include
#include
#include
#include "../lib/assert.h"
#include "../message.h"
#include "../protocol.h"
#include "../request.h"
#include "../response.h"
#include "../tracing.h"
#include "../tuple.h"
#include "protocol.h"
static void oom(void)
{
abort();
}
void *mallocChecked(size_t n)
{
void *p = malloc(n);
if (p == NULL) {
oom();
}
return p;
}
void *callocChecked(size_t count, size_t n)
{
void *p = calloc(count, n);
if (p == NULL) {
oom();
}
return p;
}
char *strdupChecked(const char *s)
{
char *p = strdup(s);
if (p == NULL) {
oom();
}
return p;
}
char *strndupChecked(const char *s, size_t n)
{
char *p = strndup(s, n);
if (p == NULL) {
oom();
}
return p;
}
/* Convert a value that potentially borrows data from the client_proto read
* buffer into one that owns its data. The owned data must be free with
* freeOwnedValue. */
static void makeValueOwned(struct value *val)
{
char *p;
switch (val->type) {
case SQLITE_TEXT:
val->text = strdupChecked(val->text);
break;
case DQLITE_ISO8601:
val->iso8601 = strdupChecked(val->iso8601);
break;
case SQLITE_BLOB:
p = mallocChecked(val->blob.len);
memcpy(p, val->blob.base, val->blob.len);
val->blob.base = p;
break;
default:;
}
}
/* Free the owned data of a value, which must have had makeValueOwned called
* on it previously. This takes its argument by value because it does *not*
* free the memory that stores the `struct value` itself, only the pointers
* held by `struct value`. */
static void freeOwnedValue(struct value val)
{
switch (val.type) {
case SQLITE_TEXT:
free((char *)val.text);
break;
case DQLITE_ISO8601:
free((char *)val.iso8601);
break;
case SQLITE_BLOB:
free(val.blob.base);
break;
default:;
}
}
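/* A hedged pairing note (illustrative of the pattern used below in
 * clientRecvRows/clientCloseRows):
 *
 *     makeValueOwned(&row->values[i]);  // copy text/blob out of c->read
 *     ...
 *     freeOwnedValue(row->values[i]);   // release those copies later
 */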
static int peekUint64(struct cursor cursor, uint64_t *val)
{
if (cursor.cap < 8) {
return DQLITE_CLIENT_PROTO_ERROR;
}
memcpy(val, cursor.p, sizeof(*val));
*val = ByteFlipLe64(*val);
return 0;
}
/* Read data from fd into buf until one of the following occurs:
*
* - The full count n of bytes is read.
* - A read returns 0 (EOF).
* - The context's deadline is reached.
* - An error occurs.
*
* On error, -1 is returned. Otherwise the return value is the count
* of bytes read. This may be less than n if either EOF happened or
* the deadline kicked in. */
static ssize_t doRead(int fd,
void *buf,
size_t buf_len,
struct client_context *context)
{
ssize_t total;
struct pollfd pfd;
struct timespec now;
long long millis;
ssize_t n;
int rv;
pfd.fd = fd;
pfd.events = POLLIN;
pfd.revents = 0;
total = 0;
while ((size_t)total < buf_len) {
rv = clock_gettime(CLOCK_REALTIME, &now);
assert(rv == 0);
if (context != NULL) {
millis =
(context->deadline.tv_sec - now.tv_sec) * 1000 +
(context->deadline.tv_nsec - now.tv_nsec) / 1000000;
if (millis < 0) {
/* poll(2) will block indefinitely if the
* timeout argument is negative, and we don't
* want that here. Signal a timeout. */
break;
}
} else {
/* The caller has explicitly asked us to block
* indefinitely. */
millis = -1;
}
rv = poll(&pfd, 1, (millis > INT_MAX) ? INT_MAX : (int)millis);
if (rv < 0) {
if (errno == EINTR) {
continue;
} else {
return -1;
}
} else if (rv == 0) {
/* Timeout */
break;
}
assert(rv == 1);
if (pfd.revents != POLLIN) {
/* If some other bits are set in the out parameter, an
* error occurred. */
return -1;
}
n = read(fd, (char *)buf + (size_t)total,
buf_len - (size_t)total);
if (n < 0) {
if (errno == EINTR) {
continue;
} else {
return -1;
}
} else if (n == 0) {
/* EOF */
break;
}
total += n;
}
return total;
}
/* Write data into fd from buf until one of the following occurs:
*
* - The full count n of bytes is written.
* - A write returns 0 (EOF).
* - The context's deadline is reached.
* - An error occurs.
*
* On error, -1 is returned. Otherwise the return value is the count
* of bytes written. This may be less than n if either EOF happened or
* the deadline kicked in. */
static ssize_t doWrite(int fd,
void *buf,
size_t buf_len,
struct client_context *context)
{
ssize_t total;
struct pollfd pfd;
struct timespec now;
long long millis;
ssize_t n;
int rv;
pfd.fd = fd;
pfd.events = POLLOUT;
pfd.revents = 0;
total = 0;
while ((size_t)total < buf_len) {
rv = clock_gettime(CLOCK_REALTIME, &now);
assert(rv == 0);
if (context != NULL) {
millis =
(context->deadline.tv_sec - now.tv_sec) * 1000 +
(context->deadline.tv_nsec - now.tv_nsec) / 1000000;
if (millis < 0) {
/* poll(2) will block indefinitely if the
* timeout argument is negative, and we don't
* want that here. Signal a timeout. */
break;
}
} else {
/* The caller has explicitly asked us to block
* indefinitely. */
millis = -1;
}
rv = poll(&pfd, 1, (millis > INT_MAX) ? INT_MAX : (int)millis);
if (rv < 0) {
if (errno == EINTR) {
continue;
} else {
return -1;
}
} else if (rv == 0) {
/* Timeout */
break;
}
assert(rv == 1);
if (pfd.revents != POLLOUT) {
/* If some other bits are set in the out parameter, an
* error occurred. */
return -1;
}
n = write(fd, (char *)buf + (size_t)total,
buf_len - (size_t)total);
if (n < 0) {
if (errno == EINTR) {
continue;
} else {
return -1;
}
} else if (n == 0) {
/* EOF */
break;
}
total += n;
}
return total;
}
static int handleFailure(struct client_proto *c)
{
struct response_failure failure;
struct cursor cursor;
int rv;
cursor.p = buffer__cursor(&c->read, 0);
cursor.cap = buffer__offset(&c->read);
rv = response_failure__decode(&cursor, &failure);
if (rv != 0) {
tracef("decode as failure failed rv:%d", rv);
return DQLITE_CLIENT_PROTO_ERROR;
}
c->errcode = failure.code;
if (c->errmsg != NULL) {
free(c->errmsg);
}
c->errmsg = strdupChecked(failure.message);
return DQLITE_CLIENT_PROTO_RECEIVED_FAILURE;
}
void clientContextMillis(struct client_context *context, long millis)
{
int rv;
rv = clock_gettime(CLOCK_REALTIME, &context->deadline);
assert(rv == 0);
context->deadline.tv_nsec += millis * 1000000;
while (context->deadline.tv_nsec >= 1000000000) {
context->deadline.tv_nsec -= 1000000000;
context->deadline.tv_sec += 1;
}
}
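/* A hedged usage note (illustrative, not part of the original source):
 * callers typically arm a deadline before each request/response pair:
 *
 *     struct client_context context;
 *     clientContextMillis(&context, 5000); // time out after ~5 seconds
 *     rv = clientSendLeader(c, &context);
 */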
/* TODO accept a context here? */
int clientOpen(struct client_proto *c, const char *addr, uint64_t server_id)
{
int rv;
rv = c->connect(c->connect_arg, addr, &c->fd);
if (rv != 0) {
c->fd = -1;
return DQLITE_CLIENT_PROTO_ERROR;
}
c->server_id = server_id;
rv = buffer__init(&c->read);
if (rv != 0) {
oom();
}
rv = buffer__init(&c->write);
if (rv != 0) {
oom();
}
c->errcode = 0;
c->errmsg = NULL;
return 0;
}
void clientClose(struct client_proto *c)
{
tracef("client close");
if (c->fd == -1) {
return;
}
close(c->fd);
c->fd = -1;
buffer__close(&c->write);
buffer__close(&c->read);
free(c->db_name);
c->db_name = NULL;
free(c->errmsg);
c->errmsg = NULL;
c->server_id = 0;
}
int clientSendHandshake(struct client_proto *c, struct client_context *context)
{
uint64_t protocol;
ssize_t rv;
tracef("client send handshake");
protocol = ByteFlipLe64(DQLITE_PROTOCOL_VERSION);
rv = doWrite(c->fd, &protocol, sizeof protocol, context);
if (rv < 0) {
tracef("client send handshake failed %zd", rv);
return DQLITE_CLIENT_PROTO_ERROR;
} else if ((size_t)rv < sizeof protocol) {
return DQLITE_CLIENT_PROTO_SHORT;
}
return 0;
}
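/* A hedged note on session flow, inferred from the helpers in this file
 * (not normative): a typical client session is clientOpen ->
 * clientSendHandshake -> clientSendOpen + clientRecvDb ->
 * clientSendPrepare + clientRecvStmt -> clientSendQuery + clientRecvRows
 * -> clientClose. */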
static int writeMessage(struct client_proto *c,
uint8_t type,
uint8_t schema,
struct client_context *context)
{
struct message message = {0};
size_t n;
size_t words;
char *cursor;
ssize_t rv;
n = buffer__offset(&c->write);
words = (n - message__sizeof(&message)) / 8;
message.words = (uint32_t)words;
message.type = type;
message.schema = schema;
cursor = buffer__cursor(&c->write, 0);
message__encode(&message, &cursor);
rv = doWrite(c->fd, buffer__cursor(&c->write, 0), n, context);
if (rv < 0) {
tracef("request write failed rv:%zd", rv);
return DQLITE_CLIENT_PROTO_ERROR;
} else if ((size_t)rv < n) {
return DQLITE_CLIENT_PROTO_SHORT;
}
return 0;
}
#define BUFFER_REQUEST(LOWER, UPPER) \
{ \
struct message _message = {0}; \
size_t _n1; \
size_t _n2; \
char *_cursor; \
_n1 = message__sizeof(&_message); \
_n2 = request_##LOWER##__sizeof(&request); \
buffer__reset(&c->write); \
_cursor = buffer__advance(&c->write, _n1 + _n2); \
if (_cursor == NULL) { \
oom(); \
} \
assert(_n2 % 8 == 0); \
message__encode(&_message, &_cursor); \
request_##LOWER##__encode(&request, &_cursor); \
}
/* Write out a request. */
#define REQUEST(LOWER, UPPER, SCHEMA) \
{ \
int _rv; \
BUFFER_REQUEST(LOWER, UPPER); \
_rv = \
writeMessage(c, DQLITE_REQUEST_##UPPER, SCHEMA, context); \
if (_rv != 0) { \
return _rv; \
} \
}
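/* A hedged usage note (illustrative): REQUEST is expanded inside the
 * clientSendXxx helpers below, e.g.
 *
 *     struct request_leader request = {0};
 *     REQUEST(leader, LEADER, 0);
 *
 * which encodes the request struct into c->write and sends it as a
 * DQLITE_REQUEST_LEADER message with schema version 0. */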
static int readMessage(struct client_proto *c,
uint8_t *type,
struct client_context *context)
{
struct message message = {0};
struct cursor cursor;
void *p;
size_t n;
ssize_t rv;
buffer__reset(&c->read);
n = message__sizeof(&message);
p = buffer__advance(&c->read, n);
if (p == NULL) {
oom();
}
rv = doRead(c->fd, p, n, context);
if (rv < 0) {
return DQLITE_CLIENT_PROTO_ERROR;
} else if (rv < (ssize_t)n) {
return DQLITE_CLIENT_PROTO_SHORT;
}
cursor.p = p;
cursor.cap = n;
rv = message__decode(&cursor, &message);
if (rv != 0) {
tracef("message decode failed rv:%zd", rv);
return DQLITE_CLIENT_PROTO_ERROR;
}
buffer__reset(&c->read);
n = message.words * 8;
p = buffer__advance(&c->read, n);
if (p == NULL) {
oom();
}
rv = doRead(c->fd, p, n, context);
if (rv < 0) {
return DQLITE_ERROR;
} else if (rv < (ssize_t)n) {
return DQLITE_CLIENT_PROTO_SHORT;
}
*type = message.type;
return 0;
}
/* Read and decode a response. */
#define RESPONSE(LOWER, UPPER) \
{ \
uint8_t _type; \
int _rv; \
_rv = readMessage(c, &_type, context); \
if (_rv != 0) { \
return _rv; \
} \
if (_type == DQLITE_RESPONSE_FAILURE && \
_type != DQLITE_RESPONSE_##UPPER) { \
_rv = handleFailure(c); \
return _rv; \
} else if (_type != DQLITE_RESPONSE_##UPPER) { \
return DQLITE_CLIENT_PROTO_ERROR; \
} \
cursor.p = buffer__cursor(&c->read, 0); \
cursor.cap = buffer__offset(&c->read); \
_rv = response_##LOWER##__decode(&cursor, &response); \
if (_rv != 0) { \
return DQLITE_CLIENT_PROTO_ERROR; \
} \
}
int clientSendLeader(struct client_proto *c, struct client_context *context)
{
tracef("client send leader");
struct request_leader request = {0};
REQUEST(leader, LEADER, 0);
return 0;
}
int clientSendClient(struct client_proto *c,
uint64_t id,
struct client_context *context)
{
tracef("client send client");
struct request_client request;
request.id = id;
REQUEST(client, CLIENT, 0);
return 0;
}
int clientSendOpen(struct client_proto *c,
const char *name,
struct client_context *context)
{
tracef("client send open name %s", name);
struct request_open request;
c->db_name = strdupChecked(name);
request.filename = name;
request.flags = 0; /* unused */
request.vfs = "test"; /* unused */
REQUEST(open, OPEN, 0);
return 0;
}
int clientRecvDb(struct client_proto *c, struct client_context *context)
{
tracef("client recvdb");
struct cursor cursor;
struct response_db response;
RESPONSE(db, DB);
c->db_id = response.id;
c->db_is_init = true;
return 0;
}
int clientSendPrepare(struct client_proto *c,
const char *sql,
struct client_context *context)
{
tracef("client send prepare");
struct request_prepare request;
request.db_id = c->db_id;
request.sql = sql;
REQUEST(prepare, PREPARE, DQLITE_PREPARE_STMT_SCHEMA_V1);
return 0;
}
int clientRecvStmt(struct client_proto *c,
uint32_t *stmt_id,
uint64_t *n_params,
uint64_t *offset,
struct client_context *context)
{
struct cursor cursor;
struct response_stmt_with_offset response;
RESPONSE(stmt_with_offset, STMT_WITH_OFFSET);
if (stmt_id != NULL) {
*stmt_id = response.id;
}
if (n_params != NULL) {
*n_params = response.params;
}
if (offset != NULL) {
*offset = response.offset;
}
return 0;
}
static int bufferParams(struct client_proto *c,
struct value *params,
unsigned n_params)
{
struct tuple_encoder tup;
size_t i;
int rv;
if (n_params == 0) {
return 0;
}
rv = tuple_encoder__init(&tup, n_params, TUPLE__PARAMS32, &c->write);
if (rv != 0) {
return DQLITE_CLIENT_PROTO_ERROR;
}
for (i = 0; i < n_params; ++i) {
rv = tuple_encoder__next(&tup, ¶ms[i]);
if (rv != 0) {
return DQLITE_CLIENT_PROTO_ERROR;
}
}
return 0;
}
int clientSendExec(struct client_proto *c,
uint32_t stmt_id,
struct value *params,
unsigned n_params,
struct client_context *context)
{
tracef("client send exec id %" PRIu32, stmt_id);
struct request_exec request;
int rv;
request.db_id = c->db_id;
request.stmt_id = stmt_id;
BUFFER_REQUEST(exec, EXEC);
rv = bufferParams(c, params, n_params);
if (rv != 0) {
return rv;
}
rv = writeMessage(c, DQLITE_REQUEST_EXEC, 1, context);
return rv;
}
int clientSendExecSQL(struct client_proto *c,
const char *sql,
struct value *params,
unsigned n_params,
struct client_context *context)
{
tracef("client send exec sql");
struct request_exec_sql request;
int rv;
request.db_id = c->db_id;
request.sql = sql;
BUFFER_REQUEST(exec_sql, EXEC_SQL);
rv = bufferParams(c, params, n_params);
if (rv != 0) {
return rv;
}
rv = writeMessage(c, DQLITE_REQUEST_EXEC_SQL, 1, context);
return rv;
}
int clientRecvResult(struct client_proto *c,
uint64_t *last_insert_id,
uint64_t *rows_affected,
struct client_context *context)
{
struct cursor cursor;
struct response_result response;
RESPONSE(result, RESULT);
if (last_insert_id != NULL) {
*last_insert_id = response.last_insert_id;
}
if (rows_affected != NULL) {
*rows_affected = response.rows_affected;
}
return 0;
}
int clientSendQuery(struct client_proto *c,
uint32_t stmt_id,
struct value *params,
unsigned n_params,
struct client_context *context)
{
tracef("client send query stmt_id %" PRIu32, stmt_id);
struct request_query request;
int rv;
request.db_id = c->db_id;
request.stmt_id = stmt_id;
BUFFER_REQUEST(query, QUERY);
rv = bufferParams(c, params, n_params);
if (rv != 0) {
return rv;
}
rv = writeMessage(c, DQLITE_REQUEST_QUERY, 1, context);
return rv;
}
int clientSendQuerySQL(struct client_proto *c,
const char *sql,
struct value *params,
unsigned n_params,
struct client_context *context)
{
tracef("client send query sql sql %s", sql);
struct request_query_sql request;
int rv;
request.db_id = c->db_id;
request.sql = sql;
BUFFER_REQUEST(query_sql, QUERY_SQL);
rv = bufferParams(c, params, n_params);
if (rv != 0) {
return rv;
}
rv = writeMessage(c, DQLITE_REQUEST_QUERY_SQL, 1, context);
return rv;
}
int clientRecvRows(struct client_proto *c,
struct rows *rows,
bool *done,
struct client_context *context)
{
tracef("client recv rows");
struct cursor cursor;
uint8_t type;
uint64_t column_count;
unsigned i;
unsigned j;
const char *raw;
struct row *row;
struct row *last;
uint64_t eof;
struct tuple_decoder tup;
int rv;
rv = readMessage(c, &type, context);
if (rv != 0) {
return rv;
}
if (type == DQLITE_RESPONSE_FAILURE) {
rv = handleFailure(c);
return rv;
} else if (type != DQLITE_RESPONSE_ROWS) {
return DQLITE_CLIENT_PROTO_ERROR;
}
cursor.p = buffer__cursor(&c->read, 0);
cursor.cap = buffer__offset(&c->read);
rv = uint64__decode(&cursor, &column_count);
if (rv != 0) {
return DQLITE_CLIENT_PROTO_ERROR;
}
rows->column_count = (unsigned)column_count;
assert((uint64_t)rows->column_count == column_count);
rows->column_names =
callocChecked(rows->column_count, sizeof *rows->column_names);
for (i = 0; i < rows->column_count; ++i) {
rv = text__decode(&cursor, &raw);
if (rv != 0) {
rv = DQLITE_CLIENT_PROTO_ERROR;
goto err_after_alloc_column_names;
}
rows->column_names[i] = strdupChecked(raw);
}
rows->next = NULL;
last = NULL;
while (1) {
rv = peekUint64(cursor, &eof);
if (rv != 0) {
goto err_after_alloc_column_names;
}
if (eof == DQLITE_RESPONSE_ROWS_DONE ||
eof == DQLITE_RESPONSE_ROWS_PART) {
break;
}
row = mallocChecked(sizeof *row);
row->values =
callocChecked(rows->column_count, sizeof *row->values);
row->next = NULL;
/* Make sure that `goto err_after_alloc_row_values` will do the
* right thing even before we enter the for loop. */
i = 0;
rv = tuple_decoder__init(&tup, rows->column_count, TUPLE__ROW,
&cursor);
if (rv != 0) {
rv = DQLITE_CLIENT_PROTO_ERROR;
goto err_after_alloc_row_values;
}
for (; i < rows->column_count; ++i) {
rv = tuple_decoder__next(&tup, &row->values[i]);
if (rv != 0) {
rv = DQLITE_CLIENT_PROTO_ERROR;
goto err_after_alloc_row_values;
}
makeValueOwned(&row->values[i]);
}
if (last == NULL) {
rows->next = row;
} else {
last->next = row;
}
last = row;
}
assert(eof == DQLITE_RESPONSE_ROWS_DONE ||
eof == DQLITE_RESPONSE_ROWS_PART);
if (done != NULL) {
*done = eof == DQLITE_RESPONSE_ROWS_DONE;
}
return 0;
err_after_alloc_row_values:
for (j = 0; j < i; ++j) {
freeOwnedValue(row->values[j]);
}
free(row->values);
free(row);
err_after_alloc_column_names:
clientCloseRows(rows);
return rv;
}
void clientCloseRows(struct rows *rows)
{
uint64_t i;
struct row *row = rows->next;
struct row *next;
/* Note that we take care to still do the right thing if this was
* called before clientRecvRows completed. */
for (row = rows->next; row != NULL; row = next) {
next = row->next;
row->next = NULL;
for (i = 0; i < rows->column_count; ++i) {
freeOwnedValue(row->values[i]);
}
free(row->values);
row->values = NULL;
free(row);
}
rows->next = NULL;
if (rows->column_names != NULL) {
for (i = 0; i < rows->column_count; ++i) {
free(rows->column_names[i]);
rows->column_names[i] = NULL;
}
}
free(rows->column_names);
}
int clientSendInterrupt(struct client_proto *c, struct client_context *context)
{
tracef("client send interrupt");
struct request_interrupt request;
request.db_id = c->db_id;
REQUEST(interrupt, INTERRUPT, 0);
return 0;
}
int clientSendFinalize(struct client_proto *c,
uint32_t stmt_id,
struct client_context *context)
{
tracef("client send finalize %u", stmt_id);
struct request_finalize request;
request.db_id = c->db_id;
request.stmt_id = stmt_id;
REQUEST(finalize, FINALIZE, 0);
return 0;
}
int clientSendAdd(struct client_proto *c,
uint64_t id,
const char *address,
struct client_context *context)
{
tracef("client send add id %" PRIu64 " address %s", id, address);
struct request_add request;
request.id = id;
request.address = address;
REQUEST(add, ADD, 0);
return 0;
}
int clientSendAssign(struct client_proto *c,
uint64_t id,
int role,
struct client_context *context)
{
tracef("client send assign id %" PRIu64 " role %d", id, role);
assert(role == DQLITE_VOTER || role == DQLITE_STANDBY ||
role == DQLITE_SPARE);
struct request_assign request;
request.id = id;
request.role = (uint64_t)role;
REQUEST(assign, ASSIGN, 0);
return 0;
}
int clientSendRemove(struct client_proto *c,
uint64_t id,
struct client_context *context)
{
tracef("client send remove id %" PRIu64, id);
struct request_remove request;
request.id = id;
REQUEST(remove, REMOVE, 0);
return 0;
}
int clientSendDump(struct client_proto *c, struct client_context *context)
{
tracef("client send dump");
struct request_dump request;
assert(c->db_is_init);
assert(c->db_name != NULL);
request.filename = c->db_name;
REQUEST(dump, DUMP, 0);
return 0;
}
int clientSendCluster(struct client_proto *c, struct client_context *context)
{
tracef("client send cluster");
struct request_cluster request;
request.format = DQLITE_REQUEST_CLUSTER_FORMAT_V1;
REQUEST(cluster, CLUSTER, 0);
return 0;
}
int clientSendTransfer(struct client_proto *c,
uint64_t id,
struct client_context *context)
{
tracef("client send transfer id %" PRIu64, id);
struct request_transfer request;
request.id = id;
REQUEST(transfer, TRANSFER, 0);
return 0;
}
int clientSendDescribe(struct client_proto *c, struct client_context *context)
{
tracef("client send describe");
struct request_describe request;
request.format = DQLITE_REQUEST_DESCRIBE_FORMAT_V0;
REQUEST(describe, DESCRIBE, 0);
return 0;
}
int clientSendWeight(struct client_proto *c,
uint64_t weight,
struct client_context *context)
{
tracef("client send weight %" PRIu64, weight);
struct request_weight request;
request.weight = weight;
REQUEST(weight, WEIGHT, 0);
return 0;
}
int clientRecvServer(struct client_proto *c,
uint64_t *id,
char **address,
struct client_context *context)
{
tracef("client recv server");
struct cursor cursor;
struct response_server response;
*id = 0;
*address = NULL;
RESPONSE(server, SERVER);
*address = strdupChecked(response.address);
*id = response.id;
return 0;
}
int clientRecvWelcome(struct client_proto *c, struct client_context *context)
{
tracef("client recv welcome");
struct cursor cursor;
struct response_welcome response;
RESPONSE(welcome, WELCOME);
return 0;
}
int clientRecvEmpty(struct client_proto *c, struct client_context *context)
{
tracef("client recv empty");
struct cursor cursor;
struct response_empty response;
RESPONSE(empty, EMPTY);
return 0;
}
int clientRecvFailure(struct client_proto *c,
uint64_t *code,
char **msg,
struct client_context *context)
{
tracef("client recv failure");
struct cursor cursor;
struct response_failure response;
RESPONSE(failure, FAILURE);
*code = response.code;
*msg = strdupChecked(response.message);
return 0;
}
int clientRecvServers(struct client_proto *c,
struct client_node_info **servers,
uint64_t *n_servers,
struct client_context *context)
{
tracef("client recv servers");
struct cursor cursor;
size_t n;
uint64_t i = 0;
uint64_t j;
uint64_t raw_role;
const char *raw_addr;
struct response_servers response;
int rv;
*servers = NULL;
*n_servers = 0;
RESPONSE(servers, SERVERS);
n = (size_t)response.n;
assert((uint64_t)n == response.n);
struct client_node_info *srvs = callocChecked(n, sizeof *srvs);
for (; i < response.n; ++i) {
rv = uint64__decode(&cursor, &srvs[i].id);
if (rv != 0) {
goto err_after_alloc_srvs;
}
rv = text__decode(&cursor, &raw_addr);
if (rv != 0) {
goto err_after_alloc_srvs;
}
srvs[i].addr = strdupChecked(raw_addr);
rv = uint64__decode(&cursor, &raw_role);
if (rv != 0) {
free(srvs[i].addr);
goto err_after_alloc_srvs;
}
srvs[i].role = (int)raw_role;
}
*n_servers = n;
*servers = srvs;
return 0;
err_after_alloc_srvs:
for (j = 0; j < i; ++j) {
free(srvs[j].addr);
}
free(srvs);
return rv;
}
int clientRecvFiles(struct client_proto *c,
struct client_file **files,
size_t *n_files,
struct client_context *context)
{
tracef("client recv files");
struct cursor cursor;
struct response_files response;
struct client_file *fs;
size_t n;
size_t z;
size_t i = 0;
size_t j;
const char *raw_name;
int rv;
*files = NULL;
*n_files = 0;
RESPONSE(files, FILES);
n = (size_t)response.n;
assert((uint64_t)n == response.n);
fs = callocChecked(n, sizeof *fs);
for (; i < response.n; ++i) {
rv = text__decode(&cursor, &raw_name);
if (rv != 0) {
goto err_after_alloc_fs;
}
fs[i].name = strdupChecked(raw_name);
rv = uint64__decode(&cursor, &fs[i].size);
if (rv != 0) {
free(fs[i].name);
goto err_after_alloc_fs;
}
if (cursor.cap != fs[i].size) {
free(fs[i].name);
rv = DQLITE_PARSE;
goto err_after_alloc_fs;
}
z = (size_t)fs[i].size;
assert((uint64_t)z == fs[i].size);
fs[i].blob = mallocChecked(z);
memcpy(fs[i].blob, cursor.p, z);
}
*files = fs;
*n_files = n;
return 0;
err_after_alloc_fs:
for (j = 0; j < i; ++j) {
free(fs[j].name);
free(fs[j].blob);
}
free(fs);
return rv;
}
int clientRecvMetadata(struct client_proto *c,
uint64_t *failure_domain,
uint64_t *weight,
struct client_context *context)
{
tracef("client recv metadata");
struct cursor cursor;
struct response_metadata response;
RESPONSE(metadata, METADATA);
*failure_domain = response.failure_domain;
*weight = response.weight;
return 0;
}
dqlite-1.16.7/src/client/protocol.h 0000664 0000000 0000000 00000023363 14652527134 0017167 0 ustar 00root root 0000000 0000000 /* Core dqlite client logic for encoding requests and decoding responses. */
#ifndef DQLITE_CLIENT_PROTOCOL_H_
#define DQLITE_CLIENT_PROTOCOL_H_
#include "../../include/dqlite.h"
#include "../lib/buffer.h"
#include "../tuple.h"
/* All functions declared in this header file return 0 for success or one
* of the following error codes on failure. */
enum {
/* We received a FAILURE response when we expected another response.
*
* The data carried by the FAILURE response can be retrieved from the
* errcode and errmsg fields of struct client_proto.
*
* It's safe to continue using the client_proto object after receiving
* this error code. */
DQLITE_CLIENT_PROTO_RECEIVED_FAILURE = 1,
/* We timed out while reading from or writing to our fd, or a read/write
* returned EOF before the expected number of bytes were read/written.
*
* It is not generally safe to continue using the client_proto object
* after receiving this error code. */
DQLITE_CLIENT_PROTO_SHORT,
/* Another kind of error occurred, like a syscall failure.
*
* It is not generally safe to continue using the client_proto object
* after receiving this error code. */
DQLITE_CLIENT_PROTO_ERROR
};
struct client_proto
{
/* TODO find a better approach to initializing these fields? */
int (*connect)(void *, const char *, int *);
void *connect_arg;
int fd; /* Connected socket */
uint32_t db_id; /* Database ID provided by the server */
char *db_name; /* Database filename (owned) */
bool db_is_init; /* Whether the database ID has been initialized */
uint64_t server_id;
struct buffer read; /* Read buffer */
struct buffer write; /* Write buffer */
uint64_t errcode; /* Last error code returned by the server */
char *errmsg; /* Last error string returned by the server (owned) */
};
/* All of the Send and Recv functions take a `struct client_context *context`
* argument, which controls timeouts for read and write operations (and possibly
* other knobs in the future).
*
* Passing NULL for the context argument is permitted and disables all timeouts.
*/
struct client_context
{
/* An absolute CLOCK_REALTIME timestamp that limits how long we will
* spend trying to complete the requested send or receive operation.
* Whenever we are about to make a blocking syscall (read or write), we
* first poll(2) using a timeout computed based on how much time remains
* before the deadline. If the poll times out, we return early instead
* of completing the operation. */
struct timespec deadline;
};
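/* A minimal usage sketch (illustrative only, not part of the API): assuming
* a connected `struct client_proto c` whose handshake has already been sent,
* a 5-second deadline can be applied to a leader lookup like this:
*
*	struct client_context context;
*	uint64_t id;
*	char *address;
*	int rv;
*
*	clientContextMillis(&context, 5000);
*	rv = clientSendLeader(&c, &context);
*	if (rv == 0) {
*		rv = clientRecvServer(&c, &id, &address, &context);
*	}
*	if (rv == 0) {
*		free(address); // the returned address is owned by the caller
*	}
*/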
/* TODO Consider using a dynamic array instead of a linked list here? */
struct row
{
struct value *values;
struct row *next;
};
struct rows
{
unsigned column_count;
char **column_names;
struct row *next;
};
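/* A minimal consumption sketch (illustrative only): rows arrive as a
* singly-linked list, and a large result may span several ROWS responses,
* so keep calling clientRecvRows() until `done` becomes true:
*
*	struct rows rows;
*	struct row *row;
*	bool done = false;
*	unsigned i;
*	int rv = 0;
*
*	while (rv == 0 && !done) {
*		rv = clientRecvRows(&c, &rows, &done, NULL);
*		if (rv != 0) {
*			break;
*		}
*		for (row = rows.next; row != NULL; row = row->next) {
*			for (i = 0; i < rows.column_count; i++) {
*				... inspect row->values[i] ...
*			}
*		}
*		clientCloseRows(&rows);
*	}
*/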
struct client_node_info
{
uint64_t id;
char *addr;
int role;
};
struct client_file
{
char *name;
uint64_t size;
void *blob;
};
/* Checked allocation functions that abort the process on allocation failure. */
void *mallocChecked(size_t n);
void *callocChecked(size_t nmemb, size_t size);
char *strdupChecked(const char *s);
char *strndupCheck(const char *s, size_t n);
/* Initialize a context whose deadline will fall after the given duration
* in milliseconds. */
DQLITE_VISIBLE_TO_TESTS void clientContextMillis(struct client_context *context,
long millis);
/* Initialize a new client. */
DQLITE_VISIBLE_TO_TESTS int clientOpen(struct client_proto *c,
const char *addr,
uint64_t server_id);
/* Release all memory used by the client, and close the client socket. */
DQLITE_VISIBLE_TO_TESTS void clientClose(struct client_proto *c);
/* Initialize the connection by writing the protocol version. This must be
* called before using any other API. */
DQLITE_VISIBLE_TO_TESTS int clientSendHandshake(struct client_proto *c,
struct client_context *context);
/* Send a request to get the current leader. */
DQLITE_VISIBLE_TO_TESTS int clientSendLeader(struct client_proto *c,
struct client_context *context);
/* Send a request identifying this client to the attached server. */
DQLITE_VISIBLE_TO_TESTS int clientSendClient(struct client_proto *c,
uint64_t id,
struct client_context *context);
/* Send a request to open a database */
DQLITE_VISIBLE_TO_TESTS int clientSendOpen(struct client_proto *c,
const char *name,
struct client_context *context);
/* Receive the response to an open request. */
DQLITE_VISIBLE_TO_TESTS int clientRecvDb(struct client_proto *c,
struct client_context *context);
/* Send a request to prepare a statement. */
DQLITE_VISIBLE_TO_TESTS int clientSendPrepare(struct client_proto *c,
const char *sql,
struct client_context *context);
/* Receive the response to a prepare request. */
DQLITE_VISIBLE_TO_TESTS int clientRecvStmt(struct client_proto *c,
uint32_t *stmt_id,
uint64_t *n_params,
uint64_t *offset,
struct client_context *context);
/* Send a request to execute a statement. */
DQLITE_VISIBLE_TO_TESTS int clientSendExec(struct client_proto *c,
uint32_t stmt_id,
struct value *params,
unsigned n_params,
struct client_context *context);
/* Send a request to execute a non-prepared statement. */
DQLITE_VISIBLE_TO_TESTS int clientSendExecSQL(struct client_proto *c,
const char *sql,
struct value *params,
unsigned n_params,
struct client_context *context);
/* Receive the response to an exec request. */
DQLITE_VISIBLE_TO_TESTS int clientRecvResult(struct client_proto *c,
uint64_t *last_insert_id,
uint64_t *rows_affected,
struct client_context *context);
/* Send a request to perform a query. */
DQLITE_VISIBLE_TO_TESTS int clientSendQuery(struct client_proto *c,
uint32_t stmt_id,
struct value *params,
unsigned n_params,
struct client_context *context);
/* Send a request to perform a non-prepared query. */
DQLITE_VISIBLE_TO_TESTS int clientSendQuerySQL(struct client_proto *c,
const char *sql,
struct value *params,
unsigned n_params,
struct client_context *context);
/* Receive the response of a query request. */
DQLITE_VISIBLE_TO_TESTS int clientRecvRows(struct client_proto *c,
struct rows *rows,
bool *done,
struct client_context *context);
/* Release all memory used in the given rows object. */
DQLITE_VISIBLE_TO_TESTS void clientCloseRows(struct rows *rows);
/* Send a request to interrupt a server that's sending rows. */
DQLITE_VISIBLE_TO_TESTS int clientSendInterrupt(struct client_proto *c,
struct client_context *context);
/* Send a request to finalize a prepared statement. */
DQLITE_VISIBLE_TO_TESTS int clientSendFinalize(struct client_proto *c,
uint32_t stmt_id,
struct client_context *context);
/* Send a request to add a dqlite node. */
DQLITE_VISIBLE_TO_TESTS int clientSendAdd(struct client_proto *c,
uint64_t id,
const char *address,
struct client_context *context);
/* Send a request to assign a role to a node. */
DQLITE_VISIBLE_TO_TESTS int clientSendAssign(struct client_proto *c,
uint64_t id,
int role,
struct client_context *context);
/* Send a request to remove a server from the cluster. */
DQLITE_VISIBLE_TO_TESTS int clientSendRemove(struct client_proto *c,
uint64_t id,
struct client_context *context);
/* Send a request to dump the contents of the attached database. */
DQLITE_VISIBLE_TO_TESTS int clientSendDump(struct client_proto *c,
struct client_context *context);
/* Send a request to list the nodes of the cluster with their addresses and
* roles. */
DQLITE_VISIBLE_TO_TESTS int clientSendCluster(struct client_proto *c,
struct client_context *context);
/* Send a request to transfer leadership to node with id `id`. */
DQLITE_VISIBLE_TO_TESTS int clientSendTransfer(struct client_proto *c,
uint64_t id,
struct client_context *context);
/* Send a request to retrieve metadata about the attached server. */
DQLITE_VISIBLE_TO_TESTS int clientSendDescribe(struct client_proto *c,
struct client_context *context);
/* Send a request to set the weight metadata for the attached server. */
DQLITE_VISIBLE_TO_TESTS int clientSendWeight(struct client_proto *c,
uint64_t weight,
struct client_context *context);
/* Receive a response with the ID and address of a single node. */
DQLITE_VISIBLE_TO_TESTS int clientRecvServer(struct client_proto *c,
uint64_t *id,
char **address,
struct client_context *context);
/* Receive a "welcome" handshake response. */
DQLITE_VISIBLE_TO_TESTS int clientRecvWelcome(struct client_proto *c,
struct client_context *context);
/* Receive an empty response. */
DQLITE_VISIBLE_TO_TESTS int clientRecvEmpty(struct client_proto *c,
struct client_context *context);
/* Receive a failure response. */
DQLITE_VISIBLE_TO_TESTS int clientRecvFailure(struct client_proto *c,
uint64_t *code,
char **msg,
struct client_context *context);
/* Receive a list of nodes in the cluster. */
DQLITE_VISIBLE_TO_TESTS int clientRecvServers(struct client_proto *c,
struct client_node_info **servers,
uint64_t *n_servers,
struct client_context *context);
/* Receive a list of files that make up a database. */
DQLITE_VISIBLE_TO_TESTS int clientRecvFiles(struct client_proto *c,
struct client_file **files,
size_t *n_files,
struct client_context *context);
/* Receive metadata for a single server. */
DQLITE_VISIBLE_TO_TESTS int clientRecvMetadata(struct client_proto *c,
uint64_t *failure_domain,
uint64_t *weight,
struct client_context *context);
#endif /* DQLITE_CLIENT_PROTOCOL_H_ */
dqlite-1.16.7/src/command.c 0000664 0000000 0000000 00000010300 14652527134 0015444 0 ustar 00root root 0000000 0000000 #include
#include "../include/dqlite.h"
#include "lib/serialize.h"
#include "command.h"
#include "protocol.h"
#define FORMAT 1 /* Format version */
#define HEADER(X, ...) \
X(uint8, format, ##__VA_ARGS__) \
X(uint8, type, ##__VA_ARGS__) \
X(uint8, _unused1, ##__VA_ARGS__) \
X(uint8, _unused2, ##__VA_ARGS__) \
X(uint32, _unused3, ##__VA_ARGS__)
SERIALIZE__DEFINE(header, HEADER);
SERIALIZE__IMPLEMENT(header, HEADER);
static size_t frames__sizeof(const frames_t *frames)
{
size_t s = uint32__sizeof(&frames->n_pages) +
uint16__sizeof(&frames->page_size) +
uint16__sizeof(&frames->__unused__) +
sizeof(uint64_t) * frames->n_pages + /* Page numbers */
frames->page_size * frames->n_pages; /* Page data */
return s;
}
static void frames__encode(const frames_t *frames, char **cursor)
{
const dqlite_vfs_frame *list;
unsigned i;
uint32__encode(&frames->n_pages, cursor);
uint16__encode(&frames->page_size, cursor);
uint16__encode(&frames->__unused__, cursor);
list = frames->data;
for (i = 0; i < frames->n_pages; i++) {
uint64_t pgno = list[i].page_number;
uint64__encode(&pgno, cursor);
}
for (i = 0; i < frames->n_pages; i++) {
memcpy(*cursor, list[i].data, frames->page_size);
*cursor += frames->page_size;
}
}
static int frames__decode(struct cursor *cursor, frames_t *frames)
{
int rc;
rc = uint32__decode(cursor, &frames->n_pages);
if (rc != 0) {
return rc;
}
rc = uint16__decode(cursor, &frames->page_size);
if (rc != 0) {
return rc;
}
rc = uint16__decode(cursor, &frames->__unused__);
if (rc != 0) {
return rc;
}
frames->data = cursor->p;
return 0;
}
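/* On the wire, a frames_t is therefore laid out as:
*
*	uint32 n_pages | uint16 page_size | uint16 unused
*	n_pages x uint64 page number
*	n_pages x page_size bytes of page data
*
* Note that frames__decode() only records where this blob starts; the page
* numbers and page data are extracted later with
* command_frames__page_numbers() and command_frames__pages(). */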
#define COMMAND__IMPLEMENT(LOWER, UPPER, _) \
SERIALIZE__IMPLEMENT(command_##LOWER, COMMAND__##UPPER);
COMMAND__TYPES(COMMAND__IMPLEMENT, );
#define ENCODE(LOWER, UPPER, _) \
case COMMAND_##UPPER: \
h.type = COMMAND_##UPPER; \
buf->len = header__sizeof(&h); \
buf->len += command_##LOWER##__sizeof(command); \
buf->base = raft_malloc(buf->len); \
if (buf->base == NULL) { \
return DQLITE_NOMEM; \
} \
cursor = buf->base; \
header__encode(&h, &cursor); \
command_##LOWER##__encode(command, &cursor); \
break;
int command__encode(int type, const void *command, struct raft_buffer *buf)
{
struct header h = {0};
char *cursor;
int rc = 0;
h.format = FORMAT;
switch (type) {
COMMAND__TYPES(ENCODE, )
};
return rc;
}
#define DECODE(LOWER, UPPER, _) \
case COMMAND_##UPPER: \
*command = raft_malloc(sizeof(struct command_##LOWER)); \
if (*command == NULL) { \
return DQLITE_NOMEM; \
} \
rc = command_##LOWER##__decode(&cursor, *command); \
break;
int command__decode(const struct raft_buffer *buf, int *type, void **command)
{
struct header h;
struct cursor cursor;
int rc;
cursor.p = buf->base;
cursor.cap = buf->len;
rc = header__decode(&cursor, &h);
if (rc != 0) {
return rc;
}
if (h.format != FORMAT) {
return DQLITE_PROTO;
}
switch (h.type) {
COMMAND__TYPES(DECODE, )
default:
rc = DQLITE_PROTO;
break;
};
if (rc != 0) {
return rc;
}
*type = h.type;
return 0;
}
int command_frames__page_numbers(const struct command_frames *c,
unsigned long *page_numbers[])
{
unsigned i;
struct cursor cursor;
cursor.p = c->frames.data;
cursor.cap = sizeof(uint64_t) * c->frames.n_pages;
*page_numbers =
sqlite3_malloc64(sizeof **page_numbers * c->frames.n_pages);
if (*page_numbers == NULL) {
return DQLITE_NOMEM;
}
for (i = 0; i < c->frames.n_pages; i++) {
uint64_t pgno;
int r = uint64__decode(&cursor, &pgno);
if (r != 0) {
return r;
}
(*page_numbers)[i] = (unsigned long)pgno;
}
return 0;
}
void command_frames__pages(const struct command_frames *c, void **pages)
{
*pages =
(void *)(c->frames.data + (sizeof(uint64_t) * c->frames.n_pages));
}
dqlite-1.16.7/src/command.h 0000664 0000000 0000000 00000004277 14652527134 0015471 0 ustar 00root root 0000000 0000000 /**
* Encode and decode dqlite Raft FSM commands.
*/
#ifndef COMMAND_H_
#define COMMAND_H_
#include "../include/dqlite.h"
#include "lib/serialize.h"
#include "raft.h"
/* Command type codes */
enum { COMMAND_OPEN = 1, COMMAND_FRAMES, COMMAND_UNDO, COMMAND_CHECKPOINT };
/* Hold information about an array of WAL frames. */
struct frames
{
uint32_t n_pages;
uint16_t page_size;
uint16_t __unused__;
/* TODO: because the sqlite3 replication APIs are asymmetric, the
* format differs between encode and decode. When encoding, data is
* expected to be a sqlite3_wal_replication_frame* array; when
* decoding, it will be a pointer to raw memory which can be further
* decoded with the command_frames__page_numbers() and
* command_frames__pages() helpers. */
const void *data;
};
typedef struct frames frames_t;
/* Serialization definitions for a raft FSM command. */
#define COMMAND__DEFINE(LOWER, UPPER, _) \
SERIALIZE__DEFINE_STRUCT(command_##LOWER, COMMAND__##UPPER);
#define COMMAND__OPEN(X, ...) X(text, filename, ##__VA_ARGS__)
#define COMMAND__FRAMES(X, ...) \
X(text, filename, ##__VA_ARGS__) \
X(uint64, tx_id, ##__VA_ARGS__) \
X(uint32, truncate, ##__VA_ARGS__) \
X(uint8, is_commit, ##__VA_ARGS__) \
X(uint8, __unused1__, ##__VA_ARGS__) \
X(uint16, __unused2__, ##__VA_ARGS__) \
X(frames, frames, ##__VA_ARGS__)
#define COMMAND__UNDO(X, ...) X(uint64, tx_id, ##__VA_ARGS__)
#define COMMAND__CHECKPOINT(X, ...) X(text, filename, ##__VA_ARGS__)
#define COMMAND__TYPES(X, ...) \
X(open, OPEN, __VA_ARGS__) \
X(frames, FRAMES, __VA_ARGS__) \
X(undo, UNDO, __VA_ARGS__) \
X(checkpoint, CHECKPOINT, __VA_ARGS__)
COMMAND__TYPES(COMMAND__DEFINE);
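/* As an illustration of the X-macros above (the expansion is sketched here,
* not quoted from generated code), the COMMAND_OPEN entry produces roughly:
*
*	struct command_open
*	{
*		text_t filename;
*	};
*
* together with the command_open__sizeof/__encode/__decode helpers brought in
* by the SERIALIZE__ macros from lib/serialize.h. */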
DQLITE_VISIBLE_TO_TESTS int command__encode(int type,
const void *command,
struct raft_buffer *buf);
DQLITE_VISIBLE_TO_TESTS int command__decode(const struct raft_buffer *buf,
int *type,
void **command);
DQLITE_VISIBLE_TO_TESTS int command_frames__page_numbers(
const struct command_frames *c,
unsigned long *page_numbers[]);
DQLITE_VISIBLE_TO_TESTS void command_frames__pages(
const struct command_frames *c,
void **pages);
#endif /* COMMAND_H_*/
dqlite-1.16.7/src/config.c 0000664 0000000 0000000 00000003222 14652527134 0015300 0 ustar 00root root 0000000 0000000 #include
#include <stdio.h>
#include <string.h>
#include "../include/dqlite.h"
#include "./lib/assert.h"
#include "config.h"
#include "logger.h"
/* Default heartbeat timeout in milliseconds.
*
* Clients will be disconnected if the server does not receive a heartbeat
* message within this time. */
#define DEFAULT_HEARTBEAT_TIMEOUT 15000
/* Default database page size in bytes. */
#define DEFAULT_PAGE_SIZE 4096
/* Number of outstanding WAL frames after which a checkpoint is triggered as
* soon as possible. */
#define DEFAULT_CHECKPOINT_THRESHOLD 1000
/* For generating unique replication/VFS registration names.
*
* TODO: make this thread safe. */
static unsigned serial = 1;
int config__init(struct config *c,
dqlite_node_id id,
const char *address,
const char *raft_dir,
const char *database_dir)
{
int rv;
c->id = id;
c->address = sqlite3_malloc((int)strlen(address) + 1);
if (c->address == NULL) {
return DQLITE_NOMEM;
}
strcpy(c->address, address);
c->heartbeat_timeout = DEFAULT_HEARTBEAT_TIMEOUT;
c->page_size = DEFAULT_PAGE_SIZE;
c->checkpoint_threshold = DEFAULT_CHECKPOINT_THRESHOLD;
rv = snprintf(c->name, sizeof c->name, "dqlite-%u", serial);
assert(rv < (int)(sizeof c->name));
c->logger.data = NULL;
c->logger.emit = loggerDefaultEmit;
c->failure_domain = 0;
c->weight = 0;
snprintf(c->raft_dir, sizeof(c->raft_dir), "%s", (raft_dir != NULL) ? raft_dir : "");
snprintf(c->database_dir, sizeof(c->database_dir), "%s", database_dir);
c->disk = false;
c->voters = 3;
c->standbys = 0;
c->pool_thread_count = 4;
serial++;
return 0;
}
void config__close(struct config *c)
{
sqlite3_free(c->address);
}
dqlite-1.16.7/src/config.h 0000664 0000000 0000000 00000002720 14652527134 0015307 0 ustar 00root root 0000000 0000000 #ifndef CONFIG_H_
#define CONFIG_H_
#include "logger.h"
/**
* Value object holding dqlite configuration.
*/
struct config {
dqlite_node_id id; /* Unique instance ID */
char *address; /* Instance address */
unsigned heartbeat_timeout; /* In milliseconds */
unsigned page_size; /* Database page size */
unsigned checkpoint_threshold; /* In outstanding WAL frames */
struct logger logger; /* Custom logger */
char name[256]; /* VFS/replication registration name */
unsigned long long failure_domain; /* User-provided failure domain */
unsigned long long int weight; /* User-provided node weight */
char raft_dir[1024]; /* Directory used by raft */
char database_dir[1024]; /* Data dir for on-disk database */
bool disk; /* Disk-mode or not */
int voters; /* Target number of voters */
int standbys; /* Target number of standbys */
unsigned pool_thread_count; /* Number of threads in thread pool */
};
/**
* Initialize the config object with required values and set the rest to sane
* defaults. A copy will be made of the given @address.
*/
int config__init(struct config *c,
dqlite_node_id id,
const char *address,
const char *raft_dir,
const char *database_dir);
/**
* Release any memory held by the config object.
*/
void config__close(struct config *c);
#endif /* CONFIG_H_ */
dqlite-1.16.7/src/conn.c 0000664 0000000 0000000 00000017037 14652527134 0015001 0 ustar 00root root 0000000 0000000 #include "conn.h"
#include "message.h"
#include "protocol.h"
#include "request.h"
#include "tracing.h"
#include "transport.h"
#include <inttypes.h>
/* Initialize the given buffer for reading, ensure it has the given size. */
static int init_read(struct conn *c, uv_buf_t *buf, size_t size)
{
buffer__reset(&c->read);
buf->base = buffer__advance(&c->read, size);
if (buf->base == NULL) {
return DQLITE_NOMEM;
}
buf->len = size;
return 0;
}
static int read_message(struct conn *c);
static void conn_write_cb(struct transport *transport, int status)
{
struct conn *c = transport->data;
bool finished;
int rv;
if (status != 0) {
tracef("write cb status %d", status);
goto abort;
}
buffer__reset(&c->write);
buffer__advance(&c->write, message__sizeof(&c->response)); /* Header */
rv = gateway__resume(&c->gateway, &finished);
if (rv != 0) {
goto abort;
}
if (!finished) {
return;
}
/* Start reading the next request */
rv = read_message(c);
if (rv != 0) {
goto abort;
}
return;
abort:
conn__stop(c);
}
static void gateway_handle_cb(struct handle *req,
int status,
uint8_t type,
uint8_t schema)
{
struct conn *c = req->data;
size_t n;
char *cursor;
uv_buf_t buf;
int rv;
assert(schema <= req->schema);
/* Ignore results firing after we started closing. TODO: instead, we
* should make gateway__close() asynchronous. */
if (c->closed) {
tracef("gateway handle cb closed");
return;
}
if (status != 0) {
tracef("gateway handle cb status %d", status);
goto abort;
}
n = buffer__offset(&c->write) - message__sizeof(&c->response);
assert(n % 8 == 0);
c->response.type = type;
c->response.words = (uint32_t)(n / 8);
c->response.schema = schema;
c->response.extra = 0;
cursor = buffer__cursor(&c->write, 0);
message__encode(&c->response, &cursor);
buf.base = buffer__cursor(&c->write, 0);
buf.len = buffer__offset(&c->write);
rv = transport__write(&c->transport, &buf, conn_write_cb);
if (rv != 0) {
tracef("transport write failed %d", rv);
goto abort;
}
return;
abort:
conn__stop(c);
}
static void closeCb(struct transport *transport)
{
struct conn *c = transport->data;
buffer__close(&c->write);
buffer__close(&c->read);
if (c->close_cb != NULL) {
c->close_cb(c);
}
}
static void raft_connect(struct conn *c)
{
struct cursor *cursor = &c->handle.cursor;
struct request_connect request;
int rv;
tracef("raft_connect");
rv = request_connect__decode(cursor, &request);
if (rv != 0) {
tracef("request connect decode failed %d", rv);
conn__stop(c);
return;
}
raftProxyAccept(c->uv_transport, request.id, request.address,
c->transport.stream);
/* Close the connection without actually closing the transport, since
* the stream will be used by raft */
c->closed = true;
closeCb(&c->transport);
}
static void read_request_cb(struct transport *transport, int status)
{
struct conn *c = transport->data;
struct cursor *cursor = &c->handle.cursor;
int rv;
if (status != 0) {
tracef("read error %d", status);
// errorf(c->logger, "read error");
conn__stop(c);
return;
}
cursor->p = buffer__cursor(&c->read, 0);
cursor->cap = buffer__offset(&c->read);
buffer__reset(&c->write);
buffer__advance(&c->write, message__sizeof(&c->response)); /* Header */
switch (c->request.type) {
case DQLITE_REQUEST_CONNECT:
raft_connect(c);
return;
}
rv = gateway__handle(&c->gateway, &c->handle, c->request.type,
c->request.schema, &c->write, gateway_handle_cb);
if (rv != 0) {
tracef("read gateway handle error %d", rv);
conn__stop(c);
}
}
/* Start reading the body of the next request */
static int read_request(struct conn *c)
{
uv_buf_t buf;
int rv;
if (UINT64_C(8) * (uint64_t)c->request.words > (uint64_t)UINT32_MAX) {
return DQLITE_ERROR;
}
rv = init_read(c, &buf, c->request.words * 8);
if (rv != 0) {
tracef("init read failed %d", rv);
return rv;
}
if (c->request.words == 0) {
return 0;
}
rv = transport__read(&c->transport, &buf, read_request_cb);
if (rv != 0) {
tracef("transport read failed %d", rv);
return rv;
}
return 0;
}
static void read_message_cb(struct transport *transport, int status)
{
struct conn *c = transport->data;
struct cursor cursor;
int rv;
if (status != 0) {
// errorf(c->logger, "read error");
tracef("read error %d", status);
conn__stop(c);
return;
}
cursor.p = buffer__cursor(&c->read, 0);
cursor.cap = buffer__offset(&c->read);
rv = message__decode(&cursor, &c->request);
assert(rv == 0); /* Can't fail, we know we have enough bytes */
rv = read_request(c);
if (rv != 0) {
tracef("read request error %d", rv);
conn__stop(c);
return;
}
}
/* Start reading metadata about the next message */
static int read_message(struct conn *c)
{
uv_buf_t buf;
int rv;
rv = init_read(c, &buf, message__sizeof(&c->request));
if (rv != 0) {
tracef("init read failed %d", rv);
return rv;
}
rv = transport__read(&c->transport, &buf, read_message_cb);
if (rv != 0) {
tracef("transport read failed %d", rv);
return rv;
}
return 0;
}
static void read_protocol_cb(struct transport *transport, int status)
{
struct conn *c = transport->data;
struct cursor cursor;
int rv;
if (status != 0) {
// errorf(c->logger, "read error");
tracef("read error %d", status);
goto abort;
}
cursor.p = buffer__cursor(&c->read, 0);
cursor.cap = buffer__offset(&c->read);
rv = uint64__decode(&cursor, &c->protocol);
assert(rv == 0); /* Can't fail, we know we have enough bytes */
if (c->protocol != DQLITE_PROTOCOL_VERSION &&
c->protocol != DQLITE_PROTOCOL_VERSION_LEGACY) {
/* errorf(c->logger, "unknown protocol version: %lx", */
/* c->protocol); */
/* TODO: instead of closing the connection we should return
* error messages */
tracef("unknown protocol version %" PRIu64, c->protocol);
goto abort;
}
c->gateway.protocol = c->protocol;
rv = read_message(c);
if (rv != 0) {
goto abort;
}
return;
abort:
conn__stop(c);
}
/* Start reading the protocol format version */
static int read_protocol(struct conn *c)
{
uv_buf_t buf;
int rv;
rv = init_read(c, &buf, sizeof c->protocol);
if (rv != 0) {
tracef("init read failed %d", rv);
return rv;
}
rv = transport__read(&c->transport, &buf, read_protocol_cb);
if (rv != 0) {
tracef("transport read failed %d", rv);
return rv;
}
return 0;
}
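/* Overview of the connection state machine implemented by the callbacks
* above: after conn__start() the peer must first send its 8-byte protocol
* version (read_protocol). The connection then loops: read_message reads a
* request header, read_request reads the request body and hands it to the
* gateway, and once the response has been written conn_write_cb starts the
* next read_message. Any error on this path ends up in conn__stop(). */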
int conn__start(struct conn *c,
struct config *config,
struct uv_loop_s *loop,
struct registry *registry,
struct raft *raft,
struct uv_stream_s *stream,
struct raft_uv_transport *uv_transport,
struct id_state seed,
conn_close_cb close_cb)
{
int rv;
(void)loop;
tracef("conn start");
rv = transport__init(&c->transport, stream);
if (rv != 0) {
tracef("conn start - transport init failed %d", rv);
goto err;
}
c->config = config;
c->transport.data = c;
c->uv_transport = uv_transport;
c->close_cb = close_cb;
gateway__init(&c->gateway, config, registry, raft, seed);
rv = buffer__init(&c->read);
if (rv != 0) {
goto err_after_transport_init;
}
rv = buffer__init(&c->write);
if (rv != 0) {
goto err_after_read_buffer_init;
}
c->handle.data = c;
c->closed = false;
/* First, we expect the client to send us the protocol version. */
rv = read_protocol(c);
if (rv != 0) {
goto err_after_write_buffer_init;
}
return 0;
err_after_write_buffer_init:
buffer__close(&c->write);
err_after_read_buffer_init:
buffer__close(&c->read);
err_after_transport_init:
transport__close(&c->transport, NULL);
err:
return rv;
}
void conn__stop(struct conn *c)
{
tracef("conn stop");
if (c->closed) {
return;
}
c->closed = true;
gateway__close(&c->gateway);
transport__close(&c->transport, closeCb);
}
dqlite-1.16.7/src/conn.h 0000664 0000000 0000000 00000003216 14652527134 0015000 0 ustar 00root root 0000000 0000000 /**
* Handle a single client connection.
*/
#ifndef DQLITE_CONN_H_
#define DQLITE_CONN_H_
#include "lib/buffer.h"
#include "lib/queue.h"
#include "lib/transport.h"
#include "gateway.h"
#include "id.h"
#include "message.h"
#include "raft.h"
/**
* Callbacks.
*/
struct conn;
typedef void (*conn_close_cb)(struct conn *c);
struct conn
{
struct config *config;
struct raft_uv_transport *uv_transport; /* Raft transport */
conn_close_cb close_cb; /* Close callback */
struct transport transport; /* Async network read/write */
struct gateway gateway; /* Request handler */
struct buffer read; /* Read buffer */
struct buffer write; /* Write buffer */
uint64_t protocol; /* Protocol format version */
struct message request; /* Request message meta data */
struct message response; /* Response message meta data */
struct handle handle;
bool closed;
queue queue;
};
/**
* Initialize and start a connection.
*
* If no error is returned, the connection should be considered started. Any
* error occurring after this point will trigger the @close_cb callback.
*/
int conn__start(struct conn *c,
struct config *config,
struct uv_loop_s *loop,
struct registry *registry,
struct raft *raft,
struct uv_stream_s *stream,
struct raft_uv_transport *uv_transport,
struct id_state seed,
conn_close_cb close_cb);
/**
* Force closing the connection. The close callback will be invoked when it's
* safe to release the memory of the connection object.
*/
void conn__stop(struct conn *c);
#endif /* DQLITE_CONN_H_ */
dqlite-1.16.7/src/db.c 0000664 0000000 0000000 00000007334 14652527134 0014430 0 ustar 00root root 0000000 0000000 #include
#include <string.h>
#include "../include/dqlite.h"
#include "./lib/assert.h"
#include "db.h"
#include "tracing.h"
/* Limit taken from sqlite unix vfs. */
#define MAX_PATHNAME 512
/* Open a SQLite connection and set it to follower mode. */
static int open_follower_conn(const char *filename,
const char *vfs,
unsigned page_size,
sqlite3 **conn);
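/* djb2-style string hash (h = h * 33 + c). The result is stored as the
* database cookie, which is used to consistently bind a database to one of
* the pool's threads. */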
static uint32_t str_hash(const char *name)
{
const unsigned char *p;
uint32_t h = 5381U;
for (p = (const unsigned char *) name; *p != '\0'; p++) {
h = (h << 5) + h + *p;
}
return h;
}
int db__init(struct db *db, struct config *config, const char *filename)
{
tracef("db init filename=`%s'", filename);
int rv;
db->config = config;
db->cookie = str_hash(filename);
db->filename = sqlite3_malloc((int)(strlen(filename) + 1));
if (db->filename == NULL) {
rv = DQLITE_NOMEM;
goto err;
}
strcpy(db->filename, filename);
db->path = sqlite3_malloc(MAX_PATHNAME + 1);
if (db->path == NULL) {
rv = DQLITE_NOMEM;
goto err_after_filename_alloc;
}
if (db->config->disk) {
rv = snprintf(db->path, MAX_PATHNAME + 1, "%s/%s",
db->config->database_dir, db->filename);
} else {
rv = snprintf(db->path, MAX_PATHNAME + 1, "%s", db->filename);
}
if (rv < 0 || rv >= MAX_PATHNAME + 1) {
/* snprintf failed or the path was truncated: make sure we return
* a proper dqlite error code rather than the raw snprintf result. */
rv = DQLITE_ERROR;
goto err_after_path_alloc;
}
db->follower = NULL;
db->tx_id = 0;
db->read_lock = 0;
queue_init(&db->leaders);
return 0;
err_after_path_alloc:
sqlite3_free(db->path);
err_after_filename_alloc:
sqlite3_free(db->filename);
err:
return rv;
}
void db__close(struct db *db)
{
assert(queue_empty(&db->leaders));
if (db->follower != NULL) {
int rc;
rc = sqlite3_close(db->follower);
assert(rc == SQLITE_OK);
}
sqlite3_free(db->path);
sqlite3_free(db->filename);
}
int db__open_follower(struct db *db)
{
int rc;
assert(db->follower == NULL);
rc = open_follower_conn(db->path, db->config->name,
db->config->page_size, &db->follower);
return rc;
}
static int open_follower_conn(const char *filename,
const char *vfs,
unsigned page_size,
sqlite3 **conn)
{
char pragma[255];
int flags = SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE;
char *msg = NULL;
int rc;
tracef("open follower conn: %s page_size:%u", filename, page_size);
rc = sqlite3_open_v2(filename, conn, flags, vfs);
if (rc != SQLITE_OK) {
tracef("open_v2 failed %d", rc);
goto err;
}
/* Enable extended result codes */
rc = sqlite3_extended_result_codes(*conn, 1);
if (rc != SQLITE_OK) {
goto err;
}
/* The vfs, db, gateway, and leader code currently assumes that
* each connection will operate on only one DB file/WAL file
* pair. Make sure that the client can't use ATTACH DATABASE to
* break this assumption. We apply the same limit in openConnection
* in leader.c.
*
* Note, 0 instead of 1 -- apparently the "initial database" is not
* counted when evaluating this limit. */
sqlite3_limit(*conn, SQLITE_LIMIT_ATTACHED, 0);
/* Set the page size. */
snprintf(pragma, sizeof pragma, "PRAGMA page_size=%u", page_size);
rc = sqlite3_exec(*conn, pragma, NULL, NULL, &msg);
if (rc != SQLITE_OK) {
tracef("page_size=%d failed", page_size);
goto err;
}
/* Disable syncs. */
rc = sqlite3_exec(*conn, "PRAGMA synchronous=OFF", NULL, NULL, &msg);
if (rc != SQLITE_OK) {
tracef("synchronous=OFF failed");
goto err;
}
/* Set WAL journaling. */
rc = sqlite3_exec(*conn, "PRAGMA journal_mode=WAL", NULL, NULL, &msg);
if (rc != SQLITE_OK) {
tracef("journal_mode=WAL failed");
goto err;
}
rc =
sqlite3_db_config(*conn, SQLITE_DBCONFIG_NO_CKPT_ON_CLOSE, 1, NULL);
if (rc != SQLITE_OK) {
goto err;
}
return 0;
err:
if (*conn != NULL) {
sqlite3_close(*conn);
*conn = NULL;
}
if (msg != NULL) {
sqlite3_free(msg);
}
return rc;
}
dqlite-1.16.7/src/db.h 0000664 0000000 0000000 00000002171 14652527134 0014427 0 ustar 00root root 0000000 0000000 /**
* State of a single database.
*/
#ifndef DB_H_
#define DB_H_
#include <sqlite3.h>
#include "lib/queue.h"
#include "config.h"
struct db
{
struct config *config; /* Dqlite configuration */
char *filename; /* Database filename */
char *path; /* Used for on-disk db */
uint32_t cookie; /* Used to bind to the pool's thread */
sqlite3 *follower; /* Follower connection */
queue leaders; /* Open leader connections */
unsigned tx_id; /* Current ongoing transaction ID, if any */
queue queue; /* Prev/next database, used by the registry */
int read_lock; /* Lock used by snapshots & checkpoints */
};
/**
* Initialize a database object.
*
* The given @filename will be copied.
* Return 0 on success.
*/
int db__init(struct db *db, struct config *config, const char *filename);
/**
* Release all memory associated with a database object.
*
* If the follower connection was opened, it will be closed.
*/
void db__close(struct db *db);
/**
* Open the follower connection associated with this database.
*/
int db__open_follower(struct db *db);
#endif /* DB_H_*/
dqlite-1.16.7/src/dqlite.c 0000664 0000000 0000000 00000003705 14652527134 0015323 0 ustar 00root root 0000000 0000000 #include "../include/dqlite.h"
#include "vfs.h"
int dqlite_version_number(void)
{
return DQLITE_VERSION_NUMBER;
}
int dqlite_vfs_init(sqlite3_vfs *vfs, const char *name)
{
return VfsInit(vfs, name);
}
int dqlite_vfs_enable_disk(sqlite3_vfs *vfs)
{
return VfsEnableDisk(vfs);
}
void dqlite_vfs_close(sqlite3_vfs *vfs)
{
VfsClose(vfs);
}
int dqlite_vfs_poll(sqlite3_vfs *vfs,
const char *filename,
dqlite_vfs_frame **frames,
unsigned *n)
{
return VfsPoll(vfs, filename, frames, n);
}
int dqlite_vfs_apply(sqlite3_vfs *vfs,
const char *filename,
unsigned n,
unsigned long *page_numbers,
void *frames)
{
return VfsApply(vfs, filename, n, page_numbers, frames);
}
int dqlite_vfs_abort(sqlite3_vfs *vfs, const char *filename)
{
return VfsAbort(vfs, filename);
}
int dqlite_vfs_snapshot(sqlite3_vfs *vfs,
const char *filename,
void **data,
size_t *n)
{
return VfsSnapshot(vfs, filename, data, n);
}
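/* Note: the caller must pass exactly two buffers, as enforced below:
* bufs[0] receives the main database file and bufs[1] receives the WAL. */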
int dqlite_vfs_snapshot_disk(sqlite3_vfs *vfs,
const char *filename,
struct dqlite_buffer bufs[],
unsigned n)
{
int rv;
if (n != 2) {
return -1;
}
rv = VfsDiskSnapshotDb(vfs, filename, &bufs[0]);
if (rv != 0) {
return rv;
}
rv = VfsDiskSnapshotWal(vfs, filename, &bufs[1]);
return rv;
}
int dqlite_vfs_num_pages(sqlite3_vfs *vfs, const char *filename, unsigned *n)
{
return VfsDatabaseNumPages(vfs, filename, n);
}
int dqlite_vfs_shallow_snapshot(sqlite3_vfs *vfs,
const char *filename,
struct dqlite_buffer bufs[],
unsigned n)
{
return VfsShallowSnapshot(vfs, filename, bufs, n);
}
int dqlite_vfs_restore(sqlite3_vfs *vfs,
const char *filename,
const void *data,
size_t n)
{
return VfsRestore(vfs, filename, data, n);
}
int dqlite_vfs_restore_disk(sqlite3_vfs *vfs,
const char *filename,
const void *data,
size_t main_size,
size_t wal_size)
{
return VfsDiskRestore(vfs, filename, data, main_size, wal_size);
}
dqlite-1.16.7/src/error.c 0000664 0000000 0000000 00000006570 14652527134 0015175 0 ustar 00root root 0000000 0000000 #include
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <uv.h>
#include "../include/dqlite.h"
#include "./lib/assert.h"
#include "error.h"
/* Fallback message returned when failing to allocate the error message
* itself. */
static char *dqlite__error_oom_msg =
"error message unavailable (out of memory)";
void dqlite__error_init(dqlite__error *e)
{
*e = NULL;
}
void dqlite__error_close(dqlite__error *e)
{
if (*e != NULL && *e != dqlite__error_oom_msg) {
sqlite3_free(*e);
}
}
/* Set an error message by rendering the given format against the given
* parameters.
*
* Any previously set error message will be cleared. */
static void dqlite__error_vprintf(dqlite__error *e,
const char *fmt,
va_list args)
{
assert(fmt != NULL);
/* If a previous error was set (other than the hard-coded OOM
* fallback), let's free it. */
if (*e != NULL && *e != dqlite__error_oom_msg) {
sqlite3_free(*e);
}
/* Render the message. In case of error we fall back to the hard-coded
* OOM message. */
*e = sqlite3_vmprintf(fmt, args);
if (*e == NULL) {
*e = dqlite__error_oom_msg;
}
}
void dqlite__error_printf(dqlite__error *e, const char *fmt, ...)
{
va_list args;
va_start(args, fmt);
dqlite__error_vprintf(e, fmt, args);
va_end(args);
}
static void dqlite__error_vwrapf(dqlite__error *e,
const char *cause,
const char *fmt,
va_list args)
{
dqlite__error tmp;
char *msg;
/* First, print the format and arguments, using a temporary error. */
dqlite__error_init(&tmp);
dqlite__error_vprintf(&tmp, fmt, args);
if (cause == NULL) {
/* Special case the cause error being empty. */
dqlite__error_printf(e, "%s: (null)", tmp);
} else if (cause == *e) {
/* When the error is wrapping itself, we need to make a copy */
dqlite__error_copy(e, &msg);
dqlite__error_printf(e, "%s: %s", tmp, msg);
sqlite3_free(msg);
} else {
dqlite__error_printf(e, "%s: %s", tmp, cause);
}
dqlite__error_close(&tmp);
}
void dqlite__error_wrapf(dqlite__error *e,
const dqlite__error *cause,
const char *fmt,
...)
{
va_list args;
va_start(args, fmt);
dqlite__error_vwrapf(e, (const char *)(*cause), fmt, args);
va_end(args);
}
void dqlite__error_oom(dqlite__error *e, const char *msg, ...)
{
va_list args;
va_start(args, msg);
dqlite__error_vwrapf(e, "out of memory", msg, args);
va_end(args);
}
void dqlite__error_sys(dqlite__error *e, const char *msg)
{
dqlite__error_printf(e, "%s: %s", msg, strerror(errno));
}
void dqlite__error_uv(dqlite__error *e, int err, const char *msg)
{
dqlite__error_printf(e, "%s: %s (%s)", msg, uv_strerror(err),
uv_err_name(err));
}
int dqlite__error_copy(dqlite__error *e, char **msg)
{
char *copy;
size_t len;
assert(e != NULL);
assert(msg != NULL);
/* Trying to copy an empty error message is an error. */
if (*e == NULL) {
*msg = NULL;
return DQLITE_ERROR;
}
len = strlen(*e) + 1;
copy = sqlite3_malloc((int)(len * sizeof *copy));
if (copy == NULL) {
*msg = NULL;
return DQLITE_NOMEM;
}
memcpy(copy, *e, len);
*msg = copy;
return 0;
}
int dqlite__error_is_null(dqlite__error *e)
{
return *e == NULL;
}
int dqlite__error_is_disconnect(dqlite__error *e)
{
if (*e == NULL)
return 0;
if (strstr(*e, uv_err_name(UV_EOF)) != NULL)
return 1;
if (strstr(*e, uv_err_name(UV_ECONNRESET)) != NULL)
return 1;
return 0;
}
dqlite-1.16.7/src/error.h 0000664 0000000 0000000 00000002446 14652527134 0015200 0 ustar 00root root 0000000 0000000 #ifndef DQLITE_ERROR_H
#define DQLITE_ERROR_H
#include <stdarg.h>
#include <stdlib.h>
/* A message describing the last error that occurred on an object */
typedef char *dqlite__error;
/* Initialize the error with an empty message */
void dqlite__error_init(dqlite__error *e);
/* Release the memory of the error message, if any is set */
void dqlite__error_close(dqlite__error *e);
/* Set the error message */
void dqlite__error_printf(dqlite__error *e, const char *fmt, ...);
/* Wrap an error with an additional message */
void dqlite__error_wrapf(dqlite__error *e,
const dqlite__error *cause,
const char *fmt,
...);
/* Out of memory error */
void dqlite__error_oom(dqlite__error *e, const char *msg, ...);
/* Wrap a system error */
void dqlite__error_sys(dqlite__error *e, const char *msg);
/* Wrap an error from libuv */
void dqlite__error_uv(dqlite__error *e, int err, const char *msg);
/* Copy the underlying error message.
*
* Client code is responsible for invoking sqlite3_free to deallocate the
* returned string.
*/
int dqlite__error_copy(dqlite__error *e, char **msg);
/* Whether the error is not set */
int dqlite__error_is_null(dqlite__error *e);
/* Whether the error is due to client disconnection */
int dqlite__error_is_disconnect(dqlite__error *e);
#endif /* DQLITE_ERROR_H */
dqlite-1.16.7/src/format.c 0000664 0000000 0000000 00000006753 14652527134 0015337 0 ustar 00root root 0000000 0000000 #include
#include <stdbool.h>
#include <stdint.h>
#include <sqlite3.h>
#include "./lib/assert.h"
#include "format.h"
/* tinycc doesn't have this builtin, nor the warning that it's meant to silence.
*/
#ifdef __TINYC__
#define __builtin_assume_aligned(x, y) x
#endif
/* WAL magic value. Either this value, or the same value with the least
* significant bit also set (FORMAT__WAL_MAGIC | 0x00000001) is stored in 32-bit
* big-endian format in the first 4 bytes of a WAL file.
*
* If the LSB is set, then the checksums for each frame within the WAL file are
* calculated by treating all data as an array of 32-bit big-endian
* words. Otherwise, they are calculated by interpreting all data as 32-bit
* little-endian words. */
#define FORMAT__WAL_MAGIC 0x377f0682
#define FORMAT__WAL_MAX_VERSION 3007000
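/* Decode a 32-bit number from big endian format */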
static void formatGet32(const uint8_t buf[4], uint32_t *v)
{
*v = 0;
*v += (uint32_t)buf[0] << 24;
*v += (uint32_t)buf[1] << 16;
*v += (uint32_t)buf[2] << 8;
*v += (uint32_t)buf[3];
}
/* Encode a 32-bit number to big endian format */
static void formatPut32(uint32_t v, uint8_t *buf)
{
buf[0] = (uint8_t)(v >> 24);
buf[1] = (uint8_t)(v >> 16);
buf[2] = (uint8_t)(v >> 8);
buf[3] = (uint8_t)v;
}
/*
* Generate or extend an 8 byte checksum based on the data in array data[] and
* the initial values of in[0] and in[1] (or initial values of 0 and 0 if
* in==NULL).
*
* The checksum is written back into out[] before returning.
*
* n must be a positive multiple of 8. */
static void formatWalChecksumBytes(
bool native, /* True for native byte-order, false for non-native */
uint8_t *data, /* Content to be checksummed */
unsigned n, /* Bytes of content in data[]. Must be a multiple of 8. */
const uint32_t in[2], /* Initial checksum value input */
uint32_t out[2] /* OUT: Final checksum value output */
)
{
uint32_t s1, s2;
/* `data` is an alias for the `hdr` member of a `struct vfsWal`. `hdr`
* is the first member of this struct. Because `struct vfsWal` contains
* pointer members, the struct itself will have the alignment of the
* pointer members. As `hdr` is the first member, it will have this
* alignment too. Therefore it is safe to assume pointer alignment (and
* silence the compiler). more info ->
* http://www.catb.org/esr/structure-packing/ */
uint32_t *cur =
(uint32_t *)__builtin_assume_aligned(data, sizeof(void *));
uint32_t *end =
(uint32_t *)__builtin_assume_aligned(&data[n], sizeof(void *));
if (in) {
s1 = in[0];
s2 = in[1];
} else {
s1 = s2 = 0;
}
assert(n >= 8);
assert((n & 0x00000007) == 0);
assert(n <= 65536);
if (native) {
do {
s1 += *cur++ + s2;
s2 += *cur++ + s1;
} while (cur < end);
} else {
do {
uint32_t d;
formatPut32(cur[0], (uint8_t *)&d);
s1 += d + s2;
formatPut32(cur[1], (uint8_t *)&d);
s2 += d + s1;
cur += 2;
} while (cur < end);
}
out[0] = s1;
out[1] = s2;
}
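/* Rationale (summarizing SQLite's WAL scheme): after a checkpoint truncates
* the WAL, the header must be "restarted" so that any stale frames left on
* disk can never be mistaken for valid ones. Bumping the checkpoint sequence
* and the salts invalidates the old frame checksums, and the header checksum
* is recomputed to match. */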
void formatWalRestartHeader(uint8_t *header)
{
uint32_t checksum[2] = {0, 0};
uint32_t checkpoint;
uint32_t salt1;
/* Increase the checkpoint sequence. */
formatGet32(&header[12], &checkpoint);
checkpoint++;
formatPut32(checkpoint, &header[12]);
/* Increase salt1. */
formatGet32(&header[16], &salt1);
salt1++;
formatPut32(salt1, &header[16]);
/* Generate a random salt2. */
sqlite3_randomness(4, &header[20]);
/* Update the checksum. */
formatWalChecksumBytes(true, header, 24, checksum, checksum);
formatPut32(checksum[0], header + 24);
formatPut32(checksum[1], header + 28);
}
dqlite-1.16.7/src/format.h 0000664 0000000 0000000 00000002436 14652527134 0015336 0 ustar 00root root 0000000 0000000 /* Utilities around SQLite file formats.
*
* See https://sqlite.org/fileformat.html. */
#ifndef FORMAT_H_
#define FORMAT_H_
#include <stddef.h>
#include <stdint.h>
/* Minimum and maximum page size. */
#define FORMAT__PAGE_SIZE_MIN 512
#define FORMAT__PAGE_SIZE_MAX 65536
/* Database header size. */
#define FORMAT__DB_HDR_SIZE 100
/* Write ahead log header size. */
#define FORMAT__WAL_HDR_SIZE 32
/* Write ahead log frame header size. */
#define FORMAT__WAL_FRAME_HDR_SIZE 24
/* Number of reader marks in the wal index header. */
#define FORMAT__WAL_NREADER 5
/* Given the page size, calculate the size of a full WAL frame (frame header
* plus page data). */
#define formatWalCalcFrameSize(PAGE_SIZE) \
(FORMAT__WAL_FRAME_HDR_SIZE + PAGE_SIZE)
/* Given the page size and the WAL file size, calculate the number of frames it
* has. */
#define formatWalCalcFramesNumber(PAGE_SIZE, SIZE) \
((SIZE - FORMAT__WAL_HDR_SIZE) / formatWalCalcFrameSize(PAGE_SIZE))
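/* For example, with 4096-byte pages a full frame occupies 24 + 4096 = 4120
* bytes, so a WAL file of 32 + 2 * 4120 = 8272 bytes holds exactly 2
* frames. */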
/* Given the page size, calculate the 1-based index of the WAL frame starting
* at the given offset. */
#define formatWalCalcFrameIndex(PAGE_SIZE, OFFSET) \
(formatWalCalcFramesNumber(PAGE_SIZE, OFFSET) + 1)
/* Restart the header of a WAL file after a checkpoint. */
void formatWalRestartHeader(uint8_t *header);
#endif /* FORMAT_H_ */
dqlite-1.16.7/src/fsm.c 0000664 0000000 0000000 00000063643 14652527134 0014635 0 ustar 00root root 0000000 0000000 #include "lib/assert.h"
#include "lib/serialize.h"
#include "command.h"
#include "fsm.h"
#include "raft.h"
#include "tracing.h"
#include "vfs.h"
#include <string.h>
struct fsm
{
struct logger *logger;
struct registry *registry;
struct
{
unsigned n_pages;
unsigned long *page_numbers;
uint8_t *pages;
} pending; /* For upgrades from V1 */
};
static int apply_open(struct fsm *f, const struct command_open *c)
{
tracef("fsm apply open");
(void)f;
(void)c;
return 0;
}
static int add_pending_pages(struct fsm *f,
unsigned long *page_numbers,
uint8_t *pages,
unsigned n_pages,
unsigned page_size)
{
unsigned n = f->pending.n_pages + n_pages;
unsigned i;
unsigned long *numbers;
uint8_t *pages_buf;
/* Use temporaries so that the old buffers are not lost (and leaked) if
* sqlite3_realloc64 fails. */
numbers = sqlite3_realloc64(f->pending.page_numbers,
n * sizeof *f->pending.page_numbers);
if (numbers == NULL) {
return DQLITE_NOMEM;
}
f->pending.page_numbers = numbers;
pages_buf = sqlite3_realloc64(f->pending.pages, n * page_size);
if (pages_buf == NULL) {
return DQLITE_NOMEM;
}
f->pending.pages = pages_buf;
for (i = 0; i < n_pages; i++) {
unsigned j = f->pending.n_pages + i;
f->pending.page_numbers[j] = page_numbers[i];
memcpy(f->pending.pages + j * page_size,
(uint8_t *)pages + i * page_size, page_size);
}
f->pending.n_pages = n;
return 0;
}
static int databaseReadLock(struct db *db)
{
if (!db->read_lock) {
db->read_lock = 1;
return 0;
} else {
return -1;
}
}
static int databaseReadUnlock(struct db *db)
{
if (db->read_lock) {
db->read_lock = 0;
return 0;
} else {
return -1;
}
}
static void maybeCheckpoint(struct db *db)
{
tracef("maybe checkpoint");
struct sqlite3_file *main_f;
struct sqlite3_file *wal;
volatile void *region;
sqlite3_int64 size;
unsigned page_size;
unsigned pages;
int wal_size;
int ckpt;
int i;
int rv;
/* Don't run when a snapshot is busy. Running a checkpoint while a
* snapshot is busy will result in illegal memory accesses by the
* routines that try to access database page pointers contained in the
* snapshot. */
rv = databaseReadLock(db);
if (rv != 0) {
tracef("busy snapshot %d", rv);
return;
}
assert(db->follower == NULL);
rv = db__open_follower(db);
if (rv != 0) {
tracef("open follower failed %d", rv);
goto err_after_db_lock;
}
page_size = db->config->page_size;
/* Get the database wal file associated with this connection */
rv = sqlite3_file_control(db->follower, "main",
SQLITE_FCNTL_JOURNAL_POINTER, &wal);
assert(rv == SQLITE_OK); /* Should never fail */
rv = wal->pMethods->xFileSize(wal, &size);
assert(rv == SQLITE_OK); /* Should never fail */
/* Calculate the number of frames: the WAL is a 32-byte header followed
* by frames of a 24-byte header plus one page each. */
pages = (unsigned)((size - 32) / (24 + page_size));
/* Check if the size of the WAL is beyond the threshold. */
if (pages < db->config->checkpoint_threshold) {
tracef("wal size (%u) < threshold (%u)", pages,
db->config->checkpoint_threshold);
goto err_after_db_open;
}
/* Get the database file associated with this db->follower connection */
rv = sqlite3_file_control(db->follower, "main",
SQLITE_FCNTL_FILE_POINTER, &main_f);
assert(rv == SQLITE_OK); /* Should never fail */
/* Get the first SHM region, which contains the WAL header. */
rv = main_f->pMethods->xShmMap(main_f, 0, 0, 0, ®ion);
assert(rv == SQLITE_OK); /* Should never fail */
rv = main_f->pMethods->xShmUnmap(main_f, 0);
assert(rv == SQLITE_OK); /* Should never fail */
/* Try to acquire all locks. */
for (i = 0; i < SQLITE_SHM_NLOCK; i++) {
int flags = SQLITE_SHM_LOCK | SQLITE_SHM_EXCLUSIVE;
rv = main_f->pMethods->xShmLock(main_f, i, 1, flags);
if (rv == SQLITE_BUSY) {
tracef("busy reader or writer - retry next time");
goto err_after_db_open;
}
/* Not locked. Let's release the lock we just
* acquired. */
flags = SQLITE_SHM_UNLOCK | SQLITE_SHM_EXCLUSIVE;
main_f->pMethods->xShmLock(main_f, i, 1, flags);
}
rv = sqlite3_wal_checkpoint_v2(
db->follower, "main", SQLITE_CHECKPOINT_TRUNCATE, &wal_size, &ckpt);
/* TODO assert(rv == 0) here? Which failure modes do we expect? */
if (rv != 0) {
tracef("sqlite3_wal_checkpoint_v2 failed %d", rv);
goto err_after_db_open;
}
tracef("sqlite3_wal_checkpoint_v2 success");
/* Since no reader transaction is in progress, we must be able to
* checkpoint the entire WAL */
assert(wal_size == 0);
assert(ckpt == 0);
err_after_db_open:
sqlite3_close(db->follower);
db->follower = NULL;
err_after_db_lock:
rv = databaseReadUnlock(db);
assert(rv == 0);
}
static int apply_frames(struct fsm *f, const struct command_frames *c)
{
tracef("fsm apply frames");
struct db *db;
sqlite3_vfs *vfs;
unsigned long *page_numbers = NULL;
void *pages;
int exists;
int rv;
rv = registry__db_get(f->registry, c->filename, &db);
if (rv != 0) {
tracef("db get failed %d", rv);
return rv;
}
vfs = sqlite3_vfs_find(db->config->name);
/* Check if the database file exists, and create it by opening a
* connection if it doesn't. */
rv = vfs->xAccess(vfs, db->path, 0, &exists);
assert(rv == 0);
if (!exists) {
rv = db__open_follower(db);
if (rv != 0) {
tracef("open follower failed %d", rv);
return rv;
}
sqlite3_close(db->follower);
db->follower = NULL;
}
rv = command_frames__page_numbers(c, &page_numbers);
if (rv != 0) {
if (page_numbers != NULL) {
sqlite3_free(page_numbers);
}
tracef("page numbers failed %d", rv);
return rv;
}
command_frames__pages(c, &pages);
/* If the commit marker is set, we apply the changes directly to the
* VFS. Otherwise this must be an upgrade from V1, and we accumulate
* uncommitted frames in memory until the final commit or a
* rollback. */
if (c->is_commit) {
if (f->pending.n_pages > 0) {
rv = add_pending_pages(f, page_numbers, pages,
c->frames.n_pages,
db->config->page_size);
if (rv != 0) {
tracef("malloc");
sqlite3_free(page_numbers);
return DQLITE_NOMEM;
}
rv =
VfsApply(vfs, db->path, f->pending.n_pages,
f->pending.page_numbers, f->pending.pages);
if (rv != 0) {
tracef("VfsApply failed %d", rv);
sqlite3_free(page_numbers);
return rv;
}
sqlite3_free(f->pending.page_numbers);
sqlite3_free(f->pending.pages);
f->pending.n_pages = 0;
f->pending.page_numbers = NULL;
f->pending.pages = NULL;
} else {
rv = VfsApply(vfs, db->path, c->frames.n_pages,
page_numbers, pages);
if (rv != 0) {
tracef("VfsApply failed %d", rv);
sqlite3_free(page_numbers);
return rv;
}
}
} else {
rv =
add_pending_pages(f, page_numbers, pages, c->frames.n_pages,
db->config->page_size);
if (rv != 0) {
tracef("add pending pages failed %d", rv);
sqlite3_free(page_numbers);
return DQLITE_NOMEM;
}
}
sqlite3_free(page_numbers);
maybeCheckpoint(db);
return 0;
}
static int apply_undo(struct fsm *f, const struct command_undo *c)
{
tracef("apply undo %" PRIu64, c->tx_id);
(void)c;
if (f->pending.n_pages == 0) {
return 0;
}
sqlite3_free(f->pending.page_numbers);
sqlite3_free(f->pending.pages);
f->pending.n_pages = 0;
f->pending.page_numbers = NULL;
f->pending.pages = NULL;
return 0;
}
/* Checkpoints used to be coordinated cluster-wide; these days each node
 * checkpoints independently in `apply_frames`, so the checkpoint command is a
 * no-op for modern nodes. */
static int apply_checkpoint(struct fsm *f, const struct command_checkpoint *c)
{
(void)f;
(void)c;
tracef("apply no-op checkpoint");
return 0;
}
static int fsm__apply(struct raft_fsm *fsm,
const struct raft_buffer *buf,
void **result)
{
tracef("fsm apply");
struct fsm *f = fsm->data;
int type;
void *command;
int rc;
rc = command__decode(buf, &type, &command);
if (rc != 0) {
tracef("fsm: decode command: %d", rc);
goto err;
}
switch (type) {
case COMMAND_OPEN:
rc = apply_open(f, command);
break;
case COMMAND_FRAMES:
rc = apply_frames(f, command);
break;
case COMMAND_UNDO:
rc = apply_undo(f, command);
break;
case COMMAND_CHECKPOINT:
rc = apply_checkpoint(f, command);
break;
default:
rc = RAFT_MALFORMED;
break;
}
raft_free(command);
err:
*result = NULL;
return rc;
}
#define SNAPSHOT_FORMAT 1
#define SNAPSHOT_HEADER(X, ...) \
X(uint64, format, ##__VA_ARGS__) \
X(uint64, n, ##__VA_ARGS__)
SERIALIZE__DEFINE(snapshotHeader, SNAPSHOT_HEADER);
SERIALIZE__IMPLEMENT(snapshotHeader, SNAPSHOT_HEADER);
#define SNAPSHOT_DATABASE(X, ...) \
X(text, filename, ##__VA_ARGS__) \
X(uint64, main_size, ##__VA_ARGS__) \
X(uint64, wal_size, ##__VA_ARGS__)
SERIALIZE__DEFINE(snapshotDatabase, SNAPSHOT_DATABASE);
SERIALIZE__IMPLEMENT(snapshotDatabase, SNAPSHOT_DATABASE);
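/* Each X-macro above expands, roughly, to a struct plus sizeof/encode/decode
 * helpers (a sketch of the generated code; the real definitions live in
 * lib/serialize.h):
 *
 *   struct snapshotHeader {
 *           uint64_t format;
 *           uint64_t n;
 *   };
 *   size_t snapshotHeader__sizeof(const struct snapshotHeader *h);
 *   void snapshotHeader__encode(const struct snapshotHeader *h, char **cursor);
 *   int snapshotHeader__decode(struct cursor *cursor, struct snapshotHeader *h);
 */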
/* Encode the global snapshot header. */
static int encodeSnapshotHeader(unsigned n, struct raft_buffer *buf)
{
struct snapshotHeader header;
char *cursor;
header.format = SNAPSHOT_FORMAT;
header.n = n;
buf->len = snapshotHeader__sizeof(&header);
buf->base = sqlite3_malloc64(buf->len);
if (buf->base == NULL) {
return RAFT_NOMEM;
}
cursor = buf->base;
snapshotHeader__encode(&header, &cursor);
return 0;
}
/* Encode the given database. */
static int encodeDatabase(struct db *db,
struct raft_buffer r_bufs[],
uint32_t n)
{
struct snapshotDatabase header;
sqlite3_vfs *vfs;
uint32_t database_size = 0;
uint8_t *page;
char *cursor;
struct dqlite_buffer *bufs = (struct dqlite_buffer *)r_bufs;
int rv;
header.filename = db->filename;
vfs = sqlite3_vfs_find(db->config->name);
rv = VfsShallowSnapshot(vfs, db->filename, &bufs[1], n - 1);
if (rv != 0) {
goto err;
}
/* Extract the database size from the first page. */
page = bufs[1].base;
database_size += (uint32_t)(page[28] << 24);
database_size += (uint32_t)(page[29] << 16);
database_size += (uint32_t)(page[30] << 8);
database_size += (uint32_t)(page[31]);
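/* Offsets 28-31 of the SQLite database header hold the page count as a
 * big-endian 32-bit integer; the shifts above reassemble it in host
 * byte order. */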
header.main_size =
(uint64_t)database_size * (uint64_t)db->config->page_size;
header.wal_size = bufs[n - 1].len;
/* Database header. */
bufs[0].len = snapshotDatabase__sizeof(&header);
bufs[0].base = sqlite3_malloc64(bufs[0].len);
if (bufs[0].base == NULL) {
rv = RAFT_NOMEM;
goto err_after_snapshot;
}
cursor = bufs[0].base;
snapshotDatabase__encode(&header, &cursor);
return 0;
err_after_snapshot:
/* Free the wal buffer */
sqlite3_free(bufs[n - 1].base);
err:
assert(rv != 0);
return rv;
}
/* Decode the database contained in a snapshot. */
static int decodeDatabase(struct fsm *f, struct cursor *cursor)
{
struct snapshotDatabase header;
struct db *db;
sqlite3_vfs *vfs;
size_t n;
int exists;
int rv;
rv = snapshotDatabase__decode(cursor, &header);
if (rv != 0) {
return rv;
}
rv = registry__db_get(f->registry, header.filename, &db);
if (rv != 0) {
return rv;
}
vfs = sqlite3_vfs_find(db->config->name);
/* Check if the database file exists, and create it by opening a
* connection if it doesn't. */
rv = vfs->xAccess(vfs, header.filename, 0, &exists);
assert(rv == 0);
if (!exists) {
rv = db__open_follower(db);
if (rv != 0) {
return rv;
}
sqlite3_close(db->follower);
db->follower = NULL;
}
tracef("main_size:%" PRIu64 " wal_size:%" PRIu64, header.main_size,
header.wal_size);
if (header.wal_size > SIZE_MAX ||
    header.main_size > SIZE_MAX - header.wal_size) {
tracef("main_size + wal_size would overflow max DB size");
return -1;
}
/* Due to the check above, this cast is safe. */
n = (size_t)(header.main_size + header.wal_size);
rv = VfsRestore(vfs, db->filename, cursor->p, n);
if (rv != 0) {
return rv;
}
cursor->p += n;
return 0;
}
static unsigned dbNumPages(struct db *db)
{
sqlite3_vfs *vfs;
int rv;
uint32_t n;
vfs = sqlite3_vfs_find(db->config->name);
rv = VfsDatabaseNumPages(vfs, db->filename, &n);
assert(rv == 0);
return n;
}
/* Determine the total number of raft buffers needed for a snapshot */
static unsigned snapshotNumBufs(struct fsm *f)
{
struct db *db;
queue *head;
unsigned n = 1; /* snapshot header */
QUEUE_FOREACH(head, &f->registry->dbs)
{
n += 2; /* database header & wal */
db = QUEUE_DATA(head, struct db, queue);
n += dbNumPages(db); /* 1 buffer per page (zero copy) */
}
return n;
}
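/* For example, with two databases of 3 and 2 pages this sums to
 * 1 + (2 + 3) + (2 + 2) = 10 buffers, matching the layout sketched below. */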
/* An example array of snapshot buffers looks like this:
*
* bufs: SH DH1 P1 P2 P3 WAL1 DH2 P1 P2 WAL2
* index: 0 1 2 3 4 5 6 7 8 9
*
* SH: Snapshot Header
* DHx: Database Header
* Px: Database Page (not to be freed)
* WALx: a WAL
* */
static void freeSnapshotBufs(struct fsm *f,
struct raft_buffer bufs[],
unsigned n_bufs)
{
queue *head;
struct db *db;
unsigned i;
if (bufs == NULL || n_bufs == 0) {
return;
}
/* Free snapshot header */
sqlite3_free(bufs[0].base);
i = 1;
/* Free all database headers & WAL buffers */
QUEUE_FOREACH(head, &f->registry->dbs)
{
if (i == n_bufs) {
break;
}
db = QUEUE_DATA(head, struct db, queue);
/* i is the index of the database header */
sqlite3_free(bufs[i].base);
/* i is now the index of the next database header (if any) */
i += 1 /* db header */ + dbNumPages(db) + 1 /* WAL */;
/* free WAL buffer */
sqlite3_free(bufs[i - 1].base);
}
}
static int fsm__snapshot(struct raft_fsm *fsm,
struct raft_buffer *bufs[],
unsigned *n_bufs)
{
struct fsm *f = fsm->data;
queue *head;
struct db *db;
unsigned n_db = 0;
unsigned i;
int rv;
/* First count how many databases we have and check that no transaction
* nor checkpoint nor other snapshot is in progress. */
QUEUE_FOREACH(head, &f->registry->dbs)
{
db = QUEUE_DATA(head, struct db, queue);
if (db->tx_id != 0 || db->read_lock) {
return RAFT_BUSY;
}
n_db++;
}
/* Lock all databases, preventing the checkpoint from running */
QUEUE_FOREACH(head, &f->registry->dbs)
{
db = QUEUE_DATA(head, struct db, queue);
rv = databaseReadLock(db);
assert(rv == 0);
}
*n_bufs = snapshotNumBufs(f);
*bufs = sqlite3_malloc64(*n_bufs * sizeof **bufs);
if (*bufs == NULL) {
rv = RAFT_NOMEM;
goto err;
}
rv = encodeSnapshotHeader(n_db, &(*bufs)[0]);
if (rv != 0) {
goto err_after_bufs_alloc;
}
/* Encode individual databases. */
i = 1;
QUEUE_FOREACH(head, &f->registry->dbs)
{
db = QUEUE_DATA(head, struct db, queue);
/* database_header + num_pages + wal */
unsigned n = 1 + dbNumPages(db) + 1;
rv = encodeDatabase(db, &(*bufs)[i], n);
if (rv != 0) {
goto err_after_encode_header;
}
i += n;
}
assert(i == *n_bufs);
return 0;
err_after_encode_header:
freeSnapshotBufs(f, *bufs, i);
err_after_bufs_alloc:
sqlite3_free(*bufs);
err:
QUEUE_FOREACH(head, &f->registry->dbs)
{
db = QUEUE_DATA(head, struct db, queue);
databaseReadUnlock(db);
}
assert(rv != 0);
return rv;
}
static int fsm__snapshot_finalize(struct raft_fsm *fsm,
struct raft_buffer *bufs[],
unsigned *n_bufs)
{
struct fsm *f = fsm->data;
queue *head;
struct db *db;
unsigned n_db;
struct snapshotHeader header;
int rv;
if (bufs == NULL) {
return 0;
}
/* Decode the header to determine the number of databases. */
struct cursor cursor = {(*bufs)[0].base, (*bufs)[0].len};
rv = snapshotHeader__decode(&cursor, &header);
if (rv != 0) {
tracef("decode failed %d", rv);
return -1;
}
if (header.format != SNAPSHOT_FORMAT) {
tracef("bad format");
return -1;
}
/* Free allocated buffers */
freeSnapshotBufs(f, *bufs, *n_bufs);
sqlite3_free(*bufs);
*bufs = NULL;
*n_bufs = 0;
/* Unlock all databases that were locked for the snapshot. This is safe
 * because DBs are only ever added at the back of the queue. */
n_db = 0;
QUEUE_FOREACH(head, &f->registry->dbs)
{
if (n_db == header.n) {
break;
}
db = QUEUE_DATA(head, struct db, queue);
rv = databaseReadUnlock(db);
assert(rv == 0);
n_db++;
}
return 0;
}
static int fsm__restore(struct raft_fsm *fsm, struct raft_buffer *buf)
{
tracef("fsm restore");
struct fsm *f = fsm->data;
struct cursor cursor = {buf->base, buf->len};
struct snapshotHeader header;
unsigned i;
int rv;
rv = snapshotHeader__decode(&cursor, &header);
if (rv != 0) {
tracef("decode failed %d", rv);
return rv;
}
if (header.format != SNAPSHOT_FORMAT) {
tracef("bad format");
return RAFT_MALFORMED;
}
for (i = 0; i < header.n; i++) {
rv = decodeDatabase(f, &cursor);
if (rv != 0) {
tracef("decode failed");
return rv;
}
}
/* Don't use sqlite3_free as this buffer is allocated by raft. */
raft_free(buf->base);
return 0;
}
int fsm__init(struct raft_fsm *fsm,
struct config *config,
struct registry *registry)
{
tracef("fsm init");
struct fsm *f = raft_malloc(sizeof *f);
if (f == NULL) {
return DQLITE_NOMEM;
}
f->logger = &config->logger;
f->registry = registry;
f->pending.n_pages = 0;
f->pending.page_numbers = NULL;
f->pending.pages = NULL;
fsm->version = 2;
fsm->data = f;
fsm->apply = fsm__apply;
fsm->snapshot = fsm__snapshot;
fsm->snapshot_finalize = fsm__snapshot_finalize;
fsm->restore = fsm__restore;
return 0;
}
void fsm__close(struct raft_fsm *fsm)
{
tracef("fsm close");
struct fsm *f = fsm->data;
raft_free(f);
}
/******************************************************************************
Disk-based FSM
*****************************************************************************/
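/* In disk mode a snapshot is taken in two phases: fsm__snapshot_disk performs
 * the synchronous part (locking every database and copying its WAL), while
 * fsm__snapshot_async_disk performs the part that raft is allowed to run
 * asynchronously (snapshotting the database files themselves). */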
/* The synchronous part of the database encoding */
static int encodeDiskDatabaseSync(struct db *db, struct raft_buffer *r_buf)
{
sqlite3_vfs *vfs;
struct dqlite_buffer *buf = (struct dqlite_buffer *)r_buf;
int rv;
vfs = sqlite3_vfs_find(db->config->name);
rv = VfsDiskSnapshotWal(vfs, db->path, buf);
if (rv != 0) {
goto err;
}
return 0;
err:
assert(rv != 0);
return rv;
}
/* The asynchronous part of the database encoding */
static int encodeDiskDatabaseAsync(struct db *db,
struct raft_buffer r_bufs[],
uint32_t n)
{
struct snapshotDatabase header;
sqlite3_vfs *vfs;
char *cursor;
struct dqlite_buffer *bufs = (struct dqlite_buffer *)r_bufs;
int rv;
assert(n == 3);
vfs = sqlite3_vfs_find(db->config->name);
rv = VfsDiskSnapshotDb(vfs, db->path, &bufs[1]);
if (rv != 0) {
goto err;
}
/* Database header. */
header.filename = db->filename;
header.main_size = bufs[1].len;
header.wal_size = bufs[2].len;
bufs[0].len = snapshotDatabase__sizeof(&header);
bufs[0].base = sqlite3_malloc64(bufs[0].len);
if (bufs[0].base == NULL) {
rv = RAFT_NOMEM;
goto err;
}
cursor = bufs[0].base;
snapshotDatabase__encode(&header, &cursor);
return 0;
/* Cleanup is performed by a call to snapshot_finalize. */
err:
assert(rv != 0);
return rv;
}
/* Determine the total number of raft buffers needed
* for a snapshot in disk-mode */
static unsigned snapshotNumBufsDisk(struct fsm *f)
{
queue *head;
unsigned n = 1; /* snapshot header */
QUEUE_FOREACH(head, &f->registry->dbs)
{
n += 3; /* database header, database file and wal */
}
return n;
}
/* An example array of snapshot buffers looks like this:
*
* bufs: SH DH1 DBMMAP1 WAL1 DH2 DMMAP2 WAL2
* index: 0 1 2 3 4 5 6
*
* SH: Snapshot Header
* DHx: Database Header
* DBMMAP: Pointer to mmap'ed database file
* WALx: a WAL
* */
static void freeSnapshotBufsDisk(struct fsm *f,
struct raft_buffer bufs[],
unsigned n_bufs)
{
queue *head;
unsigned i;
if (bufs == NULL || n_bufs == 0) {
return;
}
/* Free snapshot header */
sqlite3_free(bufs[0].base);
i = 1;
/* Free all database headers & WAL buffers. Unmap the DB file. */
QUEUE_FOREACH(head, &f->registry->dbs)
{
if (i == n_bufs) {
break;
}
/* i is the index of the database header */
sqlite3_free(bufs[i].base);
if (bufs[i + 1].base != NULL) {
munmap(bufs[i + 1].base, bufs[i + 1].len);
}
sqlite3_free(bufs[i + 2].base);
/* i is now the index of the next database header (if any) */
i += 3;
}
}
static int fsm__snapshot_disk(struct raft_fsm *fsm,
struct raft_buffer *bufs[],
unsigned *n_bufs)
{
struct fsm *f = fsm->data;
queue *head;
struct db *db = NULL;
unsigned n_db = 0;
unsigned i;
int rv;
/* First count how many databases we have and check that no transaction
* nor checkpoint nor other snapshot is in progress. */
QUEUE_FOREACH(head, &f->registry->dbs)
{
db = QUEUE_DATA(head, struct db, queue);
if (db->tx_id != 0 || db->read_lock) {
return RAFT_BUSY;
}
n_db++;
}
/* Lock all databases, preventing the checkpoint from running. This
* ensures the database is not written while it is mmap'ed and copied by
* raft. */
QUEUE_FOREACH(head, &f->registry->dbs)
{
db = QUEUE_DATA(head, struct db, queue);
rv = databaseReadLock(db);
assert(rv == 0);
}
*n_bufs = snapshotNumBufsDisk(f);
*bufs = sqlite3_malloc64(*n_bufs * sizeof **bufs);
if (*bufs == NULL) {
rv = RAFT_NOMEM;
goto err;
}
/* zero-init buffers, helps with cleanup */
for (unsigned j = 0; j < *n_bufs; j++) {
(*bufs)[j].base = NULL;
(*bufs)[j].len = 0;
}
rv = encodeSnapshotHeader(n_db, &(*bufs)[0]);
if (rv != 0) {
goto err_after_bufs_alloc;
}
/* Copy WAL of all databases. */
i = 1;
QUEUE_FOREACH(head, &f->registry->dbs)
{
db = QUEUE_DATA(head, struct db, queue);
/* database_header + db + WAL */
unsigned n = 3;
/* pass pointer to buffer that will contain WAL. */
rv = encodeDiskDatabaseSync(db, &(*bufs)[i + n - 1]);
if (rv != 0) {
goto err_after_encode_sync;
}
i += n;
}
assert(i == *n_bufs);
return 0;
err_after_encode_sync:
freeSnapshotBufsDisk(f, *bufs, i);
err_after_bufs_alloc:
sqlite3_free(*bufs);
err:
QUEUE_FOREACH(head, &f->registry->dbs)
{
db = QUEUE_DATA(head, struct db, queue);
databaseReadUnlock(db);
}
assert(rv != 0);
return rv;
}
static int fsm__snapshot_async_disk(struct raft_fsm *fsm,
struct raft_buffer *bufs[],
unsigned *n_bufs)
{
struct fsm *f = fsm->data;
queue *head;
struct snapshotHeader header;
struct db *db = NULL;
unsigned i;
int rv;
/* Decode the header to determine the number of databases. */
struct cursor cursor = {(*bufs)[0].base, (*bufs)[0].len};
rv = snapshotHeader__decode(&cursor, &header);
if (rv != 0) {
tracef("decode failed %d", rv);
return -1;
}
if (header.format != SNAPSHOT_FORMAT) {
tracef("bad format");
return -1;
}
/* Encode individual databases. */
i = 1;
QUEUE_FOREACH(head, &f->registry->dbs)
{
if (i == *n_bufs) {
/* In case a db was added in the meantime */
break;
}
db = QUEUE_DATA(head, struct db, queue);
/* database_header + database file + wal */
unsigned n = 3;
rv = encodeDiskDatabaseAsync(db, &(*bufs)[i], n);
if (rv != 0) {
goto err;
}
i += n;
}
return 0;
err:
assert(rv != 0);
return rv;
}
static int fsm__snapshot_finalize_disk(struct raft_fsm *fsm,
struct raft_buffer *bufs[],
unsigned *n_bufs)
{
struct fsm *f = fsm->data;
queue *head;
struct db *db;
unsigned n_db;
struct snapshotHeader header;
int rv;
if (bufs == NULL) {
return 0;
}
/* Decode the header to determine the number of databases. */
struct cursor cursor = {(*bufs)[0].base, (*bufs)[0].len};
rv = snapshotHeader__decode(&cursor, &header);
if (rv != 0) {
tracef("decode failed %d", rv);
return -1;
}
if (header.format != SNAPSHOT_FORMAT) {
tracef("bad format");
return -1;
}
/* Free allocated buffers */
freeSnapshotBufsDisk(f, *bufs, *n_bufs);
sqlite3_free(*bufs);
*bufs = NULL;
*n_bufs = 0;
/* Unlock all databases that were locked for the snapshot. This is safe
 * because DBs are only ever added at the back of the queue. */
n_db = 0;
QUEUE_FOREACH(head, &f->registry->dbs)
{
if (n_db == header.n) {
break;
}
db = QUEUE_DATA(head, struct db, queue);
databaseReadUnlock(db);
n_db++;
}
return 0;
}
/* Decode the disk database contained in a snapshot. */
static int decodeDiskDatabase(struct fsm *f, struct cursor *cursor)
{
struct snapshotDatabase header;
struct db *db;
sqlite3_vfs *vfs;
int exists;
int rv;
rv = snapshotDatabase__decode(cursor, &header);
if (rv != 0) {
return rv;
}
rv = registry__db_get(f->registry, header.filename, &db);
if (rv != 0) {
return rv;
}
vfs = sqlite3_vfs_find(db->config->name);
/* Check if the database file exists, and create it by opening a
* connection if it doesn't. */
rv = vfs->xAccess(vfs, db->path, 0, &exists);
assert(rv == 0);
if (!exists) {
rv = db__open_follower(db);
if (rv != 0) {
return rv;
}
sqlite3_close(db->follower);
db->follower = NULL;
}
/* The last check can overflow, but in that case we would already be
 * lost: the raft snapshot restore API supplies a single buffer, so the
 * data has to fit in size_t bytes in the first place. */
if (header.main_size > SIZE_MAX || header.wal_size > SIZE_MAX ||
header.main_size + header.wal_size > SIZE_MAX) {
tracef("main_size:%" PRIu64 "B wal_size:%" PRIu64
"B would overflow max DB size (%zuB)",
header.main_size, header.wal_size, SIZE_MAX);
return -1;
}
/* Due to the check above, these casts are safe. */
rv = VfsDiskRestore(vfs, db->path, cursor->p, (size_t)header.main_size,
(size_t)header.wal_size);
if (rv != 0) {
tracef("VfsDiskRestore %d", rv);
return rv;
}
cursor->p += header.main_size + header.wal_size;
return 0;
}
static int fsm__restore_disk(struct raft_fsm *fsm, struct raft_buffer *buf)
{
tracef("fsm restore disk");
struct fsm *f = fsm->data;
struct cursor cursor = {buf->base, buf->len};
struct snapshotHeader header;
unsigned i;
int rv;
rv = snapshotHeader__decode(&cursor, &header);
if (rv != 0) {
tracef("decode failed %d", rv);
return rv;
}
if (header.format != SNAPSHOT_FORMAT) {
tracef("bad format");
return RAFT_MALFORMED;
}
for (i = 0; i < header.n; i++) {
rv = decodeDiskDatabase(f, &cursor);
if (rv != 0) {
tracef("decode failed");
return rv;
}
}
/* Don't use sqlite3_free as this buffer is allocated by raft. */
raft_free(buf->base);
return 0;
}
int fsm__init_disk(struct raft_fsm *fsm,
struct config *config,
struct registry *registry)
{
tracef("fsm init");
struct fsm *f = raft_malloc(sizeof *f);
if (f == NULL) {
return DQLITE_NOMEM;
}
f->logger = &config->logger;
f->registry = registry;
f->pending.n_pages = 0;
f->pending.page_numbers = NULL;
f->pending.pages = NULL;
fsm->version = 3;
fsm->data = f;
fsm->apply = fsm__apply;
fsm->snapshot = fsm__snapshot_disk;
fsm->snapshot_async = fsm__snapshot_async_disk;
fsm->snapshot_finalize = fsm__snapshot_finalize_disk;
fsm->restore = fsm__restore_disk;
return 0;
}
dqlite-1.16.7/src/fsm.h 0000664 0000000 0000000 00000001200 14652527134 0014617 0 ustar 00root root 0000000 0000000 /**
* Dqlite Raft FSM
*/
#ifndef DQLITE_FSM_H_
#define DQLITE_FSM_H_
#include "config.h"
#include "raft.h"
#include "registry.h"
/**
* Initialize the given raft FSM object with dqlite's raft-based
* implementation.
*/
int fsm__init(struct raft_fsm *fsm,
struct config *config,
struct registry *registry);
/**
* Initialize the given raft FSM object with dqlite's on-disk raft-based
* implementation.
*/
int fsm__init_disk(struct raft_fsm *fsm,
struct config *config,
struct registry *registry);
void fsm__close(struct raft_fsm *fsm);
#endif /* DQLITE_FSM_H_ */
dqlite-1.16.7/src/gateway.c 0000664 0000000 0000000 00000114121 14652527134 0015475 0 ustar 00root root 0000000 0000000 #include "gateway.h"
#include "bind.h"
#include "conn.h"
#include "id.h"
#include "lib/threadpool.h"
#include "protocol.h"
#include "query.h"
#include "request.h"
#include "response.h"
#include "server.h"
#include "tracing.h"
#include "translate.h"
#include "tuple.h"
#include "vfs.h"
void gateway__init(struct gateway *g,
struct config *config,
struct registry *registry,
struct raft *raft,
struct id_state seed)
{
tracef("gateway init");
g->config = config;
g->registry = registry;
g->raft = raft;
g->leader = NULL;
g->req = NULL;
g->exec.data = g;
stmt__registry_init(&g->stmts);
g->barrier.data = g;
g->barrier.cb = NULL;
g->barrier.leader = NULL;
g->protocol = DQLITE_PROTOCOL_VERSION;
g->client_id = 0;
g->random_state = seed;
}
void gateway__leader_close(struct gateway *g, int reason)
{
if (g == NULL || g->leader == NULL) {
tracef("gateway:%p or gateway->leader are NULL", g);
return;
}
if (g->req != NULL) {
if (g->leader->inflight != NULL) {
tracef("finish inflight apply request");
struct raft_apply *req = &g->leader->inflight->req;
req->cb(req, reason, NULL);
assert(g->req == NULL);
} else if (g->barrier.cb != NULL) {
tracef("finish inflight barrier");
/* This is not a typo, g->barrier.req.cb is a wrapper
* around g->barrier.cb and will set g->barrier.cb to
* NULL when called. */
struct raft_barrier *b = &g->barrier.req;
b->cb(b, reason);
assert(g->barrier.cb == NULL);
} else if (g->leader->exec != NULL &&
g->leader->exec->barrier.cb != NULL) {
tracef("finish inflight exec barrier");
struct raft_barrier *b = &g->leader->exec->barrier.req;
b->cb(b, reason);
assert(g->leader->exec == NULL);
} else if (g->req->type == DQLITE_REQUEST_QUERY_SQL) {
/* Finalize the statement that was in the process of
* yielding rows. We only need to handle QUERY_SQL
* because for QUERY and EXEC the statement is finalized
* by the call to stmt__registry_close, below (and for
* EXEC_SQL the lifetimes of the statements are managed
* by leader__exec and the associated callback).
*
* It's okay if g->req->stmt is NULL since
* sqlite3_finalize(NULL) is documented to be a no-op.
*/
sqlite3_finalize(g->req->stmt);
g->req = NULL;
} else if (g->req->type == DQLITE_REQUEST_QUERY) {
/* In case the statement is a prepared one, it
* will be finalized by the stmt__registry_close
* call below. Nevertheless, we must signal that
* the request is not in place anymore so that any
* callback which is already in the queue will not
* attempt to execute a finalized statement.
*/
g->req = NULL;
}
}
stmt__registry_close(&g->stmts);
leader__close(g->leader);
sqlite3_free(g->leader);
g->leader = NULL;
}
void gateway__close(struct gateway *g)
{
tracef("gateway close");
if (g->leader == NULL) {
stmt__registry_close(&g->stmts);
return;
}
gateway__leader_close(g, RAFT_SHUTDOWN);
}
/* Declare a request struct and a response struct of the appropriate types and
* decode the request. This is used in the common case where only one schema
* version is extant. */
#define START_V0(REQ, RES, ...) \
struct request_##REQ request = { 0 }; \
struct response_##RES response = { 0 }; \
{ \
int rv_; \
if (req->schema != 0) { \
tracef("bad schema version %d", req->schema); \
failure(req, DQLITE_PARSE, \
"unrecognized schema version"); \
return 0; \
} \
rv_ = request_##REQ##__decode(cursor, &request); \
if (rv_ != 0) { \
return rv_; \
} \
}
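/* For example, START_V0(client, welcome) declares a `struct request_client
 * request` and a `struct response_welcome response`, rejects any nonzero
 * schema version, and decodes the request body into `request`. */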
#define CHECK_LEADER(REQ) \
if (raft_state(g->raft) != RAFT_LEADER) { \
failure(REQ, SQLITE_IOERR_NOT_LEADER, "not leader"); \
return 0; \
}
#define SUCCESS(LOWER, UPPER, RESP, SCHEMA) \
{ \
size_t _n = response_##LOWER##__sizeof(&RESP); \
char *_cursor; \
assert(_n % 8 == 0); \
_cursor = buffer__advance(req->buffer, _n); \
/* Since responses are small and the buffer is at least 4096 \
 * bytes, this can't fail. */ \
assert(_cursor != NULL); \
response_##LOWER##__encode(&RESP, &_cursor); \
req->cb(req, 0, DQLITE_RESPONSE_##UPPER, SCHEMA); \
}
/* Encode the given success response and invoke the request callback,
* using schema version 0. */
#define SUCCESS_V0(LOWER, UPPER) SUCCESS(LOWER, UPPER, response, 0)
/* Lookup the database with the given ID.
*
* TODO: support more than one database per connection? */
#define LOOKUP_DB(ID) \
if (ID != 0 || g->leader == NULL) { \
failure(req, SQLITE_NOTFOUND, "no database opened"); \
return 0; \
}
/* Lookup the statement with the given ID. */
#define LOOKUP_STMT(ID) \
stmt = stmt__registry_get(&g->stmts, ID); \
if (stmt == NULL) { \
failure(req, SQLITE_NOTFOUND, \
"no statement with the given id"); \
return 0; \
}
#define FAIL_IF_CHECKPOINTING \
{ \
struct sqlite3_file *_file; \
int _rv; \
_rv = sqlite3_file_control(g->leader->conn, "main", \
SQLITE_FCNTL_FILE_POINTER, &_file); \
assert(_rv == SQLITE_OK); /* Should never fail */ \
\
_rv = _file->pMethods->xShmLock( \
_file, 1 /* checkpoint lock */, 1, \
SQLITE_SHM_LOCK | SQLITE_SHM_EXCLUSIVE); \
if (_rv != 0) { \
assert(_rv == SQLITE_BUSY); \
failure(req, SQLITE_BUSY, "checkpoint in progress"); \
return 0; \
} \
_file->pMethods->xShmLock( \
_file, 1 /* checkpoint lock */, 1, \
SQLITE_SHM_UNLOCK | SQLITE_SHM_EXCLUSIVE); \
}
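/* FAIL_IF_CHECKPOINTING probes SQLite's checkpoint lock (shared-memory lock
 * slot 1): if it cannot be taken exclusively, a checkpoint is in progress and
 * the request fails with SQLITE_BUSY; otherwise the lock is dropped at once. */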
/* Encode a failure response and invoke the request callback. */
static void failure(struct handle *req, int code, const char *message)
{
struct response_failure failure;
size_t n;
char *cursor;
failure.code = (uint64_t)code;
failure.message = message;
n = response_failure__sizeof(&failure);
assert(n % 8 == 0);
cursor = buffer__advance(req->buffer, n);
/* The buffer has at least 4096 bytes, and error messages are shorter
* than that. So this can't fail. */
assert(cursor != NULL);
response_failure__encode(&failure, &cursor);
req->cb(req, 0, DQLITE_RESPONSE_FAILURE, 0);
}
static void emptyRows(struct handle *req)
{
char *cursor = buffer__advance(req->buffer, 8 + 8);
uint64_t val;
assert(cursor != NULL);
val = 0;
uint64__encode(&val, &cursor);
val = DQLITE_RESPONSE_ROWS_DONE;
uint64__encode(&val, &cursor);
req->cb(req, 0, DQLITE_RESPONSE_ROWS, 0);
}
static int handle_leader_legacy(struct gateway *g, struct handle *req)
{
tracef("handle leader legacy");
struct cursor *cursor = &req->cursor;
START_V0(leader, server_legacy);
raft_id id;
raft_leader(g->raft, &id, &response.address);
if (response.address == NULL) {
response.address = "";
}
SUCCESS_V0(server_legacy, SERVER_LEGACY);
return 0;
}
static int handle_leader(struct gateway *g, struct handle *req)
{
tracef("handle leader");
struct cursor *cursor = &req->cursor;
raft_id id = 0;
const char *address = NULL;
unsigned i;
if (g->protocol == DQLITE_PROTOCOL_VERSION_LEGACY) {
return handle_leader_legacy(g, req);
}
START_V0(leader, server);
/* Only voters might know who the leader is. */
for (i = 0; i < g->raft->configuration.n; i++) {
struct raft_server *server = &g->raft->configuration.servers[i];
if (server->id == g->raft->id && server->role == RAFT_VOTER) {
tracef("handle leader - dispatch to %llu", server->id);
raft_leader(g->raft, &id, &address);
break;
}
}
response.id = id;
response.address = address;
if (response.address == NULL) {
response.address = "";
}
SUCCESS_V0(server, SERVER);
return 0;
}
static int handle_client(struct gateway *g, struct handle *req)
{
tracef("handle client");
struct cursor *cursor = &req->cursor;
START_V0(client, welcome);
g->client_id = request.id;
response.heartbeat_timeout = g->config->heartbeat_timeout;
SUCCESS_V0(welcome, WELCOME);
return 0;
}
static int handle_open(struct gateway *g, struct handle *req)
{
tracef("handle open");
struct cursor *cursor = &req->cursor;
struct db *db;
int rc;
START_V0(open, db);
if (g->leader != NULL) {
tracef("already open");
failure(req, SQLITE_BUSY,
"a database for this connection is already open");
return 0;
}
rc = registry__db_get(g->registry, request.filename, &db);
if (rc != 0) {
tracef("registry db get failed %d", rc);
return rc;
}
g->leader = sqlite3_malloc(sizeof *g->leader);
if (g->leader == NULL) {
tracef("malloc failed");
return DQLITE_NOMEM;
}
rc = leader__init(g->leader, db, g->raft);
if (rc != 0) {
tracef("leader init failed %d", rc);
sqlite3_free(g->leader);
g->leader = NULL;
return rc;
}
response.id = 0;
SUCCESS_V0(db, DB);
return 0;
}
static void prepareBarrierCb(struct barrier *barrier, int status)
{
tracef("prepare barrier cb status:%d", status);
struct gateway *g = barrier->data;
struct handle *req = g->req;
struct response_stmt response_v0 = { 0 };
struct response_stmt_with_offset response_v1 = { 0 };
const char *sql = req->sql;
struct stmt *stmt;
const char *tail;
sqlite3_stmt *tail_stmt;
int rc;
assert(req != NULL);
stmt = stmt__registry_get(&g->stmts, req->stmt_id);
assert(stmt != NULL);
g->req = NULL;
if (status != 0) {
stmt__registry_del(&g->stmts, stmt);
failure(req, status, "barrier error");
return;
}
rc = sqlite3_prepare_v2(g->leader->conn, sql, -1, &stmt->stmt, &tail);
if (rc != SQLITE_OK) {
failure(req, rc, sqlite3_errmsg(g->leader->conn));
stmt__registry_del(&g->stmts, stmt);
return;
}
if (stmt->stmt == NULL) {
tracef("prepare barrier cb empty statement");
stmt__registry_del(&g->stmts, stmt);
/* FIXME Should we use a code other than 0 here? */
failure(req, 0, "empty statement");
return;
}
if (req->schema == DQLITE_PREPARE_STMT_SCHEMA_V0) {
rc = sqlite3_prepare_v2(g->leader->conn, tail, -1, &tail_stmt,
NULL);
if (rc != 0 || tail_stmt != NULL) {
stmt__registry_del(&g->stmts, stmt);
sqlite3_finalize(tail_stmt);
failure(req, SQLITE_ERROR, "nonempty statement tail");
return;
}
}
switch (req->schema) {
case DQLITE_PREPARE_STMT_SCHEMA_V0:
response_v0.db_id = (uint32_t)req->db_id;
response_v0.id = (uint32_t)stmt->id;
response_v0.params =
(uint64_t)sqlite3_bind_parameter_count(stmt->stmt);
SUCCESS(stmt, STMT, response_v0,
DQLITE_PREPARE_STMT_SCHEMA_V0);
break;
case DQLITE_PREPARE_STMT_SCHEMA_V1:
response_v1.db_id = (uint32_t)req->db_id;
response_v1.id = (uint32_t)stmt->id;
response_v1.params =
(uint64_t)sqlite3_bind_parameter_count(stmt->stmt);
response_v1.offset = (uint64_t)(tail - sql);
SUCCESS(stmt_with_offset, STMT_WITH_OFFSET, response_v1,
DQLITE_PREPARE_STMT_SCHEMA_V1);
break;
default:
assert(0);
}
}
static int handle_prepare(struct gateway *g, struct handle *req)
{
tracef("handle prepare");
struct cursor *cursor = &req->cursor;
struct stmt *stmt;
struct request_prepare request = { 0 };
int rc;
if (req->schema != DQLITE_PREPARE_STMT_SCHEMA_V0 &&
req->schema != DQLITE_PREPARE_STMT_SCHEMA_V1) {
failure(req, SQLITE_ERROR, "unrecognized schema version");
return 0;
}
rc = request_prepare__decode(cursor, &request);
if (rc != 0) {
return rc;
}
CHECK_LEADER(req);
LOOKUP_DB(request.db_id);
rc = stmt__registry_add(&g->stmts, &stmt);
if (rc != 0) {
tracef("handle prepare registry add failed %d", rc);
return rc;
}
assert(stmt != NULL);
/* This cast is safe as long as the TODO in LOOKUP_DB is not
* implemented. */
req->db_id = (size_t)request.db_id;
req->stmt_id = stmt->id;
req->sql = request.sql;
g->req = req;
rc = leader__barrier(g->leader, &g->barrier, prepareBarrierCb);
if (rc != 0) {
tracef("handle prepare barrier failed %d", rc);
stmt__registry_del(&g->stmts, stmt);
g->req = NULL;
return rc;
}
return 0;
}
/* Fill a result response with the last inserted ID and number of rows
* affected. */
static void fill_result(struct gateway *g, struct response_result *response)
{
assert(g->leader != NULL);
response->last_insert_id =
(uint64_t)sqlite3_last_insert_rowid(g->leader->conn);
/* FIXME eventually we should consider using sqlite3_changes64 here */
response->rows_affected = (uint64_t)sqlite3_changes(g->leader->conn);
}
static const char *error_message(sqlite3 *db, int rc)
{
switch (rc) {
case SQLITE_IOERR_LEADERSHIP_LOST:
return "disk I/O error";
case SQLITE_IOERR_WRITE:
return "disk I/O error";
case SQLITE_ABORT:
return "abort";
case SQLITE_ROW:
return "rows yielded when none expected for EXEC "
"request";
}
return sqlite3_errmsg(db);
}
static void leader_exec_cb(struct exec *exec, int status)
{
struct gateway *g = exec->data;
struct handle *req = g->req;
struct stmt *stmt = stmt__registry_get(&g->stmts, req->stmt_id);
assert(stmt != NULL);
struct response_result response;
g->req = NULL;
if (status == SQLITE_DONE) {
fill_result(g, &response);
SUCCESS_V0(result, RESULT);
} else {
assert(g->leader != NULL);
failure(req, status, error_message(g->leader->conn, status));
sqlite3_reset(stmt->stmt);
}
}
static int handle_exec(struct gateway *g, struct handle *req)
{
tracef("handle exec schema:%" PRIu8, req->schema);
struct cursor *cursor = &req->cursor;
struct stmt *stmt;
struct request_exec request = { 0 };
int tuple_format;
uint64_t req_id;
int rv;
switch (req->schema) {
case DQLITE_REQUEST_PARAMS_SCHEMA_V0:
tuple_format = TUPLE__PARAMS;
break;
case DQLITE_REQUEST_PARAMS_SCHEMA_V1:
tuple_format = TUPLE__PARAMS32;
break;
default:
tracef("bad schema version %d", req->schema);
failure(req, DQLITE_PARSE,
"unrecognized schema version");
return 0;
}
/* The v0 and v1 schemas only differ in the layout of the tuple,
* so we can use the same decode function for both. */
rv = request_exec__decode(cursor, &request);
if (rv != 0) {
return rv;
}
CHECK_LEADER(req);
LOOKUP_DB(request.db_id);
LOOKUP_STMT(request.stmt_id);
FAIL_IF_CHECKPOINTING;
rv = bind__params(stmt->stmt, cursor, tuple_format);
if (rv != 0) {
tracef("handle exec bind failed %d", rv);
failure(req, rv, "bind parameters");
return 0;
}
req->stmt_id = stmt->id;
g->req = req;
req_id = idNext(&g->random_state);
rv = leader__exec(g->leader, &g->exec, stmt->stmt, req_id,
leader_exec_cb);
if (rv != 0) {
tracef("handle exec leader exec failed %d", rv);
g->req = NULL;
return rv;
}
return 0;
}
/* Step through the given statement and populate the response buffer of the
* given request with a single batch of rows.
*
* A single batch of rows is typically about the size of a memory page. */
static void query_batch_async(struct handle *req, enum pool_half half)
{
struct gateway *g = req->gw;
sqlite3_stmt *stmt = req->stmt;
assert(stmt != NULL);
struct response_rows response;
int rc;
if (half == POOL_TOP_HALF) {
req->work.rc = query__batch(stmt, req->buffer);
return;
} /* else POOL_BOTTOM_HALF => */
rc = req->work.rc;
if (rc != SQLITE_ROW && rc != SQLITE_DONE) {
assert(g->leader != NULL);
failure(req, rc, sqlite3_errmsg(g->leader->conn));
sqlite3_reset(stmt);
goto done;
}
if (rc == SQLITE_ROW) {
response.eof = DQLITE_RESPONSE_ROWS_PART;
g->req = req;
SUCCESS_V0(rows, ROWS);
return;
} else {
response.eof = DQLITE_RESPONSE_ROWS_DONE;
SUCCESS_V0(rows, ROWS);
}
done:
if (req->type == DQLITE_REQUEST_QUERY_SQL) {
sqlite3_finalize(stmt);
}
}
#ifdef DQLITE_NEXT
static void qb_top(pool_work_t *w)
{
struct handle *req = CONTAINER_OF(w, struct handle, work);
query_batch_async(req, POOL_TOP_HALF);
}
static void qb_bottom(pool_work_t *w)
{
struct handle *req = CONTAINER_OF(w, struct handle, work);
query_batch_async(req, POOL_BOTTOM_HALF);
}
#endif
static void query_batch(struct gateway *g)
{
struct handle *req = g->req;
assert(req != NULL);
g->req = NULL;
req->gw = g;
#ifdef DQLITE_NEXT
struct dqlite_node *node = g->raft->data;
pool_t *pool = !!(pool_ut_fallback()->flags & POOL_FOR_UT)
? pool_ut_fallback() : &node->pool;
pool_queue_work(pool, &req->work, g->leader->db->cookie,
WT_UNORD, qb_top, qb_bottom);
#else
query_batch_async(req, POOL_TOP_HALF);
query_batch_async(req, POOL_BOTTOM_HALF);
#endif
}
static void query_barrier_cb(struct barrier *barrier, int status)
{
tracef("query barrier cb status:%d", status);
struct gateway *g = barrier->data;
struct handle *req = g->req;
assert(req != NULL);
g->req = NULL;
struct stmt *stmt = stmt__registry_get(&g->stmts, req->stmt_id);
assert(stmt != NULL);
if (status != 0) {
failure(req, status, "barrier error");
return;
}
req->stmt = stmt->stmt;
g->req = req;
query_batch(g);
}
static void leaderModifyingQueryCb(struct exec *exec, int status)
{
struct gateway *g = exec->data;
struct handle *req = g->req;
assert(req != NULL);
g->req = NULL;
struct stmt *stmt = stmt__registry_get(&g->stmts, req->stmt_id);
assert(stmt != NULL);
if (status == SQLITE_DONE) {
emptyRows(req);
} else {
assert(g->leader != NULL);
failure(req, status, error_message(g->leader->conn, status));
sqlite3_reset(stmt->stmt);
}
}
static int handle_query(struct gateway *g, struct handle *req)
{
tracef("handle query schema:%" PRIu8, req->schema);
struct cursor *cursor = &req->cursor;
struct stmt *stmt;
struct request_query request = { 0 };
int tuple_format;
bool is_readonly;
uint64_t req_id;
int rv;
switch (req->schema) {
case DQLITE_REQUEST_PARAMS_SCHEMA_V0:
tuple_format = TUPLE__PARAMS;
break;
case DQLITE_REQUEST_PARAMS_SCHEMA_V1:
tuple_format = TUPLE__PARAMS32;
break;
default:
tracef("bad schema version %d", req->schema);
failure(req, DQLITE_PARSE,
"unrecognized schema version");
return 0;
}
/* The only difference in layout between the v0 and v1 requests is in
* the tuple, which isn't parsed until bind__params later on. */
rv = request_query__decode(cursor, &request);
if (rv != 0) {
return rv;
}
CHECK_LEADER(req);
LOOKUP_DB(request.db_id);
LOOKUP_STMT(request.stmt_id);
FAIL_IF_CHECKPOINTING;
rv = bind__params(stmt->stmt, cursor, tuple_format);
if (rv != 0) {
tracef("handle query bind failed %d", rv);
failure(req, rv, "bind parameters");
return 0;
}
req->stmt_id = stmt->id;
g->req = req;
is_readonly = (bool)sqlite3_stmt_readonly(stmt->stmt);
if (is_readonly) {
rv = leader__barrier(g->leader, &g->barrier, query_barrier_cb);
} else {
req_id = idNext(&g->random_state);
rv = leader__exec(g->leader, &g->exec, stmt->stmt, req_id,
leaderModifyingQueryCb);
}
if (rv != 0) {
g->req = NULL;
return rv;
}
return 0;
}
static int handle_finalize(struct gateway *g, struct handle *req)
{
tracef("handle finalize");
struct cursor *cursor = &req->cursor;
struct stmt *stmt;
int rv;
START_V0(finalize, empty);
LOOKUP_DB(request.db_id);
LOOKUP_STMT(request.stmt_id);
rv = stmt__registry_del(&g->stmts, stmt);
if (rv != 0) {
tracef("handle finalize registry del failed %d", rv);
failure(req, rv, "finalize statement");
return 0;
}
SUCCESS_V0(empty, EMPTY);
return 0;
}
static void handle_exec_sql_next(struct gateway *g,
struct handle *req,
bool done);
static void handle_exec_sql_cb(struct exec *exec, int status)
{
tracef("handle exec sql cb status %d", status);
struct gateway *g = exec->data;
struct handle *req = g->req;
req->exec_count += 1;
sqlite3_finalize(exec->stmt);
if (status == SQLITE_DONE) {
handle_exec_sql_next(g, req, true);
} else {
assert(g->leader != NULL);
failure(req, status, error_message(g->leader->conn, status));
g->req = NULL;
}
}
static void handle_exec_sql_next(struct gateway *g,
struct handle *req,
bool done)
{
tracef("handle exec sql next");
struct cursor *cursor = &req->cursor;
struct response_result response = { 0 };
sqlite3_stmt *stmt = NULL;
const char *tail;
int tuple_format;
uint64_t req_id;
int rv;
if (req->sql == NULL || strcmp(req->sql, "") == 0) {
goto success;
}
/* stmt will be set to NULL by sqlite when an error occurs. */
assert(g->leader != NULL);
rv = sqlite3_prepare_v2(g->leader->conn, req->sql, -1, &stmt, &tail);
if (rv != SQLITE_OK) {
tracef("exec sql prepare failed %d", rv);
failure(req, rv, sqlite3_errmsg(g->leader->conn));
goto done;
}
if (stmt == NULL) {
goto success;
}
if (!done) {
switch (req->schema) {
case DQLITE_REQUEST_PARAMS_SCHEMA_V0:
tuple_format = TUPLE__PARAMS;
break;
case DQLITE_REQUEST_PARAMS_SCHEMA_V1:
tuple_format = TUPLE__PARAMS32;
break;
default:
/* Should have been caught by handle_exec_sql */
assert(0);
}
rv = bind__params(stmt, cursor, tuple_format);
if (rv != SQLITE_OK) {
failure(req, rv, "bind parameters");
goto done_after_prepare;
}
}
req->sql = tail;
g->req = req;
req_id = idNext(&g->random_state);
/* At this point, leader__exec takes ownership of stmt */
rv =
leader__exec(g->leader, &g->exec, stmt, req_id, handle_exec_sql_cb);
if (rv != SQLITE_OK) {
failure(req, rv, sqlite3_errmsg(g->leader->conn));
goto done_after_prepare;
}
return;
success:
tracef("handle exec sql next success");
if (req->exec_count > 0) {
fill_result(g, &response);
}
SUCCESS_V0(result, RESULT);
done_after_prepare:
sqlite3_finalize(stmt);
done:
g->req = NULL;
}
static void execSqlBarrierCb(struct barrier *barrier, int status)
{
tracef("exec sql barrier cb status:%d", status);
struct gateway *g = barrier->data;
struct handle *req = g->req;
assert(req != NULL);
g->req = NULL;
if (status != 0) {
failure(req, status, "barrier error");
return;
}
handle_exec_sql_next(g, req, false);
}
static int handle_exec_sql(struct gateway *g, struct handle *req)
{
tracef("handle exec sql schema:%" PRIu8, req->schema);
struct cursor *cursor = &req->cursor;
struct request_exec_sql request = { 0 };
int rc;
/* Fail early if the schema version isn't recognized, even though we
* won't use it until later. */
if (req->schema != 0 && req->schema != 1) {
tracef("bad schema version %d", req->schema);
failure(req, DQLITE_PARSE, "unrecognized schema version");
return 0;
}
/* The only difference in layout between the v0 and v1 requests is in
* the tuple, which isn't parsed until bind__params later on. */
rc = request_exec_sql__decode(cursor, &request);
if (rc != 0) {
return rc;
}
CHECK_LEADER(req);
LOOKUP_DB(request.db_id);
FAIL_IF_CHECKPOINTING;
req->sql = request.sql;
req->exec_count = 0;
g->req = req;
rc = leader__barrier(g->leader, &g->barrier, execSqlBarrierCb);
if (rc != 0) {
tracef("handle exec sql barrier failed %d", rc);
g->req = NULL;
return rc;
}
return 0;
}
static void leaderModifyingQuerySqlCb(struct exec *exec, int status)
{
struct gateway *g = exec->data;
struct handle *req = g->req;
assert(req != NULL);
g->req = NULL;
sqlite3_stmt *stmt = exec->stmt;
assert(stmt != NULL);
sqlite3_finalize(stmt);
if (status == SQLITE_DONE) {
emptyRows(req);
} else {
assert(g->leader != NULL);
failure(req, status, error_message(g->leader->conn, status));
}
}
static void querySqlBarrierCb(struct barrier *barrier, int status)
{
tracef("query sql barrier cb status:%d", status);
struct gateway *g = barrier->data;
struct handle *req = g->req;
assert(req != NULL);
g->req = NULL;
struct cursor *cursor = &req->cursor;
const char *sql = req->sql;
sqlite3_stmt *stmt;
const char *tail;
sqlite3_stmt *tail_stmt;
int tuple_format;
bool is_readonly;
uint64_t req_id;
int rv;
if (status != 0) {
failure(req, status, "barrier error");
return;
}
rv = sqlite3_prepare_v2(g->leader->conn, sql, -1, &stmt, &tail);
if (rv != SQLITE_OK) {
tracef("handle query sql prepare failed %d", rv);
failure(req, rv, sqlite3_errmsg(g->leader->conn));
return;
}
if (stmt == NULL) {
tracef("handle query sql empty statement");
failure(req, rv, "empty statement");
return;
}
rv = sqlite3_prepare_v2(g->leader->conn, tail, -1, &tail_stmt, NULL);
if (rv != 0 || tail_stmt != NULL) {
sqlite3_finalize(stmt);
sqlite3_finalize(tail_stmt);
failure(req, SQLITE_ERROR, "nonempty statement tail");
return;
}
switch (req->schema) {
case DQLITE_REQUEST_PARAMS_SCHEMA_V0:
tuple_format = TUPLE__PARAMS;
break;
case DQLITE_REQUEST_PARAMS_SCHEMA_V1:
tuple_format = TUPLE__PARAMS32;
break;
default:
/* Should have been caught by handle_query_sql */
assert(0);
}
rv = bind__params(stmt, cursor, tuple_format);
if (rv != 0) {
tracef("handle query sql bind failed %d", rv);
sqlite3_finalize(stmt);
failure(req, rv, "bind parameters");
return;
}
req->stmt = stmt;
g->req = req;
is_readonly = (bool)sqlite3_stmt_readonly(stmt);
if (is_readonly) {
query_batch(g);
} else {
req_id = idNext(&g->random_state);
rv = leader__exec(g->leader, &g->exec, stmt, req_id,
leaderModifyingQuerySqlCb);
if (rv != 0) {
sqlite3_finalize(stmt);
g->req = NULL;
failure(req, rv, "leader exec");
}
}
}
static int handle_query_sql(struct gateway *g, struct handle *req)
{
tracef("handle query sql schema:%" PRIu8, req->schema);
struct cursor *cursor = &req->cursor;
struct request_query_sql request = { 0 };
int rv;
/* Fail early if the schema version isn't recognized. */
if (req->schema != 0 && req->schema != 1) {
tracef("bad schema version %d", req->schema);
failure(req, DQLITE_PARSE, "unrecognized schema version");
return 0;
}
/* The schema version only affects the tuple format, which is parsed later. */
rv = request_query_sql__decode(cursor, &request);
if (rv != 0) {
return rv;
}
CHECK_LEADER(req);
LOOKUP_DB(request.db_id);
FAIL_IF_CHECKPOINTING;
req->sql = request.sql;
g->req = req;
rv = leader__barrier(g->leader, &g->barrier, querySqlBarrierCb);
if (rv != 0) {
tracef("handle query sql barrier failed %d", rv);
g->req = NULL;
return rv;
}
return 0;
}
/*
* An interrupt can only be handled when a query is already yielding rows.
*/
static int handle_interrupt(struct gateway *g, struct handle *req)
{
tracef("handle interrupt");
g->req = NULL;
struct cursor *cursor = &req->cursor;
START_V0(interrupt, empty);
sqlite3_finalize(req->stmt);
req->stmt = NULL;
SUCCESS_V0(empty, EMPTY);
return 0;
}
struct change {
struct gateway *gateway;
struct raft_change req;
};
static void raftChangeCb(struct raft_change *change, int status)
{
tracef("raft change cb id:%" PRIu64 " status:%d",
idExtract(change->req_id), status);
struct change *r = change->data;
struct gateway *g = r->gateway;
struct handle *req = g->req;
struct response_empty response = { 0 };
g->req = NULL;
sqlite3_free(r);
if (status != 0) {
failure(req, translateRaftErrCode(status),
raft_strerror(status));
} else {
SUCCESS_V0(empty, EMPTY);
}
}
static int handle_add(struct gateway *g, struct handle *req)
{
tracef("handle add");
struct cursor *cursor = &req->cursor;
struct change *r;
uint64_t req_id;
int rv;
START_V0(add, empty);
(void)response;
CHECK_LEADER(req);
r = sqlite3_malloc(sizeof *r);
if (r == NULL) {
return DQLITE_NOMEM;
}
r->gateway = g;
r->req.data = r;
req_id = idNext(&g->random_state);
idSet(r->req.req_id, req_id);
g->req = req;
rv = raft_add(g->raft, &r->req, request.id, request.address,
raftChangeCb);
if (rv != 0) {
tracef("raft add failed %d", rv);
g->req = NULL;
sqlite3_free(r);
failure(req, translateRaftErrCode(rv), raft_strerror(rv));
return 0;
}
return 0;
}
static int handle_promote_or_assign(struct gateway *g, struct handle *req)
{
tracef("handle assign");
struct cursor *cursor = &req->cursor;
struct change *r;
uint64_t role = DQLITE_VOTER;
uint64_t req_id;
int rv;
START_V0(promote_or_assign, empty);
(void)response;
CHECK_LEADER(req);
/* Detect whether this is an assign-role request rather than the legacy
 * promote request. */
if (cursor->cap > 0) {
rv = uint64__decode(cursor, &role);
if (rv != 0) {
tracef("handle assign promote rv %d", rv);
return rv;
}
}
r = sqlite3_malloc(sizeof *r);
if (r == NULL) {
tracef("malloc failed");
return DQLITE_NOMEM;
}
r->gateway = g;
r->req.data = r;
req_id = idNext(&g->random_state);
idSet(r->req.req_id, req_id);
g->req = req;
rv = raft_assign(g->raft, &r->req, request.id,
translateDqliteRole((int)role), raftChangeCb);
if (rv != 0) {
tracef("raft_assign failed %d", rv);
g->req = NULL;
sqlite3_free(r);
failure(req, translateRaftErrCode(rv), raft_strerror(rv));
return 0;
}
return 0;
}
static int handle_remove(struct gateway *g, struct handle *req)
{
tracef("handle remove");
struct cursor *cursor = &req->cursor;
struct change *r;
uint64_t req_id;
int rv;
START_V0(remove, empty);
(void)response;
CHECK_LEADER(req);
r = sqlite3_malloc(sizeof *r);
if (r == NULL) {
tracef("malloc failed");
return DQLITE_NOMEM;
}
r->gateway = g;
r->req.data = r;
req_id = idNext(&g->random_state);
idSet(r->req.req_id, req_id);
g->req = req;
rv = raft_remove(g->raft, &r->req, request.id, raftChangeCb);
if (rv != 0) {
tracef("raft_remote failed %d", rv);
g->req = NULL;
sqlite3_free(r);
failure(req, translateRaftErrCode(rv), raft_strerror(rv));
return 0;
}
return 0;
}
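/* Append one file entry to the dump response: the file name (as padded
 * text), its length as a uint64, then the raw contents (whose size must
 * be a multiple of 8). */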
static int dumpFile(const char *filename,
uint8_t *data,
size_t n,
struct buffer *buffer)
{
char *cur;
uint64_t len = n;
cur = buffer__advance(buffer, text__sizeof(&filename));
if (cur == NULL) {
goto oom;
}
text__encode(&filename, &cur);
cur = buffer__advance(buffer, uint64__sizeof(&len));
if (cur == NULL) {
goto oom;
}
uint64__encode(&len, &cur);
if (n == 0) {
return 0;
}
assert(n % 8 == 0);
assert(data != NULL);
cur = buffer__advance(buffer, n);
if (cur == NULL) {
goto oom;
}
memcpy(cur, data, n);
return 0;
oom:
return DQLITE_NOMEM;
}
static int handle_dump(struct gateway *g, struct handle *req)
{
tracef("handle dump");
struct cursor *cursor = &req->cursor;
bool err = true;
sqlite3_vfs *vfs;
char *cur;
char filename[1024] = { 0 };
void *data;
size_t n;
uint8_t *page;
uint32_t database_size = 0;
uint8_t *database;
uint8_t *wal;
size_t n_database;
size_t n_wal;
int rv;
START_V0(dump, files);
response.n = 2;
cur = buffer__advance(req->buffer, response_files__sizeof(&response));
assert(cur != NULL);
response_files__encode(&response, &cur);
vfs = sqlite3_vfs_find(g->config->name);
rv = VfsSnapshot(vfs, request.filename, &data, &n);
if (rv != 0) {
tracef("dump failed");
failure(req, rv, "failed to dump database");
return 0;
}
if (data != NULL) {
/* Extract the database size from the first page. */
page = data;
database_size += (uint32_t)(page[28] << 24);
database_size += (uint32_t)(page[29] << 16);
database_size += (uint32_t)(page[30] << 8);
database_size += (uint32_t)(page[31]);
n_database = database_size * g->config->page_size;
n_wal = n - n_database;
database = data;
wal = database + n_database;
} else {
assert(n == 0);
n_database = 0;
n_wal = 0;
database = NULL;
wal = NULL;
}
rv = dumpFile(request.filename, database, n_database, req->buffer);
if (rv != 0) {
tracef("dump failed");
failure(req, rv, "failed to dump database");
goto out_free_data;
}
/* filename is zero-initialized and strncpy is limited to writing
 * sizeof(filename) - 4 - 1 bytes, so after strncpy filename is still
 * zero-terminated and has not overflowed. strcat then appends the
 * 4-byte "-wal" suffix and zero-terminates the resulting string. */
const char *wal_suffix = "-wal";
strncpy(filename, request.filename,
sizeof(filename) - strlen(wal_suffix) - 1);
strcat(filename, wal_suffix);
rv = dumpFile(filename, wal, n_wal, req->buffer);
if (rv != 0) {
tracef("wal dump failed");
failure(req, rv, "failed to dump wal file");
goto out_free_data;
}
err = false;
out_free_data:
if (data != NULL) {
raft_free(data);
}
if (!err) {
req->cb(req, 0, DQLITE_RESPONSE_FILES, 0);
}
return 0;
}
static int encodeServer(struct gateway *g,
unsigned i,
struct buffer *buffer,
int format)
{
char *cur;
uint64_t id;
uint64_t role;
text_t address;
assert(format == DQLITE_REQUEST_CLUSTER_FORMAT_V0 ||
format == DQLITE_REQUEST_CLUSTER_FORMAT_V1);
id = g->raft->configuration.servers[i].id;
address = g->raft->configuration.servers[i].address;
role =
(uint64_t)translateRaftRole(g->raft->configuration.servers[i].role);
cur = buffer__advance(buffer, uint64__sizeof(&id));
if (cur == NULL) {
return DQLITE_NOMEM;
}
uint64__encode(&id, &cur);
cur = buffer__advance(buffer, text__sizeof(&address));
if (cur == NULL) {
return DQLITE_NOMEM;
}
text__encode(&address, &cur);
if (format == DQLITE_REQUEST_CLUSTER_FORMAT_V0) {
return 0;
}
cur = buffer__advance(buffer, uint64__sizeof(&role));
if (cur == NULL) {
return DQLITE_NOMEM;
}
uint64__encode(&role, &cur);
return 0;
}
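/* In DQLITE_REQUEST_CLUSTER_FORMAT_V1 each server is thus encoded as an
 * 8-byte id, the address as padded text, and an 8-byte role word; the V0
 * format omits the role. */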
static int handle_cluster(struct gateway *g, struct handle *req)
{
tracef("handle cluster");
struct cursor *cursor = &req->cursor;
unsigned i;
char *cur;
int rv;
START_V0(cluster, servers);
if (request.format != DQLITE_REQUEST_CLUSTER_FORMAT_V0 &&
request.format != DQLITE_REQUEST_CLUSTER_FORMAT_V1) {
tracef("bad cluster format");
failure(req, DQLITE_PARSE, "unrecognized cluster format");
return 0;
}
response.n = g->raft->configuration.n;
cur = buffer__advance(req->buffer, response_servers__sizeof(&response));
assert(cur != NULL);
response_servers__encode(&response, &cur);
for (i = 0; i < response.n; i++) {
rv = encodeServer(g, i, req->buffer, (int)request.format);
if (rv != 0) {
tracef("encode failed");
failure(req, rv, "failed to encode server");
return 0;
}
}
req->cb(req, 0, DQLITE_RESPONSE_SERVERS, 0);
return 0;
}
void raftTransferCb(struct raft_transfer *r)
{
struct gateway *g = r->data;
struct handle *req = g->req;
struct response_empty response = { 0 };
g->req = NULL;
sqlite3_free(r);
if (g->raft->state == RAFT_LEADER) {
tracef("transfer failed");
failure(req, DQLITE_ERROR, "leadership transfer failed");
} else {
SUCCESS_V0(empty, EMPTY);
}
}
static int handle_transfer(struct gateway *g, struct handle *req)
{
tracef("handle transfer");
struct cursor *cursor = &req->cursor;
struct raft_transfer *r;
int rv;
START_V0(transfer, empty);
(void)response;
CHECK_LEADER(req);
r = sqlite3_malloc(sizeof *r);
if (r == NULL) {
tracef("malloc failed");
return DQLITE_NOMEM;
}
r->data = g;
g->req = req;
rv = raft_transfer(g->raft, r, request.id, raftTransferCb);
if (rv != 0) {
tracef("raft_transfer failed %d", rv);
g->req = NULL;
sqlite3_free(r);
failure(req, translateRaftErrCode(rv), raft_strerror(rv));
return 0;
}
return 0;
}
static int handle_describe(struct gateway *g, struct handle *req)
{
tracef("handle describe");
struct cursor *cursor = &req->cursor;
START_V0(describe, metadata);
if (request.format != DQLITE_REQUEST_DESCRIBE_FORMAT_V0) {
tracef("bad format");
failure(req, SQLITE_PROTOCOL, "bad format version");
}
response.failure_domain = g->config->failure_domain;
response.weight = g->config->weight;
SUCCESS_V0(metadata, METADATA);
return 0;
}
static int handle_weight(struct gateway *g, struct handle *req)
{
tracef("handle weight");
struct cursor *cursor = &req->cursor;
START_V0(weight, empty);
g->config->weight = request.weight;
SUCCESS_V0(empty, EMPTY);
return 0;
}
int gateway__handle(struct gateway *g,
struct handle *req,
int type,
int schema,
struct buffer *buffer,
handle_cb cb)
{
tracef("gateway handle");
int rc = 0;
sqlite3_stmt *stmt = NULL; // used for DQLITE_REQUEST_INTERRUPT
if (g->req == NULL) {
goto handle;
}
/* Request in progress. TODO The current implementation doesn't allow
* reading a new request while a query is yielding rows, in that case
* gateway__resume in write_cb will indicate it has not finished
* returning results and a new request (in this case, the interrupt)
* will not be read. */
if (g->req->type == DQLITE_REQUEST_QUERY &&
type == DQLITE_REQUEST_INTERRUPT) {
goto handle;
}
if (g->req->type == DQLITE_REQUEST_QUERY_SQL &&
type == DQLITE_REQUEST_INTERRUPT) {
stmt = g->req->stmt;
goto handle;
}
/* Receiving a request when one is ongoing on the same connection
* is a hard error. The connection will be stopped due to the non-0
* return code in case asserts are off. */
assert(false);
return SQLITE_BUSY;
handle:
req->type = type;
req->schema = schema;
req->cb = cb;
req->buffer = buffer;
req->db_id = 0;
req->stmt_id = 0;
req->sql = NULL;
req->stmt = stmt;
req->exec_count = 0;
req->work = (pool_work_t){};
switch (type) {
#define DISPATCH(LOWER, UPPER, _) \
case DQLITE_REQUEST_##UPPER: \
rc = handle_##LOWER(g, req); \
break;
REQUEST__TYPES(DISPATCH);
default:
tracef("unrecognized request type %d", type);
failure(req, DQLITE_PARSE, "unrecognized request type");
rc = 0;
}
return rc;
}
int gateway__resume(struct gateway *g, bool *finished)
{
if (g->req == NULL || (g->req->type != DQLITE_REQUEST_QUERY &&
g->req->type != DQLITE_REQUEST_QUERY_SQL)) {
tracef("gateway resume - finished");
*finished = true;
return 0;
}
tracef("gateway resume - not finished");
*finished = false;
g->req->work = (pool_work_t){};
query_batch(g);
return 0;
}
dqlite-1.16.7/src/gateway.h 0000664 0000000 0000000 00000010311 14652527134 0015476 0 ustar 00root root 0000000 0000000 /**
* Core dqlite server engine, calling out to SQLite to serve client requests.
*/
#ifndef DQLITE_GATEWAY_H_
#define DQLITE_GATEWAY_H_
#include "../include/dqlite.h"
#include "lib/buffer.h"
#include "lib/serialize.h"
#include "config.h"
#include "id.h"
#include "leader.h"
#include "raft.h"
#include "registry.h"
#include "stmt.h"
struct handle;
/**
* Handle requests from a single connected client and forward them to
* SQLite.
*/
struct gateway {
struct config *config; /* Configuration */
struct registry *registry; /* Register of existing databases */
struct raft *raft; /* Raft instance */
struct leader *leader; /* Leader connection to the database */
struct handle *req; /* Asynchronous request being handled */
struct exec exec; /* Low-level exec async request */
struct stmt__registry stmts; /* Registry of prepared statements */
struct barrier barrier; /* Barrier for query requests */
uint64_t protocol; /* Protocol format version */
uint64_t client_id;
struct id_state random_state; /* For generating IDs */
};
void gateway__init(struct gateway *g,
struct config *config,
struct registry *registry,
struct raft *raft,
struct id_state seed);
void gateway__close(struct gateway *g);
/**
* Closes the leader connection to the database, reason should contain a raft
* error code.
*/
void gateway__leader_close(struct gateway *g, int reason);
/**
* Asynchronous request to handle a client command.
*
* We also use the handle as a place to save request-scoped data that we need
* to access from a callback.
*/
typedef void (*handle_cb)(struct handle *req,
int status,
uint8_t type,
uint8_t schema);
struct handle {
/* User data. */
void *data;
/* Type code for this request. */
int type;
/* Schema version for this request. */
int schema;
/* Buffer where the response to this request will be written. */
struct buffer *buffer;
/* Cursor for reading the request. */
struct cursor cursor;
/* Database ID parsed from this request.
*
* This is used by handle_prepare. */
size_t db_id;
/* ID of the statement associated with this request.
*
* This is used by handle_prepare. */
size_t stmt_id;
/* SQL string associated with this request.
*
* This is used by handle_prepare, handle_query_sql, and handle_exec_sql
* to save the provided SQL string across calls to leader__barrier and
* leader__exec, since there's no prepared statement that can be saved
* instead. In the case of handle_exec_sql, after preparing each
* statement we update this field to point to the "tail" that has not
* been prepared yet. */
const char *sql;
/* Prepared statement that will be queried to process this request.
*
* This is used by handle_query and handle_query_sql. */
sqlite3_stmt *stmt;
/* Number of times a statement parsed from this request has been
* executed.
*
* This is used by handle_exec_sql, which parses zero or more statements
* from the provided SQL string and executes them successively. Only if
* at least one statement was executed should we fill the RESULT
* response using sqlite3_last_insert_rowid and sqlite3_changes. */
unsigned exec_count;
/* Callback that will be invoked at the end of request processing to
* write the response. */
handle_cb cb;
/* A link into thread pool's queues. */
pool_work_t work;
/* Gateway the handle belongs to. */
struct gateway *gw;
};
/**
* Start handling a new client request.
*
* At most one request can be outstanding at any given time. This function will
* return an error if user code calls it and there's already a request in
* progress.
*
* The @type parameter holds the request type code (e.g. #REQUEST_LEADER), and
* the @buffer parameter is a buffer for writing the response.
*/
int gateway__handle(struct gateway *g,
struct handle *req,
int type,
int schema,
struct buffer *buffer,
handle_cb cb);
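/* Usage sketch (illustrative only, not part of the upstream API surface): the
 * request type constant, buffer setup and callback below are assumptions made
 * for the example.
 *
 *	static void handle_done(struct handle *req, int status, uint8_t type,
 *				uint8_t schema)
 *	{
 *		// Flush the response accumulated in req->buffer to the client,
 *		// then reset the buffer before submitting the next request.
 *	}
 *
 *	rv = gateway__handle(g, &req, DQLITE_REQUEST_LEADER, 0, &buf, handle_done);
 *	if (rv == SQLITE_BUSY) {
 *		// A request is already being handled on this gateway.
 *	}
 */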
/**
* Resume execution of a query that was yielding a lot of rows and has been
* interrupted in order to start sending a first batch of rows. The response
* write buffer associated with the request must have been reset.
*/
int gateway__resume(struct gateway *g, bool *finished);
#endif /* DQLITE_GATEWAY_H_ */
dqlite-1.16.7/src/id.c 0000664 0000000 0000000 00000002742 14652527134 0014435 0 ustar 00root root 0000000 0000000 #include "id.h"
#include <string.h>
/* The PRNG used for generating request IDs is xoshiro256**, developed by
* David Blackman and Sebastiano Vigna and released into the public domain.
* See <https://prng.di.unimi.it/xoshiro256starstar.c>. */
static uint64_t rotl(uint64_t x, int k)
{
return (x << k) | (x >> (64 - k));
}
uint64_t idNext(struct id_state *state)
{
uint64_t result = rotl(state->data[1] * 5, 7) * 9;
uint64_t t = state->data[1] << 17;
state->data[2] ^= state->data[0];
state->data[3] ^= state->data[1];
state->data[1] ^= state->data[2];
state->data[0] ^= state->data[3];
state->data[2] ^= t;
state->data[3] = rotl(state->data[3], 45);
return result;
}
void idJump(struct id_state *state)
{
static const uint64_t JUMP[] = {0x180ec6d33cfd0aba, 0xd5a61266f0c9392c,
0xa9582618e03fc9aa, 0x39abdc4529b1661c};
uint64_t s0 = 0;
uint64_t s1 = 0;
uint64_t s2 = 0;
uint64_t s3 = 0;
for (size_t i = 0; i < sizeof(JUMP) / sizeof(*JUMP); i++) {
for (size_t b = 0; b < 64; b++) {
if (JUMP[i] & UINT64_C(1) << b) {
s0 ^= state->data[0];
s1 ^= state->data[1];
s2 ^= state->data[2];
s3 ^= state->data[3];
}
idNext(state);
}
}
state->data[0] = s0;
state->data[1] = s1;
state->data[2] = s2;
state->data[3] = s3;
}
uint64_t idExtract(const uint8_t buf[16])
{
uint64_t id;
memcpy(&id, buf, sizeof(id));
return id;
}
void idSet(uint8_t buf[16], uint64_t id)
{
memset(buf, 0, 16);
memcpy(buf, &id, sizeof(id));
buf[15] = (uint8_t)-1;
}
dqlite-1.16.7/src/id.h 0000664 0000000 0000000 00000002015 14652527134 0014433 0 ustar 00root root 0000000 0000000 /**
* Generate, set, and extract dqlite-generated request IDs.
*
* A fresh ID is generated for each config or exec client request that
* arrives at a gateway. These IDs are passed down into raft via the
* req_id field of RAFT__REQUEST, and are suitable for diagnostic use
* only.
*/
#ifndef DQLITE_ID_H_
#define DQLITE_ID_H_
#include <stdint.h>
/**
* State used to generate a request ID.
*/
struct id_state
{
uint64_t data[4];
};
/**
* Generate a request ID, mutating the input state in the process.
*/
uint64_t idNext(struct id_state *state);
/**
* Cause the given state to yield a different sequence of IDs.
*
* This is used to ensure that the sequences of IDs generated for
* distinct clients are (in practice) disjoint.
*/
void idJump(struct id_state *state);
/**
* Read a request ID from the req_id field of RAFT__REQUEST.
*/
uint64_t idExtract(const uint8_t buf[16]);
/**
* Write a request ID to the req_id field of RAFT__REQUEST.
*/
void idSet(uint8_t buf[16], uint64_t id);
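/* Usage sketch (illustrative only; the variable names are assumptions): a
 * server would typically copy a shared seed per client and de-correlate it,
 * e.g.:
 *
 *	struct id_state state = seed; // per-client copy of a server-wide seed
 *	idJump(&state);               // make this client's sequence disjoint
 *	uint64_t id = idNext(&state); // fresh ID for the next request
 */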
#endif /* DQLITE_ID_H_ */
dqlite-1.16.7/src/leader.c 0000664 0000000 0000000 00000030372 14652527134 0015275 0 ustar 00root root 0000000 0000000 #include <inttypes.h>
#include <stdio.h>
#include "../include/dqlite.h"
#include "./lib/assert.h"
#include "command.h"
#include "conn.h"
#include "gateway.h"
#include "id.h"
#include "leader.h"
#include "lib/threadpool.h"
#include "server.h"
#include "tracing.h"
#include "utils.h"
#include "vfs.h"
/* Called when a leader exec request terminates and the associated callback can
* be invoked. */
static void leaderExecDone(struct exec *req)
{
tracef("leader exec done id:%" PRIu64, req->id);
req->leader->exec = NULL;
if (req->cb != NULL) {
req->cb(req, req->status);
}
}
/* Open a SQLite connection and set it to leader replication mode. */
static int openConnection(const char *filename,
const char *vfs,
unsigned page_size,
sqlite3 **conn)
{
tracef("open connection filename %s", filename);
char pragma[255];
int flags = SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE;
char *msg = NULL;
int rc;
rc = sqlite3_open_v2(filename, conn, flags, vfs);
if (rc != SQLITE_OK) {
tracef("open failed %d", rc);
goto err;
}
/* Enable extended result codes */
rc = sqlite3_extended_result_codes(*conn, 1);
if (rc != SQLITE_OK) {
tracef("extended codes failed %d", rc);
goto err;
}
/* The vfs, db, gateway, and leader code currently assumes that
* each connection will operate on only one DB file/WAL file
* pair. Make sure that the client can't use ATTACH DATABASE to
* break this assumption. We apply the same limit in open_follower_conn
* in db.c.
*
* Note, 0 instead of 1 -- apparently the "initial database" is not
* counted when evaluating this limit. */
sqlite3_limit(*conn, SQLITE_LIMIT_ATTACHED, 0);
/* Set the page size. */
sprintf(pragma, "PRAGMA page_size=%u", page_size);
rc = sqlite3_exec(*conn, pragma, NULL, NULL, &msg);
if (rc != SQLITE_OK) {
tracef("page size set failed %d page size %u", rc, page_size);
goto err;
}
/* Disable syncs. */
rc = sqlite3_exec(*conn, "PRAGMA synchronous=OFF", NULL, NULL, &msg);
if (rc != SQLITE_OK) {
tracef("sync off failed %d", rc);
goto err;
}
/* Set WAL journaling. */
rc = sqlite3_exec(*conn, "PRAGMA journal_mode=WAL", NULL, NULL, &msg);
if (rc != SQLITE_OK) {
tracef("wal on failed %d", rc);
goto err;
}
rc = sqlite3_exec(*conn, "PRAGMA wal_autocheckpoint=0", NULL, NULL,
&msg);
if (rc != SQLITE_OK) {
tracef("wal autocheckpoint off failed %d", rc);
goto err;
}
rc =
sqlite3_db_config(*conn, SQLITE_DBCONFIG_NO_CKPT_ON_CLOSE, 1, NULL);
if (rc != SQLITE_OK) {
tracef("db config failed %d", rc);
goto err;
}
/* TODO: make setting foreign keys optional. */
rc = sqlite3_exec(*conn, "PRAGMA foreign_keys=1", NULL, NULL, &msg);
if (rc != SQLITE_OK) {
tracef("enable foreign keys failed %d", rc);
goto err;
}
return 0;
err:
if (*conn != NULL) {
sqlite3_close(*conn);
*conn = NULL;
}
if (msg != NULL) {
sqlite3_free(msg);
}
return rc;
}
/* Whether we need to submit a barrier request because there is no transaction
* in progress in the underlying database and the FSM is behind the last log
* index. */
static bool needsBarrier(struct leader *l)
{
return l->db->tx_id == 0 &&
raft_last_applied(l->raft) < raft_last_index(l->raft);
}
int leader__init(struct leader *l, struct db *db, struct raft *raft)
{
tracef("leader init");
int rc;
l->db = db;
l->raft = raft;
rc = openConnection(db->path, db->config->name, db->config->page_size,
&l->conn);
if (rc != 0) {
tracef("open failed %d", rc);
return rc;
}
l->exec = NULL;
l->inflight = NULL;
queue_insert_tail(&db->leaders, &l->queue);
return 0;
}
void leader__close(struct leader *l)
{
tracef("leader close");
int rc;
/* TODO: there shouldn't be any ongoing exec request. */
if (l->exec != NULL) {
assert(l->inflight == NULL);
l->exec->status = SQLITE_ERROR;
leaderExecDone(l->exec);
}
rc = sqlite3_close(l->conn);
assert(rc == 0);
queue_remove(&l->queue);
}
/* A checkpoint command that fails to commit is not a huge issue.
* The WAL will not be checkpointed this time around on these nodes,
* a new checkpoint command will be issued once the WAL on the leader reaches
* threshold size again. It's improbable that the WAL could grow without bound
* this way: that would mean that apply frames commands commit without
* issues, while the checkpoint command would somehow always fail to commit. */
static void leaderCheckpointApplyCb(struct raft_apply *req,
int status,
void *result)
{
(void)result;
raft_free(req);
if (status != 0) {
tracef("checkpoint apply failed %d", status);
}
}
/* Attempt to perform a checkpoint on nodes running a version of dqlite that
* doesn't perform autonomous checkpoints. For recent nodes, the checkpoint
* command will just be a no-op.
* This function will run after the WAL might have been checkpointed during a
* call to `apply_frames`.
* */
static void leaderMaybeCheckpointLegacy(struct leader *l)
{
tracef("leader maybe checkpoint legacy");
struct sqlite3_file *wal;
struct raft_buffer buf;
struct command_checkpoint command;
sqlite3_int64 size;
int rv;
/* Get the database file associated with this connection */
rv = sqlite3_file_control(l->conn, "main", SQLITE_FCNTL_JOURNAL_POINTER,
&wal);
assert(rv == SQLITE_OK); /* Should never fail */
rv = wal->pMethods->xFileSize(wal, &size);
assert(rv == SQLITE_OK); /* Should never fail */
/* size of the WAL will be 0 if it has just been checkpointed on this
* leader as a result of running apply_frames. */
if (size != 0) {
return;
}
tracef("issue checkpoint command");
/* Attempt to perform a checkpoint across nodes that don't perform
* autonomous snapshots. */
command.filename = l->db->filename;
rv = command__encode(COMMAND_CHECKPOINT, &command, &buf);
if (rv != 0) {
tracef("encode failed %d", rv);
return;
}
struct raft_apply *apply = raft_malloc(sizeof(*apply));
if (apply == NULL) {
tracef("raft_malloc - no mem");
goto err_after_buf_alloc;
}
#ifdef USE_SYSTEM_RAFT
rv = raft_apply(l->raft, apply, &buf, 1, leaderCheckpointApplyCb);
#else
rv = raft_apply(l->raft, apply, &buf, NULL, 1, leaderCheckpointApplyCb);
#endif
if (rv != 0) {
tracef("raft_apply failed %d", rv);
raft_free(apply);
goto err_after_buf_alloc;
}
return;
err_after_buf_alloc:
raft_free(buf.base);
}
static void leaderApplyFramesCb(struct raft_apply *req,
int status,
void *result)
{
tracef("apply frames cb id:%" PRIu64, idExtract(req->req_id));
struct apply *apply = req->data;
struct leader *l = apply->leader;
if (l == NULL) {
raft_free(apply);
return;
}
(void)result;
if (status != 0) {
tracef("apply frames cb failed status %d", status);
sqlite3_vfs *vfs = sqlite3_vfs_find(l->db->config->name);
switch (status) {
case RAFT_LEADERSHIPLOST:
l->exec->status = SQLITE_IOERR_LEADERSHIP_LOST;
break;
case RAFT_NOSPACE:
l->exec->status = SQLITE_IOERR_WRITE;
break;
case RAFT_SHUTDOWN:
/* If we got here it means we have manually
* fired the apply callback from
* gateway__close(). In this case we don't
* free() the apply object, since it will be
* freed when the callback is fired again by
* raft.
*
* TODO: we should instead make gateway__close()
* itself asynchronous. */
apply->leader = NULL;
l->exec->status = SQLITE_ABORT;
goto finish;
break;
default:
l->exec->status = SQLITE_IOERR;
break;
}
VfsAbort(vfs, l->db->path);
}
raft_free(apply);
if (status == 0) {
leaderMaybeCheckpointLegacy(l);
}
finish:
l->inflight = NULL;
l->db->tx_id = 0;
leaderExecDone(l->exec);
}
static int leaderApplyFrames(struct exec *req,
dqlite_vfs_frame *frames,
unsigned n)
{
tracef("leader apply frames id:%" PRIu64, req->id);
struct leader *l = req->leader;
struct db *db = l->db;
struct command_frames c;
struct raft_buffer buf;
struct apply *apply;
int rv;
c.filename = db->filename;
c.tx_id = 0;
c.truncate = 0;
c.is_commit = 1;
c.frames.n_pages = (uint32_t)n;
c.frames.page_size = (uint16_t)db->config->page_size;
c.frames.data = frames;
apply = raft_malloc(sizeof *apply);
if (apply == NULL) {
tracef("malloc");
rv = DQLITE_NOMEM;
goto err;
}
rv = command__encode(COMMAND_FRAMES, &c, &buf);
if (rv != 0) {
tracef("encode %d", rv);
goto err_after_apply_alloc;
}
apply->leader = req->leader;
apply->req.data = apply;
apply->type = COMMAND_FRAMES;
idSet(apply->req.req_id, req->id);
#ifdef USE_SYSTEM_RAFT
rv = raft_apply(l->raft, &apply->req, &buf, 1, leaderApplyFramesCb);
#else
/* TODO actual WAL slice goes here */
struct raft_entry_local_data local_data = {};
rv = raft_apply(l->raft, &apply->req, &buf, &local_data, 1, leaderApplyFramesCb);
#endif
if (rv != 0) {
tracef("raft apply failed %d", rv);
goto err_after_command_encode;
}
db->tx_id = 1;
l->inflight = apply;
return 0;
err_after_command_encode:
raft_free(buf.base);
err_after_apply_alloc:
raft_free(apply);
err:
assert(rv != 0);
return rv;
}
static void leaderExecV2(struct exec *req, enum pool_half half)
{
tracef("leader exec v2 id:%" PRIu64, req->id);
struct leader *l = req->leader;
struct db *db = l->db;
sqlite3_vfs *vfs = sqlite3_vfs_find(db->config->name);
dqlite_vfs_frame *frames;
uint64_t size;
unsigned n;
unsigned i;
int rv;
if (half == POOL_TOP_HALF) {
req->status = sqlite3_step(req->stmt);
return;
} /* else POOL_BOTTOM_HALF => */
rv = VfsPoll(vfs, db->path, &frames, &n);
if (rv != 0 || n == 0) {
tracef("vfs poll");
goto finish;
}
/* Check if the new frames would create an overfull database */
size = VfsDatabaseSize(vfs, db->path, n, db->config->page_size);
if (size > VfsDatabaseSizeLimit(vfs)) {
rv = SQLITE_FULL;
goto abort;
}
rv = leaderApplyFrames(req, frames, n);
if (rv != 0) {
goto abort;
}
for (i = 0; i < n; i++) {
sqlite3_free(frames[i].data);
}
sqlite3_free(frames);
return;
abort:
for (i = 0; i < n; i++) {
sqlite3_free(frames[i].data);
}
sqlite3_free(frames);
VfsAbort(vfs, l->db->path);
finish:
if (rv != 0) {
tracef("exec v2 failed %d", rv);
l->exec->status = rv;
}
leaderExecDone(l->exec);
}
#ifdef DQLITE_NEXT
static void exec_top(pool_work_t *w)
{
struct exec *req = CONTAINER_OF(w, struct exec, work);
leaderExecV2(req, POOL_TOP_HALF);
}
static void exec_bottom(pool_work_t *w)
{
struct exec *req = CONTAINER_OF(w, struct exec, work);
leaderExecV2(req, POOL_BOTTOM_HALF);
}
#endif
static void execBarrierCb(struct barrier *barrier, int status)
{
tracef("exec barrier cb status %d", status);
struct exec *req = barrier->data;
struct leader *l = req->leader;
if (status != 0) {
l->exec->status = status;
leaderExecDone(l->exec);
return;
}
#ifdef DQLITE_NEXT
struct dqlite_node *node = l->raft->data;
pool_t *pool = !!(pool_ut_fallback()->flags & POOL_FOR_UT)
? pool_ut_fallback() : &node->pool;
pool_queue_work(pool, &req->work, l->db->cookie,
WT_UNORD, exec_top, exec_bottom);
#else
leaderExecV2(req, POOL_TOP_HALF);
leaderExecV2(req, POOL_BOTTOM_HALF);
#endif
}
int leader__exec(struct leader *l,
struct exec *req,
sqlite3_stmt *stmt,
uint64_t id,
exec_cb cb)
{
tracef("leader exec id:%" PRIu64, id);
int rv;
if (l->exec != NULL) {
tracef("busy");
return SQLITE_BUSY;
}
l->exec = req;
req->leader = l;
req->stmt = stmt;
req->id = id;
req->cb = cb;
req->barrier.data = req;
req->barrier.cb = NULL;
req->work = (pool_work_t){};
rv = leader__barrier(l, &req->barrier, execBarrierCb);
if (rv != 0) {
l->exec = NULL;
return rv;
}
return 0;
}
static void raftBarrierCb(struct raft_barrier *req, int status)
{
tracef("raft barrier cb status %d", status);
struct barrier *barrier = req->data;
int rv = 0;
if (status != 0) {
if (status == RAFT_LEADERSHIPLOST) {
rv = SQLITE_IOERR_LEADERSHIP_LOST;
} else {
rv = SQLITE_ERROR;
}
}
barrier_cb cb = barrier->cb;
if (cb == NULL) {
tracef("barrier cb already fired");
return;
}
barrier->cb = NULL;
cb(barrier, rv);
}
int leader__barrier(struct leader *l, struct barrier *barrier, barrier_cb cb)
{
tracef("leader barrier");
int rv;
if (!needsBarrier(l)) {
tracef("not needed");
cb(barrier, 0);
return 0;
}
barrier->cb = cb;
barrier->leader = l;
barrier->req.data = barrier;
rv = raft_barrier(l->raft, &barrier->req, raftBarrierCb);
if (rv != 0) {
tracef("raft barrier failed %d", rv);
barrier->req.data = NULL;
barrier->leader = NULL;
barrier->cb = NULL;
return rv;
}
return 0;
}
dqlite-1.16.7/src/leader.h 0000664 0000000 0000000 00000006364 14652527134 0015306 0 ustar 00root root 0000000 0000000 /**
* Track the state of leader connection and execute statements asynchronously.
*/
#ifndef LEADER_H_
#define LEADER_H_
#include <sqlite3.h>
#include <stdint.h>
#include "./lib/queue.h"
#include "db.h"
#include "lib/threadpool.h"
#include "raft.h"
#define SQLITE_IOERR_NOT_LEADER (SQLITE_IOERR | (40 << 8))
#define SQLITE_IOERR_LEADERSHIP_LOST (SQLITE_IOERR | (41 << 8))
struct exec;
struct barrier;
struct leader;
typedef void (*exec_cb)(struct exec *req, int status);
typedef void (*barrier_cb)(struct barrier *req, int status);
/* Wrapper around raft_apply, saving context information. */
struct apply {
struct raft_apply req; /* Raft apply request */
int status; /* Raft apply result */
struct leader *leader; /* Leader connection that triggered the hook */
int type; /* Command type */
union { /* Command-specific data */
struct {
bool is_commit;
} frames;
};
};
struct leader {
struct db *db; /* Database this connection operates on. */
sqlite3 *conn; /* Underlying SQLite connection. */
struct raft *raft; /* Raft instance. */
struct exec *exec; /* Exec request in progress, if any. */
queue queue; /* Prev/next leader, used by struct db. */
struct apply *inflight; /* TODO: make leader__close async */
};
struct barrier {
void *data;
struct leader *leader;
struct raft_barrier req;
barrier_cb cb;
};
/**
* Asynchronous request to execute a statement.
*/
struct exec {
void *data;
struct leader *leader;
struct barrier barrier;
sqlite3_stmt *stmt;
uint64_t id;
int status;
queue queue;
exec_cb cb;
pool_work_t work;
};
/**
* Initialize a new leader connection.
*
* This function will start the leader loop coroutine and pause it immediately,
* transferring control back to the main coroutine and then opening a new leader
* connection against the given database.
*/
int leader__init(struct leader *l, struct db *db, struct raft *raft);
void leader__close(struct leader *l);
/**
* Submit a request to step a SQLite statement.
*
* The request will be dispatched to the leader loop coroutine, which will be
* resumed and will invoke sqlite3_step(). If the statement triggers the
* replication hooks and one or more new Raft log entries need to be appended,
* then the loop coroutine will be paused and control will be transferred back
* to the main coroutine. In this state the leader loop coroutine call stack
* will be "blocked" on the xFrames() replication hook call triggered by the top
* sqlite3_step() call. The leader loop coroutine will be resumed once the Raft
* append request completes (either successfully or not) and at that point the
* stack will rewind back to the sqlite3_step() call, returning to the leader
* loop which will then have completed the request and transfer control back to
* the main coroutine, pausing until the next request.
*/
int leader__exec(struct leader *l,
struct exec *req,
sqlite3_stmt *stmt,
uint64_t id,
exec_cb cb);
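/* Usage sketch (illustrative only; error handling elided and the names below
 * are assumptions for the example):
 *
 *	static void exec_done(struct exec *req, int status)
 *	{
 *		// status holds an SQLite-style code, e.g. SQLITE_DONE.
 *	}
 *
 *	rv = leader__exec(l, &exec_req, stmt, idNext(&state), exec_done);
 *	if (rv == SQLITE_BUSY) {
 *		// Another exec request is already in flight on this leader.
 *	}
 */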
/**
* Submit a raft barrier request if there is no transaction in progress in the
* underlying database and the FSM is behind the last log index.
*
* Otherwise, just invoke the given @cb immediately.
*/
int leader__barrier(struct leader *l, struct barrier *barrier, barrier_cb cb);
#endif /* LEADER_H_*/
dqlite-1.16.7/src/lib/ 0000775 0000000 0000000 00000000000 14652527134 0014436 5 ustar 00root root 0000000 0000000 dqlite-1.16.7/src/lib/addr.c 0000664 0000000 0000000 00000005133 14652527134 0015516 0 ustar 00root root 0000000 0000000 #include "addr.h"
#include <netdb.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/un.h>
#include "../../include/dqlite.h"
int AddrParse(const char *input,
struct sockaddr *addr,
socklen_t *addr_len,
const char *service,
int flags)
{
int rv;
char *node = NULL;
size_t input_len = strlen(input);
char c = input[0];
struct sockaddr_un *addr_un;
const char *name, *addr_start, *close_bracket, *colon;
size_t name_len;
struct addrinfo hints, *res;
if (c == '@') {
/* Unix domain address.
* FIXME the use of the "abstract namespace" here is
* Linux-specific */
if (!(flags & DQLITE_ADDR_PARSE_UNIX)) {
return DQLITE_MISUSE;
}
addr_un = (struct sockaddr_un *)addr;
if (*addr_len < sizeof(*addr_un)) {
return DQLITE_ERROR;
}
name = input + 1;
name_len = input_len - 1;
if (name_len == 0) {
/* Autogenerated abstract socket name */
addr_un->sun_family = AF_UNIX;
*addr_len = sizeof(addr_un->sun_family);
return 0;
}
/* Leading null byte, no trailing null byte */
if (name_len + 1 > sizeof(addr_un->sun_path)) {
return DQLITE_ERROR;
}
memset(addr_un->sun_path, 0, sizeof(addr_un->sun_path));
memcpy(addr_un->sun_path + 1, name, name_len);
addr_un->sun_family = AF_UNIX;
*addr_len = (socklen_t)offsetof(struct sockaddr_un, sun_path) +
(socklen_t)name_len + 1;
return 0;
} else if (c == '[') {
/* IPv6 address with port */
addr_start = input + 1;
close_bracket = memchr(input, ']', input_len);
if (!close_bracket) {
return DQLITE_ERROR;
}
colon = close_bracket + 1;
if (*colon != ':') {
return DQLITE_ERROR;
}
service = colon + 1;
node =
strndup(addr_start, (size_t)(close_bracket - addr_start));
} else if (memchr(input, '.', input_len)) {
/* IPv4 address */
colon = memchr(input, ':', input_len);
if (colon) {
service = colon + 1;
node = strndup(input, (size_t)(colon - input));
} else {
node = strdup(input);
}
} else {
/* IPv6 address without port */
node = strdup(input);
}
if (!node) {
return DQLITE_NOMEM;
}
memset(&hints, 0, sizeof(hints));
hints.ai_family = AF_UNSPEC;
hints.ai_socktype = SOCK_STREAM;
hints.ai_flags = AI_NUMERICHOST | AI_NUMERICSERV;
rv = getaddrinfo(node, service, &hints, &res);
if (rv != 0) {
rv = DQLITE_ERROR;
goto err_after_strdup;
}
if (res->ai_addrlen > *addr_len) {
rv = DQLITE_ERROR;
goto err_after_getaddrinfo;
}
memcpy(addr, res->ai_addr, res->ai_addrlen);
*addr_len = res->ai_addrlen;
err_after_getaddrinfo:
freeaddrinfo(res);
err_after_strdup:
free(node);
return rv;
}
dqlite-1.16.7/src/lib/addr.h 0000664 0000000 0000000 00000001700 14652527134 0015517 0 ustar 00root root 0000000 0000000 #ifndef ADDR_H_
#define ADDR_H_
#include <sys/socket.h>
enum {
/* Parse Unix socket addresses in @ notation */
DQLITE_ADDR_PARSE_UNIX = 1 << 0
};
/** Parse a socket address from the string @input.
*
* On success, the resulting address is placed in @addr, and its size is placed
* in @addr_len. If @addr is not large enough (based on the initial value of
* @addr_len) to hold the result, DQLITE_ERROR is returned.
*
* @service should be a string representing a port number, e.g. "8080".
*
* @flags customizes the behavior of the function. Currently the only flag is
* DQLITE_ADDR_PARSE_UNIX: when this is ORed in @flags, AddrParse will also
* parse Unix socket addresses in the form `@NAME`, where NAME may be empty.
* This creates a socket address in the (Linux-specific) "abstract namespace".
*/
int AddrParse(const char *input,
struct sockaddr *addr,
socklen_t *addr_len,
const char *service,
int flags);
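/* Usage sketch (illustrative only; the variable names are assumptions):
 *
 *	struct sockaddr_storage addr;
 *	socklen_t addr_len = sizeof(addr);
 *	int rv = AddrParse("127.0.0.1:9001", (struct sockaddr *)&addr,
 *			   &addr_len, "8080", DQLITE_ADDR_PARSE_UNIX);
 *	// The "8080" default service is only used when the input string
 *	// carries no explicit port.
 */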
#endif
dqlite-1.16.7/src/lib/assert.h 0000664 0000000 0000000 00000001715 14652527134 0016114 0 ustar 00root root 0000000 0000000 /**
* Define the assert() macro, either as the standard one or the test one.
*/
#ifndef LIB_ASSERT_H_
#define LIB_ASSERT_H_
#if defined(DQLITE_TEST)
#include "../../test/lib/munit.h"
#define assert(expr) munit_assert(expr)
#elif defined(DQLITE_ASSERT_WITH_BACKTRACE)
#include <assert.h> /* for __assert_fail */
#include <backtrace.h>
#include <stdio.h>
#undef assert
#define assert(x) \
do { \
struct backtrace_state *state_; \
if (!(x)) { \
state_ = backtrace_create_state(NULL, 0, NULL, NULL); \
backtrace_print(state_, 0, stderr); \
__assert_fail(#x, __FILE__, __LINE__, __func__); \
} \
} while (0)
#else
#include <assert.h>
#endif
#endif /* LIB_ASSERT_H_ */
dqlite-1.16.7/src/lib/buffer.c 0000664 0000000 0000000 00000002631 14652527134 0016055 0 ustar 00root root 0000000 0000000 #include <stdbool.h>
#include <stdlib.h>
#include <unistd.h>
#include "buffer.h"
#include "../../include/dqlite.h"
/* How large is the buffer currently */
#define SIZE(B) (B->n_pages * B->page_size)
/* How many spare bytes the buffer currently has */
#define CAP(B) (SIZE(B) - B->offset)
int buffer__init(struct buffer *b)
{
b->page_size = (unsigned)sysconf(_SC_PAGESIZE);
b->n_pages = 1;
b->data = malloc(SIZE(b));
if (b->data == NULL) {
return DQLITE_NOMEM;
}
b->offset = 0;
return 0;
}
void buffer__close(struct buffer *b)
{
free(b->data);
}
/* Ensure that the buffer has at least @size spare bytes */
static inline bool ensure(struct buffer *b, size_t size)
{
void *data;
uint32_t n_pages = b->n_pages;
/* Double the buffer until we have enough capacity */
while (size > CAP(b)) {
b->n_pages *= 2;
}
/* Grow the allocation if the capacity was insufficient */
if (b->n_pages > n_pages) {
data = realloc(b->data, SIZE(b));
if (data == NULL) {
b->n_pages = n_pages;
return false;
}
b->data = data;
}
return true;
}
void *buffer__advance(struct buffer *b, size_t size)
{
void *cursor;
if (!ensure(b, size)) {
return NULL;
}
cursor = buffer__cursor(b, b->offset);
b->offset += size;
return cursor;
}
size_t buffer__offset(struct buffer *b)
{
return b->offset;
}
void *buffer__cursor(struct buffer *b, size_t offset)
{
return b->data + offset;
}
void buffer__reset(struct buffer *b)
{
b->offset = 0;
}
dqlite-1.16.7/src/lib/buffer.h 0000664 0000000 0000000 00000003023 14652527134 0016056 0 ustar 00root root 0000000 0000000 /**
* A dynamic buffer which can grow as needed when writing to it.
*
* The buffer size is always a multiple of the OS virtual memory page size, so
* resizing the buffer *should* not cause its memory to be copied.
*
* See https://stackoverflow.com/questions/16765389
*
* TODO: consider using mremap.
*/
#ifndef LIB_BUFFER_H_
#define LIB_BUFFER_H_
#include <stddef.h>
#include "../../include/dqlite.h"
struct buffer
{
void *data; /* Allocated buffer */
unsigned page_size; /* Size of an OS page */
unsigned n_pages; /* Number of pages allocated */
size_t offset; /* Next byte to write in the buffer */
};
/**
* Initialize the buffer. It will initially have 1 memory page.
*/
DQLITE_VISIBLE_TO_TESTS int buffer__init(struct buffer *b);
/**
* Release the memory of the buffer.
*/
DQLITE_VISIBLE_TO_TESTS void buffer__close(struct buffer *b);
/**
* Return a write cursor pointing to the next byte to write, ensuring that the
* buffer has at least @size spare bytes.
*
* Return #NULL in case of out-of-memory errors.
*/
DQLITE_VISIBLE_TO_TESTS void *buffer__advance(struct buffer *b, size_t size);
/**
* Return the offset of next byte to write.
*/
DQLITE_VISIBLE_TO_TESTS size_t buffer__offset(struct buffer *b);
/**
* Return a write cursor pointing to the @offset'th byte of the buffer.
*/
DQLITE_VISIBLE_TO_TESTS void *buffer__cursor(struct buffer *b, size_t offset);
/**
* Reset the write offset of the buffer.
*/
DQLITE_VISIBLE_TO_TESTS void buffer__reset(struct buffer *b);
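/* Usage sketch (illustrative only):
 *
 *	struct buffer b;
 *	if (buffer__init(&b) != 0) {
 *		// out of memory
 *	}
 *	uint64_t *slot = buffer__advance(&b, sizeof(uint64_t));
 *	if (slot != NULL) {
 *		*slot = 42; // written at the old offset; offset now advanced
 *	}
 *	buffer__reset(&b); // start writing from the beginning again
 *	buffer__close(&b);
 */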
#endif /* LIB_BUFFER_H_ */
dqlite-1.16.7/src/lib/byte.h 0000664 0000000 0000000 00000006130 14652527134 0015552 0 ustar 00root root 0000000 0000000 #ifndef LIB_BYTE_H_
#define LIB_BYTE_H_
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#if defined(__cplusplus)
#define DQLITE_INLINE inline
#else
#define DQLITE_INLINE static inline
#endif
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
#define DQLITE_LITTLE_ENDIAN
#elif defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#define DQLITE_BIG_ENDIAN
#endif
#if defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 8
#define DQLITE_HAVE_BSWAP
#endif
/* Flip a 16-bit number to little-endian byte order */
DQLITE_INLINE uint16_t ByteFlipLe16(uint16_t v)
{
#if defined(DQLITE_LITTLE_ENDIAN)
return v;
#elif defined(DQLITE_BIG_ENDIAN) && defined(DQLITE_HAVE_BSWAP)
return __builtin_bswap16(v);
#else
union {
uint16_t u;
uint8_t v[2];
} s;
s.v[0] = (uint8_t)v;
s.v[1] = (uint8_t)(v >> 8);
return s.u;
#endif
}
/* Flip a 32-bit number to little-endian byte order */
DQLITE_INLINE uint32_t ByteFlipLe32(uint32_t v)
{
#if defined(DQLITE_LITTLE_ENDIAN)
return v;
#elif defined(DQLITE_BIG_ENDIAN) && defined(DQLITE_HAVE_BSWAP)
return __builtin_bswap32(v);
#else
union {
uint32_t u;
uint8_t v[4];
} s;
s.v[0] = (uint8_t)v;
s.v[1] = (uint8_t)(v >> 8);
s.v[2] = (uint8_t)(v >> 16);
s.v[3] = (uint8_t)(v >> 24);
return s.u;
#endif
}
/* Flip a 64-bit number to little-endian byte order */
DQLITE_INLINE uint64_t ByteFlipLe64(uint64_t v)
{
#if defined(DQLITE_LITTLE_ENDIAN)
return v;
#elif defined(DQLITE_BIG_ENDIAN) && defined(DQLITE_HAVE_BSWAP)
return __builtin_bswap64(v);
#else
union {
uint64_t u;
uint8_t v[8];
} s;
s.v[0] = (uint8_t)v;
s.v[1] = (uint8_t)(v >> 8);
s.v[2] = (uint8_t)(v >> 16);
s.v[3] = (uint8_t)(v >> 24);
s.v[4] = (uint8_t)(v >> 32);
s.v[5] = (uint8_t)(v >> 40);
s.v[6] = (uint8_t)(v >> 48);
s.v[7] = (uint8_t)(v >> 56);
return s.u;
#endif
}
/* -Wconversion before GCC 10 is overly sensitive. */
#if defined(__GNUC__) && __GNUC__ < 10
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wconversion"
#endif
DQLITE_INLINE uint16_t ByteGetBe16(const uint8_t *buf)
{
uint16_t x = buf[0];
uint16_t y = buf[1];
x <<= 8;
return x | y;
}
DQLITE_INLINE uint32_t ByteGetBe32(const uint8_t *buf)
{
uint32_t w = buf[0];
uint32_t x = buf[1];
uint32_t y = buf[2];
uint32_t z = buf[3];
w <<= 24;
x <<= 16;
y <<= 8;
return w | x | y | z;
}
DQLITE_INLINE uint32_t ByteGetLe32(const uint8_t *buf)
{
uint32_t w = buf[0];
uint32_t x = buf[1];
uint32_t y = buf[2];
uint32_t z = buf[3];
z <<= 24;
y <<= 16;
x <<= 8;
return w | x | y | z;
}
DQLITE_INLINE void BytePutBe32(uint32_t v, uint8_t *buf)
{
buf[0] = (uint8_t)(v >> 24);
buf[1] = (uint8_t)(v >> 16);
buf[2] = (uint8_t)(v >> 8);
buf[3] = (uint8_t)v;
}
/**
* Add padding to size if it's not a multiple of 8. E.g. if 11 is passed, 16 is
* returned.
*/
DQLITE_INLINE size_t BytePad64(size_t size)
{
size_t rest = size % sizeof(uint64_t);
if (rest != 0) {
size += sizeof(uint64_t) - rest;
}
return size;
}
#define ARRAY_SIZE(a) ((sizeof(a)) / (sizeof(a)[0]))
#if defined(__GNUC__) && __GNUC__ < 10
#pragma GCC diagnostic pop
#endif
#endif /* LIB_BYTE_H_ */
dqlite-1.16.7/src/lib/fs.c 0000664 0000000 0000000 00000001545 14652527134 0015217 0 ustar 00root root 0000000 0000000 #include <ftw.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include "../tracing.h"
#include "fs.h"
int FsEnsureDir(const char *path)
{
int rv;
struct stat st = {0};
rv = stat(path, &st);
if (rv == 0) {
if (!S_ISDIR(st.st_mode)) {
tracef("%s is not a directory", path);
return -1;
}
}
/* Directory does not exist */
if (rv == -1) {
return mkdir(path, 0755);
}
return 0;
}
static int fsRemoveDirFilesNftwFn(const char *path,
const struct stat *sb,
int type,
struct FTW *ftwb)
{
int rv;
(void)sb;
(void)type;
(void)ftwb;
rv = 0;
/* Don't remove directory */
if (S_ISREG(sb->st_mode)) {
rv = remove(path);
}
return rv;
}
int FsRemoveDirFiles(const char *path)
{
int rv;
rv = nftw(path, fsRemoveDirFilesNftwFn, 10,
FTW_DEPTH | FTW_MOUNT | FTW_PHYS);
return rv;
}
dqlite-1.16.7/src/lib/fs.h 0000664 0000000 0000000 00000000374 14652527134 0015223 0 ustar 00root root 0000000 0000000 #ifndef DQLITE_LIB_FS_H
#define DQLITE_LIB_FS_H
/* Create a directory if it does not already exist. */
int FsEnsureDir(const char *path);
/* Removes all files from a directory. */
int FsRemoveDirFiles(const char *path);
#endif /* DQLITE_LIB_FS_H */
dqlite-1.16.7/src/lib/queue.h 0000664 0000000 0000000 00000003235 14652527134 0015736 0 ustar 00root root 0000000 0000000 #ifndef LIB_QUEUE_H_
#define LIB_QUEUE_H_
#include <stddef.h> /* offsetof */
struct queue
{
struct queue *next;
struct queue *prev;
};
typedef struct queue queue;
#define QUEUE_DATA(e, type, field) \
((type *)((void *)((char *)(e)-offsetof(type, field))))
#define QUEUE_FOREACH(q, h) for ((q) = (h)->next; (q) != (h); (q) = (q)->next)
static inline void queue_init(struct queue *q)
{
q->next = q;
q->prev = q;
}
static inline int queue_empty(const struct queue *q)
{
return q == q->next;
}
static inline struct queue *queue_head(const struct queue *q)
{
return q->next;
}
static inline struct queue *queue_next(const struct queue *q)
{
return q->next;
}
static inline struct queue *queue_tail(const struct queue *q)
{
return q->prev;
}
static inline void queue_add(struct queue *h, struct queue *n)
{
h->prev->next = n->next;
n->next->prev = h->prev;
h->prev = n->prev;
h->prev->next = h;
}
static inline void queue_split(struct queue *h,
struct queue *q,
struct queue *n)
{
n->prev = h->prev;
n->prev->next = n;
n->next = q;
h->prev = q->prev;
h->prev->next = h;
q->prev = n;
}
static inline void queue_move(struct queue *h, struct queue *n)
{
if (queue_empty(h))
queue_init(n);
else
queue_split(h, h->next, n);
}
static inline void queue_insert_head(struct queue *h, struct queue *q)
{
q->next = h->next;
q->prev = h;
q->next->prev = q;
h->next = q;
}
static inline void queue_insert_tail(struct queue *h, struct queue *q)
{
q->next = h;
q->prev = h->prev;
q->prev->next = q;
h->prev = q;
}
static inline void queue_remove(struct queue *q)
{
q->prev->next = q->next;
q->next->prev = q->prev;
}
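/* Usage sketch (illustrative only; the struct and variable names are
 * assumptions for the example). The queue is intrusive: each element embeds a
 * link node, and QUEUE_DATA recovers the containing element from its link.
 *
 *	struct item {
 *		int value;
 *		queue link;
 *	};
 *
 *	queue head;
 *	queue_init(&head);
 *	queue_insert_tail(&head, &first->link);
 *	queue *q;
 *	QUEUE_FOREACH(q, &head) {
 *		struct item *it = QUEUE_DATA(q, struct item, link);
 *		// use it->value
 *	}
 */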
#endif /* LIB_QUEUE_H_*/
dqlite-1.16.7/src/lib/registry.h 0000664 0000000 0000000 00000034244 14652527134 0016466 0 ustar 00root root 0000000 0000000 #ifndef LIB_REGISTRY_H_
#define LIB_REGISTRY_H_
#include <sqlite3.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include "../../include/dqlite.h"
#include "assert.h"
#define DQLITE_NOTFOUND 1002
/**
* Define a type-safe registry able to allocate and lookup items of a given
* type.
*
* The item TYPE is required to implement three methods: TYPE##_init,
* TYPE##_close and TYPE##_hash.
*/
#define REGISTRY(NAME, TYPE) \
\
struct NAME \
{ \
struct TYPE **buf; /* Array of registry item slots */ \
size_t len; /* Index of the highest used slot */ \
size_t cap; /* Total number of slots */ \
}; \
\
/* Initialize the registry. */ \
void NAME##_init(struct NAME *r); \
\
/* Close the registry. */ \
void NAME##_close(struct NAME *r); \
\
/* Add an item to the registry. \
* \
* Return a pointer to a newly allocated and initialized item. \
* The "id" field of the item will be set to a unique value \
* identifying the item in the registry. */ \
int NAME##_add(struct NAME *r, struct TYPE **item); \
\
/* Given its ID, retrieve an item previously added to the \
* registry. */ \
struct TYPE *NAME##_get(struct NAME *r, size_t id); \
\
/* Get the index of the first item matching the given hash key. Return \
* 0 on success and DQLITE_NOTFOUND otherwise. */ \
int NAME##_idx(struct NAME *r, const char *key, size_t *i); \
\
/* Delete a previously added item. */ \
int NAME##_del(struct NAME *r, struct TYPE *item)
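/* Example (hypothetical "foo" type, for illustration only): given a struct
 * foo with a size_t id field and foo__init, foo__close and foo__hash
 * functions, a registry is declared and used as follows.
 *
 *	REGISTRY(foo_registry, foo);         // in a header
 *	REGISTRY_METHODS(foo_registry, foo); // in a .c file
 *
 *	struct foo_registry r;
 *	struct foo *item;
 *	foo_registry_init(&r);
 *	if (foo_registry_add(&r, &item) == 0) {
 *		// item->id identifies the slot; retrieve the item later with
 *		// foo_registry_get(&r, item->id).
 *	}
 *	foo_registry_close(&r);
 */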
/**
* Define the methods of a registry
*/
#define REGISTRY_METHODS(NAME, TYPE) \
void NAME##_init(struct NAME *r) \
{ \
assert(r != NULL); \
\
r->buf = NULL; \
r->len = 0; \
r->cap = 0; \
} \
\
void NAME##_close(struct NAME *r) \
{ \
size_t i; \
struct TYPE *item; \
\
assert(r != NULL); \
\
/* Loop through all items currently in the registry, \
* and close them. */ \
for (i = 0; i < r->len; i++) { \
item = *(r->buf + i); \
/* Some slots may have been deleted, so we need \
* to check if the slot is actually used. */ \
if (item != NULL) { \
TYPE##_close(item); \
sqlite3_free(item); \
} \
} \
\
r->len = 0; \
r->cap = 0; \
if (r->buf != NULL) { \
sqlite3_free(r->buf); \
r->buf = NULL; \
} \
} \
\
int NAME##_add(struct NAME *r, struct TYPE **item) \
{ \
struct TYPE **buf; \
size_t cap; \
size_t i; \
\
assert(r != NULL); \
assert(item != NULL); \
\
/* Check if there is an unallocated slot. */ \
for (i = 0; i < r->len; i++) { \
if (*(r->buf + i) == NULL) { \
goto ok_slot; \
} \
} \
\
/* There are no unallocated slots. */ \
assert(i == r->len); \
\
/* If we are full, then double the capacity. */ \
if (r->len + 1 > r->cap) { \
cap = (r->cap == 0) ? 1 : r->cap * 2; \
buf = sqlite3_realloc(r->buf, \
(int)(cap * sizeof(*r->buf))); \
if (buf == NULL) { \
return DQLITE_NOMEM; \
} \
r->buf = buf; \
r->cap = cap; \
} \
r->len++; \
\
ok_slot: \
assert(i < r->len); \
\
/* Allocate and initialize the new item */ \
*item = sqlite3_malloc(sizeof **item); \
if (*item == NULL) \
return DQLITE_NOMEM; \
\
(*item)->id = i; \
\
TYPE##_init(*item); \
\
/* Save the item in its registry slot */ \
*(r->buf + i) = *item; \
\
return 0; \
} \
\
struct TYPE *NAME##_get(struct NAME *r, size_t id) \
{ \
struct TYPE *item; \
size_t i = id; \
\
assert(r != NULL); \
\
if (i >= r->len) { \
return NULL; \
} \
\
item = *(r->buf + i); \
\
assert(item->id == id); \
\
return item; \
} \
\
int NAME##_idx(struct NAME *r, const char *key, size_t *i) \
{ \
struct TYPE *item; \
\
assert(r != NULL); \
assert(key != NULL); \
assert(i != NULL); \
\
for (*i = 0; *i < r->len; (*i)++) { \
const char *hash; \
\
item = *(r->buf + *i); \
\
if (item == NULL) { \
continue; \
} \
\
hash = TYPE##_hash(item); \
\
if (hash != NULL && strcmp(hash, key) == 0) { \
return 0; \
} \
} \
\
return DQLITE_NOTFOUND; \
} \
\
int NAME##_del(struct NAME *r, struct TYPE *item) \
{ \
struct TYPE **buf; \
size_t cap; \
size_t i = item->id; \
\
assert(r != NULL); \
\
if (i >= r->len) { \
return DQLITE_NOTFOUND; \
} \
\
/* Check that the item address actually matches the one \
* we have in the registry */ \
if (*(r->buf + i) != item) { \
return DQLITE_NOTFOUND; \
} \
\
TYPE##_close(item); \
sqlite3_free(item); \
\
*(r->buf + i) = NULL; \
\
/* If this was the last item in the registry buffer, \
* decrease the length. */ \
if (i == r->len - 1) { \
r->len--; \
} \
\
/* If the new length is less than half of the capacity, \
* try to shrink the registry. */ \
if (r->len < (r->cap / 2)) { \
cap = r->cap / 2; \
buf = sqlite3_realloc(r->buf, \
(int)(cap * sizeof *r->buf)); \
if (buf != NULL) { \
r->buf = buf; \
r->cap = cap; \
} \
} \
\
return 0; \
}
#endif /* LIB_REGISTRY_H_ */
dqlite-1.16.7/src/lib/serialize.h 0000664 0000000 0000000 00000020173 14652527134 0016601 0 ustar 00root root 0000000 0000000 #ifndef LIB_SERIALIZE_H_
#define LIB_SERIALIZE_H_
#include <stdint.h>
#include <string.h>
#include <uv.h>
#include "../../include/dqlite.h"
#include "assert.h"
#include "byte.h"
#define DQLITE_PARSE 1005
/**
* The size in bytes of a single serialized word.
*/
#define SERIALIZE__WORD_SIZE 8
/* We rely on the size of double to be 64 bit, since that's what is sent over
* the wire.
*
* See https://stackoverflow.com/questions/752309/ensuring-c-doubles-are-64-bits
*/
#ifndef __STDC_IEC_559__
#if __SIZEOF_DOUBLE__ != 8
#error "Requires IEEE 754 floating point!"
#endif
#endif
#ifdef static_assert
static_assert(sizeof(double) == sizeof(uint64_t),
"Size of 'double' is not 64 bits");
#endif
/**
* Basic type aliases to used by macro-based processing.
*/
typedef const char *text_t;
typedef double float_t;
typedef uv_buf_t blob_t;
/**
* Cursor to progressively read a buffer.
*/
struct cursor
{
const char *p; /* Next byte to read */
size_t cap; /* Number of bytes left in the buffer */
};
/**
* Define a serializable struct.
*
* NAME: Name of the structure which will be defined.
* FIELDS: List of X-based macros defining the fields in the schema, in the form
* of X(KIND, NAME, ##__VA_ARGS__). E.g. X(uint64, id, ##__VA_ARGS__).
*
* A new struct called NAME will be defined, along with sizeof, encode and
* decode functions.
*/
#define SERIALIZE__DEFINE(NAME, FIELDS) \
SERIALIZE__DEFINE_STRUCT(NAME, FIELDS); \
SERIALIZE__DEFINE_METHODS(NAME, FIELDS)
#define SERIALIZE__DEFINE_STRUCT(NAME, FIELDS) \
struct NAME \
{ \
FIELDS(SERIALIZE__DEFINE_FIELD) \
}
#define SERIALIZE__DEFINE_METHODS(NAME, FIELDS) \
size_t NAME##__sizeof(const struct NAME *p); \
void NAME##__encode(const struct NAME *p, char **cursor); \
int NAME##__decode(struct cursor *cursor, struct NAME *p)
/* Define a single field in a serializable struct.
*
* KIND: Type code (e.g. uint64, text, etc).
* MEMBER: Field name. */
#define SERIALIZE__DEFINE_FIELD(KIND, MEMBER) KIND##_t MEMBER;
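/* Example (hypothetical schema, for illustration only): a two-field message
 * would be declared in a header and implemented in a .c file as follows.
 *
 *	#define PERSON(X, ...)               \
 *		X(uint64, id, ##__VA_ARGS__) \
 *		X(text, name, ##__VA_ARGS__)
 *
 *	SERIALIZE__DEFINE(person, PERSON);    // struct person + prototypes
 *	SERIALIZE__IMPLEMENT(person, PERSON); // sizeof/encode/decode bodies
 */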
/**
* Implement the sizeof, encode and decode function of a serializable struct.
*/
#define SERIALIZE__IMPLEMENT(NAME, FIELDS) \
size_t NAME##__sizeof(const struct NAME *p) \
{ \
size_t size = 0; \
FIELDS(SERIALIZE__SIZEOF_FIELD, p); \
return size; \
} \
void NAME##__encode(const struct NAME *p, char **cursor) \
{ \
FIELDS(SERIALIZE__ENCODE_FIELD, p, cursor); \
} \
int NAME##__decode(struct cursor *cursor, struct NAME *p) \
{ \
int rc; \
FIELDS(SERIALIZE__DECODE_FIELD, p, cursor); \
return 0; \
}
#define SERIALIZE__SIZEOF_FIELD(KIND, MEMBER, P) \
size += KIND##__sizeof(&((P)->MEMBER));
#define SERIALIZE__ENCODE_FIELD(KIND, MEMBER, P, CURSOR) \
KIND##__encode(&((P)->MEMBER), CURSOR);
#define SERIALIZE__DECODE_FIELD(KIND, MEMBER, P, CURSOR) \
rc = KIND##__decode(CURSOR, &((P)->MEMBER)); \
if (rc != 0) { \
return rc; \
}
DQLITE_INLINE size_t uint8__sizeof(const uint8_t *value)
{
(void)value;
return sizeof(uint8_t);
}
DQLITE_INLINE size_t uint16__sizeof(const uint16_t *value)
{
(void)value;
return sizeof(uint16_t);
}
DQLITE_INLINE size_t uint32__sizeof(const uint32_t *value)
{
(void)value;
return sizeof(uint32_t);
}
DQLITE_INLINE size_t uint64__sizeof(const uint64_t *value)
{
(void)value;
return sizeof(uint64_t);
}
DQLITE_INLINE size_t int64__sizeof(const int64_t *value)
{
(void)value;
return sizeof(int64_t);
}
DQLITE_INLINE size_t float__sizeof(const float_t *value)
{
(void)value;
return sizeof(double);
}
DQLITE_INLINE size_t text__sizeof(const text_t *value)
{
return BytePad64(strlen(*value) + 1);
}
DQLITE_INLINE size_t blob__sizeof(const blob_t *value)
{
/* length + data */
return sizeof(uint64_t) + BytePad64(value->len);
}
DQLITE_INLINE void uint8__encode(const uint8_t *value, char **cursor)
{
*(uint8_t *)(*cursor) = *value;
*cursor += sizeof(uint8_t);
}
DQLITE_INLINE void uint16__encode(const uint16_t *value, char **cursor)
{
uint16_t x = ByteFlipLe16(*value);
memcpy(*cursor, &x, sizeof(uint16_t));
*cursor += sizeof(uint16_t);
}
DQLITE_INLINE void uint32__encode(const uint32_t *value, char **cursor)
{
uint32_t x = ByteFlipLe32(*value);
memcpy(*cursor, &x, sizeof(uint32_t));
*cursor += sizeof(uint32_t);
}
DQLITE_INLINE void uint64__encode(const uint64_t *value, char **cursor)
{
uint64_t x = ByteFlipLe64(*value);
memcpy(*cursor, &x, sizeof(uint64_t));
*cursor += sizeof(uint64_t);
}
DQLITE_INLINE void int64__encode(const int64_t *value, char **cursor)
{
int64_t x = (int64_t)ByteFlipLe64((uint64_t)*value);
memcpy(*cursor, &x, sizeof(int64_t));
*cursor += sizeof(int64_t);
}
DQLITE_INLINE void float__encode(const float_t *value, char **cursor)
{
uint64_t x = ByteFlipLe64(*(uint64_t *)value);
memcpy(*cursor, &x, sizeof(uint64_t));
*cursor += sizeof(uint64_t);
}
DQLITE_INLINE void text__encode(const text_t *value, char **cursor)
{
size_t len = BytePad64(strlen(*value) + 1);
memset(*cursor, 0, len);
strcpy(*cursor, *value);
*cursor += len;
}
DQLITE_INLINE void blob__encode(const blob_t *value, char **cursor)
{
size_t len = BytePad64(value->len);
uint64_t value_len = value->len;
uint64__encode(&value_len, cursor);
memcpy(*cursor, value->base, value->len);
*cursor += len;
}
DQLITE_INLINE int uint8__decode(struct cursor *cursor, uint8_t *value)
{
size_t n = sizeof(uint8_t);
if (n > cursor->cap) {
return DQLITE_PARSE;
}
*value = *(uint8_t *)cursor->p;
cursor->p += n;
cursor->cap -= n;
return 0;
}
DQLITE_INLINE int uint16__decode(struct cursor *cursor, uint16_t *value)
{
size_t n = sizeof(uint16_t);
if (n > cursor->cap) {
return DQLITE_PARSE;
}
memcpy(value, cursor->p, sizeof(*value));
*value = ByteFlipLe16(*value);
cursor->p += n;
cursor->cap -= n;
return 0;
}
DQLITE_INLINE int uint32__decode(struct cursor *cursor, uint32_t *value)
{
size_t n = sizeof(uint32_t);
if (n > cursor->cap) {
return DQLITE_PARSE;
}
memcpy(value, cursor->p, sizeof(*value));
*value = ByteFlipLe32(*value);
cursor->p += n;
cursor->cap -= n;
return 0;
}
DQLITE_INLINE int uint64__decode(struct cursor *cursor, uint64_t *value)
{
size_t n = sizeof(uint64_t);
if (n > cursor->cap) {
return DQLITE_PARSE;
}
memcpy(value, cursor->p, sizeof(*value));
*value = ByteFlipLe64(*value);
cursor->p += n;
cursor->cap -= n;
return 0;
}
DQLITE_INLINE int int64__decode(struct cursor *cursor, int64_t *value)
{
size_t n = sizeof(int64_t);
if (n > cursor->cap) {
return DQLITE_PARSE;
}
memcpy(value, cursor->p, sizeof(*value));
*value = (int64_t)ByteFlipLe64((uint64_t)*value);
cursor->p += n;
cursor->cap -= n;
return 0;
}
DQLITE_INLINE int float__decode(struct cursor *cursor, float_t *value)
{
size_t n = sizeof(double);
if (n > cursor->cap) {
return DQLITE_PARSE;
}
uint64_t x;
memcpy(&x, cursor->p, sizeof(x));
*(uint64_t *)value = ByteFlipLe64(x);
cursor->p += n;
cursor->cap -= n;
return 0;
}
DQLITE_INLINE int text__decode(struct cursor *cursor, text_t *value)
{
/* Find the terminating null byte of the next string, if any. */
size_t len = strnlen(cursor->p, cursor->cap);
size_t n;
if (len == cursor->cap) {
return DQLITE_PARSE;
}
*value = cursor->p;
n = BytePad64(strlen(*value) + 1);
cursor->p += n;
cursor->cap -= n;
return 0;
}
DQLITE_INLINE int blob__decode(struct cursor *cursor, blob_t *value)
{
uint64_t len;
size_t n;
int rv;
rv = uint64__decode(cursor, &len);
if (rv != 0) {
return rv;
}
n = BytePad64((size_t)len);
if (n > cursor->cap) {
return DQLITE_PARSE;
}
value->base = (char *)cursor->p;
value->len = (size_t)len;
cursor->p += n;
cursor->cap -= n;
return 0;
}
#endif /* LIB_SERIALIZE_H_ */
dqlite-1.16.7/src/lib/sm.c 0000664 0000000 0000000 00000004313 14652527134 0015222 0 ustar 00root root 0000000 0000000 #include "sm.h"
#include <inttypes.h>
#include <stdatomic.h>
#include <stddef.h> /* NULL */
#include <stdio.h> /* fprintf */
#include <string.h>
#include <unistd.h>
#include "../tracing.h"
#include "../utils.h"
static bool sm_is_locked(const struct sm *m)
{
return ERGO(m->is_locked, m->is_locked(m));
}
int sm_state(const struct sm *m)
{
PRE(sm_is_locked(m));
return m->state;
}
static inline void sm_obs(const struct sm *m)
{
tracef("%s pid: %d sm_id: %" PRIu64 " %s |\n",
m->name, m->pid, m->id, m->conf[sm_state(m)].name);
}
void sm_relate(const struct sm *from, const struct sm *to)
{
tracef("%s-to-%s opid: %d dpid: %d id: %" PRIu64 " id: %" PRIu64 " |\n",
from->name, to->name, from->pid, to->pid, from->id, to->id);
}
void sm_init(struct sm *m,
bool (*invariant)(const struct sm *, int),
bool (*is_locked)(const struct sm *),
const struct sm_conf *conf,
const char *name,
int state)
{
static atomic_uint_least64_t id = 0;
PRE(conf[state].flags & SM_INITIAL);
m->conf = conf;
m->state = state;
m->invariant = invariant;
m->is_locked = is_locked;
m->id = ++id;
m->pid = getpid();
snprintf(m->name, SM_MAX_NAME_LENGTH, "%s", name);
sm_obs(m);
POST(m->invariant != NULL && m->invariant(m, SM_PREV_NONE));
}
void sm_fini(struct sm *m)
{
PRE(m->invariant != NULL && m->invariant(m, SM_PREV_NONE));
PRE(m->conf[sm_state(m)].flags & SM_FINAL);
}
void sm_move(struct sm *m, int next_state)
{
int prev = sm_state(m);
PRE(sm_is_locked(m));
PRE(m->conf[sm_state(m)].allowed & BITS(next_state));
m->state = next_state;
sm_obs(m);
POST(m->invariant != NULL && m->invariant(m, prev));
}
void sm_fail(struct sm *m, int fail_state, int rc)
{
int prev = sm_state(m);
PRE(sm_is_locked(m));
PRE(rc != 0 && m->rc == 0);
PRE(m->conf[fail_state].flags & SM_FAILURE);
PRE(m->conf[sm_state(m)].allowed & BITS(fail_state));
m->rc = rc;
m->state = fail_state;
POST(m->invariant != NULL && m->invariant(m, prev));
}
static __attribute__((noinline)) bool check_failed(const char *f, int n, const char *s)
{
tracef("%s:%d check failed: %s\n", f, n, s);
return false;
}
bool sm_check(bool b, const char *f, int n, const char *s)
{
if (!b) {
return check_failed(f, n, s);
}
return true;
}
dqlite-1.16.7/src/lib/sm.h 0000664 0000000 0000000 00000002410 14652527134 0015223 0 ustar 00root root 0000000 0000000 #ifndef __LIB_SM__
#define __LIB_SM__
#include <stdbool.h>
#include <stdint.h>
#include <sys/types.h>
#define BITS(state) (1ULL << (state))
#define CHECK(cond) sm_check((cond), __FILE__, __LINE__, #cond)
#define SM_MAX_NAME_LENGTH 50
enum {
SM_PREV_NONE = -1,
/* sizeof(sm_conf::allowed * 8) */
SM_STATES_MAX = 64,
/* flags */
SM_INITIAL = 1U << 0,
SM_FAILURE = 1U << 1,
SM_FINAL = 1U << 2,
};
struct sm_conf
{
uint32_t flags;
uint64_t allowed;
const char *name;
};
struct sm
{
int rc;
int state;
char name[SM_MAX_NAME_LENGTH];
uint64_t id;
pid_t pid;
bool (*is_locked)(const struct sm *);
bool (*invariant)(const struct sm *, int);
const struct sm_conf *conf;
};
void sm_init(struct sm *m,
bool (*invariant)(const struct sm *, int),
/* optional, set NULL if not used */
bool (*is_locked)(const struct sm *),
const struct sm_conf *conf,
const char *name,
int state);
void sm_fini(struct sm *m);
void sm_move(struct sm *m, int next_state);
void sm_fail(struct sm *m, int fail_state, int rc);
int sm_state(const struct sm *m);
bool sm_check(bool b, const char *f, int n, const char *s);
/* Relates one state machine to another for observability. */
void sm_relate(const struct sm *from, const struct sm *to);
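/* Usage sketch (illustrative only; the states, invariant and names below are
 * assumptions for the example, not part of dqlite):
 *
 *	enum { S_INIT, S_DONE };
 *	static const struct sm_conf conf[] = {
 *		[S_INIT] = { .flags = SM_INITIAL, .name = "init",
 *			     .allowed = BITS(S_DONE) },
 *		[S_DONE] = { .flags = SM_FINAL, .name = "done" },
 *	};
 *
 *	static bool example_invariant(const struct sm *m, int prev)
 *	{
 *		return CHECK(sm_state(m) == S_INIT || sm_state(m) == S_DONE);
 *	}
 *
 *	struct sm m;
 *	sm_init(&m, example_invariant, NULL, conf, "example", S_INIT);
 *	sm_move(&m, S_DONE);
 *	sm_fini(&m);
 */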
#endif /* __LIB_SM__ */
dqlite-1.16.7/src/lib/threadpool.c 0000664 0000000 0000000 00000031323 14652527134 0016745 0 ustar 00root root 0000000 0000000 #include "threadpool.h"
#include <assert.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include <uv.h>
#include "../../src/lib/queue.h"
#include "../../src/lib/sm.h"
#include "../../src/utils.h"
#include "../tracing.h"
/**
* Planner thread state machine.
*
* signal() &&
* empty(o) && signal() && exiting
* empty(u) && +-----> NOTHING ----------------> EXITED
* !exiting +------- ^ |
* | |
* empty(o) && | | signal()
* empty(u) | | !empty(o) || !empty(u)
* | |
* | |
* | V
* !empty(o) && +-----> DRAINING
* !empty(u) && +------- ^ |
* type(head(o)) != BAR | |
* | | type(head(o)) == BAR
* ord_in_flight == 0 | |
* | V
* BARRIER --------+ signal()
* ^ | <-------+ ord_in_flight == 0
* | |
* empty(u) | | !empty(u)
* | V
* DRAINING_UNORD
*/
enum planner_states {
PS_NOTHING,
PS_DRAINING,
PS_BARRIER,
PS_DRAINING_UNORD,
PS_EXITED,
PS_NR,
};
static const struct sm_conf planner_states[PS_NR] = {
[PS_NOTHING] = {
.flags = SM_INITIAL,
.name = "nothing",
.allowed = BITS(PS_DRAINING) | BITS(PS_EXITED),
},
[PS_DRAINING] = {
.name = "draining",
.allowed = BITS(PS_DRAINING)
| BITS(PS_NOTHING)
| BITS(PS_BARRIER),
},
[PS_BARRIER] = {
.name = "barrier",
.allowed = BITS(PS_DRAINING_UNORD)
| BITS(PS_DRAINING)
| BITS(PS_BARRIER),
},
[PS_DRAINING_UNORD] = {
.name = "draining-unord",
.allowed = BITS(PS_BARRIER)
},
[PS_EXITED] = {
.flags = SM_FINAL,
.name = "exited",
.allowed = 0,
},
};
enum {
THREADPOOL_SIZE_MAX = 1024,
};
typedef struct pool_thread pool_thread_t;
typedef struct pool_impl pool_impl_t;
struct targs {
pool_impl_t *pi;
uv_sem_t *sem;
uint32_t idx; /* Thread's index */
};
/* Worker thread of the pool */
struct pool_thread {
queue inq; /* Thread's input queue */
uv_cond_t cond; /* Signalled when work item appears in @inq */
uv_thread_t thread; /* Pool's worker thread */
struct targs arg;
};
/* clang-format off */
struct pool_impl {
uv_mutex_t mutex; /* Input queue, planner_sm,
worker and planner threads lock */
uint32_t threads_nr;
pool_thread_t *threads;
queue outq; /* Output queue used by libuv part */
uv_mutex_t outq_mutex; /* Output queue lock */
uv_async_t outq_async; /* Signalled when output queue is not
empty and libuv loop has to process
items from it */
uint64_t active_ws; /* Number of all work items in flight,
accessed from the main thread only */
queue ordered; /* Queue of WT_ORD{N} items */
queue unordered; /* Queue of WT_UNORD items */
struct sm planner_sm; /* State machine of the scheduler */
uv_cond_t planner_cond;
uv_thread_t planner_thread; /* Scheduler's thread */
uint32_t ord_in_flight; /* Number of WT_ORD{N} in flight */
bool exiting; /* True when the pool is being stopped */
enum pool_work_type /* Type of the previous work item, */
ord_prev; /* used in WT_ORD{N} invariants */
uint32_t qos; /* QoS token */
uint32_t qos_prio; /* QoS prio */
};
/* clang-format on */
static inline bool pool_is_inited(const pool_t *pool)
{
return pool->pi != NULL;
}
static inline bool has_active_ws(pool_t *pool)
{
return pool->pi->active_ws > 0;
}
static inline void w_register(pool_t *pool, pool_work_t *w)
{
if (w->type != WT_BAR) {
pool->pi->active_ws++;
}
}
static inline void w_unregister(pool_t *pool, pool_work_t *w)
{
(void)w;
PRE(has_active_ws(pool));
pool->pi->active_ws--;
}
static bool empty(const queue *q)
{
return queue_empty(q);
}
static queue *head(const queue *q)
{
return queue_head(q);
}
static void push(queue *to, queue *what)
{
queue_insert_tail(to, what);
}
static queue *pop(queue *from)
{
queue *q = queue_head(from);
PRE(q != NULL);
queue_remove(q);
queue_init(q);
return q;
}
static queue *qos_pop(pool_impl_t *pi, queue *first, queue *second)
{
PRE(!empty(first) || !empty(second));
if (empty(first)) {
return pop(second);
} else if (empty(second)) {
return pop(first);
}
return pop(pi->qos++ % pi->qos_prio ? first : second);
}
static pool_work_t *q_to_w(const queue *q)
{
return QUEUE_DATA(q, pool_work_t, link);
}
static enum pool_work_type q_type(const queue *q)
{
return q_to_w(q)->type;
}
static uint32_t q_tid(const queue *q)
{
return q_to_w(q)->thread_id;
}
static bool planner_invariant(const struct sm *m, int prev_state)
{
pool_impl_t *pi = CONTAINER_OF(m, pool_impl_t, planner_sm);
queue *o = &pi->ordered;
queue *u = &pi->unordered;
/* clang-format off */
return ERGO(sm_state(m) == PS_NOTHING, empty(o) && empty(u)) &&
ERGO(sm_state(m) == PS_DRAINING,
ERGO(prev_state == PS_BARRIER,
pi->ord_in_flight == 0 && empty(u)) &&
ERGO(prev_state == PS_NOTHING,
!empty(u) || !empty(o))) &&
ERGO(sm_state(m) == PS_EXITED,
pi->exiting && empty(o) && empty(u)) &&
ERGO(sm_state(m) == PS_BARRIER,
ERGO(prev_state == PS_DRAINING,
q_type(head(o)) == WT_BAR) &&
ERGO(prev_state == PS_DRAINING_UNORD, empty(u))) &&
ERGO(sm_state(m) == PS_DRAINING_UNORD, !empty(u));
/* clang-format on */
}
static void planner(void *arg)
{
struct targs *ta = arg;
uv_sem_t *sem = ta->sem;
pool_impl_t *pi = ta->pi;
uv_mutex_t *mutex = &pi->mutex;
pool_thread_t *ts = pi->threads;
struct sm *planner_sm = &pi->planner_sm;
queue *o = &pi->ordered;
queue *u = &pi->unordered;
queue *q;
sm_init(planner_sm, planner_invariant, NULL, planner_states, "ps",
PS_NOTHING);
uv_sem_post(sem);
uv_mutex_lock(mutex);
for (;;) {
switch (sm_state(planner_sm)) {
case PS_NOTHING:
while (empty(o) && empty(u) && !pi->exiting) {
uv_cond_wait(&pi->planner_cond, mutex);
}
sm_move(planner_sm,
pi->exiting && empty(o) && empty(u)
? PS_EXITED
: PS_DRAINING);
break;
case PS_DRAINING:
while (!(empty(o) && empty(u))) {
sm_move(planner_sm, PS_DRAINING);
if (!empty(o) &&
q_type(head(o)) == WT_BAR) {
sm_move(planner_sm, PS_BARRIER);
goto ps_barrier;
}
q = qos_pop(pi, o, u);
push(&ts[q_tid(q)].inq, q);
uv_cond_signal(&ts[q_tid(q)].cond);
if (q_type(q) >= WT_ORD1) {
pi->ord_in_flight++;
}
}
sm_move(planner_sm, PS_NOTHING);
ps_barrier:
break;
case PS_BARRIER:
if (!empty(u)) {
sm_move(planner_sm, PS_DRAINING_UNORD);
break;
}
if (pi->ord_in_flight == 0) {
q = pop(o);
PRE(q_to_w(q)->type == WT_BAR);
free(q_to_w(q));
sm_move(planner_sm, PS_DRAINING);
break;
}
uv_cond_wait(&pi->planner_cond, mutex);
sm_move(planner_sm, PS_BARRIER);
break;
case PS_DRAINING_UNORD:
while (!empty(u)) {
q = pop(u);
push(&ts[q_tid(q)].inq, q);
uv_cond_signal(&ts[q_tid(q)].cond);
}
sm_move(planner_sm, PS_BARRIER);
break;
case PS_EXITED:
sm_fini(planner_sm);
uv_mutex_unlock(mutex);
return;
default:
POST(false && "Impossible!");
}
}
}
static void queue_work(pool_work_t *w)
{
w->work_cb(w);
}
static void queue_done(pool_work_t *w)
{
w_unregister(w->pool, w);
if (w->after_work_cb != NULL) {
w->after_work_cb(w);
}
}
static void worker(void *arg)
{
struct targs *ta = arg;
pool_impl_t *pi = ta->pi;
uv_mutex_t *mutex = &pi->mutex;
pool_thread_t *ts = pi->threads;
enum pool_work_type wtype;
pool_work_t *w;
queue *q;
uv_sem_post(ta->sem);
uv_mutex_lock(mutex);
for (;;) {
while (empty(&ts[ta->idx].inq)) {
if (pi->exiting) {
uv_mutex_unlock(mutex);
return;
}
uv_cond_wait(&ts[ta->idx].cond, mutex);
}
q = pop(&ts[ta->idx].inq);
uv_mutex_unlock(mutex);
w = q_to_w(q);
wtype = w->type;
queue_work(w);
uv_mutex_lock(&pi->outq_mutex);
push(&pi->outq, &w->link);
uv_async_send(&pi->outq_async);
uv_mutex_unlock(&pi->outq_mutex);
uv_mutex_lock(mutex);
if (wtype > WT_BAR) {
assert(pi->ord_in_flight > 0);
if (--pi->ord_in_flight == 0) {
uv_cond_signal(&pi->planner_cond);
}
}
}
}
static void pool_cleanup(pool_t *pool)
{
pool_impl_t *pi = pool->pi;
pool_thread_t *ts = pi->threads;
uint32_t i;
if (pi->threads_nr == 0) {
return;
}
uv_cond_signal(&pi->planner_cond);
if (uv_thread_join(&pi->planner_thread)) {
abort();
}
uv_cond_destroy(&pi->planner_cond);
POST(empty(&pi->ordered) && empty(&pi->unordered));
for (i = 0; i < pi->threads_nr; i++) {
uv_cond_signal(&ts[i].cond);
if (uv_thread_join(&ts[i].thread)) {
abort();
}
POST(empty(&ts[i].inq));
uv_cond_destroy(&ts[i].cond);
}
free(pi->threads);
uv_mutex_destroy(&pi->mutex);
pi->threads_nr = 0;
}
static void pool_threads_init(pool_t *pool)
{
uint32_t i;
uv_sem_t sem;
pool_impl_t *pi = pool->pi;
pool_thread_t *ts;
struct targs pa = {
.sem = &sem,
.pi = pi,
};
uv_thread_options_t config = {
.flags = UV_THREAD_HAS_STACK_SIZE,
.stack_size = 8u << 20,
};
if (uv_mutex_init(&pi->mutex)) {
abort();
}
if (uv_sem_init(&sem, 0)) {
abort();
}
pi->threads = calloc(pi->threads_nr, sizeof(pi->threads[0]));
if (pi->threads == NULL) {
abort();
}
for (i = 0, ts = pi->threads; i < pi->threads_nr; i++) {
ts[i].arg = (struct targs){
.pi = pi,
.sem = &sem,
.idx = i,
};
queue_init(&ts[i].inq);
if (uv_cond_init(&ts[i].cond)) {
abort();
}
if (uv_thread_create_ex(&ts[i].thread, &config, worker,
&ts[i].arg)) {
abort();
}
}
if (uv_cond_init(&pi->planner_cond)) {
abort();
}
if (uv_thread_create_ex(&pi->planner_thread, &config, planner, &pa)) {
abort();
}
for (i = 0; i < pi->threads_nr + 1 /* +planner */; i++) {
uv_sem_wait(&sem);
}
uv_sem_destroy(&sem);
}
static void pool_work_submit(pool_t *pool, pool_work_t *w)
{
pool_impl_t *pi = pool->pi;
queue *o = &pi->ordered;
queue *u = &pi->unordered;
if (w->type > WT_UNORD) {
/* Make sure that elements in the ordered queue come in order.
*/
PRE(ERGO(pi->ord_prev != WT_BAR && w->type != WT_BAR,
pi->ord_prev == w->type));
pi->ord_prev = w->type;
}
uv_mutex_lock(&pi->mutex);
POST(!pi->exiting);
push(w->type == WT_UNORD ? u : o, &w->link);
uv_cond_signal(&pi->planner_cond);
uv_mutex_unlock(&pi->mutex);
}
void work_done(uv_async_t *handle)
{
queue q = {};
pool_impl_t *pi = CONTAINER_OF(handle, pool_impl_t, outq_async);
uv_mutex_lock(&pi->outq_mutex);
queue_move(&pi->outq, &q);
uv_mutex_unlock(&pi->outq_mutex);
while (!empty(&q)) {
queue_done(q_to_w(pop(&q)));
}
}
void pool_queue_work(pool_t *pool,
pool_work_t *w,
uint32_t cookie,
enum pool_work_type type,
void (*work_cb)(pool_work_t *w),
void (*after_work_cb)(pool_work_t *w))
{
PRE(memcmp(w, &(pool_work_t){}, sizeof *w) == 0);
PRE(work_cb != NULL && type < WT_NR);
if (!!(pool->flags & POOL_FOR_UT_NOT_ASYNC)) {
work_cb(w);
/* Mirror queue_done(): after_work_cb may be NULL. */
if (after_work_cb != NULL) {
after_work_cb(w);
}
return;
}
PRE(pool_is_inited(pool));
*w = (pool_work_t){
.pool = pool,
.type = type,
.thread_id = cookie % pool->pi->threads_nr,
.work_cb = work_cb,
.after_work_cb = after_work_cb,
};
w_register(pool, w);
pool_work_submit(pool, w);
}
int pool_init(pool_t *pool,
uv_loop_t *loop,
uint32_t threads_nr,
uint32_t qos_prio)
{
int rc;
pool_impl_t *pi;
PRE(threads_nr <= THREADPOOL_SIZE_MAX);
pool->flags = 0x0;
pi = pool->pi = calloc(1, sizeof(*pool->pi));
if (pi == NULL) {
return UV_ENOMEM;
}
*pi = (pool_impl_t){
.qos = 0,
.qos_prio = qos_prio,
.exiting = false,
.ord_prev = WT_BAR,
.threads_nr = threads_nr,
.ord_in_flight = 0,
};
queue_init(&pi->outq);
queue_init(&pi->ordered);
queue_init(&pi->unordered);
rc = uv_mutex_init(&pi->outq_mutex);
if (rc != 0) {
free(pi);
return rc;
}
rc = uv_async_init(loop, &pi->outq_async, work_done);
if (rc != 0) {
uv_mutex_destroy(&pi->outq_mutex);
free(pi);
return rc;
}
pool_threads_init(pool);
return 0;
}
void pool_fini(pool_t *pool)
{
pool_impl_t *pi = pool->pi;
pool_cleanup(pool);
uv_mutex_lock(&pi->outq_mutex);
POST(!!(pool->flags & POOL_FOR_UT_NON_CLEAN_FINI) ||
(empty(&pi->outq) && !has_active_ws(pool)));
uv_mutex_unlock(&pi->outq_mutex);
uv_mutex_destroy(&pi->outq_mutex);
free(pi);
}
void pool_close(pool_t *pool)
{
pool_impl_t *pi = pool->pi;
uv_close((uv_handle_t *)&pi->outq_async, NULL);
uv_mutex_lock(&pi->mutex);
pi->exiting = true;
uv_mutex_unlock(&pi->mutex);
}
pool_t *pool_ut_fallback(void)
{
static pool_t pool;
return &pool;
}
dqlite-1.16.7/src/lib/threadpool.h 0000664 0000000 0000000 00000006617 14652527134 0016762 0 ustar 00root root 0000000 0000000 #ifndef __THREAD_POOL__
#define __THREAD_POOL__
#include <uv.h>
#include "queue.h"
/**
Thread pool
- Use-cases:
- Move sqlite3- and IO-related blocking operations off the libuv
loop's thread and onto the pool's threads, in order to keep
serving incoming dqlite requests during sqlite3 IO.
Multiple sqlite3_step()-s can be in flight and executed
concurrently, while the loop thread is not blocked on IO.
- Pool work items have thread affinity: the sqlite3-related items
of each database are served by a "dedicated" thread, which avoids
making any assumptions about sqlite3's threading model.
@see https://www.sqlite.org/threadsafe.html
- The pool supports servicing of the following types of work items:
- WT_UNORD - items which can be processed by the pool in any
order; concurrency assumptions for this type of work are
guaranteed by other layers of the application. Read and write
transactions executed by sqlite3_step() are good examples of
this work item type.
- WT_ORD_N - items which can NOT be processed by the pool in
arbitrary order. The pool's logic guarantees that servicing
all WT_ORD_{N}s happens before any WT_ORD_{N + 1}s, and
WT_ORD_{N} and WT_ORD_{N + 1} operations can't be put into the
pool interleaved. A sqlite3 checkpoint is an example of a
WT_ORD_{N}, and InstallSnapshot(CP(), MV()) is an example of a
WT_ORD_{N + 1}.
- WT_BAR - special purpose item, a barrier. Delimits WT_ORD_{N}s
from WT_ORD_{N + 1}s.
- The pool supports servicing work items with quality of service
(QoS) considerations. For example, the priority of serving
read/write sqlite3 transactions (WT_UNORD) can be set higher
than that of snapshot installation (WT_ORD_{N}).
*/
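/*
 * A minimal usage sketch (illustrative only: the callback names, the
 * thread count and the cookie value are hypothetical, and error
 * handling is elided):
 *
 *     static void my_work(pool_work_t *w) { (void)w; } // blocking IO
 *     static void my_done(pool_work_t *w) { (void)w; } // loop thread
 *
 *     pool_t pool;
 *     pool_init(&pool, uv_default_loop(), 4, POOL_QOS_PRIO_FAIR);
 *     pool_work_t w = {0};
 *     pool_queue_work(&pool, &w, 0, WT_UNORD, my_work, my_done);
 *     // ... run the loop; w must stay alive until my_done fires ...
 *     pool_close(&pool);
 *     pool_fini(&pool);
 */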
struct pool_impl;
typedef struct pool_s pool_t;
typedef struct pool_work_s pool_work_t;
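/* Note: the relative order of these constants matters: the pool treats
 * any work type greater than WT_BAR as ordered (see the ord_in_flight
 * accounting in the planner and worker loops). */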
enum pool_work_type {
WT_UNORD,
WT_BAR,
WT_ORD1,
WT_ORD2,
WT_NR,
};
struct pool_work_s {
queue link; /* Link into ordered, unordered and outq */
uint32_t thread_id; /* Identifier of the thread the item is affined */
pool_t *pool; /* The pool, item is being associated with */
enum pool_work_type type;
int rc; /* Return code used to deliver pool work operation result to the
* uv_loop's thread. */
void (*work_cb)(pool_work_t *w);
void (*after_work_cb)(pool_work_t *w);
};
struct pool_s {
struct pool_impl *pi;
int flags;
};
enum {
POOL_QOS_PRIO_FAIR = 2,
};
enum pool_half {
POOL_TOP_HALF = 0x109,
POOL_BOTTOM_HALF = 0xb01103,
};
enum {
/**
* Setting POOL_FOR_UT_NON_CLEAN_FINI relaxes the pool's invariants
* during finalization, in order to pass a few tests that check
* failure paths via non-clean unit-test termination.
*/
POOL_FOR_UT_NON_CLEAN_FINI = 1u << 0,
/**
* Set this flag if there's no event loop in the unit test. Top- and
* bottom-halves will be called in the current thread.
*/
POOL_FOR_UT_NOT_ASYNC = 1u << 1,
/**
* Set if the pool runs in the context of a unit test.
*/
POOL_FOR_UT = 1u << 2,
};
int pool_init(pool_t *pool,
uv_loop_t *loop,
uint32_t threads_nr,
uint32_t qos_prio);
void pool_fini(pool_t *pool);
void pool_close(pool_t *pool);
void pool_queue_work(pool_t *pool,
pool_work_t *w,
uint32_t cookie,
enum pool_work_type type,
void (*work_cb)(pool_work_t *w),
void (*after_work_cb)(pool_work_t *w));
pool_t *pool_ut_fallback(void);
#endif /* __THREAD_POOL__ */
dqlite-1.16.7/src/lib/transport.c 0000664 0000000 0000000 00000007007 14652527134 0016642 0 ustar 00root root 0000000 0000000 #include "../raft.h"
#include "../../include/dqlite.h"
#include "assert.h"
#include "transport.h"
/* Called to allocate a buffer for the next stream read. */
static void alloc_cb(uv_handle_t *stream, size_t suggested_size, uv_buf_t *buf)
{
struct transport *t;
(void)suggested_size;
t = stream->data;
assert(t->read.base != NULL);
assert(t->read.len > 0);
*buf = t->read;
}
/* Invoke the read callback. */
static void read_done(struct transport *t, ssize_t status)
{
transport_read_cb cb;
int rv;
rv = uv_read_stop(t->stream);
assert(rv == 0);
cb = t->read_cb;
assert(cb != NULL);
t->read_cb = NULL;
t->read.base = NULL;
t->read.len = 0;
cb(t, (int)status);
}
static void read_cb(uv_stream_t *stream, ssize_t nread, const uv_buf_t *buf)
{
struct transport *t;
(void)buf;
t = stream->data;
if (nread > 0) {
size_t n = (size_t)nread;
/* We shouldn't have read more data than the pending amount. */
assert(n <= t->read.len);
/* Advance the read window */
t->read.base += n;
t->read.len -= n;
/* If there's more data to read in order to fill the current
* read buffer, just return; we'll be invoked again. */
if (t->read.len > 0) {
return;
}
/* Read completed, invoke the callback. */
read_done(t, 0);
return;
}
assert(nread <= 0);
if (nread == 0) {
/* Empty read */
return;
}
assert(nread < 0);
/* Failure. */
read_done(t, nread);
}
int transport__stream(struct uv_loop_s *loop,
int fd,
struct uv_stream_s **stream)
{
struct uv_pipe_s *pipe;
struct uv_tcp_s *tcp;
int rv;
switch (uv_guess_handle(fd)) {
case UV_TCP:
tcp = raft_malloc(sizeof *tcp);
if (tcp == NULL) {
return DQLITE_NOMEM;
}
rv = uv_tcp_init(loop, tcp);
assert(rv == 0);
rv = uv_tcp_open(tcp, fd);
if (rv != 0) {
raft_free(tcp);
return TRANSPORT__BADSOCKET;
}
*stream = (struct uv_stream_s *)tcp;
break;
case UV_NAMED_PIPE:
pipe = raft_malloc(sizeof *pipe);
if (pipe == NULL) {
return DQLITE_NOMEM;
}
rv = uv_pipe_init(loop, pipe, 0);
assert(rv == 0);
rv = uv_pipe_open(pipe, fd);
if (rv != 0) {
raft_free(pipe);
return TRANSPORT__BADSOCKET;
}
*stream = (struct uv_stream_s *)pipe;
break;
default:
return TRANSPORT__BADSOCKET;
};
return 0;
}
int transport__init(struct transport *t, struct uv_stream_s *stream)
{
t->stream = stream;
t->stream->data = t;
t->read.base = NULL;
t->read.len = 0;
t->write.data = t;
t->read_cb = NULL;
t->write_cb = NULL;
t->close_cb = NULL;
return 0;
}
static void close_cb(uv_handle_t *handle)
{
struct transport *t = handle->data;
raft_free(t->stream);
if (t->close_cb != NULL) {
t->close_cb(t);
}
}
void transport__close(struct transport *t, transport_close_cb cb)
{
assert(t->close_cb == NULL);
t->close_cb = cb;
uv_close((uv_handle_t *)t->stream, close_cb);
}
int transport__read(struct transport *t, uv_buf_t *buf, transport_read_cb cb)
{
int rv;
assert(t->read.base == NULL);
assert(t->read.len == 0);
t->read = *buf;
t->read_cb = cb;
rv = uv_read_start(t->stream, alloc_cb, read_cb);
if (rv != 0) {
return DQLITE_ERROR;
}
return 0;
}
static void write_cb(uv_write_t *req, int status)
{
struct transport *t = req->data;
transport_write_cb cb = t->write_cb;
assert(cb != NULL);
t->write_cb = NULL;
cb(t, status);
}
int transport__write(struct transport *t, uv_buf_t *buf, transport_write_cb cb)
{
int rv;
assert(t->write_cb == NULL);
t->write_cb = cb;
rv = uv_write(&t->write, t->stream, buf, 1, write_cb);
if (rv != 0) {
return rv;
}
return 0;
}
dqlite-1.16.7/src/lib/transport.h 0000664 0000000 0000000 00000003223 14652527134 0016643 0 ustar 00root root 0000000 0000000 /**
* Asynchronously read and write buffers from and to the network.
*/
#ifndef LIB_TRANSPORT_H_
#define LIB_TRANSPORT_H_
#include <uv.h>
#define TRANSPORT__BADSOCKET 1000
/**
* Callbacks.
*/
struct transport;
typedef void (*transport_read_cb)(struct transport *t, int status);
typedef void (*transport_write_cb)(struct transport *t, int status);
typedef void (*transport_close_cb)(struct transport *t);
/**
* Light wrapper around a libuv stream handle, providing a more convenient way
* to read a certain number of bytes.
*/
struct transport
{
void *data; /* User defined */
struct uv_stream_s *stream; /* Data stream */
uv_buf_t read; /* Read buffer */
uv_write_t write; /* Write request */
transport_read_cb read_cb; /* Read callback */
transport_write_cb write_cb; /* Write callback */
transport_close_cb close_cb; /* Close callback */
};
/**
* Initialize a transport of the appropriate type (TCP or PIPE) attached to the
* given file descriptor.
*/
int transport__init(struct transport *t, struct uv_stream_s *stream);
/**
* Start closing the transport.
*/
void transport__close(struct transport *t, transport_close_cb cb);
/**
* Read from the transport file descriptor until the given buffer is full.
*/
int transport__read(struct transport *t, uv_buf_t *buf, transport_read_cb cb);
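/*
 * For example, to read an 8-byte message header (a sketch: `t` is assumed
 * to be an initialized transport and `header_cb` a hypothetical callback):
 *
 *     static char header[8];
 *     uv_buf_t buf = { .base = header, .len = sizeof header };
 *     transport__read(&t, &buf, header_cb);
 */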
/**
* Write the given buffer to the transport.
*/
int transport__write(struct transport *t, uv_buf_t *buf, transport_write_cb cb);
/* Create an UV stream object from the given fd. */
int transport__stream(struct uv_loop_s *loop,
int fd,
struct uv_stream_s **stream);
#endif /* LIB_TRANSPORT_H_ */
dqlite-1.16.7/src/logger.c 0000664 0000000 0000000 00000001621 14652527134 0015313 0 ustar 00root root 0000000 0000000 #include <stdio.h>
#include <string.h>
#include "logger.h"
#define EMIT_BUF_LEN 1024
void loggerDefaultEmit(void *data, int level, const char *fmt, va_list args)
{
char buf[EMIT_BUF_LEN];
char *cursor = buf;
size_t n;
(void)data;
/* First, render the logging level. */
switch (level) {
case DQLITE_DEBUG:
sprintf(cursor, "[DEBUG]: ");
break;
case DQLITE_INFO:
sprintf(cursor, "[INFO ]: ");
break;
case DQLITE_WARN:
sprintf(cursor, "[WARN ]: ");
break;
case DQLITE_LOG_ERROR:
sprintf(cursor, "[ERROR]: ");
break;
default:
sprintf(cursor, "[ ]: ");
break;
};
cursor = buf + strlen(buf);
/* Then render the message, possibly truncating it. */
n = EMIT_BUF_LEN - strlen(buf) - 1;
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wformat-nonliteral"
vsnprintf(cursor, n, fmt, args);
#pragma GCC diagnostic pop
fprintf(stderr, "%s\n", buf);
}
dqlite-1.16.7/src/logger.h 0000664 0000000 0000000 00000001517 14652527134 0015324 0 ustar 00root root 0000000 0000000 #ifndef LOGGER_H_
#define LOGGER_H_
#include "raft.h"
#include "../include/dqlite.h"
/* Log levels */
enum { DQLITE_DEBUG = 0, DQLITE_INFO, DQLITE_WARN, DQLITE_LOG_ERROR };
/* Function to emit log messages. */
typedef void (*dqlite_emit)(void *data,
int level,
const char *fmt,
va_list args);
struct logger
{
void *data;
dqlite_emit emit;
};
/* Default implementation of dqlite_emit, using stderr. */
void loggerDefaultEmit(void *data, int level, const char *fmt, va_list args);
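/* For example, a logger wired to the default emitter (a sketch):
 *
 *     struct logger logger = { .data = NULL, .emit = loggerDefaultEmit };
 */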
/* Emit a log message with a certain level. */
/* #define debugf(L, FORMAT, ...) \ */
/* logger__emit(L, DQLITE_DEBUG, FORMAT, ##__VA_ARGS__) */
#define debugf(C, FORMAT, ...) \
C->gateway.raft->io->emit(C->gateway.raft->io, RAFT_DEBUG, FORMAT, \
##__VA_ARGS__)
#endif /* LOGGER_H_ */
dqlite-1.16.7/src/message.c 0000664 0000000 0000000 00000000076 14652527134 0015463 0 ustar 00root root 0000000 0000000 #include "message.h"
SERIALIZE__IMPLEMENT(message, MESSAGE);
dqlite-1.16.7/src/message.h 0000664 0000000 0000000 00000000566 14652527134 0015474 0 ustar 00root root 0000000 0000000 #ifndef MESSAGE_H_
#define MESSAGE_H_
#include "lib/serialize.h"
/**
* Metadata about an incoming or outgoing RPC message.
*/
#define MESSAGE(X, ...) \
X(uint32, words, ##__VA_ARGS__) \
X(uint8, type, ##__VA_ARGS__) \
X(uint8, schema, ##__VA_ARGS__) \
X(uint16, extra, ##__VA_ARGS__)
SERIALIZE__DEFINE(message, MESSAGE);
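/*
 * Roughly speaking, the SERIALIZE__DEFINE invocation above generates a
 * struct message with one field per X(...) entry, plus sizeof/encode/
 * decode helpers. A hand-written sketch of the resulting struct shape
 * (the actual definition comes from lib/serialize.h):
 *
 *     struct message {
 *             uint32_t words;
 *             uint8_t type;
 *             uint8_t schema;
 *             uint16_t extra;
 *     };
 */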
#endif /* MESSAGE_H_ */
dqlite-1.16.7/src/metrics.c 0000664 0000000 0000000 00000000270 14652527134 0015501 0 ustar 00root root 0000000 0000000 #include
#include "./lib/assert.h"
#include "metrics.h"
void dqlite__metrics_init(struct dqlite__metrics *m)
{
assert(m != NULL);
m->requests = 0;
m->duration = 0;
}
dqlite-1.16.7/src/metrics.h 0000664 0000000 0000000 00000001003 14652527134 0015501 0 ustar 00root root 0000000 0000000 /******************************************************************************
*
* Collect various performance metrics.
*
*****************************************************************************/
#ifndef DQLITE_METRICS_H
#define DQLITE_METRICS_H
#include <stdint.h>
struct dqlite__metrics
{
uint64_t requests; /* Total number of requests served. */
uint64_t duration; /* Total time spent serving requests. */
};
void dqlite__metrics_init(struct dqlite__metrics *m);
#endif /* DQLITE_METRICS_H */
dqlite-1.16.7/src/protocol.h 0000664 0000000 0000000 00000005035 14652527134 0015705 0 ustar 00root root 0000000 0000000 #ifndef DQLITE_PROTOCOL_H_
#define DQLITE_PROTOCOL_H_
/* Special datatypes */
#define DQLITE_UNIXTIME 9
#define DQLITE_ISO8601 10
#define DQLITE_BOOLEAN 11
#define DQLITE_PROTO 1001 /* Protocol error */
/* Role codes */
enum { DQLITE_VOTER, DQLITE_STANDBY, DQLITE_SPARE };
/* Current protocol version */
#define DQLITE_PROTOCOL_VERSION 1
/* Legacy pre-1.0 version. */
#define DQLITE_PROTOCOL_VERSION_LEGACY 0x86104dd760433fe5
/* Special value indicating that a batch of rows is over, but there are more. */
#define DQLITE_RESPONSE_ROWS_PART 0xeeeeeeeeeeeeeeee
/* Special value indicating that the result set is complete. */
#define DQLITE_RESPONSE_ROWS_DONE 0xffffffffffffffff
/* Request types */
enum {
DQLITE_REQUEST_LEADER,
DQLITE_REQUEST_CLIENT,
DQLITE_REQUEST_HEARTBEAT,
DQLITE_REQUEST_OPEN,
DQLITE_REQUEST_PREPARE,
DQLITE_REQUEST_EXEC,
DQLITE_REQUEST_QUERY,
DQLITE_REQUEST_FINALIZE,
DQLITE_REQUEST_EXEC_SQL,
DQLITE_REQUEST_QUERY_SQL,
DQLITE_REQUEST_INTERRUPT,
DQLITE_REQUEST_CONNECT,
DQLITE_REQUEST_ADD,
/* The PROMOTE and ASSIGN requests share a type tag. We expose it under
* two names here to facilitate the macro shenanigans in request.h. */
DQLITE_REQUEST_PROMOTE_OR_ASSIGN,
DQLITE_REQUEST_ASSIGN = DQLITE_REQUEST_PROMOTE_OR_ASSIGN,
DQLITE_REQUEST_REMOVE,
DQLITE_REQUEST_DUMP,
DQLITE_REQUEST_CLUSTER,
DQLITE_REQUEST_TRANSFER,
DQLITE_REQUEST_DESCRIBE,
DQLITE_REQUEST_WEIGHT
};
#define DQLITE_REQUEST_CLUSTER_FORMAT_V0 0 /* ID and address */
#define DQLITE_REQUEST_CLUSTER_FORMAT_V1 1 /* ID, address and role */
#define DQLITE_REQUEST_DESCRIBE_FORMAT_V0 0 /* Failure domain and weight */
/* These apply to REQUEST_EXEC, REQUEST_EXEC_SQL, REQUEST_QUERY, and
* REQUEST_QUERY_SQL. */
#define DQLITE_REQUEST_PARAMS_SCHEMA_V0 0 /* One-byte params count */
#define DQLITE_REQUEST_PARAMS_SCHEMA_V1 1 /* Four-byte params count */
/* These apply to REQUEST_PREPARE and RESPONSE_STMT. */
/* At most one statement in request, no tail offset in response */
#define DQLITE_PREPARE_STMT_SCHEMA_V0 0
/* Any number of statements in request, tail offset in response */
#define DQLITE_PREPARE_STMT_SCHEMA_V1 1
/* Response types */
enum {
DQLITE_RESPONSE_FAILURE,
DQLITE_RESPONSE_SERVER,
DQLITE_RESPONSE_SERVER_LEGACY = DQLITE_RESPONSE_SERVER,
DQLITE_RESPONSE_WELCOME,
DQLITE_RESPONSE_SERVERS,
DQLITE_RESPONSE_DB,
DQLITE_RESPONSE_STMT,
DQLITE_RESPONSE_STMT_WITH_OFFSET = DQLITE_RESPONSE_STMT,
DQLITE_RESPONSE_RESULT,
DQLITE_RESPONSE_ROWS,
DQLITE_RESPONSE_EMPTY,
DQLITE_RESPONSE_FILES,
DQLITE_RESPONSE_METADATA
};
#endif /* DQLITE_PROTOCOL_H_ */
dqlite-1.16.7/src/query.c 0000664 0000000 0000000 00000006454 14652527134 0015212 0 ustar 00root root 0000000 0000000 #include "query.h"
#include "tuple.h"
/* Return the type code of the i'th column value.
*
* TODO: find a better way to handle time types. */
static int value_type(sqlite3_stmt *stmt, int i)
{
int type = sqlite3_column_type(stmt, i);
const char *column_type_name = sqlite3_column_decltype(stmt, i);
if (column_type_name != NULL) {
if ((strcasecmp(column_type_name, "DATETIME") == 0) ||
(strcasecmp(column_type_name, "DATE") == 0) ||
(strcasecmp(column_type_name, "TIMESTAMP") == 0)) {
if (type == SQLITE_INTEGER) {
type = DQLITE_UNIXTIME;
} else {
assert(type == SQLITE_TEXT ||
type == SQLITE_NULL);
type = DQLITE_ISO8601;
}
} else if (strcasecmp(column_type_name, "BOOLEAN") == 0) {
assert(type == SQLITE_INTEGER || type == SQLITE_NULL);
type = DQLITE_BOOLEAN;
}
}
assert(type < 16);
return type;
}
/* Append a single row to the message. */
static int encode_row(sqlite3_stmt *stmt, struct buffer *buffer, int n)
{
struct tuple_encoder encoder;
int rc;
int i;
rc = tuple_encoder__init(&encoder, (unsigned)n, TUPLE__ROW, buffer);
if (rc != 0) {
return SQLITE_ERROR;
}
/* Encode the row values */
for (i = 0; i < n; i++) {
/* Figure the type */
struct value value;
value.type = value_type(stmt, i);
switch (value.type) {
case SQLITE_INTEGER:
value.integer = sqlite3_column_int64(stmt, i);
break;
case SQLITE_FLOAT:
value.float_ = sqlite3_column_double(stmt, i);
break;
case SQLITE_BLOB:
value.blob.base =
(char *)sqlite3_column_blob(stmt, i);
value.blob.len =
(size_t)sqlite3_column_bytes(stmt, i);
break;
case SQLITE_NULL:
/* TODO: allow null to be encoded with 0 bytes
*/
value.null = 0;
break;
case SQLITE_TEXT:
value.text =
(text_t)sqlite3_column_text(stmt, i);
break;
case DQLITE_UNIXTIME:
value.integer = sqlite3_column_int64(stmt, i);
break;
case DQLITE_ISO8601:
value.text =
(text_t)sqlite3_column_text(stmt, i);
if (value.text == NULL) {
value.text = "";
}
break;
case DQLITE_BOOLEAN:
value.integer = sqlite3_column_int64(stmt, i);
break;
default:
return SQLITE_ERROR;
}
rc = tuple_encoder__next(&encoder, &value);
if (rc != 0) {
return rc;
}
}
return SQLITE_OK;
}
int query__batch(sqlite3_stmt *stmt, struct buffer *buffer)
{
int n; /* Column count */
int i;
uint64_t n64;
char *cursor;
int rc;
n = sqlite3_column_count(stmt);
if (n <= 0) {
return SQLITE_ERROR;
}
n64 = (uint64_t)n;
/* Insert the column count */
cursor = buffer__advance(buffer, sizeof(uint64_t));
assert(cursor != NULL);
uint64__encode(&n64, &cursor);
/* Insert the column names */
for (i = 0; i < n; i++) {
const char *name = sqlite3_column_name(stmt, i);
cursor = buffer__advance(buffer, text__sizeof(&name));
if (cursor == NULL) {
return SQLITE_NOMEM;
}
text__encode(&name, &cursor);
}
/* Insert the rows. */
do {
if (buffer__offset(buffer) >= buffer->page_size) {
/* If we have already filled a memory page, let's break
* for now; we'll send more rows in a separate
* response. */
rc = SQLITE_ROW;
break;
}
rc = sqlite3_step(stmt);
if (rc != SQLITE_ROW) {
break;
}
rc = encode_row(stmt, buffer, n);
if (rc != SQLITE_OK) {
break;
}
} while (1);
return rc;
}
dqlite-1.16.7/src/query.h 0000664 0000000 0000000 00000000705 14652527134 0015210 0 ustar 00root root 0000000 0000000 /**
* Step through a query, progressively encoding the row tuples.
*/
#ifndef QUERY_H_
#define QUERY_H_
#include <sqlite3.h>
#include "lib/buffer.h"
#include "lib/serialize.h"
/**
* Step through the given query statement progressively encoding the yielded row
* tuples, either until #SQLITE_DONE is returned or a full page of the given
* buffer is filled.
*/
int query__batch(sqlite3_stmt *stmt, struct buffer *buffer);
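/*
 * Typical use (a sketch; `buffer` is assumed to be initialized, and
 * flushing/resetting it between calls is left to the caller):
 *
 *     int rc;
 *     do {
 *             rc = query__batch(stmt, &buffer);
 *             // send the encoded rows and reset the buffer here
 *     } while (rc == SQLITE_ROW);
 */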
#endif /* QUERY_H_ */
dqlite-1.16.7/src/raft.h 0000664 0000000 0000000 00000206306 14652527134 0015004 0 ustar 00root root 0000000 0000000 #if defined(USE_SYSTEM_RAFT)
#include <raft.h>
#include <raft/uv.h>
#include <raft/fixture.h>
#elif !defined(RAFT_H)
#define RAFT_H
#include <stdarg.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <uv.h>
#include "lib/sm.h"
#include "lib/queue.h"
#ifndef RAFT_API
#define RAFT_API __attribute__((visibility("default")))
#endif
#ifndef DQLITE_VISIBLE_TO_TESTS
#define DQLITE_VISIBLE_TO_TESTS __attribute__((visibility("default")))
#endif
/**
* Version.
*/
#define RAFT_VERSION_MAJOR 0
#define RAFT_VERSION_MINOR 18
#define RAFT_VERSION_RELEASE 0
#define RAFT_VERSION_NUMBER \
(RAFT_VERSION_MAJOR * 100 * 100 + RAFT_VERSION_MINOR * 100 + \
RAFT_VERSION_RELEASE)
int raft_version_number(void);
/**
* Error codes.
*/
enum {
RAFT_NOMEM = 1, /* Out of memory */
RAFT_BADID, /* Server ID is not valid */
RAFT_DUPLICATEID, /* Server ID already in use */
RAFT_DUPLICATEADDRESS, /* Server address already in use */
RAFT_BADROLE, /* Server role is not valid */
RAFT_MALFORMED,
RAFT_NOTLEADER,
RAFT_LEADERSHIPLOST,
RAFT_SHUTDOWN,
RAFT_CANTBOOTSTRAP,
RAFT_CANTCHANGE,
RAFT_CORRUPT,
RAFT_CANCELED,
RAFT_NAMETOOLONG,
RAFT_TOOBIG,
RAFT_NOCONNECTION,
RAFT_BUSY,
RAFT_IOERR, /* File system or storage error */
RAFT_NOTFOUND, /* Resource not found */
RAFT_INVALID, /* Invalid parameter */
RAFT_UNAUTHORIZED, /* No access to a resource */
RAFT_NOSPACE, /* Not enough space on disk */
RAFT_TOOMANY /* Some system or raft limit was hit */
};
/**
* Size of human-readable error message buffers.
*/
#define RAFT_ERRMSG_BUF_SIZE 256
/**
* Return the error message describing the given error code.
*/
RAFT_API const char *raft_strerror(int errnum);
typedef unsigned long long raft_id;
/**
* Hold the value of a raft term. Guaranteed to be at least 64-bit long.
*/
typedef unsigned long long raft_term;
/**
* Hold the value of a raft entry index. Guaranteed to be at least 64-bit long.
*/
typedef unsigned long long raft_index;
/**
* Hold a time value expressed in milliseconds since the epoch.
*/
typedef unsigned long long raft_time;
/**
* Hold the features a raft node is capable of.
*/
typedef uint64_t raft_flags;
/**
* A data buffer.
*/
struct raft_buffer
{
void *base; /* Pointer to the buffer data. */
size_t len; /* Length of the buffer. */
};
/**
* Server role codes.
*/
enum {
RAFT_STANDBY, /* Replicate log, does not participate in quorum. */
RAFT_VOTER, /* Replicate log, does participate in quorum. */
RAFT_SPARE /* Does not replicate log, or participate in quorum. */
};
/**
* Hold information about a single server in the cluster configuration.
* WARNING: This struct is encoded/decoded, be careful when adapting it.
*/
struct raft_server
{
raft_id id; /* Server ID, must be greater than zero. */
char *address; /* Server address. User defined. */
int role; /* Server role. */
};
/**
* Hold information about all servers currently part of the cluster.
* WARNING: This struct is encoded/decoded, be careful when adapting it.
*/
struct raft_configuration
{
struct raft_server
*servers; /* Array of servers member of the cluster. */
unsigned n; /* Number of servers in the array. */
};
/**
* Initialize an empty raft configuration.
*/
RAFT_API void raft_configuration_init(struct raft_configuration *c);
/**
* Release all memory used by the given configuration object.
*/
RAFT_API void raft_configuration_close(struct raft_configuration *c);
/**
* Add a server to a raft configuration.
*
* The @id must be greater than zero and @address must point to a valid string.
*
* The @role must be either #RAFT_VOTER, #RAFT_STANDBY, #RAFT_SPARE.
*
* If @id or @address are already in use by another server in the configuration,
* an error is returned.
*
* The @address string will be copied and can be released after this function
* returns.
*/
RAFT_API int raft_configuration_add(struct raft_configuration *c,
raft_id id,
const char *address,
int role);
/**
* Encode the given configuration object.
*
* The memory of the returned buffer is allocated using raft_malloc(), and
* client code is responsible for releasing it when no longer needed.
*/
RAFT_API int raft_configuration_encode(const struct raft_configuration *c,
struct raft_buffer *buf);
/**
* Hash function which outputs a 64-bit value based on a text and a number.
*
* This can be used to generate a unique ID for a new server being added, for
* example based on its address and on the current time in milliseconds since
* the Epoch.
*
* It's internally implemented as a SHA1 where only the last 8 bytes of the hash
* value are kept.
*/
RAFT_API unsigned long long raft_digest(const char *text, unsigned long long n);
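/* For example, to derive an ID for a new server from its address and the
 * current time (a sketch; now_ms is assumed to hold milliseconds since
 * the Epoch):
 *
 *     raft_id id = raft_digest("127.0.0.1:9001", now_ms);
 */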
/**
* Log entry types.
*/
enum {
RAFT_COMMAND = 1, /* Command for the application FSM. */
RAFT_BARRIER, /* Wait for all previous commands to be applied. */
RAFT_CHANGE /* Raft configuration change. */
};
/**
* A small fixed-size inline buffer that stores extra data for a raft_entry
* that is different for each node in the cluster.
*
* A leader initializes the local data for an entry before passing it into
* raft_apply. This local data is stored in the volatile raft log and also
* in the persistent raft log on the leader. AppendEntries messages sent by
* the leader never contain the local data for entries.
*
* When a follower accepts an AppendEntries request, it invokes a callback
* provided by the FSM to fill out the local data for each new entry before
* appending the entries to its log (volatile and persistent). This local
* data doesn't have to be the same as the local data that the leader computed.
*
* When starting up, a raft node reads the local data for each entry for its
* persistent log as part of populating the volatile log.
*/
struct raft_entry_local_data {
/* Must be the only member of this struct. */
uint8_t buf[16];
};
/**
* A single entry in the raft log.
*
* An entry that originated from this raft instance while it was the leader
* (typically via client calls to raft_apply()) should normally have a @buf
* attribute referencing directly the memory that was originally allocated by
* the client itself to contain the entry data, and the @batch attribute set to
* #NULL.
*
* An entry that was received from the network as part of an AppendEntries RPC
* or that was loaded from disk at startup should normally have a @batch
* attribute that points to a contiguous chunk of memory that contains the data
* of the entry itself plus possibly the data for other entries that were
* received or loaded with it at the same time. In this case the @buf pointer
* will be equal to the @batch pointer plus an offset, that locates the position
* of the entry's data within the batch.
*
* When the @batch attribute is not #NULL the raft library will take care of
* releasing that memory only once there are no more references to the
* associated entries.
*
* This arrangement makes it possible to minimize the amount of memory-copying
* when performing I/O.
*
* The @is_local field is set to `true` by a leader that appends an entry to its
* volatile log. It is set to `false` by a follower that copies an entry received
* via AppendEntries to its volatile log. It is not represented in the AppendEntries
* message or in the persistent log. This field can be used by the FSM's `apply`
* callback to handle a COMMAND entry differently depending on whether it
* originated locally.
*
* Note: The @local_data and @is_local fields do not exist when we use an external
* libraft, because the last separate release of libraft predates their addition.
* The ifdef at the very top of this file ensures that we use the system raft headers
* when we build against an external libraft, so there will be no ABI mismatch as
* a result of incompatible struct layouts.
*/
struct raft_entry
{
raft_term term; /* Term in which the entry was created. */
unsigned short type; /* Type (FSM command, barrier, config change). */
bool is_local; /* Placed here so it goes in the padding after @type. */
struct raft_buffer buf; /* Entry data. */
struct raft_entry_local_data local_data;
void *batch; /* Batch that buf's memory points to, if any. */
};
/**
* Hold the arguments of a RequestVote RPC.
*
* The RequestVote RPC is invoked by candidates to gather votes.
*/
struct raft_request_vote
{
int version;
raft_term term; /* Candidate's term. */
raft_id candidate_id; /* ID of the server requesting the vote. */
raft_index last_log_index; /* Index of candidate's last log entry. */
raft_index last_log_term; /* Term of log entry at last_log_index. */
bool disrupt_leader; /* True if current leader should be discarded. */
bool pre_vote; /* True if this is a pre-vote request. */
};
#define RAFT_REQUEST_VOTE_VERSION 2
/**
* Hold the result of a RequestVote RPC.
*/
struct raft_request_vote_result
{
int version;
raft_term
term; /* Receiver's current term (candidate updates itself). */
bool vote_granted; /* True means candidate received vote. */
bool pre_vote; /* The response to a pre-vote RequestVote or not. */
};
#define RAFT_REQUEST_VOTE_RESULT_VERSION 2
/**
* Hold the arguments of an AppendEntries RPC.
*
* The AppendEntries RPC is invoked by the leader to replicate log entries. It's
* also used as a heartbeat (figure 3.1).
*/
struct raft_append_entries
{
int version;
raft_term term; /* Leader's term. */
raft_index prev_log_index; /* Index of log entry preceding new ones. */
raft_term prev_log_term; /* Term of entry at prev_log_index. */
raft_index leader_commit; /* Leader's commit index. */
struct raft_entry *entries; /* Log entries to append. */
unsigned n_entries; /* Size of the log entries array. */
};
#define RAFT_APPEND_ENTRIES_VERSION 0
/**
* Hold the result of an AppendEntries RPC (figure 3.1).
*/
struct raft_append_entries_result
{
int version;
raft_term term; /* Receiver's current_term. */
raft_index rejected; /* If non-zero, the index that was rejected. */
raft_index
last_log_index; /* Receiver's last log entry index, as hint. */
raft_flags features; /* Feature flags. */
};
#define RAFT_APPEND_ENTRIES_RESULT_VERSION 1
typedef uint32_t checksum_t;
typedef uint32_t pageno_t;
struct page_checksum {
pageno_t page_no;
checksum_t checksum;
};
/* page range [from, to], with to included */
struct page_from_to {
pageno_t from;
pageno_t to;
};
enum raft_result {
RAFT_RESULT_OK = 0,
RAFT_RESULT_UNEXPECTED = 1,
RAFT_RESULT_DONE = 2,
};
/**
* Hold the arguments of an InstallSnapshot RPC (figure 5.3).
*/
struct raft_install_snapshot
{
int version;
raft_term term; /* Leader's term. */
raft_index last_index; /* Index of last entry in the snapshot. */
raft_term last_term; /* Term of last_index. */
struct raft_configuration conf; /* Config as of last_index. */
raft_index conf_index; /* Commit index of conf. */
struct raft_buffer data; /* Raw snapshot data. */
enum raft_result result;
};
#define RAFT_INSTALL_SNAPSHOT_VERSION 0
struct raft_install_snapshot_result {
int version;
enum raft_result result;
};
#define RAFT_INSTALL_SNAPSHOT_RESULT_VERSION 0
struct raft_signature {
int version;
const char *db;
struct page_from_to page_from_to;
pageno_t cs_page_no;
enum raft_result result;
};
#define RAFT_SIGNATURE_VERSION 0
struct raft_signature_result {
int version;
const char *db;
struct page_checksum *cs;
unsigned int cs_nr;
pageno_t cs_page_no;
enum raft_result result;
};
#define RAFT_SIGNATURE_RESULT_VERSION 0
struct raft_install_snapshot_mv {
int version;
const char *db;
struct page_from_to *mv;
unsigned int mv_nr;
enum raft_result result;
};
#define RAFT_INSTALL_SNAPSHOT_MV_VERSION 0
struct raft_install_snapshot_mv_result {
int version;
const char *db;
pageno_t last_known_page_no; /* used for retries and message losses */
enum raft_result result;
};
#define RAFT_INSTALL_SNAPSHOT_MV_RESULT_VERSION 0
struct raft_install_snapshot_cp {
int version;
const char *db;
pageno_t page_no;
struct raft_buffer page_data;
enum raft_result result;
};
#define RAFT_INSTALL_SNAPSHOT_CP_VERSION 0
struct raft_install_snapshot_cp_result {
int version;
pageno_t last_known_page_no; /* used for retries and message losses */
enum raft_result result;
};
#define RAFT_INSTALL_SNAPSHOT_CP_RESULT_VERSION 0
/**
* Hold the arguments of a TimeoutNow RPC.
*
* The TimeoutNow RPC is invoked by leaders to transfer leadership to a
* follower.
*/
struct raft_timeout_now
{
int version;
raft_term term; /* Leader's term. */
raft_index last_log_index; /* Index of leader's last log entry. */
raft_index last_log_term; /* Term of log entry at last_log_index. */
};
#define RAFT_TIMEOUT_NOW_VERSION 0
/**
* Type codes for RPC messages.
*/
enum {
RAFT_IO_APPEND_ENTRIES = 1,
RAFT_IO_APPEND_ENTRIES_RESULT,
RAFT_IO_REQUEST_VOTE,
RAFT_IO_REQUEST_VOTE_RESULT,
RAFT_IO_INSTALL_SNAPSHOT,
RAFT_IO_TIMEOUT_NOW,
RAFT_IO_SIGNATURE,
RAFT_IO_SIGNATURE_RESULT,
RAFT_IO_INSTALL_SNAPSHOT_RESULT,
RAFT_IO_INSTALL_SNAPSHOT_MV,
RAFT_IO_INSTALL_SNAPSHOT_MV_RESULT,
RAFT_IO_INSTALL_SNAPSHOT_CP,
RAFT_IO_INSTALL_SNAPSHOT_CP_RESULT,
};
/**
* A single RPC message that can be sent or received over the network.
*
* The RPC message types all have a `version` field.
* In the libuv io implementation, `version` is filled out during decoding
* and is based on the size of the message on the wire, see e.g.
* `sizeofRequestVoteV1`. The version number in the RAFT_MESSAGE_XXX_VERSION
* macro needs to be bumped every time the message is updated.
*
* Notes when adding a new message type to raft:
* raft_io implementations compiled against old versions of raft don't know the
* new message type and possibly have not allocated enough space for it. When
* such an application receives a new message over the wire, the raft_io
* implementation will err out or drop the message, because it doesn't know how
* to decode it based on its type.
* raft_io implementations compiled against versions of raft that know the new
* message type but at runtime are linked against an older raft lib, will pass
* the message to raft, where raft will drop it.
* When raft receives a message and accesses a field of a new message type,
* the raft_io implementation must have known about the new message type,
* so it was compiled against a modern enough version of raft, and memory
* accesses should be safe.
*
* Sending a new message type with a raft_io implementation that doesn't know
* the type is safe: the implementation should drop the message based on its
* type and will not try to access fields it doesn't know exist.
*/
struct raft_message
{
unsigned short type; /* RPC type code. */
raft_id server_id; /* ID of sending or destination server. */
const char
*server_address; /* Address of sending or destination server. */
union { /* Type-specific data */
struct raft_request_vote request_vote;
struct raft_request_vote_result request_vote_result;
struct raft_append_entries append_entries;
struct raft_append_entries_result append_entries_result;
struct raft_install_snapshot install_snapshot;
struct raft_install_snapshot_result install_snapshot_result;
struct raft_signature signature;
struct raft_signature_result signature_result;
struct raft_install_snapshot_cp install_snapshot_cp;
struct raft_install_snapshot_cp_result install_snapshot_cp_result;
struct raft_install_snapshot_mv install_snapshot_mv;
struct raft_install_snapshot_mv_result install_snapshot_mv_result;
struct raft_timeout_now timeout_now;
};
};
/**
* Hold the details of a snapshot.
* The user-provided raft_buffer structs should provide the user with enough
* flexibility to adapt/evolve snapshot formats.
* If this struct would NEED to be adapted in the future, raft can always move
* to a new struct with a new name and a new raft_io version.
*/
struct raft_snapshot
{
/* Index and term of last entry included in the snapshot. */
raft_index index;
raft_term term;
/* Last committed configuration included in the snapshot, along with the
* index it was committed at. */
struct raft_configuration configuration;
raft_index configuration_index;
/* Content of the snapshot. When a snapshot is taken, the user FSM can
* fill the bufs array with more than one buffer. When a snapshot is
* restored, there will always be a single buffer. */
struct raft_buffer *bufs;
unsigned n_bufs;
};
/**
* Asynchronous request to send an RPC message.
*/
struct raft_io_send;
typedef void (*raft_io_send_cb)(struct raft_io_send *req, int status);
struct raft_io_send
{
void *data; /* User data */
raft_io_send_cb cb; /* Request callback */
};
/**
* Asynchronous request to store new log entries.
*/
struct raft_io_append;
typedef void (*raft_io_append_cb)(struct raft_io_append *req, int status);
struct raft_io_append
{
void *data; /* User data */
raft_io_append_cb cb; /* Request callback */
};
/**
* Asynchronous request to store a new snapshot.
*/
struct raft_io_snapshot_put;
typedef void (*raft_io_snapshot_put_cb)(struct raft_io_snapshot_put *req,
int status);
struct raft_io_snapshot_put
{
void *data; /* User data */
raft_io_snapshot_put_cb cb; /* Request callback */
};
/**
* Asynchronous request to load the most recent snapshot available.
*/
struct raft_io_snapshot_get;
typedef void (*raft_io_snapshot_get_cb)(struct raft_io_snapshot_get *req,
struct raft_snapshot *snapshot,
int status);
struct raft_io_snapshot_get
{
void *data; /* User data */
raft_io_snapshot_get_cb cb; /* Request callback */
};
/**
* Asynchronous work request.
*/
struct raft_io_async_work;
typedef int (*raft_io_async_work_fn)(struct raft_io_async_work *req);
typedef void (*raft_io_async_work_cb)(struct raft_io_async_work *req,
int status);
struct raft_io_async_work
{
void *data; /* User data */
raft_io_async_work_fn
work; /* Function to run async from the main loop */
raft_io_async_work_cb cb; /* Request callback */
};
/**
* Customizable tracer, for debugging purposes.
*/
struct raft_tracer
{
/**
* Implementation-defined state object.
*/
void *impl;
/**
* Whether this tracer should emit messages.
*/
bool enabled;
/**
* Trace level.
*/
unsigned level;
/**
* Emit the given trace message, possibly decorating it with the
* provided metadata.
*/
void (*emit)(struct raft_tracer *t,
const char *file,
unsigned int line,
const char *func,
unsigned int level,
const char *message);
};
struct raft_io; /* Forward declaration. */
/**
* Callback invoked by the I/O implementation at regular intervals.
*/
typedef void (*raft_io_tick_cb)(struct raft_io *io);
/**
* Callback invoked by the I/O implementation when an RPC message is received.
*/
typedef void (*raft_io_recv_cb)(struct raft_io *io, struct raft_message *msg);
typedef void (*raft_io_close_cb)(struct raft_io *io);
/**
* version field MUST be filled out by user.
* When moving to a new version, the user MUST implement the newly added
* methods.
*/
struct raft_io
{
int version; /* 1 or 2 */
void *data;
void *impl;
char errmsg[RAFT_ERRMSG_BUF_SIZE];
int (*init)(struct raft_io *io, raft_id id, const char *address);
void (*close)(struct raft_io *io, raft_io_close_cb cb);
int (*load)(struct raft_io *io,
raft_term *term,
raft_id *voted_for,
struct raft_snapshot **snapshot,
raft_index *start_index,
struct raft_entry *entries[],
size_t *n_entries);
int (*start)(struct raft_io *io,
unsigned msecs,
raft_io_tick_cb tick,
raft_io_recv_cb recv);
int (*bootstrap)(struct raft_io *io,
const struct raft_configuration *conf);
int (*recover)(struct raft_io *io,
const struct raft_configuration *conf);
int (*set_term)(struct raft_io *io, raft_term term);
int (*set_vote)(struct raft_io *io, raft_id server_id);
int (*send)(struct raft_io *io,
struct raft_io_send *req,
const struct raft_message *message,
raft_io_send_cb cb);
int (*append)(struct raft_io *io,
struct raft_io_append *req,
const struct raft_entry entries[],
unsigned n,
raft_io_append_cb cb);
int (*truncate)(struct raft_io *io, raft_index index);
int (*snapshot_put)(struct raft_io *io,
unsigned trailing,
struct raft_io_snapshot_put *req,
const struct raft_snapshot *snapshot,
raft_io_snapshot_put_cb cb);
int (*snapshot_get)(struct raft_io *io,
struct raft_io_snapshot_get *req,
raft_io_snapshot_get_cb cb);
raft_time (*time)(struct raft_io *io);
int (*random)(struct raft_io *io, int min, int max);
/* Field(s) below added since version 2. */
int (*async_work)(struct raft_io *io,
struct raft_io_async_work *req,
raft_io_async_work_cb cb);
};
/**
* version field MUST be filled out by user.
* When moving to a new version, the user MUST initialize the new methods,
* either with an implementation or with NULL.
*
* version 2:
* introduces `snapshot_finalize`, when this method is not NULL, it will
* always run after a successful call to `snapshot`, whether the snapshot has
* been successfully written to disk or not. If it is set, raft will
* assume no ownership of any of the `raft_buffer`s and the responsibility to
* clean up lies with the user of raft.
* `snapshot_finalize` can be used to e.g. release a lock that was taken during
* a call to `snapshot`. Until `snapshot_finalize` is called, raft can access
* the data contained in the `raft_buffer`s.
*
* version 3:
* Adds support for async snapshots through the `snapshot_async` function.
* When this method is provided, raft will call `snapshot` in the main loop,
* and when successful, will call `snapshot_async` using the `io->async_work`
* method, so blocking I/O calls are allowed in the implementation. After the
* `snapshot_async` completes, `snapshot_finalize` will be called in the main
* loop, independent of the return value of `snapshot_async`.
* An implementation that does not use asynchronous snapshots MUST set
* `snapshot_async` to NULL.
* All memory allocated by the snapshot routines MUST be freed by the snapshot
* routines themselves.
*/
struct raft_fsm
{
int version; /* 1, 2 or 3 */
void *data;
int (*apply)(struct raft_fsm *fsm,
const struct raft_buffer *buf,
void **result);
int (*snapshot)(struct raft_fsm *fsm,
struct raft_buffer *bufs[],
unsigned *n_bufs);
int (*restore)(struct raft_fsm *fsm, struct raft_buffer *buf);
/* Fields below added since version 2. */
int (*snapshot_finalize)(struct raft_fsm *fsm,
struct raft_buffer *bufs[],
unsigned *n_bufs);
/* Fields below added since version 3. */
int (*snapshot_async)(struct raft_fsm *fsm,
struct raft_buffer *bufs[],
unsigned *n_bufs);
};
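/*
 * A minimal version-1 FSM sketch (the stub bodies are illustrative, not a
 * working state machine):
 *
 *     static int apply(struct raft_fsm *f, const struct raft_buffer *buf,
 *                      void **result)
 *     { (void)f; (void)buf; *result = NULL; return 0; }
 *     static int snapshot(struct raft_fsm *f, struct raft_buffer *bufs[],
 *                         unsigned *n_bufs)
 *     { (void)f; *bufs = NULL; *n_bufs = 0; return 0; }
 *     static int restore(struct raft_fsm *f, struct raft_buffer *buf)
 *     { (void)f; (void)buf; return 0; }
 *
 *     struct raft_fsm fsm = { .version = 1, .apply = apply,
 *                             .snapshot = snapshot, .restore = restore };
 */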
struct raft; /* Forward declaration. */
/**
* State codes.
*/
enum { RAFT_UNAVAILABLE, RAFT_FOLLOWER, RAFT_CANDIDATE, RAFT_LEADER };
/**
* State callback to invoke if raft's state changes.
*/
typedef void (*raft_state_cb)(struct raft *raft,
unsigned short old_state,
unsigned short new_state);
struct raft_progress;
/**
* Close callback.
*
* It's safe to release the memory of a raft instance only after this callback
* has fired.
*/
typedef void (*raft_close_cb)(struct raft *raft);
struct raft_change; /* Forward declaration */
struct raft_transfer; /* Forward declaration */
struct raft_log;
/**
* Hold and drive the state of a single raft server in a cluster.
* When replacing reserved fields in the middle of this struct, you MUST use a
* type with the same size and alignment requirements as the original type.
*/
struct raft
{
void *data; /* Custom user data. */
struct raft_tracer *tracer; /* Tracer implementation. */
struct raft_io *io; /* Disk and network I/O implementation. */
struct raft_fsm *fsm; /* User-defined FSM to apply commands to. */
raft_id id; /* Server ID of this raft instance. */
char *address; /* Server address of this raft instance. */
/*
* Cache of the server's persistent state, updated on stable storage
* before responding to RPCs (Figure 3.1).
*/
raft_term current_term; /* Latest term server has seen. */
raft_id voted_for; /* Candidate that received vote in current term. */
struct raft_log *log; /* Log entries. */
/*
* Current membership configuration (Chapter 4).
*
* At any given moment the current configuration can be committed or
* uncommitted.
*
* If a server is voting, the log entry with index 1 must always contain
* the first committed configuration.
*
* At all times #configuration_committed_index is either zero or is the
* index of the most recent log entry of type #RAFT_CHANGE that we know
* to be committed. That means #configuration_committed_index is always
* equal or lower than #commit_index.
*
* At all times #configuration_uncommitted_index is either zero or is
* the index of an uncommitted log entry of type #RAFT_CHANGE. There can
* be at most one uncommitted entry of type #RAFT_CHANGE because we
* allow only one configuration change at a time.
*
* At all times #configuration_last_snapshot is a copy of the
* configuration contained the most recent snapshot, if any.
*
* The possible scenarios are:
*
* 1. #configuration_committed_index and
* #configuration_uncommitted_index are both zero. This should only
* happen when a brand new server starts joining a cluster and is
* waiting to receive log entries from the current leader. In this case
* #configuration and #configuration_last_snapshot must be empty and
* have no servers.
*
* 2. #configuration_committed_index is non-zero and
* #configuration_uncommitted_index is zero. This means that
* #configuration is committed and there is no pending configuration
* change. The content of #configuration must match the one of the
* log entry at #configuration_committed_index.
*
* 3. #configuration_committed_index and
* #configuration_uncommitted_index are both non-zero, with the latter
* being greater than the former. This means that #configuration is
* uncommitted and represents a pending configuration change. The
* content of #configuration must match the one of the log entry at
* #configuration_uncommitted_index.
*
* When a snapshot is taken, a copy of the most recent configuration
* known to be committed (i.e. the configuration contained in the log
* entry at #configuration_committed_index) is saved in
* #configuration_last_snapshot, so it can be easily retrieved in case
* the log gets truncated because of compaction and does not contain the
* entry at #configuration_committed_index anymore. Likewise, if a
* snapshot is restored its associated configuration is saved in
* #configuration_last_snapshot.
*/
struct raft_configuration configuration;
struct raft_configuration configuration_last_snapshot;
raft_index configuration_committed_index;
raft_index configuration_uncommitted_index;
/*
* Election timeout in milliseconds (default 1000).
*
* From 3.4:
*
* Raft uses a heartbeat mechanism to trigger leader election. When
* servers start up, they begin as followers. A server remains in
* follower state as long as it receives valid RPCs from a leader or
* candidate. Leaders send periodic heartbeats (AppendEntries RPCs
* that carry no log entries) to all followers in order to maintain
* their authority. If a follower receives no communication over a
* period of time called the election timeout, then it assumes there is
* no viable leader and begins an election to choose a new leader.
*
* This is the baseline value and will be randomized between 1x and 2x.
*
* See raft_change_election_timeout() to customize the value of this
* attribute.
*/
unsigned election_timeout;
/*
* Heartbeat timeout in milliseconds (default 100). This is relevant
* only for when the raft instance is in leader state: empty
* AppendEntries RPCs will be sent if this amount of milliseconds
* elapses without any user-triggered AppendEntries RCPs being sent.
*
* From Figure 3.1:
*
* [Leaders] Send empty AppendEntries RPC during idle periods to
* prevent election timeouts.
*/
unsigned heartbeat_timeout;
/*
* When the leader sends an InstallSnapshot RPC to a follower it will
* consider the RPC as failed after this timeout and retry.
*/
unsigned install_snapshot_timeout;
/*
* The fields below hold the part of the server's volatile state which
* is always applicable regardless of whether the server is
* follower, candidate or leader (Figure 3.1). This state is rebuilt
* automatically after a server restart.
*/
raft_index commit_index; /* Highest log entry known to be committed */
raft_index last_applied; /* Highest log entry applied to the FSM */
raft_index last_stored; /* Highest log entry persisted on disk */
/*
* Current server state of this raft instance, along with a union
* defining state-specific values.
*/
unsigned short state;
union {
struct /* Follower */
{
unsigned
randomized_election_timeout; /* Timer expiration. */
struct /* Current leader info. */
{
raft_id id;
char *address;
} current_leader;
uint64_t append_in_flight_count;
uint64_t reserved[7]; /* Future use */
} follower_state;
struct
{
unsigned
randomized_election_timeout; /* Timer expiration. */
bool *votes; /* Vote results. */
bool disrupt_leader; /* For leadership transfer */
bool in_pre_vote; /* True in pre-vote phase. */
uint64_t reserved[8]; /* Future use */
} candidate_state;
struct
{
struct raft_progress
*progress; /* Per-server replication state. */
struct raft_change
*change; /* Pending membership change. */
raft_id promotee_id; /* ID of server being promoted. */
unsigned short round_number; /* Current sync round. */
raft_index
round_index; /* Target of the current round. */
raft_time round_start; /* Start of current round. */
queue requests; /* Outstanding client requests. */
uint32_t
voter_contacts; /* Current number of voting nodes we
are in contact with */
uint32_t reserved2; /* Future use */
uint64_t reserved[7]; /* Future use */
} leader_state;
};
/* Election timer start.
*
* This timer has different purposes depending on the state. Followers
* convert to candidate after the randomized election timeout has
* elapsed without leader contact. Candidates start a new election after
* the randomized election timeout has elapsed without a winner. Leaders
* step down after the election timeout has elapsed without contacting a
* majority of voting servers. */
raft_time election_timer_start;
/* In-progress leadership transfer request, if any. */
struct raft_transfer *transfer;
/*
* Information about the last snapshot that was taken (if any).
*/
struct
{
unsigned threshold; /* N. of entries before snapshot */
unsigned trailing; /* N. of trailing entries to retain */
struct raft_snapshot pending; /* In progress snapshot */
struct raft_io_snapshot_put put; /* Store snapshot request */
uint64_t reserved[8]; /* Future use */
} snapshot;
/*
* Callback to invoke once a close request has completed.
*/
raft_close_cb close_cb;
/*
* Human-readable message providing diagnostic information about the
* last error occurred.
*/
char errmsg[RAFT_ERRMSG_BUF_SIZE];
/* Whether to use pre-vote to avoid disconnected servers disrupting the
* current leader, as described in 4.2.3 and 9.6. */
bool pre_vote;
/* Limit how long to wait for a stand-by to catch up with the log when
* it's being promoted to voter. */
unsigned max_catch_up_rounds;
unsigned max_catch_up_round_duration;
/* uint64_t because we used a reserved field. In reality this is a pointer
* to a `struct raft_callbacks` that can be used to store e.g. various
* user-supplied callbacks. */
uint64_t callbacks;
/* Future extensions */
uint64_t reserved[31];
};
RAFT_API int raft_init(struct raft *r,
struct raft_io *io,
struct raft_fsm *fsm,
raft_id id,
const char *address);
RAFT_API void raft_close(struct raft *r, raft_close_cb cb);
/**
* This function MUST be called after raft_init and before raft_start.
* @cb will be called every time the raft state changes.
*/
RAFT_API void raft_register_state_cb(struct raft *r, raft_state_cb cb);
/**
* Bootstrap this raft instance using the given configuration. The instance must
* not have been started yet and must be completely pristine, otherwise
* #RAFT_CANTBOOTSTRAP will be returned.
*/
RAFT_API int raft_bootstrap(struct raft *r,
const struct raft_configuration *conf);
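/*
 * A bootstrap sketch for a single-node cluster (error handling elided;
 * the ID and address are illustrative, and `r` is assumed to be an
 * initialized but not yet started instance):
 *
 *     struct raft_configuration conf;
 *     raft_configuration_init(&conf);
 *     raft_configuration_add(&conf, 1, "127.0.0.1:9001", RAFT_VOTER);
 *     raft_bootstrap(&r, &conf);
 *     raft_configuration_close(&conf);
 */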
/**
* Force a new configuration in order to recover from a loss of quorum where the
* current configuration cannot be restored, such as when a majority of servers
* die at the same time.
*
* This works by appending the new configuration directly to the log stored on
* disk.
*
* In order for this operation to be safe you must follow these steps:
*
* 1. Make sure that no servers in the cluster are running, either because they
* died or because you manually stopped them.
*
* 2. Run @raft_recover exactly one time, on the non-dead server which has
* the highest term and the longest log.
*
* 3. Copy the data directory of the server you ran @raft_recover on to all
* other non-dead servers in the cluster, replacing their current data
* directory.
*
* 4. Restart all servers.
*/
RAFT_API int raft_recover(struct raft *r,
const struct raft_configuration *conf);
RAFT_API int raft_start(struct raft *r);
/**
* Set the election timeout.
*
* Every raft instance is initialized with a default election timeout of 1000
* milliseconds. If you wish to tweak it, call this function before starting
* your event loop.
*
* From Chapter 9:
*
* We recommend a range that is 10-20 times the one-way network latency, which
* keeps split vote rates under 40% in all cases for reasonably sized
* clusters, and typically results in much lower rates.
*
* Note that the current random election timer will be reset and a new one
* will be generated.
*/
RAFT_API void raft_set_election_timeout(struct raft *r, unsigned msecs);
/**
* Set the heartbeat timeout.
*/
RAFT_API void raft_set_heartbeat_timeout(struct raft *r, unsigned msecs);
/**
* Set the snapshot install timeout.
*/
RAFT_API void raft_set_install_snapshot_timeout(struct raft *r, unsigned msecs);
/**
* Number of outstanding log entries before starting a new snapshot. The default
* is 1024.
*/
RAFT_API void raft_set_snapshot_threshold(struct raft *r, unsigned n);
/**
* Enable or disable pre-vote support. Pre-vote is turned off by default.
*/
RAFT_API void raft_set_pre_vote(struct raft *r, bool enabled);
/**
* Number of outstanding log entries to keep in the log after a snapshot has
* been taken. This avoids sending snapshots when a follower is behind by just a
* few entries. The default is 128.
*/
RAFT_API void raft_set_snapshot_trailing(struct raft *r, unsigned n);
/**
* Set the maximum number of catch-up rounds to try when replicating entries
* to a stand-by server that is being promoted to voter, before giving up and
* failing the configuration change. The default is 10.
*/
RAFT_API void raft_set_max_catch_up_rounds(struct raft *r, unsigned n);
/**
* Set the maximum duration of a catch-up round when replicating entries to a
* stand-by server that is being promoted to voter. The default is 5 seconds.
*/
RAFT_API void raft_set_max_catch_up_round_duration(struct raft *r,
unsigned msecs);
/**
* Return a human-readable description of the last error occurred.
*/
RAFT_API const char *raft_errmsg(struct raft *r);
/**
* Return the code of the current raft state (follower/candidate/leader).
*/
RAFT_API int raft_state(struct raft *r);
/**
* Return the code of the current raft role (spare/standby/voter),
* or -1 if this server is not in the current configuration.
*/
RAFT_API int raft_role(struct raft *r);
/**
* Return the ID and address of the current known leader, if any.
*/
RAFT_API void raft_leader(struct raft *r, raft_id *id, const char **address);
/**
* Return the index of the last entry that was appended to the local log.
*/
RAFT_API raft_index raft_last_index(struct raft *r);
/**
* Return the index of the last entry that was applied to the local FSM.
*/
RAFT_API raft_index raft_last_applied(struct raft *r);
/**
* Return the number of voting servers that the leader has recently been in
* contact with. This can be used to help determine whether the cluster may be
* in a degraded/at risk state.
*
* Returns valid values >= 1, because a leader is always in contact with
* itself.
* Returns -1 if called on a follower.
*
* Note that the value returned may be out of date, and so should not be relied
* upon for absolute correctness.
*/
RAFT_API int raft_voter_contacts(struct raft *r);
/**
* Common fields across client request types.
* `req_id`, `client_id` and `unique_id` are currently unused.
* `reserved` fields should be replaced by new members with the same size
* and alignment requirements as `uint64_t`.
*/
#define RAFT__REQUEST \
void *data; \
int type; \
raft_index index; \
queue queue; \
uint8_t req_id[16]; \
uint8_t client_id[16]; \
uint8_t unique_id[16]; \
uint64_t reserved[4]
/**
* Asynchronous request to append a new command entry to the log and apply it to
* the FSM when a quorum is reached.
*/
struct raft_apply;
typedef void (*raft_apply_cb)(struct raft_apply *req, int status, void *result);
struct raft_apply
{
RAFT__REQUEST;
raft_apply_cb cb;
};
/**
* Propose to append commands to the log and apply them to the FSM once
* committed.
*
* If this server is the leader, it will create @n new log entries of type
* #RAFT_COMMAND using the given buffers as their payloads, append them to its
* own log and attempt to replicate them on other servers by sending
* AppendEntries RPCs.
*
* The memory pointed at by the @base attribute of each #raft_buffer in the
* given array must have been allocated with raft_malloc() or a compatible
* allocator. If this function returns 0, the ownership of this memory is
* implicitly transferred to the raft library, which will take care of releasing
* it when appropriate. Any further client access to such memory leads to
* undefined behavior.
*
* The ownership of the memory of the @bufs array itself is not transferred to
* the raft library, and, if allocated dynamically, must be deallocated by the
* caller.
*
* If the command was successfully applied, r->last_applied will be equal to
* the log entry index of the applied command when the cb is invoked.
*/
RAFT_API int raft_apply(struct raft *r,
struct raft_apply *req,
const struct raft_buffer bufs[],
const struct raft_entry_local_data local_data[],
const unsigned n,
raft_apply_cb cb);
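/* Example (illustrative sketch, not part of the API): propose a single
 * command. The payload is copied into a buffer allocated with raft_malloc(),
 * since on success its ownership passes to the library; NULL is passed for
 * the local data array, assuming no per-entry local data is needed.
 *
 *     static void applyCb(struct raft_apply *req, int status, void *result)
 *     {
 *         (void)result;
 *         if (status != 0) {
 *             fprintf(stderr, "apply failed: %d\n", status);
 *         }
 *         raft_free(req);
 *     }
 *
 *     static int propose(struct raft *r, const void *cmd, size_t len)
 *     {
 *         struct raft_apply *req = raft_malloc(sizeof *req);
 *         struct raft_buffer buf;
 *         int rv;
 *         if (req == NULL) {
 *             return RAFT_NOMEM;
 *         }
 *         buf.len = len;
 *         buf.base = raft_malloc(buf.len);
 *         if (buf.base == NULL) {
 *             raft_free(req);
 *             return RAFT_NOMEM;
 *         }
 *         memcpy(buf.base, cmd, len);
 *         rv = raft_apply(r, req, &buf, NULL, 1, applyCb);
 *         if (rv != 0) {
 *             raft_free(buf.base);
 *             raft_free(req);
 *         }
 *         return rv;
 *     }
 */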
/**
* Asynchronous request to append a barrier entry.
*/
struct raft_barrier;
typedef void (*raft_barrier_cb)(struct raft_barrier *req, int status);
struct raft_barrier
{
RAFT__REQUEST;
raft_barrier_cb cb;
};
/**
* Propose to append a log entry of type #RAFT_BARRIER.
*
* This can be used to ensure that there are no unapplied commands.
*/
RAFT_API int raft_barrier(struct raft *r,
struct raft_barrier *req,
raft_barrier_cb cb);
/**
* Asynchronous request to change the raft configuration.
*/
typedef void (*raft_change_cb)(struct raft_change *req, int status);
struct raft_change
{
RAFT__REQUEST;
raft_change_cb cb;
};
/**
* Add a new server to the cluster configuration. Its initial role will be
* #RAFT_SPARE.
*/
RAFT_API int raft_add(struct raft *r,
struct raft_change *req,
raft_id id,
const char *address,
raft_change_cb cb);
/**
* Assign a new role to the given server.
*
 * If the server already has the given role, or if the given role is unknown,
* #RAFT_BADROLE is returned.
*/
RAFT_API int raft_assign(struct raft *r,
struct raft_change *req,
raft_id id,
int role,
raft_change_cb cb);
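/* Example (illustrative sketch): grow the cluster in the two usual steps,
 * adding server 4 as a spare and then, once the add is committed, promoting
 * it to voter from the first request's callback. Assumes a file-scope
 * struct raft *r; the ID and address are placeholders.
 *
 *     static struct raft_change add_req;
 *     static struct raft_change assign_req;
 *
 *     static void assignCb(struct raft_change *req, int status)
 *     {
 *         (void)req;
 *         if (status != 0) {
 *             fprintf(stderr, "promotion failed: %d\n", status);
 *         }
 *     }
 *
 *     static void addCb(struct raft_change *req, int status)
 *     {
 *         (void)req;
 *         if (status == 0) {
 *             raft_assign(r, &assign_req, 4, RAFT_VOTER, assignCb);
 *         }
 *     }
 *
 *     raft_add(r, &add_req, 4, "10.0.0.4:9001", addCb);
 */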
/**
* Remove the given server from the cluster configuration.
*/
RAFT_API int raft_remove(struct raft *r,
struct raft_change *req,
raft_id id,
raft_change_cb cb);
/**
* Asynchronous request to transfer leadership.
*/
typedef void (*raft_transfer_cb)(struct raft_transfer *req);
struct raft_transfer
{
RAFT__REQUEST;
raft_id id; /* ID of target server. */
raft_time start; /* Start of leadership transfer. */
struct raft_io_send send; /* For sending TimeoutNow */
raft_transfer_cb cb; /* User callback */
};
/**
* Transfer leadership to the server with the given ID.
*
* If the target server is not part of the configuration, or it's the leader
* itself, or it's not a #RAFT_VOTER, then #RAFT_BADID is returned.
*
* The special value #0 means to automatically select a voting follower to
* transfer leadership to. If there are no voting followers, return
* #RAFT_NOTFOUND.
*
* When this server detects that the target server has become the leader, or
* when @election_timeout milliseconds have elapsed, the given callback will be
* invoked.
*
 * After the callback fires, clients can check whether the operation was
* successful or not by calling @raft_leader() and checking if it returns the
* target server.
*/
RAFT_API int raft_transfer(struct raft *r,
struct raft_transfer *req,
raft_id id,
raft_transfer_cb cb);
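/* Example (illustrative sketch): ask the leader to hand leadership over to an
 * automatically selected voter (ID 0), then inspect the outcome in the
 * callback via raft_leader(). The raft instance is stashed in the request's
 * @data field.
 *
 *     static struct raft_transfer transfer_req;
 *
 *     static void transferCb(struct raft_transfer *req)
 *     {
 *         struct raft *r = req->data;
 *         raft_id id;
 *         const char *address;
 *         raft_leader(r, &id, &address);
 *         fprintf(stderr, "leader is now server %llu\n", id);
 *     }
 *
 *     transfer_req.data = r;
 *     rv = raft_transfer(r, &transfer_req, 0, transferCb);
 */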
/**
* User-definable dynamic memory allocation functions.
*
* The @data field will be passed as first argument to all functions.
*/
struct raft_heap
{
void *data; /* User data */
void *(*malloc)(void *data, size_t size);
void (*free)(void *data, void *ptr);
void *(*calloc)(void *data, size_t nmemb, size_t size);
void *(*realloc)(void *data, void *ptr, size_t size);
void *(*aligned_alloc)(void *data, size_t alignment, size_t size);
void (*aligned_free)(void *data, size_t alignment, void *ptr);
};
DQLITE_VISIBLE_TO_TESTS void *raft_malloc(size_t size);
DQLITE_VISIBLE_TO_TESTS void raft_free(void *ptr);
DQLITE_VISIBLE_TO_TESTS void *raft_calloc(size_t nmemb, size_t size);
DQLITE_VISIBLE_TO_TESTS void *raft_realloc(void *ptr, size_t size);
DQLITE_VISIBLE_TO_TESTS void *raft_aligned_alloc(size_t alignment, size_t size);
DQLITE_VISIBLE_TO_TESTS void raft_aligned_free(size_t alignment, void *ptr);
/**
* Use a custom dynamic memory allocator.
*/
DQLITE_VISIBLE_TO_TESTS void raft_heap_set(struct raft_heap *heap);
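/* Example (illustrative sketch): plug in a pass-through allocator that just
 * delegates to the stdlib; a real one might add accounting or fault
 * injection. Note that C11 aligned_alloc() expects @size to be a multiple of
 * @alignment.
 *
 *     static void *myMalloc(void *data, size_t size)
 *     {
 *         (void)data;
 *         return malloc(size);
 *     }
 *     static void myFree(void *data, void *ptr)
 *     {
 *         (void)data;
 *         free(ptr);
 *     }
 *     static void *myCalloc(void *data, size_t nmemb, size_t size)
 *     {
 *         (void)data;
 *         return calloc(nmemb, size);
 *     }
 *     static void *myRealloc(void *data, void *ptr, size_t size)
 *     {
 *         (void)data;
 *         return realloc(ptr, size);
 *     }
 *     static void *myAlignedAlloc(void *data, size_t alignment, size_t size)
 *     {
 *         (void)data;
 *         return aligned_alloc(alignment, size);
 *     }
 *     static void myAlignedFree(void *data, size_t alignment, void *ptr)
 *     {
 *         (void)data;
 *         (void)alignment;
 *         free(ptr);
 *     }
 *
 *     static struct raft_heap heap = { NULL,     myMalloc,  myFree,
 *                                      myCalloc, myRealloc, myAlignedAlloc,
 *                                      myAlignedFree };
 *
 *     raft_heap_set(&heap);
 */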
/**
* Use the default dynamic memory allocator (from the stdlib). This clears any
* custom allocator specified with @raft_heap_set.
*/
DQLITE_VISIBLE_TO_TESTS void raft_heap_set_default(void);
/**
* Return a reference to the current dynamic memory allocator.
*
* This is intended for use by applications that want to temporarily replace
* and then restore the original allocator, or that want to defer to the
* original allocator in some circumstances.
*
* The behavior of attempting to mutate the default allocator through the
* pointer returned by this function, including attempting to deallocate
* the backing memory, is undefined.
*/
DQLITE_VISIBLE_TO_TESTS const struct raft_heap *raft_heap_get(void);
#undef RAFT__REQUEST
struct raft_uv_transport;
/**
* Configure the given @raft_io instance to use a libuv-based I/O
* implementation.
*
 * The @dir path will be copied, and its memory can be released once this
 * function returns.
*
* Return #RAFT_NAMETOOLONG if @dir exceeds the size of the internal buffer
 * that should hold it.
*
* Return #RAFT_NOTFOUND if @dir does not exist.
*
* Return #RAFT_INVALID if @dir exists but it's not a directory.
*
* The implementation of metadata and log persistency is virtually the same as
* the one found in LogCabin [0].
*
* The disk files consist of metadata files, closed segments, and open
* segments. Metadata files are used to track Raft metadata, such as the
* server's current term, vote, and log's start index. Segments contain
* contiguous entries that are part of the log. Closed segments are never
* written to again (but may be renamed and truncated if a suffix of the log is
* truncated). Open segments are where newly appended entries go. Once an open
* segment reaches the maximum allowed size, it is closed and a new one is used.
*
* Metadata files are named "metadata1" and "metadata2". The code alternates
* between these so that there is always at least one readable metadata file.
* On boot, the readable metadata file with the higher version number is used.
*
* The format of a metadata file is:
*
* [8 bytes] Format (currently 1).
* [8 bytes] Incremental version number.
* [8 bytes] Current term.
* [8 bytes] ID of server we voted for.
*
* Closed segments are named by the format string "%lu-%lu" with their
* start and end indexes, both inclusive. Closed segments always contain at
* least one entry; the end index is always at least as large as the start
* index. Closed segment files may occasionally include data past their
* filename's end index (these are ignored but a warning is logged). This can
* happen if the suffix of the segment is truncated and a crash occurs at an
* inopportune time (the segment file is first renamed, then truncated, and a
* crash occurs in between).
*
* Open segments are named by the format string "open-%lu" with a unique
* number. These should not exist when the server shuts down cleanly, but they
* exist while the server is running and may be left around during a crash.
* Open segments either contain entries which come after the last closed
* segment or are full of zeros. When the server crashes while appending to an
* open segment, the end of that file may be corrupt. We can't distinguish
* between a corrupt file and a partially written entry. The code assumes it's
* a partially written entry, logs a warning, and ignores it.
*
* Truncating a suffix of the log will remove all entries that are no longer
* part of the log. Truncating a prefix of the log will only remove complete
* segments that are before the new log start index. For example, if a
* segment has entries 10 through 20 and the prefix of the log is truncated to
* start at entry 15, that entire segment will be retained.
*
* Each segment file starts with a segment header, which currently contains
* just an 8-byte version number for the format of that segment. The current
* format (version 1) is just a concatenation of serialized entry batches.
*
* Each batch has the following format:
*
* [4 bytes] CRC32 checksum of the batch header, little endian.
* [4 bytes] CRC32 checksum of the batch data, little endian.
* [ ... ] Batch (as described in @raft_decode_entries_batch).
*
* [0] https://github.com/logcabin/logcabin/blob/master/Storage/SegmentedLog.h
*/
RAFT_API int raft_uv_init(struct raft_io *io,
struct uv_loop_s *loop,
const char *dir,
struct raft_uv_transport *transport);
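/* Example (illustrative sketch): wire up the libuv backend with the TCP
 * transport. Error checking is omitted; the data directory, server ID and
 * address are placeholders, and @fsm is assumed to be a user-initialized
 * struct raft_fsm.
 *
 *     struct uv_loop_s loop;
 *     struct raft_uv_transport transport;
 *     struct raft_io io;
 *     struct raft raft;
 *
 *     uv_loop_init(&loop);
 *     transport.version = 1;
 *     raft_uv_tcp_init(&transport, &loop);
 *     raft_uv_init(&io, &loop, "/var/lib/my-app", &transport);
 *     raft_init(&raft, &io, &fsm, 1, "127.0.0.1:9001");
 *     raft_start(&raft);
 *     uv_run(&loop, UV_RUN_DEFAULT);
 */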
/**
* Release any memory allocated internally.
*/
RAFT_API void raft_uv_close(struct raft_io *io);
/**
* Set the block size that will be used for direct I/O.
*
* The default is to automatically detect the appropriate block size.
*/
RAFT_API void raft_uv_set_block_size(struct raft_io *io, size_t size);
/**
* Set the maximum initial size of newly created open segments.
*
* If the given size is not a multiple of the block size, the actual size will
* be reduced to the closest multiple.
*
* The default is 8 megabytes.
*/
RAFT_API void raft_uv_set_segment_size(struct raft_io *io, size_t size);
/**
* Turn snapshot compression on or off.
 * Returns non-0 on failure; this can happen, e.g., when compression is
 * requested but no suitable compression library is found.
*
* By default snapshots are compressed if the appropriate libraries are found.
*/
RAFT_API int raft_uv_set_snapshot_compression(struct raft_io *io,
bool compressed);
/**
* Set how many milliseconds to wait between subsequent retries when
* establishing a connection with another server. The default is 1000
* milliseconds.
*/
RAFT_API void raft_uv_set_connect_retry_delay(struct raft_io *io,
unsigned msecs);
/**
* Emit low-level debug messages using the given tracer.
*/
RAFT_API void raft_uv_set_tracer(struct raft_io *io,
struct raft_tracer *tracer);
/**
* Enable or disable auto-recovery on startup. Default enabled.
*/
RAFT_API void raft_uv_set_auto_recovery(struct raft_io *io, bool flag);
/**
* Callback invoked by the transport implementation when a new incoming
* connection has been established.
*
* No references to @address must be kept after this function returns.
*
 * Ownership of @stream is transferred to user code, which is responsible for
* uv_close()'ing it and then releasing its memory.
*/
typedef void (*raft_uv_accept_cb)(struct raft_uv_transport *t,
raft_id id,
const char *address,
struct uv_stream_s *stream);
/**
* Callback invoked by the transport implementation after a connect request has
* completed. If status is #0, then @stream will point to a valid handle, which
 * user code is then responsible for uv_close()'ing and then releasing.
*/
struct raft_uv_connect;
typedef void (*raft_uv_connect_cb)(struct raft_uv_connect *req,
struct uv_stream_s *stream,
int status);
/**
* Handle to a connect request.
*/
struct raft_uv_connect
{
void *data; /* User data */
raft_uv_connect_cb cb; /* Callback */
};
/**
* Callback invoked by the transport implementation after a close request is
* completed.
*/
typedef void (*raft_uv_transport_close_cb)(struct raft_uv_transport *t);
/**
* Interface to establish outgoing connections to other Raft servers and to
* accept incoming connections from them.
*/
struct raft_uv_transport
{
/**
	 * Keeps track of the struct version; MUST be filled out by the user.
* When moving to a new version, the user MUST implement the newly added
* methods.
* Latest version is 1.
*/
int version;
/**
* User defined data.
*/
void *data;
/**
* Implementation-defined state.
*/
void *impl;
/**
* Human-readable message providing diagnostic information about the
	 * last error that occurred.
*/
char errmsg[RAFT_ERRMSG_BUF_SIZE];
/**
* Initialize the transport with the given server's identity.
*/
int (*init)(struct raft_uv_transport *t,
raft_id id,
const char *address);
/**
* Start listening for incoming connections.
*
	 * Once a new connection is accepted, the @cb callback passed to this
	 * method must be invoked with the relevant details of the
* connecting Raft server.
*/
int (*listen)(struct raft_uv_transport *t, raft_uv_accept_cb cb);
/**
* Connect to the server with the given ID and address.
*
* The @cb callback must be invoked when the connection has been
* established or the connection attempt has failed. The memory pointed
* by @req can be released only after @cb has fired.
*/
int (*connect)(struct raft_uv_transport *t,
struct raft_uv_connect *req,
raft_id id,
const char *address,
raft_uv_connect_cb cb);
/**
* Close the transport.
*
* The implementation must:
*
* - Stop accepting incoming connections. The @cb callback passed to
* @listen must not be invoked anymore.
*
* - Cancel all pending @connect requests.
*
* - Invoke the @cb callback passed to this method once it's safe to
* release the memory of the transport object.
*/
void (*close)(struct raft_uv_transport *t,
raft_uv_transport_close_cb cb);
};
/**
* Init a transport interface that uses TCP sockets.
*/
RAFT_API int raft_uv_tcp_init(struct raft_uv_transport *t,
struct uv_loop_s *loop);
/**
* Release any memory allocated internally.
*/
RAFT_API void raft_uv_tcp_close(struct raft_uv_transport *t);
/**
* Set the IP address and port that the listening socket will bind to.
*
* By default the socket will bind to the address provided in
* raft_init(), which may be inconvenient if running your application in a
* container, for example.
*
* The @address argument must be an IPv4 dotted quad IP address and port, e.g.
* "0.0.0.0:8080". If you do not provide a port, the default of 8080 will be
* used. The port given here *must* match the port given to raft_init().
*
* Must be called before raft_init().
*/
RAFT_API int raft_uv_tcp_set_bind_address(struct raft_uv_transport *t,
const char *address);
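/* Example: when running in a container, listen on all interfaces while still
 * advertising the address that was passed to raft_init():
 *
 *     raft_uv_tcp_set_bind_address(&transport, "0.0.0.0:9001");
 */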
/**
* Raft cluster test fixture, using an in-memory @raft_io implementation. This
* is meant to be used in unit tests.
*/
#define RAFT_FIXTURE_MAX_SERVERS 8
/**
* Fixture step event types.
*/
enum {
RAFT_FIXTURE_TICK = 1, /* The tick callback has been invoked */
RAFT_FIXTURE_NETWORK, /* A network request has been sent or received */
RAFT_FIXTURE_DISK, /* An I/O request has been submitted */
RAFT_FIXTURE_WORK /* A large, CPU and/or memory intensive task */
};
/**
* State of a single server in a cluster fixture.
*/
struct raft_fixture_server;
/**
* Information about a test cluster event triggered by the fixture.
*/
struct raft_fixture_event;
/**
* Returns the type of the event.
*/
int raft_fixture_event_type(struct raft_fixture_event *event);
/**
* Returns the server index of the event.
*/
unsigned raft_fixture_event_server_index(struct raft_fixture_event *event);
/**
* Event callback. See raft_fixture_hook().
*/
struct raft_fixture;
typedef void (*raft_fixture_event_cb)(struct raft_fixture *f,
struct raft_fixture_event *event);
/**
* Test implementation of a cluster of @n servers, each having a user-provided
* FSM.
*
* The cluster can simulate network latency and time elapsed on individual
* servers.
*
* Servers can be alive or dead. Network messages sent to dead servers are
* dropped. Dead servers do not have their @raft_io_tick_cb callback invoked.
*
* Any two servers can be connected or disconnected. Network messages sent
* between disconnected servers are dropped.
*/
struct raft_fixture
{
raft_time time; /* Global time, common to all servers. */
unsigned n; /* Number of servers. */
raft_id leader_id; /* ID of current leader, or 0 if none. */
struct raft_log *log; /* Copy of current leader's log. */
raft_index commit_index; /* Current commit index on leader. */
struct raft_fixture_event *event; /* Last event occurred. */
raft_fixture_event_cb hook; /* Event callback. */
struct raft_fixture_server *servers[RAFT_FIXTURE_MAX_SERVERS];
uint64_t reserved[16]; /* For future expansion of struct. */
};
/**
* Initialize a raft cluster fixture. Servers can be added by using
* `raft_fixture_grow`.
*/
RAFT_API int raft_fixture_init(struct raft_fixture *f);
/**
* Release all memory used by the fixture.
*/
RAFT_API void raft_fixture_close(struct raft_fixture *f);
/**
* Convenience to generate a configuration object containing all servers in the
* cluster. The first @n_voting servers will be voting ones.
*/
RAFT_API int raft_fixture_configuration(struct raft_fixture *f,
unsigned n_voting,
struct raft_configuration *conf);
/**
* Convenience to bootstrap all servers in the cluster using the given
* configuration.
*/
RAFT_API int raft_fixture_bootstrap(struct raft_fixture *f,
struct raft_configuration *conf);
/**
* Convenience to start all servers in the fixture.
*/
RAFT_API int raft_fixture_start(struct raft_fixture *f);
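/* Example (illustrative sketch): set up a 3-server test cluster with all
 * servers voting, and elect server 0. Assumes an array @fsms of three
 * user-initialized FSMs; error checking is omitted.
 *
 *     struct raft_fixture f;
 *     struct raft_configuration conf;
 *     unsigned i;
 *
 *     raft_fixture_init(&f);
 *     for (i = 0; i < 3; i++) {
 *         raft_fixture_grow(&f, &fsms[i]);
 *     }
 *     raft_fixture_configuration(&f, 3, &conf);
 *     raft_fixture_bootstrap(&f, &conf);
 *     raft_configuration_close(&conf);
 *     raft_fixture_start(&f);
 *     raft_fixture_elect(&f, 0);
 */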
/**
* Return the number of servers in the fixture.
*/
RAFT_API unsigned raft_fixture_n(struct raft_fixture *f);
/**
* Return the current cluster global time. All raft instances see the same time.
*/
RAFT_API raft_time raft_fixture_time(struct raft_fixture *f);
/**
* Return the raft instance associated with the @i'th server of the fixture.
*/
RAFT_API struct raft *raft_fixture_get(struct raft_fixture *f, unsigned i);
/**
* Return @true if the @i'th server hasn't been killed.
*/
RAFT_API bool raft_fixture_alive(struct raft_fixture *f, unsigned i);
/**
* Return the index of the current leader, or the current number of servers if
* there's no leader.
*/
RAFT_API unsigned raft_fixture_leader_index(struct raft_fixture *f);
/**
 * Return the ID of the server the @i'th server has voted for, or zero if it
 * hasn't voted.
*/
RAFT_API raft_id raft_fixture_voted_for(struct raft_fixture *f, unsigned i);
/**
* Drive the cluster so the @i'th server starts an election but doesn't
* necessarily win it.
*
* This is achieved by bumping the randomized election timeout of all other
 * servers to a very high value, letting that of the @i'th server expire.
*
* There must currently be no leader and no candidate and the given server must
* be a voting one. Also, the @i'th server must be connected to a majority of
* voting servers.
*/
RAFT_API void raft_fixture_start_elect(struct raft_fixture *f, unsigned i);
/**
* Calls raft_fixture_start_elect, but waits and asserts that the @i'th server
* has become the leader.
*/
RAFT_API void raft_fixture_elect(struct raft_fixture *f, unsigned i);
/**
* Drive the cluster so the current leader gets deposed.
*
* This is achieved by dropping all AppendEntries result messages sent by
* followers to the leader, until the leader decides to step down because it has
* lost connectivity to a majority of followers.
*/
RAFT_API void raft_fixture_depose(struct raft_fixture *f);
/**
* Step through the cluster state advancing the time to the minimum value needed
* for it to make progress (i.e. for a message to be delivered, for an I/O
* operation to complete or for a single time tick to occur).
*
* In particular, the following happens:
*
* 1. If there are pending #raft_io_send requests, that have been submitted
* using #raft_io->send() and not yet sent, the oldest one is picked and the
* relevant callback fired. This simulates completion of a socket write,
* which means that the send request has been completed. The receiver does
* not immediately receives the message, as the message is propagating
* through the network. However any memory associated with the #raft_io_send
* request can be released (e.g. log entries). The in-memory I/O
* implementation assigns a latency to each RPC message, which will get
* delivered to the receiver only after that amount of time elapses. If the
* sender and the receiver are currently disconnected, the RPC message is
* simply dropped. If a callback was fired, jump directly to 3. and skip 2.
*
 * 2. All pending #raft_io_append disk writes across all servers that have been
* submitted using #raft_io->append() but not yet completed, are scanned and
* the one with the lowest completion time is picked. All in-flight network
* messages waiting to be delivered are scanned and the one with the lowest
* delivery time is picked. All servers are scanned, and the one with the
* lowest tick expiration time is picked. The three times are compared and
* the lowest one is picked. If a #raft_io_append disk write has completed,
* the relevant callback will be invoked, if there's a network message to be
* delivered, the receiver's @raft_io_recv_cb callback gets fired, if a tick
* timer has expired the relevant #raft_io->tick() callback will be
* invoked. Only one event will be fired. If there is more than one event to
* fire, one of them is picked according to the following rules: events for
* servers with lower index are fired first, tick events take precedence over
* disk events, and disk events take precedence over network events.
*
* 3. The current cluster leader is detected (if any). When detecting the leader
* the Election Safety property is checked: no servers can be in leader state
* for the same term. The server in leader state with the highest term is
* considered the current cluster leader, as long as it's "stable", i.e. it
* has been acknowledged by all servers connected to it, and those servers
* form a majority (this means that no further leader change can happen,
* unless the network gets disrupted). If there is a stable leader and it has
* not changed with respect to the previous call to @raft_fixture_step(),
* then the Leader Append-Only property is checked, by comparing its log with
* a copy of it that was taken during the previous iteration.
*
* 4. If there is a stable leader, its current log is copied, in order to be
* able to check the Leader Append-Only property at the next call.
*
* 5. If there is a stable leader, its commit index gets copied.
*
* The function returns information about which particular event occurred
* (either in step 1 or 2).
*/
RAFT_API struct raft_fixture_event *raft_fixture_step(struct raft_fixture *f);
/**
* Call raft_fixture_step() exactly @n times, and return the last event fired.
*/
RAFT_API struct raft_fixture_event *raft_fixture_step_n(struct raft_fixture *f,
unsigned n);
/**
* Step the cluster until the given @stop function returns #true, or @max_msecs
* have elapsed.
*
* Return #true if the @stop function has returned #true within @max_msecs.
*/
RAFT_API bool raft_fixture_step_until(struct raft_fixture *f,
bool (*stop)(struct raft_fixture *f,
void *arg),
void *arg,
unsigned max_msecs);
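/* Example (illustrative sketch): step the cluster until server 1 has applied
 * a given index, using a custom predicate (raft_fixture_step_until_applied()
 * below is a ready-made version of this).
 *
 *     static bool hasApplied(struct raft_fixture *f, void *arg)
 *     {
 *         raft_index index = *(raft_index *)arg;
 *         return raft_last_applied(raft_fixture_get(f, 1)) >= index;
 *     }
 *
 *     raft_index target = 3;
 *     bool done = raft_fixture_step_until(&f, hasApplied, &target, 5000);
 */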
/**
* Step the cluster until @msecs have elapsed.
*/
RAFT_API void raft_fixture_step_until_elapsed(struct raft_fixture *f,
unsigned msecs);
/**
* Step the cluster until a leader is elected, or @max_msecs have elapsed.
*/
RAFT_API bool raft_fixture_step_until_has_leader(struct raft_fixture *f,
unsigned max_msecs);
/**
* Step the cluster until the current leader gets deposed, or @max_msecs have
* elapsed.
*/
RAFT_API bool raft_fixture_step_until_has_no_leader(struct raft_fixture *f,
unsigned max_msecs);
/**
* Step the cluster until the @i'th server has applied the entry at the given
* index, or @max_msecs have elapsed. If @i equals the number of servers, then
* step until all servers have applied the given entry.
*/
RAFT_API bool raft_fixture_step_until_applied(struct raft_fixture *f,
unsigned i,
raft_index index,
unsigned max_msecs);
/**
* Step the cluster until the state of the @i'th server matches the given one,
* or @max_msecs have elapsed.
*/
RAFT_API bool raft_fixture_step_until_state_is(struct raft_fixture *f,
unsigned i,
int state,
unsigned max_msecs);
/**
* Step the cluster until the term of the @i'th server matches the given one,
* or @max_msecs have elapsed.
*/
RAFT_API bool raft_fixture_step_until_term_is(struct raft_fixture *f,
unsigned i,
raft_term term,
unsigned max_msecs);
/**
* Step the cluster until the @i'th server has voted for the @j'th one, or
* @max_msecs have elapsed.
*/
RAFT_API bool raft_fixture_step_until_voted_for(struct raft_fixture *f,
unsigned i,
unsigned j,
unsigned max_msecs);
/**
* Step the cluster until all pending network messages from the @i'th server to
* the @j'th server have been delivered, or @max_msecs have elapsed.
*/
RAFT_API bool raft_fixture_step_until_delivered(struct raft_fixture *f,
unsigned i,
unsigned j,
unsigned max_msecs);
/**
* Set a function to be called after every time a fixture event occurs as
* consequence of a step.
*/
RAFT_API void raft_fixture_hook(struct raft_fixture *f,
raft_fixture_event_cb hook);
/**
* Disconnect the @i'th and the @j'th servers, so attempts to send a message
* from @i to @j will fail with #RAFT_NOCONNECTION.
*/
RAFT_API void raft_fixture_disconnect(struct raft_fixture *f,
unsigned i,
unsigned j);
/**
* Reconnect the @i'th and the @j'th servers, so attempts to send a message
* from @i to @j will succeed again.
*/
RAFT_API void raft_fixture_reconnect(struct raft_fixture *f,
unsigned i,
unsigned j);
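/* Example (illustrative sketch): partition server 2 away from servers 0 and 1
 * and later heal it. Since the doc comments above describe the effect per
 * direction, both directions of each link are cut:
 *
 *     raft_fixture_disconnect(&f, 0, 2);
 *     raft_fixture_disconnect(&f, 2, 0);
 *     raft_fixture_disconnect(&f, 1, 2);
 *     raft_fixture_disconnect(&f, 2, 1);
 *
 *     raft_fixture_reconnect(&f, 0, 2);
 *     raft_fixture_reconnect(&f, 2, 0);
 *     raft_fixture_reconnect(&f, 1, 2);
 *     raft_fixture_reconnect(&f, 2, 1);
 */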
/**
* Saturate the connection between the @i'th and the @j'th servers, so messages
* sent by @i to @j will be silently dropped.
*/
RAFT_API void raft_fixture_saturate(struct raft_fixture *f,
unsigned i,
unsigned j);
/**
* Return true if the connection from the @i'th to the @j'th server has been set
* as saturated.
*/
RAFT_API bool raft_fixture_saturated(struct raft_fixture *f,
unsigned i,
unsigned j);
/**
* Desaturate the connection between the @i'th and the @j'th servers, so
* messages sent by @i to @j will start being delivered again.
*/
RAFT_API void raft_fixture_desaturate(struct raft_fixture *f,
unsigned i,
unsigned j);
/**
* Kill the server with the given index. The server won't receive any message
* and its tick callback won't be invoked.
*/
RAFT_API void raft_fixture_kill(struct raft_fixture *f, unsigned i);
/**
* Revive a killed server with the given index.
*/
RAFT_API void raft_fixture_revive(struct raft_fixture *f, unsigned i);
/**
* Add a new empty server to the cluster and connect it to all others.
*/
RAFT_API int raft_fixture_grow(struct raft_fixture *f, struct raft_fsm *fsm);
/**
* Set the value that will be returned to the @i'th raft instance when it asks
* the underlying #raft_io implementation for a randomized election timeout
* value. The default value is 1000 + @i * 100, meaning that the election timer
* of server 0 will expire first.
*/
RAFT_API void raft_fixture_set_randomized_election_timeout(
struct raft_fixture *f,
unsigned i,
unsigned msecs);
/**
* Set the network latency in milliseconds. Each RPC message sent by the @i'th
* server from now on will take @msecs milliseconds to be delivered. The default
* value is 15.
*/
RAFT_API void raft_fixture_set_network_latency(struct raft_fixture *f,
unsigned i,
unsigned msecs);
/**
* Set the disk I/O latency in milliseconds. Each append request will take this
* amount of milliseconds to complete. The default value is 10.
*/
RAFT_API void raft_fixture_set_disk_latency(struct raft_fixture *f,
unsigned i,
unsigned msecs);
/**
 * Set the send latency in milliseconds. Each message send will take this many
* milliseconds before the send callback is invoked.
*/
RAFT_API void raft_fixture_set_send_latency(struct raft_fixture *f,
unsigned i,
unsigned j,
unsigned msecs);
/**
* Set the persisted term of the @i'th server.
*/
RAFT_API void raft_fixture_set_term(struct raft_fixture *f,
unsigned i,
raft_term term);
/**
* Set the most recent persisted snapshot on the @i'th server.
*/
RAFT_API void raft_fixture_set_snapshot(struct raft_fixture *f,
unsigned i,
struct raft_snapshot *snapshot);
/**
* Add an entry to the persisted entries of the @i'th server.
*/
RAFT_API void raft_fixture_add_entry(struct raft_fixture *f,
unsigned i,
struct raft_entry *entry);
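/**
 * Inject transient I/O failures in the @i'th server, for the append, vote
 * persistence, term persistence and message send operations respectively.
 * The @delay argument is the number of operations to let through before
 * failing (this is a best-effort summary; see the in-memory @raft_io
 * implementation for the exact semantics).
 */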
RAFT_API void raft_fixture_append_fault(struct raft_fixture *f,
unsigned i,
int delay);
RAFT_API void raft_fixture_vote_fault(struct raft_fixture *f,
unsigned i,
int delay);
RAFT_API void raft_fixture_term_fault(struct raft_fixture *f,
unsigned i,
int delay);
RAFT_API void raft_fixture_send_fault(struct raft_fixture *f,
unsigned i,
int delay);
/**
* Return the number of messages of the given type that the @i'th server has
* successfully sent so far.
*/
RAFT_API unsigned raft_fixture_n_send(struct raft_fixture *f,
unsigned i,
int type);
/**
* Return the number of messages of the given type that the @i'th server has
* received so far.
*/
RAFT_API unsigned raft_fixture_n_recv(struct raft_fixture *f,
unsigned i,
int type);
/**
* Force the @i'th server into the UNAVAILABLE state.
*/
RAFT_API void raft_fixture_make_unavailable(struct raft_fixture *f, unsigned i);
#endif /* RAFT_H */
dqlite-1.16.7/src/raft/ 0000775 0000000 0000000 00000000000 14652527134 0014624 5 ustar 00root root 0000000 0000000 dqlite-1.16.7/src/raft/array.h 0000664 0000000 0000000 00000001654 14652527134 0016121 0 ustar 00root root 0000000 0000000 /* Macros to manipulate contiguous arrays. */
#ifndef ARRAY_H_
#define ARRAY_H_
#include "../raft.h"
/* Append item I of type T to array A which currently has N items.
*
 * A and N must both be pointers. Set RV to -1 in case of failure. */
#define ARRAY__APPEND(T, I, A, N, RV) \
{ \
T *tmp_array; \
tmp_array = raft_realloc(*A, (*N + 1) * sizeof **A); \
if (tmp_array != NULL) { \
(*N)++; \
*A = tmp_array; \
(*A)[(*N) - 1] = I; \
RV = 0; \
} else { \
RV = -1; \
} \
}
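/* Example (illustrative sketch): append an item to a dynamic array.
 *
 *     struct raft_server *servers = NULL;
 *     unsigned n = 0;
 *     struct raft_server server = { 0 };
 *     int rv;
 *
 *     ARRAY__APPEND(struct raft_server, server, &servers, &n, rv);
 *     if (rv == -1) {
 *         return RAFT_NOMEM;
 *     }
 */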
#endif /* ARRAY_H_ */
dqlite-1.16.7/src/raft/assert.h 0000664 0000000 0000000 00000002662 14652527134 0016304 0 ustar 00root root 0000000 0000000 /* Define the assert() macro, either as the standard one or the test one. */
#ifndef ASSERT_H_
#define ASSERT_H_
#if defined(RAFT_TEST)
extern void munit_errorf_ex(const char *filename,
int line,
const char *format,
...);
#define assert(expr) \
	do { \
		if (!(expr)) { \
			munit_errorf_ex(__FILE__, __LINE__, \
					"assertion failed: %s", #expr); \
		} \
	} while (0)
#elif defined(NDEBUG)
#define assert(x) \
do { \
(void)sizeof(x); \
} while (0)
#elif defined(RAFT_ASSERT_WITH_BACKTRACE)
#include <assert.h> /* for __assert_fail */
#include <backtrace.h>
#include <stdio.h>
#undef assert
#define assert(x) \
do { \
struct backtrace_state *state_; \
if (!(x)) { \
state_ = backtrace_create_state(NULL, 0, NULL, NULL); \
backtrace_print(state_, 0, stderr); \
__assert_fail(#x, __FILE__, __LINE__, __func__); \
} \
} while (0)
#else
#include <assert.h>
#endif
#endif /* ASSERT_H_ */
dqlite-1.16.7/src/raft/byte.c 0000664 0000000 0000000 00000030474 14652527134 0015743 0 ustar 00root root 0000000 0000000 #include "byte.h"
/* Taken from https://github.com/gcc-mirror/gcc/blob/master/libiberty/crc32.c */
static const unsigned byteCrcTable[] = {
0x00000000, 0x04c11db7, 0x09823b6e, 0x0d4326d9, 0x130476dc, 0x17c56b6b,
0x1a864db2, 0x1e475005, 0x2608edb8, 0x22c9f00f, 0x2f8ad6d6, 0x2b4bcb61,
0x350c9b64, 0x31cd86d3, 0x3c8ea00a, 0x384fbdbd, 0x4c11db70, 0x48d0c6c7,
0x4593e01e, 0x4152fda9, 0x5f15adac, 0x5bd4b01b, 0x569796c2, 0x52568b75,
0x6a1936c8, 0x6ed82b7f, 0x639b0da6, 0x675a1011, 0x791d4014, 0x7ddc5da3,
0x709f7b7a, 0x745e66cd, 0x9823b6e0, 0x9ce2ab57, 0x91a18d8e, 0x95609039,
0x8b27c03c, 0x8fe6dd8b, 0x82a5fb52, 0x8664e6e5, 0xbe2b5b58, 0xbaea46ef,
0xb7a96036, 0xb3687d81, 0xad2f2d84, 0xa9ee3033, 0xa4ad16ea, 0xa06c0b5d,
0xd4326d90, 0xd0f37027, 0xddb056fe, 0xd9714b49, 0xc7361b4c, 0xc3f706fb,
0xceb42022, 0xca753d95, 0xf23a8028, 0xf6fb9d9f, 0xfbb8bb46, 0xff79a6f1,
0xe13ef6f4, 0xe5ffeb43, 0xe8bccd9a, 0xec7dd02d, 0x34867077, 0x30476dc0,
0x3d044b19, 0x39c556ae, 0x278206ab, 0x23431b1c, 0x2e003dc5, 0x2ac12072,
0x128e9dcf, 0x164f8078, 0x1b0ca6a1, 0x1fcdbb16, 0x018aeb13, 0x054bf6a4,
0x0808d07d, 0x0cc9cdca, 0x7897ab07, 0x7c56b6b0, 0x71159069, 0x75d48dde,
0x6b93dddb, 0x6f52c06c, 0x6211e6b5, 0x66d0fb02, 0x5e9f46bf, 0x5a5e5b08,
0x571d7dd1, 0x53dc6066, 0x4d9b3063, 0x495a2dd4, 0x44190b0d, 0x40d816ba,
0xaca5c697, 0xa864db20, 0xa527fdf9, 0xa1e6e04e, 0xbfa1b04b, 0xbb60adfc,
0xb6238b25, 0xb2e29692, 0x8aad2b2f, 0x8e6c3698, 0x832f1041, 0x87ee0df6,
0x99a95df3, 0x9d684044, 0x902b669d, 0x94ea7b2a, 0xe0b41de7, 0xe4750050,
0xe9362689, 0xedf73b3e, 0xf3b06b3b, 0xf771768c, 0xfa325055, 0xfef34de2,
0xc6bcf05f, 0xc27dede8, 0xcf3ecb31, 0xcbffd686, 0xd5b88683, 0xd1799b34,
0xdc3abded, 0xd8fba05a, 0x690ce0ee, 0x6dcdfd59, 0x608edb80, 0x644fc637,
0x7a089632, 0x7ec98b85, 0x738aad5c, 0x774bb0eb, 0x4f040d56, 0x4bc510e1,
0x46863638, 0x42472b8f, 0x5c007b8a, 0x58c1663d, 0x558240e4, 0x51435d53,
0x251d3b9e, 0x21dc2629, 0x2c9f00f0, 0x285e1d47, 0x36194d42, 0x32d850f5,
0x3f9b762c, 0x3b5a6b9b, 0x0315d626, 0x07d4cb91, 0x0a97ed48, 0x0e56f0ff,
0x1011a0fa, 0x14d0bd4d, 0x19939b94, 0x1d528623, 0xf12f560e, 0xf5ee4bb9,
0xf8ad6d60, 0xfc6c70d7, 0xe22b20d2, 0xe6ea3d65, 0xeba91bbc, 0xef68060b,
0xd727bbb6, 0xd3e6a601, 0xdea580d8, 0xda649d6f, 0xc423cd6a, 0xc0e2d0dd,
0xcda1f604, 0xc960ebb3, 0xbd3e8d7e, 0xb9ff90c9, 0xb4bcb610, 0xb07daba7,
0xae3afba2, 0xaafbe615, 0xa7b8c0cc, 0xa379dd7b, 0x9b3660c6, 0x9ff77d71,
0x92b45ba8, 0x9675461f, 0x8832161a, 0x8cf30bad, 0x81b02d74, 0x857130c3,
0x5d8a9099, 0x594b8d2e, 0x5408abf7, 0x50c9b640, 0x4e8ee645, 0x4a4ffbf2,
0x470cdd2b, 0x43cdc09c, 0x7b827d21, 0x7f436096, 0x7200464f, 0x76c15bf8,
0x68860bfd, 0x6c47164a, 0x61043093, 0x65c52d24, 0x119b4be9, 0x155a565e,
0x18197087, 0x1cd86d30, 0x029f3d35, 0x065e2082, 0x0b1d065b, 0x0fdc1bec,
0x3793a651, 0x3352bbe6, 0x3e119d3f, 0x3ad08088, 0x2497d08d, 0x2056cd3a,
0x2d15ebe3, 0x29d4f654, 0xc5a92679, 0xc1683bce, 0xcc2b1d17, 0xc8ea00a0,
0xd6ad50a5, 0xd26c4d12, 0xdf2f6bcb, 0xdbee767c, 0xe3a1cbc1, 0xe760d676,
0xea23f0af, 0xeee2ed18, 0xf0a5bd1d, 0xf464a0aa, 0xf9278673, 0xfde69bc4,
0x89b8fd09, 0x8d79e0be, 0x803ac667, 0x84fbdbd0, 0x9abc8bd5, 0x9e7d9662,
0x933eb0bb, 0x97ffad0c, 0xafb010b1, 0xab710d06, 0xa6322bdf, 0xa2f33668,
0xbcb4666d, 0xb8757bda, 0xb5365d03, 0xb1f740b4};
unsigned byteCrc32(const void *buf, const size_t size, const unsigned init)
{
unsigned crc = init;
uint8_t *cursor = (uint8_t *)buf;
size_t count = size;
while (count--) {
crc = (crc << 8) ^ byteCrcTable[((crc >> 24) ^ *cursor) & 255];
cursor++;
}
return crc;
}
/* ================ sha1.c ================ */
/*
SHA-1 in C
By Steve Reid
100% Public Domain
Test Vectors (from FIPS PUB 180-1)
"abc"
A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D
"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"
84983E44 1C3BD26E BAAE4AA1 F95129E5 E54670F1
A million repetitions of "a"
34AA973C D4C4DAA4 F61EEB2B DBAD2731 6534016F
*/
/* #define LITTLE_ENDIAN * This should be #define'd already, if true. */
/* #define SHA1HANDSOFF * Copies data before messing with it. */
#define SHA1HANDSOFF
#include <stdio.h>
#include <string.h>
#include <sys/types.h> /* for u_int*_t */
#if defined(__sun)
#include "solarisfixes.h"
#endif
#ifndef BYTE_ORDER
#if (BSD >= 199103)
#include <machine/endian.h>
#else
#if defined(linux) || defined(__linux__)
#include <endian.h>
#else
#define LITTLE_ENDIAN 1234 /* least-significant byte first (vax, pc) */
#define BIG_ENDIAN 4321 /* most-significant byte first (IBM, net) */
#define PDP_ENDIAN 3412 /* LSB first in word, MSW first in long (pdp)*/
#if defined(vax) || defined(ns32000) || defined(sun386) || \
defined(__i386__) || defined(MIPSEL) || defined(_MIPSEL) || \
defined(BIT_ZERO_ON_RIGHT) || defined(__alpha__) || defined(__alpha)
#define BYTE_ORDER LITTLE_ENDIAN
#endif
#if defined(sel) || defined(pyr) || defined(mc68000) || defined(sparc) || \
defined(is68k) || defined(tahoe) || defined(ibm032) || defined(ibm370) || \
defined(MIPSEB) || defined(_MIPSEB) || defined(_IBMR2) || defined(DGUX) || \
defined(apollo) || defined(__convex__) || defined(_CRAY) || \
defined(__hppa) || defined(__hp9000) || defined(__hp9000s300) || \
defined(__hp9000s700) || defined(BIT_ZERO_ON_LEFT) || defined(m68k) || \
defined(__sparc)
#define BYTE_ORDER BIG_ENDIAN
#endif
#endif /* linux */
#endif /* BSD */
#endif /* BYTE_ORDER */
#if defined(__BYTE_ORDER) && !defined(BYTE_ORDER)
#if (__BYTE_ORDER == __LITTLE_ENDIAN)
#define BYTE_ORDER LITTLE_ENDIAN
#else
#define BYTE_ORDER BIG_ENDIAN
#endif
#endif
#if !defined(BYTE_ORDER) || \
(BYTE_ORDER != BIG_ENDIAN && BYTE_ORDER != LITTLE_ENDIAN && \
BYTE_ORDER != PDP_ENDIAN)
/* you must determine what the correct bit order is for
* your compiler - the next line is an intentional error
* which will force your compiles to bomb until you fix
* the above macros.
*/
#error "Undefined or invalid BYTE_ORDER"
#endif
#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
/* blk0() and blk() perform the initial expand. */
/* I got the idea of expanding during the round function from SSLeay */
#if BYTE_ORDER == LITTLE_ENDIAN
#define blk0(i) \
(block->l[i] = (rol(block->l[i], 24) & 0xFF00FF00) | \
(rol(block->l[i], 8) & 0x00FF00FF))
#elif BYTE_ORDER == BIG_ENDIAN
#define blk0(i) block->l[i]
#else
#error "Endianness not defined!"
#endif
#define blk(i) \
(block->l[i & 15] = \
rol(block->l[(i + 13) & 15] ^ block->l[(i + 8) & 15] ^ \
block->l[(i + 2) & 15] ^ block->l[i & 15], \
1))
/* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */
#define R0(v, w, x, y, z, i) \
z += ((w & (x ^ y)) ^ y) + blk0(i) + 0x5A827999 + rol(v, 5); \
w = rol(w, 30);
#define R1(v, w, x, y, z, i) \
z += ((w & (x ^ y)) ^ y) + blk(i) + 0x5A827999 + rol(v, 5); \
w = rol(w, 30);
#define R2(v, w, x, y, z, i) \
z += (w ^ x ^ y) + blk(i) + 0x6ED9EBA1 + rol(v, 5); \
w = rol(w, 30);
#define R3(v, w, x, y, z, i) \
z += (((w | x) & y) | (w & x)) + blk(i) + 0x8F1BBCDC + rol(v, 5); \
w = rol(w, 30);
#define R4(v, w, x, y, z, i) \
z += (w ^ x ^ y) + blk(i) + 0xCA62C1D6 + rol(v, 5); \
w = rol(w, 30);
static void byteSha1Transform(uint32_t state[5], const uint8_t buffer[64])
{
uint32_t a, b, c, d, e;
typedef union {
uint8_t c[64];
uint32_t l[16];
} CHAR64LONG16;
#ifdef SHA1HANDSOFF
CHAR64LONG16 block[1]; /* use array to appear as a pointer */
memcpy(block, buffer, 64);
#else
/* The following had better never be used because it causes the
* pointer-to-const buffer to be cast into a pointer to non-const.
* And the result is written through. I threw a "const" in, hoping
* this will cause a diagnostic.
*/
CHAR64LONG16 *block = (const CHAR64LONG16 *)buffer;
#endif
/* Copy context->state[] to working vars */
a = state[0];
b = state[1];
c = state[2];
d = state[3];
e = state[4];
/* 4 rounds of 20 operations each. Loop unrolled. */
R0(a, b, c, d, e, 0);
R0(e, a, b, c, d, 1);
R0(d, e, a, b, c, 2);
R0(c, d, e, a, b, 3);
R0(b, c, d, e, a, 4);
R0(a, b, c, d, e, 5);
R0(e, a, b, c, d, 6);
R0(d, e, a, b, c, 7);
R0(c, d, e, a, b, 8);
R0(b, c, d, e, a, 9);
R0(a, b, c, d, e, 10);
R0(e, a, b, c, d, 11);
R0(d, e, a, b, c, 12);
R0(c, d, e, a, b, 13);
R0(b, c, d, e, a, 14);
R0(a, b, c, d, e, 15);
R1(e, a, b, c, d, 16);
R1(d, e, a, b, c, 17);
R1(c, d, e, a, b, 18);
R1(b, c, d, e, a, 19);
R2(a, b, c, d, e, 20);
R2(e, a, b, c, d, 21);
R2(d, e, a, b, c, 22);
R2(c, d, e, a, b, 23);
R2(b, c, d, e, a, 24);
R2(a, b, c, d, e, 25);
R2(e, a, b, c, d, 26);
R2(d, e, a, b, c, 27);
R2(c, d, e, a, b, 28);
R2(b, c, d, e, a, 29);
R2(a, b, c, d, e, 30);
R2(e, a, b, c, d, 31);
R2(d, e, a, b, c, 32);
R2(c, d, e, a, b, 33);
R2(b, c, d, e, a, 34);
R2(a, b, c, d, e, 35);
R2(e, a, b, c, d, 36);
R2(d, e, a, b, c, 37);
R2(c, d, e, a, b, 38);
R2(b, c, d, e, a, 39);
R3(a, b, c, d, e, 40);
R3(e, a, b, c, d, 41);
R3(d, e, a, b, c, 42);
R3(c, d, e, a, b, 43);
R3(b, c, d, e, a, 44);
R3(a, b, c, d, e, 45);
R3(e, a, b, c, d, 46);
R3(d, e, a, b, c, 47);
R3(c, d, e, a, b, 48);
R3(b, c, d, e, a, 49);
R3(a, b, c, d, e, 50);
R3(e, a, b, c, d, 51);
R3(d, e, a, b, c, 52);
R3(c, d, e, a, b, 53);
R3(b, c, d, e, a, 54);
R3(a, b, c, d, e, 55);
R3(e, a, b, c, d, 56);
R3(d, e, a, b, c, 57);
R3(c, d, e, a, b, 58);
R3(b, c, d, e, a, 59);
R4(a, b, c, d, e, 60);
R4(e, a, b, c, d, 61);
R4(d, e, a, b, c, 62);
R4(c, d, e, a, b, 63);
R4(b, c, d, e, a, 64);
R4(a, b, c, d, e, 65);
R4(e, a, b, c, d, 66);
R4(d, e, a, b, c, 67);
R4(c, d, e, a, b, 68);
R4(b, c, d, e, a, 69);
R4(a, b, c, d, e, 70);
R4(e, a, b, c, d, 71);
R4(d, e, a, b, c, 72);
R4(c, d, e, a, b, 73);
R4(b, c, d, e, a, 74);
R4(a, b, c, d, e, 75);
R4(e, a, b, c, d, 76);
R4(d, e, a, b, c, 77);
R4(c, d, e, a, b, 78);
R4(b, c, d, e, a, 79);
/* Add the working vars back into context.state[] */
state[0] += a;
state[1] += b;
state[2] += c;
state[3] += d;
state[4] += e;
/* Wipe variables */
a = b = c = d = e = 0;
#ifdef SHA1HANDSOFF
memset(block, '\0', sizeof(block));
#endif
}
void byteSha1Init(struct byteSha1 *s)
{
/* SHA1 initialization constants */
s->state[0] = 0x67452301;
s->state[1] = 0xEFCDAB89;
s->state[2] = 0x98BADCFE;
s->state[3] = 0x10325476;
s->state[4] = 0xC3D2E1F0;
s->count[0] = s->count[1] = 0;
}
/* Run your data through this. */
void __attribute__((noinline))
byteSha1Update(struct byteSha1 *s, const uint8_t *data, uint32_t len)
{
uint32_t i;
uint32_t j;
j = s->count[0];
if ((s->count[0] += len << 3) < j)
s->count[1]++;
s->count[1] += (len >> 29);
j = (j >> 3) & 63;
if ((j + len) > 63) {
memcpy(&s->buffer[j], data, (i = 64 - j));
byteSha1Transform(s->state, s->buffer);
for (; i + 63 < len; i += 64) {
byteSha1Transform(s->state, &data[i]);
}
j = 0;
} else
i = 0;
memcpy(&s->buffer[j], &data[i], len - i);
}
/* Add padding and return the message digest. */
void byteSha1Digest(struct byteSha1 *s, uint8_t value[20])
{
unsigned i;
uint8_t finalcount[8];
uint8_t c;
#if 0 /* untested "improvement" by DHR */
/* Convert context->count to a sequence of bytes
* in finalcount. Second element first, but
* big-endian order within element.
* But we do it all backwards.
*/
uint8_t *fcp = &finalcount[8];
for (i = 0; i < 2; i++)
{
u_int32_t t = context->count[i];
int j;
for (j = 0; j < 4; t >>= 8, j++)
*--fcp = (uint8_t) t
}
#else
for (i = 0; i < 8; i++) {
finalcount[i] = (uint8_t)((s->count[(i >= 4 ? 0 : 1)] >>
((3 - (i & 3)) * 8)) &
255); /* Endian independent */
}
#endif
c = 0200;
byteSha1Update(s, &c, 1);
while ((s->count[0] & 504) != 448) {
c = 0000;
byteSha1Update(s, &c, 1);
}
byteSha1Update(s, finalcount, 8); /* Should cause a SHA1Transform() */
for (i = 0; i < 20; i++) {
value[i] =
(uint8_t)((s->state[i >> 2] >> ((3 - (i & 3)) * 8)) & 255);
}
/* Wipe variables */
memset(s, '\0', sizeof(*s));
memset(&finalcount, '\0', sizeof(finalcount));
}
/* ================ end of sha1.c ================ */
dqlite-1.16.7/src/raft/byte.h 0000664 0000000 0000000 00000007405 14652527134 0015746 0 ustar 00root root 0000000 0000000 /* Byte-level utilities. */
#ifndef BYTE_H_
#define BYTE_H_
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#if defined(__cplusplus)
#define BYTE__INLINE inline
#else
#if defined(__clang__)
#define BYTE__INLINE static inline __attribute__((unused))
#else
#define BYTE__INLINE static inline
#endif
#endif
/* Compile-time endianess detection (best effort). */
#if (defined(__BYTE_ORDER) && (__BYTE_ORDER == __LITTLE_ENDIAN)) || \
(defined(__ARMEL__) && (__ARMEL__ == 1))
#define BYTE__LITTLE_ENDIAN
#elif defined(__BYTE_ORDER) && (__BYTE_ORDER == __BIG_ENDIAN) && \
defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 8
#define RAFT__BIG_ENDIAN
#endif
/* Flip a 32-bit number to network byte order (little endian) */
BYTE__INLINE uint32_t byteFlip32(uint32_t v)
{
#if defined(BYTE__LITTLE_ENDIAN)
return v;
#elif defined(RAFT__BIG_ENDIAN)
return __builtin_bswap32(v);
#else /* Unknown endianess */
union {
uint32_t u;
uint8_t v[4];
} s;
s.v[0] = (uint8_t)v;
s.v[1] = (uint8_t)(v >> 8);
s.v[2] = (uint8_t)(v >> 16);
s.v[3] = (uint8_t)(v >> 24);
return s.u;
#endif
}
/* Flip a 64-bit number to network byte order (little endian) */
BYTE__INLINE uint64_t byteFlip64(uint64_t v)
{
#if defined(BYTE__LITTLE_ENDIAN)
return v;
#elif defined(RAFT__BIG_ENDIAN)
return __builtin_bswap64(v);
#else
union {
uint64_t u;
uint8_t v[8];
} s;
s.v[0] = (uint8_t)v;
s.v[1] = (uint8_t)(v >> 8);
s.v[2] = (uint8_t)(v >> 16);
s.v[3] = (uint8_t)(v >> 24);
s.v[4] = (uint8_t)(v >> 32);
s.v[5] = (uint8_t)(v >> 40);
s.v[6] = (uint8_t)(v >> 48);
s.v[7] = (uint8_t)(v >> 56);
return s.u;
#endif
}
BYTE__INLINE void bytePut8(void **cursor, uint8_t value)
{
uint8_t **p = (uint8_t **)cursor;
**p = value;
*p += 1;
}
BYTE__INLINE void bytePut32(void **cursor, uint32_t value)
{
unsigned i;
uint32_t flipped = byteFlip32(value);
for (i = 0; i < sizeof(uint32_t); i++) {
bytePut8(cursor, ((uint8_t *)(&flipped))[i]);
}
}
BYTE__INLINE void bytePut64(void **cursor, uint64_t value)
{
unsigned i;
uint64_t flipped = byteFlip64(value);
for (i = 0; i < sizeof(uint64_t); i++) {
bytePut8(cursor, ((uint8_t *)(&flipped))[i]);
}
}
BYTE__INLINE void bytePutString(void **cursor, const char *value)
{
char **p = (char **)cursor;
strcpy(*p, value);
*p += strlen(value) + 1;
}
BYTE__INLINE uint8_t byteGet8(const void **cursor)
{
const uint8_t **p = (const uint8_t **)cursor;
uint8_t value = **p;
*p += 1;
return value;
}
BYTE__INLINE uint32_t byteGet32(const void **cursor)
{
uint32_t value = 0;
unsigned i;
for (i = 0; i < sizeof(uint32_t); i++) {
((uint8_t *)(&value))[i] = byteGet8(cursor);
}
return byteFlip32(value);
}
BYTE__INLINE uint64_t byteGet64(const void **cursor)
{
uint64_t value = 0;
unsigned i;
for (i = 0; i < sizeof(uint64_t); i++) {
((uint8_t *)(&value))[i] = byteGet8(cursor);
}
return byteFlip64(value);
}
BYTE__INLINE const char *byteGetString(const void **cursor, size_t max_len)
{
const char **p = (const char **)cursor;
const char *value = *p;
size_t len = 0;
while (len < max_len) {
if (*(*p + len) == 0) {
break;
}
len++;
}
if (len == max_len) {
return NULL;
}
*p += len + 1;
return value;
}
/* Add padding to size if it's not a multiple of 8. */
BYTE__INLINE size_t bytePad64(size_t size)
{
size_t rest = size % sizeof(uint64_t);
if (rest != 0) {
size += sizeof(uint64_t) - rest;
}
return size;
}
/* Calculate the CRC32 checksum of the given data buffer. */
unsigned byteCrc32(const void *buf, size_t size, unsigned init);
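/* Example (illustrative sketch): encode two integers into a buffer, compute
 * a checksum over it, then decode them back.
 *
 *     uint8_t buf[12];
 *     void *cursor = buf;
 *     const void *read = buf;
 *     unsigned crc;
 *
 *     bytePut32(&cursor, 7);
 *     bytePut64(&cursor, 42);
 *     crc = byteCrc32(buf, sizeof buf, 0);
 *     assert(byteGet32(&read) == 7);
 *     assert(byteGet64(&read) == 42);
 */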
struct byteSha1
{
uint32_t state[5];
uint32_t count[2];
uint8_t buffer[64];
uint8_t value[20];
};
void byteSha1Init(struct byteSha1 *s);
void byteSha1Update(struct byteSha1 *s, const uint8_t *data, uint32_t len);
void byteSha1Digest(struct byteSha1 *s, uint8_t value[20]);
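/* Example (illustrative sketch): compute the SHA-1 digest of "abc"; per the
 * FIPS PUB 180-1 test vector quoted in byte.c, the expected value is
 * A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D.
 *
 *     struct byteSha1 s;
 *     uint8_t digest[20];
 *
 *     byteSha1Init(&s);
 *     byteSha1Update(&s, (const uint8_t *)"abc", 3);
 *     byteSha1Digest(&s, digest);
 */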
#endif /* BYTE_H_ */
dqlite-1.16.7/src/raft/callbacks.c 0000664 0000000 0000000 00000000737 14652527134 0016716 0 ustar 00root root 0000000 0000000 #include "callbacks.h"
#include "heap.h"
int raftInitCallbacks(struct raft *r)
{
r->callbacks = 0;
struct raft_callbacks *cbs = RaftHeapCalloc(1, sizeof(*cbs));
if (cbs == NULL) {
return RAFT_NOMEM;
}
r->callbacks = (uint64_t)(uintptr_t)cbs;
return 0;
}
void raftDestroyCallbacks(struct raft *r)
{
RaftHeapFree((void *)(uintptr_t)r->callbacks);
r->callbacks = 0;
}
struct raft_callbacks *raftGetCallbacks(struct raft *r)
{
return (void *)(uintptr_t)r->callbacks;
}
dqlite-1.16.7/src/raft/callbacks.h 0000664 0000000 0000000 00000000411 14652527134 0016710 0 ustar 00root root 0000000 0000000 #ifndef CALLBACKS_H_
#define CALLBACKS_H_
#include "../raft.h"
struct raft_callbacks
{
raft_state_cb state_cb;
};
int raftInitCallbacks(struct raft *r);
void raftDestroyCallbacks(struct raft *r);
struct raft_callbacks *raftGetCallbacks(struct raft *r);
#endif
dqlite-1.16.7/src/raft/client.c 0000664 0000000 0000000 00000022706 14652527134 0016255 0 ustar 00root root 0000000 0000000 #include "../raft.h"
#include "../tracing.h"
#include "assert.h"
#include "configuration.h"
#include "err.h"
#include "lifecycle.h"
#include "log.h"
#include "membership.h"
#include "progress.h"
#include "../lib/queue.h"
#include "replication.h"
#include "request.h"
int raft_apply(struct raft *r,
struct raft_apply *req,
const struct raft_buffer bufs[],
const struct raft_entry_local_data local_data[],
const unsigned n,
raft_apply_cb cb)
{
raft_index index;
int rv;
tracef("raft_apply n %d", n);
assert(r != NULL);
assert(bufs != NULL);
assert(n > 0);
if (r->state != RAFT_LEADER || r->transfer != NULL) {
rv = RAFT_NOTLEADER;
ErrMsgFromCode(r->errmsg, rv);
tracef("raft_apply not leader");
goto err;
}
/* Index of the first entry being appended. */
index = logLastIndex(r->log) + 1;
tracef("%u commands starting at %lld", n, index);
req->type = RAFT_COMMAND;
req->index = index;
req->cb = cb;
/* Append the new entries to the log. */
rv = logAppendCommands(r->log, r->current_term, bufs, local_data, n);
if (rv != 0) {
goto err;
}
lifecycleRequestStart(r, (struct request *)req);
rv = replicationTrigger(r, index);
if (rv != 0) {
goto err_after_log_append;
}
return 0;
err_after_log_append:
logDiscard(r->log, index);
queue_remove(&req->queue);
err:
assert(rv != 0);
return rv;
}
int raft_barrier(struct raft *r, struct raft_barrier *req, raft_barrier_cb cb)
{
raft_index index;
struct raft_buffer buf;
int rv;
if (r->state != RAFT_LEADER || r->transfer != NULL) {
rv = RAFT_NOTLEADER;
goto err;
}
/* TODO: use a completely empty buffer */
buf.len = 8;
buf.base = raft_malloc(buf.len);
if (buf.base == NULL) {
rv = RAFT_NOMEM;
goto err;
}
/* Index of the barrier entry being appended. */
index = logLastIndex(r->log) + 1;
tracef("barrier starting at %lld", index);
req->type = RAFT_BARRIER;
req->index = index;
req->cb = cb;
rv = logAppend(r->log, r->current_term, RAFT_BARRIER, buf, (struct raft_entry_local_data){}, true, NULL);
if (rv != 0) {
goto err_after_buf_alloc;
}
lifecycleRequestStart(r, (struct request *)req);
rv = replicationTrigger(r, index);
if (rv != 0) {
goto err_after_log_append;
}
return 0;
err_after_log_append:
logDiscard(r->log, index);
queue_remove(&req->queue);
err_after_buf_alloc:
raft_free(buf.base);
err:
return rv;
}
static int clientChangeConfiguration(
struct raft *r,
struct raft_change *req,
const struct raft_configuration *configuration)
{
raft_index index;
raft_term term = r->current_term;
int rv;
(void)req;
/* Index of the entry being appended. */
index = logLastIndex(r->log) + 1;
/* Encode the new configuration and append it to the log. */
rv = logAppendConfiguration(r->log, term, configuration);
if (rv != 0) {
goto err;
}
if (configuration->n != r->configuration.n) {
rv = progressRebuildArray(r, configuration);
if (rv != 0) {
goto err;
}
}
/* Update the current configuration if we've created a new object. */
if (configuration != &r->configuration) {
raft_configuration_close(&r->configuration);
r->configuration = *configuration;
}
/* Start writing the new log entry to disk and send it to the followers.
*/
rv = replicationTrigger(r, index);
if (rv != 0) {
/* TODO: restore the old next/match indexes and configuration.
*/
goto err_after_log_append;
}
r->configuration_uncommitted_index = index;
return 0;
err_after_log_append:
logTruncate(r->log, index);
err:
assert(rv != 0);
return rv;
}
int raft_add(struct raft *r,
struct raft_change *req,
raft_id id,
const char *address,
raft_change_cb cb)
{
struct raft_configuration configuration;
int rv;
rv = membershipCanChangeConfiguration(r);
if (rv != 0) {
return rv;
}
tracef("add server: id %llu, address %s", id, address);
/* Make a copy of the current configuration, and add the new server to
* it. */
rv = configurationCopy(&r->configuration, &configuration);
if (rv != 0) {
goto err;
}
rv = raft_configuration_add(&configuration, id, address, RAFT_SPARE);
if (rv != 0) {
goto err_after_configuration_copy;
}
req->cb = cb;
rv = clientChangeConfiguration(r, req, &configuration);
if (rv != 0) {
goto err_after_configuration_copy;
}
assert(r->leader_state.change == NULL);
r->leader_state.change = req;
return 0;
err_after_configuration_copy:
raft_configuration_close(&configuration);
err:
assert(rv != 0);
return rv;
}
int raft_assign(struct raft *r,
struct raft_change *req,
raft_id id,
int role,
raft_change_cb cb)
{
const struct raft_server *server;
unsigned server_index;
raft_index last_index;
int rv;
tracef("raft_assign to id:%llu the role:%d", id, role);
if (role != RAFT_STANDBY && role != RAFT_VOTER && role != RAFT_SPARE) {
rv = RAFT_BADROLE;
ErrMsgFromCode(r->errmsg, rv);
return rv;
}
rv = membershipCanChangeConfiguration(r);
if (rv != 0) {
return rv;
}
server = configurationGet(&r->configuration, id);
if (server == NULL) {
rv = RAFT_NOTFOUND;
ErrMsgPrintf(r->errmsg, "no server has ID %llu", id);
goto err;
}
/* Check if we have already the desired role. */
if (server->role == role) {
const char *name;
rv = RAFT_BADROLE;
switch (role) {
case RAFT_VOTER:
name = "voter";
break;
case RAFT_STANDBY:
name = "stand-by";
break;
case RAFT_SPARE:
name = "spare";
break;
default:
name = NULL;
assert(0);
break;
}
ErrMsgPrintf(r->errmsg, "server is already %s", name);
goto err;
}
server_index = configurationIndexOf(&r->configuration, id);
assert(server_index < r->configuration.n);
last_index = logLastIndex(r->log);
req->cb = cb;
assert(r->leader_state.change == NULL);
r->leader_state.change = req;
/* If we are not promoting to the voter role or if the log of this
* server is already up-to-date, we can submit the configuration change
* immediately. */
if (role != RAFT_VOTER ||
progressMatchIndex(r, server_index) == last_index) {
int old_role = r->configuration.servers[server_index].role;
r->configuration.servers[server_index].role = role;
rv = clientChangeConfiguration(r, req, &r->configuration);
if (rv != 0) {
tracef("clientChangeConfiguration failed %d", rv);
r->configuration.servers[server_index].role = old_role;
return rv;
}
return 0;
}
r->leader_state.promotee_id = server->id;
/* Initialize the first catch-up round. */
r->leader_state.round_number = 1;
r->leader_state.round_index = last_index;
r->leader_state.round_start = r->io->time(r->io);
/* Immediately initiate an AppendEntries request. */
rv = replicationProgress(r, server_index);
if (rv != 0 && rv != RAFT_NOCONNECTION) {
/* This error is not fatal. */
tracef("failed to send append entries to server %llu: %s (%d)",
server->id, raft_strerror(rv), rv);
}
return 0;
err:
assert(rv != 0);
return rv;
}
int raft_remove(struct raft *r,
struct raft_change *req,
raft_id id,
raft_change_cb cb)
{
const struct raft_server *server;
struct raft_configuration configuration;
int rv;
rv = membershipCanChangeConfiguration(r);
if (rv != 0) {
return rv;
}
server = configurationGet(&r->configuration, id);
if (server == NULL) {
rv = RAFT_BADID;
goto err;
}
tracef("remove server: id %llu", id);
/* Make a copy of the current configuration, and remove the given server
* from it. */
rv = configurationCopy(&r->configuration, &configuration);
if (rv != 0) {
goto err;
}
rv = configurationRemove(&configuration, id);
if (rv != 0) {
goto err_after_configuration_copy;
}
req->cb = cb;
rv = clientChangeConfiguration(r, req, &configuration);
if (rv != 0) {
goto err_after_configuration_copy;
}
assert(r->leader_state.change == NULL);
r->leader_state.change = req;
return 0;
err_after_configuration_copy:
raft_configuration_close(&configuration);
err:
assert(rv != 0);
return rv;
}
/* Find a suitable voting follower. */
static raft_id clientSelectTransferee(struct raft *r)
{
const struct raft_server *transferee = NULL;
unsigned i;
for (i = 0; i < r->configuration.n; i++) {
const struct raft_server *server = &r->configuration.servers[i];
if (server->id == r->id || server->role != RAFT_VOTER) {
continue;
}
transferee = server;
if (progressIsUpToDate(r, i)) {
break;
}
}
if (transferee != NULL) {
return transferee->id;
}
return 0;
}
int raft_transfer(struct raft *r,
struct raft_transfer *req,
raft_id id,
raft_transfer_cb cb)
{
const struct raft_server *server;
unsigned i;
int rv;
tracef("transfer to %llu", id);
if (r->state != RAFT_LEADER || r->transfer != NULL) {
tracef("transfer error - state:%d", r->state);
rv = RAFT_NOTLEADER;
ErrMsgFromCode(r->errmsg, rv);
goto err;
}
if (id == 0) {
id = clientSelectTransferee(r);
if (id == 0) {
rv = RAFT_NOTFOUND;
ErrMsgPrintf(r->errmsg,
"there's no other voting server");
goto err;
}
}
server = configurationGet(&r->configuration, id);
if (server == NULL || server->id == r->id ||
server->role != RAFT_VOTER) {
rv = RAFT_BADID;
ErrMsgFromCode(r->errmsg, rv);
goto err;
}
/* If this follower is up-to-date, we can send it the TimeoutNow message
* right away. */
i = configurationIndexOf(&r->configuration, server->id);
assert(i < r->configuration.n);
membershipLeadershipTransferInit(r, req, id, cb);
if (progressPersistedIsUpToDate(r, i)) {
rv = membershipLeadershipTransferStart(r);
if (rv != 0) {
r->transfer = NULL;
goto err;
}
}
return 0;
err:
assert(rv != 0);
return rv;
}
#undef tracef
dqlite-1.16.7/src/raft/compress.c 0000664 0000000 0000000 00000016704 14652527134 0016633 0 ustar 00root root 0000000 0000000 #include "compress.h"
#ifdef LZ4_AVAILABLE
#include <lz4frame.h>
#endif
#include <limits.h>
#include <string.h>
#include "assert.h"
#include "byte.h"
#include "err.h"
#define min(a, b) ((a) < (b) ? (a) : (b))
#define max(a, b) ((a) > (b) ? (a) : (b))
#define MEGABYTE 1048576
int Compress(struct raft_buffer bufs[],
unsigned n_bufs,
struct raft_buffer *compressed,
char *errmsg)
{
#ifndef LZ4_AVAILABLE
(void)bufs;
(void)n_bufs;
(void)compressed;
ErrMsgPrintf(errmsg, "LZ4 not available");
return RAFT_INVALID;
#else
assert(bufs != NULL);
assert(n_bufs > 0);
assert(compressed != NULL);
assert(errmsg != NULL);
int rv = RAFT_IOERR;
size_t src_size = 0;
size_t dst_size = 0;
size_t src_offset = 0;
size_t dst_offset = 0;
size_t dst_size_needed = 0; /* Store minimal dst_size */
size_t ret = 0; /* Return value of LZ4F_XXX functions */
compressed->base = NULL;
compressed->len = 0;
/* Determine total uncompressed size */
for (unsigned i = 0; i < n_bufs; ++i) {
src_size += bufs[i].len;
}
/* Work around a bug in liblz4 on bionic; in practice raft should only
 * Compress non-zero-length buffers, so this should be fine.
 * https://github.com/lz4/lz4/issues/157 */
if (src_size == 0) {
ErrMsgPrintf(errmsg, "total size must be larger then 0");
rv = RAFT_INVALID;
goto err;
}
/* Set LZ4 preferences */
LZ4F_preferences_t lz4_pref;
memset(&lz4_pref, 0, sizeof(lz4_pref));
/* Detect data corruption when decompressing */
lz4_pref.frameInfo.contentChecksumFlag = 1;
/* For allocating a suitable buffer when decompressing */
lz4_pref.frameInfo.contentSize = src_size;
/* Context to track compression progress */
LZ4F_compressionContext_t ctx;
ret = LZ4F_createCompressionContext(&ctx, LZ4F_VERSION);
if (LZ4F_isError(ret)) {
ErrMsgPrintf(errmsg, "LZ4F_createDecompressionContext %s",
LZ4F_getErrorName(ret));
rv = RAFT_NOMEM;
goto err;
}
/* Guesstimate of the eventual compressed size; mainly to avoid allocating a
 * huge buffer, since `LZ4F_compressBound` calculates the worst-case scenario. */
dst_size = LZ4F_compressBound(
max(MEGABYTE, (size_t)lz4_pref.frameInfo.contentSize / 10),
&lz4_pref);
dst_size += LZ4F_HEADER_SIZE_MAX_RAFT;
compressed->base = raft_malloc(dst_size);
if (compressed->base == NULL) {
rv = RAFT_NOMEM;
goto err_after_ctx_alloc;
}
/* Returns the size of the lz4 header, data should be written after the
* header */
dst_offset =
LZ4F_compressBegin(ctx, compressed->base, dst_size, &lz4_pref);
if (LZ4F_isError(dst_offset)) {
ErrMsgPrintf(errmsg, "LZ4F_compressBegin %s",
LZ4F_getErrorName(dst_offset));
rv = RAFT_IOERR;
goto err_after_buff_alloc;
}
/* Compress all buffers */
for (unsigned i = 0; i < n_bufs; ++i) {
src_offset = 0;
while (src_offset < bufs[i].len) {
/* Compress in chunks of maximum 1MB and check if there
* is enough room in the dst buffer, if not realloc */
src_size =
min(bufs[i].len - src_offset, (size_t)MEGABYTE);
dst_size_needed =
LZ4F_compressBound(src_size, &lz4_pref);
if (dst_size - dst_offset < dst_size_needed) {
dst_size +=
max(dst_size_needed,
(size_t)lz4_pref.frameInfo.contentSize /
10);
void *grown =
    raft_realloc(compressed->base, dst_size);
if (grown == NULL) {
	/* The old buffer is still valid and is
	 * released by the error path below. */
	rv = RAFT_NOMEM;
	goto err_after_buff_alloc;
}
compressed->base = grown;
}
/* There is guaranteed enough room in `dst` to perform
* the compression */
ret = LZ4F_compressUpdate(
ctx, (char *)compressed->base + dst_offset,
dst_size - dst_offset,
(char *)bufs[i].base + src_offset, src_size, NULL);
if (LZ4F_isError(ret)) {
ErrMsgPrintf(errmsg, "LZ4F_compressUpdate %s",
LZ4F_getErrorName(ret));
rv = RAFT_IOERR;
goto err_after_buff_alloc;
}
dst_offset += ret;
src_offset += src_size;
}
}
/* Make sure LZ4F_compressEnd has enough room to succeed */
dst_size_needed = LZ4F_compressBound(0, &lz4_pref);
if ((dst_size - dst_offset) < dst_size_needed) {
dst_size += dst_size_needed;
void *grown = raft_realloc(compressed->base, dst_size);
if (grown == NULL) {
	/* The old buffer is still valid and is released by the
	 * error path below. */
	rv = RAFT_NOMEM;
	goto err_after_buff_alloc;
}
compressed->base = grown;
}
/* Finalize compression */
ret = LZ4F_compressEnd(ctx, (char *)compressed->base + dst_offset,
dst_size - dst_offset, NULL);
if (LZ4F_isError(ret)) {
ErrMsgPrintf(errmsg, "LZ4F_compressEnd %s",
LZ4F_getErrorName(ret));
rv = RAFT_IOERR;
goto err_after_buff_alloc;
}
dst_offset += ret;
compressed->len = dst_offset;
LZ4F_freeCompressionContext(ctx);
return 0;
err_after_buff_alloc:
raft_free(compressed->base);
compressed->base = NULL;
err_after_ctx_alloc:
LZ4F_freeCompressionContext(ctx);
err:
return rv;
#endif /* LZ4_AVAILABLE */
}
int Decompress(struct raft_buffer buf,
struct raft_buffer *decompressed,
char *errmsg)
{
#ifndef LZ4_AVAILABLE
(void)buf;
(void)decompressed;
ErrMsgPrintf(errmsg, "LZ4 not available");
return RAFT_INVALID;
#else
assert(decompressed != NULL);
int rv = RAFT_IOERR;
size_t src_offset = 0;
size_t dst_offset = 0;
size_t src_size = 0;
size_t dst_size = 0;
size_t ret = 0;
LZ4F_decompressionContext_t ctx;
if (LZ4F_isError(LZ4F_createDecompressionContext(&ctx, LZ4F_VERSION))) {
ErrMsgPrintf(errmsg, "LZ4F_createDecompressionContext");
rv = RAFT_NOMEM;
goto err;
}
src_size = buf.len;
LZ4F_frameInfo_t frameInfo = {0};
/* `src_size` will contain the size of the LZ4 Frame Header after the
* call, decompression must resume at that offset. */
ret = LZ4F_getFrameInfo(ctx, &frameInfo, buf.base, &src_size);
if (LZ4F_isError(ret)) {
ErrMsgPrintf(errmsg, "LZ4F_getFrameInfo %s",
LZ4F_getErrorName(ret));
rv = RAFT_IOERR;
goto err_after_ctx_alloc;
}
src_offset = src_size;
decompressed->base = raft_malloc((size_t)frameInfo.contentSize);
if (decompressed->base == NULL) {
	rv = RAFT_NOMEM;
	goto err_after_ctx_alloc;
}
decompressed->len = (size_t)frameInfo.contentSize;
ret = 1;
while (ret != 0) {
src_size = buf.len - src_offset;
/* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
* The next line works around a bug in an older lz4 lib where
* the `size_t` dst_size parameter would overflow an `int`.
* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
*/
dst_size = min(decompressed->len - dst_offset, (size_t)INT_MAX);
/* `dst_size` will contain the number of bytes written to
* decompressed->base, while `src_size` will contain the number
* of bytes consumed from buf.base */
ret = LZ4F_decompress(
ctx, (char *)decompressed->base + dst_offset, &dst_size,
(char *)buf.base + src_offset, &src_size, NULL);
if (LZ4F_isError(ret)) {
ErrMsgPrintf(errmsg, "LZ4F_decompress %s",
LZ4F_getErrorName(ret));
rv = RAFT_IOERR;
goto err_after_buff_alloc;
}
src_offset += src_size;
dst_offset += dst_size;
}
if (LZ4F_freeDecompressionContext(ctx) != 0) {
raft_free(decompressed->base);
decompressed->base = NULL;
return RAFT_IOERR;
}
return 0;
err_after_buff_alloc:
raft_free(decompressed->base);
decompressed->base = NULL;
err_after_ctx_alloc:
LZ4F_freeDecompressionContext(ctx);
err:
return rv;
#endif /* LZ4_AVAILABLE */
}
bool IsCompressed(const void *data, size_t sz)
{
if (data == NULL || sz < 4) {
return false;
}
const void *cursor = data;
#ifdef LZ4F_MAGICNUMBER
#define RAFT_LZ4F_MAGICNUMBER LZ4F_MAGICNUMBER
#else
#define RAFT_LZ4F_MAGICNUMBER 0x184D2204U
#endif
return byteGet32(&cursor) == RAFT_LZ4F_MAGICNUMBER;
}
dqlite-1.16.7/src/raft/compress.h 0000664 0000000 0000000 00000001611 14652527134 0016627 0 ustar 00root root 0000000 0000000 #ifndef COMPRESS_H_
#define COMPRESS_H_
#include "../raft.h"
#ifdef LZ4F_HEADER_SIZE_MAX
#define LZ4F_HEADER_SIZE_MAX_RAFT LZ4F_HEADER_SIZE_MAX
#else
#define LZ4F_HEADER_SIZE_MAX_RAFT 19UL
#endif
/*
* Compresses the content of `bufs` into a newly allocated buffer that is
* returned to the caller through `compressed`. Returns a non-0 value upon
* failure.
*/
int Compress(struct raft_buffer bufs[],
unsigned n_bufs,
struct raft_buffer *compressed,
char *errmsg);
/*
* Decompresses the content of `buf` into a newly allocated buffer that is
* returned to the caller through `decompressed`. Returns a non-0 value upon
* failure.
*/
int Decompress(struct raft_buffer buf,
struct raft_buffer *decompressed,
char *errmsg);
/* Returns `true` if `data` is compressed, `false` otherwise. */
bool IsCompressed(const void *data, size_t sz);
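/* A minimal usage sketch (illustrative only; assumes LZ4 support was compiled
 * in and elides error handling):
 *
 *     struct raft_buffer bufs[2] = { ... two non-empty buffers ... };
 *     struct raft_buffer compressed;
 *     struct raft_buffer restored;
 *     char errmsg[RAFT_ERRMSG_BUF_SIZE];
 *     if (Compress(bufs, 2, &compressed, errmsg) == 0) {
 *             assert(IsCompressed(compressed.base, compressed.len));
 *             Decompress(compressed, &restored, errmsg);
 *             raft_free(compressed.base);
 *             raft_free(restored.base);
 *     }
 */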
#endif /* COMPRESS_H_ */
dqlite-1.16.7/src/raft/configuration.c 0000664 0000000 0000000 00000017401 14652527134 0017642 0 ustar 00root root 0000000 0000000 #include "configuration.h"
#include "../tracing.h"
#include "assert.h"
#include "byte.h"
/* Current encoding format version. */
#define ENCODING_FORMAT 1
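/* Encoded layout, as produced by configurationEncodeToBuf() below (sizes in
 * bytes; the total is padded to a multiple of 8 by bytePad64()):
 *
 *     1    encoding format version (ENCODING_FORMAT)
 *     8    number of servers
 *     then, for each server:
 *         8      server ID
 *         n + 1  nul-terminated address string
 *         1      role code
 */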
void configurationInit(struct raft_configuration *c)
{
c->servers = NULL;
c->n = 0;
}
void configurationClose(struct raft_configuration *c)
{
size_t i;
assert(c != NULL);
assert(c->n == 0 || c->servers != NULL);
for (i = 0; i < c->n; i++) {
raft_free(c->servers[i].address);
}
if (c->servers != NULL) {
raft_free(c->servers);
}
}
unsigned configurationIndexOf(const struct raft_configuration *c,
const raft_id id)
{
unsigned i;
assert(c != NULL);
for (i = 0; i < c->n; i++) {
if (c->servers[i].id == id) {
return i;
}
}
return c->n;
}
unsigned configurationIndexOfVoter(const struct raft_configuration *c,
const raft_id id)
{
unsigned i;
unsigned j = 0;
assert(c != NULL);
assert(id > 0);
for (i = 0; i < c->n; i++) {
if (c->servers[i].id == id) {
if (c->servers[i].role == RAFT_VOTER) {
return j;
}
return c->n;
}
if (c->servers[i].role == RAFT_VOTER) {
j++;
}
}
return c->n;
}
const struct raft_server *configurationGet(const struct raft_configuration *c,
const raft_id id)
{
size_t i;
assert(c != NULL);
assert(id > 0);
/* Grab the index of the server with the given ID */
i = configurationIndexOf(c, id);
if (i == c->n) {
/* No server with matching ID. */
return NULL;
}
assert(i < c->n);
return &c->servers[i];
}
unsigned configurationVoterCount(const struct raft_configuration *c)
{
unsigned i;
unsigned n = 0;
assert(c != NULL);
for (i = 0; i < c->n; i++) {
if (c->servers[i].role == RAFT_VOTER) {
n++;
}
}
return n;
}
int configurationCopy(const struct raft_configuration *src,
struct raft_configuration *dst)
{
size_t i;
int rv;
configurationInit(dst);
for (i = 0; i < src->n; i++) {
struct raft_server *server = &src->servers[i];
rv = configurationAdd(dst, server->id, server->address,
server->role);
if (rv != 0) {
goto err;
}
}
return 0;
err:
configurationClose(dst);
assert(rv == RAFT_NOMEM);
return rv;
}
int configurationAdd(struct raft_configuration *c,
raft_id id,
const char *address,
int role)
{
struct raft_server *servers;
struct raft_server *server;
char *address_copy;
size_t i;
int rv;
assert(c != NULL);
assert(id != 0);
if (role != RAFT_STANDBY && role != RAFT_VOTER && role != RAFT_SPARE) {
rv = RAFT_BADROLE;
goto err;
}
/* Check that neither the given id nor the given address is already in use */
for (i = 0; i < c->n; i++) {
server = &c->servers[i];
if (server->id == id) {
rv = RAFT_DUPLICATEID;
goto err;
}
if (strcmp(server->address, address) == 0) {
rv = RAFT_DUPLICATEADDRESS;
goto err;
}
}
/* Make a copy of the given address */
address_copy = raft_malloc(strlen(address) + 1);
if (address_copy == NULL) {
rv = RAFT_NOMEM;
goto err;
}
strcpy(address_copy, address);
/* Grow the servers array. */
servers = raft_realloc(c->servers, (c->n + 1) * sizeof *server);
if (servers == NULL) {
rv = RAFT_NOMEM;
goto err_after_address_copy;
}
c->servers = servers;
/* Fill the newly allocated slot (the last one) with the given details.
*/
server = &servers[c->n];
server->id = id;
server->address = address_copy;
server->role = role;
c->n++;
return 0;
err_after_address_copy:
raft_free(address_copy);
err:
assert(rv == RAFT_BADROLE || rv == RAFT_DUPLICATEID ||
rv == RAFT_DUPLICATEADDRESS || rv == RAFT_NOMEM);
return rv;
}
int configurationRemove(struct raft_configuration *c, const raft_id id)
{
unsigned i;
unsigned j;
struct raft_server *servers;
int rv;
assert(c != NULL);
i = configurationIndexOf(c, id);
if (i == c->n) {
rv = RAFT_BADID;
goto err;
}
assert(i < c->n);
/* If this is the last server in the configuration, reset everything. */
if (c->n - 1 == 0) {
assert(i == 0);
servers = NULL;
goto out;
}
/* Create a new servers array. */
servers = raft_calloc(c->n - 1, sizeof *servers);
if (servers == NULL) {
rv = RAFT_NOMEM;
goto err;
}
/* Copy the first part of the servers array into a new array, excluding
* the i'th server. */
for (j = 0; j < i; j++) {
servers[j] = c->servers[j];
}
/* Copy the second part of the servers array into a new array. */
for (j = i + 1; j < c->n; j++) {
servers[j - 1] = c->servers[j];
}
out:
/* Release the address of the server that was deleted. */
raft_free(c->servers[i].address);
/* Release the old servers array */
raft_free(c->servers);
c->servers = servers;
c->n--;
return 0;
err:
assert(rv == RAFT_BADID || rv == RAFT_NOMEM);
return rv;
}
size_t configurationEncodedSize(const struct raft_configuration *c)
{
size_t n = 0;
unsigned i;
/* We need one byte for the encoding format version */
n++;
/* Then 8 bytes for number of servers. */
n += sizeof(uint64_t);
/* Then some space for each server. */
for (i = 0; i < c->n; i++) {
struct raft_server *server = &c->servers[i];
assert(server->address != NULL);
n += sizeof(uint64_t); /* Server ID */
n += strlen(server->address) + 1; /* Address */
n++; /* Voting flag */
}
return bytePad64(n);
}
void configurationEncodeToBuf(const struct raft_configuration *c, void *buf)
{
void *cursor = buf;
unsigned i;
/* Encoding format version */
bytePut8(&cursor, ENCODING_FORMAT);
/* Number of servers. */
bytePut64(&cursor, c->n);
for (i = 0; i < c->n; i++) {
struct raft_server *server = &c->servers[i];
assert(server->address != NULL);
bytePut64(&cursor, server->id);
bytePutString(&cursor, server->address);
assert(server->role < 255);
bytePut8(&cursor, (uint8_t)server->role);
}
}
int configurationEncode(const struct raft_configuration *c,
struct raft_buffer *buf)
{
int rv;
assert(c != NULL);
assert(buf != NULL);
/* The configuration can't be empty. */
assert(c->n > 0);
buf->len = configurationEncodedSize(c);
buf->base = raft_malloc(buf->len);
if (buf->base == NULL) {
rv = RAFT_NOMEM;
goto err;
}
configurationEncodeToBuf(c, buf->base);
return 0;
err:
assert(rv == RAFT_NOMEM);
return rv;
}
int configurationDecode(const struct raft_buffer *buf,
struct raft_configuration *c)
{
const void *cursor;
size_t i;
size_t n;
int rv;
assert(c != NULL);
assert(buf != NULL);
/* TODO: use 'if' instead of assert for checking buffer boundaries */
assert(buf->len > 0);
configurationInit(c);
cursor = buf->base;
/* Check the encoding format version */
if (byteGet8(&cursor) != ENCODING_FORMAT) {
rv = RAFT_MALFORMED;
goto err;
}
/* Read the number of servers. */
n = (size_t)byteGet64(&cursor);
/* Decode the individual servers. */
for (i = 0; i < n; i++) {
raft_id id;
const char *address;
int role;
/* Server ID. */
id = byteGet64(&cursor);
/* Server Address. */
address = byteGetString(
&cursor, buf->len - (size_t)((uint8_t *)cursor -
(uint8_t *)buf->base));
if (address == NULL) {
rv = RAFT_MALFORMED;
goto err;
}
/* Role code. */
role = byteGet8(&cursor);
rv = configurationAdd(c, id, address, role);
if (rv != 0) {
/* Only valid configurations should ever be encoded, so if
 * configurationAdd() fails because of invalid data we return
 * RAFT_MALFORMED. */
if (rv != RAFT_NOMEM) {
rv = RAFT_MALFORMED;
}
goto err;
}
}
return 0;
err:
assert(rv == RAFT_MALFORMED || rv == RAFT_NOMEM);
configurationClose(c);
return rv;
}
void configurationTrace(const struct raft *r,
struct raft_configuration *c,
const char *msg)
{
(void)r;
tracef("%s", msg);
tracef("=== CONFIG START ===");
unsigned i;
struct raft_server *s;
for (i = 0; i < c->n; i++) {
s = &c->servers[i];
tracef("id:%llu address:%s role:%d", s->id, s->address,
s->role);
}
tracef("=== CONFIG END ===");
}
#undef tracef
dqlite-1.16.7/src/raft/configuration.h 0000664 0000000 0000000 00000010121 14652527134 0017637 0 ustar 00root root 0000000 0000000 /* Modify and inspect @raft_configuration objects. */
#ifndef CONFIGURATION_H_
#define CONFIGURATION_H_
#include "../raft.h"
/* Initialize an empty configuration. */
void configurationInit(struct raft_configuration *c);
/* Release all memory used by the given configuration. */
void configurationClose(struct raft_configuration *c);
/* Add a server to the given configuration.
*
* The given @address is copied and no reference to it is kept. In case of
* error, @c is left unchanged.
*
* Errors:
*
* RAFT_DUPLICATEID
* @c already has a server with the given id.
*
* RAFT_DUPLICATEADDRESS
* @c already has a server with the given @address.
*
* RAFT_BADROLE
* @role is not one of ROLE_STANDBY, ROLE_VOTER or ROLE_SPARE.
*
* RAFT_NOMEM
* A copy of @address could not be made, or the @c->servers array
* could not be extended.
*/
int configurationAdd(struct raft_configuration *c,
raft_id id,
const char *address,
int role);
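/* A minimal sketch of building a three-server configuration (addresses are
 * illustrative, error handling elided):
 *
 *     struct raft_configuration c;
 *     configurationInit(&c);
 *     configurationAdd(&c, 1, "192.168.1.1:8080", RAFT_VOTER);
 *     configurationAdd(&c, 2, "192.168.1.2:8080", RAFT_VOTER);
 *     configurationAdd(&c, 3, "192.168.1.3:8080", RAFT_STANDBY);
 *     ...
 *     configurationClose(&c);
 */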
/* Return the number of servers with the RAFT_VOTER role. */
unsigned configurationVoterCount(const struct raft_configuration *c);
/* Return the index of the server with the given ID (relative to the c->servers
* array). If there's no server with the given ID, return the number of
* servers. */
unsigned configurationIndexOf(const struct raft_configuration *c, raft_id id);
/* Return the index of the RAFT_VOTER server with the given ID (relative to the
* sub array of c->servers that has only voting servers). If there's no server
* with the given ID, or if it's not flagged as voting, return the number of
* servers. */
unsigned configurationIndexOfVoter(const struct raft_configuration *c,
raft_id id);
/* Get the server with the given ID, or #NULL if no matching server is found. */
const struct raft_server *configurationGet(const struct raft_configuration *c,
raft_id id);
/* Remove a server from a raft configuration. The given ID must match the one of
* an existing server in the configuration.
*
* In case of error @c is left unchanged.
*
* Errors:
*
* RAFT_BADID
* @c does not contain any server with the given @id
*
* RAFT_NOMEM
* Memory to hold the new set of servers could not be allocated.
*/
int configurationRemove(struct raft_configuration *c, raft_id id);
/* Deep copy @src to @dst.
*
* The configuration @src is assumed to be valid (i.e. each of its servers has a
* valid ID, address and role).
*
* The @dst configuration object must be uninitialized or empty.
*
* In case of error, both @src and @dst are left unchanged.
*
* Errors:
*
* RAFT_NOMEM
* Memory to copy all the servers could not be allocated.
*/
int configurationCopy(const struct raft_configuration *src,
struct raft_configuration *dst);
/* Number of bytes needed to encode the given configuration object. */
size_t configurationEncodedSize(const struct raft_configuration *c);
/* Encode the given configuration object to the given pre-allocated buffer,
* which is assumed to be at least configurationEncodedSize(c) bytes. */
void configurationEncodeToBuf(const struct raft_configuration *c, void *buf);
/* Encode the given configuration object. The memory of the returned buffer is
* allocated using raft_malloc(), and client code is responsible for releasing
* it when no longer needed.
*
* Errors:
*
* RAFT_NOMEM
* Memory for the encoded buffer could not be allocated.
*/
int configurationEncode(const struct raft_configuration *c,
struct raft_buffer *buf);
/* Populate a configuration object by decoding the given serialized payload.
*
* The @c configuration object must be uninitialized or empty.
*
* In case of error, @c will be left empty.
*
* Errors:
*
* RAFT_MALFORMED
* The given buffer does not contain a valid encoded configuration.
*
* RAFT_NOMEM
* Memory to populate the given configuration could not be allocated.
*/
int configurationDecode(const struct raft_buffer *buf,
struct raft_configuration *c);
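/* Round-trip sketch (illustrative only, error handling elided): a valid
 * configuration @c can be serialized and restored with
 *
 *     struct raft_buffer buf;
 *     struct raft_configuration decoded;
 *     configurationEncode(&c, &buf);
 *     configurationDecode(&buf, &decoded);
 *     raft_free(buf.base);
 */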
/* Output the configuration to the raft tracer */
void configurationTrace(const struct raft *r,
struct raft_configuration *c,
const char *msg);
#endif /* CONFIGURATION_H_ */
dqlite-1.16.7/src/raft/convert.c 0000664 0000000 0000000 00000016267 14652527134 0016464 0 ustar 00root root 0000000 0000000 #include "convert.h"
#include "../raft.h"
#include "../tracing.h"
#include "assert.h"
#include "callbacks.h"
#include "configuration.h"
#include "election.h"
#include "log.h"
#include "membership.h"
#include "progress.h"
#include "../lib/queue.h"
#include "replication.h"
#include "request.h"
/* Convenience for setting a new state value and asserting that the transition
* is valid. */
static void convertSetState(struct raft *r, unsigned short new_state)
{
/* Check that the transition is legal, see Figure 3.3. Note that with
* respect to the paper we have an additional "unavailable" state, which
* is the initial or final state. */
unsigned short old_state = r->state;
tracef("old_state:%u new_state:%u", old_state, new_state);
assert((r->state == RAFT_UNAVAILABLE && new_state == RAFT_FOLLOWER) ||
(r->state == RAFT_FOLLOWER && new_state == RAFT_CANDIDATE) ||
(r->state == RAFT_CANDIDATE && new_state == RAFT_FOLLOWER) ||
(r->state == RAFT_CANDIDATE && new_state == RAFT_LEADER) ||
(r->state == RAFT_LEADER && new_state == RAFT_FOLLOWER) ||
(r->state == RAFT_FOLLOWER && new_state == RAFT_UNAVAILABLE) ||
(r->state == RAFT_CANDIDATE && new_state == RAFT_UNAVAILABLE) ||
(r->state == RAFT_LEADER && new_state == RAFT_UNAVAILABLE));
r->state = new_state;
if (r->state == RAFT_LEADER) {
r->leader_state.voter_contacts = 1;
}
struct raft_callbacks *cbs = raftGetCallbacks(r);
if (cbs != NULL && cbs->state_cb != NULL) {
cbs->state_cb(r, old_state, new_state);
}
}
/* Clear follower state. */
static void convertClearFollower(struct raft *r)
{
tracef("clear follower state");
r->follower_state.current_leader.id = 0;
if (r->follower_state.current_leader.address != NULL) {
raft_free(r->follower_state.current_leader.address);
}
r->follower_state.current_leader.address = NULL;
}
/* Clear candidate state. */
static void convertClearCandidate(struct raft *r)
{
tracef("clear candidate state");
if (r->candidate_state.votes != NULL) {
raft_free(r->candidate_state.votes);
r->candidate_state.votes = NULL;
}
}
static void convertFailApply(struct raft_apply *req)
{
if (req != NULL && req->cb != NULL) {
req->cb(req, RAFT_LEADERSHIPLOST, NULL);
}
}
static void convertFailBarrier(struct raft_barrier *req)
{
if (req != NULL && req->cb != NULL) {
req->cb(req, RAFT_LEADERSHIPLOST);
}
}
static void convertFailChange(struct raft_change *req)
{
if (req != NULL && req->cb != NULL) {
req->cb(req, RAFT_LEADERSHIPLOST);
}
}
/* Clear leader state. */
static void convertClearLeader(struct raft *r)
{
tracef("clear leader state");
if (r->leader_state.progress != NULL) {
raft_free(r->leader_state.progress);
r->leader_state.progress = NULL;
}
/* Fail all outstanding requests */
while (!queue_empty(&r->leader_state.requests)) {
struct request *req;
queue *head;
head = queue_head(&r->leader_state.requests);
queue_remove(head);
req = QUEUE_DATA(head, struct request, queue);
assert(req->type == RAFT_COMMAND || req->type == RAFT_BARRIER);
switch (req->type) {
case RAFT_COMMAND:
convertFailApply((struct raft_apply *)req);
break;
case RAFT_BARRIER:
convertFailBarrier((struct raft_barrier *)req);
break;
}
}
/* Fail any promote request that is still outstanding because the server
* is still catching up and no entry was submitted. */
if (r->leader_state.change != NULL) {
convertFailChange(r->leader_state.change);
r->leader_state.change = NULL;
}
}
/* Clear the current state */
static void convertClear(struct raft *r)
{
assert(r->state == RAFT_UNAVAILABLE || r->state == RAFT_FOLLOWER ||
r->state == RAFT_CANDIDATE || r->state == RAFT_LEADER);
switch (r->state) {
case RAFT_FOLLOWER:
convertClearFollower(r);
break;
case RAFT_CANDIDATE:
convertClearCandidate(r);
break;
case RAFT_LEADER:
convertClearLeader(r);
break;
}
}
void convertToFollower(struct raft *r)
{
convertClear(r);
convertSetState(r, RAFT_FOLLOWER);
/* Reset election timer. */
electionResetTimer(r);
r->follower_state.current_leader.id = 0;
r->follower_state.current_leader.address = NULL;
r->follower_state.append_in_flight_count = 0;
}
int convertToCandidate(struct raft *r, bool disrupt_leader)
{
const struct raft_server *server;
size_t n_voters = configurationVoterCount(&r->configuration);
int rv;
(void)server; /* Only used for assertions. */
convertClear(r);
convertSetState(r, RAFT_CANDIDATE);
/* Allocate the votes array. */
r->candidate_state.votes = raft_malloc(n_voters * sizeof(bool));
if (r->candidate_state.votes == NULL) {
return RAFT_NOMEM;
}
r->candidate_state.disrupt_leader = disrupt_leader;
r->candidate_state.in_pre_vote = disrupt_leader ? false : r->pre_vote;
/* Fast-forward to leader if we're the only voting server in the
* configuration. */
server = configurationGet(&r->configuration, r->id);
assert(server != NULL);
assert(server->role == RAFT_VOTER);
if (n_voters == 1) {
tracef("self elect and convert to leader");
return convertToLeader(r);
}
/* Start a new election round */
rv = electionStart(r);
if (rv != 0) {
r->state = RAFT_FOLLOWER;
raft_free(r->candidate_state.votes);
return rv;
}
return 0;
}
void convertInitialBarrierCb(struct raft_barrier *req, int status)
{
(void)status;
raft_free(req);
}
int convertToLeader(struct raft *r)
{
int rv;
tracef("become leader for term %llu", r->current_term);
convertClear(r);
convertSetState(r, RAFT_LEADER);
/* Reset timers */
r->election_timer_start = r->io->time(r->io);
/* Reset apply requests queue */
queue_init(&r->leader_state.requests);
/* Allocate and initialize the progress array. */
rv = progressBuildArray(r);
if (rv != 0) {
return rv;
}
r->leader_state.change = NULL;
/* Reset promotion state. */
r->leader_state.promotee_id = 0;
r->leader_state.round_number = 0;
r->leader_state.round_index = 0;
r->leader_state.round_start = 0;
/* By definition, all entries until the last_stored entry will be
* committed if we are the only voter around. */
size_t n_voters = configurationVoterCount(&r->configuration);
if (n_voters == 1 && (r->last_stored > r->commit_index)) {
tracef("apply log entries after self election %llu %llu",
r->last_stored, r->commit_index);
r->commit_index = r->last_stored;
rv = replicationApply(r);
} else if (n_voters > 1) {
/* Raft Dissertation, paragraph 6.4:
* The Leader Completeness Property guarantees that a leader has
* all committed entries, but at the start of its term, it may
* not know which those are. To find out, it needs to commit an
* entry from its term. Raft handles this by having each leader
* commit a blank no-op entry into the log at the start of its
* term. */
struct raft_barrier *req = raft_malloc(sizeof(*req));
if (req == NULL) {
return RAFT_NOMEM;
}
rv = raft_barrier(r, req, convertInitialBarrierCb);
if (rv != 0) {
tracef(
"failed to send no-op barrier entry after leader "
"conversion: "
"%d",
rv);
}
}
return rv;
}
void convertToUnavailable(struct raft *r)
{
/* Abort any pending leadership transfer request. */
if (r->transfer != NULL) {
membershipLeadershipTransferClose(r);
}
convertClear(r);
convertSetState(r, RAFT_UNAVAILABLE);
}
#undef tracef
dqlite-1.16.7/src/raft/convert.h 0000664 0000000 0000000 00000003062 14652527134 0016456 0 ustar 00root root 0000000 0000000 /* Convert from one state to another. */
#ifndef CONVERT_H_
#define CONVERT_H_
#include "../raft.h"
/* Convert from unavailable, or candidate or leader to follower.
*
* From Figure 3.1:
*
* If election timeout elapses without receiving AppendEntries RPC from
* current leader or granting vote to candidate: convert to candidate.
*
* The above implies that we need to reset the election timer when converting to
* follower. */
void convertToFollower(struct raft *r);
/* Convert from follower to candidate, starting a new election.
*
* From Figure 3.1:
*
* On conversion to candidate, start election
*
* If the disrupt_leader flag is true, the server will set the disrupt leader
* flag of the RequestVote messages it sends. */
int convertToCandidate(struct raft *r, bool disrupt_leader);
/* Convert from candidate to leader.
*
* From Figure 3.1:
*
* Upon election: send initial empty AppendEntries RPC (heartbeat) to each
* server.
*
* From Section 3.4:
*
* Once a candidate wins an election, it becomes leader. It then sends
* heartbeat messages to all of the other servers to establish its authority
* and prevent new elections.
*
* From Section 3.3:
*
* The leader maintains a nextIndex for each follower, which is the index
* of the next log entry the leader will send to that follower. When a
* leader first comes to power, it initializes all nextIndex values to the
* index just after the last one in its log. */
int convertToLeader(struct raft *r);
void convertToUnavailable(struct raft *r);
#endif /* CONVERT_H_ */
dqlite-1.16.7/src/raft/election.c 0000664 0000000 0000000 00000022210 14652527134 0016567 0 ustar 00root root 0000000 0000000 #include "election.h"
#include "../tracing.h"
#include "assert.h"
#include "configuration.h"
#include "heap.h"
#include "log.h"
/* Common fields between follower and candidate state.
*
* The follower_state and candidate_state structs in raft.h must be kept
* consistent with this definition. */
struct followerOrCandidateState
{
unsigned randomized_election_timeout;
};
/* Return a pointer to either the follower or candidate state. */
struct followerOrCandidateState *getFollowerOrCandidateState(struct raft *r)
{
struct followerOrCandidateState *state;
assert(r->state == RAFT_FOLLOWER || r->state == RAFT_CANDIDATE);
if (r->state == RAFT_FOLLOWER) {
state = (struct followerOrCandidateState *)&r->follower_state;
} else {
state = (struct followerOrCandidateState *)&r->candidate_state;
}
return state;
}
void electionResetTimer(struct raft *r)
{
struct followerOrCandidateState *state = getFollowerOrCandidateState(r);
unsigned timeout = (unsigned)r->io->random(
r->io, (int)r->election_timeout, 2 * (int)r->election_timeout);
assert(timeout >= r->election_timeout);
assert(timeout <= r->election_timeout * 2);
state->randomized_election_timeout = timeout;
r->election_timer_start = r->io->time(r->io);
}
bool electionTimerExpired(struct raft *r)
{
struct followerOrCandidateState *state = getFollowerOrCandidateState(r);
raft_time now = r->io->time(r->io);
return now - r->election_timer_start >=
state->randomized_election_timeout;
}
static void sendRequestVoteCb(struct raft_io_send *send, int status)
{
(void)status;
RaftHeapFree(send);
}
/* Send a RequestVote RPC to the given server. */
static int electionSend(struct raft *r, const struct raft_server *server)
{
struct raft_message message;
struct raft_io_send *send;
raft_term term;
int rv;
assert(server->id != r->id);
assert(server->id != 0);
/* If we are in the pre-vote phase, we indicate our future term in the
* request. */
term = r->current_term;
if (r->candidate_state.in_pre_vote) {
term++;
}
/* Fill the RequestVote message.
*
* Note that we set last_log_index and last_log_term to the index and
* term of the last persisted entry, to the last entry in our in-memory
* log cache, because we must advertise only log entries that can't be
* lost at restart.
*
* Also note that, for a similar reason, we apply pending configuration
* changes only once they are persisted. When running an election we
* then use only persisted information, which is safe (while using
* unpersisted information for the log and persisted information for the
* configuration, or vice versa, would lead to inconsistencies and
* violations of Raft invariants).
*/
message.type = RAFT_IO_REQUEST_VOTE;
message.request_vote.term = term;
message.request_vote.candidate_id = r->id;
message.request_vote.last_log_index = r->last_stored;
message.request_vote.last_log_term = logTermOf(r->log, r->last_stored);
message.request_vote.disrupt_leader = r->candidate_state.disrupt_leader;
message.request_vote.pre_vote = r->candidate_state.in_pre_vote;
message.server_id = server->id;
message.server_address = server->address;
send = RaftHeapMalloc(sizeof *send);
if (send == NULL) {
return RAFT_NOMEM;
}
send->data = r;
rv = r->io->send(r->io, send, &message, sendRequestVoteCb);
if (rv != 0) {
RaftHeapFree(send);
return rv;
}
return 0;
}
int electionStart(struct raft *r)
{
raft_term term;
size_t n_voters;
size_t voting_index;
size_t i;
int rv;
assert(r->state == RAFT_CANDIDATE);
n_voters = configurationVoterCount(&r->configuration);
voting_index = configurationIndexOfVoter(&r->configuration, r->id);
/* This function should not be invoked if we are not a voting server,
* hence voting_index must be lower than the number of servers in the
* configuration (meaning that we are a voting server). */
assert(voting_index < r->configuration.n);
/* Coherence check that configurationVoterCount and
* configurationIndexOfVoter have returned something that makes sense.
*/
assert(n_voters <= r->configuration.n);
assert(voting_index < n_voters);
/* During pre-vote we don't increment our term, or reset our vote.
* Resetting our vote could lead to double-voting if we were to receive
* a RequestVote RPC during our Candidate state while we already voted
* for a server during the term. */
if (!r->candidate_state.in_pre_vote) {
/* Increment current term */
term = r->current_term + 1;
rv = r->io->set_term(r->io, term);
if (rv != 0) {
tracef("set_term failed %d", rv);
goto err;
}
tracef("beginning of term %llu", term);
/* Vote for self */
rv = r->io->set_vote(r->io, r->id);
if (rv != 0) {
tracef("set_vote self failed %d", rv);
goto err;
}
/* Update our cache too. */
r->current_term = term;
r->voted_for = r->id;
}
/* Reset election timer. */
electionResetTimer(r);
assert(r->candidate_state.votes != NULL);
/* Initialize the votes array and send vote requests. */
for (i = 0; i < n_voters; i++) {
if (i == voting_index) {
r->candidate_state.votes[i] =
true; /* We vote for ourselves */
} else {
r->candidate_state.votes[i] = false;
}
}
for (i = 0; i < r->configuration.n; i++) {
const struct raft_server *server = &r->configuration.servers[i];
if (server->id == r->id || server->role != RAFT_VOTER) {
continue;
}
rv = electionSend(r, server);
if (rv != 0) {
/* This is not a critical failure, let's just log it. */
tracef("failed to send vote request to server %llu: %s",
server->id, raft_strerror(rv));
}
}
return 0;
err:
assert(rv != 0);
return rv;
}
int electionVote(struct raft *r,
const struct raft_request_vote *args,
bool *granted)
{
const struct raft_server *local_server;
raft_index local_last_index;
raft_term local_last_term;
bool is_transferee; /* Requester is the target of a leadership transfer
*/
int rv;
assert(r != NULL);
assert(args != NULL);
assert(granted != NULL);
local_server = configurationGet(&r->configuration, r->id);
*granted = false;
if (local_server == NULL || local_server->role != RAFT_VOTER) {
tracef("local server is not voting -> not granting vote");
return 0;
}
is_transferee =
r->transfer != NULL && r->transfer->id == args->candidate_id;
if (!args->pre_vote && r->voted_for != 0 &&
r->voted_for != args->candidate_id && !is_transferee) {
tracef("local server already voted -> not granting vote");
return 0;
}
/* Raft Dissertation 9.6:
 *
 * > In the Pre-Vote algorithm, a candidate only increments its term if it
 * > first learns from a majority of the cluster that they would be willing
 * > to grant the candidate their votes (if the candidate's log is
 * > sufficiently up-to-date, and the voters have not received heartbeats
 * > from a valid leader for at least a baseline election timeout).
 *
 * Arriving here means that in a pre-vote phase, we will cast our vote if
 * the candidate's log is sufficiently up-to-date, no matter what the
 * candidate's term is. We have already checked if we currently have a
 * leader upon reception of the RequestVote RPC, meaning the 2 conditions
 * will be satisfied if the candidate's log is up-to-date. */
local_last_index = logLastIndex(r->log);
/* Our log is definitely not more up-to-date if it's empty! */
if (local_last_index == 0) {
tracef("local log is empty -> granting vote");
goto grant_vote;
}
local_last_term = logLastTerm(r->log);
if (args->last_log_term < local_last_term) {
/* The requesting server has last entry's log term lower than
* ours. */
tracef(
"local last entry %llu has term %llu higher than %llu -> "
"not "
"granting",
local_last_index, local_last_term, args->last_log_term);
return 0;
}
if (args->last_log_term > local_last_term) {
/* The requesting server has a more up-to-date log. */
tracef(
"remote last entry %llu has term %llu higher than %llu -> "
"granting vote",
args->last_log_index, args->last_log_term, local_last_term);
goto grant_vote;
}
/* The term of the last log entry is the same, so let's compare the
* length of the log. */
assert(args->last_log_term == local_last_term);
if (local_last_index <= args->last_log_index) {
/* Our log is shorter or equal to the one of the requester. */
tracef(
"remote log equal or longer than local -> granting vote");
goto grant_vote;
}
tracef("remote log shorter than local -> not granting vote");
return 0;
grant_vote:
if (!args->pre_vote) {
rv = r->io->set_vote(r->io, args->candidate_id);
if (rv != 0) {
tracef("set_vote failed %d", rv);
return rv;
}
r->voted_for = args->candidate_id;
/* Reset the election timer. */
r->election_timer_start = r->io->time(r->io);
}
tracef("vote granted to %llu", args->candidate_id);
*granted = true;
return 0;
}
bool electionTally(struct raft *r, size_t voter_index)
{
size_t n_voters = configurationVoterCount(&r->configuration);
size_t votes = 0;
size_t i;
size_t half = n_voters / 2;
assert(r->state == RAFT_CANDIDATE);
assert(r->candidate_state.votes != NULL);
r->candidate_state.votes[voter_index] = true;
for (i = 0; i < n_voters; i++) {
if (r->candidate_state.votes[i]) {
votes++;
}
}
return votes >= half + 1;
}
#undef tracef
dqlite-1.16.7/src/raft/election.h 0000664 0000000 0000000 00000005574 14652527134 0016612 0 ustar 00root root 0000000 0000000 /* Election-related logic and helpers. */
#ifndef ELECTION_H_
#define ELECTION_H_
#include "../raft.h"
/* Reset the election_timer clock and set randomized_election_timeout to a
* random value between election_timeout and 2 * election_timeout.
*
* From Section 3.4:
*
* Raft uses randomized election timeouts to ensure that split votes are rare
* and that they are resolved quickly. To prevent split votes in the first
* place, election timeouts are chosen randomly from a fixed interval (e.g.,
* 150-300 ms). This spreads out the servers so that in most cases only a
* single server will time out.
*
* From Section 9.4:
*
* We used AvailSim to approximate a WAN spanning the continental US. Each
* message was assigned a latency chosen randomly from the uniform range of
* 30-40 ms, and the servers' election timeout range was set accordingly to
* 300-600 ms (about 10-20 times the one-way network latency). When only one
* of the five servers has failed, the average election completes within about
* 475 ms, and 99.9% of elections complete within 1.5 s. Even when two of the
* five servers have failed, the average election takes about 650 ms (about 20
* times the one-way network latency), and 99.9% of elections complete in 3
* s. We believe these election times are more than adequate for most WAN
* deployments.
*
* Must be called in follower or candidate state. */
void electionResetTimer(struct raft *r);
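/* The randomized timeout is drawn from [election_timeout, 2 *
 * election_timeout]; e.g. with a 1000 ms election_timeout (illustrative
 * number) a follower waits between 1 and 2 seconds before starting an
 * election, so two followers rarely time out together. */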
/* Return true if the election timer has expired.
*
* Must be called in follower or candidate state. */
bool electionTimerExpired(struct raft *r);
/* Start a new election round.
*
* From Figure 3.1:
*
* [Rules for Servers] Candidates: On conversion to candidates, start
* election:
*
* - Increment current term
* - Vote for self
* - Reset election timer
* - Send RequestVote RPCs to all other servers
*
* From Section 3.4:
*
* To begin an election, a follower increments its current term and
* transitions to candidate state. It then votes for itself and issues
* RequestVote RPCs in parallel to each of the other servers in the
* cluster.
*/
int electionStart(struct raft *r);
/* Decide whether our vote should be granted to the requesting server and update
* our state accordingly.
*
* From Figure 3.1:
*
* RequestVote RPC: Receiver Implementation:
*
* - If votedFor is null or candidateId, and candidate's log is at least as
* up-to-date as receiver's log, grant vote.
*
* The outcome of the decision is stored through the @granted pointer. */
int electionVote(struct raft *r,
const struct raft_request_vote *args,
bool *granted);
/* Update the votes array by adding the vote from the server at the given
* index. Return true if with this vote the server has reached the majority of
* votes and won elections. */
bool electionTally(struct raft *r, size_t voter_index);
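/* Tally arithmetic example (illustrative): electionTally() declares victory
 * when votes >= n_voters / 2 + 1, so with n_voters = 5 a candidate wins with
 * 3 votes, and with n_voters = 4 it still needs 3, i.e. a strict majority in
 * both cases. */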
#endif /* ELECTION_H_ */
dqlite-1.16.7/src/raft/entry.c 0000664 0000000 0000000 00000003170 14652527134 0016132 0 ustar 00root root 0000000 0000000 #include
#include <stdint.h>
#include <string.h>
#include "entry.h"
void entryBatchesDestroy(struct raft_entry *entries, const size_t n)
{
void *batch = NULL;
size_t i;
if (entries == NULL) {
assert(n == 0);
return;
}
assert(n > 0);
for (i = 0; i < n; i++) {
assert(entries[i].batch != NULL);
if (entries[i].batch != batch) {
batch = entries[i].batch;
raft_free(batch);
}
}
raft_free(entries);
}
int entryCopy(const struct raft_entry *src, struct raft_entry *dst)
{
dst->term = src->term;
dst->type = src->type;
dst->buf.len = src->buf.len;
dst->buf.base = raft_malloc(dst->buf.len);
if (dst->buf.len > 0 && dst->buf.base == NULL) {
return RAFT_NOMEM;
}
memcpy(dst->buf.base, src->buf.base, dst->buf.len);
dst->batch = NULL;
return 0;
}
int entryBatchCopy(const struct raft_entry *src,
struct raft_entry **dst,
const size_t n)
{
size_t size = 0;
void *batch;
uint8_t *cursor;
unsigned i;
if (n == 0) {
*dst = NULL;
return 0;
}
/* Calculate the total size of the entries content and allocate the
* batch. */
for (i = 0; i < n; i++) {
size += src[i].buf.len;
}
batch = raft_malloc(size);
if (batch == NULL) {
return RAFT_NOMEM;
}
/* Copy the entries. */
*dst = raft_malloc(n * sizeof **dst);
if (*dst == NULL) {
raft_free(batch);
return RAFT_NOMEM;
}
cursor = batch;
for (i = 0; i < n; i++) {
(*dst)[i].term = src[i].term;
(*dst)[i].type = src[i].type;
(*dst)[i].buf.base = cursor;
(*dst)[i].buf.len = src[i].buf.len;
(*dst)[i].batch = batch;
memcpy((*dst)[i].buf.base, src[i].buf.base, src[i].buf.len);
cursor += src[i].buf.len;
}
return 0;
}
dqlite-1.16.7/src/raft/entry.h 0000664 0000000 0000000 00000001153 14652527134 0016136 0 ustar 00root root 0000000 0000000 #ifndef ENTRY_H_
#define ENTRY_H_
#include "../raft.h"
/* Release all memory associated with the given entries, including the array
* itself. The entries are supposed to belong to one or more batches. */
void entryBatchesDestroy(struct raft_entry *entries, size_t n);
/* Create a copy of a log entry, including its data. */
int entryCopy(const struct raft_entry *src, struct raft_entry *dst);
/* Create a single batch of entries containing a copy of the given entries,
* including their data. */
int entryBatchCopy(const struct raft_entry *src,
struct raft_entry **dst,
size_t n);
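/* Usage sketch (illustrative only; error handling elided):
 *
 *     struct raft_entry *copies;
 *     int rv = entryBatchCopy(entries, &copies, n);
 *     if (rv == 0) {
 *             ... use copies[0 .. n-1], which share a single batch ...
 *             entryBatchesDestroy(copies, n);
 *     }
 */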
#endif /* ENTRY_H */
dqlite-1.16.7/src/raft/err.c 0000664 0000000 0000000 00000003235 14652527134 0015563 0 ustar 00root root 0000000 0000000 #include "err.h"
#include <string.h>
#include "../raft.h"
#include "assert.h"
#define WRAP_SEP ": "
#define WRAP_SEP_LEN ((size_t)strlen(WRAP_SEP))
void errMsgWrap(char *e, const char *format)
{
size_t n = RAFT_ERRMSG_BUF_SIZE;
size_t prefix_n;
size_t prefix_and_sep_n;
size_t trail_n;
size_t i;
/* Calculate the length of the prefix. */
prefix_n = strlen(format);
/* If there isn't enough space for the ": " separator and at least one
* character of the wrapped error message, then just print the prefix.
*/
if (prefix_n >= n - (WRAP_SEP_LEN + 1)) {
/* We explicitly allow truncation here, and silence clang about the
 * unknown warning group "-Wformat-truncation". */
#ifdef __GNUC__
#ifndef __clang__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wformat-truncation"
#endif
#endif
ErrMsgPrintf(e, "%s", format);
#ifdef __GNUC__
#ifndef __clang__
#pragma GCC diagnostic pop
#endif
#endif
return;
}
/* Right-shift the wrapped message, to make room for the prefix. */
prefix_and_sep_n = prefix_n + WRAP_SEP_LEN;
trail_n = strnlen(e, n - prefix_and_sep_n - 1);
memmove(e + prefix_and_sep_n, e, trail_n);
e[prefix_and_sep_n + trail_n] = 0;
/* Print the prefix. */
ErrMsgPrintf(e, "%s", format);
/* Print the separator.
*
* Avoid using strncpy(e->msg + prefix_n, WRAP_SEP, WRAP_SEP_LEN) since
* it generates a warning. */
for (i = 0; i < WRAP_SEP_LEN; i++) {
e[prefix_n + i] = WRAP_SEP[i];
}
}
#define ERR_CODE_TO_STRING_CASE(CODE, MSG) \
case CODE: \
return MSG;
const char *errCodeToString(int code)
{
switch (code) {
ERR_CODE_TO_STRING_MAP(ERR_CODE_TO_STRING_CASE);
default:
return "unknown error";
}
}
dqlite-1.16.7/src/raft/err.h 0000664 0000000 0000000 00000005763 14652527134 0015600 0 ustar 00root root 0000000 0000000 /* Utilities around error handling. */
#ifndef ERROR_H_
#define ERROR_H_
#include <stdio.h>
#include <string.h>
#define ERR_CODE_TO_STRING_MAP(X) \
X(RAFT_NOMEM, "out of memory") \
X(RAFT_BADID, "server ID is not valid") \
X(RAFT_DUPLICATEID, "server ID already in use") \
X(RAFT_DUPLICATEADDRESS, "server address already in use") \
X(RAFT_BADROLE, "server role is not valid") \
X(RAFT_MALFORMED, "encoded data is malformed") \
X(RAFT_NOTLEADER, "server is not the leader") \
X(RAFT_LEADERSHIPLOST, "server has lost leadership") \
X(RAFT_SHUTDOWN, "server is shutting down") \
X(RAFT_CANTBOOTSTRAP, "bootstrap only works on new clusters") \
X(RAFT_CANTCHANGE, "a configuration change is already in progress") \
X(RAFT_CORRUPT, "persisted data is corrupted") \
X(RAFT_CANCELED, "operation canceled") \
X(RAFT_NAMETOOLONG, "resource name too long") \
X(RAFT_TOOBIG, "data is too big") \
X(RAFT_NOCONNECTION, "no connection to remote server available") \
X(RAFT_BUSY, "operation can't be performed at this time") \
X(RAFT_IOERR, "I/O error") \
X(RAFT_NOTFOUND, "Resource not found") \
X(RAFT_INVALID, "Invalid parameter") \
X(RAFT_UNAUTHORIZED, "No access to resource") \
X(RAFT_NOSPACE, "Not enough disk space") \
X(RAFT_TOOMANY, "System or raft limit met or exceeded")
/* Format an error message. */
#define ErrMsgPrintf(ERRMSG, ...) \
snprintf(ERRMSG, RAFT_ERRMSG_BUF_SIZE, __VA_ARGS__)
/* Wrap the given error message with an additional prefix message. */
#define ErrMsgWrapf(ERRMSG, ...) \
do { \
char _errmsg[RAFT_ERRMSG_BUF_SIZE]; \
ErrMsgPrintf(_errmsg, __VA_ARGS__); \
errMsgWrap(ERRMSG, _errmsg); \
} while (0)
void errMsgWrap(char *e, const char *format);
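/* Usage sketch (illustrative only; the messages are hypothetical):
 *
 *     char errmsg[RAFT_ERRMSG_BUF_SIZE];
 *     ErrMsgPrintf(errmsg, "no space left on device");
 *     ErrMsgWrapf(errmsg, "open segment %d", 3);
 *     // errmsg now reads "open segment 3: no space left on device"
 */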
/* Transfer an error message from an object to another, wrapping it. The
 * do/while block makes the macros safe to use in unbraced if/else arms. */
#define ErrMsgTransfer(ERRMSG1, ERRMSG2, FORMAT)                        \
	do {                                                            \
		memcpy(ERRMSG2, ERRMSG1, RAFT_ERRMSG_BUF_SIZE);         \
		ErrMsgWrapf(ERRMSG2, FORMAT);                           \
	} while (0)
#define ErrMsgTransferf(ERRMSG1, ERRMSG2, FORMAT, ...)                  \
	do {                                                            \
		memcpy(ERRMSG2, ERRMSG1, RAFT_ERRMSG_BUF_SIZE);         \
		ErrMsgWrapf(ERRMSG2, FORMAT, __VA_ARGS__);              \
	} while (0)
/* Use the static error message for the error with the given code. */
#define ErrMsgFromCode(ERRMSG, CODE) \
ErrMsgPrintf(ERRMSG, "%s", errCodeToString(CODE))
/* Format the out of memory error message. */
#define ErrMsgOom(ERRMSG) ErrMsgFromCode(ERRMSG, RAFT_NOMEM)
/* Convert a numeric raft error code to a human-readable error message. */
const char *errCodeToString(int code);
#endif /* ERROR_H_ */
dqlite-1.16.7/src/raft/fixture.c 0000664 0000000 0000000 00000134452 14652527134 0016467 0 ustar 00root root 0000000 0000000 #include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "../raft.h"
#include "../tracing.h"
#include "assert.h"
#include "configuration.h"
#include "convert.h"
#include "entry.h"
#include "log.h"
#include "../lib/queue.h"
#include "snapshot.h"
/* Defaults */
#define HEARTBEAT_TIMEOUT 100
#define INSTALL_SNAPSHOT_TIMEOUT 30000
#define ELECTION_TIMEOUT 1000
#define NETWORK_LATENCY 15
#define DISK_LATENCY 10
#define WORK_DURATION 200
#define SEND_LATENCY 0
/* To keep in sync with raft.h */
#define N_MESSAGE_TYPES 6
/* Maximum number of peer stub instances connected to a certain stub
* instance. This should be enough for testing purposes. */
#define MAX_PEERS 8
struct raft_fixture_server
{
bool alive; /* If false, the server is down. */
raft_id id; /* Server ID. */
char address[16]; /* Server address (stringified ID). */
struct raft_tracer tracer; /* Tracer. */
struct raft_io io; /* In-memory raft_io implementation. */
struct raft raft; /* Raft instance. */
};
struct raft_fixture_event
{
unsigned server_index; /* Index of the server the event occurred on. */
int type; /* Type of the event. */
};
RAFT_API int raft_fixture_event_type(struct raft_fixture_event *event)
{
assert(event != NULL);
return event->type;
}
RAFT_API unsigned raft_fixture_event_server_index(
struct raft_fixture_event *event)
{
assert(event != NULL);
return event->server_index;
}
/* Fields common across all request types. */
#define REQUEST \
int type; /* Request code type. */ \
raft_time completion_time; /* When the request should be fulfilled. */ \
queue queue /* Link the I/O pending requests queue. */
/* Request type codes. */
enum { APPEND = 1, SEND, TRANSMIT, SNAPSHOT_PUT, SNAPSHOT_GET, ASYNC_WORK };
/* Abstract base type for an asynchronous request submitted to the stub I/O
 * implementation. */
struct ioRequest
{
REQUEST;
};
/* Pending request to append entries to the log. */
struct append
{
REQUEST;
struct raft_io_append *req;
const struct raft_entry *entries;
unsigned n;
unsigned start; /* Request timestamp. */
};
/* Pending request to send a message. */
struct send
{
REQUEST;
struct raft_io_send *req;
struct raft_message message;
};
/* Pending request to store a snapshot. */
struct snapshot_put
{
REQUEST;
unsigned trailing;
struct raft_io_snapshot_put *req;
const struct raft_snapshot *snapshot;
};
/* Pending request to perform general work. */
struct async_work
{
REQUEST;
struct raft_io_async_work *req;
};
/* Pending request to load a snapshot. */
struct snapshot_get
{
REQUEST;
struct raft_io_snapshot_get *req;
};
/* Message that has been written to the network and is waiting to be delivered
* (or discarded). */
struct transmit
{
REQUEST;
struct raft_message message; /* Message to deliver */
int timer; /* Deliver after this many msecs. */
};
/* Information about a peer server. */
struct peer
{
struct io *io; /* The peer's I/O backend. */
bool connected; /* Whether a connection is established. */
bool saturated; /* Whether the established connection is saturated. */
unsigned send_latency;
};
/* Stub I/O implementation that performs all operations in memory. */
struct io
{
struct raft_io *io; /* I/O object we're implementing. */
unsigned index; /* Fixture server index. */
raft_time *time; /* Global cluster time. */
raft_time next_tick; /* Time the next tick should occur. */
/* Term and vote */
raft_term term;
raft_id voted_for;
/* Log */
struct raft_snapshot *snapshot; /* Latest snapshot */
struct raft_entry *entries; /* Array of persisted entries */
size_t n; /* Size of the persisted entries array */
/* Parameters passed via raft_io->init and raft_io->start */
raft_id id;
const char *address;
unsigned tick_interval;
raft_io_tick_cb tick_cb;
raft_io_recv_cb recv_cb;
/* Queue of pending asynchronous requests, whose callbacks still haven't
* been fired. */
queue requests;
/* Peers connected to us. */
struct peer peers[MAX_PEERS];
unsigned n_peers;
unsigned
randomized_election_timeout; /* Value returned by io->random() */
unsigned network_latency; /* Milliseconds to deliver RPCs */
unsigned disk_latency; /* Milliseconds to perform disk I/O */
unsigned work_duration; /* Milliseconds to run async work */
int append_fault_countdown;
int vote_fault_countdown;
int term_fault_countdown;
int send_fault_countdown;
/* If flag i is true, messages of type i will be silently dropped. */
bool drop[N_MESSAGE_TYPES];
/* Counters of events that happened so far. */
unsigned n_send[N_MESSAGE_TYPES];
unsigned n_recv[N_MESSAGE_TYPES];
unsigned n_append;
};
static bool faultTick(int *countdown)
{
bool trigger = *countdown == 0;
if (*countdown >= 0) {
*countdown -= 1;
}
return trigger;
}
static int ioMethodInit(struct raft_io *raft_io,
raft_id id,
const char *address)
{
struct io *io = raft_io->impl;
io->id = id;
io->address = address;
return 0;
}
static int ioMethodStart(struct raft_io *raft_io,
unsigned msecs,
raft_io_tick_cb tick_cb,
raft_io_recv_cb recv_cb)
{
struct io *io = raft_io->impl;
io->tick_interval = msecs;
io->tick_cb = tick_cb;
io->recv_cb = recv_cb;
io->next_tick = *io->time + io->tick_interval;
return 0;
}
/* Flush an append entries request, appending its entries to the local in-memory
* log. */
static void ioFlushAppend(struct io *s, struct append *append)
{
struct raft_entry *entries;
unsigned i;
int status = 0;
/* Simulates a disk write failure. */
if (faultTick(&s->append_fault_countdown)) {
status = RAFT_IOERR;
goto done;
}
/* Allocate an array for the old entries plus the new ones. */
entries =
raft_realloc(s->entries, (s->n + append->n) * sizeof *s->entries);
assert(entries != NULL);
/* Copy new entries into the new array. */
for (i = 0; i < append->n; i++) {
const struct raft_entry *src = &append->entries[i];
struct raft_entry *dst = &entries[s->n + i];
int rv = entryCopy(src, dst);
assert(rv == 0);
}
s->entries = entries;
s->n += append->n;
done:
if (append->req->cb != NULL) {
append->req->cb(append->req, status);
}
raft_free(append);
}
/* Flush a snapshot put request, copying the snapshot data. */
static void ioFlushSnapshotPut(struct io *s, struct snapshot_put *r)
{
int rv;
if (s->snapshot == NULL) {
s->snapshot = raft_malloc(sizeof *s->snapshot);
assert(s->snapshot != NULL);
} else {
snapshotClose(s->snapshot);
}
rv = snapshotCopy(r->snapshot, s->snapshot);
assert(rv == 0);
if (r->trailing == 0) {
rv = s->io->truncate(s->io, 1);
assert(rv == 0);
}
if (r->req->cb != NULL) {
r->req->cb(r->req, 0);
}
raft_free(r);
}
/* Flush a snapshot get request, returning to the client a copy of the local
* snapshot (if any). */
static void ioFlushSnapshotGet(struct io *s, struct snapshot_get *r)
{
struct raft_snapshot *snapshot;
int rv;
snapshot = raft_malloc(sizeof *snapshot);
assert(snapshot != NULL);
rv = snapshotCopy(s->snapshot, snapshot);
assert(rv == 0);
r->req->cb(r->req, snapshot, 0);
raft_free(r);
}
/* Flush an async work request */
static void ioFlushAsyncWork(struct io *s, struct async_work *r)
{
(void)s;
int rv;
rv = r->req->work(r->req);
r->req->cb(r->req, rv);
raft_free(r);
}
/* Search for the peer with the given ID. */
static struct peer *ioGetPeer(struct io *io, raft_id id)
{
unsigned i;
for (i = 0; i < io->n_peers; i++) {
struct peer *peer = &io->peers[i];
if (peer->io->id == id) {
return peer;
}
}
return NULL;
}
/* Copy the dynamically allocated memory of an AppendEntries message. */
static void copyAppendEntries(const struct raft_append_entries *src,
struct raft_append_entries *dst)
{
int rv;
rv = entryBatchCopy(src->entries, &dst->entries, src->n_entries);
assert(rv == 0);
dst->n_entries = src->n_entries;
}
/* Copy the dynamically allocated memory of an InstallSnapshot message. */
static void copyInstallSnapshot(const struct raft_install_snapshot *src,
struct raft_install_snapshot *dst)
{
int rv;
rv = configurationCopy(&src->conf, &dst->conf);
assert(rv == 0);
dst->data.base = raft_malloc(dst->data.len);
assert(dst->data.base != NULL);
memcpy(dst->data.base, src->data.base, src->data.len);
}
/* Flush a raft_io_send request, copying the message content into a new struct
* transmit object and invoking the user callback. */
static void ioFlushSend(struct io *io, struct send *send)
{
struct peer *peer;
struct transmit *transmit;
struct raft_message *src;
struct raft_message *dst;
int status;
/* If the peer doesn't exist or was disconnected, fail the request. */
peer = ioGetPeer(io, send->message.server_id);
if (peer == NULL || !peer->connected) {
status = RAFT_NOCONNECTION;
goto out;
}
transmit = raft_calloc(1, sizeof *transmit);
assert(transmit != NULL);
transmit->type = TRANSMIT;
transmit->completion_time = *io->time + io->network_latency;
src = &send->message;
dst = &transmit->message;
queue_insert_tail(&io->requests, &transmit->queue);
*dst = *src;
switch (dst->type) {
case RAFT_IO_APPEND_ENTRIES:
/* Make a copy of the entries being sent */
copyAppendEntries(&src->append_entries,
&dst->append_entries);
break;
case RAFT_IO_INSTALL_SNAPSHOT:
copyInstallSnapshot(&src->install_snapshot,
&dst->install_snapshot);
break;
}
io->n_send[send->message.type]++;
status = 0;
out:
if (send->req->cb != NULL) {
send->req->cb(send->req, status);
}
raft_free(send);
}
/* Release the memory used by the given message transmit object. */
static void ioDestroyTransmit(struct transmit *transmit)
{
struct raft_message *message;
message = &transmit->message;
switch (message->type) {
case RAFT_IO_APPEND_ENTRIES:
if (message->append_entries.entries != NULL) {
raft_free(
message->append_entries.entries[0].batch);
raft_free(message->append_entries.entries);
}
break;
case RAFT_IO_INSTALL_SNAPSHOT:
raft_configuration_close(
&message->install_snapshot.conf);
raft_free(message->install_snapshot.data.base);
break;
}
raft_free(transmit);
}
/* Flush all requests in the queue. */
static void ioFlushAll(struct io *io)
{
while (!queue_empty(&io->requests)) {
queue *head;
struct ioRequest *r;
head = queue_head(&io->requests);
queue_remove(head);
r = QUEUE_DATA(head, struct ioRequest, queue);
switch (r->type) {
case APPEND:
ioFlushAppend(io, (struct append *)r);
break;
case SEND:
ioFlushSend(io, (struct send *)r);
break;
case TRANSMIT:
ioDestroyTransmit((struct transmit *)r);
break;
case SNAPSHOT_PUT:
ioFlushSnapshotPut(io,
(struct snapshot_put *)r);
break;
case SNAPSHOT_GET:
ioFlushSnapshotGet(io,
(struct snapshot_get *)r);
break;
case ASYNC_WORK:
ioFlushAsyncWork(io, (struct async_work *)r);
break;
default:
assert(0);
}
}
}
static void ioMethodClose(struct raft_io *raft_io, raft_io_close_cb cb)
{
if (cb != NULL) {
cb(raft_io);
}
}
static int ioMethodLoad(struct raft_io *io,
raft_term *term,
raft_id *voted_for,
struct raft_snapshot **snapshot,
raft_index *start_index,
struct raft_entry **entries,
size_t *n_entries)
{
struct io *s;
int rv;
s = io->impl;
*term = s->term;
*voted_for = s->voted_for;
*start_index = 1;
*n_entries = s->n;
/* Make a copy of the persisted entries, storing their data into a
* single batch. */
rv = entryBatchCopy(s->entries, entries, s->n);
assert(rv == 0);
if (s->snapshot != NULL) {
*snapshot = raft_malloc(sizeof **snapshot);
assert(*snapshot != NULL);
rv = snapshotCopy(s->snapshot, *snapshot);
assert(rv == 0);
*start_index = (*snapshot)->index + 1;
} else {
*snapshot = NULL;
}
return 0;
}
static int ioMethodBootstrap(struct raft_io *raft_io,
const struct raft_configuration *conf)
{
struct io *io = raft_io->impl;
struct raft_buffer buf;
struct raft_entry *entries;
int rv;
if (io->term != 0) {
return RAFT_CANTBOOTSTRAP;
}
assert(io->voted_for == 0);
assert(io->snapshot == NULL);
assert(io->entries == NULL);
assert(io->n == 0);
/* Encode the given configuration. */
rv = configurationEncode(conf, &buf);
if (rv != 0) {
return rv;
}
entries = raft_calloc(1, sizeof *io->entries);
if (entries == NULL) {
return RAFT_NOMEM;
}
entries[0].term = 1;
entries[0].type = RAFT_CHANGE;
entries[0].buf = buf;
io->term = 1;
io->voted_for = 0;
io->snapshot = NULL;
io->entries = entries;
io->n = 1;
return 0;
}
static int ioMethodRecover(struct raft_io *io,
const struct raft_configuration *conf)
{
/* TODO: implement this API */
(void)io;
(void)conf;
return RAFT_IOERR;
}
static int ioMethodSetTerm(struct raft_io *raft_io, const raft_term term)
{
struct io *io = raft_io->impl;
if (faultTick(&io->term_fault_countdown)) {
return RAFT_IOERR;
}
io->term = term;
io->voted_for = 0;
return 0;
}
static int ioMethodSetVote(struct raft_io *raft_io, const raft_id server_id)
{
struct io *io = raft_io->impl;
if (faultTick(&io->vote_fault_countdown)) {
return RAFT_IOERR;
}
io->voted_for = server_id;
return 0;
}
static int ioMethodAppend(struct raft_io *raft_io,
struct raft_io_append *req,
const struct raft_entry entries[],
unsigned n,
raft_io_append_cb cb)
{
struct io *io = raft_io->impl;
struct append *r;
r = raft_malloc(sizeof *r);
assert(r != NULL);
r->type = APPEND;
r->completion_time = *io->time + io->disk_latency;
r->req = req;
r->entries = entries;
r->n = n;
req->cb = cb;
queue_insert_tail(&io->requests, &r->queue);
return 0;
}
static int ioMethodTruncate(struct raft_io *raft_io, raft_index index)
{
struct io *io = raft_io->impl;
size_t n;
n = (size_t)(index - 1); /* Number of entries left after truncation */
if (n > 0) {
struct raft_entry *entries;
/* Create a new array of entries holding the non-truncated
* entries */
entries = raft_malloc(n * sizeof *entries);
if (entries == NULL) {
return RAFT_NOMEM;
}
memcpy(entries, io->entries, n * sizeof *io->entries);
/* Release any truncated entry */
if (io->entries != NULL) {
size_t i;
for (i = n; i < io->n; i++) {
raft_free(io->entries[i].buf.base);
}
raft_free(io->entries);
}
io->entries = entries;
} else {
/* Release everything we have */
if (io->entries != NULL) {
size_t i;
for (i = 0; i < io->n; i++) {
raft_free(io->entries[i].buf.base);
}
raft_free(io->entries);
io->entries = NULL;
}
}
io->n = n;
return 0;
}
static int ioMethodSnapshotPut(struct raft_io *raft_io,
unsigned trailing,
struct raft_io_snapshot_put *req,
const struct raft_snapshot *snapshot,
raft_io_snapshot_put_cb cb)
{
struct io *io = raft_io->impl;
struct snapshot_put *r;
r = raft_malloc(sizeof *r);
assert(r != NULL);
r->type = SNAPSHOT_PUT;
r->req = req;
r->req->cb = cb;
r->snapshot = snapshot;
r->completion_time = *io->time + io->disk_latency;
r->trailing = trailing;
queue_insert_tail(&io->requests, &r->queue);
return 0;
}
static int ioMethodAsyncWork(struct raft_io *raft_io,
struct raft_io_async_work *req,
raft_io_async_work_cb cb)
{
struct io *io = raft_io->impl;
struct async_work *r;
r = raft_malloc(sizeof *r);
assert(r != NULL);
r->type = ASYNC_WORK;
r->req = req;
r->req->cb = cb;
r->completion_time = *io->time + io->work_duration;
queue_insert_tail(&io->requests, &r->queue);
return 0;
}
static int ioMethodSnapshotGet(struct raft_io *raft_io,
struct raft_io_snapshot_get *req,
raft_io_snapshot_get_cb cb)
{
struct io *io = raft_io->impl;
struct snapshot_get *r;
r = raft_malloc(sizeof *r);
assert(r != NULL);
r->type = SNAPSHOT_GET;
r->req = req;
r->req->cb = cb;
r->completion_time = *io->time + io->disk_latency;
queue_insert_tail(&io->requests, &r->queue);
return 0;
}
static raft_time ioMethodTime(struct raft_io *raft_io)
{
struct io *io = raft_io->impl;
return *io->time;
}
static int ioMethodRandom(struct raft_io *raft_io, int min, int max)
{
struct io *io = raft_io->impl;
int t = (int)io->randomized_election_timeout;
if (t < min) {
return min;
} else if (t > max) {
return max;
} else {
return t;
}
}
/* Queue up a send request, to be processed later when the fixture completes
 * it (see completeRequest() and ioFlushAll()). */
static int ioMethodSend(struct raft_io *raft_io,
struct raft_io_send *req,
const struct raft_message *message,
raft_io_send_cb cb)
{
struct io *io = raft_io->impl;
struct send *r;
struct peer *peer;
if (faultTick(&io->send_fault_countdown)) {
return RAFT_IOERR;
}
r = raft_malloc(sizeof *r);
assert(r != NULL);
r->type = SEND;
r->req = req;
r->message = *message;
r->req->cb = cb;
peer = ioGetPeer(io, message->server_id);
r->completion_time = *io->time + peer->send_latency;
queue_insert_tail(&io->requests, &r->queue);
return 0;
}
static void ioReceive(struct io *io, struct raft_message *message)
{
io->recv_cb(io->io, message);
io->n_recv[message->type]++;
}
static void ioDeliverTransmit(struct io *io, struct transmit *transmit)
{
struct raft_message *message = &transmit->message;
struct peer *peer; /* Destination peer */
/* If this message type is in the drop list, let's discard it */
if (io->drop[message->type - 1]) {
ioDestroyTransmit(transmit);
return;
}
peer = ioGetPeer(io, message->server_id);
/* If we don't have a peer with this ID, or the peer is disconnected, or
 * the connection is saturated, drop the message. */
if (peer == NULL || !peer->connected || peer->saturated) {
ioDestroyTransmit(transmit);
return;
}
/* Update the message object with our details. */
message->server_id = io->id;
message->server_address = io->address;
ioReceive(peer->io, message);
raft_free(transmit);
}
/* Connect @raft_io to @other, enabling delivery of messages sent from
 * @raft_io to @other.
 */
static void ioConnect(struct raft_io *raft_io, struct raft_io *other)
{
struct io *io = raft_io->impl;
struct io *io_other = other->impl;
assert(io->n_peers < MAX_PEERS);
io->peers[io->n_peers].io = io_other;
io->peers[io->n_peers].connected = true;
io->peers[io->n_peers].saturated = false;
io->peers[io->n_peers].send_latency = SEND_LATENCY;
io->n_peers++;
}
/* Return whether the connection with the given peer is saturated. */
static bool ioSaturated(struct raft_io *raft_io, struct raft_io *other)
{
struct io *io = raft_io->impl;
struct io *io_other = other->impl;
struct peer *peer;
peer = ioGetPeer(io, io_other->id);
return peer != NULL && peer->saturated;
}
/* Disconnect @raft_io and @other, causing calls to @io->send() to fail
* asynchronously when sending messages to @other. */
static void ioDisconnect(struct raft_io *raft_io, struct raft_io *other)
{
struct io *io = raft_io->impl;
struct io *io_other = other->impl;
struct peer *peer;
peer = ioGetPeer(io, io_other->id);
assert(peer != NULL);
peer->connected = false;
}
/* Reconnect @raft_io and @other. */
static void ioReconnect(struct raft_io *raft_io, struct raft_io *other)
{
struct io *io = raft_io->impl;
struct io *io_other = other->impl;
struct peer *peer;
peer = ioGetPeer(io, io_other->id);
assert(peer != NULL);
peer->connected = true;
}
/* Saturate the connection from @io to @other, causing messages sent from @io to
* @other to be dropped. */
static void ioSaturate(struct raft_io *io, struct raft_io *other)
{
struct io *s;
struct io *s_other;
struct peer *peer;
s = io->impl;
s_other = other->impl;
peer = ioGetPeer(s, s_other->id);
assert(peer != NULL && peer->connected);
peer->saturated = true;
}
/* Desaturate the connection from @raft_io to @other, re-enabling delivery of
* messages sent from @raft_io to @other. */
static void ioDesaturate(struct raft_io *raft_io, struct raft_io *other)
{
struct io *io = raft_io->impl;
struct io *io_other = other->impl;
struct peer *peer;
peer = ioGetPeer(io, io_other->id);
assert(peer != NULL && peer->connected);
peer->saturated = false;
}
/* Enable or disable silently dropping all outgoing messages of type @type. */
void ioDrop(struct io *io, int type, bool flag)
{
io->drop[type - 1] = flag;
}
static int ioInit(struct raft_io *raft_io, unsigned index, raft_time *time)
{
struct io *io;
io = raft_malloc(sizeof *io);
assert(io != NULL);
io->io = raft_io;
io->index = index;
io->time = time;
io->term = 0;
io->voted_for = 0;
io->snapshot = NULL;
io->entries = NULL;
io->n = 0;
queue_init(&io->requests);
io->n_peers = 0;
io->randomized_election_timeout = ELECTION_TIMEOUT + index * 100;
io->network_latency = NETWORK_LATENCY;
io->disk_latency = DISK_LATENCY;
io->work_duration = WORK_DURATION;
io->append_fault_countdown = -1;
io->vote_fault_countdown = -1;
io->term_fault_countdown = -1;
io->send_fault_countdown = -1;
memset(io->drop, 0, sizeof io->drop);
memset(io->n_send, 0, sizeof io->n_send);
memset(io->n_recv, 0, sizeof io->n_recv);
io->n_append = 0;
raft_io->impl = io;
raft_io->version = 2;
raft_io->init = ioMethodInit;
raft_io->close = ioMethodClose;
raft_io->start = ioMethodStart;
raft_io->load = ioMethodLoad;
raft_io->bootstrap = ioMethodBootstrap;
raft_io->recover = ioMethodRecover;
raft_io->set_term = ioMethodSetTerm;
raft_io->set_vote = ioMethodSetVote;
raft_io->append = ioMethodAppend;
raft_io->truncate = ioMethodTruncate;
raft_io->send = ioMethodSend;
raft_io->snapshot_put = ioMethodSnapshotPut;
raft_io->async_work = ioMethodAsyncWork;
raft_io->snapshot_get = ioMethodSnapshotGet;
raft_io->time = ioMethodTime;
raft_io->random = ioMethodRandom;
return 0;
}
/* Release all memory held by the given stub I/O implementation. */
void ioClose(struct raft_io *raft_io)
{
struct io *io = raft_io->impl;
size_t i;
for (i = 0; i < io->n; i++) {
struct raft_entry *entry = &io->entries[i];
raft_free(entry->buf.base);
}
if (io->entries != NULL) {
raft_free(io->entries);
}
if (io->snapshot != NULL) {
snapshotClose(io->snapshot);
raft_free(io->snapshot);
}
raft_free(io);
}
/* Custom emit tracer function which includes the server ID. */
static void emit(struct raft_tracer *t,
const char *file,
unsigned int line,
const char *func,
unsigned int level,
const char *message)
{
unsigned id = *(unsigned *)t->impl;
(void)func;
(void)level;
fprintf(stderr, "%d: %30s:%*d - %s\n", id, file, 3, line, message);
}
static int serverInit(struct raft_fixture *f, unsigned i, struct raft_fsm *fsm)
{
int rv;
struct raft_fixture_server *s;
s = raft_malloc(sizeof(*s));
if (s == NULL) {
return RAFT_NOMEM;
}
f->servers[i] = s;
s->alive = true;
s->id = i + 1;
sprintf(s->address, "%llu", s->id);
rv = ioInit(&s->io, i, &f->time);
if (rv != 0) {
return rv;
}
rv = raft_init(&s->raft, &s->io, fsm, s->id, s->address);
if (rv != 0) {
return rv;
}
raft_set_election_timeout(&s->raft, ELECTION_TIMEOUT);
raft_set_heartbeat_timeout(&s->raft, HEARTBEAT_TIMEOUT);
raft_set_install_snapshot_timeout(&s->raft, INSTALL_SNAPSHOT_TIMEOUT);
s->tracer.impl = (void *)&s->id;
s->tracer.emit = emit;
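/* The tracer is prepared but left disabled by default; a test can enable
 * tracing by assigning &s->tracer instead of NULL below. */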
s->raft.tracer = NULL;
return 0;
}
static void serverClose(struct raft_fixture_server *s)
{
raft_close(&s->raft, NULL);
ioClose(&s->io);
raft_free(s);
}
/* Connect the server with the given index to all others */
static void serverConnectToAll(struct raft_fixture *f, unsigned i)
{
unsigned j;
for (j = 0; j < f->n; j++) {
struct raft_io *io1 = &f->servers[i]->io;
struct raft_io *io2 = &f->servers[j]->io;
if (i == j) {
continue;
}
ioConnect(io1, io2);
}
}
int raft_fixture_init(struct raft_fixture *f)
{
f->time = 0;
f->n = 0;
f->log = logInit();
if (f->log == NULL) {
return RAFT_NOMEM;
}
f->commit_index = 0;
f->hook = NULL;
f->event = raft_malloc(sizeof(*f->event));
if (f->event == NULL) {
return RAFT_NOMEM;
}
return 0;
}
void raft_fixture_close(struct raft_fixture *f)
{
unsigned i;
for (i = 0; i < f->n; i++) {
struct io *io = f->servers[i]->io.impl;
ioFlushAll(io);
}
for (i = 0; i < f->n; i++) {
serverClose(f->servers[i]);
}
raft_free(f->event);
logClose(f->log);
}
int raft_fixture_configuration(struct raft_fixture *f,
unsigned n_voting,
struct raft_configuration *configuration)
{
unsigned i;
assert(f->n > 0);
assert(n_voting > 0);
assert(n_voting <= f->n);
raft_configuration_init(configuration);
for (i = 0; i < f->n; i++) {
struct raft_fixture_server *s;
int role = i < n_voting ? RAFT_VOTER : RAFT_STANDBY;
int rv;
s = f->servers[i];
rv = raft_configuration_add(configuration, s->id, s->address,
role);
if (rv != 0) {
return rv;
}
}
return 0;
}
int raft_fixture_bootstrap(struct raft_fixture *f,
struct raft_configuration *configuration)
{
unsigned i;
for (i = 0; i < f->n; i++) {
struct raft *raft = raft_fixture_get(f, i);
int rv;
rv = raft_bootstrap(raft, configuration);
if (rv != 0) {
return rv;
}
}
return 0;
}
int raft_fixture_start(struct raft_fixture *f)
{
unsigned i;
int rv;
for (i = 0; i < f->n; i++) {
struct raft_fixture_server *s = f->servers[i];
rv = raft_start(&s->raft);
if (rv != 0) {
return rv;
}
}
return 0;
}
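/* Illustrative sketch (not part of the build): a typical fixture-driven test
 * wires the functions above together roughly as follows, assuming the caller
 * provides one FSM per server (the `fsms` array here is hypothetical):
 *
 *     struct raft_fixture f;
 *     struct raft_configuration conf;
 *     raft_fixture_init(&f);
 *     for (unsigned i = 0; i < 3; i++) {
 *             raft_fixture_grow(&f, &fsms[i]);
 *     }
 *     raft_fixture_configuration(&f, 3, &conf);
 *     raft_fixture_bootstrap(&f, &conf);
 *     raft_configuration_close(&conf);
 *     raft_fixture_start(&f);
 *     raft_fixture_elect(&f, 0);
 *
 * Error handling is omitted; each call returning int should be checked. */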
unsigned raft_fixture_n(struct raft_fixture *f)
{
return f->n;
}
raft_time raft_fixture_time(struct raft_fixture *f)
{
return f->time;
}
struct raft *raft_fixture_get(struct raft_fixture *f, unsigned i)
{
assert(i < f->n);
return &f->servers[i]->raft;
}
bool raft_fixture_alive(struct raft_fixture *f, unsigned i)
{
assert(i < f->n);
return f->servers[i]->alive;
}
unsigned raft_fixture_leader_index(struct raft_fixture *f)
{
if (f->leader_id != 0) {
return (unsigned)(f->leader_id - 1);
}
return f->n;
}
raft_id raft_fixture_voted_for(struct raft_fixture *f, unsigned i)
{
struct io *io = f->servers[i]->io.impl;
return io->voted_for;
}
/* Update the leader and check for election safety.
*
* From figure 3.2:
*
* Election Safety -> At most one leader can be elected in a given
* term.
*
* Return true if the current leader turns out to be different from the one at
* the time this function was called.
*/
static bool updateLeaderAndCheckElectionSafety(struct raft_fixture *f)
{
raft_id leader_id = 0;
unsigned leader_i = 0;
raft_term leader_term = 0;
unsigned i;
bool changed;
for (i = 0; i < f->n; i++) {
struct raft *raft = raft_fixture_get(f, i);
unsigned j;
/* If the server is not alive or is not the leader, skip to the
* next server. */
if (!raft_fixture_alive(f, i) ||
raft_state(raft) != RAFT_LEADER) {
continue;
}
/* Check that no other server is leader for this term. */
for (j = 0; j < f->n; j++) {
struct raft *other = raft_fixture_get(f, j);
if (other->id == raft->id ||
other->state != RAFT_LEADER) {
continue;
}
if (other->current_term == raft->current_term) {
fprintf(stderr,
"server %llu and %llu are both leaders "
"in term %llu",
raft->id, other->id,
raft->current_term);
abort();
}
}
if (raft->current_term > leader_term) {
leader_id = raft->id;
leader_i = i;
leader_term = raft->current_term;
}
}
/* Check that the leader is stable, in the sense that it has been
* acknowledged by all alive servers connected to it, and those servers
* together with the leader form a majority. */
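/* For example, with n_quorum == 5 the leader implicitly counts as one
 * acknowledgment, so at least n_quorum / 2 == 2 of the other counted
 * servers must have acked it, yielding a 3-out-of-5 majority. */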
if (leader_id != 0) {
unsigned n_acks = 0;
bool acked = true;
unsigned n_quorum = 0;
for (i = 0; i < f->n; i++) {
struct raft *raft = raft_fixture_get(f, i);
const struct raft_server *server =
configurationGet(&raft->configuration, raft->id);
/* If the server is not in the configuration or is a spare,
 * then don't count it. */
if (server == NULL || server->role == RAFT_SPARE) {
continue;
}
n_quorum++;
/* If this server is itself the leader, or it's not
* alive or it's not connected to the leader, then don't
* count it in for stability. */
if (i == leader_i || !raft_fixture_alive(f, i) ||
raft_fixture_saturated(f, leader_i, i)) {
continue;
}
if (raft->current_term != leader_term) {
acked = false;
break;
}
if (raft->state != RAFT_FOLLOWER) {
acked = false;
break;
}
if (raft->follower_state.current_leader.id == 0) {
acked = false;
break;
}
if (raft->follower_state.current_leader.id !=
leader_id) {
acked = false;
break;
}
n_acks++;
}
if (!acked || n_acks < (n_quorum / 2)) {
leader_id = 0;
}
}
changed = leader_id != f->leader_id;
f->leader_id = leader_id;
return changed;
}
/* Check for leader append-only.
*
* From figure 3.2:
*
* Leader Append-Only -> A leader never overwrites or deletes entries in its
* own log; it only appends new entries.
*/
static void checkLeaderAppendOnly(struct raft_fixture *f)
{
struct raft *raft;
raft_index index;
raft_index last = logLastIndex(f->log);
/* If the cached log is empty it means there was no leader before. */
if (last == 0) {
return;
}
/* If there's no new leader, just return. */
if (f->leader_id == 0) {
return;
}
raft = raft_fixture_get(f, (unsigned)f->leader_id - 1);
last = logLastIndex(f->log);
for (index = 1; index <= last; index++) {
const struct raft_entry *entry1;
const struct raft_entry *entry2;
size_t i;
entry1 = logGet(f->log, index);
entry2 = logGet(raft->log, index);
assert(entry1 != NULL);
/* Check if the entry was snapshotted. */
if (entry2 == NULL) {
assert(raft->log->snapshot.last_index >= index);
continue;
}
/* Entry was not overwritten. */
assert(entry1->type == entry2->type);
assert(entry1->term == entry2->term);
for (i = 0; i < entry1->buf.len; i++) {
assert(((uint8_t *)entry1->buf.base)[i] ==
((uint8_t *)entry2->buf.base)[i]);
}
}
}
/* Make a copy of the current leader's log, in order to perform the Leader
 * Append-Only check at the next iteration. */
static void copyLeaderLog(struct raft_fixture *f)
{
struct raft *raft = raft_fixture_get(f, (unsigned)f->leader_id - 1);
struct raft_entry *entries;
unsigned n;
size_t i;
int rv;
logClose(f->log);
f->log = logInit();
if (f->log == NULL) {
assert(false);
return;
}
rv = logAcquire(raft->log, 1, &entries, &n);
assert(rv == 0);
for (i = 0; i < n; i++) {
struct raft_entry *entry = &entries[i];
struct raft_buffer buf;
buf.len = entry->buf.len;
buf.base = raft_malloc(buf.len);
assert(buf.base != NULL);
memcpy(buf.base, entry->buf.base, buf.len);
/* FIXME(cole) what to do here for is_local? */
rv = logAppend(f->log, entry->term, entry->type, buf, (struct raft_entry_local_data){}, false, NULL);
assert(rv == 0);
}
logRelease(raft->log, 1, entries, n);
}
/* Update the commit index to match the one from the current leader. */
static void updateCommitIndex(struct raft_fixture *f)
{
struct raft *raft = raft_fixture_get(f, (unsigned)f->leader_id - 1);
if (raft->commit_index > f->commit_index) {
f->commit_index = raft->commit_index;
}
}
/* Return the lowest tick time across all servers, along with the associated
* server index */
static void getLowestTickTime(struct raft_fixture *f, raft_time *t, unsigned *i)
{
unsigned j;
*t = (raft_time)-1 /* Maximum value */;
for (j = 0; j < f->n; j++) {
struct io *io = f->servers[j]->io.impl;
if (io->next_tick < *t) {
*t = io->next_tick;
*i = j;
}
}
}
/* Return the completion time of the request with the lowest completion time
* across all servers, along with the associated server index. */
static void getLowestRequestCompletionTime(struct raft_fixture *f,
raft_time *t,
unsigned *i)
{
unsigned j;
*t = (raft_time)-1 /* Maximum value */;
for (j = 0; j < f->n; j++) {
struct io *io = f->servers[j]->io.impl;
queue *head;
QUEUE_FOREACH(head, &io->requests)
{
struct ioRequest *r =
QUEUE_DATA(head, struct ioRequest, queue);
if (r->completion_time < *t) {
*t = r->completion_time;
*i = j;
}
}
}
}
/* Fire the tick callback of the i'th server. */
static void fireTick(struct raft_fixture *f, unsigned i)
{
struct io *io = f->servers[i]->io.impl;
f->time = io->next_tick;
f->event->server_index = i;
f->event->type = RAFT_FIXTURE_TICK;
io->next_tick += io->tick_interval;
if (f->servers[i]->alive) {
io->tick_cb(io->io);
}
}
/* Complete the first request with completion time @t on the @i'th server. */
static void completeRequest(struct raft_fixture *f, unsigned i, raft_time t)
{
struct io *io = f->servers[i]->io.impl;
queue *head;
struct ioRequest *r = NULL;
bool found = false;
f->time = t;
f->event->server_index = i;
QUEUE_FOREACH(head, &io->requests)
{
r = QUEUE_DATA(head, struct ioRequest, queue);
if (r->completion_time == t) {
found = true;
break;
}
}
assert(found);
queue_remove(head);
switch (r->type) {
case APPEND:
ioFlushAppend(io, (struct append *)r);
f->event->type = RAFT_FIXTURE_DISK;
break;
case SEND:
ioFlushSend(io, (struct send *)r);
f->event->type = RAFT_FIXTURE_NETWORK;
break;
case TRANSMIT:
ioDeliverTransmit(io, (struct transmit *)r);
f->event->type = RAFT_FIXTURE_NETWORK;
break;
case SNAPSHOT_PUT:
ioFlushSnapshotPut(io, (struct snapshot_put *)r);
f->event->type = RAFT_FIXTURE_DISK;
break;
case SNAPSHOT_GET:
ioFlushSnapshotGet(io, (struct snapshot_get *)r);
f->event->type = RAFT_FIXTURE_DISK;
break;
case ASYNC_WORK:
ioFlushAsyncWork(io, (struct async_work *)r);
f->event->type = RAFT_FIXTURE_WORK;
break;
default:
assert(0);
}
}
struct raft_fixture_event *raft_fixture_step(struct raft_fixture *f)
{
raft_time tick_time;
raft_time completion_time;
unsigned i = f->n;
unsigned j = f->n;
getLowestTickTime(f, &tick_time, &i);
getLowestRequestCompletionTime(f, &completion_time, &j);
assert(i < f->n || j < f->n);
if (tick_time < completion_time ||
(tick_time == completion_time && i <= j)) {
fireTick(f, i);
} else {
completeRequest(f, j, completion_time);
}
/* If the leader has not changed, check the Leader Append-Only
 * guarantee. */
if (!updateLeaderAndCheckElectionSafety(f)) {
checkLeaderAppendOnly(f);
}
/* If we have a leader, update leader-related state. */
if (f->leader_id != 0) {
copyLeaderLog(f);
updateCommitIndex(f);
}
if (f->hook != NULL) {
f->hook(f, f->event);
}
return f->event;
}
struct raft_fixture_event *raft_fixture_step_n(struct raft_fixture *f,
unsigned n)
{
unsigned i;
assert(n > 0);
for (i = 0; i < n - 1; i++) {
raft_fixture_step(f);
}
return raft_fixture_step(f);
}
bool raft_fixture_step_until(struct raft_fixture *f,
bool (*stop)(struct raft_fixture *f, void *arg),
void *arg,
unsigned max_msecs)
{
raft_time start = f->time;
while (!stop(f, arg) && (f->time - start) < max_msecs) {
raft_fixture_step(f);
}
return f->time - start < max_msecs;
}
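/* Illustrative sketch (not part of the build): callers can supply their own
 * stop predicate, following the same pattern as the helpers below. For
 * example, to wait until server 0's last log index reaches a threshold:
 *
 *     static bool hasIndex(struct raft_fixture *f, void *arg)
 *     {
 *             raft_index *min = arg;
 *             return logLastIndex(raft_fixture_get(f, 0)->log) >= *min;
 *     }
 *
 *     raft_index min = 3;
 *     raft_fixture_step_until(f, hasIndex, &min, 1000);
 */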
/* A stop function which always returns false, forcing raft_fixture_step_until
 * to keep advancing time at each iteration until the timeout expires. */
static bool spin(struct raft_fixture *f, void *arg)
{
(void)f;
(void)arg;
return false;
}
void raft_fixture_step_until_elapsed(struct raft_fixture *f, unsigned msecs)
{
raft_fixture_step_until(f, spin, NULL, msecs);
}
static bool hasLeader(struct raft_fixture *f, void *arg)
{
(void)arg;
return f->leader_id != 0;
}
bool raft_fixture_step_until_has_leader(struct raft_fixture *f,
unsigned max_msecs)
{
return raft_fixture_step_until(f, hasLeader, NULL, max_msecs);
}
static bool hasNoLeader(struct raft_fixture *f, void *arg)
{
(void)arg;
return f->leader_id == 0;
}
bool raft_fixture_step_until_has_no_leader(struct raft_fixture *f,
unsigned max_msecs)
{
return raft_fixture_step_until(f, hasNoLeader, NULL, max_msecs);
}
/* Enable/disable dropping outgoing messages of a certain type from all servers
* except one. */
static void dropAllExcept(struct raft_fixture *f,
int type,
bool flag,
unsigned i)
{
unsigned j;
for (j = 0; j < f->n; j++) {
struct raft_fixture_server *s = f->servers[j];
if (j == i) {
continue;
}
ioDrop(s->io.impl, type, flag);
}
}
/* Set the randomized election timeout of the given server to the minimum value
* compatible with its current state and timers. */
static void minimizeRandomizedElectionTimeout(struct raft_fixture *f,
unsigned i)
{
struct raft *raft = &f->servers[i]->raft;
raft_time now = raft->io->time(raft->io);
unsigned timeout = raft->election_timeout;
assert(raft->state == RAFT_FOLLOWER);
/* If using the minimum election timeout value would make the timer expire
 * in the past, raise it just enough to make it expire immediately. */
if (now - raft->election_timer_start > timeout) {
timeout = (unsigned)(now - raft->election_timer_start);
}
raft->follower_state.randomized_election_timeout = timeout;
}
/* Set the randomized election timeout to the maximum value on all servers
* except the given one. */
static void maximizeAllRandomizedElectionTimeoutsExcept(struct raft_fixture *f,
unsigned i)
{
unsigned j;
for (j = 0; j < f->n; j++) {
struct raft *raft = &f->servers[j]->raft;
unsigned timeout = raft->election_timeout * 2;
if (j == i) {
continue;
}
assert(raft->state == RAFT_FOLLOWER);
raft->follower_state.randomized_election_timeout = timeout;
}
}
void raft_fixture_hook(struct raft_fixture *f, raft_fixture_event_cb hook)
{
f->hook = hook;
}
void raft_fixture_start_elect(struct raft_fixture *f, unsigned i)
{
struct raft *raft = raft_fixture_get(f, i);
unsigned j;
/* Make sure there's currently no leader. */
assert(f->leader_id == 0);
/* Make sure that the given server is voting. */
assert(configurationGet(&raft->configuration, raft->id)->role ==
RAFT_VOTER);
/* Make sure all servers are currently followers. */
for (j = 0; j < f->n; j++) {
assert(raft_state(&f->servers[j]->raft) == RAFT_FOLLOWER);
}
/* Pretend that the last randomized election timeout was set at the
 * maximum value on all servers except the one to be elected, which is
 * instead set to the minimum possible value compatible with its current
 * state. */
minimizeRandomizedElectionTimeout(f, i);
maximizeAllRandomizedElectionTimeoutsExcept(f, i);
}
void raft_fixture_elect(struct raft_fixture *f, unsigned i)
{
struct raft *raft = raft_fixture_get(f, i);
raft_fixture_start_elect(f, i);
raft_fixture_step_until_has_leader(f, ELECTION_TIMEOUT * 20);
assert(f->leader_id == raft->id);
}
void raft_fixture_depose(struct raft_fixture *f)
{
unsigned leader_i;
/* Make sure there's a leader. */
assert(f->leader_id != 0);
leader_i = (unsigned)f->leader_id - 1;
assert(raft_state(&f->servers[leader_i]->raft) == RAFT_LEADER);
/* Set a very large election timeout on all followers, to prevent them
* from starting an election. */
maximizeAllRandomizedElectionTimeoutsExcept(f, leader_i);
/* Prevent all servers from sending append entries results, so the
* leader will eventually step down. */
dropAllExcept(f, RAFT_IO_APPEND_ENTRIES_RESULT, true, leader_i);
raft_fixture_step_until_has_no_leader(f, ELECTION_TIMEOUT * 3);
assert(f->leader_id == 0);
dropAllExcept(f, RAFT_IO_APPEND_ENTRIES_RESULT, false, leader_i);
}
struct step_apply
{
unsigned i;
raft_index index;
};
static bool hasAppliedIndex(struct raft_fixture *f, void *arg)
{
struct step_apply *apply = (struct step_apply *)arg;
struct raft *raft;
unsigned n = 0;
unsigned i;
if (apply->i < f->n) {
raft = raft_fixture_get(f, apply->i);
return raft_last_applied(raft) >= apply->index;
}
for (i = 0; i < f->n; i++) {
raft = raft_fixture_get(f, i);
if (raft_last_applied(raft) >= apply->index) {
n++;
}
}
return n == f->n;
}
bool raft_fixture_step_until_applied(struct raft_fixture *f,
unsigned i,
raft_index index,
unsigned max_msecs)
{
struct step_apply apply = {i, index};
return raft_fixture_step_until(f, hasAppliedIndex, &apply, max_msecs);
}
struct step_state
{
unsigned i;
int state;
};
static bool hasState(struct raft_fixture *f, void *arg)
{
struct step_state *target = (struct step_state *)arg;
struct raft *raft;
raft = raft_fixture_get(f, target->i);
return raft_state(raft) == target->state;
}
bool raft_fixture_step_until_state_is(struct raft_fixture *f,
unsigned i,
int state,
unsigned max_msecs)
{
struct step_state target = {i, state};
return raft_fixture_step_until(f, hasState, &target, max_msecs);
}
struct step_term
{
unsigned i;
raft_term term;
};
static bool hasTerm(struct raft_fixture *f, void *arg)
{
struct step_term *target = (struct step_term *)arg;
struct raft *raft;
raft = raft_fixture_get(f, target->i);
return raft->current_term == target->term;
}
bool raft_fixture_step_until_term_is(struct raft_fixture *f,
unsigned i,
raft_term term,
unsigned max_msecs)
{
struct step_term target = {i, term};
return raft_fixture_step_until(f, hasTerm, &target, max_msecs);
}
struct step_vote
{
unsigned i;
unsigned j;
};
static bool hasVotedFor(struct raft_fixture *f, void *arg)
{
struct step_vote *target = (struct step_vote *)arg;
struct raft *raft;
raft = raft_fixture_get(f, target->i);
return raft->voted_for == target->j + 1;
}
bool raft_fixture_step_until_voted_for(struct raft_fixture *f,
unsigned i,
unsigned j,
unsigned max_msecs)
{
struct step_vote target = {i, j};
return raft_fixture_step_until(f, hasVotedFor, &target, max_msecs);
}
struct step_deliver
{
unsigned i;
unsigned j;
};
static bool hasDelivered(struct raft_fixture *f, void *arg)
{
struct step_deliver *target = (struct step_deliver *)arg;
struct raft *raft;
struct io *io;
struct raft_message *message;
queue *head;
raft = raft_fixture_get(f, target->i);
io = raft->io->impl;
QUEUE_FOREACH(head, &io->requests)
{
struct ioRequest *r;
r = QUEUE_DATA(head, struct ioRequest, queue);
message = NULL;
switch (r->type) {
case SEND:
message = &((struct send *)r)->message;
break;
case TRANSMIT:
message = &((struct transmit *)r)->message;
break;
}
if (message != NULL && message->server_id == target->j + 1) {
return false;
}
}
return true;
}
bool raft_fixture_step_until_delivered(struct raft_fixture *f,
unsigned i,
unsigned j,
unsigned max_msecs)
{
struct step_deliver target = {i, j};
return raft_fixture_step_until(f, hasDelivered, &target, max_msecs);
}
void raft_fixture_disconnect(struct raft_fixture *f, unsigned i, unsigned j)
{
struct raft_io *io1 = &f->servers[i]->io;
struct raft_io *io2 = &f->servers[j]->io;
ioDisconnect(io1, io2);
}
void raft_fixture_reconnect(struct raft_fixture *f, unsigned i, unsigned j)
{
struct raft_io *io1 = &f->servers[i]->io;
struct raft_io *io2 = &f->servers[j]->io;
ioReconnect(io1, io2);
}
void raft_fixture_saturate(struct raft_fixture *f, unsigned i, unsigned j)
{
struct raft_io *io1 = &f->servers[i]->io;
struct raft_io *io2 = &f->servers[j]->io;
ioSaturate(io1, io2);
}
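/* Cut server @i off from every other server by saturating both directions of
 * each connection. Note: despite the name, this uses saturation rather than
 * ioDisconnect(), so pending sends still succeed but messages are dropped. */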
static void disconnectFromAll(struct raft_fixture *f, unsigned i)
{
unsigned j;
for (j = 0; j < f->n; j++) {
if (j == i) {
continue;
}
raft_fixture_saturate(f, i, j);
raft_fixture_saturate(f, j, i);
}
}
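/* Undo disconnectFromAll(), desaturating the connections between server @i
 * and every other server that is still alive. */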
static void reconnectToAll(struct raft_fixture *f, unsigned i)
{
unsigned j;
for (j = 0; j < f->n; j++) {
if (j == i) {
continue;
}
/* Don't desaturate connections to servers that are not alive. */
if (!f->servers[j]->alive) {
continue;
}
raft_fixture_desaturate(f, i, j);
raft_fixture_desaturate(f, j, i);
}
}
bool raft_fixture_saturated(struct raft_fixture *f, unsigned i, unsigned j)
{
struct raft_io *io1 = &f->servers[i]->io;
struct raft_io *io2 = &f->servers[j]->io;
return ioSaturated(io1, io2);
}
void raft_fixture_desaturate(struct raft_fixture *f, unsigned i, unsigned j)
{
struct raft_io *io1 = &f->servers[i]->io;
struct raft_io *io2 = &f->servers[j]->io;
ioDesaturate(io1, io2);
}
void raft_fixture_kill(struct raft_fixture *f, unsigned i)
{
disconnectFromAll(f, i);
f->servers[i]->alive = false;
}
void raft_fixture_revive(struct raft_fixture *f, unsigned i)
{
reconnectToAll(f, i);
f->servers[i]->alive = true;
}
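/* Illustrative sketch (not part of the build): simulating the crash of the
 * current leader with the helpers above, assuming a 3-server fixture `f`
 * whose leader is server 0:
 *
 *     raft_fixture_kill(&f, 0);
 *     raft_fixture_step_until_has_no_leader(&f, ELECTION_TIMEOUT * 3);
 *     raft_fixture_step_until_has_leader(&f, ELECTION_TIMEOUT * 20);
 *     raft_fixture_revive(&f, 0);
 */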
int raft_fixture_grow(struct raft_fixture *f, struct raft_fsm *fsm)
{
unsigned i;
unsigned j;
int rc;
i = f->n;
f->n++;
rc = serverInit(f, i, fsm);
if (rc != 0) {
return rc;
}
serverConnectToAll(f, i);
for (j = 0; j < f->n; j++) {
struct raft_io *io1 = &f->servers[i]->io;
struct raft_io *io2 = &f->servers[j]->io;
ioConnect(io2, io1);
}
return 0;
}
void raft_fixture_set_randomized_election_timeout(struct raft_fixture *f,
unsigned i,
unsigned msecs)
{
struct io *io = f->servers[i]->io.impl;
io->randomized_election_timeout = msecs;
}
void raft_fixture_set_network_latency(struct raft_fixture *f,
unsigned i,
unsigned msecs)
{
struct io *io = f->servers[i]->io.impl;
io->network_latency = msecs;
}
void raft_fixture_set_disk_latency(struct raft_fixture *f,
unsigned i,
unsigned msecs)
{
struct io *io = f->servers[i]->io.impl;
io->disk_latency = msecs;
}
void raft_fixture_set_send_latency(struct raft_fixture *f,
unsigned i,
unsigned j,
unsigned msecs)
{
struct io *io = f->servers[i]->io.impl;
struct peer *peer = ioGetPeer(io, f->servers[j]->id);
peer->send_latency = msecs;
}
void raft_fixture_set_term(struct raft_fixture *f, unsigned i, raft_term term)
{
struct io *io = f->servers[i]->io.impl;
io->term = term;
}
void raft_fixture_set_snapshot(struct raft_fixture *f,
unsigned i,
struct raft_snapshot *snapshot)
{
struct io *io = f->servers[i]->io.impl;
io->snapshot = snapshot;
}
void raft_fixture_add_entry(struct raft_fixture *f,
unsigned i,
struct raft_entry *entry)
{
struct io *io = f->servers[i]->io.impl;
struct raft_entry *entries;
entries = raft_realloc(io->entries, (io->n + 1) * sizeof *entries);
assert(entries != NULL);
entries[io->n] = *entry;
io->entries = entries;
io->n++;
}
void raft_fixture_append_fault(struct raft_fixture *f, unsigned i, int delay)
{
struct io *io = f->servers[i]->io.impl;
io->append_fault_countdown = delay;
}
void raft_fixture_vote_fault(struct raft_fixture *f, unsigned i, int delay)
{
struct io *io = f->servers[i]->io.impl;
io->vote_fault_countdown = delay;
}
void raft_fixture_term_fault(struct raft_fixture *f, unsigned i, int delay)
{
struct io *io = f->servers[i]->io.impl;
io->term_fault_countdown = delay;
}
void raft_fixture_send_fault(struct raft_fixture *f, unsigned i, int delay)
{
struct io *io = f->servers[i]->io.impl;
io->send_fault_countdown = delay;
}
unsigned raft_fixture_n_send(struct raft_fixture *f, unsigned i, int type)
{
struct io *io = f->servers[i]->io.impl;
return io->n_send[type];
}
unsigned raft_fixture_n_recv(struct raft_fixture *f, unsigned i, int type)
{
struct io *io = f->servers[i]->io.impl;
return io->n_recv[type];
}
void raft_fixture_make_unavailable(struct raft_fixture *f, unsigned i)
{
struct raft *r = &f->servers[i]->raft;
convertToUnavailable(r);
}
dqlite-1.16.7/src/raft/flags.c 0000664 0000000 0000000 00000000431 14652527134 0016062 0 ustar 00root root 0000000 0000000 #include "flags.h"
inline raft_flags flagsSet(raft_flags in, raft_flags flags)
{
return in | flags;
}
inline raft_flags flagsClear(raft_flags in, raft_flags flags)
{
return in & (~flags);
}
inline bool flagsIsSet(raft_flags in, raft_flags flag)
{
return (bool)(in & flag);
}
dqlite-1.16.7/src/raft/flags.h 0000664 0000000 0000000 00000001155 14652527134 0016073 0 ustar 00root root 0000000 0000000 #ifndef FLAGS_H_
#define FLAGS_H_
#include "../raft.h"
#define RAFT_DEFAULT_FEATURE_FLAGS (0)
/* Adds the flags @flags to @in and returns the new flags. Multiple flags should
* be combined using the `|` operator. */
raft_flags flagsSet(raft_flags in, raft_flags flags);
/* Clears the flags @flags from @in and returns the new flags. Multiple flags
* should be combined using the `|` operator. */
raft_flags flagsClear(raft_flags in, raft_flags flags);
/* Returns `true` if the single flag @flag is set in @in, otherwise returns
* `false`. */
bool flagsIsSet(raft_flags in, raft_flags flag);
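/* Example (illustrative): combining and testing flags, assuming two
 * application-defined single-bit flags FLAG_A and FLAG_B:
 *
 *     raft_flags flags = RAFT_DEFAULT_FEATURE_FLAGS;
 *     flags = flagsSet(flags, FLAG_A | FLAG_B);
 *     flags = flagsClear(flags, FLAG_A);
 *     assert(!flagsIsSet(flags, FLAG_A) && flagsIsSet(flags, FLAG_B));
 */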
#endif /* FLAGS_H_ */
dqlite-1.16.7/src/raft/heap.c 0000664 0000000 0000000 00000004321 14652527134 0015705 0 ustar 00root root 0000000 0000000 #include "heap.h"
#include <stdlib.h>
#include "../raft.h"
static void *defaultMalloc(void *data, size_t size)
{
(void)data;
return malloc(size);
}
static void defaultFree(void *data, void *ptr)
{
(void)data;
free(ptr);
}
static void *defaultCalloc(void *data, size_t nmemb, size_t size)
{
(void)data;
return calloc(nmemb, size);
}
static void *defaultRealloc(void *data, void *ptr, size_t size)
{
(void)data;
return realloc(ptr, size);
}
static void *defaultAlignedAlloc(void *data, size_t alignment, size_t size)
{
(void)data;
return aligned_alloc(alignment, size);
}
static void defaultAlignedFree(void *data, size_t alignment, void *ptr)
{
(void)alignment;
defaultFree(data, ptr);
}
static struct raft_heap defaultHeap = {
NULL, /* data */
defaultMalloc, /* malloc */
defaultFree, /* free */
defaultCalloc, /* calloc */
defaultRealloc, /* realloc */
defaultAlignedAlloc, /* aligned_alloc */
defaultAlignedFree /* aligned_free */
};
static struct raft_heap *currentHeap = &defaultHeap;
void *RaftHeapMalloc(size_t size)
{
return currentHeap->malloc(currentHeap->data, size);
}
void RaftHeapFree(void *ptr)
{
if (ptr == NULL) {
return;
}
currentHeap->free(currentHeap->data, ptr);
}
void *RaftHeapCalloc(size_t nmemb, size_t size)
{
return currentHeap->calloc(currentHeap->data, nmemb, size);
}
void *RaftHeapRealloc(void *ptr, size_t size)
{
return currentHeap->realloc(currentHeap->data, ptr, size);
}
void *raft_malloc(size_t size)
{
return RaftHeapMalloc(size);
}
void raft_free(void *ptr)
{
RaftHeapFree(ptr);
}
void *raft_calloc(size_t nmemb, size_t size)
{
return RaftHeapCalloc(nmemb, size);
}
void *raft_realloc(void *ptr, size_t size)
{
return RaftHeapRealloc(ptr, size);
}
void *raft_aligned_alloc(size_t alignment, size_t size)
{
return currentHeap->aligned_alloc(currentHeap->data, alignment, size);
}
void raft_aligned_free(size_t alignment, void *ptr)
{
currentHeap->aligned_free(currentHeap->data, alignment, ptr);
}
void raft_heap_set(struct raft_heap *heap)
{
currentHeap = heap;
}
void raft_heap_set_default(void)
{
currentHeap = &defaultHeap;
}
const struct raft_heap *raft_heap_get(void)
{
return currentHeap;
}
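/* Illustrative sketch (not part of the build): installing a custom heap, for
 * instance to inject allocation failures in tests. All function pointers in
 * struct raft_heap must be set; failingMalloc, shouldFail and the remaining
 * test* wrappers are hypothetical helpers:
 *
 *     static void *failingMalloc(void *data, size_t size)
 *     {
 *             (void)data;
 *             return shouldFail() ? NULL : malloc(size);
 *     }
 *
 *     struct raft_heap testHeap = {
 *             NULL,            // data
 *             failingMalloc,   // malloc
 *             testFree, testCalloc, testRealloc,
 *             testAlignedAlloc, testAlignedFree,
 *     };
 *     raft_heap_set(&testHeap);
 *     // ... run the code under test ...
 *     raft_heap_set_default();
 */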
dqlite-1.16.7/src/raft/heap.h 0000664 0000000 0000000 00000000413 14652527134 0015710 0 ustar 00root root 0000000 0000000 /* Internal heap APIs. */
#ifndef HEAP_H_
#define HEAP_H_
#include <stddef.h>
void *RaftHeapMalloc(size_t size);
void *RaftHeapCalloc(size_t nmemb, size_t size);
void *RaftHeapRealloc(void *ptr, size_t size);
void RaftHeapFree(void *ptr);
#endif /* HEAP_H_ */
dqlite-1.16.7/src/raft/lifecycle.c 0000664 0000000 0000000 00000001370 14652527134 0016730 0 ustar 00root root 0000000 0000000 #include "lifecycle.h"
#include "../tracing.h"
#include "../lib/queue.h"
#include <inttypes.h>
#include <stdint.h>
#include <string.h>
static bool reqIdIsSet(const struct request *req)
{
return req->req_id[15] == (uint8_t)-1;
}
static uint64_t extractReqId(const struct request *req)
{
uint64_t id;
memcpy(&id, &req->req_id, sizeof(id));
return id;
}
void lifecycleRequestStart(struct raft *r, struct request *req)
{
if (reqIdIsSet(req)) {
tracef("request start id:%" PRIu64, extractReqId(req));
}
queue_insert_tail(&r->leader_state.requests, &req->queue);
}
void lifecycleRequestEnd(struct raft *r, struct request *req)
{
(void)r;
if (reqIdIsSet(req)) {
tracef("request end id:%" PRIu64, extractReqId(req));
}
queue_remove(&req->queue);
}
dqlite-1.16.7/src/raft/lifecycle.h 0000664 0000000 0000000 00000000336 14652527134 0016736 0 ustar 00root root 0000000 0000000 #ifndef LIFECYCLE_H_
#define LIFECYCLE_H_
#include "../raft.h"
#include "request.h"
void lifecycleRequestStart(struct raft *r, struct request *req);
void lifecycleRequestEnd(struct raft *r, struct request *req);
#endif
dqlite-1.16.7/src/raft/log.c 0000664 0000000 0000000 00000055534 14652527134 0015565 0 ustar 00root root 0000000 0000000 #include "log.h"
#include <string.h>
#include "../raft.h"
#include "assert.h"
#include "configuration.h"
/* Calculate the reference count hash table key for the given log entry index
 * in a hash table of the given size.
 *
 * The hash is simply the log entry index minus one, modulo the size. This
 * minimizes conflicts in the most frequent case, where a new log entry is
 * simply appended to the log and can use the hash table bucket next to the
 * bucket for the entry with the previous index (possibly resizing the table
 * if its capacity is reached). */
static size_t refsKey(const raft_index index, const size_t size)
{
assert(index > 0);
assert(size > 0);
return (size_t)((index - 1) % size);
}
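/* For example, with size == 16 indexes 1 through 16 map to buckets 0 through
 * 15, and index 17 wraps around to bucket 0 again, so consecutive appends
 * normally land in consecutive buckets. */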
/* Try to insert a new reference count item for the given log entry index into
* the given reference count hash table.
*
* A collision happens when the bucket associated with the hash key of the given
* log entry index is already used to refcount log entries with a different
* index. In that case the collision output parameter will be set to true and no
* new reference count item is inserted into the hash table.
*
* If two log entries have the same index but different terms, the associated
* bucket will be grown accordingly. */
static int refsTryInsert(struct raft_entry_ref *table,
const size_t size,
const raft_term term,
const raft_index index,
const unsigned short count,
struct raft_buffer buf,
void *batch,
bool *collision)
{
struct raft_entry_ref *bucket; /* Bucket associated with this index. */
struct raft_entry_ref *next_slot; /* For traversing the bucket slots. */
struct raft_entry_ref
*last_slot; /* To track the last traversed slot. */
struct raft_entry_ref *slot; /* Actual slot to use for this entry. */
size_t key;
assert(table != NULL);
assert(size > 0);
assert(term > 0);
assert(index > 0);
assert(count > 0);
assert(collision != NULL);
/* Calculate the hash table key for the given index. */
key = refsKey(index, size);
bucket = &table[key];
/* If a bucket is empty, then there's no collision and we can fill its
* first slot. */
if (bucket->count == 0) {
assert(bucket->next == NULL);
slot = bucket;
goto fill;
}
/* If the bucket is already used to refcount entries with a different
* index, then we have a collision and we must abort here. */
if (bucket->index != index) {
*collision = true;
return 0;
}
/* If we get here it means that the bucket is in use to refcount one or
* more entries with the same index as the given one, but different
* terms.
*
* We must append a newly allocated slot to refcount the entry with this
* term.
*
* So first let's find the last slot in the bucket. */
for (next_slot = bucket; next_slot != NULL;
next_slot = next_slot->next) {
/* All entries in a bucket must have the same index. */
assert(next_slot->index == index);
/* It should never happen that two entries with the same index
* and term get appended. So no existing slot in this bucket
* must track an entry with the same term as the given one. */
assert(next_slot->term != term);
last_slot = next_slot;
}
/* The last slot must have no next slot. */
assert(last_slot->next == NULL);
slot = raft_malloc(sizeof *slot);
if (slot == NULL) {
return RAFT_NOMEM;
}
last_slot->next = slot;
fill:
slot->term = term;
slot->index = index;
slot->count = count;
slot->buf = buf;
slot->batch = batch;
slot->next = NULL;
*collision = false;
return 0;
}
/* Move the slots of the given bucket into the given reference count hash
* table. The key of the bucket to use in the given table will be re-calculated
* according to the given size. */
static int refsMove(struct raft_entry_ref *bucket,
struct raft_entry_ref *table,
const size_t size)
{
struct raft_entry_ref *slot;
struct raft_entry_ref *next_slot;
assert(bucket != NULL);
assert(table != NULL);
assert(size > 0);
/* Only non-empty buckets should be moved. */
assert(bucket->count > 0);
/* For each slot in the bucket, insert the relevant entry in the given
* table, then free it. */
next_slot = bucket;
while (next_slot != NULL) {
bool collision;
int rv;
slot = next_slot;
/* Insert the reference count for this entry into the new table.
*/
rv = refsTryInsert(table, size, slot->term, slot->index,
slot->count, slot->buf, slot->batch,
&collision);
next_slot = slot->next;
/* Unless this is the very first slot in the bucket, we need to
* free the slot. */
if (slot != bucket) {
raft_free(slot);
}
if (rv != 0) {
return rv;
}
/* The given hash table is assumed to be large enough to hold
* all ref counts without any conflict. */
assert(!collision);
};
return 0;
}
/* Grow the size of the reference count hash table. */
static int refsGrow(struct raft_log *l)
{
struct raft_entry_ref *table; /* New hash table. */
size_t size; /* Size of the new hash table. */
size_t i;
assert(l != NULL);
assert(l->refs_size > 0);
size = l->refs_size * 2; /* Double the table size */
table = raft_calloc(size, sizeof *table);
if (table == NULL) {
return RAFT_NOMEM;
}
/* Populate the new hash table, inserting all entries existing in the
* current hash table. Each bucket will have a different key in the new
* hash table, since the size has changed. */
for (i = 0; i < l->refs_size; i++) {
struct raft_entry_ref *bucket = &l->refs[i];
if (bucket->count > 0) {
int rv = refsMove(bucket, table, size);
if (rv != 0) {
return rv;
}
} else {
/* If the count is zero, we expect that the bucket is
* unused. */
assert(bucket->next == NULL);
}
}
raft_free(l->refs);
l->refs = table;
l->refs_size = size;
return 0;
}
/* Initialize the reference count of the entry with the given index, setting it
* to 1. */
static int refsInit(struct raft_log *l,
const raft_term term,
const raft_index index,
struct raft_buffer buf,
void *batch)
{
int i;
assert(l != NULL);
assert(term > 0);
assert(index > 0);
/* Initialize the hash map with a reasonable size */
if (l->refs == NULL) {
l->refs_size = LOG__REFS_INITIAL_SIZE;
l->refs = raft_calloc(l->refs_size, sizeof *l->refs);
if (l->refs == NULL) {
return RAFT_NOMEM;
}
}
/* Check if the bucket associated with the given index is available
* (i.e. there are no collisions), or grow the table and re-key it
* otherwise.
*
* We limit the number of times we try to grow the table to 10, to avoid
* eating up too much memory. In practice, there should never be a case
* where this is not enough. */
for (i = 0; i < 10; i++) {
bool collision;
int rc;
rc = refsTryInsert(l->refs, l->refs_size, term, index, 1, buf,
batch, &collision);
if (rc != 0) {
return RAFT_NOMEM;
}
if (!collision) {
return 0;
}
rc = refsGrow(l);
if (rc != 0) {
return rc;
}
};
return RAFT_NOMEM;
}
/* Increment the refcount of the entry with the given term and index. */
static void refsIncr(struct raft_log *l,
const raft_term term,
const raft_index index)
{
size_t key; /* Hash table key for the given index. */
struct raft_entry_ref *slot; /* Slot for the given term/index */
assert(l != NULL);
assert(term > 0);
assert(index > 0);
key = refsKey(index, l->refs_size);
/* Lookup the slot associated with the given term/index, which must have
* been previously inserted. */
slot = &l->refs[key];
while (1) {
assert(slot != NULL);
assert(slot->index == index);
if (slot->term == term) {
break;
}
slot = slot->next;
}
assert(slot != NULL);
slot->count++;
}
/* Decrement the refcount of the entry with the given index. Return a boolean
* indicating whether the entry has now zero references. */
static bool refsDecr(struct raft_log *l,
const raft_term term,
const raft_index index)
{
size_t key; /* Hash table key for the given index. */
struct raft_entry_ref *slot; /* Slot for the given term/index */
struct raft_entry_ref
*prev_slot; /* Slot preceding the one to decrement */
assert(l != NULL);
assert(term > 0);
assert(index > 0);
key = refsKey(index, l->refs_size);
prev_slot = NULL;
/* Lookup the slot associated with the given term/index, keeping track
* of its previous slot in the bucket list. */
slot = &l->refs[key];
while (1) {
assert(slot != NULL);
assert(slot->index == index);
if (slot->term == term) {
break;
}
prev_slot = slot;
slot = slot->next;
}
slot->count--;
if (slot->count > 0) {
/* The entry is still referenced. */
return false;
}
/* If the refcount has dropped to zero, delete the slot. */
if (prev_slot != NULL) {
/* This isn't the very first slot, simply unlink it from the
* slot list. */
prev_slot->next = slot->next;
raft_free(slot);
} else if (slot->next != NULL) {
/* This is the very first slot, and slot list is not empty. Copy
* the second slot into the first one, then delete it. */
struct raft_entry_ref *second_slot = slot->next;
*slot = *second_slot;
raft_free(second_slot);
}
return true;
}
struct raft_log *logInit(void)
{
struct raft_log *log;
log = raft_malloc(sizeof(*log));
if (log == NULL) {
return NULL;
}
log->entries = NULL;
log->size = 0;
log->front = log->back = 0;
log->offset = 0;
log->refs = NULL;
log->refs_size = 0;
log->snapshot.last_index = 0;
log->snapshot.last_term = 0;
return log;
}
/* Return the index of the i'th entry in the log. */
static raft_index indexAt(struct raft_log *l, size_t i)
{
return l->offset + i + 1;
}
/* Return the circular buffer position of the i'th entry in the log. */
static size_t positionAt(struct raft_log *l, size_t i)
{
return (l->front + i) % l->size;
}
/* Return the i'th entry in the log. */
static struct raft_entry *entryAt(struct raft_log *l, size_t i)
{
return &l->entries[positionAt(l, i)];
}
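/* For example, with size == 8, front == 6, back == 2 and offset == 10, the
 * log holds 4 entries stored at positions 6, 7, 0 and 1 of the circular
 * buffer, covering log indexes 11 through 14. */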
void logClose(struct raft_log *l)
{
void *batch = NULL; /* Last batch that has been freed */
assert(l != NULL);
if (l->entries != NULL) {
size_t i;
size_t n = logNumEntries(l);
for (i = 0; i < n; i++) {
struct raft_entry *entry = entryAt(l, i);
raft_index index = indexAt(l, i);
size_t key = refsKey(index, l->refs_size);
struct raft_entry_ref *slot = &l->refs[key];
/* We require that there are no outstanding references
* to active entries. */
assert(slot->count == 1);
/* TODO: we should support the case where the bucket has
* more than one slot. */
assert(slot->next == NULL);
/* Release the memory used by the entry data (either
* directly or via a batch). */
if (entry->batch == NULL) {
if (entry->buf.base != NULL) {
raft_free(entry->buf.base);
}
} else {
if (entry->batch != batch) {
/* This batch was not released yet, so
* let's do it now. */
batch = entry->batch;
raft_free(entry->batch);
}
}
}
raft_free(l->entries);
}
if (l->refs != NULL) {
raft_free(l->refs);
}
raft_free(l);
}
void logStart(struct raft_log *l,
raft_index snapshot_index,
raft_term snapshot_term,
raft_index start_index)
{
assert(logNumEntries(l) == 0);
assert(start_index > 0);
assert(start_index <= snapshot_index + 1);
assert(snapshot_index == 0 || snapshot_term != 0);
l->snapshot.last_index = snapshot_index;
l->snapshot.last_term = snapshot_term;
l->offset = start_index - 1;
}
/* Ensure that the entries array has enough free slots for adding a new entry.
*/
static int ensureCapacity(struct raft_log *l)
{
struct raft_entry *entries; /* New entries array */
size_t n; /* Current number of entries */
size_t size; /* Size of the new array */
size_t i;
n = logNumEntries(l);
if (n + 1 < l->size) {
return 0;
}
/* Make the new size twice the current size plus one (for the new
* entry). Over-allocating now avoids smaller allocations later. */
size = (l->size + 1) * 2;
entries = raft_calloc(size, sizeof *entries);
if (entries == NULL) {
return RAFT_NOMEM;
}
/* Copy all active old entries to the beginning of the newly allocated
* array. */
for (i = 0; i < n; i++) {
memcpy(&entries[i], entryAt(l, i), sizeof *entries);
}
/* Release the old entries array. */
if (l->entries != NULL) {
raft_free(l->entries);
}
l->entries = entries;
l->size = size;
l->front = 0;
l->back = n;
return 0;
}
int logReinstate(struct raft_log *l,
raft_term term,
unsigned short type,
bool *reinstated)
{
raft_index index;
size_t key;
struct raft_entry_ref *bucket;
struct raft_entry_ref *slot;
struct raft_entry *entry;
int rv;
*reinstated = false;
if (l->refs_size == 0) {
return 0;
}
index = logLastIndex(l) + 1;
key = refsKey(index, l->refs_size);
bucket = &l->refs[key];
if (bucket->count == 0 || bucket->index != index) {
return 0;
}
for (slot = bucket; slot != NULL; slot = slot->next) {
if (slot->term == term) {
rv = ensureCapacity(l);
if (rv != 0) {
return rv;
}
slot->count++;
l->back++;
l->back %= l->size;
entry = &l->entries[l->back];
entry->term = term;
entry->type = type;
entry->buf = slot->buf;
entry->batch = slot->batch;
*reinstated = true;
break;
}
}
return 0;
}
int logAppend(struct raft_log *l,
const raft_term term,
const unsigned short type,
struct raft_buffer buf,
struct raft_entry_local_data local_data,
bool is_local,
void *batch)
{
int rv;
struct raft_entry *entry;
raft_index index;
assert(l != NULL);
assert(term > 0);
assert(type == RAFT_CHANGE || type == RAFT_BARRIER ||
type == RAFT_COMMAND);
rv = ensureCapacity(l);
if (rv != 0) {
return rv;
}
index = logLastIndex(l) + 1;
rv = refsInit(l, term, index, buf, batch);
if (rv != 0) {
return rv;
}
entry = &l->entries[l->back];
entry->term = term;
entry->type = type;
entry->buf = buf;
entry->batch = batch;
entry->local_data = local_data;
entry->is_local = is_local;
l->back += 1;
l->back = l->back % l->size;
return 0;
}
int logAppendCommands(struct raft_log *l,
const raft_term term,
const struct raft_buffer bufs[],
const struct raft_entry_local_data local_data[],
const unsigned n)
{
unsigned i;
int rv;
assert(l != NULL);
assert(term > 0);
assert(bufs != NULL);
assert(n > 0);
for (i = 0; i < n; i++) {
struct raft_entry_local_data loc = (local_data != NULL) ? local_data[i] : (struct raft_entry_local_data){};
rv = logAppend(l, term, RAFT_COMMAND, bufs[i], loc, true, NULL);
if (rv != 0) {
return rv;
}
}
return 0;
}
int logAppendConfiguration(struct raft_log *l,
const raft_term term,
const struct raft_configuration *configuration)
{
struct raft_buffer buf;
int rv;
assert(l != NULL);
assert(term > 0);
assert(configuration != NULL);
/* Encode the configuration into a buffer. */
rv = configurationEncode(configuration, &buf);
if (rv != 0) {
goto err;
}
/* Append the new entry to the log. */
rv = logAppend(l, term, RAFT_CHANGE, buf, (struct raft_entry_local_data){}, true, NULL);
if (rv != 0) {
goto err_after_encode;
}
return 0;
err_after_encode:
raft_free(buf.base);
err:
assert(rv != 0);
return rv;
}
size_t logNumEntries(struct raft_log *l)
{
assert(l != NULL);
/* The circular buffer is not wrapped. */
if (l->front <= l->back) {
return l->back - l->front;
}
/* The circular buffer is wrapped. */
return l->size - l->front + l->back;
}
raft_index logLastIndex(struct raft_log *l)
{
/* If there are no entries in the log but there is a snapshot available,
 * check that its last index is consistent with the offset. */
if (logNumEntries(l) == 0 && l->snapshot.last_index != 0) {
assert(l->offset <= l->snapshot.last_index);
}
return l->offset + logNumEntries(l);
}
/* Return the position of the entry with the given index in the entries array.
*
* If no entry with the given index is in the log return the size of the entries
* array. */
static size_t locateEntry(struct raft_log *l, const raft_index index)
{
size_t n = logNumEntries(l);
if (n == 0 || index < indexAt(l, 0) || index > indexAt(l, n - 1)) {
return l->size;
}
/* Get the circular buffer position of the desired entry. Log indexes
* start at 1, so we subtract one to get array indexes. We also need to
* subtract any index offset this log might start at. */
return positionAt(l, (size_t)((index - 1) - l->offset));
}
raft_term logTermOf(struct raft_log *l, const raft_index index)
{
size_t i;
assert(index > 0);
assert(l->offset <= l->snapshot.last_index);
if ((index < l->offset + 1 && index != l->snapshot.last_index) ||
index > logLastIndex(l)) {
return 0;
}
if (index == l->snapshot.last_index) {
assert(l->snapshot.last_term != 0);
/* Coherence check that if we still have the entry at
* last_index, its term matches the one in the snapshot. */
i = locateEntry(l, index);
if (i != l->size) {
assert(l->entries[i].term == l->snapshot.last_term);
}
return l->snapshot.last_term;
}
i = locateEntry(l, index);
assert(i < l->size);
return l->entries[i].term;
}
raft_index logSnapshotIndex(struct raft_log *l)
{
return l->snapshot.last_index;
}
raft_term logLastTerm(struct raft_log *l)
{
raft_index last_index;
last_index = logLastIndex(l);
return last_index > 0 ? logTermOf(l, last_index) : 0;
}
const struct raft_entry *logGet(struct raft_log *l, const raft_index index)
{
size_t i;
assert(l != NULL);
/* Get the array index of the desired entry. */
i = locateEntry(l, index);
if (i == l->size) {
return NULL;
}
assert(i < l->size);
return &l->entries[i];
}
int logAcquire(struct raft_log *l,
const raft_index index,
struct raft_entry *entries[],
unsigned *n)
{
size_t i;
size_t j;
assert(l != NULL);
assert(index > 0);
assert(entries != NULL);
assert(n != NULL);
/* Get the array index of the first entry to acquire. */
i = locateEntry(l, index);
if (i == l->size) {
*n = 0;
*entries = NULL;
return 0;
}
if (i < l->back) {
/* The last entry does not wrap with respect to i, so the number
* of entries is simply the length of the range [i...l->back).
*/
*n = (unsigned)(l->back - i);
} else {
/* The last entry wraps with respect to i, so the number of
* entries is the sum of the lengths of the ranges [i...l->size)
* and [0...l->back), which is l->size - i + l->back.*/
*n = (unsigned)(l->size - i + l->back);
}
assert(*n > 0);
*entries = raft_calloc(*n, sizeof **entries);
if (*entries == NULL) {
return RAFT_NOMEM;
}
for (j = 0; j < *n; j++) {
size_t k = (i + j) % l->size;
struct raft_entry *entry = &(*entries)[j];
*entry = l->entries[k];
refsIncr(l, entry->term, index + j);
}
return 0;
}
/* Return true if the given batch is referenced by any entry currently in the
* log. */
static bool isBatchReferenced(struct raft_log *l, const void *batch)
{
size_t i;
/* Iterate through all live entries to see if there's one
* belonging to the same batch. This is slightly inefficient but
* this code path should be taken very rarely in practice. */
for (i = 0; i < logNumEntries(l); i++) {
struct raft_entry *entry = entryAt(l, i);
if (entry->batch == batch) {
return true;
}
}
return false;
}
void logRelease(struct raft_log *l,
const raft_index index,
struct raft_entry entries[],
const unsigned n)
{
size_t i;
void *batch = NULL; /* Last batch whose memory was freed */
assert(l != NULL);
assert((entries == NULL && n == 0) || (entries != NULL && n > 0));
for (i = 0; i < n; i++) {
struct raft_entry *entry = &entries[i];
bool unref;
unref = refsDecr(l, entry->term, index + i);
/* If there are no outstanding references to this entry, free
* its payload if it's not part of a batch, or check if we can
* free the batch itself. */
if (unref) {
if (entries[i].batch == NULL) {
if (entry->buf.base != NULL) {
raft_free(entries[i].buf.base);
}
} else {
if (entry->batch != batch) {
if (!isBatchReferenced(l,
entry->batch)) {
batch = entry->batch;
raft_free(batch);
}
}
}
}
}
if (entries != NULL) {
raft_free(entries);
}
}
/* Clear the log if it became empty. */
static void clearIfEmpty(struct raft_log *l)
{
if (logNumEntries(l) > 0) {
return;
}
raft_free(l->entries);
l->entries = NULL;
l->size = 0;
l->front = 0;
l->back = 0;
}
/* Destroy an entry, possibly releasing the memory of its buffer. */
static void destroyEntry(struct raft_log *l, struct raft_entry *entry)
{
if (entry->batch == NULL) {
if (entry->buf.base != NULL) {
raft_free(entry->buf.base);
}
} else {
if (!isBatchReferenced(l, entry->batch)) {
raft_free(entry->batch);
}
}
}
/* Core logic of @logTruncate and @logDiscard, removing all log entries from
* @index onward. If @destroy is true, also destroy the removed entries. */
static void removeSuffix(struct raft_log *l,
const raft_index index,
bool destroy)
{
size_t i;
size_t n;
raft_index start = index;
assert(l != NULL);
assert(index > l->offset);
assert(index <= logLastIndex(l));
/* Number of entries to delete */
n = (size_t)(logLastIndex(l) - start) + 1;
for (i = 0; i < n; i++) {
struct raft_entry *entry;
bool unref;
if (l->back == 0) {
l->back = l->size - 1;
} else {
l->back--;
}
entry = &l->entries[l->back];
unref = refsDecr(l, entry->term, start + n - i - 1);
if (unref && destroy) {
destroyEntry(l, entry);
}
}
clearIfEmpty(l);
}
void logTruncate(struct raft_log *l, const raft_index index)
{
if (logNumEntries(l) == 0) {
return;
}
removeSuffix(l, index, true);
}
void logDiscard(struct raft_log *l, const raft_index index)
{
removeSuffix(l, index, false);
}
/* Delete all entries up to the given index (included). */
static void removePrefix(struct raft_log *l, const raft_index index)
{
size_t i;
size_t n;
assert(l != NULL);
assert(index > 0);
assert(index <= logLastIndex(l));
/* Number of entries to delete */
n = (size_t)(index - indexAt(l, 0)) + 1;
for (i = 0; i < n; i++) {
struct raft_entry *entry;
bool unref;
entry = &l->entries[l->front];
if (l->front == l->size - 1) {
l->front = 0;
} else {
l->front++;
}
l->offset++;
unref = refsDecr(l, entry->term, l->offset);
if (unref) {
destroyEntry(l, entry);
}
}
clearIfEmpty(l);
}
void logSnapshot(struct raft_log *l, raft_index last_index, unsigned trailing)
{
raft_term last_term = logTermOf(l, last_index);
/* We must have an entry at this index */
assert(last_term != 0);
l->snapshot.last_index = last_index;
l->snapshot.last_term = last_term;
	/* If we don't have at least `trailing` entries preceding the given
	 * last index, then there's nothing to remove and we're done. */
if (last_index <= trailing ||
locateEntry(l, last_index - trailing) == l->size) {
return;
}
removePrefix(l, last_index - trailing);
}
void logRestore(struct raft_log *l, raft_index last_index, raft_term last_term)
{
size_t n = logNumEntries(l);
assert(last_index > 0);
assert(last_term > 0);
if (n > 0) {
logTruncate(l, logLastIndex(l) - n + 1);
}
l->snapshot.last_index = last_index;
l->snapshot.last_term = last_term;
l->offset = last_index;
}
dqlite-1.16.7/src/raft/log.h 0000664 0000000 0000000 00000016065 14652527134 0015566 0 ustar 00root root 0000000 0000000 /* In-memory cache of the persistent raft log stored on disk. */
#ifndef RAFT_LOG_H_
#define RAFT_LOG_H_
#include "../raft.h"
/* Initial size of the entry reference count hash table. */
#define LOG__REFS_INITIAL_SIZE 256
/**
* Counter for outstanding references to a log entry.
*
* When an entry is first appended to the log, its refcount is set to one (the
* log itself is the only one referencing the entry). Whenever an entry is
* included in an I/O request (to write it to disk or to send it to other
* servers) its refcount is increased by one. Whenever an entry gets deleted
* from the log its refcount is decreased by one. Likewise, whenever an I/O
* request is completed the refcount of the relevant entries is decreased by
* one. When the refcount drops to zero the memory that its @buf attribute
* points to gets released, or, if the @batch attribute is non-NULL, a check is
* made to see if all other entries of the same batch also have a zero refcount,
* and the memory that @batch points to gets released if that's the case.
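 *
 * For example (illustrative), an entry that has just been appended (refcount
 * 1) and is then included both in a disk write and in a send to one follower
 * has refcount 3; its memory is released only once the log deletes it and
 * both I/O requests complete.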
*/
struct raft_entry_ref
{
raft_term term; /* Term of the entry being ref-counted. */
raft_index index; /* Index of the entry being ref-counted. */
unsigned short count; /* Number of references. */
/* The next two fields are copied from the corresponding fields of the
* raft_entry pointed to by this reference. We store them here as well,
* so that logReinstate can retrieve them when it finds a raft_entry_ref
* with the same index and term as it was passed, and create a full
* raft_entry using them. */
struct raft_buffer buf;
void *batch;
struct raft_entry_ref
*next; /* Next item in the bucket (for collisions). */
};
/**
* In-memory cache of the persistent raft log stored on disk.
*
* The raft log cache is implemented as a circular buffer of log entries, which
* makes some frequent operations very efficient (e.g. deleting the first N
* entries when snapshotting).
*/
struct raft_log
{
struct raft_entry *entries; /* Circular buffer of log entries. */
size_t size; /* Number of available slots in the buffer. */
size_t front, back; /* Indexes of used slots [front, back). */
raft_index offset; /* Index of first entry is offset+1. */
struct raft_entry_ref
*refs; /* Log entries reference counts hash table. */
size_t refs_size; /* Size of the reference counts hash table. */
struct /* Information about last snapshot, or zero. */
{
raft_index
last_index; /* Snapshot replaces all entries up to here. */
raft_term last_term; /* Term of last index. */
} snapshot;
};
/* Initialize an empty in-memory log of raft entries. */
struct raft_log *logInit(void);
/* Release all memory used by the given log object. */
void logClose(struct raft_log *l);
/* Called at startup when populating the log with entries loaded from disk. It
 * sets the starting state of the log. The start index must be lower than or
 * equal to snapshot_index + 1. */
void logStart(struct raft_log *l,
raft_index snapshot_index,
raft_term snapshot_term,
raft_index start_index);
/* Get the number of entries the log currently contains. */
size_t logNumEntries(struct raft_log *l);
/* Get the index of the last entry in the log. Return #0 if the log is empty. */
raft_index logLastIndex(struct raft_log *l);
/* Get the term of the last entry in the log. Return #0 if the log is empty. */
raft_term logLastTerm(struct raft_log *l);
/* Get the term of the entry with the given index. Return #0 if @index is
 * greater than the last index of the log, or if it's lower than the oldest
 * index we know the term of (either because it's outstanding or because it's
 * the last entry in the most recent snapshot). */
raft_term logTermOf(struct raft_log *l, raft_index index);
/* Get the index of the last entry in the most recent snapshot. Return #0 if
* there are no snapshots. */
raft_index logSnapshotIndex(struct raft_log *l);
/* Get the entry with the given index. The returned pointer remains valid only
* as long as no API that might delete the entry with the given index is
* invoked. Return #NULL if there is no such entry. */
const struct raft_entry *logGet(struct raft_log *l, const raft_index index);
/* Check whether the hash map is already tracking an entry with the given
* @term and @index (that is not part of the "logical" log). If so, increment
* the refcount of that entry and set @reinstated to true; otherwise, set
* @reinstated to false. */
int logReinstate(struct raft_log *l,
raft_term term,
unsigned short type,
bool *reinstated);
/* Append a new entry to the log. */
int logAppend(struct raft_log *l,
raft_term term,
unsigned short type,
struct raft_buffer buf,
struct raft_entry_local_data local_data,
bool is_local,
void *batch);
/* Convenience to append a series of #RAFT_COMMAND entries. */
int logAppendCommands(struct raft_log *l,
const raft_term term,
const struct raft_buffer bufs[],
const struct raft_entry_local_data local_data[],
const unsigned n);
/* Convenience to encode and append a single #RAFT_CHANGE entry. */
int logAppendConfiguration(struct raft_log *l,
const raft_term term,
const struct raft_configuration *configuration);
/* Acquire an array of entries from the given index onwards. The payload
* memory referenced by the @buf attribute of the returned entries is guaranteed
* to be valid until logRelease() is called. */
int logAcquire(struct raft_log *l,
raft_index index,
struct raft_entry *entries[],
unsigned *n);
/* Release a previously acquired array of entries. */
void logRelease(struct raft_log *l,
raft_index index,
struct raft_entry entries[],
unsigned n);
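/* Illustrative usage sketch (not part of this header): acquire the entries
 * starting at some index, hand them to an I/O request, and release them once
 * the request completes:
 *
 *     struct raft_entry *entries;
 *     unsigned n;
 *     if (logAcquire(l, index, &entries, &n) == 0) {
 *             // submit the I/O using entries[0..n), then on completion:
 *             logRelease(l, index, entries, n);
 *     }
 */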
/* Delete all entries from the given index (included) onwards. If the log is
* empty this is a no-op. If @index is lower than or equal to the index of the
* first entry in the log, then the log will become empty. */
void logTruncate(struct raft_log *l, const raft_index index);
/* Discard all entries from the given index (included) onwards. This is exactly
 * the same as truncate, but the memory of the entries does not get
* released. This is called as part of error handling, when reverting the effect
* of previous logAppend calls. */
void logDiscard(struct raft_log *l, const raft_index index);
/* To be called when taking a new snapshot. The log must contain an entry at
* last_index, which is the index of the last entry included in the
* snapshot. The function will update the last snapshot information and delete
* all entries up to last_index - trailing (included). If the log contains no
* entry at last_index - trailing, then no entry will be deleted. */
void logSnapshot(struct raft_log *l, raft_index last_index, unsigned trailing);
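/* Example (illustrative): with last_index=1000 and trailing=100, all entries
 * up to index 900 (included) are deleted, keeping the last 100 entries so
 * that slightly lagging followers can still be served from the log. */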
/* To be called when installing a snapshot.
*
* The log can be in any state. All outstanding entries will be discarded, the
* last index and last term of the most recent snapshot will be set to the given
* values, and the offset adjusted accordingly. */
void logRestore(struct raft_log *l, raft_index last_index, raft_term last_term);
#endif /* RAFT_LOG_H_ */
dqlite-1.16.7/src/raft/membership.c 0000664 0000000 0000000 00000016644 14652527134 0017136 0 ustar 00root root 0000000 0000000 #include "membership.h"
#include "../raft.h"
#include "../tracing.h"
#include "assert.h"
#include "configuration.h"
#include "err.h"
#include "heap.h"
#include "log.h"
#include "progress.h"
int membershipCanChangeConfiguration(struct raft *r)
{
int rv;
if (r->state != RAFT_LEADER || r->transfer != NULL) {
tracef("NOT LEADER");
rv = RAFT_NOTLEADER;
goto err;
}
if (r->configuration_uncommitted_index != 0) {
tracef("r->configuration_uncommitted_index %llu",
r->configuration_uncommitted_index);
rv = RAFT_CANTCHANGE;
goto err;
}
if (r->leader_state.promotee_id != 0) {
tracef("r->leader_state.promotee_id %llu",
r->leader_state.promotee_id);
rv = RAFT_CANTCHANGE;
goto err;
}
/* In order to become leader at all we are supposed to have committed at
* least the initial configuration at index 1. */
assert(r->configuration_committed_index > 0);
/* The index of the last committed configuration can't be greater than
* the last log index. */
assert(logLastIndex(r->log) >= r->configuration_committed_index);
/* No catch-up round should be in progress. */
assert(r->leader_state.round_number == 0);
assert(r->leader_state.round_index == 0);
assert(r->leader_state.round_start == 0);
return 0;
err:
assert(rv != 0);
ErrMsgFromCode(r->errmsg, rv);
return rv;
}
int membershipFetchLastCommittedConfiguration(struct raft *r,
struct raft_configuration *conf)
{
const struct raft_entry *entry;
int rv;
/* Try to get the entry at r->configuration_committed_index from the
* log. If the entry is not present in the log anymore because the log
* was truncated after a snapshot, we can just use
* configuration_last_snapshot, which we cached when we took or restored
* the snapshot and is guaranteed to match the content that the entry at
* r->configuration_committed_index had. */
entry = logGet(r->log, r->configuration_committed_index);
if (entry != NULL) {
rv = configurationDecode(&entry->buf, conf);
} else {
assert(r->configuration_last_snapshot.n > 0);
rv = configurationCopy(&r->configuration_last_snapshot, conf);
}
if (rv != 0) {
return rv;
}
return 0;
}
bool membershipUpdateCatchUpRound(struct raft *r)
{
unsigned server_index;
raft_index match_index;
raft_index last_index;
raft_time now = r->io->time(r->io);
raft_time round_duration;
bool is_up_to_date;
bool is_fast_enough;
assert(r->state == RAFT_LEADER);
assert(r->leader_state.promotee_id != 0);
server_index = configurationIndexOf(&r->configuration,
r->leader_state.promotee_id);
assert(server_index < r->configuration.n);
match_index = progressMatchIndex(r, server_index);
/* If the server did not reach the target index for this round, it did
* not catch up. */
if (match_index < r->leader_state.round_index) {
tracef(
"member (index: %u) not yet caught up match_index:%llu "
"round_index:%llu",
server_index, match_index, r->leader_state.round_index);
return false;
}
last_index = logLastIndex(r->log);
round_duration = now - r->leader_state.round_start;
is_up_to_date = match_index == last_index;
is_fast_enough = round_duration < r->election_timeout;
tracef("member is_up_to_date:%d is_fast_enough:%d", is_up_to_date,
is_fast_enough);
/* If the server's log is fully up-to-date or the round that just
	 * terminated was fast enough, then the server has caught up. */
if (is_up_to_date || is_fast_enough) {
r->leader_state.round_number = 0;
r->leader_state.round_index = 0;
r->leader_state.round_start = 0;
return true;
}
/* If we get here it means that this catch-up round is complete, but
* there are more entries to replicate, or it was not fast enough. Let's
* start a new round. */
r->leader_state.round_number++;
r->leader_state.round_index = last_index;
r->leader_state.round_start = now;
return false;
}
int membershipUncommittedChange(struct raft *r,
const raft_index index,
const struct raft_entry *entry)
{
struct raft_configuration configuration;
int rv;
char msg[128];
assert(r != NULL);
assert(r->state == RAFT_FOLLOWER);
assert(entry != NULL);
assert(entry->type == RAFT_CHANGE);
rv = configurationDecode(&entry->buf, &configuration);
if (rv != 0) {
tracef("failed to decode configuration at index:%llu", index);
goto err;
}
/* ignore errors */
snprintf(msg, sizeof(msg), "uncommitted config change at index:%llu",
index);
configurationTrace(r, &configuration, msg);
raft_configuration_close(&r->configuration);
r->configuration = configuration;
r->configuration_uncommitted_index = index;
return 0;
err:
assert(rv != 0);
return rv;
}
int membershipRollback(struct raft *r)
{
int rv;
assert(r != NULL);
assert(r->state == RAFT_FOLLOWER);
assert(r->configuration_uncommitted_index > 0);
tracef("roll back membership");
/* Fetch the last committed configuration entry. */
assert(r->configuration_committed_index != 0);
/* Replace the current configuration with the last committed one. */
configurationClose(&r->configuration);
rv = membershipFetchLastCommittedConfiguration(r, &r->configuration);
if (rv != 0) {
return rv;
}
configurationTrace(r, &r->configuration, "roll back config");
r->configuration_uncommitted_index = 0;
return 0;
}
void membershipLeadershipTransferInit(struct raft *r,
struct raft_transfer *req,
raft_id id,
raft_transfer_cb cb)
{
req->cb = cb;
req->id = id;
req->start = r->io->time(r->io);
req->send.data = NULL;
r->transfer = req;
}
static void membershipLeadershipSendCb(struct raft_io_send *send, int status)
{
(void)status;
RaftHeapFree(send);
}
int membershipLeadershipTransferStart(struct raft *r)
{
const struct raft_server *server;
struct raft_message message;
struct raft_io_send *send;
int rv;
assert(r->transfer->send.data == NULL);
server = configurationGet(&r->configuration, r->transfer->id);
assert(server != NULL);
if (server == NULL) {
tracef("transferee server not found in configuration");
return -1;
}
/* Don't use the raft_io_send object embedded in struct raft_transfer,
* since the two objects must have different lifetimes. For example
* raft_io_send might live longer than raft_transfer, see #396.
*
* Ideally we should remove the embedded struct raft_io_send send field
* from struct raft_transfer, and replace it with a raft_io_send *send
* pointer, that we set to the raft_io_send object allocated in this
* function. This would break ABI compatibility though. */
send = RaftHeapMalloc(sizeof *send);
if (send == NULL) {
return RAFT_NOMEM;
}
message.type = RAFT_IO_TIMEOUT_NOW;
message.server_id = server->id;
message.server_address = server->address;
message.timeout_now.term = r->current_term;
message.timeout_now.last_log_index = logLastIndex(r->log);
message.timeout_now.last_log_term = logLastTerm(r->log);
/* Set the data attribute of the raft_io_send object embedded in
* raft_transfer. This is needed because we historically used it as a
* flag to indicate that a transfer request was sent. See the
* replicationUpdate function. */
r->transfer->send.data = r;
send->data = r;
rv = r->io->send(r->io, send, &message, membershipLeadershipSendCb);
if (rv != 0) {
RaftHeapFree(send);
ErrMsgTransferf(r->io->errmsg, r->errmsg,
"send timeout now to %llu", server->id);
return rv;
}
return 0;
}
void membershipLeadershipTransferClose(struct raft *r)
{
struct raft_transfer *req = r->transfer;
raft_transfer_cb cb = req->cb;
r->transfer = NULL;
if (cb != NULL) {
cb(req);
}
}
dqlite-1.16.7/src/raft/membership.h 0000664 0000000 0000000 00000004350 14652527134 0017132 0 ustar 00root root 0000000 0000000 /* Membership-related APIs. */
#ifndef MEMBERSHIP_H_
#define MEMBERSHIP_H_
#include "../raft.h"
/* Helper returning an error if the configuration can't be changed, either
* because this node is not the leader or because a configuration change is
* already in progress. */
int membershipCanChangeConfiguration(struct raft *r);
/* Populate the given configuration object with the most recent committed
* configuration, the one contained in the entry at
* r->configuration_committed_index. */
int membershipFetchLastCommittedConfiguration(struct raft *r,
struct raft_configuration *conf);
/* Update the information about the progress that the non-voting server
 * currently being promoted is making in catching up with the logs.
 *
 * Return false if the server being promoted has not yet caught up with the
 * logs, and true if it has.
*
* This function must be called only by leaders after a @raft_assign request
* has been submitted. */
bool membershipUpdateCatchUpRound(struct raft *r);
/* Update the local configuration replacing it with the content of the given
* RAFT_CHANGE entry, which has just been received in as part of an
* AppendEntries RPC request. The uncommitted configuration index will be
* updated accordingly.
*
* It must be called only by followers. */
int membershipUncommittedChange(struct raft *r,
const raft_index index,
const struct raft_entry *entry);
/* Rollback any promotion configuration change that was applied locally, but
* failed to be committed. It must be called by followers after they receive an
* AppendEntries RPC request that instructs them to evict the uncommitted entry
* from their log. */
int membershipRollback(struct raft *r);
/* Initialize the state of a leadership transfer request. */
void membershipLeadershipTransferInit(struct raft *r,
struct raft_transfer *req,
raft_id id,
raft_transfer_cb cb);
/* Start the leadership transfer by sending a TimeoutNow message to the target
* server. */
int membershipLeadershipTransferStart(struct raft *r);
/* Finish a leadership transfer (whether successful or not), resetting the
* leadership transfer state and firing the user callback. */
void membershipLeadershipTransferClose(struct raft *r);
#endif /* MEMBERSHIP_H_ */
dqlite-1.16.7/src/raft/progress.c 0000664 0000000 0000000 00000020727 14652527134 0016644 0 ustar 00root root 0000000 0000000 #include "progress.h"
#include "../tracing.h"
#include "assert.h"
#include "configuration.h"
#include "log.h"
#ifndef max
#define max(a, b) ((a) < (b) ? (b) : (a))
#endif
#ifndef min
#define min(a, b) ((a) < (b) ? (a) : (b))
#endif
/* Initialize a single progress object. */
static void initProgress(struct raft_progress *p, raft_index last_index)
{
p->next_index = last_index + 1;
p->match_index = 0;
p->snapshot_index = 0;
p->last_send = 0;
p->snapshot_last_send = 0;
p->recent_recv = false;
p->state = PROGRESS__PROBE;
p->features = 0;
}
int progressBuildArray(struct raft *r)
{
struct raft_progress *progress;
unsigned i;
raft_index last_index = logLastIndex(r->log);
progress = raft_malloc(r->configuration.n * sizeof *progress);
if (progress == NULL) {
return RAFT_NOMEM;
}
for (i = 0; i < r->configuration.n; i++) {
initProgress(&progress[i], last_index);
if (r->configuration.servers[i].id == r->id) {
progress[i].match_index = r->last_stored;
}
}
r->leader_state.progress = progress;
return 0;
}
int progressRebuildArray(struct raft *r,
const struct raft_configuration *configuration)
{
raft_index last_index = logLastIndex(r->log);
struct raft_progress *progress;
unsigned i;
unsigned j;
raft_id id;
progress = raft_malloc(configuration->n * sizeof *progress);
if (progress == NULL) {
return RAFT_NOMEM;
}
	/* First copy the progress information for the servers that exist both
* in the current and in the new configuration. */
for (i = 0; i < r->configuration.n; i++) {
id = r->configuration.servers[i].id;
j = configurationIndexOf(configuration, id);
if (j == configuration->n) {
/* This server is not present in the new configuration,
* so we just skip it. */
continue;
}
progress[j] = r->leader_state.progress[i];
}
/* Then reset the replication state for servers that are present in the
* new configuration, but not in the current one. */
for (i = 0; i < configuration->n; i++) {
id = configuration->servers[i].id;
j = configurationIndexOf(&r->configuration, id);
if (j < r->configuration.n) {
/* This server is present both in the new and in the
* current configuration, so we have already copied its
* next/match index value in the loop above. */
continue;
}
assert(j == r->configuration.n);
initProgress(&progress[i], last_index);
}
raft_free(r->leader_state.progress);
r->leader_state.progress = progress;
return 0;
}
bool progressIsUpToDate(struct raft *r, unsigned i)
{
struct raft_progress *p = &r->leader_state.progress[i];
raft_index last_index = logLastIndex(r->log);
return p->next_index == last_index + 1;
}
bool progressPersistedIsUpToDate(struct raft *r, unsigned i)
{
struct raft_progress *p = &r->leader_state.progress[i];
raft_index last_index = logLastIndex(r->log);
return p->match_index == last_index;
}
bool progressShouldReplicate(struct raft *r, unsigned i)
{
struct raft_progress *p = &r->leader_state.progress[i];
raft_time now = r->io->time(r->io);
bool needs_heartbeat = now - p->last_send >= r->heartbeat_timeout;
raft_index last_index = logLastIndex(r->log);
bool result = false;
/* We must be in a valid state. */
assert(p->state == PROGRESS__PROBE || p->state == PROGRESS__PIPELINE ||
p->state == PROGRESS__SNAPSHOT);
/* The next index to send must be lower than the highest index in our
* log. */
assert(p->next_index <= last_index + 1);
switch (p->state) {
case PROGRESS__SNAPSHOT:
/* Snapshot timed out, move to PROBE */
if (now - p->snapshot_last_send >=
r->install_snapshot_timeout) {
tracef("snapshot timed out for index:%u", i);
result = true;
progressAbortSnapshot(r, i);
} else {
/* Enforce Leadership during follower Snapshot
* installation */
result = needs_heartbeat;
}
break;
case PROGRESS__PROBE:
/* We send at most one message per heartbeat interval.
*/
result = needs_heartbeat;
break;
case PROGRESS__PIPELINE:
/* In replication mode we send empty append entries
* messages only if haven't sent anything in the last
* heartbeat interval. */
result = !progressIsUpToDate(r, i) || needs_heartbeat;
break;
}
return result;
}
raft_index progressNextIndex(struct raft *r, unsigned i)
{
return r->leader_state.progress[i].next_index;
}
raft_index progressMatchIndex(struct raft *r, unsigned i)
{
return r->leader_state.progress[i].match_index;
}
void progressUpdateLastSend(struct raft *r, unsigned i)
{
r->leader_state.progress[i].last_send = r->io->time(r->io);
}
void progressUpdateSnapshotLastSend(struct raft *r, unsigned i)
{
r->leader_state.progress[i].snapshot_last_send = r->io->time(r->io);
}
bool progressResetRecentRecv(struct raft *r, const unsigned i)
{
bool prev = r->leader_state.progress[i].recent_recv;
r->leader_state.progress[i].recent_recv = false;
return prev;
}
void progressMarkRecentRecv(struct raft *r, const unsigned i)
{
r->leader_state.progress[i].recent_recv = true;
}
inline void progressSetFeatures(struct raft *r,
const unsigned i,
raft_flags features)
{
r->leader_state.progress[i].features = features;
}
inline raft_flags progressGetFeatures(struct raft *r, const unsigned i)
{
return r->leader_state.progress[i].features;
}
bool progressGetRecentRecv(const struct raft *r, const unsigned i)
{
return r->leader_state.progress[i].recent_recv;
}
void progressToSnapshot(struct raft *r, unsigned i)
{
struct raft_progress *p = &r->leader_state.progress[i];
p->state = PROGRESS__SNAPSHOT;
p->snapshot_index = logSnapshotIndex(r->log);
}
void progressAbortSnapshot(struct raft *r, const unsigned i)
{
struct raft_progress *p = &r->leader_state.progress[i];
p->snapshot_index = 0;
p->state = PROGRESS__PROBE;
}
int progressState(struct raft *r, const unsigned i)
{
struct raft_progress *p = &r->leader_state.progress[i];
return p->state;
}
bool progressMaybeDecrement(struct raft *r,
const unsigned i,
raft_index rejected,
raft_index last_index)
{
struct raft_progress *p = &r->leader_state.progress[i];
assert(p->state == PROGRESS__PROBE || p->state == PROGRESS__PIPELINE ||
p->state == PROGRESS__SNAPSHOT);
if (p->state == PROGRESS__SNAPSHOT) {
/* The rejection must be stale or spurious if the rejected index
* does not match the last snapshot index. */
if (rejected != p->snapshot_index) {
return false;
}
progressAbortSnapshot(r, i);
return true;
}
if (p->state == PROGRESS__PIPELINE) {
/* The rejection must be stale if the rejected index is smaller
* than the matched one. */
if (rejected <= p->match_index) {
tracef("match index is up to date -> ignore ");
return false;
}
/* Directly decrease next to match + 1 */
p->next_index = min(rejected, p->match_index + 1);
progressToProbe(r, i);
return true;
}
/* The rejection must be stale or spurious if the rejected index does
* not match the next index minus one. */
if (rejected != p->next_index - 1) {
tracef(
"rejected index %llu different from next index %lld -> "
"ignore ",
rejected, p->next_index);
return false;
}
p->next_index = min(rejected, last_index + 1);
p->next_index = max(p->next_index, 1);
return true;
}
void progressOptimisticNextIndex(struct raft *r,
unsigned i,
raft_index next_index)
{
struct raft_progress *p = &r->leader_state.progress[i];
p->next_index = next_index;
}
bool progressMaybeUpdate(struct raft *r, unsigned i, raft_index last_index)
{
struct raft_progress *p = &r->leader_state.progress[i];
bool updated = false;
if (p->match_index < last_index) {
p->match_index = last_index;
updated = true;
}
if (p->next_index < last_index + 1) {
p->next_index = last_index + 1;
}
return updated;
}
void progressToProbe(struct raft *r, const unsigned i)
{
struct raft_progress *p = &r->leader_state.progress[i];
/* If the current state is snapshot, we know that the pending snapshot
* has been sent to this peer successfully, so we probe from
* snapshot_index + 1.*/
if (p->state == PROGRESS__SNAPSHOT) {
assert(p->snapshot_index > 0);
p->next_index = max(p->match_index + 1, p->snapshot_index);
p->snapshot_index = 0;
} else {
p->next_index = p->match_index + 1;
}
p->state = PROGRESS__PROBE;
}
void progressToPipeline(struct raft *r, const unsigned i)
{
struct raft_progress *p = &r->leader_state.progress[i];
p->state = PROGRESS__PIPELINE;
}
bool progressSnapshotDone(struct raft *r, const unsigned i)
{
struct raft_progress *p = &r->leader_state.progress[i];
assert(p->state == PROGRESS__SNAPSHOT);
return p->match_index >= p->snapshot_index;
}
#undef tracef
dqlite-1.16.7/src/raft/progress.h 0000664 0000000 0000000 00000012110 14652527134 0016634 0 ustar 00root root 0000000 0000000 /* Track replication progress on followers. */
#ifndef PROGRESS_H_
#define PROGRESS_H_
#include "../raft.h"
/* Possible values for the state field of struct raft_progress. */
enum {
PROGRESS__PROBE =
0, /* At most one AppendEntries per heartbeat interval */
PROGRESS__PIPELINE, /* Optimistically stream AppendEntries */
PROGRESS__SNAPSHOT /* Sending a snapshot */
};
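/* Typical transitions (illustrative; see progressToProbe(),
 * progressToPipeline() and progressToSnapshot() below): a follower starts in
 * PROBE; after a successful AppendEntries round-trip the leader may switch it
 * to PIPELINE; a follower that lags behind the available log entries is
 * switched to SNAPSHOT, and back to PROBE once the snapshot has been sent or
 * has failed. */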
/**
* Used by leaders to keep track of replication progress for each server.
*/
struct raft_progress
{
unsigned short state; /* Probe, pipeline or snapshot. */
raft_index next_index; /* Next entry to send. */
raft_index match_index; /* Highest index reported as replicated. */
raft_index
snapshot_index; /* Last index of most recent snapshot sent. */
raft_time last_send; /* Timestamp of last AppendEntries RPC. */
raft_time
	    snapshot_last_send; /* Timestamp of last InstallSnapshot RPC. */
bool recent_recv; /* A msg was received within election timeout. */
raft_flags features; /* What the server is capable of. */
};
/* Create and initialize the array of progress objects used by the leader to
* track followers. The match index will be set to zero, and the next index to
* the current last index plus 1. */
int progressBuildArray(struct raft *r);
/* Re-build the progress array against a new configuration.
*
* Progress information for servers existing both in the new and in the current
* configuration will remain unchanged.
*
* Progress information for servers existing only in the new configuration will
* be initialized as in progressBuildArray().*/
int progressRebuildArray(struct raft *r,
const struct raft_configuration *configuration);
/* Whether the i'th server in the configuration has been sent all the log
* entries. */
bool progressIsUpToDate(struct raft *r, unsigned i);
/* Whether the persisted log of the i'th server in the configuration is
 * up-to-date with ours. */
bool progressPersistedIsUpToDate(struct raft *r, unsigned i);
/* Whether a new AppendEntries or InstallSnapshot message should be sent to the
* i'th server at this time.
*
* See the docstring of replicationProgress() for details about how the decision
* is taken. */
bool progressShouldReplicate(struct raft *r, unsigned i);
/* Return the index of the next entry that should be sent to the i'th server. */
raft_index progressNextIndex(struct raft *r, unsigned i);
/* Return the index of the most recent entry that the i'th server has reported
* as replicated. */
raft_index progressMatchIndex(struct raft *r, unsigned i);
/* Update the last_send timestamp after an AppendEntries request has been
* sent. */
void progressUpdateLastSend(struct raft *r, unsigned i);
/* Update the snapshot_last_send timestamp after an InstallSnapshot request has
* been sent. */
void progressUpdateSnapshotLastSend(struct raft *r, unsigned i);
/* Reset to false the recent_recv flag of the server at the given index,
* returning the previous value.
*
* To be called once every election_timeout milliseconds. */
bool progressResetRecentRecv(struct raft *r, unsigned i);
/* Set to true the recent_recv flag of the server at the given index.
*
* To be called whenever we receive an AppendEntries RPC result */
void progressMarkRecentRecv(struct raft *r, unsigned i);
/* Return the value of the recent_recv flag. */
bool progressGetRecentRecv(const struct raft *r, unsigned i);
/* Convert to the i'th server to snapshot mode. */
void progressToSnapshot(struct raft *r, unsigned i);
/* Convert to probe mode. */
void progressToProbe(struct raft *r, unsigned i);
/* Convert to pipeline mode. */
void progressToPipeline(struct raft *r, unsigned i);
/* Abort snapshot mode and switch back to probe.
*
* Called after sending the snapshot has failed or timed out. */
void progressAbortSnapshot(struct raft *r, unsigned i);
/* Return the progress mode code for the i'th server. */
int progressState(struct raft *r, unsigned i);
/* Optimistically update the next index of the given server.
*
* Called in pipeline mode after sending new entries. */
void progressOptimisticNextIndex(struct raft *r,
unsigned i,
raft_index next_index);
/* Return false if the given @index comes from an outdated message. Otherwise
* update the progress and returns true. To be called when receiving a
* successful AppendEntries RPC response. */
bool progressMaybeUpdate(struct raft *r, unsigned i, raft_index last_index);
/* Return false if the given rejected index comes from an out of order
* message. Otherwise decrease the progress next index to min(rejected,
* last_index) and returns true. To be called when receiving an unsuccessful
* AppendEntries RPC response. */
bool progressMaybeDecrement(struct raft *r,
unsigned i,
raft_index rejected,
raft_index last_index);
/* Return true if match_index is equal or higher than the snapshot_index. */
bool progressSnapshotDone(struct raft *r, unsigned i);
/* Sets the feature flags of a node. */
void progressSetFeatures(struct raft *r, const unsigned i, raft_flags features);
/* Gets the feature flags of a node. */
raft_flags progressGetFeatures(struct raft *r, const unsigned i);
#endif /* PROGRESS_H_ */
dqlite-1.16.7/src/raft/raft.c 0000664 0000000 0000000 00000015052 14652527134 0015727 0 ustar 00root root 0000000 0000000 #include "../raft.h"
#include <stdint.h>
#include <string.h>
#include "../tracing.h"
#include "assert.h"
#include "byte.h"
#include "callbacks.h"
#include "configuration.h"
#include "convert.h"
#include "election.h"
#include "err.h"
#include "flags.h"
#include "heap.h"
#include "log.h"
#include "membership.h"
#define DEFAULT_ELECTION_TIMEOUT 1000 /* One second */
#define DEFAULT_HEARTBEAT_TIMEOUT 100 /* One tenth of a second */
#define DEFAULT_INSTALL_SNAPSHOT_TIMEOUT 30000 /* 30 seconds */
#define DEFAULT_SNAPSHOT_THRESHOLD 1024
#define DEFAULT_SNAPSHOT_TRAILING 2048
/* Number of milliseconds after which a server promotion will be aborted if the
* server hasn't caught up with the logs yet. */
#define DEFAULT_MAX_CATCH_UP_ROUNDS 10
#define DEFAULT_MAX_CATCH_UP_ROUND_DURATION (5 * 1000)
int raft_version_number(void)
{
return RAFT_VERSION_NUMBER;
}
static int ioFsmVersionCheck(struct raft *r,
struct raft_io *io,
struct raft_fsm *fsm);
int raft_init(struct raft *r,
struct raft_io *io,
struct raft_fsm *fsm,
const raft_id id,
const char *address)
{
int rv;
assert(r != NULL);
rv = ioFsmVersionCheck(r, io, fsm);
if (rv != 0) {
goto err;
}
r->io = io;
r->io->data = r;
r->fsm = fsm;
r->tracer = NULL;
r->id = id;
/* Make a copy of the address */
r->address = RaftHeapMalloc(strlen(address) + 1);
if (r->address == NULL) {
rv = RAFT_NOMEM;
goto err;
}
strcpy(r->address, address);
r->current_term = 0;
r->voted_for = 0;
r->log = logInit();
if (r->log == NULL) {
rv = RAFT_NOMEM;
goto err_after_address_alloc;
}
raft_configuration_init(&r->configuration);
raft_configuration_init(&r->configuration_last_snapshot);
r->configuration_committed_index = 0;
r->configuration_uncommitted_index = 0;
r->election_timeout = DEFAULT_ELECTION_TIMEOUT;
r->heartbeat_timeout = DEFAULT_HEARTBEAT_TIMEOUT;
r->install_snapshot_timeout = DEFAULT_INSTALL_SNAPSHOT_TIMEOUT;
r->commit_index = 0;
r->last_applied = 0;
r->last_stored = 0;
r->state = RAFT_UNAVAILABLE;
r->leader_state.voter_contacts = 0;
rv = raftInitCallbacks(r);
if (rv != 0) {
goto err_after_address_alloc;
}
r->transfer = NULL;
r->snapshot.pending.term = 0;
r->snapshot.threshold = DEFAULT_SNAPSHOT_THRESHOLD;
r->snapshot.trailing = DEFAULT_SNAPSHOT_TRAILING;
r->snapshot.put.data = NULL;
r->close_cb = NULL;
memset(r->errmsg, 0, sizeof r->errmsg);
r->pre_vote = false;
r->max_catch_up_rounds = DEFAULT_MAX_CATCH_UP_ROUNDS;
r->max_catch_up_round_duration = DEFAULT_MAX_CATCH_UP_ROUND_DURATION;
rv = r->io->init(r->io, r->id, r->address);
if (rv != 0) {
ErrMsgTransfer(r->io->errmsg, r->errmsg, "io");
goto err_after_callbacks_alloc;
}
return 0;
err_after_callbacks_alloc:
raftDestroyCallbacks(r);
err_after_address_alloc:
RaftHeapFree(r->address);
err:
assert(rv != 0);
return rv;
}
static void ioCloseCb(struct raft_io *io)
{
struct raft *r = io->data;
tracef("io close cb");
raftDestroyCallbacks(r);
raft_free(r->address);
logClose(r->log);
raft_configuration_close(&r->configuration);
raft_configuration_close(&r->configuration_last_snapshot);
if (r->close_cb != NULL) {
r->close_cb(r);
}
}
void raft_close(struct raft *r, void (*cb)(struct raft *r))
{
assert(r->close_cb == NULL);
if (r->state != RAFT_UNAVAILABLE) {
convertToUnavailable(r);
}
r->close_cb = cb;
r->io->close(r->io, ioCloseCb);
}
void raft_register_state_cb(struct raft *r, raft_state_cb cb)
{
struct raft_callbacks *cbs = raftGetCallbacks(r);
assert(cbs != NULL);
cbs->state_cb = cb;
}
void raft_set_election_timeout(struct raft *r, const unsigned msecs)
{
r->election_timeout = msecs;
}
void raft_set_heartbeat_timeout(struct raft *r, const unsigned msecs)
{
r->heartbeat_timeout = msecs;
}
void raft_set_install_snapshot_timeout(struct raft *r, const unsigned msecs)
{
r->install_snapshot_timeout = msecs;
}
void raft_set_snapshot_threshold(struct raft *r, unsigned n)
{
r->snapshot.threshold = n;
}
void raft_set_snapshot_trailing(struct raft *r, unsigned n)
{
r->snapshot.trailing = n;
}
void raft_set_max_catch_up_rounds(struct raft *r, unsigned n)
{
r->max_catch_up_rounds = n;
}
void raft_set_max_catch_up_round_duration(struct raft *r, unsigned msecs)
{
r->max_catch_up_round_duration = msecs;
}
void raft_set_pre_vote(struct raft *r, bool enabled)
{
r->pre_vote = enabled;
}
const char *raft_errmsg(struct raft *r)
{
return r->errmsg;
}
int raft_voter_contacts(struct raft *r)
{
int ret;
if (r->state == RAFT_LEADER) {
ret = (int)r->leader_state.voter_contacts;
} else {
ret = -1;
}
return ret;
}
int raft_bootstrap(struct raft *r, const struct raft_configuration *conf)
{
int rv;
if (r->state != RAFT_UNAVAILABLE) {
return RAFT_BUSY;
}
rv = r->io->bootstrap(r->io, conf);
if (rv != 0) {
return rv;
}
return 0;
}
int raft_recover(struct raft *r, const struct raft_configuration *conf)
{
int rv;
if (r->state != RAFT_UNAVAILABLE) {
return RAFT_BUSY;
}
rv = r->io->recover(r->io, conf);
if (rv != 0) {
return rv;
}
return 0;
}
const char *raft_strerror(int errnum)
{
return errCodeToString(errnum);
}
void raft_configuration_init(struct raft_configuration *c)
{
configurationInit(c);
}
void raft_configuration_close(struct raft_configuration *c)
{
configurationClose(c);
}
int raft_configuration_add(struct raft_configuration *c,
const raft_id id,
const char *address,
const int role)
{
return configurationAdd(c, id, address, role);
}
int raft_configuration_encode(const struct raft_configuration *c,
struct raft_buffer *buf)
{
return configurationEncode(c, buf);
}
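/* raft_digest() below hashes the given text plus the 64-bit value n with
 * SHA-1 and folds the last 8 bytes of the digest into a single number. For
 * example (illustrative), raft_digest("127.0.0.1:9001", 0) yields a stable
 * value that a caller could use to derive a server ID from its address. */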
unsigned long long raft_digest(const char *text, unsigned long long n)
{
struct byteSha1 sha1;
uint8_t value[20];
uint64_t n64 = byteFlip64((uint64_t)n);
uint64_t digest;
byteSha1Init(&sha1);
byteSha1Update(&sha1, (const uint8_t *)text, (uint32_t)strlen(text));
byteSha1Update(&sha1, (const uint8_t *)&n64, (uint32_t)(sizeof n64));
byteSha1Digest(&sha1, value);
memcpy(&digest, value + (sizeof value - sizeof digest), sizeof digest);
return byteFlip64(digest);
}
static int ioFsmVersionCheck(struct raft *r,
struct raft_io *io,
struct raft_fsm *fsm)
{
if (io->version == 0) {
ErrMsgPrintf(r->errmsg, "io->version must be set");
return -1;
}
if (fsm->version == 0) {
ErrMsgPrintf(r->errmsg, "fsm->version must be set");
return -1;
}
if ((fsm->version > 2 && fsm->snapshot_async != NULL) &&
((io->version < 2) || (io->async_work == NULL))) {
ErrMsgPrintf(r->errmsg,
"async snapshot requires io->version > 1 and "
"async_work method.");
return -1;
}
return 0;
}
dqlite-1.16.7/src/raft/recv.c 0000664 0000000 0000000 00000012573 14652527134 0015737 0 ustar 00root root 0000000 0000000 #include "recv.h"
#include "../tracing.h"
#include "assert.h"
#include "convert.h"
#include "entry.h"
#include "heap.h"
#include "log.h"
#include "membership.h"
#include "recv_append_entries.h"
#include "recv_append_entries_result.h"
#include "recv_install_snapshot.h"
#include "recv_request_vote.h"
#include "recv_request_vote_result.h"
#include "recv_timeout_now.h"
#include "string.h"
/* Dispatch a single RPC message to the appropriate handler. */
static int recvMessage(struct raft *r, struct raft_message *message)
{
int rv = 0;
switch (message->type) {
case RAFT_IO_APPEND_ENTRIES:
rv = recvAppendEntries(r, message->server_id,
message->server_address,
&message->append_entries);
if (rv != 0) {
entryBatchesDestroy(
message->append_entries.entries,
message->append_entries.n_entries);
}
break;
case RAFT_IO_APPEND_ENTRIES_RESULT:
rv = recvAppendEntriesResult(
r, message->server_id, message->server_address,
&message->append_entries_result);
break;
case RAFT_IO_REQUEST_VOTE:
rv = recvRequestVote(r, message->server_id,
message->server_address,
&message->request_vote);
break;
case RAFT_IO_REQUEST_VOTE_RESULT:
rv = recvRequestVoteResult(
r, message->server_id, message->server_address,
&message->request_vote_result);
break;
case RAFT_IO_INSTALL_SNAPSHOT:
rv = recvInstallSnapshot(r, message->server_id,
message->server_address,
&message->install_snapshot);
/* Already installing a snapshot, wait for it and ignore
* this one */
if (rv == RAFT_BUSY) {
raft_free(message->install_snapshot.data.base);
raft_configuration_close(
&message->install_snapshot.conf);
rv = 0;
}
break;
case RAFT_IO_TIMEOUT_NOW:
rv = recvTimeoutNow(r, message->server_id,
message->server_address,
&message->timeout_now);
break;
default:
tracef("received unknown message type (%d)",
message->type);
/* Drop message */
return 0;
};
if (rv != 0 && rv != RAFT_NOCONNECTION) {
tracef("recv: %d: %s", message->type, raft_strerror(rv));
return rv;
}
/* If there's a leadership transfer in progress, check if it has
* completed. */
if (r->transfer != NULL) {
if (r->follower_state.current_leader.id == r->transfer->id) {
membershipLeadershipTransferClose(r);
}
}
return 0;
}
void recvCb(struct raft_io *io, struct raft_message *message)
{
struct raft *r = io->data;
int rv;
if (r->state == RAFT_UNAVAILABLE) {
switch (message->type) {
case RAFT_IO_APPEND_ENTRIES:
entryBatchesDestroy(
message->append_entries.entries,
message->append_entries.n_entries);
break;
case RAFT_IO_INSTALL_SNAPSHOT:
raft_configuration_close(
&message->install_snapshot.conf);
raft_free(message->install_snapshot.data.base);
break;
}
return;
}
rv = recvMessage(r, message);
if (rv != 0) {
convertToUnavailable(r);
}
}
int recvBumpCurrentTerm(struct raft *r, raft_term term)
{
int rv;
char msg[128];
assert(r != NULL);
assert(term > r->current_term);
sprintf(msg, "remote term %lld is higher than %lld -> bump local term",
term, r->current_term);
if (r->state != RAFT_FOLLOWER) {
strcat(msg, " and step down");
}
tracef("%s", msg);
/* Save the new term to persistent store, resetting the vote. */
rv = r->io->set_term(r->io, term);
if (rv != 0) {
return rv;
}
/* Update our cache too. */
r->current_term = term;
r->voted_for = 0;
if (r->state != RAFT_FOLLOWER) {
/* Also convert to follower. */
convertToFollower(r);
}
return 0;
}
void recvCheckMatchingTerms(struct raft *r, raft_term term, int *match)
{
if (term < r->current_term) {
*match = -1;
} else if (term > r->current_term) {
*match = 1;
} else {
*match = 0;
}
}
int recvEnsureMatchingTerms(struct raft *r, raft_term term, int *match)
{
int rv;
assert(r != NULL);
assert(match != NULL);
recvCheckMatchingTerms(r, term, match);
if (*match == -1) {
tracef("old term - current_term:%llu other_term:%llu",
r->current_term, term);
return 0;
}
/* From Figure 3.1:
*
* Rules for Servers: All Servers: If RPC request or response contains
* term T > currentTerm: set currentTerm = T, convert to follower.
*
* From state diagram in Figure 3.3:
*
* [leader]: discovers server with higher term -> [follower]
*
* From Section 3.3:
*
* If a candidate or leader discovers that its term is out of date, it
* immediately reverts to follower state.
*/
if (*match == 1) {
rv = recvBumpCurrentTerm(r, term);
if (rv != 0) {
tracef("recvBumpCurrentTerm failed %d", rv);
return rv;
}
}
return 0;
}
int recvUpdateLeader(struct raft *r, const raft_id id, const char *address)
{
assert(r->state == RAFT_FOLLOWER);
r->follower_state.current_leader.id = id;
/* If the address of the current leader is the same as the given one,
* we're done. */
if (r->follower_state.current_leader.address != NULL &&
strcmp(address, r->follower_state.current_leader.address) == 0) {
return 0;
}
if (r->follower_state.current_leader.address != NULL) {
RaftHeapFree(r->follower_state.current_leader.address);
}
r->follower_state.current_leader.address =
RaftHeapMalloc(strlen(address) + 1);
if (r->follower_state.current_leader.address == NULL) {
return RAFT_NOMEM;
}
strcpy(r->follower_state.current_leader.address, address);
return 0;
}
#undef tracef
dqlite-1.16.7/src/raft/recv.h 0000664 0000000 0000000 00000003333 14652527134 0015736 0 ustar 00root root 0000000 0000000 /* Receive an RPC message. */
#ifndef RECV_H_
#define RECV_H_
#include "../raft.h"
/* Callback to be passed to the raft_io implementation. It will be invoked upon
* receiving an RPC message. */
void recvCb(struct raft_io *io, struct raft_message *message);
/* Compare a request's term with the server's current term.
*
* The match output parameter will be set to 0 if the local term matches the
* request's term, to -1 if the request's term is lower, and to 1 if the
* request's term is higher. */
void recvCheckMatchingTerms(struct raft *r, raft_term term, int *match);
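/* Example (illustrative): with r->current_term == 5, a message carrying term 4
 * yields match == -1, term 5 yields match == 0, and term 6 yields
 * match == 1. */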
/* Bump the current term and possibly step down from candidate or leader
* state. */
int recvBumpCurrentTerm(struct raft *r, raft_term term);
/* Common logic for RPC handlers, comparing the request's term with the server's
* current term and possibly deciding to reject the request or step down from
* candidate or leader.
*
* From Section 3.3:
*
* If a candidate or leader discovers that its term is out of date, it
* immediately reverts to follower state. If a server receives a request with
* a stale term number, it rejects the request.
*
* The match output parameter will be set to 0 if the local term matches the
* request's term, to -1 if the request's term is lower, and to 1 if the
* request's term was higher but we have successfully bumped the local one to
* match it (and stepped down to follower in that case, if we were not
* follower already). */
int recvEnsureMatchingTerms(struct raft *r, raft_term term, int *match);
/* If different from the current one, update information about the current
* leader. Must be called only by followers. */
int recvUpdateLeader(struct raft *r, raft_id id, const char *address);
#endif /* RECV_H_ */
dqlite-1.16.7/src/raft/recv_append_entries.c 0000664 0000000 0000000 00000011065 14652527134 0021012 0 ustar 00root root 0000000 0000000 #include "recv_append_entries.h"
#include "../tracing.h"
#include "assert.h"
#include "convert.h"
#include "entry.h"
#include "flags.h"
#include "heap.h"
#include "log.h"
#include "recv.h"
#include "replication.h"
static void recvSendAppendEntriesResultCb(struct raft_io_send *req, int status)
{
(void)status;
RaftHeapFree(req);
}
int recvAppendEntries(struct raft *r,
raft_id id,
const char *address,
const struct raft_append_entries *args)
{
struct raft_io_send *req;
struct raft_message message;
struct raft_append_entries_result *result =
&message.append_entries_result;
int match;
bool async;
int rv;
assert(r != NULL);
assert(id > 0);
assert(args != NULL);
assert(address != NULL);
tracef(
"self:%llu from:%llu@%s leader_commit:%llu n_entries:%d "
"prev_log_index:%llu prev_log_term:%llu, term:%llu",
r->id, id, address, args->leader_commit, args->n_entries,
args->prev_log_index, args->prev_log_term, args->term);
result->rejected = args->prev_log_index;
result->last_log_index = logLastIndex(r->log);
result->version = RAFT_APPEND_ENTRIES_RESULT_VERSION;
result->features = RAFT_DEFAULT_FEATURE_FLAGS;
rv = recvEnsureMatchingTerms(r, args->term, &match);
if (rv != 0) {
return rv;
}
/* From Figure 3.1:
*
* AppendEntries RPC: Receiver implementation: Reply false if term <
* currentTerm.
*/
if (match < 0) {
tracef("local term is higher -> reject ");
goto reply;
}
/* If we get here it means that the term in the request matches our
* current term or it was higher and we have possibly stepped down,
* because we discovered the current leader:
*
* From Figure 3.1:
*
* Rules for Servers: Candidates: if AppendEntries RPC is received
* from new leader: convert to follower.
*
* From Section 3.4:
*
* While waiting for votes, a candidate may receive an AppendEntries
* RPC from another server claiming to be leader. If the leader's term
* (included in its RPC) is at least as large as the candidate's
* current term, then the candidate recognizes the leader as legitimate
* and returns to follower state. If the term in the RPC is smaller than
* the candidate's current term, then the candidate rejects the RPC and
* continues in candidate state.
*
* From state diagram in Figure 3.3:
*
* [candidate]: discovers current leader -> [follower]
*
* Note that it should not be possible for us to be in leader state,
* because the leader that is sending us the request should have either
* a lower term (and in that case we reject the request above), or a
* higher term (and in that case we step down). It can't have the same
* term because at most one leader can be elected at any given term.
*/
assert(r->state == RAFT_FOLLOWER || r->state == RAFT_CANDIDATE);
assert(r->current_term == args->term);
if (r->state == RAFT_CANDIDATE) {
/* The current term and the peer one must match, otherwise we
* would have either rejected the request or stepped down to
* followers. */
assert(match == 0);
tracef("discovered leader -> step down ");
convertToFollower(r);
}
assert(r->state == RAFT_FOLLOWER);
/* Update current leader because the term in this AppendEntries RPC is
* up to date. */
rv = recvUpdateLeader(r, id, address);
if (rv != 0) {
return rv;
}
/* Reset the election timer. */
r->election_timer_start = r->io->time(r->io);
/* If we are installing a snapshot, ignore these entries. TODO: we
* should do something smarter, e.g. buffering the entries in the I/O
* backend, which should be in charge of serializing everything. */
if (replicationInstallSnapshotBusy(r) && args->n_entries > 0) {
tracef("ignoring AppendEntries RPC during snapshot install");
entryBatchesDestroy(args->entries, args->n_entries);
return 0;
}
rv = replicationAppend(r, args, &result->rejected, &async);
if (rv != 0) {
return rv;
}
if (async) {
return 0;
}
/* Echo back to the leader the point that we reached. */
result->last_log_index = r->last_stored;
reply:
result->term = r->current_term;
/* Free the entries batch, if any. */
if (args->n_entries > 0 && args->entries[0].batch != NULL) {
raft_free(args->entries[0].batch);
}
if (args->entries != NULL) {
raft_free(args->entries);
}
message.type = RAFT_IO_APPEND_ENTRIES_RESULT;
message.server_id = id;
message.server_address = address;
req = RaftHeapMalloc(sizeof *req);
if (req == NULL) {
return RAFT_NOMEM;
}
req->data = r;
rv = r->io->send(r->io, req, &message, recvSendAppendEntriesResultCb);
if (rv != 0) {
raft_free(req);
return rv;
}
return 0;
}
#undef tracef
dqlite-1.16.7/src/raft/recv_append_entries.h 0000664 0000000 0000000 00000000545 14652527134 0021020 0 ustar 00root root 0000000 0000000 /* Receive an AppendEntries message. */
#ifndef RECV_APPEND_ENTRIES_H_
#define RECV_APPEND_ENTRIES_H_
#include "../raft.h"
/* Process an AppendEntries RPC from the given server. */
int recvAppendEntries(struct raft *r,
raft_id id,
const char *address,
const struct raft_append_entries *args);
#endif /* RECV_APPEND_ENTRIES_H_ */
dqlite-1.16.7/src/raft/recv_append_entries_result.c 0000664 0000000 0000000 00000003112 14652527134 0022402 0 ustar 00root root 0000000 0000000 #include "recv_append_entries_result.h"
#include "../tracing.h"
#include "assert.h"
#include "configuration.h"
#include "recv.h"
#include "replication.h"
int recvAppendEntriesResult(struct raft *r,
const raft_id id,
const char *address,
const struct raft_append_entries_result *result)
{
int match;
const struct raft_server *server;
int rv;
assert(r != NULL);
assert(id > 0);
assert(address != NULL);
assert(result != NULL);
tracef(
"self:%llu from:%llu@%s last_log_index:%llu rejected:%llu "
"term:%llu",
r->id, id, address, result->last_log_index, result->rejected,
result->term);
if (r->state != RAFT_LEADER) {
tracef("local server is not leader -> ignore");
return 0;
}
rv = recvEnsureMatchingTerms(r, result->term, &match);
if (rv != 0) {
return rv;
}
if (match < 0) {
tracef("local term is higher -> ignore ");
return 0;
}
/* If we have stepped down, abort here.
*
* From Figure 3.1:
*
* [Rules for Servers] All Servers: If RPC request or response
* contains term T > currentTerm: set currentTerm = T, convert to
* follower.
*/
if (match > 0) {
assert(r->state == RAFT_FOLLOWER);
return 0;
}
assert(result->term == r->current_term);
/* Ignore responses from servers that have been removed */
server = configurationGet(&r->configuration, id);
if (server == NULL) {
tracef("unknown server -> ignore");
return 0;
}
/* Update the progress of this server, possibly sending further entries.
*/
rv = replicationUpdate(r, server, result);
if (rv != 0) {
return rv;
}
return 0;
}
#undef tracef
dqlite-1.16.7/src/raft/recv_append_entries_result.h 0000664 0000000 0000000 00000000624 14652527134 0022414 0 ustar 00root root 0000000 0000000 /* Receive an AppendEntries result message. */
#ifndef RECV_APPEND_ENTRIES_RESULT_H_
#define RECV_APPEND_ENTRIES_RESULT_H_
#include "../raft.h"
/* Process an AppendEntries RPC result from the given server. */
int recvAppendEntriesResult(struct raft *r,
raft_id id,
const char *address,
const struct raft_append_entries_result *result);
#endif /* RECV_APPEND_ENTRIES_RESULT_H_ */
dqlite-1.16.7/src/raft/recv_install_snapshot.c 0000664 0000000 0000000 00000063710 14652527134 0021403 0 ustar 00root root 0000000 0000000 #include
#include
#include
#include "../tracing.h"
#include "assert.h"
#include "convert.h"
#include "flags.h"
#include "log.h"
#include "recv.h"
#include "replication.h"
#include "../lib/sm.h"
#include "../raft.h"
#include "../raft/recv_install_snapshot.h"
#include "../utils.h"
/**
* =Overview
*
 * This detailed-level design is based on PL018 and describes
 * significant implementation details of the data structures and RPCs
 * introduced in it; it provides a model of operation and failure
 * handling based on the Leader's and Follower's states.
*
* =Data structures
*
 * Among other structures, we need to introduce a (persistent) container
 * `HT` to efficiently store and map checksums to their page numbers on both
 * the leader's and the follower's side. HT is implemented on top of a sqlite3
 * database with the unix VFS. Every database corresponds to a raft-related
* database and maintains the following schema:
*
* CREATE TABLE "map" ("checksum" INTEGER NOT NULL, "pageno" INTEGER NOT NULL UNIQUE)
* CREATE INDEX map_idx on map(checksum);
*
* Each database stores a mapping from checksum to page number. This
 * provides an efficient way to insert and look up records
* corresponding to the checksums and page numbers.
*/
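/* A minimal sketch of how the HT described above could be created with the
 * sqlite3 API. The helper name createHashTable() and its error handling are
 * illustrative assumptions, not the actual implementation; it assumes
 * <sqlite3.h> is included. */
#if 0
static int createHashTable(const char *path, sqlite3 **db)
{
	/* Schema from the design above: one row per (checksum, pageno) pair,
	 * with an index on checksum for fast lookups. */
	static const char *schema =
	    "CREATE TABLE \"map\" (\"checksum\" INTEGER NOT NULL, "
	    "\"pageno\" INTEGER NOT NULL UNIQUE);"
	    "CREATE INDEX map_idx ON map(checksum);";
	int rv;

	rv = sqlite3_open(path, db);
	if (rv != SQLITE_OK) {
		return rv;
	}
	/* sqlite3_exec() runs both semicolon-separated statements. */
	return sqlite3_exec(*db, schema, NULL, NULL, NULL);
}
#endif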
/**
* =Operation
*
 * 0. The Leader creates one state machine per Follower to keep track of its
 * state and moves it to the F_ONLINE state. The Follower creates a state
 * machine to keep track of its own state and moves it to the NORMAL state.
*
 * 1. The Leader learns the Follower's follower.lastLogIndex while receiving
 * replies to AppendEntries() RPCs, then either fails to find
 * follower.lastLogIndex in its RAFT log or tries and fails to construct an
 * AppendEntries() message because the WAL that contained some necessary
 * frames has been rotated out, and concludes that the snapshot installation
 * procedure is required.
*
* Leader calls leader_tick() putting struct raft_message as a parameter which
* logic moves it from F_ONLINE to F_NEEDS_SNAP state.
*
 * 2. The Leader triggers the creation of its HT and initiates the snapshot
 * installation by sending an InstallSnapshot() message as soon as the HT is
 * created.
*
 * 3. Upon receiving this message, the Follower calls follower_tick() passing
 * the struct raft_message as a parameter, which triggers the creation of the
 * HT on the Follower's side. Once the HT is created the Follower moves to
 * SIGS_CALC_STARTED and triggers a background job to calculate the checksums
 * of its pages and insert them into the HT.
*
 * 4. The Leader probes the Follower by sending Signature(calculated?) messages
 * and the Follower replies with either SignatureResult(calculated=false) if it
 * is still calculating the checksums or SignatureResult(calculated=true) if it
 * has finished. Once the calculation finishes, the Follower moves into
 * SIG_RECEIVING and the Leader moves into REQ_SIG_LOOP.
*
 * 5. The Leader sends Signature() messages to the Follower containing the page
 * range for which it wants to get the checksums.
 *
 * The Follower sends the requested checksums in a SignatureResult() message
 * back to the Leader and the Leader puts the incoming payloads of these
 * messages into the HT.
*
 * 6. When the Follower sends the checksum of its highest numbered page to the
 * Leader, it sends that SignatureResult() message with the done=true flag;
 * upon receiving it the Leader moves into the READ_PAGES_LOOP state and the
 * Follower moves into CHUNK_RECEIVING.
*
 * 7. In the READ_PAGES_LOOP state, the Leader starts iterating over
 * the local persistent state, and calculates the checksum for each page the
 * state has. Then, it tries to find the calculated checksum in the HT. Based
 * on the result of this lookup, the Leader sends CP() or MV() to the
 * Follower.
*
* The Follower receives the message and persists the page using a background
* job. Once the background job is finished, the Follower replies with
* CPResult() or MVResult().
*
 * 8. When the iteration has finished the Leader sends an
 * InstallSnapshot(..., done=true) message to the Follower. This moves the
 * Follower back to the NORMAL state, and the state machine corresponding to
 * the Follower on the Leader is moved to the SNAP_DONE state.
*
 * 9. The Leader sends an AppendEntries() RPC to the Follower and restarts the
 * algorithm from (1). The Leader's state machine is moved back to the
 * F_ONLINE state.
*
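 * A condensed happy-path trace of the message flow described above
 * (illustrative only; arguments abbreviated):
 *
 *   Leader                               Follower
 *   InstallSnapshot()             ->
 *                                 <-    InstallSnapshotResult()
 *   Signature(calculated?)        ->
 *                                 <-    SignatureResult(calculated=true)
 *   Signature(page range)         ->
 *                                 <-    SignatureResult(checksums, done=true)
 *   CP()/MV()                     ->
 *                                 <-    CPResult()/MVResult()
 *   InstallSnapshot(done=true)    ->
 *                                 <-    InstallSnapshotResult()
 *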
* =Failure model
*
* ==Unavailability of the Leader and Follower.
*
 * To handle use-cases where either party of the communication becomes
 * unavailable for a while without crashing, the following assumptions
 * are made:
*
 * - Signature() or InstallSnapshot(MV/CP) messages are idempotent and
 *    can be applied to the persistent state many times, resulting in
 *    the same transition.
*
 * - Each message with data chunks carries information about the
 *    "chunk index". Chunk indexes come in monotonically increasing
 *    order.
*
 * - Each reply message acknowledges that the data was received (or
 *    ignored) by sending the `result` field back to the counterpart
 *    along with the last known chunk index, as a confirmation that the
 *    receiver "knows everything up to the given chunk index".
*
 * - If a party notices that the last known chunk index sent back to it
 *    doesn't match its own, the communication gets restarted from
 *    the lowest known index.
*
 * If a reply is not received, the Leader will eventually time out and retry
 * sending the same message.
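 *
 * As a minimal sketch of the receiver side of this rule (the field and
 * helper names here are illustrative assumptions, not the actual message
 * layout):
 *
 *   if (msg->chunk_index <= last_known_index) {
 *           reply(last_known_index);      // duplicate: ack and drop
 *   } else if (msg->chunk_index > last_known_index + 1) {
 *           reply(last_known_index);      // gap: restart from what we know
 *   } else {
 *           apply(msg);
 *           reply(++last_known_index);    // in order: apply and ack
 *   }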
*
* ==Crashes of the Leader and Follower.
*
* Crashes of the Leader are handled by Raft when a new leader is elected
* and the snapshot process is restarted.
*
 * If the Follower receives a message which is not expected in the Follower's
 * current state, the Follower will reply using the message's result RPC,
 * setting the unexpected=true flag. This response prompts the Leader to
 * restart the snapshot installation procedure.
*
 * In particular, if the Follower crashes it will restart its state machine in
 * the NORMAL state and reply with the unexpected=true flag to any message not
 * expected in the NORMAL state, prompting the Leader to restart the
 * procedure.
*
* =State model
*
* Definitions:
*
* Rf -- raft index sent in AppendEntriesResult() from Follower to Leader
* Tf -- Follower's term sent in AppendEntriesResult() from Follower to Leader
*
* Tl -- Leader's term
* Rl -- raft index of the Leader
*
* Leader's state machine:
*
* +-----------------------------+
* | | AppendEntriesResult() received
* | *Result(unexpected=true) | raft_log.find(Rf) == "FOUND"
* | received V +------------+
* | +-------------> F_ONLINE <------------+
* | | |
* | | | AppendEntriesResult() received
* | | | Rf << Rl && raft_log.find(Rf) == "ENOENTRY"
* | | V Trigger background job.
* | +--------------- HT_WAIT
* | | V HT creation finished,
* | +------------- F_NEEDS_SNAP*
* | | | InstallSnapshot() sent,
* | | V InstallSnapshotResult() received.
* | +----------- CHECK_F_HAS_SIGS* <-----------------------+ SignatureResult() had
* | | | Signature(calculated?) sent, | calculated=false and
* | | V SignatureResult() received. | timeout reached.
* | +------------- WAIT_SIGS -----------------------------+
* | | V SignatureResult() had calculated=true.
* | +------------- REQ_SIG_LOOP* <-------------------------+
* | | | Signature() sent, | Signature persisted in HT,
* | | V SignatureResult() received. | there are some pending
* | +------------- RECV_SIG_PART | signatures.
* | | V Background job triggered. |
 * | +---------- PERSISTED_SIG_PART ------------------------+
* | | | Signature persisted in HT,
* | | V all signatures have been persisted.
* | +----------- READ_PAGES_LOOP <-------------------------+
* | | V Background job triggered. | There are pending pages to
* | +-------------- PAGE_READ* | be sent.
* | | | Page read from disk, |
* | | V CP()/MV() sent. |
* | +-------------- PAGE_SENT -----------------------------+
* | | V All pages sent and acked.
* | +-------------- SNAP_DONE
* | | | InstallSnapshot(done=true) sent,
* | | V and reply received.
* | +---------------- FINAL
* | |
* +-----------------------------+
*
 * Note that all states marked with (*) have an extra transition not
 * represented in the diagram above. When the Leader sends a message a timeout
 * is always scheduled. If the reply is not received and the timeout expires,
 * we stay in the same state and re-send the message.
*
* Follower's state machine:
*
* +------+ (%)
* +-------------------> NORMAL <----+
* | +-----------> |
* | | | InstallSnapshot() received.
* | | V
* | +--------- HT_CREATE
* | | V Trigger background job.
* | +---------- HT_WAIT
* | | | Background job finishes,
* | | | InstallSnapshotResult() sent.
* | | V
* | +------ SIGS_CALC_STARTED
* | | V Trigger background job.
* | +------ SIGS_CALC_LOOP <--------------------------+
* | | V Signature(calculated?) received. | SignatureResult(calculated=false) sent.
* | +--- SIGS_CALC_MSG_RECEIVED ----------------------+
* | | | Signatures for all db pages have been calculated.
* | | V SignatureResult(calculated=true) sent.
* | | SIGS_CALC_DONE
* | | V
* | +------- SIG_RECEIVING <--------------------------+
* | | V Signature() received. |
* | +------- SIG_PROCESSED |
* | | V Background job triggered. | Signature() had done=false,
* | +--------- SIG_READ | SignatureResult() sent.
* | | V Checksum is read from HT. |
* | +--------- SIG_REPLIED ---------------------------+
* | | | Signature() had done=true,
* | | V SignatureResult() sent.
* | +------- CHUNK_RECEIVING <------------------------+
* | | V CP()/MV() received. |
* | +------- CHUNK_PROCESSED |
* | | V Background job triggered. |
* | +------- CHUNK_APPLIED |
* | | V Chunk has been written to disk. |
* | +------- CHUNK_REPLIED ---------------------------+
* | (@ || %) | CP()/MV() had done=true.
* | V CPResult()/MVResult() sent.
* | FINAL
* | |
* +-----------------------+
*
* (@) -- AppendEntries() received && Tf < Tl
 * (%) -- Signature()/CP()/MV() received and receiving a message of that type
 * is unexpected in the current state. *Result(unexpected=true) sent.
*/
/* TODO this uses several GNU extensions, do we use it?
#define RC(rc) ({ \
typeof(rc) __rc = (rc); \
printf("< rc=%d\n", __rc); \
__rc; \
}) */
enum rpc_state {
RPC_INIT,
RPC_FILLED,
RPC_SENT,
RPC_TIMEDOUT,
RPC_REPLIED,
RPC_ERROR,
RPC_END,
RPC_NR,
};
/* clang-format off */
static const struct sm_conf rpc_sm_conf[RPC_NR] = {
[RPC_INIT] = {
.flags = SM_INITIAL | SM_FINAL,
.name = "init",
.allowed = BITS(RPC_FILLED)
| BITS(RPC_ERROR),
},
[RPC_FILLED] = {
.name = "filled",
.allowed = BITS(RPC_SENT)
| BITS(RPC_ERROR),
},
[RPC_SENT] = {
.name = "sent",
.allowed = BITS(RPC_TIMEDOUT)
| BITS(RPC_REPLIED)
| BITS(RPC_ERROR)
| BITS(RPC_END),
},
[RPC_TIMEDOUT] = {
.name = "timedout",
.allowed = BITS(RPC_INIT),
},
[RPC_REPLIED] = {
.name = "replied",
.allowed = BITS(RPC_INIT)
| BITS(RPC_END),
},
[RPC_ERROR] = {
.name = "error",
.allowed = BITS(RPC_INIT),
.flags = SM_FINAL,
},
[RPC_END] = {
.name = "end",
.flags = SM_FINAL,
},
};
/* clang-format on */
enum work_state {
WORK_INIT,
WORK_DONE,
WORK_ERROR,
WORK_NR,
};
static const struct sm_conf work_sm_conf[WORK_NR] = {
[WORK_INIT] = {
.flags = SM_INITIAL | SM_FINAL,
.name = "w_init",
.allowed = BITS(WORK_DONE) | BITS(WORK_ERROR),
},
[WORK_DONE] = {
.flags = SM_FINAL,
.name = "w_done",
},
[WORK_ERROR] = {
.flags = SM_FINAL,
.name = "w_error",
},
};
enum to_state {
TO_INIT,
TO_STARTED,
TO_EXPIRED,
TO_CANCELED,
TO_NR,
};
/* clang-format off */
static const struct sm_conf to_sm_conf[TO_NR] = {
[TO_INIT] = {
.flags = SM_INITIAL | SM_FINAL,
.name = "init",
.allowed = BITS(TO_STARTED),
},
[TO_STARTED] = {
.flags = SM_FINAL,
.name = "started",
.allowed = BITS(TO_EXPIRED) | BITS(TO_CANCELED),
},
[TO_EXPIRED] = {
.flags = SM_FINAL,
.name = "expired",
},
[TO_CANCELED] = {
.flags = SM_FINAL,
.name = "canceled",
},
};
/* clang-format on */
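/* Sentinel "message" pointers used to drive the tick functions for events
 * that are not actual incoming RPCs: a completed send, an expired timeout and
 * a finished background job. They are compared by pointer identity only and
 * are never dereferenced. */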
#define M_MSG_SENT ((const struct raft_message *) 3)
#define M_TIMEOUT ((const struct raft_message *) 2)
#define M_WORK_DONE ((const struct raft_message *) 1)
static bool is_main_thread(void)
{
// TODO: thread local storage.
return true;
}
static bool work_sm_invariant(const struct sm *sm, int prev_state)
{
(void)sm;
(void)prev_state;
return true;
}
bool leader_sm_invariant(const struct sm *sm, int prev_state)
{
(void)sm;
(void)prev_state;
return true;
}
bool follower_sm_invariant(const struct sm *sm, int prev_state)
{
(void)sm;
(void)prev_state;
return true;
}
static bool rpc_sm_invariant(const struct sm *sm, int prev_state)
{
(void)sm;
(void)prev_state;
return true;
}
static bool to_sm_invariant(const struct sm *sm, int prev_state)
{
(void)sm;
(void)prev_state;
return true;
}
static void leader_work_done(struct work *w)
{
struct leader *leader = CONTAINER_OF(w, struct leader, work);
sm_move(&w->sm, WORK_DONE);
leader_tick(leader, M_WORK_DONE);
}
static void follower_work_done(struct work *w)
{
struct follower *follower = CONTAINER_OF(w, struct follower, work);
sm_move(&w->sm, WORK_DONE);
follower_tick(follower, M_WORK_DONE);
}
static void rpc_to_cb(uv_timer_t *handle)
{
struct timeout *to = CONTAINER_OF(handle, struct timeout, handle);
struct rpc *rpc = CONTAINER_OF(to, struct rpc, timeout);
struct leader *leader = CONTAINER_OF(rpc, struct leader, rpc);
sm_move(&to->sm, TO_EXPIRED);
sm_move(&rpc->sm, RPC_TIMEDOUT);
leader_tick(leader, M_TIMEOUT);
}
static void leader_to_cb(uv_timer_t *handle)
{
struct timeout *to = CONTAINER_OF(handle, struct timeout, handle);
struct leader *leader = CONTAINER_OF(to, struct leader, timeout);
sm_move(&to->sm, TO_EXPIRED);
leader_tick(leader, M_TIMEOUT);
}
static void leader_to_start(struct leader *leader,
struct timeout *to,
unsigned delay,
to_cb_op to_cb)
{
leader->ops->to_init(to);
sm_init(&to->sm, to_sm_invariant, NULL, to_sm_conf, "to", TO_INIT);
leader->ops->to_start(to, delay, to_cb);
sm_relate(&leader->sm, &to->sm);
sm_move(&to->sm, TO_STARTED);
}
static void leader_to_cancel(struct leader *leader, struct timeout *to)
{
leader->ops->to_stop(to);
sm_move(&to->sm, TO_CANCELED);
}
static void leader_sent_cb(struct sender *s, int rc)
{
struct rpc *rpc = CONTAINER_OF(s, struct rpc, sender);
struct leader *leader = CONTAINER_OF(rpc, struct leader, rpc);
if (UNLIKELY(rc != 0)) {
sm_move(&rpc->sm, RPC_ERROR);
return;
}
leader_tick(leader, M_MSG_SENT);
}
static void follower_sent_cb(struct sender *s, int rc)
{
struct rpc *rpc = CONTAINER_OF(s, struct rpc, sender);
struct follower *follower = CONTAINER_OF(rpc, struct follower, rpc);
if (UNLIKELY(rc != 0)) {
sm_move(&rpc->sm, RPC_ERROR);
return;
}
follower_tick(follower, M_MSG_SENT);
}
static bool is_a_trigger_leader(const struct leader *leader, const struct raft_message *incoming)
{
(void)leader;
(void)incoming;
return true;
}
static bool is_a_trigger_follower(const struct follower *follower,
const struct raft_message *incoming)
{
switch (sm_state(&follower->sm)) {
case FS_SIGS_CALC_LOOP:
return incoming != M_WORK_DONE;
case FS_SIG_PROCESSED:
case FS_CHUNCK_PROCESSED:
return incoming == M_WORK_DONE;
}
return true;
}
static bool is_a_duplicate(const void *state,
const struct raft_message *incoming)
{
(void)state;
(void)incoming;
return false;
}
static void work_init(struct work *w)
{
sm_init(&w->sm, work_sm_invariant, NULL, work_sm_conf, "work", WORK_INIT);
}
static void rpc_init(struct rpc *rpc)
{
sm_init(&rpc->sm, rpc_sm_invariant, NULL, rpc_sm_conf, "rpc", RPC_INIT);
}
static void rpc_fini(struct rpc *rpc)
{
sm_move(&rpc->sm, RPC_END);
}
static void work_fill_leader(struct leader *leader)
{
leader->work_cb = leader->ops->ht_create;
work_init(&leader->work);
sm_relate(&leader->sm, &leader->work.sm);
}
static void work_fill_follower(struct follower *follower)
{
switch (sm_state(&follower->sm)) {
case FS_HT_CREATE:
follower->work_cb = follower->ops->ht_create;
break;
case FS_SIGS_CALC_STARTED:
follower->work_cb = follower->ops->fill_ht;
break;
case FS_SIG_RECEIVING:
follower->work_cb = follower->ops->read_sig;
break;
case FS_CHUNCK_RECEIVING:
follower->work_cb = follower->ops->write_chunk;
break;
}
work_init(&follower->work);
sm_relate(&follower->sm, &follower->work.sm);
}
static void rpc_fill_leader(struct leader *leader)
{
rpc_init(&leader->rpc);
sm_relate(&leader->sm, &leader->rpc.sm);
sm_move(&leader->rpc.sm, RPC_FILLED);
}
static void rpc_fill_follower(struct follower *follower)
{
rpc_init(&follower->rpc);
sm_relate(&follower->sm, &follower->rpc.sm);
sm_move(&follower->rpc.sm, RPC_FILLED);
}
static int rpc_send(struct rpc *rpc, sender_send_op op, sender_cb_op sent_cb)
{
int rc = op(&rpc->sender, &rpc->message, sent_cb);
return rc;
}
static void follower_rpc_tick(struct rpc *rpc)
{
switch(sm_state(&rpc->sm)) {
case RPC_INIT:
break;
case RPC_FILLED:
sm_move(&rpc->sm, RPC_SENT);
break;
case RPC_SENT:
case RPC_TIMEDOUT:
case RPC_REPLIED:
case RPC_ERROR:
case RPC_END:
default:
break;
}
}
static void leader_rpc_tick(struct rpc *rpc)
{
switch(sm_state(&rpc->sm)) {
case RPC_INIT:
break;
case RPC_FILLED:
sm_move(&rpc->sm, RPC_SENT);
break;
case RPC_SENT:
sm_move(&rpc->sm, RPC_REPLIED);
break;
case RPC_TIMEDOUT:
case RPC_REPLIED:
case RPC_ERROR:
case RPC_END:
default:
break;
}
}
static void leader_reset(struct leader *leader)
{
(void)leader;
}
static bool is_an_unexpected_trigger(const struct leader *leader,
const struct raft_message *msg)
{
(void)leader;
if (msg == M_MSG_SENT || msg == M_TIMEOUT || msg == M_WORK_DONE) {
return false;
}
enum raft_result res = RAFT_RESULT_UNEXPECTED;
switch (msg->type) {
case RAFT_IO_APPEND_ENTRIES:
res = RAFT_RESULT_OK;
break;
case RAFT_IO_INSTALL_SNAPSHOT:
res = msg->install_snapshot.result;
break;
case RAFT_IO_INSTALL_SNAPSHOT_RESULT:
res = msg->install_snapshot_result.result;
break;
case RAFT_IO_INSTALL_SNAPSHOT_CP:
res = msg->install_snapshot_cp.result;
break;
case RAFT_IO_INSTALL_SNAPSHOT_CP_RESULT:
res = msg->install_snapshot_cp_result.result;
break;
case RAFT_IO_INSTALL_SNAPSHOT_MV:
res = msg->install_snapshot_mv.result;
break;
case RAFT_IO_INSTALL_SNAPSHOT_MV_RESULT:
res = msg->install_snapshot_mv_result.result;
break;
case RAFT_IO_SIGNATURE:
res = msg->signature.result;
break;
case RAFT_IO_SIGNATURE_RESULT:
res = msg->signature_result.result;
break;
}
return res == RAFT_RESULT_UNEXPECTED;
}
static int follower_next_state(struct sm *sm)
{
struct follower *follower = CONTAINER_OF(sm, struct follower, sm);
switch (sm_state(sm)) {
case FS_SIGS_CALC_LOOP:
return follower->sigs_calculated ? FS_SIGS_CALC_DONE : FS_SIGS_CALC_MSG_RECEIVED;
case FS_SIGS_CALC_MSG_RECEIVED:
return FS_SIGS_CALC_LOOP;
case FS_SIG_REPLIED:
return FS_CHUNCK_RECEIVING;
case FS_FINAL:
return FS_NORMAL;
}
return sm_state(sm) + 1;
}
static int leader_next_state(struct sm *sm)
{
struct leader *leader = CONTAINER_OF(sm, struct leader, sm);
switch (sm_state(sm)) {
case LS_WAIT_SIGS:
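		/* Move forward to LS_REQ_SIG_LOOP once the follower has
		 * calculated its signatures, otherwise step back to
		 * LS_CHECK_F_HAS_SIGS and probe it again. */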
return sm_state(sm) + (leader->sigs_calculated ? +1 : -1);
case LS_FINAL:
return LS_F_ONLINE;
}
return sm_state(sm) + 1;
}
__attribute__((unused)) void leader_tick(struct leader *leader, const struct raft_message *incoming)
{
(void)leader_sm_conf;
(void)leader_sm_invariant;
int rc;
struct sm *sm = &leader->sm;
const struct leader_ops *ops = leader->ops;
PRE(is_main_thread());
if (!is_a_trigger_leader(leader, incoming) ||
is_a_duplicate(leader, incoming))
return;
if (is_an_unexpected_trigger(leader, incoming)) {
leader_reset(leader);
return;
}
again:
switch(sm_state(sm)) {
case LS_F_ONLINE:
case LS_RECV_SIG_PART:
case LS_READ_PAGES_LOOP:
work_fill_leader(leader);
ops->work_queue(&leader->work, leader->work_cb, leader_work_done);
sm_move(sm, leader_next_state(sm));
break;
case LS_HT_WAIT:
case LS_PAGE_SENT:
case LS_PERSISTED_SIG_PART:
sm_move(sm, leader_next_state(sm));
goto again;
case LS_FINAL:
sm_move(sm, leader_next_state(sm));
break;
case LS_PAGE_READ:
case LS_SNAP_DONE:
case LS_F_NEEDS_SNAP:
case LS_REQ_SIG_LOOP:
case LS_CHECK_F_HAS_SIGS:
leader_rpc_tick(&leader->rpc);
switch (sm_state(&leader->rpc.sm)) {
case RPC_SENT:
leader_to_start(leader, &leader->rpc.timeout, 10000, rpc_to_cb);
return;
case RPC_REPLIED:
leader_to_cancel(leader, &leader->rpc.timeout);
rpc_fini(&leader->rpc);
sm_move(sm, leader_next_state(sm));
goto again;
}
rpc_fill_leader(leader);
rc = rpc_send(&leader->rpc, ops->sender_send, leader_sent_cb);
if (rc != 0) {
goto again;
}
break;
case LS_WAIT_SIGS:
if (leader_next_state(sm) > sm_state(sm)) {
sm_move(sm, leader_next_state(sm));
goto again;
}
leader_to_start(leader, &leader->timeout, 10000, leader_to_cb);
sm_move(sm, leader_next_state(sm));
break;
default:
IMPOSSIBLE("");
}
}
__attribute__((unused)) void follower_tick(struct follower *follower, const struct raft_message *incoming)
{
(void)follower_sm_conf;
(void)follower_sm_invariant;
int rc;
struct sm *sm = &follower->sm;
const struct follower_ops *ops = follower->ops;
if (!is_a_trigger_follower(follower, incoming) ||
is_a_duplicate(follower, incoming))
return;
PRE(is_main_thread());
again:
switch (sm_state(&follower->sm)) {
case FS_NORMAL:
case FS_SIGS_CALC_LOOP:
case FS_SIG_READ:
case FS_CHUNCK_APPLIED:
follower_rpc_tick(&follower->rpc);
if (sm_state(&follower->rpc.sm) == RPC_SENT) {
rpc_fini(&follower->rpc);
sm_move(sm, follower_next_state(sm));
goto again;
}
rpc_fill_follower(follower);
rc = rpc_send(&follower->rpc, ops->sender_send, follower_sent_cb);
if (rc != 0) {
goto again;
}
break;
case FS_SIG_PROCESSED:
case FS_CHUNCK_PROCESSED:
case FS_CHUNCK_REPLIED:
case FS_HT_WAIT:
sm_move(sm, follower_next_state(sm));
goto again;
case FS_HT_CREATE:
case FS_SIGS_CALC_STARTED:
case FS_SIG_RECEIVING:
case FS_CHUNCK_RECEIVING:
work_fill_follower(follower);
ops->work_queue(&follower->work, follower->work_cb, follower_work_done);
sm_move(sm, follower_next_state(sm));
break;
case FS_SIG_REPLIED:
case FS_SIGS_CALC_DONE:
case FS_SIGS_CALC_MSG_RECEIVED:
case FS_FINAL:
sm_move(sm, follower_next_state(sm));
break;
default:
IMPOSSIBLE("");
}
}
static void installSnapshotSendCb(struct raft_io_send *req, int status)
{
(void)status;
raft_free(req);
}
int recvInstallSnapshot(struct raft *r,
const raft_id id,
const char *address,
struct raft_install_snapshot *args)
{
struct raft_io_send *req;
struct raft_message message;
struct raft_append_entries_result *result =
&message.append_entries_result;
int rv;
int match;
bool async;
assert(address != NULL);
tracef(
"self:%llu from:%llu@%s conf_index:%llu last_index:%llu "
"last_term:%llu "
"term:%llu",
r->id, id, address, args->conf_index, args->last_index,
args->last_term, args->term);
result->rejected = args->last_index;
result->last_log_index = logLastIndex(r->log);
result->version = RAFT_APPEND_ENTRIES_RESULT_VERSION;
result->features = RAFT_DEFAULT_FEATURE_FLAGS;
rv = recvEnsureMatchingTerms(r, args->term, &match);
if (rv != 0) {
return rv;
}
if (match < 0) {
tracef("local term is higher -> reject ");
goto reply;
}
/* TODO: this logic duplicates the one in the AppendEntries handler */
assert(r->state == RAFT_FOLLOWER || r->state == RAFT_CANDIDATE);
assert(r->current_term == args->term);
if (r->state == RAFT_CANDIDATE) {
assert(match == 0);
tracef("discovered leader -> step down ");
convertToFollower(r);
}
rv = recvUpdateLeader(r, id, address);
if (rv != 0) {
return rv;
}
r->election_timer_start = r->io->time(r->io);
rv = replicationInstallSnapshot(r, args, &result->rejected, &async);
if (rv != 0) {
tracef("replicationInstallSnapshot failed %d", rv);
return rv;
}
if (async) {
return 0;
}
if (result->rejected == 0) {
/* Echo back to the leader the point that we reached. */
result->last_log_index = args->last_index;
}
reply:
result->term = r->current_term;
/* Free the snapshot data. */
raft_configuration_close(&args->conf);
raft_free(args->data.base);
message.type = RAFT_IO_APPEND_ENTRIES_RESULT;
message.server_id = id;
message.server_address = address;
req = raft_malloc(sizeof *req);
if (req == NULL) {
return RAFT_NOMEM;
}
req->data = r;
rv = r->io->send(r->io, req, &message, installSnapshotSendCb);
if (rv != 0) {
raft_free(req);
return rv;
}
return 0;
}
#undef tracef
dqlite-1.16.7/src/raft/recv_install_snapshot.h 0000664 0000000 0000000 00000015367 14652527134 0021415 0 ustar 00root root 0000000 0000000 /* InstallSnapshot RPC handlers. */
#ifndef RECV_INSTALL_SNAPSHOT_H_
#define RECV_INSTALL_SNAPSHOT_H_
#include <stdbool.h>
#include <uv.h>
#include "../raft.h"
struct work;
struct sender;
struct timeout;
typedef void (*to_cb_op)(uv_timer_t *handle);
typedef void (*work_op)(struct work *w);
typedef void (*sender_cb_op)(struct sender *s, int rc);
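/* A unit of background work scheduled through the work_queue op: work_cb runs
 * the job (possibly on a separate thread) and after_cb is invoked once the
 * job has completed. */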
struct work {
work_op work_cb;
work_op after_cb;
struct sm sm;
};
struct sender {
sender_cb_op cb;
};
struct timeout {
to_cb_op cb;
struct sm sm;
uv_timer_t handle;
};
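/* An in-flight RPC exchange: the outgoing message, the sender used to
 * transmit it and the timeout armed while waiting for the reply. Its state
 * machine follows the filled -> sent -> replied/timedout lifecycle (see
 * rpc_sm_conf in recv_install_snapshot.c). */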
struct rpc {
struct sm sm;
struct sender sender;
struct raft_message message;
struct timeout timeout;
};
typedef int (*sender_send_op)(struct sender *s,
struct raft_message *payload,
sender_cb_op cb);
struct leader_ops {
work_op ht_create;
void (*to_init)(struct timeout *to);
void (*to_stop)(struct timeout *to);
void (*to_start)(struct timeout *to, unsigned delay, to_cb_op cb);
sender_send_op sender_send;
void (*work_queue)(struct work *w,
work_op work, work_op after_cb);
};
struct follower_ops {
work_op ht_create;
work_op fill_ht;
work_op read_sig;
work_op write_chunk;
sender_send_op sender_send;
void (*work_queue)(struct work *w,
work_op work, work_op after_cb);
};
struct leader {
struct sm sm;
struct rpc rpc;
struct work work;
work_op work_cb;
struct timeout timeout;
const struct leader_ops *ops;
/* TODO dummy flags */
bool sigs_calculated;
bool sigs_more;
bool pages_more;
};
struct follower {
struct sm sm;
struct rpc rpc;
struct work work;
work_op work_cb;
const struct follower_ops *ops;
/* TODO dummy flags */
bool sigs_calculated;
};
void leader_tick(struct leader *leader, const struct raft_message *incoming);
void follower_tick(struct follower *follower, const struct raft_message *incoming);
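/* Example wiring (an illustrative sketch, not part of the API): an
 * implementation supplies the ops vtable and drives the state machine by
 * feeding incoming messages and internal events to the tick functions:
 *
 *   static const struct leader_ops my_ops = {
 *           .ht_create = my_ht_create,
 *           .to_init = my_to_init,
 *           .to_stop = my_to_stop,
 *           .to_start = my_to_start,
 *           .sender_send = my_sender_send,
 *           .work_queue = my_work_queue,
 *   };
 *
 *   struct leader leader = { .ops = &my_ops };
 *   leader_tick(&leader, incoming);
 *
 * The my_* callbacks are hypothetical; the leader's sm must also be
 * initialized before the first tick. */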
/* TODO make all of these private and static once we can write tests without
* depending on the states. */
bool leader_sm_invariant(const struct sm *sm, int prev_state);
bool follower_sm_invariant(const struct sm *sm, int prev_state);
enum leader_states {
LS_F_ONLINE,
LS_HT_WAIT,
LS_F_NEEDS_SNAP,
LS_CHECK_F_HAS_SIGS,
LS_WAIT_SIGS,
LS_REQ_SIG_LOOP,
LS_RECV_SIG_PART,
LS_PERSISTED_SIG_PART,
LS_READ_PAGES_LOOP,
LS_PAGE_READ,
LS_PAGE_SENT,
LS_SNAP_DONE,
LS_FINAL,
LS_NR,
};
/* clang-format off */
static const struct sm_conf leader_sm_conf[LS_NR] = {
[LS_F_ONLINE] = {
.flags = SM_INITIAL | SM_FINAL,
.name = "online",
.allowed = BITS(LS_HT_WAIT)
| BITS(LS_F_ONLINE),
},
[LS_HT_WAIT] = {
.name = "ht-wait",
.allowed = BITS(LS_F_NEEDS_SNAP),
},
[LS_F_NEEDS_SNAP] = {
.name = "needs-snapshot",
.allowed = BITS(LS_CHECK_F_HAS_SIGS)
| BITS(LS_F_NEEDS_SNAP)
| BITS(LS_F_ONLINE),
},
[LS_CHECK_F_HAS_SIGS] = {
.name = "check-f-has-sigs",
.allowed = BITS(LS_CHECK_F_HAS_SIGS)
| BITS(LS_WAIT_SIGS)
| BITS(LS_F_ONLINE),
},
[LS_WAIT_SIGS] = {
.name = "wait-sigs",
.allowed = BITS(LS_CHECK_F_HAS_SIGS)
| BITS(LS_REQ_SIG_LOOP)
| BITS(LS_F_ONLINE),
},
[LS_REQ_SIG_LOOP] = {
.name = "req-sig-loop",
.allowed = BITS(LS_RECV_SIG_PART)
| BITS(LS_F_ONLINE),
},
[LS_RECV_SIG_PART] = {
.name = "recv-sig",
.allowed = BITS(LS_PERSISTED_SIG_PART)
| BITS(LS_REQ_SIG_LOOP)
| BITS(LS_F_ONLINE),
},
[LS_PERSISTED_SIG_PART] = {
.name = "pers-sig",
.allowed = BITS(LS_READ_PAGES_LOOP)
| BITS(LS_REQ_SIG_LOOP)
| BITS(LS_F_ONLINE),
},
[LS_READ_PAGES_LOOP] = {
.name = "read-pages-loop",
.allowed = BITS(LS_PAGE_READ)
| BITS(LS_F_ONLINE),
},
[LS_PAGE_READ] = {
.name = "page-read",
.allowed = BITS(LS_PAGE_SENT)
| BITS(LS_F_ONLINE),
},
[LS_PAGE_SENT] = {
.name = "page-sent",
.allowed = BITS(LS_READ_PAGES_LOOP)
| BITS(LS_SNAP_DONE)
| BITS(LS_F_ONLINE),
},
[LS_SNAP_DONE] = {
.name = "snap-done",
.allowed = BITS(LS_SNAP_DONE)
| BITS(LS_FINAL),
},
[LS_FINAL] = {
.name = "final",
.allowed = BITS(LS_F_ONLINE),
},
};
/* clang-format on */
enum follower_states {
FS_NORMAL,
FS_HT_CREATE,
FS_HT_WAIT,
FS_SIGS_CALC_STARTED,
FS_SIGS_CALC_LOOP,
FS_SIGS_CALC_MSG_RECEIVED,
FS_SIGS_CALC_DONE,
FS_SIG_RECEIVING,
FS_SIG_PROCESSED,
FS_SIG_READ,
FS_SIG_REPLIED,
FS_CHUNCK_RECEIVING,
FS_CHUNCK_PROCESSED,
FS_CHUNCK_APPLIED,
FS_CHUNCK_REPLIED,
FS_FINAL,
FS_NR,
};
/* clang-format off */
static const struct sm_conf follower_sm_conf[FS_NR] = {
[FS_NORMAL] = {
.flags = SM_INITIAL | SM_FINAL,
.name = "normal",
.allowed = BITS(FS_HT_CREATE)
| BITS(FS_NORMAL),
},
[FS_HT_CREATE] = {
.name = "ht_create",
.allowed = BITS(FS_HT_WAIT)
| BITS(FS_NORMAL),
},
[FS_HT_WAIT] = {
.name = "ht_waiting",
.allowed = BITS(FS_SIGS_CALC_STARTED)
| BITS(FS_NORMAL),
},
[FS_SIGS_CALC_STARTED] = {
.name = "signatures_calc_started",
.allowed = BITS(FS_SIGS_CALC_LOOP)
| BITS(FS_NORMAL),
},
[FS_SIGS_CALC_LOOP] = {
.name = "signatures_calc_loop",
.allowed = BITS(FS_SIGS_CALC_MSG_RECEIVED)
| BITS(FS_SIGS_CALC_DONE)
| BITS(FS_NORMAL),
},
[FS_SIGS_CALC_MSG_RECEIVED] = {
.name = "signatures_msg_received",
.allowed = BITS(FS_SIGS_CALC_LOOP)
| BITS(FS_NORMAL),
},
[FS_SIGS_CALC_DONE] = {
.name = "signatures_calc_done",
.allowed = BITS(FS_SIG_RECEIVING)
| BITS(FS_NORMAL),
},
[FS_SIG_RECEIVING] = {
.name = "signature_received",
.allowed = BITS(FS_SIG_PROCESSED)
| BITS(FS_NORMAL),
},
[FS_SIG_PROCESSED] = {
.name = "signature_processed",
.allowed = BITS(FS_SIG_READ)
| BITS(FS_NORMAL),
},
[FS_SIG_READ] = {
.name = "signature_read",
.allowed = BITS(FS_SIG_REPLIED)
| BITS(FS_NORMAL),
},
[FS_SIG_REPLIED] = {
.name = "signature_sent",
.allowed = BITS(FS_CHUNCK_RECEIVING)
| BITS(FS_SIG_RECEIVING)
| BITS(FS_NORMAL),
},
[FS_CHUNCK_RECEIVING] = {
.name = "chunk_received",
.allowed = BITS(FS_CHUNCK_PROCESSED)
| BITS(FS_NORMAL),
},
[FS_CHUNCK_PROCESSED] = {
.name = "chunk_processed",
.allowed = BITS(FS_CHUNCK_APPLIED)
| BITS(FS_NORMAL),
},
[FS_CHUNCK_APPLIED] = {
.name = "chunk_applied",
.allowed = BITS(FS_CHUNCK_REPLIED)
| BITS(FS_NORMAL),
},
[FS_CHUNCK_REPLIED] = {
.name = "chunk_replied",
.allowed = BITS(FS_CHUNCK_PROCESSED)
| BITS(FS_FINAL)
| BITS(FS_NORMAL),
},
[FS_FINAL] = {
.name = "final",
.allowed = BITS(FS_NORMAL),
},
};
/* clang-format on */
/* end of TODO make this private */
/* Process an InstallSnapshot RPC from the given server. */
int recvInstallSnapshot(struct raft *r,
raft_id id,
const char *address,
struct raft_install_snapshot *args);
#endif /* RECV_INSTALL_SNAPSHOT_H_ */
dqlite-1.16.7/src/raft/recv_request_vote.c 0000664 0000000 0000000 00000010607 14652527134 0020540 0 ustar 00root root 0000000 0000000 #include "recv_request_vote.h"
#include "../tracing.h"
#include "assert.h"
#include "election.h"
#include "recv.h"
#include "replication.h"
static void requestVoteSendCb(struct raft_io_send *req, int status)
{
(void)status;
raft_free(req);
}
int recvRequestVote(struct raft *r,
const raft_id id,
const char *address,
const struct raft_request_vote *args)
{
struct raft_io_send *req;
struct raft_message message;
struct raft_request_vote_result *result = &message.request_vote_result;
bool has_leader;
int match;
int rv;
assert(r != NULL);
assert(id > 0);
assert(args != NULL);
tracef(
"self:%llu from:%llu@%s candidate_id:%llu disrupt_leader:%d "
"last_log_index:%llu "
"last_log_term:%llu pre_vote:%d term:%llu",
r->id, id, address, args->candidate_id, args->disrupt_leader,
args->last_log_index, args->last_log_term, args->pre_vote,
args->term);
result->vote_granted = false;
result->pre_vote = args->pre_vote;
result->version = RAFT_REQUEST_VOTE_RESULT_VERSION;
/* Reject the request if we have a leader.
*
* From Section 4.2.3:
*
* [Removed] servers should not be able to disrupt a leader whose
* cluster is receiving heartbeats. [...] If a server receives a
* RequestVote request within the minimum election timeout of hearing
* from a current leader, it does not update its term or grant its vote
*
* From Section 4.2.3:
*
* This change conflicts with the leadership transfer mechanism as
* described in Chapter 3, in which a server legitimately starts an
* election without waiting an election timeout. In that case,
* RequestVote messages should be processed by other servers even when
* they believe a current cluster leader exists. Those RequestVote
* requests can include a special flag to indicate this behavior ("I
* have permission to disrupt the leader - it told me to!").
*/
has_leader = r->state == RAFT_LEADER ||
(r->state == RAFT_FOLLOWER &&
r->follower_state.current_leader.id != 0);
if (has_leader && !args->disrupt_leader) {
tracef("local server has a leader -> reject ");
goto reply;
}
/* If this is a pre-vote request, don't actually increment our term or
* persist the vote. */
if (args->pre_vote) {
recvCheckMatchingTerms(r, args->term, &match);
} else {
rv = recvEnsureMatchingTerms(r, args->term, &match);
if (rv != 0) {
return rv;
}
}
/* Reject the request if we are installing a snapshot.
*
* This condition should only be reachable if the disrupt_leader flag is
* set, since otherwise we wouldn't have passed the have_leader check
* above (follower state is not cleared while a snapshot is being
* installed). */
if (replicationInstallSnapshotBusy(r)) {
tracef("installing snapshot -> reject (disrupt_leader:%d)",
(int)args->disrupt_leader);
goto reply;
}
/* From Figure 3.1:
*
* RequestVote RPC: Receiver implementation: Reply false if
* term < currentTerm.
*
*/
if (match < 0) {
tracef("local term is higher -> reject ");
goto reply;
}
/* Unless this is a pre-vote request, at this point our term must be the
* same as the request term (otherwise we would have rejected the
* request or bumped our term). */
if (!args->pre_vote) {
tracef("no pre_vote: current_term:%llu term:%llu",
r->current_term, args->term);
assert(r->current_term == args->term);
}
rv = electionVote(r, args, &result->vote_granted);
if (rv != 0) {
return rv;
}
reply:
result->term = r->current_term;
/* Nodes don't update their term when seeing a Pre-Vote RequestVote RPC.
* To prevent the candidate from ignoring the response of this node if
* it has a smaller term than the candidate, we include the term of the
* request. The smaller term can occur if this node was partitioned from
* the cluster and has reestablished connectivity. This prevents a
* cluster deadlock when a majority of the nodes is online, but they
* fail to establish quorum because the vote of a former partitioned
* node with a smaller term is needed for majority.*/
if (args->pre_vote) {
result->term = args->term;
}
message.type = RAFT_IO_REQUEST_VOTE_RESULT;
message.server_id = id;
message.server_address = address;
req = raft_malloc(sizeof *req);
if (req == NULL) {
return RAFT_NOMEM;
}
req->data = r;
rv = r->io->send(r->io, req, &message, requestVoteSendCb);
if (rv != 0) {
raft_free(req);
return rv;
}
return 0;
}
#undef tracef
dqlite-1.16.7/src/raft/recv_request_vote.h 0000664 0000000 0000000 00000000511 14652527134 0020536 0 ustar 00root root 0000000 0000000 /* RequestVote RPC handler. */
#ifndef RECV_REQUEST_VOTE_H_
#define RECV_REQUEST_VOTE_H_
#include "../raft.h"
/* Process a RequestVote RPC from the given server. */
int recvRequestVote(struct raft *r,
raft_id id,
const char *address,
const struct raft_request_vote *args);
#endif /* RECV_REQUEST_VOTE_H_ */
dqlite-1.16.7/src/raft/recv_request_vote_result.c 0000664 0000000 0000000 00000010372 14652527134 0022135 0 ustar 00root root 0000000 0000000 #include "recv_request_vote_result.h"
#include "../tracing.h"
#include "assert.h"
#include "configuration.h"
#include "convert.h"
#include "election.h"
#include "recv.h"
#include "replication.h"
int recvRequestVoteResult(struct raft *r,
raft_id id,
const char *address,
const struct raft_request_vote_result *result)
{
size_t votes_index;
int match;
int rv;
(void)address;
assert(r != NULL);
assert(id > 0);
tracef(
"self:%llu from:%llu@%s term:%llu vote_granted:%d pre_vote:%d "
"version:%d",
r->id, id, address, result->term, result->vote_granted,
result->pre_vote, result->version);
votes_index = configurationIndexOfVoter(&r->configuration, id);
if (votes_index == r->configuration.n) {
tracef("non-voting or unknown server -> reject");
return 0;
}
/* Ignore responses if we are not candidate anymore */
if (r->state != RAFT_CANDIDATE) {
tracef("local server is not candidate -> ignore");
return 0;
}
/* If we're in the pre-vote phase, don't actually increment our term
* right now (we'll do it later, if we start the second phase), and also
* don't step down if the peer is just one term ahead (this is okay as
* in the request we sent our current term plus one). */
if (r->candidate_state.in_pre_vote) {
recvCheckMatchingTerms(r, result->term, &match);
} else {
rv = recvEnsureMatchingTerms(r, result->term, &match);
if (rv != 0) {
return rv;
}
}
/* Converted to follower as a result of seeing a higher term. */
if (r->state != RAFT_CANDIDATE) {
tracef("no longer candidate -> ignore");
return 0;
}
if (match < 0) {
/* If the term in the result is older than ours, this is an old
* message we should ignore, because the node who voted for us
* would have obtained our term. This happens if the network is
* pretty choppy. */
tracef("local term is higher -> ignore");
return 0;
}
/* Avoid counting pre-vote votes as regular votes. */
if (result->version > 1 && result->pre_vote &&
!r->candidate_state.in_pre_vote) {
tracef("receive stale pre-vote response -> ignore");
return 0;
}
/* This can happen when a candidate wins a pre-vote, bumps its term,
* sends real RequestVote RPCs, crashes, comes online, starts a pre-vote
* and then receives the response to the RequestVote RPC it sent
* out before crashing. */
if (result->version > 1 && !result->pre_vote &&
r->candidate_state.in_pre_vote) {
tracef("receive vote response during pre-vote -> ignore");
return 0;
}
	/* If we're in the pre-vote phase, check that the peer's term is at
	 * most one term ahead of ours (possibly stepping down). If we're in
	 * the actual voting phase, our term must be the same as the response
	 * term (otherwise we would have either ignored the result or bumped
	 * our term).
	 */
if (r->candidate_state.in_pre_vote) {
if (match > 0) {
if (result->term > r->current_term + 1) {
assert(!result->vote_granted);
rv = recvBumpCurrentTerm(r, result->term);
return rv;
}
}
} else {
assert(result->term == r->current_term);
}
/* If the vote was granted and we reached quorum, convert to leader.
*
* From Figure 3.1:
*
	 *   If votes received from majority of servers: become leader.
*
* From state diagram in Figure 3.3:
*
* [candidate]: receives votes from majority of servers -> [leader]
*
* From Section 3.4:
*
* A candidate wins an election if it receives votes from a majority
* of the servers in the full cluster for the same term. Each server
* will vote for at most one candidate in a given term, on a
	 *   first-come-first-served basis [...]. Once a candidate wins an
* election, it becomes leader.
*/
if (result->vote_granted) {
if (electionTally(r, votes_index)) {
if (r->candidate_state.in_pre_vote) {
tracef(
"votes quorum reached -> pre-vote "
"successful");
r->candidate_state.in_pre_vote = false;
rv = electionStart(r);
if (rv != 0) {
return rv;
}
} else {
tracef(
"votes quorum reached -> convert to "
"leader");
rv = convertToLeader(r);
if (rv != 0) {
return rv;
}
/* Send initial heartbeat. */
replicationHeartbeat(r);
}
} else {
tracef("votes quorum not reached");
}
} else {
tracef("vote was not granted");
}
return 0;
}
#undef tracef
dqlite-1.16.7/src/raft/recv_request_vote_result.h 0000664 0000000 0000000 00000000573 14652527134 0022144 0 ustar 00root root 0000000 0000000 /* Receive a RequestVote result. */
#ifndef RECV_REQUEST_VOTE_RESULT_H_
#define RECV_REQUEST_VOTE_RESULT_H_
#include "../raft.h"
/* Process a RequestVote RPC result from the given server. */
int recvRequestVoteResult(struct raft *r,
raft_id id,
const char *address,
const struct raft_request_vote_result *result);
#endif /* RAFT_RECV_REQUEST_VOTE_RESULT_H_ */
dqlite-1.16.7/src/raft/recv_timeout_now.c 0000664 0000000 0000000 00000003661 14652527134 0020366 0 ustar 00root root 0000000 0000000 #include "recv_timeout_now.h"
#include "../tracing.h"
#include "assert.h"
#include "configuration.h"
#include "convert.h"
#include "log.h"
#include "recv.h"
int recvTimeoutNow(struct raft *r,
const raft_id id,
const char *address,
const struct raft_timeout_now *args)
{
const struct raft_server *local_server;
raft_index local_last_index;
raft_term local_last_term;
int match;
int rv;
assert(r != NULL);
assert(id > 0);
assert(args != NULL);
(void)address;
tracef(
"self:%llu from:%llu@%s last_log_index:%llu last_log_term:%llu "
"term:%llu",
r->id, id, address, args->last_log_index, args->last_log_term,
args->term);
/* Ignore the request if we are not voters. */
local_server = configurationGet(&r->configuration, r->id);
if (local_server == NULL || local_server->role != RAFT_VOTER) {
tracef("non-voter");
return 0;
}
/* Ignore the request if we are not follower, or we have different
* leader. */
if (r->state != RAFT_FOLLOWER ||
r->follower_state.current_leader.id != id) {
tracef("Ignore - r->state:%d current_leader.id:%llu", r->state,
r->follower_state.current_leader.id);
return 0;
}
/* Possibly update our term. Ignore the request if it turns out we have
* a higher term. */
rv = recvEnsureMatchingTerms(r, args->term, &match);
if (rv != 0) {
return rv;
}
if (match < 0) {
return 0;
}
	/* Ignore the request if our log is not up-to-date. */
local_last_index = logLastIndex(r->log);
local_last_term = logLastTerm(r->log);
if (local_last_index != args->last_log_index ||
local_last_term != args->last_log_term) {
return 0;
}
/* Finally, ignore the request if we're working on persisting some
* entries. */
if (r->follower_state.append_in_flight_count > 0) {
return 0;
}
/* Convert to candidate and start a new election. */
rv = convertToCandidate(r, true /* disrupt leader */);
if (rv != 0) {
return rv;
}
return 0;
}
#undef tracef
dqlite-1.16.7/src/raft/recv_timeout_now.h 0000664 0000000 0000000 00000000505 14652527134 0020365 0 ustar 00root root 0000000 0000000 /* Receive a TimeoutNow message. */
#ifndef RECV_TIMEOUT_NOW_H_
#define RECV_TIMEOUT_NOW_H_
#include "../raft.h"
/* Process a TimeoutNow RPC from the given server. */
int recvTimeoutNow(struct raft *r,
raft_id id,
const char *address,
const struct raft_timeout_now *args);
#endif /* RECV_TIMEOUT_NOW_H_ */
dqlite-1.16.7/src/raft/replication.c 0000664 0000000 0000000 00000140213 14652527134 0017302 0 ustar 00root root 0000000 0000000 #include <string.h>
#include "assert.h"
#include "configuration.h"
#include "convert.h"
#include "entry.h"
#ifdef __GLIBC__
#include "error.h"
#endif
#include "../tracing.h"
#include "err.h"
#include "flags.h"
#include "heap.h"
#include "lifecycle.h"
#include "log.h"
#include "membership.h"
#include "progress.h"
#include "../lib/queue.h"
#include "replication.h"
#include "request.h"
#include "snapshot.h"
#ifndef max
#define max(a, b) ((a) < (b) ? (b) : (a))
#endif
#ifndef min
#define min(a, b) ((a) < (b) ? (a) : (b))
#endif
/* Context of a RAFT_IO_APPEND_ENTRIES request that was submitted with
 * raft_io->send(). */
struct sendAppendEntries
{
struct raft *raft; /* Instance sending the entries. */
struct raft_io_send send; /* Underlying I/O send request. */
raft_index index; /* Index of the first entry in the request. */
struct raft_entry *entries; /* Entries referenced in the request. */
unsigned n; /* Length of the entries array. */
raft_id server_id; /* Destination server. */
};
/* Callback invoked after a request to send an AppendEntries RPC has
 * completed. */
static void sendAppendEntriesCb(struct raft_io_send *send, const int status)
{
struct sendAppendEntries *req = send->data;
struct raft *r = req->raft;
unsigned i = configurationIndexOf(&r->configuration, req->server_id);
if (r->state == RAFT_LEADER && i < r->configuration.n) {
if (status != 0) {
tracef(
"failed to send append entries to server %llu: %s",
req->server_id, raft_strerror(status));
/* Go back to probe mode. */
progressToProbe(r, i);
}
}
/* Tell the log that we're done referencing these entries. */
logRelease(r->log, req->index, req->entries, req->n);
raft_free(req);
}
/* Send an AppendEntries message to the i'th server, including all log entries
* from the given point onwards. */
static int sendAppendEntries(struct raft *r,
const unsigned i,
const raft_index prev_index,
const raft_term prev_term)
{
struct raft_server *server = &r->configuration.servers[i];
struct raft_message message;
struct raft_append_entries *args = &message.append_entries;
struct sendAppendEntries *req;
raft_index next_index = prev_index + 1;
int rv;
args->term = r->current_term;
args->prev_log_index = prev_index;
args->prev_log_term = prev_term;
/* TODO: implement a limit to the total size of the entries being sent
*/
rv = logAcquire(r->log, next_index, &args->entries, &args->n_entries);
if (rv != 0) {
goto err;
}
/* From Section 3.5:
*
* The leader keeps track of the highest index it knows to be
* committed, and it includes that index in future AppendEntries RPCs
* (including heartbeats) so that the other servers eventually find out.
* Once a follower learns that a log entry is committed, it applies the
* entry to its local state machine (in log order)
*/
args->leader_commit = r->commit_index;
tracef(
"send %u entries starting at %llu to server %llu (last index %llu)",
args->n_entries, args->prev_log_index, server->id,
logLastIndex(r->log));
message.type = RAFT_IO_APPEND_ENTRIES;
message.server_id = server->id;
message.server_address = server->address;
req = raft_malloc(sizeof *req);
if (req == NULL) {
rv = RAFT_NOMEM;
goto err_after_entries_acquired;
}
req->raft = r;
req->index = args->prev_log_index + 1;
req->entries = args->entries;
req->n = args->n_entries;
req->server_id = server->id;
req->send.data = req;
rv = r->io->send(r->io, &req->send, &message, sendAppendEntriesCb);
if (rv != 0) {
goto err_after_req_alloc;
}
if (progressState(r, i) == PROGRESS__PIPELINE) {
/* Optimistically update progress. */
progressOptimisticNextIndex(r, i, req->index + req->n);
}
progressUpdateLastSend(r, i);
return 0;
err_after_req_alloc:
raft_free(req);
err_after_entries_acquired:
logRelease(r->log, next_index, args->entries, args->n_entries);
err:
assert(rv != 0);
return rv;
}
/* Context of a RAFT_IO_INSTALL_SNAPSHOT request that was submitted with
 * raft_io->send(). */
struct sendInstallSnapshot
{
struct raft *raft; /* Instance sending the snapshot. */
struct raft_io_snapshot_get get; /* Snapshot get request. */
struct raft_io_send send; /* Underlying I/O send request. */
struct raft_snapshot *snapshot; /* Snapshot to send. */
raft_id server_id; /* Destination server. */
};
static void sendInstallSnapshotCb(struct raft_io_send *send, int status)
{
struct sendInstallSnapshot *req = send->data;
struct raft *r = req->raft;
const struct raft_server *server;
server = configurationGet(&r->configuration, req->server_id);
if (status != 0) {
tracef("send install snapshot: %s", raft_strerror(status));
if (r->state == RAFT_LEADER && server != NULL) {
unsigned i;
i = configurationIndexOf(&r->configuration,
req->server_id);
progressAbortSnapshot(r, i);
}
}
snapshotClose(req->snapshot);
raft_free(req->snapshot);
raft_free(req);
}
static void sendSnapshotGetCb(struct raft_io_snapshot_get *get,
struct raft_snapshot *snapshot,
int status)
{
struct sendInstallSnapshot *req = get->data;
struct raft *r = req->raft;
struct raft_message message;
struct raft_install_snapshot *args = &message.install_snapshot;
const struct raft_server *server = NULL;
bool progress_state_is_snapshot = false;
unsigned i = 0;
int rv;
if (status != 0) {
tracef("get snapshot %s", raft_strerror(status));
goto abort;
}
if (r->state != RAFT_LEADER) {
goto abort_with_snapshot;
}
server = configurationGet(&r->configuration, req->server_id);
if (server == NULL) {
/* Probably the server was removed in the meantime. */
goto abort_with_snapshot;
}
i = configurationIndexOf(&r->configuration, req->server_id);
progress_state_is_snapshot = progressState(r, i) == PROGRESS__SNAPSHOT;
if (!progress_state_is_snapshot) {
/* Something happened in the meantime. */
goto abort_with_snapshot;
}
assert(snapshot->n_bufs == 1);
message.type = RAFT_IO_INSTALL_SNAPSHOT;
message.server_id = server->id;
message.server_address = server->address;
args->term = r->current_term;
args->last_index = snapshot->index;
args->last_term = snapshot->term;
args->conf_index = snapshot->configuration_index;
args->conf = snapshot->configuration;
args->data = snapshot->bufs[0];
req->snapshot = snapshot;
req->send.data = req;
tracef("sending snapshot with last index %llu to %llu", snapshot->index,
server->id);
rv = r->io->send(r->io, &req->send, &message, sendInstallSnapshotCb);
if (rv != 0) {
goto abort_with_snapshot;
}
goto out;
abort_with_snapshot:
snapshotClose(snapshot);
raft_free(snapshot);
abort:
if (r->state == RAFT_LEADER && server != NULL &&
progress_state_is_snapshot) {
progressAbortSnapshot(r, i);
}
raft_free(req);
out:
return;
}
/* Send the latest snapshot to the i'th server */
static int sendSnapshot(struct raft *r, const unsigned i)
{
struct raft_server *server = &r->configuration.servers[i];
struct sendInstallSnapshot *request;
int rv;
progressToSnapshot(r, i);
request = raft_malloc(sizeof *request);
if (request == NULL) {
rv = RAFT_NOMEM;
goto err;
}
request->raft = r;
request->server_id = server->id;
request->get.data = request;
/* TODO: make sure that the I/O implementation really returns the latest
* snapshot *at this time* and not any snapshot that might be stored at
* a later point. Otherwise the progress snapshot_index would be wrong.
*/
rv = r->io->snapshot_get(r->io, &request->get, sendSnapshotGetCb);
if (rv != 0) {
goto err_after_req_alloc;
}
progressUpdateSnapshotLastSend(r, i);
return 0;
err_after_req_alloc:
raft_free(request);
err:
progressAbortSnapshot(r, i);
assert(rv != 0);
return rv;
}
int replicationProgress(struct raft *r, unsigned i)
{
struct raft_server *server = &r->configuration.servers[i];
bool progress_state_is_snapshot =
progressState(r, i) == PROGRESS__SNAPSHOT;
raft_index snapshot_index = logSnapshotIndex(r->log);
raft_index next_index = progressNextIndex(r, i);
raft_index prev_index;
raft_term prev_term;
assert(r->state == RAFT_LEADER);
assert(server->id != r->id);
assert(next_index >= 1);
if (!progressShouldReplicate(r, i)) {
return 0;
}
/* From Section 3.5:
*
* When sending an AppendEntries RPC, the leader includes the index
* and term of the entry in its log that immediately precedes the new
* entries. If the follower does not find an entry in its log with the
* same index and term, then it refuses the new entries. The
* consistency check acts as an induction step: the initial empty state
* of the logs satisfies the Log Matching Property, and the consistency
* check preserves the Log Matching Property whenever logs are extended.
* As a result, whenever AppendEntries returns successfully, the leader
* knows that the follower's log is identical to its own log up through
* the new entries (Log Matching Property in Figure 3.2).
*/
if (next_index == 1) {
/* We're including the very first entry, so prevIndex and
* prevTerm are null. If the first entry is not available
* anymore, send the last snapshot if we're not already sending
* one. */
if (snapshot_index > 0 && !progress_state_is_snapshot) {
raft_index last_index = logLastIndex(r->log);
assert(last_index > 0); /* The log can't be empty */
goto send_snapshot;
}
prev_index = 0;
prev_term = 0;
} else {
/* Set prevIndex and prevTerm to the index and term of the entry
* at next_index - 1. */
prev_index = next_index - 1;
prev_term = logTermOf(r->log, prev_index);
/* If the entry is not anymore in our log, send the last
* snapshot if we're not doing so already. */
if (prev_term == 0 && !progress_state_is_snapshot) {
assert(prev_index < snapshot_index);
tracef("missing entry at index %lld -> send snapshot",
prev_index);
goto send_snapshot;
}
}
	/* Send an empty AppendEntries RPC when installing a snapshot */
if (progress_state_is_snapshot) {
prev_index = logLastIndex(r->log);
prev_term = logLastTerm(r->log);
}
return sendAppendEntries(r, i, prev_index, prev_term);
send_snapshot:
if (progressGetRecentRecv(r, i)) {
/* Only send a snapshot when we have heard from the server */
return sendSnapshot(r, i);
} else {
/* Send empty AppendEntries RPC when we haven't heard from the
* server */
prev_index = logLastIndex(r->log);
prev_term = logLastTerm(r->log);
return sendAppendEntries(r, i, prev_index, prev_term);
}
}
/* Possibly trigger I/O requests for newly appended log entries or heartbeats.
*
* This function loops through all followers and triggers replication on them.
*
* It must be called only by leaders. */
static int triggerAll(struct raft *r)
{
unsigned i;
int rv;
assert(r->state == RAFT_LEADER);
/* Trigger replication for servers we didn't hear from recently. */
for (i = 0; i < r->configuration.n; i++) {
struct raft_server *server = &r->configuration.servers[i];
if (server->id == r->id) {
continue;
}
/* Skip spare servers, unless they're being promoted. */
if (server->role == RAFT_SPARE &&
server->id != r->leader_state.promotee_id) {
continue;
}
rv = replicationProgress(r, i);
if (rv != 0 && rv != RAFT_NOCONNECTION) {
/* This is not a critical failure, let's just log it. */
tracef(
"failed to send append entries to server %llu: %s "
"(%d)",
server->id, raft_strerror(rv), rv);
}
}
return 0;
}
int replicationHeartbeat(struct raft *r)
{
return triggerAll(r);
}
/* Context for a write log entries request that was submitted by a leader. */
struct appendLeader
{
struct raft *raft; /* Instance that has submitted the request */
raft_index index; /* Index of the first entry in the request. */
struct raft_entry *entries; /* Entries referenced in the request. */
unsigned n; /* Length of the entries array. */
struct raft_io_append req;
};
/* Called after a successful append entries I/O request to update the index of
 * the last entry stored on disk. Return the number of newly stored entries
 * that are still present in our in-memory log. */
static size_t updateLastStored(struct raft *r,
raft_index first_index,
struct raft_entry *entries,
size_t n_entries)
{
size_t i;
/* Check which of these entries is still in our in-memory log */
for (i = 0; i < n_entries; i++) {
struct raft_entry *entry = &entries[i];
raft_index index = first_index + i;
raft_term local_term = logTermOf(r->log, index);
/* If we have no entry at this index, or if the entry we have
* now has a different term, it means that this entry got
* truncated, so let's stop here. */
if (local_term == 0 ||
(local_term > 0 && local_term != entry->term)) {
break;
}
/* If we do have an entry at this index, its term must match the
* one of the entry we wrote on disk. */
assert(local_term != 0 && local_term == entry->term);
}
r->last_stored += i;
return i;
}
/* Get the request matching the given @index and @type, if any. The type check
 * is skipped when @type == -1. The matched request is removed from the queue
 * of pending requests before being returned. */
static struct request *getRequest(struct raft *r,
const raft_index index,
int type)
{
queue *head;
struct request *req;
if (r->state != RAFT_LEADER) {
return NULL;
}
QUEUE_FOREACH(head, &r->leader_state.requests)
{
req = QUEUE_DATA(head, struct request, queue);
if (req->index == index) {
if (type != -1) {
assert(req->type == type);
}
lifecycleRequestEnd(r, req);
return req;
}
}
return NULL;
}
/* Invoked once a disk write request for new entries has been completed. */
static void appendLeaderCb(struct raft_io_append *append, int status)
{
struct appendLeader *request = append->data;
struct raft *r = request->raft;
size_t server_index;
raft_index index;
int rv;
tracef("leader: written %u entries starting at %lld: status %d",
request->n, request->index, status);
/* In case of a failed disk write, if we were the leader creating these
* entries in the first place, truncate our log too (since we have
* appended these entries to it) and fire the request callbacks.
*
* Afterward, convert immediately to follower state, giving the cluster
* a chance to elect another leader that doesn't have a full disk (or
* whatever caused our write error). */
if (status != 0) {
ErrMsgTransfer(r->io->errmsg, r->errmsg, "io");
for (unsigned i = 0; i < request->n; i++) {
const struct request *req =
getRequest(r, request->index + i, -1);
if (!req) {
tracef("no request found at index %llu",
request->index + i);
continue;
}
switch (req->type) {
case RAFT_COMMAND: {
struct raft_apply *apply =
(struct raft_apply *)req;
if (apply->cb) {
apply->cb(apply, status, NULL);
}
break;
}
case RAFT_BARRIER: {
struct raft_barrier *barrier =
(struct raft_barrier *)req;
if (barrier->cb) {
barrier->cb(barrier, status);
}
break;
}
case RAFT_CHANGE: {
struct raft_change *change =
(struct raft_change *)req;
if (change->cb) {
change->cb(change, status);
}
break;
}
default:
tracef(
"unknown request type, shutdown.");
assert(false);
break;
}
}
goto out;
}
updateLastStored(r, request->index, request->entries, request->n);
/* If we are not leader anymore, just discard the result. */
if (r->state != RAFT_LEADER) {
tracef("local server is not leader -> ignore write log result");
goto out;
}
/* Only update the next index if we are part of the current
* configuration. The only case where this is not true is when we were
* asked to remove ourselves from the cluster.
*
* From Section 4.2.2:
*
* there will be a period of time (while it is committing Cnew) when a
* leader can manage a cluster that does not include itself; it
* replicates log entries but does not count itself in majorities.
*/
server_index = configurationIndexOf(&r->configuration, r->id);
if (server_index < r->configuration.n) {
r->leader_state.progress[server_index].match_index =
r->last_stored;
}
/* Check if we can commit some new entries. */
replicationQuorum(r, r->last_stored);
rv = replicationApply(r);
if (rv != 0) {
/* TODO: just log the error? */
}
out:
/* Tell the log that we're done referencing these entries. */
logRelease(r->log, request->index, request->entries, request->n);
index = request->index;
raft_free(request);
if (status != 0) {
if (index <= logLastIndex(r->log)) {
logTruncate(r->log, index);
}
if (r->state == RAFT_LEADER) {
convertToFollower(r);
}
}
}
/* Submit a disk write for all entries from the given index onward. */
static int appendLeader(struct raft *r, raft_index index)
{
struct raft_entry *entries = NULL;
unsigned n;
struct appendLeader *request;
int rv;
assert(r->state == RAFT_LEADER);
assert(index > 0);
assert(index > r->last_stored);
/* Acquire all the entries from the given index onwards. */
rv = logAcquire(r->log, index, &entries, &n);
if (rv != 0) {
goto err;
}
/* We expect this function to be called only when there are actually
* some entries to write. */
if (n == 0) {
assert(false);
tracef("No log entries found at index %llu", index);
ErrMsgPrintf(r->errmsg, "No log entries found at index %llu",
index);
rv = RAFT_SHUTDOWN;
goto err_after_entries_acquired;
}
/* Allocate a new request. */
request = raft_malloc(sizeof *request);
if (request == NULL) {
rv = RAFT_NOMEM;
goto err_after_entries_acquired;
}
request->raft = r;
request->index = index;
request->entries = entries;
request->n = n;
request->req.data = request;
rv = r->io->append(r->io, &request->req, entries, n, appendLeaderCb);
if (rv != 0) {
ErrMsgTransfer(r->io->errmsg, r->errmsg, "io");
goto err_after_request_alloc;
}
return 0;
err_after_request_alloc:
raft_free(request);
err_after_entries_acquired:
logRelease(r->log, index, entries, n);
err:
assert(rv != 0);
return rv;
}
int replicationTrigger(struct raft *r, raft_index index)
{
int rv;
rv = appendLeader(r, index);
if (rv != 0) {
return rv;
}
return triggerAll(r);
}
/* Helper to be invoked after a promotion of a non-voting server has been
* requested via @raft_assign and that server has caught up with logs.
*
* This function changes the local configuration, marking the server being
* promoted as a voter, appends a RAFT_CHANGE entry with the new
* configuration to the local log, and triggers its replication. */
static int triggerActualPromotion(struct raft *r)
{
raft_index index;
raft_term term = r->current_term;
size_t server_index;
struct raft_server *server;
int old_role;
int rv;
assert(r->state == RAFT_LEADER);
assert(r->leader_state.promotee_id != 0);
server_index = configurationIndexOf(&r->configuration,
r->leader_state.promotee_id);
assert(server_index < r->configuration.n);
server = &r->configuration.servers[server_index];
assert(server->role != RAFT_VOTER);
/* Update our current configuration. */
old_role = server->role;
server->role = RAFT_VOTER;
/* Index of the entry being appended. */
index = logLastIndex(r->log) + 1;
/* Encode the new configuration and append it to the log. */
rv = logAppendConfiguration(r->log, term, &r->configuration);
if (rv != 0) {
goto err;
}
/* Start writing the new log entry to disk and send it to the followers.
*/
rv = replicationTrigger(r, index);
if (rv != 0) {
goto err_after_log_append;
}
r->leader_state.promotee_id = 0;
r->configuration_uncommitted_index = logLastIndex(r->log);
return 0;
err_after_log_append:
logTruncate(r->log, index);
err:
server->role = old_role;
assert(rv != 0);
return rv;
}
int replicationUpdate(struct raft *r,
const struct raft_server *server,
const struct raft_append_entries_result *result)
{
bool is_being_promoted;
raft_index last_index;
unsigned i;
int rv;
i = configurationIndexOf(&r->configuration, server->id);
assert(r->state == RAFT_LEADER);
assert(i < r->configuration.n);
progressMarkRecentRecv(r, i);
progressSetFeatures(r, i, result->features);
/* If the RPC failed because of a log mismatch, retry.
*
* From Figure 3.1:
*
* [Rules for servers] Leaders:
*
* - If AppendEntries fails because of log inconsistency:
* decrement nextIndex and retry.
*/
if (result->rejected > 0) {
bool retry;
retry = progressMaybeDecrement(r, i, result->rejected,
result->last_log_index);
if (retry) {
/* Retry, ignoring errors. */
tracef("log mismatch -> send old entries to %llu",
server->id);
replicationProgress(r, i);
}
return 0;
}
/* In case of success the remote server is expected to send us back the
* value of prevLogIndex + len(entriesToAppend). If it has a longer log,
* it might be a leftover from previous terms. */
last_index = result->last_log_index;
if (last_index > logLastIndex(r->log)) {
last_index = logLastIndex(r->log);
}
/* If the RPC succeeded, update our counters for this server.
*
* From Figure 3.1:
*
* [Rules for servers] Leaders:
*
* If successful update nextIndex and matchIndex for follower.
*/
if (!progressMaybeUpdate(r, i, last_index)) {
return 0;
}
switch (progressState(r, i)) {
case PROGRESS__SNAPSHOT:
/* If a snapshot has been installed, transition back to
* probe */
if (progressSnapshotDone(r, i)) {
progressToProbe(r, i);
}
break;
case PROGRESS__PROBE:
/* Transition to pipeline */
progressToPipeline(r, i);
}
/* If the server is currently being promoted and is catching up with our
* logs, update the information about the current catch-up round, and
* possibly proceed with the promotion. */
is_being_promoted = r->leader_state.promotee_id != 0 &&
r->leader_state.promotee_id == server->id;
if (is_being_promoted) {
bool is_up_to_date = membershipUpdateCatchUpRound(r);
if (is_up_to_date) {
rv = triggerActualPromotion(r);
if (rv != 0) {
return rv;
}
}
}
/* Check if we can commit some new entries. */
replicationQuorum(r, last_index);
rv = replicationApply(r);
if (rv != 0) {
/* TODO: just log the error? */
}
/* Abort here if we have been removed and are no longer leader. */
if (r->state != RAFT_LEADER) {
goto out;
}
/* Look up the server index again, since the server might have been
* removed from the configuration. */
i = configurationIndexOf(&r->configuration, server->id);
if (i < r->configuration.n) {
/* If we are transferring leadership to this follower, check if
* its log is now up-to-date and, if so, send it a TimeoutNow
* RPC (unless we already did). */
if (r->transfer != NULL && r->transfer->id == server->id) {
if (progressPersistedIsUpToDate(r, i) &&
r->transfer->send.data == NULL) {
rv = membershipLeadershipTransferStart(r);
if (rv != 0) {
membershipLeadershipTransferClose(r);
}
}
}
/* If this follower is in pipeline mode, send it more entries.
*/
if (progressState(r, i) == PROGRESS__PIPELINE) {
replicationProgress(r, i);
}
}
out:
return 0;
}
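/* Worked example (illustrative, not part of the original source): suppose
 * the leader sent AppendEntries with prev_log_index = 10 and 5 entries. A
 * follower whose log does not match at index 10 replies with rejected = 10
 * and last_log_index = 8; replicationUpdate() then decrements that
 * follower's next index and retries with older entries. A follower that
 * appends successfully replies with rejected = 0 and last_log_index = 15,
 * letting the leader advance match_index and possibly the commit index. */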
static void sendAppendEntriesResultCb(struct raft_io_send *req, int status)
{
(void)status;
RaftHeapFree(req);
}
static void sendAppendEntriesResult(
struct raft *r,
const struct raft_append_entries_result *result)
{
struct raft_message message;
struct raft_io_send *req;
int rv;
assert(r->state == RAFT_FOLLOWER);
message.type = RAFT_IO_APPEND_ENTRIES_RESULT;
message.server_id = r->follower_state.current_leader.id;
message.server_address = r->follower_state.current_leader.address;
message.append_entries_result = *result;
req = raft_malloc(sizeof *req);
if (req == NULL) {
return;
}
req->data = r;
rv = r->io->send(r->io, req, &message, sendAppendEntriesResultCb);
if (rv != 0) {
raft_free(req);
}
}
/* Context for a write log entries request that was submitted by a follower. */
struct appendFollower
{
struct raft *raft; /* Instance that has submitted the request */
raft_index index; /* Index of the first entry in the request. */
struct raft_append_entries args;
struct raft_io_append req;
};
static void appendFollowerCb(struct raft_io_append *req, int status)
{
struct appendFollower *request = req->data;
struct raft *r = request->raft;
struct raft_append_entries *args = &request->args;
struct raft_append_entries_result result;
size_t i;
size_t j;
int rv;
tracef("I/O completed on follower: status %d", status);
assert(args->entries != NULL);
assert(args->n_entries > 0);
assert(r->state == RAFT_FOLLOWER || r->state == RAFT_UNAVAILABLE);
if (r->state == RAFT_UNAVAILABLE) {
goto out;
}
assert(r->follower_state.append_in_flight_count > 0);
r->follower_state.append_in_flight_count -= 1;
result.term = r->current_term;
result.version = RAFT_APPEND_ENTRIES_RESULT_VERSION;
result.features = RAFT_DEFAULT_FEATURE_FLAGS;
if (status != 0) {
ErrMsgTransfer(r->io->errmsg, r->errmsg, "io");
result.rejected = args->prev_log_index + 1;
goto respond;
}
/* We received an InstallSnapshot RPC while these entries were being
* persisted to disk */
if (replicationInstallSnapshotBusy(r)) {
goto out;
}
i = updateLastStored(r, request->index, args->entries, args->n_entries);
/* If none of the entries that we persisted is present anymore in our
* in-memory log, there's nothing to report or to do. We just discard
* them. */
if (i == 0) {
goto out;
}
/* Possibly apply configuration changes as uncommitted. */
for (j = 0; j < i; j++) {
struct raft_entry *entry = &args->entries[j];
raft_index index = request->index + j;
raft_term local_term = logTermOf(r->log, index);
assert(local_term != 0 && local_term == entry->term);
if (entry->type == RAFT_CHANGE) {
rv = membershipUncommittedChange(r, index, entry);
if (rv != 0) {
goto out;
}
}
}
/* From Figure 3.1:
*
* AppendEntries RPC: Receiver implementation: If leaderCommit >
* commitIndex, set commitIndex = min(leaderCommit, index of last new
* entry).
*/
if (args->leader_commit > r->commit_index &&
r->last_stored >= r->commit_index) {
r->commit_index = min(args->leader_commit, r->last_stored);
rv = replicationApply(r);
if (rv != 0) {
goto out;
}
}
/* If our term number has changed since receiving these entries,
* our current_leader may have changed as well, so don't send a response
* to that server. */
if (r->current_term != args->term) {
tracef(
"new role or term since receiving entries -> don't "
"respond");
goto out;
}
result.rejected = 0;
respond:
result.last_log_index = r->last_stored;
sendAppendEntriesResult(r, &result);
out:
logRelease(r->log, request->index, request->args.entries,
request->args.n_entries);
/* If the write failed, we need to truncate the log. */
if (status != 0) {
if (request->index <= logLastIndex(r->log)) {
logTruncate(r->log, request->index);
}
}
raft_free(request);
}
/* Check the log matching property against an incoming AppendEntries request.
*
* From Figure 3.1:
*
* [AppendEntries RPC] Receiver implementation:
*
* 2. Reply false if log doesn't contain an entry at prevLogIndex whose
* term matches prevLogTerm.
*
* Return 0 if the check passed.
*
* Return 1 if the check did not pass and the request needs to be rejected.
*
* Return -1 if there's a conflict and we need to shut down. */
static int checkLogMatchingProperty(struct raft *r,
const struct raft_append_entries *args)
{
raft_term local_prev_term;
/* If this is the very first entry, there's nothing to check. */
if (args->prev_log_index == 0) {
return 0;
}
local_prev_term = logTermOf(r->log, args->prev_log_index);
if (local_prev_term == 0) {
tracef("no entry at index %llu -> reject",
args->prev_log_index);
return 1;
}
if (local_prev_term != args->prev_log_term) {
if (args->prev_log_index <= r->commit_index) {
/* Should never happen; something is seriously wrong! */
tracef(
"conflicting terms %llu and %llu for entry %llu "
"(commit "
"index %llu) -> shutdown",
local_prev_term, args->prev_log_term,
args->prev_log_index, r->commit_index);
return -1;
}
tracef("previous term mismatch -> reject");
return 1;
}
return 0;
}
/* Delete from our log all entries that conflict with the ones in the given
* AppendEntries request.
*
* From Figure 3.1:
*
* [AppendEntries RPC] Receiver implementation:
*
* 3. If an existing entry conflicts with a new one (same index but
* different terms), delete the existing entry and all that follow it.
*
* The i output parameter will be set to the array index of the first new log
* entry that we don't have yet in our log, among the ones included in the given
* AppendEntries request. */
static int deleteConflictingEntries(struct raft *r,
const struct raft_append_entries *args,
size_t *i)
{
size_t j;
int rv;
for (j = 0; j < args->n_entries; j++) {
struct raft_entry *entry = &args->entries[j];
raft_index entry_index = args->prev_log_index + 1 + j;
raft_term local_term = logTermOf(r->log, entry_index);
if (local_term > 0 && local_term != entry->term) {
if (entry_index <= r->commit_index) {
/* Should never happen; something is seriously
* wrong! */
tracef(
"new index conflicts with committed entry "
"-> shutdown");
return RAFT_SHUTDOWN;
}
tracef("log mismatch -> truncate (%llu)", entry_index);
/* Possibly discard uncommitted configuration changes.
*/
if (r->configuration_uncommitted_index >= entry_index) {
rv = membershipRollback(r);
if (rv != 0) {
return rv;
}
}
/* Delete all entries from this index on because they
* don't match. */
rv = r->io->truncate(r->io, entry_index);
if (rv != 0) {
return rv;
}
logTruncate(r->log, entry_index);
/* Drop information about previously stored entries that
* have just been discarded. */
if (r->last_stored >= entry_index) {
r->last_stored = entry_index - 1;
}
/* We want to append all entries from here on, replacing
* anything that we had before. */
break;
} else if (local_term == 0) {
/* We don't have an entry at this index, so we want to
* append this new one and all the subsequent ones. */
break;
}
}
*i = j;
return 0;
}
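/* Worked example (illustrative): suppose the request has prev_log_index = 5
 * and carries entries for indexes 6 and 7 with terms {2, 3}, while our log
 * stores terms {2, 2} at those indexes. Index 6 matches, index 7 conflicts,
 * so the log is truncated from index 7 onward and *i is set to 1: only the
 * entry for index 7 is new and needs to be appended. */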
int replicationAppend(struct raft *r,
const struct raft_append_entries *args,
raft_index *rejected,
bool *async)
{
struct appendFollower *request;
int match;
size_t n;
size_t i;
size_t j;
bool reinstated;
int rv;
assert(r != NULL);
assert(args != NULL);
assert(rejected != NULL);
assert(async != NULL);
assert(r->state == RAFT_FOLLOWER);
*rejected = args->prev_log_index;
*async = false;
/* Check the log matching property. */
match = checkLogMatchingProperty(r, args);
if (match != 0) {
assert(match == 1 || match == -1);
return match == 1 ? 0 : RAFT_SHUTDOWN;
}
/* Delete conflicting entries. */
rv = deleteConflictingEntries(r, args, &i);
if (rv != 0) {
return rv;
}
*rejected = 0;
n = args->n_entries - i; /* Number of new entries */
/* If this is an empty AppendEntries, there's nothing to write. However,
* we still want to check whether we can commit some entries. Don't
* commit anything while a snapshot install is in progress, since
* r->last_stored will be 0 in that case.
*
* From Figure 3.1:
*
* AppendEntries RPC: Receiver implementation: If leaderCommit >
* commitIndex, set commitIndex = min(leaderCommit, index of last new
* entry).
*/
if (n == 0) {
if ((args->leader_commit > r->commit_index) &&
r->last_stored >= r->commit_index &&
!replicationInstallSnapshotBusy(r)) {
r->commit_index =
min(args->leader_commit, r->last_stored);
rv = replicationApply(r);
if (rv != 0) {
return rv;
}
}
return 0;
}
*async = true;
request = raft_malloc(sizeof *request);
if (request == NULL) {
rv = RAFT_NOMEM;
goto err;
}
request->raft = r;
request->args = *args;
/* Index of first new entry */
request->index = args->prev_log_index + 1 + i;
/* Update our in-memory log to reflect that we received these entries.
* We'll notify the leader of a successful append once the write entries
* request that we issue below actually completes. */
for (j = 0; j < n; j++) {
struct raft_entry *entry = &args->entries[i + j];
/* We are trying to append an entry at index X with term T to
* our in-memory log. If we've gotten this far, we know that the
* log *logically* has no entry at this index. However, it's
* possible that we're still hanging on to such an entry,
* because we previously tried to append and replicate it, and
* the associated disk write failed, but some send requests are
* still pending that refer to it. Since the log is not capable
* of tracking multiple independent entries that share an index
* and term, we just piggyback on the already-stored entry in
* this case. */
rv =
logReinstate(r->log, entry->term, entry->type, &reinstated);
if (rv != 0) {
goto err_after_request_alloc;
} else if (reinstated) {
continue;
}
/* TODO This copy should not strictly be necessary, as the batch
* logic will take care of freeing the batch buffer in which the
* entries are received. However, this would lead to memory
* spikes in certain edge cases.
* https://github.com/canonical/dqlite/issues/276
*/
struct raft_entry copy = {0};
rv = entryCopy(entry, &copy);
if (rv != 0) {
goto err_after_request_alloc;
}
rv = logAppend(r->log, copy.term, copy.type, copy.buf, (struct raft_entry_local_data){}, false, NULL);
if (rv != 0) {
goto err_after_request_alloc;
}
}
/* Acquire the relevant entries from the log. */
rv = logAcquire(r->log, request->index, &request->args.entries,
&request->args.n_entries);
if (rv != 0) {
goto err_after_request_alloc;
}
assert(request->args.n_entries == n);
if (request->args.n_entries == 0) {
tracef("No log entries found at index %llu", request->index);
ErrMsgPrintf(r->errmsg, "No log entries found at index %llu",
request->index);
rv = RAFT_SHUTDOWN;
goto err_after_acquire_entries;
}
request->req.data = request;
rv = r->io->append(r->io, &request->req, request->args.entries,
request->args.n_entries, appendFollowerCb);
if (rv != 0) {
ErrMsgTransfer(r->io->errmsg, r->errmsg, "io");
goto err_after_acquire_entries;
}
r->follower_state.append_in_flight_count += 1;
entryBatchesDestroy(args->entries, args->n_entries);
return 0;
err_after_acquire_entries:
/* Release the entries related to the IO request */
logRelease(r->log, request->index, request->args.entries,
request->args.n_entries);
err_after_request_alloc:
/* Release all entries added to the in-memory log, making sure the
* in-memory log and the on-disk log don't diverge, which would lead to
* future log entries not being persisted to disk. */
if (j != 0) {
logTruncate(r->log, request->index);
}
raft_free(request);
err:
assert(rv != 0);
return rv;
}
struct recvInstallSnapshot
{
struct raft *raft;
struct raft_snapshot snapshot;
raft_term term; /* Used to check for state transitions. */
};
static void installSnapshotCb(struct raft_io_snapshot_put *req, int status)
{
struct recvInstallSnapshot *request = req->data;
struct raft *r = request->raft;
struct raft_snapshot *snapshot = &request->snapshot;
struct raft_append_entries_result result;
bool should_respond = true;
int rv;
/* We avoid converting to candidate state while installing a snapshot.
*/
assert(r->state == RAFT_FOLLOWER || r->state == RAFT_UNAVAILABLE);
r->snapshot.put.data = NULL;
result.term = r->current_term;
result.version = RAFT_APPEND_ENTRIES_RESULT_VERSION;
result.features = RAFT_DEFAULT_FEATURE_FLAGS;
result.rejected = 0;
/* If we are shutting down, let's discard the result. */
if (r->state == RAFT_UNAVAILABLE) {
tracef(
"shutting down -> discard result of snapshot installation");
should_respond = false;
goto discard;
}
/* If the request is from a previous term, it means that someone else
* became a candidate while we were installing the snapshot. In that
* case, we want to install the snapshot anyway, but our "current
* leader" may no longer be the same as the server that sent the install
* request, so we shouldn't send a response to that server. */
if (request->term != r->current_term) {
tracef(
"new term since receiving snapshot -> install but don't "
"respond");
should_respond = false;
}
if (status != 0) {
tracef("save snapshot %llu: %s", snapshot->index,
raft_strerror(status));
goto discard;
}
/* From Figure 5.3:
*
* 7. Discard the entire log
* 8. Reset state machine using snapshot contents (and load lastConfig
* as cluster configuration).
*/
rv = snapshotRestore(r, snapshot);
if (rv != 0) {
tracef("restore snapshot %llu: %s", snapshot->index,
raft_strerror(status));
goto discard;
}
tracef("restored snapshot with last index %llu", snapshot->index);
goto respond;
discard:
/* In case of error we must also free the snapshot data buffer and free
* the configuration. */
result.rejected = snapshot->index;
raft_free(snapshot->bufs[0].base);
raft_free(snapshot->bufs);
raft_configuration_close(&snapshot->configuration);
respond:
if (should_respond) {
result.last_log_index = r->last_stored;
sendAppendEntriesResult(r, &result);
}
raft_free(request);
}
int replicationInstallSnapshot(struct raft *r,
const struct raft_install_snapshot *args,
raft_index *rejected,
bool *async)
{
struct recvInstallSnapshot *request;
struct raft_snapshot *snapshot;
raft_term local_term;
int rv;
assert(r->state == RAFT_FOLLOWER);
*rejected = args->last_index;
*async = false;
/* If we are taking a snapshot ourselves or installing a snapshot,
* ignore the request; the leader will eventually retry. TODO: we should
* do something smarter. */
if (r->snapshot.pending.term != 0 || r->snapshot.put.data != NULL) {
*async = true;
tracef("already taking or installing snapshot");
return RAFT_BUSY;
}
/* If our last snapshot is more up-to-date, this is a no-op */
if (r->log->snapshot.last_index >= args->last_index) {
tracef("have more recent snapshot");
*rejected = 0;
return 0;
}
/* If we already have all entries in the snapshot, this is a no-op */
local_term = logTermOf(r->log, args->last_index);
if (local_term != 0 && local_term >= args->last_term) {
tracef("have all entries");
*rejected = 0;
return 0;
}
*async = true;
/* Preemptively update our in-memory state. */
logRestore(r->log, args->last_index, args->last_term);
r->last_stored = 0;
request = raft_malloc(sizeof *request);
if (request == NULL) {
rv = RAFT_NOMEM;
goto err;
}
request->raft = r;
request->term = r->current_term;
snapshot = &request->snapshot;
snapshot->term = args->last_term;
snapshot->index = args->last_index;
snapshot->configuration_index = args->conf_index;
snapshot->configuration = args->conf;
snapshot->bufs = raft_malloc(sizeof *snapshot->bufs);
if (snapshot->bufs == NULL) {
rv = RAFT_NOMEM;
goto err_after_request_alloc;
}
snapshot->bufs[0] = args->data;
snapshot->n_bufs = 1;
assert(r->snapshot.put.data == NULL);
r->snapshot.put.data = request;
rv = r->io->snapshot_put(r->io,
0 /* zero trailing means replace everything */,
&r->snapshot.put, snapshot, installSnapshotCb);
if (rv != 0) {
tracef("snapshot_put failed %d", rv);
goto err_after_bufs_alloc;
}
return 0;
err_after_bufs_alloc:
raft_free(snapshot->bufs);
r->snapshot.put.data = NULL;
err_after_request_alloc:
raft_free(request);
err:
assert(rv != 0);
return rv;
}
/* Apply a RAFT_COMMAND entry that has been committed. */
static int applyCommand(struct raft *r,
const raft_index index,
const struct raft_buffer *buf)
{
struct raft_apply *req;
void *result;
int rv;
rv = r->fsm->apply(r->fsm, buf, &result);
if (rv != 0) {
return rv;
}
r->last_applied = index;
req = (struct raft_apply *)getRequest(r, index, RAFT_COMMAND);
if (req != NULL && req->cb != NULL) {
req->cb(req, 0, result);
}
return 0;
}
/* Fire the callback of a barrier request whose entry has been committed. */
static void applyBarrier(struct raft *r, const raft_index index)
{
r->last_applied = index;
struct raft_barrier *req;
req = (struct raft_barrier *)getRequest(r, index, RAFT_BARRIER);
if (req != NULL && req->cb != NULL) {
req->cb(req, 0);
}
}
/* Apply a RAFT_CHANGE entry that has been committed. */
static void applyChange(struct raft *r, const raft_index index)
{
struct raft_change *req;
assert(index > 0);
/* If this is an uncommitted configuration that we had already applied
* when submitting the configuration change (for leaders) or upon
* receiving it via an AppendEntries RPC (for followers), then reset the
* uncommitted index, since that uncommitted configuration is now
* committed. */
if (r->configuration_uncommitted_index == index) {
tracef("configuration at index:%llu is committed.", index);
r->configuration_uncommitted_index = 0;
}
r->configuration_committed_index = index;
r->last_applied = index;
if (r->state == RAFT_LEADER) {
const struct raft_server *server;
req = r->leader_state.change;
r->leader_state.change = NULL;
/* If we are leader but not part of this new configuration, step
* down.
*
* From Section 4.2.2:
*
* In this approach, a leader that is removed from the
* configuration steps down once the Cnew entry is committed.
*/
server = configurationGet(&r->configuration, r->id);
if (server == NULL || server->role != RAFT_VOTER) {
tracef(
"leader removed from config or no longer voter "
"server: %p",
(void *)server);
convertToFollower(r);
}
if (req != NULL && req->cb != NULL) {
req->cb(req, 0);
}
}
}
static bool shouldTakeSnapshot(struct raft *r)
{
/* If we are shutting down, let's not do anything. */
if (r->state == RAFT_UNAVAILABLE) {
return false;
}
/* If a snapshot is already in progress or we're installing a snapshot,
* we don't want to start another one. */
if (r->snapshot.pending.term != 0 || r->snapshot.put.data != NULL) {
return false;
};
/* If we didn't reach the threshold yet, do nothing. */
if (r->last_applied - r->log->snapshot.last_index <
r->snapshot.threshold) {
return false;
}
return true;
}
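/* Worked example (illustrative): with snapshot.threshold = 1024, a snapshot
 * whose last_index is 8192, and last_applied = 9000, the distance is 808,
 * which is below the threshold, so no new snapshot is taken; once
 * last_applied reaches 9216 the threshold is met and takeSnapshot() runs. */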
/*
* When taking a snapshot, ownership of the snapshot data is with raft if
* `snapshot_finalize` is NULL.
*/
static void takeSnapshotClose(struct raft *r, struct raft_snapshot *s)
{
if (r->fsm->version == 1 ||
(r->fsm->version > 1 && r->fsm->snapshot_finalize == NULL)) {
snapshotClose(s);
return;
}
configurationClose(&s->configuration);
r->fsm->snapshot_finalize(r->fsm, &s->bufs, &s->n_bufs);
}
static void takeSnapshotCb(struct raft_io_snapshot_put *req, int status)
{
struct raft *r = req->data;
struct raft_snapshot *snapshot;
int rv;
r->snapshot.put.data = NULL;
snapshot = &r->snapshot.pending;
if (status != 0) {
tracef("snapshot %lld at term %lld: %s", snapshot->index,
snapshot->term, raft_strerror(status));
goto out;
}
/* Cache the configuration contained in the snapshot. While the snapshot
* was being written, new configuration changes could have been
* committed; those changes will not be purged from the log by this
* snapshot. However, we still cache the configuration for consistency. */
configurationClose(&r->configuration_last_snapshot);
rv = configurationCopy(&snapshot->configuration,
&r->configuration_last_snapshot);
if (rv != 0) {
/* TODO: make this a hard fault, because if we have no backup
* and the log was truncated it will be impossible to roll back
* an aborted configuration change. */
tracef("failed to back up last committed configuration.");
}
logSnapshot(r->log, snapshot->index, r->snapshot.trailing);
out:
takeSnapshotClose(r, snapshot);
r->snapshot.pending.term = 0;
}
static int putSnapshot(struct raft *r,
struct raft_snapshot *snapshot,
raft_io_snapshot_put_cb cb)
{
int rv;
assert(r->snapshot.put.data == NULL);
r->snapshot.put.data = r;
rv = r->io->snapshot_put(r->io, r->snapshot.trailing, &r->snapshot.put,
snapshot, cb);
if (rv != 0) {
takeSnapshotClose(r, snapshot);
r->snapshot.pending.term = 0;
r->snapshot.put.data = NULL;
}
return rv;
}
static void takeSnapshotDoneCb(struct raft_io_async_work *take, int status)
{
struct raft *r = take->data;
struct raft_snapshot *snapshot = &r->snapshot.pending;
int rv;
raft_free(take);
if (status != 0) {
tracef("take snapshot failed %s", raft_strerror(status));
takeSnapshotClose(r, snapshot);
r->snapshot.pending.term = 0;
r->snapshot.put.data = NULL;
return;
}
rv = putSnapshot(r, snapshot, takeSnapshotCb);
if (rv != 0) {
tracef("put snapshot failed %d", rv);
}
}
static int takeSnapshotAsync(struct raft_io_async_work *take)
{
struct raft *r = take->data;
tracef("take snapshot async at %lld", r->snapshot.pending.index);
struct raft_snapshot *snapshot = &r->snapshot.pending;
return r->fsm->snapshot_async(r->fsm, &snapshot->bufs,
&snapshot->n_bufs);
}
static int takeSnapshot(struct raft *r)
{
struct raft_snapshot *snapshot;
int rv;
tracef("take snapshot at %lld", r->last_applied);
snapshot = &r->snapshot.pending;
snapshot->index = r->last_applied;
snapshot->term = logTermOf(r->log, r->last_applied);
snapshot->bufs = NULL;
snapshot->n_bufs = 0;
rv = membershipFetchLastCommittedConfiguration(
r, &snapshot->configuration);
if (rv != 0) {
goto abort;
}
snapshot->configuration_index = r->configuration_committed_index;
rv = r->fsm->snapshot(r->fsm, &snapshot->bufs, &snapshot->n_bufs);
if (rv != 0) {
/* Ignore transient errors. We'll retry next time. */
if (rv == RAFT_BUSY) {
rv = 0;
}
raft_configuration_close(&snapshot->configuration);
goto abort;
}
bool sync_snapshot =
r->fsm->version < 3 || r->fsm->snapshot_async == NULL;
if (sync_snapshot) {
/* putSnapshot will clean up config and buffers in case of error
*/
return putSnapshot(r, snapshot, takeSnapshotCb);
} else {
struct raft_io_async_work *take = raft_malloc(sizeof(*take));
if (take == NULL) {
rv = RAFT_NOMEM;
goto abort_after_snapshot;
}
take->data = r;
take->work = takeSnapshotAsync;
rv = r->io->async_work(r->io, take, takeSnapshotDoneCb);
if (rv != 0) {
raft_free(take);
goto abort_after_snapshot;
}
}
return 0;
abort_after_snapshot:
/* Closes config and finalizes snapshot */
takeSnapshotClose(r, snapshot);
abort:
r->snapshot.pending.term = 0;
return rv;
}
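/* Illustrative sketch (not part of the original source): an FSM that opts
 * into the asynchronous snapshot path used above must declare version >= 3
 * and provide a snapshot_async callback alongside snapshot. The helper
 * names below are hypothetical and only show the expected shape. */
#if 0
static int example_fsm_snapshot(struct raft_fsm *fsm,
                                struct raft_buffer *bufs[],
                                unsigned *n_bufs)
{
        /* Synchronous part: capture a cheap, consistent view of the state;
         * heavy copying is deferred to the async callback. */
        (void)fsm;
        *bufs = NULL;
        *n_bufs = 0;
        return 0;
}

static int example_fsm_snapshot_async(struct raft_fsm *fsm,
                                      struct raft_buffer *bufs[],
                                      unsigned *n_bufs)
{
        /* Runs on a worker thread via io->async_work (see
         * takeSnapshotAsync() above); fills bufs with the snapshot data. */
        (void)fsm;
        (void)bufs;
        (void)n_bufs;
        return 0;
}
#endif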
int replicationApply(struct raft *r)
{
raft_index index;
int rv = 0;
assert(r->state == RAFT_LEADER || r->state == RAFT_FOLLOWER);
assert(r->last_applied <= r->commit_index);
if (r->last_applied == r->commit_index) {
/* Nothing to do. */
return 0;
}
for (index = r->last_applied + 1; index <= r->commit_index; index++) {
const struct raft_entry *entry = logGet(r->log, index);
if (entry == NULL) {
/* This can happen while installing a snapshot */
tracef("replicationApply - ENTRY NULL");
return 0;
}
assert(entry->type == RAFT_COMMAND ||
entry->type == RAFT_BARRIER ||
entry->type == RAFT_CHANGE);
switch (entry->type) {
case RAFT_COMMAND:
rv = applyCommand(r, index, &entry->buf);
break;
case RAFT_BARRIER:
applyBarrier(r, index);
rv = 0;
break;
case RAFT_CHANGE:
applyChange(r, index);
rv = 0;
break;
default:
rv = 0; /* For coverity. This case can't be
taken. */
break;
}
if (rv != 0) {
break;
}
}
if (shouldTakeSnapshot(r)) {
rv = takeSnapshot(r);
}
return rv;
}
void replicationQuorum(struct raft *r, const raft_index index)
{
size_t votes = 0;
size_t i;
raft_term term;
assert(r->state == RAFT_LEADER);
if (index <= r->commit_index) {
return;
}
term = logTermOf(r->log, index);
/* TODO: fuzzy-test --seed 0x8db5fccc replication/entries/partitioned
* fails the assertion below. */
if (term == 0) {
return;
}
// assert(logTermOf(r->log, index) > 0);
assert(!(term > r->current_term));
/* Don't commit entries from previous terms by counting replicas. */
if (term < r->current_term) {
return;
}
for (i = 0; i < r->configuration.n; i++) {
struct raft_server *server = &r->configuration.servers[i];
if (server->role != RAFT_VOTER) {
continue;
}
if (r->leader_state.progress[i].match_index >= index) {
votes++;
}
}
if (votes > configurationVoterCount(&r->configuration) / 2) {
r->commit_index = index;
tracef("new commit index %llu", r->commit_index);
}
return;
}
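/* Worked example (illustrative): with 5 voters and match indexes
 * {7, 7, 6, 5, 5}, calling replicationQuorum(r, 6) counts 3 servers whose
 * match_index >= 6; since 3 > 5 / 2 (integer division, i.e. 2), index 6
 * becomes the new commit index, provided the entry at index 6 belongs to
 * the current term. */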
inline bool replicationInstallSnapshotBusy(struct raft *r)
{
return r->last_stored == 0 && r->snapshot.put.data != NULL;
}
#undef tracef
dqlite-1.16.7/src/raft/replication.h 0000664 0000000 0000000 00000007325 14652527134 0017315 0 ustar 00root root 0000000 0000000 /* Log replication logic and helpers. */
#ifndef REPLICATION_H_
#define REPLICATION_H_
#include "../raft.h"
/* Send AppendEntries RPC messages to all followers to which no AppendEntries
* was sent in the last heartbeat interval. */
int replicationHeartbeat(struct raft *r);
/* Start a local disk write for entries from the given index onwards, and
* trigger replication against all followers, typically sending AppendEntries
* RPC messages with outstanding log entries. */
int replicationTrigger(struct raft *r, raft_index index);
/* Possibly send an AppendEntries or an InstallSnapshot RPC message to the
* server with the given index.
*
* The rules to decide whether or not to send a message are:
*
* - If we have sent an InstallSnapshot RPC recently and we haven't yet received
* a response, then don't send any new message.
*
* - If we are probing the follower (i.e. we haven't received a successful
* response during the last heartbeat interval), then send a message only if
* we haven't sent any during the last heartbeat interval.
*
* - If we are pipelining entries to the follower, then send any new entries
* we haven't yet sent.
*
* If a message should be sent, the rules to decide what type of message to send
* and what it should contain are:
*
* - If we no longer have the first entry that should be sent to the
* follower, then send an InstallSnapshot RPC with the last snapshot.
*
* - If we still have the first entry to send, then send all entries from that
* index onward (possibly zero entries).
*
* This function must be called only by leaders. */
int replicationProgress(struct raft *r, unsigned i);
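/* A condensed sketch of the rules above (illustrative only; these helper
 * names are hypothetical, the real logic lives in replication.c):
 *
 *     if (snapshot_in_flight(r, i))
 *             return 0;                 // wait for the InstallSnapshot result
 *     if (probing(r, i) && sent_since_last_heartbeat(r, i))
 *             return 0;                 // at most one message per interval
 *     if (!log_has_entry(r, next_index(r, i)))
 *             return send_install_snapshot(r, i);
 *     return send_append_entries(r, i); // possibly with zero entries
 */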
/* Update the replication state (match and next indexes) for the given server
* using the given AppendEntries RPC result.
*
* Possibly send to the server a new set of entries or a snapshot if the result
* was unsuccessful because of missing entries or if new entries were added to
* our log in the meantime.
*
* It must be called only by leaders. */
int replicationUpdate(struct raft *r,
const struct raft_server *server,
const struct raft_append_entries_result *result);
/* Append the log entries in the given request if the Log Matching Property is
* satisfied.
*
* The rejected output parameter will be set to 0 if the Log Matching Property
* was satisfied, or to args->prev_log_index if not.
*
* The async output parameter will be set to true if some of the entries in the
* request were not present in our log, and a disk write was started to persist
* them to disk. The entries will still be appended immediately to our in-memory
* copy of the log, but an AppendEntries result message will be sent only once
* the disk write completes and the I/O callback is invoked.
*
* It must be called only by followers. */
int replicationAppend(struct raft *r,
const struct raft_append_entries *args,
raft_index *rejected,
bool *async);
int replicationInstallSnapshot(struct raft *r,
const struct raft_install_snapshot *args,
raft_index *rejected,
bool *async);
/* Returns `true` if the raft instance is currently installing a snapshot */
bool replicationInstallSnapshotBusy(struct raft *r);
/* Apply any committed entry that was not applied yet.
*
* It can be called by both leaders and followers. */
int replicationApply(struct raft *r);
/* Check if a quorum has been reached for the given log index, and update the
* commit index accordingly if so.
*
* From Figure 3.1:
*
* [Rules for servers] Leaders:
*
* If there exists an N such that N > commitIndex, a majority of
* matchIndex[i] >= N, and log[N].term == currentTerm: set commitIndex = N */
void replicationQuorum(struct raft *r, const raft_index index);
#endif /* REPLICATION_H_ */
dqlite-1.16.7/src/raft/request.h 0000664 0000000 0000000 00000000524 14652527134 0016466 0 ustar 00root root 0000000 0000000 #ifndef REQUEST_H_
#define REQUEST_H_
#include "../raft.h"
/* Abstract request type */
struct request
{
/* Must be kept in sync with RAFT__REQUEST in raft.h */
void *data;
int type;
raft_index index;
queue queue;
uint8_t req_id[16];
uint8_t client_id[16];
uint8_t unique_id[16];
uint64_t reserved[4];
};
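/* Illustrative sketch (not part of the original header): concrete request
 * types such as raft_apply or raft_barrier share this initial layout, so
 * code like appendLeaderCb() in replication.c can cast a struct request
 * pointer to the concrete type once it has inspected the type field. The
 * example_request type below is hypothetical. */
#if 0
struct example_request
{
        /* Common prefix, identical to struct request. */
        void *data;
        int type;
        raft_index index;
        queue queue;
        uint8_t req_id[16];
        uint8_t client_id[16];
        uint8_t unique_id[16];
        uint64_t reserved[4];
        /* Type-specific fields follow the common prefix. */
        void (*cb)(struct example_request *req, int status);
};

static void example_dispatch(struct request *req, int status)
{
        /* Mirrors how replication.c recovers the concrete request type. */
        struct example_request *concrete = (struct example_request *)req;
        if (concrete->cb != NULL) {
                concrete->cb(concrete, status);
        }
}
#endif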
#endif /* REQUEST_H_ */
dqlite-1.16.7/src/raft/snapshot.c 0000664 0000000 0000000 00000004762 14652527134 0016640 0 ustar 00root root 0000000 0000000 #include "snapshot.h"
#include <stdint.h>
#include <string.h>
#include "../tracing.h"
#include "assert.h"
#include "configuration.h"
#include "err.h"
#include "log.h"
void snapshotClose(struct raft_snapshot *s)
{
unsigned i;
configurationClose(&s->configuration);
for (i = 0; i < s->n_bufs; i++) {
raft_free(s->bufs[i].base);
}
raft_free(s->bufs);
}
void snapshotDestroy(struct raft_snapshot *s)
{
snapshotClose(s);
raft_free(s);
}
int snapshotRestore(struct raft *r, struct raft_snapshot *snapshot)
{
int rv;
assert(snapshot->n_bufs == 1);
rv = r->fsm->restore(r->fsm, &snapshot->bufs[0]);
if (rv != 0) {
tracef("restore snapshot %llu: %s", snapshot->index,
errCodeToString(rv));
return rv;
}
configurationClose(&r->configuration);
r->configuration = snapshot->configuration;
r->configuration_committed_index = snapshot->configuration_index;
r->configuration_uncommitted_index = 0;
/* Make a copy of the configuration contained in the snapshot, in case
* r->configuration gets overridden with an uncommitted configuration
* and we then need to roll back, but the log no longer contains the
* entry at r->configuration_committed_index because it was truncated.
*/
configurationClose(&r->configuration_last_snapshot);
rv = configurationCopy(&r->configuration,
&r->configuration_last_snapshot);
if (rv != 0) {
return rv;
}
configurationTrace(r, &r->configuration,
"configuration restore from snapshot");
r->commit_index = snapshot->index;
r->last_applied = snapshot->index;
r->last_stored = snapshot->index;
/* Don't free the snapshot data buffer, as ownership has been
* transferred to the fsm. */
raft_free(snapshot->bufs);
return 0;
}
int snapshotCopy(const struct raft_snapshot *src, struct raft_snapshot *dst)
{
int rv;
unsigned i;
size_t size;
uint8_t *cursor;
dst->term = src->term;
dst->index = src->index;
dst->configuration_index = src->configuration_index;
rv = configurationCopy(&src->configuration, &dst->configuration);
if (rv != 0) {
return rv;
}
size = 0;
for (i = 0; i < src->n_bufs; i++) {
size += src->bufs[i].len;
}
dst->bufs = raft_malloc(sizeof *dst->bufs);
assert(dst->bufs != NULL);
dst->bufs[0].base = raft_malloc(size);
dst->bufs[0].len = size;
if (dst->bufs[0].base == NULL) {
return RAFT_NOMEM;
}
cursor = dst->bufs[0].base;
for (i = 0; i < src->n_bufs; i++) {
memcpy(cursor, src->bufs[i].base, src->bufs[i].len);
cursor += src->bufs[i].len;
}
dst->n_bufs = 1;
return 0;
}
#undef tracef
dqlite-1.16.7/src/raft/snapshot.h 0000664 0000000 0000000 00000001703 14652527134 0016635 0 ustar 00root root 0000000 0000000 #ifndef RAFT_SNAPSHOT_H_
#define RAFT_SNAPSHOT_H_
#include "../raft.h"
/* Release all memory associated with the given snapshot. */
void snapshotClose(struct raft_snapshot *s);
/* Like snapshotClose(), but also release the snapshot object itself. */
void snapshotDestroy(struct raft_snapshot *s);
/* Restore a snapshot.
*
* This will reset the current state of the server as if the last entry
* contained in the snapshot had just been persisted, committed and applied.
*
* The in-memory log must be empty when calling this function.
*
* If no error occurs, the memory of the snapshot object gets released. */
int snapshotRestore(struct raft *r, struct raft_snapshot *snapshot);
/* Make a full deep copy of a snapshot object.
*
* All data buffers in the source snapshot will be compacted in a single buffer
* in the destination snapshot. */
int snapshotCopy(const struct raft_snapshot *src, struct raft_snapshot *dst);
#endif /* RAFT_SNAPSHOT_H_ */
dqlite-1.16.7/src/raft/start.c 0000664 0000000 0000000 00000015634 14652527134 0016136 0 ustar 00root root 0000000 0000000 #include "../raft.h"
#include "../tracing.h"
#include "assert.h"
#include "configuration.h"
#include "convert.h"
#include "entry.h"
#include "err.h"
#include "log.h"
#include "recv.h"
#include "snapshot.h"
#include "tick.h"
/* Restore the most recent configuration entry found in the log. */
static int restoreMostRecentConfigurationEntry(struct raft *r,
struct raft_entry *entry,
raft_index index)
{
struct raft_configuration configuration;
int rv;
rv = configurationDecode(&entry->buf, &configuration);
if (rv != 0) {
configurationClose(&configuration);
return rv;
}
configurationClose(&r->configuration);
r->configuration = configuration;
/* If the configuration comes from entry at index 1 in the log, we know
* it's the bootstrap configuration and it's committed by default.
* Otherwise we can't know whether it's committed, so we treat it as
* uncommitted. */
if (index == 1) {
assert(r->configuration_uncommitted_index == 0);
r->configuration_committed_index = 1;
} else {
assert(r->configuration_committed_index < index);
r->configuration_uncommitted_index = index;
}
configurationTrace(r, &r->configuration,
"restore most recent configuration");
return 0;
}
/* Restore the entries that were loaded from persistent storage. The most recent
* configuration entry will be restored as well, if any.
*
* Note that if the last configuration entry in the log has index greater than
* one we cannot know if it is committed or not. Therefore we also need to track
* the second-to-last configuration entry. This second-to-last entry is
* committed by default as raft doesn't allow multiple uncommitted configuration
* entries. That entry is used in case of configuration rollback scenarios. If
* we don't find the second-to-last configuration entry in the log, it means
* that the log was truncated after a snapshot and the second-to-last
* configuration is available in r->configuration_last_snapshot, which we
* populated earlier when the snapshot was restored. */
static int restoreEntries(struct raft *r,
raft_index snapshot_index,
raft_term snapshot_term,
raft_index start_index,
struct raft_entry *entries,
size_t n)
{
struct raft_entry *conf = NULL;
raft_index conf_index = 0;
size_t i;
int rv;
logStart(r->log, snapshot_index, snapshot_term, start_index);
r->last_stored = start_index - 1;
for (i = 0; i < n; i++) {
struct raft_entry *entry = &entries[i];
rv = logAppend(r->log, entry->term, entry->type, entry->buf,
entry->local_data, entry->is_local, entry->batch);
if (rv != 0) {
goto err;
}
r->last_stored++;
/* Only take into account configurations that are newer than the
* configuration restored from the snapshot. */
if (entry->type == RAFT_CHANGE &&
r->last_stored > r->configuration_committed_index) {
/* If there is a previous configuration it must have
* been committed as we don't allow multiple uncommitted
* configurations. At the end of the loop
* r->configuration_committed_index will point to the
* second to last configuration entry, if any. */
if (conf_index != 0) {
r->configuration_committed_index = conf_index;
}
conf = entry;
conf_index = r->last_stored;
}
}
if (conf != NULL) {
rv = restoreMostRecentConfigurationEntry(r, conf, conf_index);
if (rv != 0) {
goto err;
}
}
raft_free(entries);
return 0;
err:
if (logNumEntries(r->log) > 0) {
logDiscard(r->log, r->log->offset + 1);
}
return rv;
}
/* If we're the only voting server in the configuration, automatically
* elect ourselves and convert to leader without waiting for the election
* timeout. */
static int maybeSelfElect(struct raft *r)
{
const struct raft_server *server;
int rv;
server = configurationGet(&r->configuration, r->id);
if (server == NULL || server->role != RAFT_VOTER ||
configurationVoterCount(&r->configuration) > 1) {
return 0;
}
/* Converting to candidate will notice that we're the only voter and
* automatically convert to leader. */
rv = convertToCandidate(r, false /* disrupt leader */);
if (rv != 0) {
return rv;
}
assert(r->state == RAFT_LEADER);
return 0;
}
int raft_start(struct raft *r)
{
struct raft_snapshot *snapshot;
raft_index snapshot_index = 0;
raft_term snapshot_term = 0;
raft_index start_index;
struct raft_entry *entries;
size_t n_entries;
int rv;
assert(r != NULL);
assert(r->state == RAFT_UNAVAILABLE);
assert(r->heartbeat_timeout != 0);
assert(r->heartbeat_timeout < r->election_timeout);
assert(r->install_snapshot_timeout != 0);
assert(logNumEntries(r->log) == 0);
assert(logSnapshotIndex(r->log) == 0);
assert(r->last_stored == 0);
#ifndef RAFT_REVISION
#define RAFT_REVISION "unknown"
#endif
tracef("starting version:%d revision:%s", raft_version_number(),
RAFT_REVISION);
rv = r->io->load(r->io, &r->current_term, &r->voted_for, &snapshot,
&start_index, &entries, &n_entries);
if (rv != 0) {
ErrMsgTransfer(r->io->errmsg, r->errmsg, "io");
return rv;
}
assert(start_index >= 1);
tracef(
"current_term:%llu voted_for:%llu start_index:%llu n_entries:%zu",
r->current_term, r->voted_for, start_index, n_entries);
/* If we have a snapshot, let's restore it. */
if (snapshot != NULL) {
tracef(
"restore snapshot with last index %llu and last term %llu",
snapshot->index, snapshot->term);
rv = snapshotRestore(r, snapshot);
if (rv != 0) {
snapshotDestroy(snapshot);
entryBatchesDestroy(entries, n_entries);
return rv;
}
snapshot_index = snapshot->index;
snapshot_term = snapshot->term;
raft_free(snapshot);
} else if (n_entries > 0) {
/* If we don't have a snapshot and the on-disk log is not empty,
* then the first entry must be a configuration entry. */
assert(start_index == 1);
assert(entries[0].type == RAFT_CHANGE);
/* As a small optimization, bump the commit index to 1 since we
* require the first entry to be the same on all servers. */
r->commit_index = 1;
r->last_applied = 1;
}
/* Append the entries to the log, possibly restoring the last
* configuration. */
tracef("restore %zu entries starting at %llu", n_entries, start_index);
rv = restoreEntries(r, snapshot_index, snapshot_term, start_index,
entries, n_entries);
if (rv != 0) {
entryBatchesDestroy(entries, n_entries);
return rv;
}
/* Start the I/O backend. The tickCb function is expected to fire every
* r->heartbeat_timeout milliseconds and recvCb whenever an RPC is
* received. */
rv = r->io->start(r->io, r->heartbeat_timeout, tickCb, recvCb);
if (rv != 0) {
tracef("io start failed %d", rv);
return rv;
}
/* By default we start as followers. */
convertToFollower(r);
/* If there's only one voting server, and that is us, it's safe to
* convert to leader right away. If that is not us, we're either joining
* the cluster or we're simply configured as non-voter, and we'll stay
* follower. */
rv = maybeSelfElect(r);
if (rv != 0) {
return rv;
}
return 0;
}
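/* Usage sketch (illustrative, with error handling elided): the startup
 * sequence an application is expected to follow, assuming an io backend and
 * fsm have already been initialized; the id and address are placeholders:
 *
 *     struct raft r;
 *     rv = raft_init(&r, &io, &fsm, 1, "127.0.0.1:9001");
 *     if (rv == 0) {
 *             rv = raft_start(&r);
 *     }
 */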
#undef tracef
dqlite-1.16.7/src/raft/state.c 0000664 0000000 0000000 00000001716 14652527134 0016115 0 ustar 00root root 0000000 0000000 #include "assert.h"
#include "configuration.h"
#include "election.h"
#include "log.h"
#include "../lib/queue.h"
int raft_state(struct raft *r)
{
return r->state;
}
void raft_leader(struct raft *r, raft_id *id, const char **address)
{
switch (r->state) {
case RAFT_UNAVAILABLE:
case RAFT_CANDIDATE:
*id = 0;
*address = NULL;
return;
case RAFT_FOLLOWER:
*id = r->follower_state.current_leader.id;
*address = r->follower_state.current_leader.address;
return;
case RAFT_LEADER:
if (r->transfer != NULL) {
*id = 0;
*address = NULL;
return;
}
*id = r->id;
*address = r->address;
return;
}
}
raft_index raft_last_index(struct raft *r)
{
return logLastIndex(r->log);
}
raft_index raft_last_applied(struct raft *r)
{
return r->last_applied;
}
int raft_role(struct raft *r)
{
const struct raft_server *local =
configurationGet(&r->configuration, r->id);
if (local == NULL) {
return -1;
}
return local->role;
}
dqlite-1.16.7/src/raft/syscall.c 0000664 0000000 0000000 00000002424 14652527134 0016444 0 ustar 00root root 0000000 0000000 #include "syscall.h"
#if HAVE_LINUX_AIO_ABI_H || HAVE_LINUX_IO_URING_H
#include <sys/syscall.h>
#include <unistd.h>
#endif
#if HAVE_LINUX_AIO_ABI_H
int io_setup(unsigned nr_events, aio_context_t *ctx_idp)
{
return (int)syscall(__NR_io_setup, nr_events, ctx_idp);
}
int io_destroy(aio_context_t ctx_id)
{
return (int)syscall(__NR_io_destroy, ctx_id);
}
int io_submit(aio_context_t ctx_id, long nr, struct iocb **iocbpp)
{
return (int)syscall(__NR_io_submit, ctx_id, nr, iocbpp);
}
int io_getevents(aio_context_t ctx_id,
long min_nr,
long nr,
struct io_event *events,
struct timespec *timeout)
{
return (int)syscall(__NR_io_getevents, ctx_id, min_nr, nr, events,
timeout);
}
#endif
#if HAVE_LINUX_IO_URING_H
int io_uring_register(int fd,
unsigned int opcode,
const void *arg,
unsigned int nr_args)
{
return (int)syscall(__NR_io_uring_register, fd, opcode, arg, nr_args);
}
int io_uring_setup(unsigned int entries, struct io_uring_params *p)
{
return (int)syscall(__NR_io_uring_setup, entries, p);
}
int io_uring_enter(int fd,
unsigned int to_submit,
unsigned int min_complete,
unsigned int flags,
sigset_t *sig)
{
return (int)syscall(__NR_io_uring_enter, fd, to_submit, min_complete,
flags, sig, _NSIG / 8);
}
#endif
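/* Illustrative sketch (not compiled into the library): how the AIO wrappers
 * above can be combined for a single blocking write. Error handling is
 * minimal, and `fd`, `buf` and `len` are assumed to satisfy any alignment
 * requirements of the underlying file. */
#if 0
static int example_aio_write(int fd, void *buf, size_t len)
{
        aio_context_t ctx = 0;
        struct iocb iocb = {0};
        struct iocb *iocbs[1] = {&iocb};
        struct io_event event;
        int rv;

        rv = io_setup(1 /* max in-flight events */, &ctx);
        if (rv != 0) {
                return rv;
        }
        iocb.aio_lio_opcode = IOCB_CMD_PWRITE;
        iocb.aio_fildes = (__u32)fd;
        iocb.aio_buf = (__u64)(uintptr_t)buf;
        iocb.aio_nbytes = len;
        rv = io_submit(ctx, 1, iocbs);
        if (rv == 1) {
                /* Wait for the single completion event. */
                rv = io_getevents(ctx, 1, 1, &event, NULL);
        }
        io_destroy(ctx);
        return rv < 0 ? rv : 0;
}
#endif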
dqlite-1.16.7/src/raft/syscall.h 0000664 0000000 0000000 00000001724 14652527134 0016453 0 ustar 00root root 0000000 0000000 /* Wrappers for system calls not yet defined in libc. */
#ifndef SYSCALL_H_
#define SYSCALL_H_
#if HAVE_LINUX_AIO_ABI_H
#include <linux/aio_abi.h>
#include <signal.h>
#include <time.h>
#endif
#if HAVE_LINUX_IO_URING_H
#include <linux/io_uring.h>
#endif
#if HAVE_LINUX_AIO_ABI_H
/* AIO */
int io_setup(unsigned nr_events, aio_context_t *ctx_idp);
int io_destroy(aio_context_t ctx_id);
int io_submit(aio_context_t ctx_id, long nr, struct iocb **iocbpp);
int io_getevents(aio_context_t ctx_id,
long min_nr,
long nr,
struct io_event *events,
struct timespec *timeout);
#endif
#if HAVE_LINUX_IO_URING_H
/* uring */
int io_uring_register(int fd,
unsigned int opcode,
const void *arg,
unsigned int nr_args);
int io_uring_setup(unsigned int entries, struct io_uring_params *p);
int io_uring_enter(int fd,
unsigned int to_submit,
unsigned int min_complete,
unsigned int flags,
sigset_t *sig);
#endif
#endif /* SYSCALL_H_ */
dqlite-1.16.7/src/raft/tick.c 0000664 0000000 0000000 00000015727 14652527134 0015736 0 ustar 00root root 0000000 0000000 #include "../raft.h"
#include "../tracing.h"
#include "assert.h"
#include "configuration.h"
#include "convert.h"
#include "election.h"
#include "membership.h"
#include "progress.h"
#include "replication.h"
/* Apply time-dependent rules for followers (Figure 3.1). */
static int tickFollower(struct raft *r)
{
const struct raft_server *server;
int rv;
assert(r != NULL);
assert(r->state == RAFT_FOLLOWER);
server = configurationGet(&r->configuration, r->id);
/* If we have been removed from the configuration, or maybe we didn't
* receive one yet, just stay follower. */
if (server == NULL) {
return 0;
}
/* Check if we need to start an election.
*
* From Section 3.3:
*
* If a follower receives no communication over a period of time
* called the election timeout, then it assumes there is no viable
* leader and begins an election to choose a new leader.
*
* Figure 3.1:
*
* If election timeout elapses without receiving AppendEntries RPC
* from current leader or granting vote to candidate, convert to
* candidate.
*/
if (electionTimerExpired(r) && server->role == RAFT_VOTER) {
if (replicationInstallSnapshotBusy(r)) {
tracef(
"installing snapshot -> don't convert to "
"candidate");
electionResetTimer(r);
return 0;
}
if (r->follower_state.append_in_flight_count > 0) {
tracef(
"append in progress -> don't convert to candidate");
electionResetTimer(r);
return 0;
}
tracef("convert to candidate and start new election");
rv = convertToCandidate(r, false /* disrupt leader */);
if (rv != 0) {
tracef("convert to candidate: %s", raft_strerror(rv));
return rv;
}
}
return 0;
}
/* Apply time-dependent rules for candidates (Figure 3.1). */
static int tickCandidate(struct raft *r)
{
assert(r != NULL);
assert(r->state == RAFT_CANDIDATE);
/* Check if we need to start an election.
*
* From Section 3.4:
*
* The third possible outcome is that a candidate neither wins nor
* loses the election: if many followers become candidates at the same
* time, votes could be split so that no candidate obtains a majority.
* When this happens, each candidate will time out and start a new
* election by incrementing its term and initiating another round of
* RequestVote RPCs
*/
if (electionTimerExpired(r)) {
tracef("start new election");
return electionStart(r);
}
return 0;
}
/* Return true if we received an AppendEntries RPC result from a majority of
* voting servers since we became leaders or since the last time this function
* was called.
*
* For each server the function checks the recent_recv flag of the associated
* progress object, and resets the flag after the check. It returns true if a
* majority of voting server had the flag set to true. */
static bool checkContactQuorum(struct raft *r)
{
unsigned i;
unsigned contacts = 0;
assert(r->state == RAFT_LEADER);
for (i = 0; i < r->configuration.n; i++) {
struct raft_server *server = &r->configuration.servers[i];
bool recent_recv = progressResetRecentRecv(r, i);
if ((server->role == RAFT_VOTER && recent_recv) ||
server->id == r->id) {
contacts++;
}
}
r->leader_state.voter_contacts = contacts;
return contacts > configurationVoterCount(&r->configuration) / 2;
}
/* Apply time-dependent rules for leaders (Figure 3.1). */
static int tickLeader(struct raft *r)
{
raft_time now = r->io->time(r->io);
assert(r->state == RAFT_LEADER);
/* Check if we still can reach a majority of servers.
*
* From Section 6.2:
*
* A leader in Raft steps down if an election timeout elapses without
* a successful round of heartbeats to a majority of its cluster; this
* allows clients to retry their requests with another server.
*/
if (now - r->election_timer_start >= r->election_timeout) {
if (!checkContactQuorum(r)) {
tracef(
"unable to contact majority of cluster -> step "
"down");
convertToFollower(r);
return 0;
}
r->election_timer_start = r->io->time(r->io);
}
/* Possibly send heartbeats.
*
* From Figure 3.1:
*
* Send empty AppendEntries RPC during idle periods to prevent
* election timeouts.
*/
replicationHeartbeat(r);
/* If a server is being promoted, increment the timer of the current
* round or abort the promotion.
*
* From Section 4.2.1:
*
* The algorithm waits a fixed number of rounds (such as 10). If the
* last round lasts less than an election timeout, then the leader adds
* the new server to the cluster, under the assumption that there are
* not enough unreplicated entries to create a significant availability
* gap. Otherwise, the leader aborts the configuration change with an
* error.
*/
if (r->leader_state.promotee_id != 0) {
raft_id id = r->leader_state.promotee_id;
unsigned server_index;
raft_time round_duration = now - r->leader_state.round_start;
bool is_too_slow;
bool is_unresponsive;
/* If a promotion is in progress, we expect that our
* configuration contains an entry for the server being
* promoted, and that the server is not yet considered as
* voting. */
server_index = configurationIndexOf(&r->configuration, id);
assert(server_index < r->configuration.n);
assert(r->configuration.servers[server_index].role !=
RAFT_VOTER);
is_too_slow =
(r->leader_state.round_number == r->max_catch_up_rounds &&
round_duration > r->election_timeout);
is_unresponsive =
round_duration > r->max_catch_up_round_duration;
/* Abort the promotion if we are at the last allowed round and it's
* still taking too long, or if the server is unresponsive. */
if (is_too_slow || is_unresponsive) {
tracef(
"server_index:%d is_too_slow:%d is_unresponsive:%d",
server_index, is_too_slow, is_unresponsive);
struct raft_change *change;
r->leader_state.promotee_id = 0;
r->leader_state.round_index = 0;
r->leader_state.round_number = 0;
r->leader_state.round_start = 0;
change = r->leader_state.change;
r->leader_state.change = NULL;
if (change != NULL && change->cb != NULL) {
change->cb(change, RAFT_NOCONNECTION);
}
}
}
return 0;
}
static int tick(struct raft *r)
{
int rv = -1;
assert(r->state == RAFT_UNAVAILABLE || r->state == RAFT_FOLLOWER ||
r->state == RAFT_CANDIDATE || r->state == RAFT_LEADER);
/* If we are not available, let's do nothing. */
if (r->state == RAFT_UNAVAILABLE) {
return 0;
}
switch (r->state) {
case RAFT_FOLLOWER:
rv = tickFollower(r);
break;
case RAFT_CANDIDATE:
rv = tickCandidate(r);
break;
case RAFT_LEADER:
rv = tickLeader(r);
break;
}
return rv;
}
void tickCb(struct raft_io *io)
{
struct raft *r;
int rv;
r = io->data;
rv = tick(r);
if (rv != 0) {
convertToUnavailable(r);
return;
}
/* For all states: if there is a leadership transfer request in
* progress, check if it's expired. */
if (r->transfer != NULL) {
raft_time now = r->io->time(r->io);
if (now - r->transfer->start >= r->election_timeout) {
membershipLeadershipTransferClose(r);
}
}
}
#undef tracef
dqlite-1.16.7/src/raft/tick.h 0000664 0000000 0000000 00000000461 14652527134 0015730 0 ustar 00root root 0000000 0000000 /* Logic to be invoked periodically. */
#ifndef TICK_H_
#define TICK_H_
#include "../raft.h"
/* Callback to be passed to the @raft_io implementation. It notifies us that a
* certain amount of time has elapsed and will be invoked periodically. */
void tickCb(struct raft_io *io);
#endif /* TICK_H_ */
dqlite-1.16.7/src/raft/utils.h 0000664 0000000 0000000 00000000532 14652527134 0016135 0 ustar 00root root 0000000 0000000 #ifndef RAFT_UTILS_H_
#define RAFT_UTILS_H_
#include <stdio.h>
/* Various utility functions and macros */
#define LIKELY(x) __builtin_expect(!!(x), 1)
#define UNLIKELY(x) __builtin_expect(!!(x), 0)
#define DBG() fprintf(stderr, "%s:%d\n", __func__, __LINE__)
#define ARRAY_SIZE(a) ((sizeof(a)) / (sizeof(a)[0]))
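/* Usage sketch (illustrative): hint rarely-taken error branches and iterate
 * over a fixed-size array:
 *
 *     int codes[] = {RAFT_NOMEM, RAFT_IOERR};
 *     for (size_t i = 0; i < ARRAY_SIZE(codes); i++) {
 *             if (UNLIKELY(rv == codes[i])) {
 *                     DBG();
 *             }
 *     }
 */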
#endif /* RAFT_UTILS_H_ */
dqlite-1.16.7/src/raft/uv.c 0000664 0000000 0000000 00000045264 14652527134 0015435 0 ustar 00root root 0000000 0000000 #include "../raft.h"
#include
#include
#include
#include
#include
#include
#include
#include
#include "../raft.h"
#include "../tracing.h"
#include "assert.h"
#include "byte.h"
#include "configuration.h"
#include "entry.h"
#include "heap.h"
#include "snapshot.h"
#include "uv.h"
#include "uv_encoding.h"
#include "uv_os.h"
/* Retry connecting to peer servers every second.
*
* TODO: implement an exponential backoff instead. */
#define CONNECT_RETRY_DELAY 1000
/* Cleans up files that are no longer used by the system */
static int uvMaintenance(const char *dir, char *errmsg)
{
struct uv_fs_s req;
struct uv_dirent_s entry;
int n;
int i;
int rv;
int rv2;
n = uv_fs_scandir(NULL, &req, dir, 0, NULL);
if (n < 0) {
ErrMsgPrintf(errmsg, "scan data directory: %s", uv_strerror(n));
return RAFT_IOERR;
}
rv = 0;
for (i = 0; i < n; i++) {
const char *filename;
rv = uv_fs_scandir_next(&req, &entry);
assert(rv == 0); /* Can't fail in libuv */
filename = entry.name;
/* Remove leftover tmp-files */
if (strncmp(filename, TMP_FILE_PREFIX,
strlen(TMP_FILE_PREFIX)) == 0) {
UvFsRemoveFile(dir, filename,
errmsg); /* Ignore errors */
continue;
}
/* Remove orphaned snapshot files */
bool orphan = false;
if ((UvSnapshotIsOrphan(dir, filename, &orphan) == 0) &&
orphan) {
UvFsRemoveFile(dir, filename,
errmsg); /* Ignore errors */
continue;
}
/* Remove orphaned snapshot metadata files */
if ((UvSnapshotMetaIsOrphan(dir, filename, &orphan) == 0) &&
orphan) {
UvFsRemoveFile(dir, filename,
errmsg); /* Ignore errors */
}
}
rv2 = uv_fs_scandir_next(&req, &entry);
assert(rv2 == UV_EOF);
return rv;
}
/* Implementation of raft_io->init. */
static int uvInit(struct raft_io *io, raft_id id, const char *address)
{
struct uv *uv;
size_t direct_io;
struct uvMetadata metadata;
int rv;
uv = io->impl;
uv->id = id;
rv = UvFsCheckDir(uv->dir, io->errmsg);
if (rv != 0) {
return rv;
}
/* Probe file system capabilities */
rv = UvFsProbeCapabilities(uv->dir, &direct_io, &uv->async_io,
&uv->fallocate, io->errmsg);
if (rv != 0) {
return rv;
}
uv->direct_io = direct_io != 0;
uv->block_size = direct_io != 0 ? direct_io : 4096;
rv = uvMaintenance(uv->dir, io->errmsg);
if (rv != 0) {
return rv;
}
rv = uvMetadataLoad(uv->dir, &metadata, io->errmsg);
if (rv != 0) {
return rv;
}
uv->metadata = metadata;
rv = uv->transport->init(uv->transport, id, address);
if (rv != 0) {
ErrMsgTransfer(uv->transport->errmsg, io->errmsg, "transport");
return rv;
}
uv->transport->data = uv;
rv = uv_timer_init(uv->loop, &uv->timer);
assert(rv == 0); /* This should never fail */
uv->timer.data = uv;
return 0;
}
/* Periodic timer callback */
static void uvTickTimerCb(uv_timer_t *timer)
{
struct uv *uv;
uv = timer->data;
if (uv->tick_cb != NULL) {
uv->tick_cb(uv->io);
}
}
/* Implementation of raft_io->start. */
static int uvStart(struct raft_io *io,
unsigned msecs,
raft_io_tick_cb tick_cb,
raft_io_recv_cb recv_cb)
{
struct uv *uv;
int rv;
uv = io->impl;
uv->state = UV__ACTIVE;
uv->tick_cb = tick_cb;
uv->recv_cb = recv_cb;
rv = UvRecvStart(uv);
if (rv != 0) {
return rv;
}
rv = uv_timer_start(&uv->timer, uvTickTimerCb, msecs, msecs);
assert(rv == 0);
return 0;
}
void uvMaybeFireCloseCb(struct uv *uv)
{
tracef("uv maybe fire close cb");
if (!uv->closing) {
return;
}
if (uv->transport->data != NULL) {
return;
}
if (uv->timer.data != NULL) {
return;
}
if (!queue_empty(&uv->append_segments)) {
return;
}
if (!queue_empty(&uv->finalize_reqs)) {
return;
}
if (uv->finalize_work.data != NULL) {
return;
}
if (uv->prepare_inflight != NULL) {
return;
}
if (uv->barrier != NULL) {
return;
}
if (uv->snapshot_put_work.data != NULL) {
return;
}
if (!queue_empty(&uv->snapshot_get_reqs)) {
return;
}
if (!queue_empty(&uv->async_work_reqs)) {
return;
}
if (!queue_empty(&uv->aborting)) {
return;
}
assert(uv->truncate_work.data == NULL);
if (uv->close_cb != NULL) {
uv->close_cb(uv->io);
}
}
static void uvTickTimerCloseCb(uv_handle_t *handle)
{
struct uv *uv = handle->data;
assert(uv->closing);
uv->timer.data = NULL;
uvMaybeFireCloseCb(uv);
}
static void uvTransportCloseCb(struct raft_uv_transport *transport)
{
struct uv *uv = transport->data;
assert(uv->closing);
uv->transport->data = NULL;
uvMaybeFireCloseCb(uv);
}
/* Implementation of raft_io->close. */
static void uvClose(struct raft_io *io, raft_io_close_cb cb)
{
struct uv *uv;
uv = io->impl;
assert(uv != NULL);
assert(!uv->closing);
uv->close_cb = cb;
uv->closing = true;
UvSendClose(uv);
UvRecvClose(uv);
uvAppendClose(uv);
if (uv->transport->data != NULL) {
uv->transport->close(uv->transport, uvTransportCloseCb);
}
if (uv->timer.data != NULL) {
uv_close((uv_handle_t *)&uv->timer, uvTickTimerCloseCb);
}
uvMaybeFireCloseCb(uv);
}
/* Filter the given segment list to find the most recent contiguous chunk of
* closed segments that overlaps with the given snapshot last index. */
static int uvFilterSegments(struct uv *uv,
raft_index last_index,
const char *snapshot_filename,
struct uvSegmentInfo **segments,
size_t *n)
{
struct uvSegmentInfo *segment;
size_t i; /* First valid closed segment. */
size_t j; /* Last valid closed segment. */
/* If there are no segments at all, or only open segments, there's
* nothing to do. */
if (*segments == NULL || (*segments)[0].is_open) {
return 0;
}
/* Find the index of the most recent closed segment. */
for (j = 0; j < *n; j++) {
segment = &(*segments)[j];
if (segment->is_open) {
break;
}
}
assert(j > 0);
j--;
segment = &(*segments)[j];
tracef("most recent closed segment is %s", segment->filename);
/* If the end index of the last closed segment is lower than the last
* snapshot index, there might be no entry that we can keep. We return
* an empty segment list, unless there is at least one open segment, in
* which case we keep everything, hoping that the open segments contain all the entries
* since the last closed segment (TODO: we should encode the starting
* entry in the open segment). */
if (segment->end_index < last_index) {
if (!(*segments)[*n - 1].is_open) {
tracef(
"discarding all closed segments, since most recent "
"is behind "
"last snapshot");
raft_free(*segments);
*segments = NULL;
*n = 0;
return 0;
}
tracef(
"most recent closed segment %s is behind last snapshot, "
"yet there are open segments",
segment->filename);
}
/* Now scan the segments backwards, searching for the longest list of
* contiguous closed segments. */
if (j >= 1) {
for (i = j; i > 0; i--) {
struct uvSegmentInfo *newer;
struct uvSegmentInfo *older;
newer = &(*segments)[i];
older = &(*segments)[i - 1];
if (older->end_index != newer->first_index - 1) {
tracef("discarding non contiguous segment %s",
older->filename);
break;
}
}
} else {
i = j;
}
/* Make sure that the first index of the first valid closed segment is
* not greater than the snapshot's last index plus one (so there are no
* missing entries). */
segment = &(*segments)[i];
if (segment->first_index > last_index + 1) {
ErrMsgPrintf(uv->io->errmsg,
"closed segment %s is past last snapshot %s",
segment->filename, snapshot_filename);
return RAFT_CORRUPT;
}
if (i != 0) {
size_t new_n = *n - i;
struct uvSegmentInfo *new_segments;
new_segments = raft_malloc(new_n * sizeof *new_segments);
if (new_segments == NULL) {
return RAFT_NOMEM;
}
memcpy(new_segments, &(*segments)[i],
new_n * sizeof *new_segments);
raft_free(*segments);
*segments = new_segments;
*n = new_n;
}
return 0;
}
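/* Worked example for the filtering logic above (illustrative numbers): given
* closed segments covering entries 1-100, 101-200 and 301-400, the backwards
* scan stops at the 200/301 gap, so only 301-400 survives. With a snapshot
* whose last index is 350, 301 <= 350 + 1 and the filtered list is valid;
* with a snapshot whose last index is 250, 301 > 250 + 1 and RAFT_CORRUPT is
* returned. */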
/* Load the last snapshot (if any) and all entries contained in all segment
* files of the data directory. This function can be called recursively; the
* `depth` argument ensures we don't get stuck in a recursive loop. */
static int uvLoadSnapshotAndEntries(struct uv *uv,
struct raft_snapshot **snapshot,
raft_index *start_index,
struct raft_entry *entries[],
size_t *n,
int depth)
{
struct uvSnapshotInfo *snapshots;
struct uvSegmentInfo *segments;
size_t n_snapshots;
size_t n_segments;
int rv;
*snapshot = NULL;
*start_index = 1;
*entries = NULL;
*n = 0;
/* List available snapshots and segments. */
rv = UvList(uv, &snapshots, &n_snapshots, &segments, &n_segments,
uv->io->errmsg);
if (rv != 0) {
goto err;
}
/* Load the most recent snapshot, if any. */
if (snapshots != NULL) {
char snapshot_filename[UV__FILENAME_LEN];
*snapshot = RaftHeapMalloc(sizeof **snapshot);
if (*snapshot == NULL) {
rv = RAFT_NOMEM;
goto err;
}
rv = UvSnapshotLoad(uv, &snapshots[n_snapshots - 1], *snapshot,
uv->io->errmsg);
if (rv != 0) {
RaftHeapFree(*snapshot);
*snapshot = NULL;
goto err;
}
uvSnapshotFilenameOf(&snapshots[n_snapshots - 1],
snapshot_filename);
tracef("most recent snapshot at %lld", (*snapshot)->index);
RaftHeapFree(snapshots);
snapshots = NULL;
/* Update the start index. If there are closed segments on disk
* let's make sure that the first index of the first closed
* segment is not greater than the snapshot's last index plus
* one (so there are no missing entries), and update the start
* index accordingly. */
rv = uvFilterSegments(uv, (*snapshot)->index, snapshot_filename,
&segments, &n_segments);
if (rv != 0) {
goto err;
}
if (segments != NULL) {
if (segments[0].is_open) {
*start_index = (*snapshot)->index + 1;
} else {
*start_index = segments[0].first_index;
}
} else {
*start_index = (*snapshot)->index + 1;
}
}
/* Read data from segments, closing any open segments. */
if (segments != NULL) {
raft_index last_index;
rv = uvSegmentLoadAll(uv, *start_index, segments, n_segments,
entries, n);
if (rv != 0) {
goto err;
}
/* Check if all entries that we loaded are actually behind the
* last snapshot. This can happen if the last closed segment was
* behind the last snapshot and there were open segments, but
* the entries in the open segments turned out to be behind the
* snapshot as well. */
last_index = *start_index + *n - 1;
if (*snapshot != NULL && last_index < (*snapshot)->index) {
ErrMsgPrintf(uv->io->errmsg,
"last entry on disk has index %llu, which "
"is behind "
"last snapshot's index %llu",
last_index, (*snapshot)->index);
rv = RAFT_CORRUPT;
goto err;
}
raft_free(segments);
segments = NULL;
}
return 0;
err:
assert(rv != 0);
if (*snapshot != NULL) {
snapshotDestroy(*snapshot);
*snapshot = NULL;
}
if (snapshots != NULL) {
raft_free(snapshots);
}
if (segments != NULL) {
raft_free(segments);
}
if (*entries != NULL) {
entryBatchesDestroy(*entries, *n);
*entries = NULL;
*n = 0;
}
/* Try to recover exactly once when corruption is detected; the first
* pass might have cleaned up corrupt data. Most of the arguments are
* already reset after the `err` label, except for `start_index`. */
if (rv == RAFT_CORRUPT && uv->auto_recovery && depth == 0) {
*start_index = 1;
return uvLoadSnapshotAndEntries(uv, snapshot, start_index,
entries, n, depth + 1);
}
return rv;
}
/* Implementation of raft_io->load. */
static int uvLoad(struct raft_io *io,
raft_term *term,
raft_id *voted_for,
struct raft_snapshot **snapshot,
raft_index *start_index,
struct raft_entry **entries,
size_t *n_entries)
{
struct uv *uv;
int rv;
uv = io->impl;
*term = uv->metadata.term;
*voted_for = uv->metadata.voted_for;
*snapshot = NULL;
rv = uvLoadSnapshotAndEntries(uv, snapshot, start_index, entries,
n_entries, 0);
if (rv != 0) {
return rv;
}
tracef("start index %lld, %zu entries", *start_index, *n_entries);
if (*snapshot == NULL) {
tracef("no snapshot");
}
/* Set the index of the next entry that will be appended. */
uv->append_next_index = *start_index + *n_entries;
return 0;
}
/* Implementation of raft_io->set_term. */
static int uvSetTerm(struct raft_io *io, const raft_term term)
{
struct uv *uv;
int rv;
uv = io->impl;
uv->metadata.version++;
uv->metadata.term = term;
uv->metadata.voted_for = 0;
rv = uvMetadataStore(uv, &uv->metadata);
if (rv != 0) {
return rv;
}
return 0;
}
/* Implementation of raft_io->set_vote. */
static int uvSetVote(struct raft_io *io, const raft_id server_id)
{
struct uv *uv;
int rv;
uv = io->impl;
uv->metadata.version++;
uv->metadata.voted_for = server_id;
rv = uvMetadataStore(uv, &uv->metadata);
if (rv != 0) {
return rv;
}
return 0;
}
/* Implementation of raft_io->bootstrap. */
static int uvBootstrap(struct raft_io *io,
const struct raft_configuration *configuration)
{
struct uv *uv;
int rv;
uv = io->impl;
/* We shouldn't have written anything else yet. */
if (uv->metadata.term != 0) {
ErrMsgPrintf(io->errmsg, "metadata contains term %lld",
uv->metadata.term);
return RAFT_CANTBOOTSTRAP;
}
/* Write the term */
rv = uvSetTerm(io, 1);
if (rv != 0) {
return rv;
}
/* Create the first closed segment file, containing just one entry. */
rv = uvSegmentCreateFirstClosed(uv, configuration);
if (rv != 0) {
return rv;
}
return 0;
}
/* Implementation of raft_io->recover. */
static int uvRecover(struct raft_io *io, const struct raft_configuration *conf)
{
struct uv *uv = io->impl;
struct raft_snapshot *snapshot;
raft_index start_index;
raft_index next_index;
struct raft_entry *entries;
size_t n_entries;
int rv;
/* Load the current state. This also closes any leftover open segment.
*/
rv = uvLoadSnapshotAndEntries(uv, &snapshot, &start_index, &entries,
&n_entries, 0);
if (rv != 0) {
return rv;
}
/* We don't care about the actual data, just the index of the last entry. */
if (snapshot != NULL) {
snapshotDestroy(snapshot);
}
if (entries != NULL) {
entryBatchesDestroy(entries, n_entries);
}
assert(start_index > 0);
next_index = start_index + n_entries;
rv = uvSegmentCreateClosedWithConfiguration(uv, next_index, conf);
if (rv != 0) {
return rv;
}
return 0;
}
/* Implementation of raft_io->time. */
static raft_time uvTime(struct raft_io *io)
{
struct uv *uv;
uv = io->impl;
return uv_now(uv->loop);
}
/* Implementation of raft_io->random. */
static int uvRandom(struct raft_io *io, int min, int max)
{
(void)io;
return min + (abs(rand()) % (max - min));
}
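/* Note on the implementation above: the result falls in [min, max), i.e. max
* itself is excluded, and the modulo introduces a slight bias whenever
* (max - min) does not evenly divide RAND_MAX + 1; that's acceptable here,
* since the value is only used to randomize timeouts such as the election
* timeout. */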
static void uvSeedRand(struct uv *uv)
{
ssize_t sz = -1;
unsigned seed = 0; /* fed to srand() */
sz = getrandom(&seed, sizeof seed, GRND_NONBLOCK);
if (sz == -1 || sz < ((ssize_t)sizeof seed)) {
/* Fall back to an inferior random seed when `getrandom` would
* have blocked or when not enough randomness was returned. */
seed ^= (unsigned)uv->id;
seed ^= (unsigned)uv_now(uv->loop);
struct timeval time = {0};
/* Ignore errors. */
gettimeofday(&time, NULL);
seed ^=
(unsigned)((time.tv_sec * 1000) + (time.tv_usec / 1000));
}
srand(seed);
}
int raft_uv_init(struct raft_io *io,
struct uv_loop_s *loop,
const char *dir,
struct raft_uv_transport *transport)
{
struct uv *uv;
void *data;
int rv;
assert(io != NULL);
assert(loop != NULL);
assert(dir != NULL);
assert(transport != NULL);
data = io->data;
memset(io, 0, sizeof *io);
io->data = data;
if (transport->version == 0) {
ErrMsgPrintf(io->errmsg, "transport->version must be set");
return RAFT_INVALID;
}
/* Ensure that the given path doesn't exceed our static buffer limit. */
if (!UV__DIR_HAS_VALID_LEN(dir)) {
ErrMsgPrintf(io->errmsg, "directory path too long");
return RAFT_NAMETOOLONG;
}
/* Allocate the raft_io_uv object */
uv = raft_malloc(sizeof *uv);
if (uv == NULL) {
rv = RAFT_NOMEM;
goto err;
}
memset(uv, 0, sizeof(struct uv));
uv->io = io;
uv->loop = loop;
strncpy(uv->dir, dir, sizeof(uv->dir) - 1);
uv->dir[sizeof(uv->dir) - 1] = '\0';
uv->transport = transport;
uv->transport->data = NULL;
uv->tracer = NULL;
uv->id = 0; /* Set by raft_io->init() */
uv->state = UV__PRISTINE;
uv->errored = false;
uv->direct_io = false;
uv->async_io = false;
uv->fallocate = false;
#ifdef LZ4_ENABLED
uv->snapshot_compression = true;
#else
uv->snapshot_compression = false;
#endif
uv->segment_size = UV__MAX_SEGMENT_SIZE;
uv->block_size = 0;
queue_init(&uv->clients);
queue_init(&uv->servers);
uv->connect_retry_delay = CONNECT_RETRY_DELAY;
uv->prepare_inflight = NULL;
queue_init(&uv->prepare_reqs);
queue_init(&uv->prepare_pool);
uv->prepare_next_counter = 1;
uv->append_next_index = 1;
queue_init(&uv->append_segments);
queue_init(&uv->append_pending_reqs);
queue_init(&uv->append_writing_reqs);
uv->barrier = NULL;
queue_init(&uv->finalize_reqs);
uv->finalize_work.data = NULL;
uv->truncate_work.data = NULL;
queue_init(&uv->snapshot_get_reqs);
queue_init(&uv->async_work_reqs);
uv->snapshot_put_work.data = NULL;
uv->timer.data = NULL;
uv->tick_cb = NULL; /* Set by raft_io->start() */
uv->recv_cb = NULL; /* Set by raft_io->start() */
queue_init(&uv->aborting);
uv->closing = false;
uv->close_cb = NULL;
uv->auto_recovery = true;
uvSeedRand(uv);
/* Set the raft_io implementation. */
io->version = 2; /* future-proofing */
io->impl = uv;
io->init = uvInit;
io->close = uvClose;
io->start = uvStart;
io->load = uvLoad;
io->bootstrap = uvBootstrap;
io->recover = uvRecover;
io->set_term = uvSetTerm;
io->set_vote = uvSetVote;
io->append = UvAppend;
io->truncate = UvTruncate;
io->send = UvSend;
io->snapshot_put = UvSnapshotPut;
io->snapshot_get = UvSnapshotGet;
io->async_work = UvAsyncWork;
io->time = uvTime;
io->random = uvRandom;
return 0;
err:
assert(rv != 0);
if (rv == RAFT_NOMEM) {
ErrMsgOom(io->errmsg);
}
return rv;
}
void raft_uv_close(struct raft_io *io)
{
struct uv *uv;
uv = io->impl;
io->impl = NULL;
raft_free(uv);
}
void raft_uv_set_segment_size(struct raft_io *io, size_t size)
{
struct uv *uv;
uv = io->impl;
uv->segment_size = size;
}
void raft_uv_set_block_size(struct raft_io *io, size_t size)
{
struct uv *uv;
uv = io->impl;
uv->block_size = size;
}
int raft_uv_set_snapshot_compression(struct raft_io *io, bool compressed)
{
struct uv *uv;
uv = io->impl;
#ifndef LZ4_AVAILABLE
if (compressed) {
return RAFT_INVALID;
}
#endif
uv->snapshot_compression = compressed;
return 0;
}
void raft_uv_set_connect_retry_delay(struct raft_io *io, unsigned msecs)
{
struct uv *uv;
uv = io->impl;
uv->connect_retry_delay = msecs;
}
void raft_uv_set_tracer(struct raft_io *io, struct raft_tracer *tracer)
{
struct uv *uv;
uv = io->impl;
uv->tracer = tracer;
}
void raft_uv_set_auto_recovery(struct raft_io *io, bool flag)
{
struct uv *uv;
uv = io->impl;
uv->auto_recovery = flag;
}
#undef tracef
dqlite-1.16.7/src/raft/uv.h 0000664 0000000 0000000 00000037521 14652527134 0015437 0 ustar 00root root 0000000 0000000 /* Implementation of the @raft_io interface based on libuv. */
#ifndef UV_H_
#define UV_H_
#include "../raft.h"
#include "../tracing.h"
#include "err.h"
#include "../lib/queue.h"
#include "uv_fs.h"
#include "uv_os.h"
/* 8 Megabytes */
#define UV__MAX_SEGMENT_SIZE (8 * 1024 * 1024)
/* Template string for closed segment filenames: start index (inclusive), end
* index (inclusive). */
#define UV__CLOSED_TEMPLATE "%016llu-%016llu"
/* Template string for open segment filenames: incrementing counter. */
#define UV__OPEN_TEMPLATE "open-%llu"
/* Enough to hold a segment filename (either open or closed) */
#define UV__SEGMENT_FILENAME_BUF_SIZE 34
/* Template string for snapshot filenames: snapshot term, snapshot index,
* creation timestamp (milliseconds since epoch). */
#define UV__SNAPSHOT_TEMPLATE "snapshot-%llu-%llu-%llu"
#define UV__SNAPSHOT_META_SUFFIX ".meta"
/* Template string for snapshot metadata filenames: snapshot term, snapshot
* index, creation timestamp (milliseconds since epoch). */
#define UV__SNAPSHOT_META_TEMPLATE \
UV__SNAPSHOT_TEMPLATE UV__SNAPSHOT_META_SUFFIX
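/* Example filenames produced by the templates above (illustrative values): a
* closed segment holding entries 1 through 100 is named
* "0000000000000001-0000000000000100", an open segment with counter 7 is
* named "open-7", and a snapshot taken at term 2 and index 1000 with
* timestamp 1700000000000 is stored as "snapshot-2-1000-1700000000000" plus
* a sibling "snapshot-2-1000-1700000000000.meta" file. */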
/* State codes. */
enum {
UV__PRISTINE, /* Metadata cache populated and I/O capabilities probed */
UV__ACTIVE,
UV__CLOSED
};
/* Open segment counter type */
typedef unsigned long long uvCounter;
/* Information persisted in a single metadata file. */
struct uvMetadata
{
unsigned long long version; /* Monotonically increasing version */
raft_term term; /* Current term */
raft_id voted_for; /* Server ID of last vote, or 0 */
};
/* Hold state of a libuv-based raft_io implementation. */
struct uv
{
struct raft_io *io; /* I/O object we're implementing */
struct uv_loop_s *loop; /* UV event loop */
char dir[UV__DIR_LEN]; /* Data directory */
struct raft_uv_transport *transport; /* Network transport */
struct raft_tracer *tracer; /* Debug tracing */
raft_id id; /* Server ID */
int state; /* Current state */
bool snapshot_compression; /* If compression is enabled */
bool errored; /* If a disk I/O error was hit */
bool direct_io; /* Whether direct I/O is supported */
bool async_io; /* Whether async I/O is supported */
bool fallocate; /* Whether fallocate is supported */
size_t segment_size; /* Initial size of open segments. */
size_t block_size; /* Block size of the data dir */
queue clients; /* Outbound connections */
queue servers; /* Inbound connections */
unsigned connect_retry_delay; /* Client connection retry delay */
void *prepare_inflight; /* Segment being prepared */
queue prepare_reqs; /* Pending prepare requests. */
queue prepare_pool; /* Prepared open segments */
uvCounter prepare_next_counter; /* Counter of next open segment */
raft_index append_next_index; /* Index of next entry to append */
queue append_segments; /* Open segments in use. */
queue append_pending_reqs; /* Pending append requests. */
queue append_writing_reqs; /* Append requests in flight */
struct UvBarrier *barrier; /* Inflight barrier request */
queue finalize_reqs; /* Segments waiting to be closed */
struct uv_work_s finalize_work; /* Resize and rename segments */
struct uv_work_s truncate_work; /* Execute truncate log requests */
queue snapshot_get_reqs; /* Inflight get snapshot requests */
queue async_work_reqs; /* Inflight async work requests */
struct uv_work_s snapshot_put_work; /* Execute snapshot put requests */
struct uvMetadata metadata; /* Cache of metadata on disk */
struct uv_timer_s timer; /* Timer for periodic ticks */
raft_io_tick_cb tick_cb; /* Invoked when the timer expires */
raft_io_recv_cb recv_cb; /* Invoked upon receiving RPC messages */
queue aborting; /* Cleanups upon errors or shutdown */
bool closing; /* True if we are closing */
raft_io_close_cb close_cb; /* Invoked when finishing closing */
bool auto_recovery; /* Try to recover from corrupt segments */
};
/* Implementation of raft_io->truncate. */
int UvTruncate(struct raft_io *io, raft_index index);
/* Load Raft metadata from disk, choosing the most recent version (either the
* metadata1 or metadata2 file). */
int uvMetadataLoad(const char *dir, struct uvMetadata *metadata, char *errmsg);
/* Store the given metadata to disk, writing the appropriate metadata file
* according to the metadata version (if the version is odd, write metadata1,
* otherwise write metadata2). */
int uvMetadataStore(struct uv *uv, const struct uvMetadata *metadata);
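/* For example, version 3 is stored in metadata1 and version 4 in metadata2:
* the two files alternate, so the highest surviving version can always be
* picked at load time even if the other file was torn by a crash. */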
/* Metadata about a segment file. */
struct uvSegmentInfo
{
bool is_open; /* Whether the segment is open */
union {
struct
{
raft_index
first_index; /* First index in a closed segment */
raft_index
end_index; /* Last index in a closed segment */
};
struct
{
unsigned long long counter; /* Open segment counter */
};
};
char filename[UV__SEGMENT_FILENAME_BUF_SIZE]; /* Segment filename */
};
/* Append a new item to the given segment info list if the given filename
* matches either the one of a closed segment (xxx-yyy) or the one of an open
* segment (open-xxx). */
int uvSegmentInfoAppendIfMatch(const char *filename,
struct uvSegmentInfo *infos[],
size_t *n_infos,
bool *appended);
/* Sort the given list of segments by comparing their filenames. Closed segments
* come before open segments. */
void uvSegmentSort(struct uvSegmentInfo *infos, size_t n_infos);
/* Keep only the closed segments whose entries are within the given trailing
* amount past the given snapshot last index. If the given trailing amount is 0,
* unconditionally delete all closed segments. */
int uvSegmentKeepTrailing(struct uv *uv,
struct uvSegmentInfo *segments,
size_t n,
raft_index last_index,
size_t trailing,
char *errmsg);
/* Load all entries contained in the given closed segment. */
int uvSegmentLoadClosed(struct uv *uv,
struct uvSegmentInfo *segment,
struct raft_entry *entries[],
size_t *n);
/* Load raft entries from the given segments. The @start_index is the expected
* index of the first entry of the first segment. */
int uvSegmentLoadAll(struct uv *uv,
const raft_index start_index,
struct uvSegmentInfo *segments,
size_t n_segments,
struct raft_entry **entries,
size_t *n_entries);
/* Return the number of blocks in a segment. */
#define uvSegmentBlocks(UV) (UV->segment_size / UV->block_size)
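/* E.g. with the default UV__MAX_SEGMENT_SIZE of 8 MiB and a 4096-byte block
* size, uvSegmentBlocks evaluates to 2048. */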
/* A dynamically allocated buffer holding data to be written into a segment
* file.
*
* The memory is aligned at disk block boundary, to allow for direct I/O. */
struct uvSegmentBuffer
{
size_t block_size; /* Disk block size for direct I/O */
uv_buf_t arena; /* Previously allocated memory that can be re-used */
size_t n; /* Write offset */
};
/* Initialize an empty buffer. */
void uvSegmentBufferInit(struct uvSegmentBuffer *b, size_t block_size);
/* Release all memory used by the buffer. */
void uvSegmentBufferClose(struct uvSegmentBuffer *b);
/* Encode the format version at the very beginning of the buffer. This function
* must be called when the buffer is empty. */
int uvSegmentBufferFormat(struct uvSegmentBuffer *b);
/* Extend the segment's buffer by encoding the given entries.
*
* Previous data in the buffer will be retained, and data for these new entries
* will be appended. */
int uvSegmentBufferAppend(struct uvSegmentBuffer *b,
const struct raft_entry entries[],
unsigned n_entries);
/* After all entries to write have been encoded, finalize the buffer by zeroing
* the unused memory of the last block. The out parameter will point to the
* memory to write. */
void uvSegmentBufferFinalize(struct uvSegmentBuffer *b, uv_buf_t *out);
/* Reset the buffer preparing it for the next segment write.
*
* If the retain parameter is greater than zero, then the data of the retain'th
* block will be copied at the beginning of the buffer and the write offset will
* be set accordingly. */
void uvSegmentBufferReset(struct uvSegmentBuffer *b, unsigned retain);
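/* Typical lifecycle of a segment buffer, based on the declarations above (a
* sketch with error handling omitted):
*
*   struct uvSegmentBuffer buf;
*   uv_buf_t out;
*   uvSegmentBufferInit(&buf, block_size);
*   uvSegmentBufferFormat(&buf);              // only for a brand new segment
*   uvSegmentBufferAppend(&buf, entries, n);  // encode a batch of entries
*   uvSegmentBufferFinalize(&buf, &out);      // out now points at the data
*   // ... submit the write of `out`, then on completion:
*   uvSegmentBufferReset(&buf, retain);       // keep the partial last block
*   uvSegmentBufferClose(&buf);
*/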
/* Write a closed segment, containing just one entry at the given index
* for the given configuration. */
int uvSegmentCreateClosedWithConfiguration(
struct uv *uv,
raft_index index,
const struct raft_configuration *configuration);
/* Write the first closed segment, containing just one entry for the given
* configuration. */
int uvSegmentCreateFirstClosed(struct uv *uv,
const struct raft_configuration *configuration);
/* Truncate a segment that was already closed. */
int uvSegmentTruncate(struct uv *uv,
struct uvSegmentInfo *segment,
raft_index index);
/* Info about a persisted snapshot stored in snapshot metadata file. */
struct uvSnapshotInfo
{
raft_term term;
raft_index index;
unsigned long long timestamp;
char filename[UV__FILENAME_LEN];
};
/* Render the filename of the data file of a snapshot */
void uvSnapshotFilenameOf(struct uvSnapshotInfo *info, char *filename);
/* Upon success `orphan` will be true if filename is a snapshot file without a
* sibling .meta file */
int UvSnapshotIsOrphan(const char *dir, const char *filename, bool *orphan);
/* Upon success `orphan` will be true if filename is a snapshot .meta file
* without a sibling snapshot file */
int UvSnapshotMetaIsOrphan(const char *dir, const char *filename, bool *orphan);
/* Append a new item to the given snapshot info list if the given filename
* matches the pattern of a snapshot metadata file (snapshot-xxx-yyy-zzz.meta)
* and there is actually a matching non-empty snapshot file on disk. */
int UvSnapshotInfoAppendIfMatch(struct uv *uv,
const char *filename,
struct uvSnapshotInfo *infos[],
size_t *n_infos,
bool *appended);
/* Sort the given list of snapshots by comparing their filenames. Older
* snapshots will come first. */
void UvSnapshotSort(struct uvSnapshotInfo *infos, size_t n_infos);
/* Load the snapshot associated with the given metadata. */
int UvSnapshotLoad(struct uv *uv,
struct uvSnapshotInfo *meta,
struct raft_snapshot *snapshot,
char *errmsg);
/* Implementation of raft_io->snapshot_put (defined in uv_snapshot.c). */
int UvSnapshotPut(struct raft_io *io,
unsigned trailing,
struct raft_io_snapshot_put *req,
const struct raft_snapshot *snapshot,
raft_io_snapshot_put_cb cb);
/* Implementation of raft_io->snapshot_get (defined in uv_snapshot.c). */
int UvSnapshotGet(struct raft_io *io,
struct raft_io_snapshot_get *req,
raft_io_snapshot_get_cb cb);
/* Implementation of raft_io->async_work (defined in uv_work.c). */
int UvAsyncWork(struct raft_io *io,
struct raft_io_async_work *req,
raft_io_async_work_cb cb);
/* Return a list of all snapshots and segments found in the data directory. Both
* snapshots and segments are ordered by filename (closed segments come before
* open ones). */
int UvList(struct uv *uv,
struct uvSnapshotInfo *snapshots[],
size_t *n_snapshots,
struct uvSegmentInfo *segments[],
size_t *n_segments,
char *errmsg);
/* Request to obtain a newly prepared open segment. */
struct uvPrepare;
typedef void (*uvPrepareCb)(struct uvPrepare *req, int status);
struct uvPrepare
{
void *data; /* User data */
uv_file fd; /* Resulting segment file descriptor */
unsigned long long counter; /* Resulting segment counter */
uvPrepareCb cb; /* Completion callback */
queue queue; /* Links in uv_io->prepare_reqs */
};
/* Get a prepared open segment ready for writing. If a prepared open segment is
* already available in the pool, it will be returned immediately using the fd
* and counter pointers and the request callback won't be invoked. Otherwise the
* request will be queued and its callback invoked once a newly prepared segment
* is available. */
int UvPrepare(struct uv *uv,
uv_file *fd,
uvCounter *counter,
struct uvPrepare *req,
uvPrepareCb cb);
/* Cancel all pending prepare requests and start removing all unused prepared
* open segments. If a segment is currently being created, wait for it to complete
* and then remove it immediately. */
void UvPrepareClose(struct uv *uv);
/* Implementation of raft_io->append. All the raft_buffers of the raft_entry
* structs in the entries array are required to have a len that is a multiple
* of 8. */
int UvAppend(struct raft_io *io,
struct raft_io_append *req,
const struct raft_entry entries[],
unsigned n,
raft_io_append_cb cb);
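/* A minimal caller-side sketch (hypothetical names; note that each entry's
* buf.len must be a multiple of 8, as stated above):
*
*   static void appendCb(struct raft_io_append *req, int status)
*   {
*           // status is 0 on success, or e.g. RAFT_IOERR on write failure
*   }
*
*   struct raft_io_append req;
*   struct raft_entry entry = { .term = 1, .type = RAFT_COMMAND };
*   entry.buf.len = 8;
*   entry.buf.base = raft_malloc(entry.buf.len);
*   UvAppend(io, &req, &entry, 1, appendCb);
*/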
/* Pause request object and callback. */
struct UvBarrierReq;
/* A barrier cb that plans to perform work on the threadpool MUST exit early
* and clean up resources when it detects uv->closing; this allows forced
* closing on shutdown. */
typedef void (*UvBarrierCb)(struct UvBarrierReq *req);
struct UvBarrierReq
{
bool blocking; /* Whether this barrier should block future writes */
void *data; /* User data */
UvBarrierCb cb; /* Completion callback */
queue queue; /* Queue of reqs triggered by a UvBarrier */
};
struct UvBarrier
{
bool blocking; /* Whether this barrier should block future writes */
queue reqs; /* Queue of UvBarrierReq */
};
/* Submit a barrier request to interrupt the normal flow of append
* operations.
*
* The following will happen:
*
* - Replace uv->append_next_index with the given next_index, so the next entry
* that will be appended will have the new index.
*
* - Execution of new writes for subsequent append requests will be blocked
* until UvUnblock is called when the barrier is blocking.
*
* - Wait for all currently pending and inflight append requests against all
* open segments to complete, and for those open segments to be finalized,
* then invoke the barrier callback.
*
* This API is used to implement truncate and snapshot install operations, which
* need to wait until all pending writes have settled and modify the log state,
* changing the next index. */
int UvBarrier(struct uv *uv, raft_index next_index, struct UvBarrierReq *req);
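/* Sketch of how a blocking barrier might be used by a truncate-like
* operation (hypothetical callback name; see the UvBarrierCb note above
* about checking uv->closing):
*
*   static void barrierCb(struct UvBarrierReq *req)
*   {
*           struct uv *uv = req->data;
*           if (uv->closing) {
*                   return;  // abort, shutdown is in progress
*           }
*           // ... modify the log state, then resume appends:
*           UvUnblock(uv);
*   }
*
*   req->blocking = true;
*   req->data = uv;
*   req->cb = barrierCb;
*   UvBarrier(uv, next_index, req);
*/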
/* Trigger a callback for a barrier request in this @barrier. Returns true if a
* callback was triggered, false if there are no more requests to trigger.
* A barrier callback will call UvUnblock, which in turn will try to run the
* next callback, if any, from a barrier request in this barrier. */
bool UvBarrierMaybeTrigger(struct UvBarrier *barrier);
/* Add a Barrier @req to an existing @barrier. */
void UvBarrierAddReq(struct UvBarrier *barrier, struct UvBarrierReq *req);
/* Returns @true if there are no more segments referencing uv->barrier */
bool UvBarrierReady(struct uv *uv);
/* Resume writing append requests after UvBarrier has been called. */
void UvUnblock(struct uv *uv);
/* Cancel all pending write requests and request the current segment to be
* finalized. Must be invoked at closing time. */
void uvAppendClose(struct uv *uv);
/* Submit a request to finalize the open segment with the given counter.
*
* Requests are processed one at a time, to avoid ending up closing open segment
* N + 1 before closing open segment N. */
int UvFinalize(struct uv *uv,
unsigned long long counter,
size_t used,
raft_index first_index,
raft_index last_index);
/* Implementation of raft_io->send. */
int UvSend(struct raft_io *io,
struct raft_io_send *req,
const struct raft_message *message,
raft_io_send_cb cb);
/* Stop all clients by closing the outbound stream handles and canceling all
* pending send requests. */
void UvSendClose(struct uv *uv);
/* Start receiving messages from new incoming connections. */
int UvRecvStart(struct uv *uv);
/* Stop all servers by closing the inbound stream handles and aborting all
* requests being received. */
void UvRecvClose(struct uv *uv);
void uvMaybeFireCloseCb(struct uv *uv);
#endif /* UV_H_ */
dqlite-1.16.7/src/raft/uv_append.c 0000664 0000000 0000000 00000072014 14652527134 0016755 0 ustar 00root root 0000000 0000000 #include "assert.h"
#include "byte.h"
#include "heap.h"
#include "../lib/queue.h"
#include "uv.h"
#include "uv_encoding.h"
#include "uv_writer.h"
/* The happy path for an append request is:
*
* - If there is a current segment and it has enough spare capacity to hold
* the entries in the request, then queue the request, linking it to the
* current segment.
*
* - If there is no current segment, or it doesn't have enough spare capacity to hold
* the entries in the request, then request a new open segment to be prepared,
* queue the request and link it to the newly requested segment.
*
* - Wait for any pending write against the current segment to complete, and
* also for the prepare request if we asked for a new segment. Also wait for
* any in progress barrier to be removed.
*
* - Submit a write request for the entries in this append request. The write
* request might contain other append requests targeted to the current segment
* that might have accumulated in the meantime, if we have been waiting for a
* segment to be prepared, or for the previous write to complete or for a
* barrier to be removed.
*
* - Wait for the write request to finish and fire the append request's
* callback.
*
* Possible failure modes are:
*
* - The request to prepare a new segment fails.
* - The write request fails.
* - The request to finalize a new segment fails to be submitted.
*
* In all these cases we mark the instance as errored and fire the relevant
* callbacks.
**/
/* An open segment being written or waiting to be written. */
struct uvAliveSegment
{
struct uv *uv; /* Our writer */
struct uvPrepare prepare; /* Prepare segment file request */
struct UvWriter writer; /* Writer to perform async I/O */
struct UvWriterReq write; /* Write request */
unsigned long long counter; /* Open segment counter */
raft_index first_index; /* Index of the first entry written */
raft_index pending_last_index; /* Index of the last entry written */
size_t size; /* Total number of bytes used */
unsigned next_block; /* Next segment block to write */
struct uvSegmentBuffer pending; /* Buffer for data yet to be written */
uv_buf_t buf; /* Write buffer for current write */
raft_index last_index; /* Last entry actually written */
size_t written; /* Number of bytes actually written */
queue queue; /* Segment queue */
struct UvBarrier *barrier; /* Barrier waiting on this segment */
bool finalize; /* Finalize the segment after writing */
};
struct uvAppend
{
struct raft_io_append *req; /* User request */
const struct raft_entry *entries; /* Entries to write */
unsigned n; /* Number of entries */
struct uvAliveSegment *segment; /* Segment to write to */
queue queue;
};
static void uvAliveSegmentWriterCloseCb(struct UvWriter *writer)
{
struct uvAliveSegment *segment = writer->data;
struct uv *uv = segment->uv;
uvSegmentBufferClose(&segment->pending);
RaftHeapFree(segment);
uvMaybeFireCloseCb(uv);
}
/* Submit a request to close the current open segment. */
static void uvAliveSegmentFinalize(struct uvAliveSegment *s)
{
struct uv *uv = s->uv;
int rv;
rv = UvFinalize(uv, s->counter, s->written, s->first_index,
s->last_index);
if (rv != 0) {
uv->errored = true;
/* We failed to submit the finalize request, but let's still
* close the file handle and release the segment memory. */
}
queue_remove(&s->queue);
UvWriterClose(&s->writer, uvAliveSegmentWriterCloseCb);
}
/* Flush the append requests in the given queue, firing their callbacks with the
* given status. */
static void uvAppendFinishRequestsInQueue(struct uv *uv, queue *q, int status)
{
queue queue_copy;
struct uvAppend *append;
queue_init(&queue_copy);
while (!queue_empty(q)) {
queue *head;
head = queue_head(q);
append = QUEUE_DATA(head, struct uvAppend, queue);
/* Rollback the append next index if the result was
* unsuccessful. */
if (status != 0) {
tracef("rollback uv->append_next_index was:%llu",
uv->append_next_index);
uv->append_next_index -= append->n;
tracef("rollback uv->append_next_index now:%llu",
uv->append_next_index);
}
queue_remove(head);
queue_insert_tail(&queue_copy, head);
}
while (!queue_empty(&queue_copy)) {
queue *head;
struct raft_io_append *req;
head = queue_head(&queue_copy);
append = QUEUE_DATA(head, struct uvAppend, queue);
queue_remove(head);
req = append->req;
RaftHeapFree(append);
req->cb(req, status);
}
}
/* Flush the append requests in the writing queue, firing their callbacks with
* the given status. */
static void uvAppendFinishWritingRequests(struct uv *uv, int status)
{
uvAppendFinishRequestsInQueue(uv, &uv->append_writing_reqs, status);
}
/* Flush the append requests in the pending queue, firing their callbacks with
* the given status. */
static void uvAppendFinishPendingRequests(struct uv *uv, int status)
{
uvAppendFinishRequestsInQueue(uv, &uv->append_pending_reqs, status);
}
/* Return the segment currently being written, or NULL when no segment has been
* written yet. */
static struct uvAliveSegment *uvGetCurrentAliveSegment(struct uv *uv)
{
queue *head;
if (queue_empty(&uv->append_segments)) {
return NULL;
}
head = queue_head(&uv->append_segments);
return QUEUE_DATA(head, struct uvAliveSegment, queue);
}
/* Extend the segment's write buffer by encoding the entries in the given
* request into it. IOW, previous data in the write buffer will be retained, and
* data for these new entries will be appended. */
static int uvAliveSegmentEncodeEntriesToWriteBuf(struct uvAliveSegment *segment,
struct uvAppend *append)
{
int rv;
assert(append->segment == segment);
/* If this is the very first write to the segment, we need to include
* the format version */
if (segment->pending.n == 0 && segment->next_block == 0) {
rv = uvSegmentBufferFormat(&segment->pending);
if (rv != 0) {
return rv;
}
}
rv = uvSegmentBufferAppend(&segment->pending, append->entries,
append->n);
if (rv != 0) {
return rv;
}
segment->pending_last_index += append->n;
return 0;
}
static int uvAppendMaybeStart(struct uv *uv);
static void uvAliveSegmentWriteCb(struct UvWriterReq *write, const int status)
{
struct uvAliveSegment *s = write->data;
struct uv *uv = s->uv;
unsigned n_blocks;
int rv;
assert(uv->state != UV__CLOSED);
assert(s->buf.len % uv->block_size == 0);
assert(s->buf.len >= uv->block_size);
/* Check if the write was successful. */
if (status != 0) {
tracef("write: %s", uv->io->errmsg);
uv->errored = true;
goto out;
}
s->written = s->next_block * uv->block_size + s->pending.n;
s->last_index = s->pending_last_index;
/* Update our write markers.
*
* We have four cases:
*
* - The data fit completely in the leftover space of the first block
* that we wrote and there is more space left. In this case we just keep
* the scheduled marker unchanged.
*
* - The data fit completely in the leftover space of the first block
* that we wrote and there is no space left. In this case we advance the
* current block counter, reset the first write block and set the
* scheduled marker to 0.
*
* - The data did not fit completely in the leftover space of the first
* block that we wrote, so we wrote more than one block. The last
* block that we wrote was not filled completely and has leftover space.
* In this case we advance the current block counter and copy the memory
* used for the last block to the head of the write arena list, updating
* the scheduled marker accordingly.
*
* - The data did not fit completely in the leftover space of the first
* block that we wrote, so we wrote more than one block. The last
* block that we wrote was filled exactly and has no leftover space. In
* this case we advance the current block counter, reset the first
* buffer and set the scheduled marker to 0.
*/
n_blocks = (unsigned)(s->buf.len /
uv->block_size); /* Number of blocks written. */
if (s->pending.n < uv->block_size) {
/* Nothing to do */
assert(n_blocks == 1);
} else if (s->pending.n == uv->block_size) {
assert(n_blocks == 1);
s->next_block++;
uvSegmentBufferReset(&s->pending, 0);
} else {
assert(s->pending.n > uv->block_size);
assert(s->buf.len > uv->block_size);
if (s->pending.n % uv->block_size > 0) {
s->next_block += n_blocks - 1;
uvSegmentBufferReset(&s->pending, n_blocks - 1);
} else {
s->next_block += n_blocks;
uvSegmentBufferReset(&s->pending, 0);
}
}
out:
/* Fire the callbacks of all requests that were fulfilled with this
* write. */
uvAppendFinishWritingRequests(uv, status);
if (status != 0) {
/* When the write has failed additionally cancel all future
* append related activity. This will also rewind
* uv->append_next_index. All append requests need to be
* canceled because raft assumes all appends happen in order and
* if an append fails (and is not retried), we would be missing
* a sequence of log entries on disk. The implementation can't
* handle that + the accounting of the append index would be
* off.
*/
uvAppendFinishPendingRequests(uv, status);
/* Allow this segment to be finalized further down. Don't bother
* rewinding state to possibly reuse the segment for writing;
* it's too bug-prone. */
s->pending_last_index = s->last_index;
s->finalize = true;
}
/* During the closing sequence we should have already canceled all
* pending requests. */
if (uv->closing) {
assert(queue_empty(&uv->append_pending_reqs));
assert(s->finalize);
uvAliveSegmentFinalize(s);
return;
}
/* Possibly process waiting requests. */
if (!queue_empty(&uv->append_pending_reqs)) {
rv = uvAppendMaybeStart(uv);
if (rv != 0) {
uv->errored = true;
}
} else if (s->finalize && (s->pending_last_index == s->last_index) &&
!s->writer.closing) {
/* If there are no more append_pending_reqs or write requests in
* flight, this segment must be finalized here in case we don't
* receive AppendEntries RPCs anymore (could happen during a
* Snapshot install, causing the BarrierCb to never fire), but
* check that the callbacks that fired after completion of this
* write didn't already close the segment. */
uvAliveSegmentFinalize(s);
}
}
/* Submit a file write request to append the entries encoded in the write buffer
* of the given segment. */
static int uvAliveSegmentWrite(struct uvAliveSegment *s)
{
int rv;
assert(s->counter != 0);
assert(s->pending.n > 0);
uvSegmentBufferFinalize(&s->pending, &s->buf);
rv = UvWriterSubmit(&s->writer, &s->write, &s->buf, 1,
s->next_block * s->uv->block_size,
uvAliveSegmentWriteCb);
if (rv != 0) {
return rv;
}
return 0;
}
/* Start writing all pending append requests for the current segment, unless we
* are already writing, or the segment itself has not yet been prepared or we
* are blocked on a barrier. If there are no more requests targeted at the
* current segment, make sure it's marked to be finalized and try with the next
* segment. */
static int uvAppendMaybeStart(struct uv *uv)
{
struct uvAliveSegment *segment;
struct uvAppend *append;
unsigned n_reqs;
queue *head;
queue q;
int rv;
assert(!uv->closing);
assert(!queue_empty(&uv->append_pending_reqs));
/* If we are already writing, let's wait. */
if (!queue_empty(&uv->append_writing_reqs)) {
return 0;
}
start:
segment = uvGetCurrentAliveSegment(uv);
assert(segment != NULL);
/* If the preparer isn't done yet, let's wait. */
if (segment->counter == 0) {
return 0;
}
/* If there's a blocking barrier in progress, and it's not waiting for
* this segment to be finalized, let's wait.
*
* FIXME shouldn't we wait even if segment->barrier == uv->barrier, if
* there are other open segments associated with the same barrier? */
if (uv->barrier != NULL && segment->barrier != uv->barrier &&
uv->barrier->blocking) {
return 0;
}
/* If there's no barrier in progress and this segment is marked with a
* barrier, it means that this was a pending barrier, which can
* become the current barrier now. */
if (uv->barrier == NULL && segment->barrier != NULL) {
uv->barrier = segment->barrier;
}
/* Let's add to the segment's write buffer all pending requests targeted
* to this segment. */
queue_init(&q);
n_reqs = 0;
while (!queue_empty(&uv->append_pending_reqs)) {
head = queue_head(&uv->append_pending_reqs);
append = QUEUE_DATA(head, struct uvAppend, queue);
assert(append->segment != NULL);
if (append->segment != segment) {
break; /* Not targeted to this segment */
}
queue_remove(head);
queue_insert_tail(&q, head);
n_reqs++;
rv = uvAliveSegmentEncodeEntriesToWriteBuf(segment, append);
if (rv != 0) {
goto err;
}
}
/* If we have no more requests for this segment, let's check if it has
* been marked for closing, and in that case finalize it and possibly
* trigger a write against the next segment (unless there is a truncate
* request, in which case we need to wait for it). Otherwise it must mean
* we have exhausted the queue of pending append requests. */
if (n_reqs == 0) {
assert(queue_empty(&uv->append_writing_reqs));
if (segment->finalize) {
uvAliveSegmentFinalize(segment);
if (!queue_empty(&uv->append_pending_reqs)) {
goto start;
}
}
assert(queue_empty(&uv->append_pending_reqs));
return 0;
}
while (!queue_empty(&q)) {
head = queue_head(&q);
queue_remove(head);
queue_insert_tail(&uv->append_writing_reqs, head);
}
rv = uvAliveSegmentWrite(segment);
if (rv != 0) {
goto err;
}
return 0;
err:
assert(rv != 0);
return rv;
}
/* Invoked when a newly added open segment becomes ready for writing, after the
* associated UvPrepare request completes (either synchronously or
* asynchronously). */
static int uvAliveSegmentReady(struct uv *uv,
uv_file fd,
uvCounter counter,
struct uvAliveSegment *segment)
{
int rv;
rv = UvWriterInit(&segment->writer, uv->loop, fd, uv->direct_io,
uv->async_io, 1, uv->io->errmsg);
if (rv != 0) {
ErrMsgWrapf(uv->io->errmsg, "setup writer for open-%llu",
counter);
return rv;
}
segment->counter = counter;
return 0;
}
static void uvAliveSegmentPrepareCb(struct uvPrepare *req, int status)
{
struct uvAliveSegment *segment = req->data;
struct uv *uv = segment->uv;
int rv;
assert(segment->counter == 0);
assert(segment->written == 0);
/* If we have been closed, let's discard the segment. */
if (uv->closing) {
queue_remove(&segment->queue);
assert(status ==
RAFT_CANCELED); /* UvPrepare cancels pending reqs */
uvSegmentBufferClose(&segment->pending);
RaftHeapFree(segment);
return;
}
if (status != 0) {
tracef("prepare segment failed (%d)", status);
rv = status;
goto err;
}
assert(req->counter > 0);
assert(req->fd >= 0);
/* There must be pending appends that were waiting for this prepare
* request. */
assert(!queue_empty(&uv->append_pending_reqs));
rv = uvAliveSegmentReady(uv, req->fd, req->counter, segment);
if (rv != 0) {
tracef("prepare segment ready failed (%d)", rv);
goto err;
}
rv = uvAppendMaybeStart(uv);
if (rv != 0) {
tracef("prepare segment start failed (%d)", rv);
goto err;
}
return;
err:
queue_remove(&segment->queue);
RaftHeapFree(segment);
uv->errored = true;
uvAppendFinishPendingRequests(uv, rv);
}
/* Initialize a new open segment object. */
static void uvAliveSegmentInit(struct uvAliveSegment *s, struct uv *uv)
{
s->uv = uv;
s->prepare.data = s;
s->writer.data = s;
s->write.data = s;
s->counter = 0;
s->first_index = uv->append_next_index;
s->pending_last_index = s->first_index - 1;
s->last_index = 0;
s->size = sizeof(uint64_t) /* Format version */;
s->next_block = 0;
uvSegmentBufferInit(&s->pending, uv->block_size);
s->written = 0;
s->barrier = NULL;
s->finalize = false;
}
/* Add a new active open segment, since the append request being submitted does
* not fit in the last segment we scheduled writes for, or no segment had been
* previously requested at all. */
static int uvAppendPushAliveSegment(struct uv *uv)
{
struct uvAliveSegment *segment;
uv_file fd;
uvCounter counter;
int rv;
segment = RaftHeapMalloc(sizeof *segment);
if (segment == NULL) {
rv = RAFT_NOMEM;
goto err;
}
uvAliveSegmentInit(segment, uv);
queue_insert_tail(&uv->append_segments, &segment->queue);
rv = UvPrepare(uv, &fd, &counter, &segment->prepare,
uvAliveSegmentPrepareCb);
if (rv != 0) {
goto err_after_alloc;
}
/* If we've been returned a ready prepared segment right away, start
* writing to it immediately. */
if (fd != -1) {
rv = uvAliveSegmentReady(uv, fd, counter, segment);
if (rv != 0) {
goto err_after_prepare;
}
}
return 0;
err_after_prepare:
UvOsClose(fd);
UvFinalize(uv, counter, 0, 0, 0);
err_after_alloc:
queue_remove(&segment->queue);
RaftHeapFree(segment);
err:
assert(rv != 0);
return rv;
}
/* Return the last segment that we have requested to prepare. */
static struct uvAliveSegment *uvGetLastAliveSegment(struct uv *uv)
{
queue *tail;
if (queue_empty(&uv->append_segments)) {
return NULL;
}
tail = queue_tail(&uv->append_segments);
return QUEUE_DATA(tail, struct uvAliveSegment, queue);
}
/* Return #true if the remaining capacity of the given segment is equal to or
* greater than @size. */
static bool uvAliveSegmentHasEnoughSpareCapacity(struct uvAliveSegment *s,
size_t size)
{
return s->size + size <= s->uv->segment_size;
}
/* Add @size bytes to the number of bytes that the segment will hold. The actual
* write will happen when the previous write completes, if any. */
static void uvAliveSegmentReserveSegmentCapacity(struct uvAliveSegment *s,
size_t size)
{
s->size += size;
}
/* Return the number of bytes needed to store the batch of entries of this
* append request on disk. */
static size_t uvAppendSize(struct uvAppend *a)
{
size_t size = sizeof(uint32_t) * 2; /* CRC checksums */
unsigned i;
size += uvSizeofBatchHeader(a->n, true); /* Batch header */
for (i = 0; i < a->n; i++) { /* Entries data */
size += bytePad64(a->entries[i].buf.len);
}
return size;
}
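/* Worked example for uvAppendSize (illustrative; the batch header size
* depends on uvSizeofBatchHeader): for two entries of 10 and 16 bytes, the
* entry data takes bytePad64(10) + bytePad64(16) = 16 + 16 = 32 bytes, on
* top of the 8 bytes of CRC checksums and the batch header itself. */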
/* Enqueue an append entries request, assigning it to the appropriate active
* open segment. */
static int uvAppendEnqueueRequest(struct uv *uv, struct uvAppend *append)
{
struct uvAliveSegment *segment;
size_t size;
bool fits;
int rv;
assert(append->entries != NULL);
assert(append->n > 0);
assert(uv->append_next_index > 0);
tracef("enqueue %u entries", append->n);
size = uvAppendSize(append);
/* If we have no segments yet, it means this is the very first append,
* and we need to add a new segment. Otherwise we check if the last
* segment has enough room for this batch of entries. */
segment = uvGetLastAliveSegment(uv);
if (segment == NULL || segment->finalize) {
fits = false;
} else {
fits = uvAliveSegmentHasEnoughSpareCapacity(segment, size);
if (!fits) {
segment->finalize =
true; /* Finalize when all writes are done */
}
}
/* If there's no segment or if this batch does not fit in this segment,
* we need to add a new one. */
if (!fits) {
rv = uvAppendPushAliveSegment(uv);
if (rv != 0) {
goto err;
}
}
segment = uvGetLastAliveSegment(uv); /* Get the last added segment */
uvAliveSegmentReserveSegmentCapacity(segment, size);
append->segment = segment;
queue_insert_tail(&uv->append_pending_reqs, &append->queue);
uv->append_next_index += append->n;
tracef("set uv->append_next_index %llu", uv->append_next_index);
return 0;
err:
assert(rv != 0);
return rv;
}
/* Check that every entry buffer has a length that is a multiple of 8. */
static int uvCheckEntryBuffersAligned(struct uv *uv,
const struct raft_entry entries[],
unsigned n)
{
unsigned i;
for (i = 0; i < n; i++) {
if (entries[i].buf.len % 8) {
ErrMsgPrintf(uv->io->errmsg,
"entry buffers must be 8-byte aligned");
tracef("%s", uv->io->errmsg);
return RAFT_INVALID;
}
}
return 0;
}
int UvAppend(struct raft_io *io,
struct raft_io_append *req,
const struct raft_entry entries[],
unsigned n,
raft_io_append_cb cb)
{
struct uv *uv;
struct uvAppend *append;
int rv;
uv = io->impl;
assert(!uv->closing);
append = RaftHeapCalloc(1, sizeof *append);
if (append == NULL) {
rv = RAFT_NOMEM;
goto err;
}
append->req = req;
append->entries = entries;
append->n = n;
req->cb = cb;
rv = uvCheckEntryBuffersAligned(uv, entries, n);
if (rv != 0) {
goto err_after_req_alloc;
}
rv = uvAppendEnqueueRequest(uv, append);
if (rv != 0) {
goto err_after_req_alloc;
}
assert(append->segment != NULL);
assert(!queue_empty(&uv->append_pending_reqs));
/* Try to write immediately. */
rv = uvAppendMaybeStart(uv);
if (rv != 0) {
return rv;
}
return 0;
err_after_req_alloc:
RaftHeapFree(append);
err:
assert(rv != 0);
return rv;
}
/* Finalize the current segment as soon as all its pending or inflight append
* requests get completed. */
static void uvFinalizeCurrentAliveSegmentOnceIdle(struct uv *uv)
{
struct uvAliveSegment *s;
queue *head;
bool has_pending_reqs;
bool has_writing_reqs;
s = uvGetCurrentAliveSegment(uv);
if (s == NULL) {
return;
}
/* Check if there are pending append requests targeted to the current
* segment. */
has_pending_reqs = false;
QUEUE_FOREACH(head, &uv->append_pending_reqs)
{
struct uvAppend *r = QUEUE_DATA(head, struct uvAppend, queue);
if (r->segment == s) {
has_pending_reqs = true;
break;
}
}
has_writing_reqs = !queue_empty(&uv->append_writing_reqs);
/* If there is no pending append request or inflight write against the
* current segment, we can submit a request for it to be closed
* immediately. Otherwise, we set the finalize flag.
*
* TODO: is it actually possible to have pending requests with no
* writing requests? Probably not. */
if (!has_pending_reqs && !has_writing_reqs) {
uvAliveSegmentFinalize(s);
} else {
s->finalize = true;
}
}
bool UvBarrierReady(struct uv *uv)
{
if (uv->barrier == NULL) {
return true;
}
queue *head;
QUEUE_FOREACH(head, &uv->append_segments)
{
struct uvAliveSegment *segment;
segment = QUEUE_DATA(head, struct uvAliveSegment, queue);
if (segment->barrier == uv->barrier) {
return false;
}
}
return true;
}
bool UvBarrierMaybeTrigger(struct UvBarrier *barrier)
{
if (!barrier) {
return false;
}
if (!queue_empty(&barrier->reqs)) {
queue *head;
struct UvBarrierReq *r;
head = queue_head(&barrier->reqs);
queue_remove(head);
r = QUEUE_DATA(head, struct UvBarrierReq, queue);
r->cb(r);
return true;
}
return false;
}
/* Used during cleanup. */
static void uvBarrierTriggerAll(struct UvBarrier *barrier)
{
while (UvBarrierMaybeTrigger(barrier)) {
;
}
}
static struct UvBarrier *uvBarrierCreate(void)
{
struct UvBarrier *barrier;
barrier = RaftHeapCalloc(1, sizeof(*barrier));
if (!barrier) {
return NULL;
}
barrier->blocking = false;
queue_init(&barrier->reqs);
return barrier;
}
int UvBarrier(struct uv *uv, raft_index next_index, struct UvBarrierReq *req)
{
/* The barrier to attach to. */
struct UvBarrier *barrier = NULL;
struct uvAliveSegment *segment = NULL;
queue *head;
assert(!uv->closing);
/* The next entry will be appended at this index. */
uv->append_next_index = next_index;
tracef("UvBarrier uv->append_next_index:%llu", uv->append_next_index);
/* Arrange for all open segments not already involved in other barriers
* to be finalized as soon as their append requests get completed and
* mark them as involved in this specific barrier request. */
QUEUE_FOREACH(head, &uv->append_segments)
{
segment = QUEUE_DATA(head, struct uvAliveSegment, queue);
if (segment->barrier != NULL) {
/* If a non-blocking barrier precedes this blocking
* request, we want to also block all future writes. */
if (req->blocking) {
segment->barrier->blocking = true;
}
continue;
}
if (!barrier) {
barrier = uvBarrierCreate();
if (!barrier) {
return RAFT_NOMEM;
}
/* And add the request to the barrier. */
UvBarrierAddReq(barrier, req);
}
segment->barrier = barrier;
if (segment == uvGetCurrentAliveSegment(uv)) {
uvFinalizeCurrentAliveSegmentOnceIdle(uv);
continue;
}
segment->finalize = true;
}
/* Unable to attach to a segment, because all segments are involved in a
* barrier, or there are no segments. */
if (barrier == NULL) {
/* Attach req to last segment barrier. */
if (segment != NULL) {
barrier = segment->barrier;
/* There is no segment, attach to uv->barrier. */
} else if (uv->barrier != NULL) {
barrier = uv->barrier;
/* There is no uv->barrier, make new one. */
} else {
barrier = uvBarrierCreate();
if (!barrier) {
return RAFT_NOMEM;
}
}
UvBarrierAddReq(barrier, req);
}
/* Let's not continue writing new entries if something down the line
* asked us to stop writing. */
if (uv->barrier != NULL && req->blocking) {
uv->barrier->blocking = true;
}
assert(barrier != NULL);
if (uv->barrier == NULL) {
uv->barrier = barrier;
/* If there's no pending append-related activity, we can fire
* the callback immediately.
*
* TODO: find a way to avoid invoking this synchronously. */
if (queue_empty(&uv->append_segments) &&
queue_empty(&uv->finalize_reqs) &&
uv->finalize_work.data == NULL) {
/* Not interested in return value. */
UvBarrierMaybeTrigger(barrier);
}
}
return 0;
}
void UvUnblock(struct uv *uv)
{
/* Fire the next pending barrier request, if any; UvUnblock will be called
* again when that request's callback is fired. */
if (UvBarrierMaybeTrigger(uv->barrier)) {
tracef("UvUnblock triggered barrier request callback.");
return;
}
/* All requests in barrier are finished. */
tracef("UvUnblock queue empty");
RaftHeapFree(uv->barrier);
uv->barrier = NULL;
if (uv->closing) {
uvMaybeFireCloseCb(uv);
return;
}
if (!queue_empty(&uv->append_pending_reqs)) {
int rv;
rv = uvAppendMaybeStart(uv);
if (rv != 0) {
uv->errored = true;
}
}
}
void UvBarrierAddReq(struct UvBarrier *barrier, struct UvBarrierReq *req)
{
assert(barrier != NULL);
assert(req != NULL);
/* Once there's a blocking req, this barrier becomes blocking. */
barrier->blocking |= req->blocking;
queue_insert_tail(&barrier->reqs, &req->queue);
}
/* Fire all pending barrier requests; the barrier callback will notice that
* we're closing and abort there. */
static void uvBarrierClose(struct uv *uv)
{
tracef("uv barrier close");
struct UvBarrier *barrier = NULL;
queue *head;
assert(uv->closing);
QUEUE_FOREACH(head, &uv->append_segments)
{
struct uvAliveSegment *segment;
segment = QUEUE_DATA(head, struct uvAliveSegment, queue);
if (segment->barrier != NULL && segment->barrier != barrier &&
segment->barrier != uv->barrier) {
barrier = segment->barrier;
/* Fire all barrier cb's, this is safe because the
* barrier cb exits early when uv->closing is true. */
uvBarrierTriggerAll(barrier);
RaftHeapFree(barrier);
}
/* The segment->barrier field is used:
*
* - by UvBarrierReady, to check whether it's time to invoke the
* barrier callback after successfully finalizing a segment
* - by uvAppendMaybeStart, to see whether we should go ahead
* with writing to a segment even though a barrier is active
* because the barrier is waiting on that same segment to be
* finalized (but see the
* FIXME in that function)
* - to save a barrier for later, if UvBarrier was called when
* uv->barrier was already set
*
* If we're cancelling the barrier, we don't need to save it for
* later; the callback will not be invoked a second time in any
* case; and uvAppendMaybeStart won't be called while closing.
* So it's fine to clear segment->barrier here. */
segment->barrier = NULL;
}
/* There might still be a current barrier set on uv->barrier, meaning
* that the open segment it was associated with has started to be
* finalized and is not anymore in the append_segments queue. Let's
* cancel all untriggered barrier request callbacks too. */
if (uv->barrier != NULL) {
uvBarrierTriggerAll(uv->barrier);
/* Clear uv->barrier if there's no active work on the thread
* pool. When the work on the threadpool finishes, UvUnblock
* will notice we're closing, clear and free uv->barrier and
		 * call uvMaybeFireCloseCb. UvUnblock will not try to fire
		 * any more barrier request callbacks because they were triggered
* in the line above. */
if (uv->snapshot_put_work.data == NULL &&
uv->truncate_work.data == NULL) {
RaftHeapFree(uv->barrier);
uv->barrier = NULL;
}
}
}
void uvAppendClose(struct uv *uv)
{
struct uvAliveSegment *segment;
assert(uv->closing);
uvBarrierClose(uv);
UvPrepareClose(uv);
uvAppendFinishPendingRequests(uv, RAFT_CANCELED);
uvFinalizeCurrentAliveSegmentOnceIdle(uv);
/* Also finalize the segments that we didn't write at all and are just
* sitting in the append_segments queue waiting for writes against the
* current segment to complete. */
while (!queue_empty(&uv->append_segments)) {
segment = uvGetLastAliveSegment(uv);
assert(segment != NULL);
if (segment == uvGetCurrentAliveSegment(uv)) {
break; /* We reached the head of the queue */
}
assert(segment->written == 0);
uvAliveSegmentFinalize(segment);
}
}
dqlite-1.16.7/src/raft/uv_encoding.c 0000664 0000000 0000000 00000034672 14652527134 0017304 0 ustar 00root root 0000000 0000000 #include "uv_encoding.h"
#include <stdlib.h>
#include <string.h>
#include "../raft.h"
#include "assert.h"
#include "byte.h"
#include "configuration.h"
/**
* Size of the request preamble.
*/
#define RAFT_IO_UV__PREAMBLE_SIZE \
(sizeof(uint64_t) /* Message type. */ + \
sizeof(uint64_t) /* Message size. */)
static size_t sizeofRequestVoteV1(void)
{
return sizeof(uint64_t) + /* Term. */
sizeof(uint64_t) + /* Candidate ID. */
sizeof(uint64_t) + /* Last log index. */
sizeof(uint64_t) /* Last log term. */;
}
static size_t sizeofRequestVote(void)
{
return sizeofRequestVoteV1() +
sizeof(uint64_t) /* Leadership transfer. */;
}
static size_t sizeofRequestVoteResultV1(void)
{
return sizeof(uint64_t) + /* Term. */
sizeof(uint64_t) /* Vote granted. */;
}
static size_t sizeofRequestVoteResult(void)
{
	return sizeofRequestVoteResultV1() + /* Older version 1 message. */
	       sizeof(uint64_t) /* Flags. */;
}
static size_t sizeofAppendEntries(const struct raft_append_entries *p)
{
return sizeof(uint64_t) + /* Leader's term. */
sizeof(uint64_t) + /* Leader ID */
sizeof(uint64_t) + /* Previous log entry index */
sizeof(uint64_t) + /* Previous log entry term */
sizeof(uint64_t) + /* Leader's commit index */
sizeof(uint64_t) + /* Number of entries in the batch */
16 * p->n_entries /* One header per entry */;
}
static size_t sizeofAppendEntriesResultV0(void)
{
return sizeof(uint64_t) + /* Term. */
sizeof(uint64_t) + /* Success. */
sizeof(uint64_t) /* Last log index. */;
}
static size_t sizeofAppendEntriesResult(void)
{
return sizeofAppendEntriesResultV0() +
sizeof(uint64_t) /* 64 bit Flags. */;
}
static size_t sizeofInstallSnapshot(const struct raft_install_snapshot *p)
{
size_t conf_size = configurationEncodedSize(&p->conf);
return sizeof(uint64_t) + /* Leader's term. */
sizeof(uint64_t) + /* Leader ID */
sizeof(uint64_t) + /* Snapshot's last index */
sizeof(uint64_t) + /* Term of last index */
sizeof(uint64_t) + /* Configuration's index */
sizeof(uint64_t) + /* Length of configuration */
conf_size + /* Configuration data */
sizeof(uint64_t); /* Length of snapshot data */
}
static size_t sizeofTimeoutNow(void)
{
return sizeof(uint64_t) + /* Term. */
sizeof(uint64_t) + /* Last log index. */
sizeof(uint64_t) /* Last log term. */;
}
size_t uvSizeofBatchHeader(size_t n, bool with_local_data)
{
	size_t res = 8 + /* Number of entries in the batch, little endian */
		     16 * n; /* One header per entry */
if (with_local_data) {
#ifdef DQLITE_NEXT
res += 8; /* Local data length, applies to all entries */
#endif
}
return res;
}
static void encodeRequestVote(const struct raft_request_vote *p, void *buf)
{
void *cursor = buf;
uint64_t flags = 0;
if (p->disrupt_leader) {
flags |= 1 << 0;
}
if (p->pre_vote) {
flags |= 1 << 1;
}
bytePut64(&cursor, p->term);
bytePut64(&cursor, p->candidate_id);
bytePut64(&cursor, p->last_log_index);
bytePut64(&cursor, p->last_log_term);
bytePut64(&cursor, flags);
}
static void encodeRequestVoteResult(const struct raft_request_vote_result *p,
void *buf)
{
void *cursor = buf;
uint64_t flags = 0;
if (p->pre_vote) {
flags |= (1 << 0);
}
bytePut64(&cursor, p->term);
bytePut64(&cursor, p->vote_granted);
bytePut64(&cursor, flags);
}
static void encodeAppendEntries(const struct raft_append_entries *p, void *buf)
{
void *cursor;
cursor = buf;
bytePut64(&cursor, p->term); /* Leader's term. */
bytePut64(&cursor, p->prev_log_index); /* Previous index. */
bytePut64(&cursor, p->prev_log_term); /* Previous term. */
bytePut64(&cursor, p->leader_commit); /* Commit index. */
uvEncodeBatchHeader(p->entries, p->n_entries, cursor, false /* no local data */);
}
static void encodeAppendEntriesResult(
const struct raft_append_entries_result *p,
void *buf)
{
void *cursor = buf;
bytePut64(&cursor, p->term);
bytePut64(&cursor, p->rejected);
bytePut64(&cursor, p->last_log_index);
bytePut64(&cursor, p->features);
}
static void encodeInstallSnapshot(const struct raft_install_snapshot *p,
void *buf)
{
void *cursor;
size_t conf_size = configurationEncodedSize(&p->conf);
cursor = buf;
bytePut64(&cursor, p->term); /* Leader's term. */
bytePut64(&cursor, p->last_index); /* Snapshot last index. */
bytePut64(&cursor, p->last_term); /* Term of last index. */
bytePut64(&cursor, p->conf_index); /* Configuration index. */
bytePut64(&cursor, conf_size); /* Configuration length. */
configurationEncodeToBuf(&p->conf, cursor);
cursor = (uint8_t *)cursor + conf_size;
bytePut64(&cursor, p->data.len); /* Snapshot data size. */
}
static void encodeTimeoutNow(const struct raft_timeout_now *p, void *buf)
{
void *cursor = buf;
bytePut64(&cursor, p->term);
bytePut64(&cursor, p->last_log_index);
bytePut64(&cursor, p->last_log_term);
}
int uvEncodeMessage(const struct raft_message *message,
uv_buf_t **bufs,
unsigned *n_bufs)
{
uv_buf_t header;
void *cursor;
/* Figure out the length of the header for this request and allocate a
* buffer for it. */
header.len = RAFT_IO_UV__PREAMBLE_SIZE;
switch (message->type) {
case RAFT_IO_REQUEST_VOTE:
header.len += sizeofRequestVote();
break;
case RAFT_IO_REQUEST_VOTE_RESULT:
header.len += sizeofRequestVoteResult();
break;
case RAFT_IO_APPEND_ENTRIES:
header.len +=
sizeofAppendEntries(&message->append_entries);
break;
case RAFT_IO_APPEND_ENTRIES_RESULT:
header.len += sizeofAppendEntriesResult();
break;
case RAFT_IO_INSTALL_SNAPSHOT:
header.len +=
sizeofInstallSnapshot(&message->install_snapshot);
break;
case RAFT_IO_TIMEOUT_NOW:
header.len += sizeofTimeoutNow();
break;
default:
return RAFT_MALFORMED;
};
header.base = raft_malloc(header.len);
if (header.base == NULL) {
goto oom;
}
cursor = header.base;
/* Encode the request preamble, with message type and message size. */
bytePut64(&cursor, message->type);
bytePut64(&cursor, header.len - RAFT_IO_UV__PREAMBLE_SIZE);
/* Encode the request header. */
switch (message->type) {
case RAFT_IO_REQUEST_VOTE:
encodeRequestVote(&message->request_vote, cursor);
break;
case RAFT_IO_REQUEST_VOTE_RESULT:
encodeRequestVoteResult(&message->request_vote_result,
cursor);
break;
case RAFT_IO_APPEND_ENTRIES:
encodeAppendEntries(&message->append_entries, cursor);
break;
case RAFT_IO_APPEND_ENTRIES_RESULT:
encodeAppendEntriesResult(
&message->append_entries_result, cursor);
break;
case RAFT_IO_INSTALL_SNAPSHOT:
encodeInstallSnapshot(&message->install_snapshot,
cursor);
break;
case RAFT_IO_TIMEOUT_NOW:
encodeTimeoutNow(&message->timeout_now, cursor);
break;
};
*n_bufs = 1;
/* For AppendEntries request we also send the entries payload. */
if (message->type == RAFT_IO_APPEND_ENTRIES) {
*n_bufs += message->append_entries.n_entries;
}
/* For InstallSnapshot request we also send the snapshot payload. */
if (message->type == RAFT_IO_INSTALL_SNAPSHOT) {
*n_bufs += 1;
}
*bufs = raft_calloc(*n_bufs, sizeof **bufs);
if (*bufs == NULL) {
goto oom_after_header_alloc;
}
(*bufs)[0] = header;
if (message->type == RAFT_IO_APPEND_ENTRIES) {
unsigned i;
for (i = 0; i < message->append_entries.n_entries; i++) {
const struct raft_entry *entry =
&message->append_entries.entries[i];
(*bufs)[i + 1].base = entry->buf.base;
(*bufs)[i + 1].len = entry->buf.len;
}
}
if (message->type == RAFT_IO_INSTALL_SNAPSHOT) {
(*bufs)[1].base = message->install_snapshot.data.base;
(*bufs)[1].len = message->install_snapshot.data.len;
}
return 0;
oom_after_header_alloc:
raft_free(header.base);
oom:
return RAFT_NOMEM;
}
void uvEncodeBatchHeader(const struct raft_entry *entries,
unsigned n,
void *buf,
bool with_local_data)
{
unsigned i;
void *cursor = buf;
/* Number of entries in the batch, little endian */
bytePut64(&cursor, n);
if (with_local_data) {
#ifdef DQLITE_NEXT
/* Local data size per entry, little endian */
bytePut64(&cursor, (uint64_t)sizeof(struct raft_entry_local_data));
#endif
}
for (i = 0; i < n; i++) {
const struct raft_entry *entry = &entries[i];
/* Term in which the entry was created, little endian. */
bytePut64(&cursor, entry->term);
/* Message type (Either RAFT_COMMAND or RAFT_CHANGE) */
bytePut8(&cursor, (uint8_t)entry->type);
cursor = (uint8_t *)cursor + 3; /* Unused */
/* Size of the log entry data, little endian. */
bytePut32(&cursor, (uint32_t)entry->buf.len);
}
}
static void decodeRequestVote(const uv_buf_t *buf, struct raft_request_vote *p)
{
const void *cursor;
cursor = buf->base;
p->version = 1;
p->term = byteGet64(&cursor);
p->candidate_id = byteGet64(&cursor);
p->last_log_index = byteGet64(&cursor);
p->last_log_term = byteGet64(&cursor);
/* Support for legacy request vote that doesn't have disrupt_leader. */
if (buf->len == sizeofRequestVoteV1()) {
p->disrupt_leader = false;
p->pre_vote = false;
} else {
p->version = 2;
uint64_t flags = byteGet64(&cursor);
p->disrupt_leader = (bool)(flags & 1 << 0);
p->pre_vote = (bool)(flags & 1 << 1);
}
}
static void decodeRequestVoteResult(const uv_buf_t *buf,
struct raft_request_vote_result *p)
{
const void *cursor;
cursor = buf->base;
p->version = 1;
p->term = byteGet64(&cursor);
p->vote_granted = byteGet64(&cursor);
if (buf->len > sizeofRequestVoteResultV1()) {
p->version = 2;
uint64_t flags = byteGet64(&cursor);
p->pre_vote = (flags & (1 << 0));
}
}
int uvDecodeBatchHeader(const void *batch,
struct raft_entry **entries,
unsigned *n,
uint64_t *local_data_size)
{
const void *cursor = batch;
size_t i;
int rv;
*n = (unsigned)byteGet64(&cursor);
if (*n == 0) {
*entries = NULL;
return 0;
}
if (local_data_size != NULL) {
#ifdef DQLITE_NEXT
uint64_t z = byteGet64(&cursor);
if (z == 0 || z > sizeof(struct raft_entry_local_data) || z % sizeof(uint64_t) != 0) {
rv = RAFT_MALFORMED;
goto err;
}
*local_data_size = z;
#endif
}
*entries = raft_malloc(*n * sizeof **entries);
if (*entries == NULL) {
rv = RAFT_NOMEM;
goto err;
}
for (i = 0; i < *n; i++) {
struct raft_entry *entry = &(*entries)[i];
entry->term = byteGet64(&cursor);
entry->type = byteGet8(&cursor);
if (entry->type != RAFT_COMMAND &&
entry->type != RAFT_BARRIER && entry->type != RAFT_CHANGE) {
rv = RAFT_MALFORMED;
goto err_after_alloc;
}
cursor = (uint8_t *)cursor + 3; /* Unused */
/* Size of the log entry data, little endian. */
entry->buf.len = byteGet32(&cursor);
}
return 0;
err_after_alloc:
raft_free(*entries);
*entries = NULL;
err:
assert(rv != 0);
return rv;
}
static int decodeAppendEntries(const uv_buf_t *buf,
struct raft_append_entries *args)
{
const void *cursor;
int rv;
assert(buf != NULL);
assert(args != NULL);
cursor = buf->base;
args->version = 0;
args->term = byteGet64(&cursor);
args->prev_log_index = byteGet64(&cursor);
args->prev_log_term = byteGet64(&cursor);
args->leader_commit = byteGet64(&cursor);
	rv = uvDecodeBatchHeader(cursor, &args->entries, &args->n_entries,
				 NULL /* no local data on the wire */);
if (rv != 0) {
return rv;
}
return 0;
}
static void decodeAppendEntriesResult(const uv_buf_t *buf,
struct raft_append_entries_result *p)
{
const void *cursor;
cursor = buf->base;
p->version = 0;
p->term = byteGet64(&cursor);
p->rejected = byteGet64(&cursor);
p->last_log_index = byteGet64(&cursor);
p->features = 0;
if (buf->len > sizeofAppendEntriesResultV0()) {
p->version = 1;
p->features = byteGet64(&cursor);
}
}
static int decodeInstallSnapshot(const uv_buf_t *buf,
struct raft_install_snapshot *args)
{
const void *cursor;
struct raft_buffer conf;
int rv;
assert(buf != NULL);
assert(args != NULL);
cursor = buf->base;
args->version = 0;
args->term = byteGet64(&cursor);
args->last_index = byteGet64(&cursor);
args->last_term = byteGet64(&cursor);
args->conf_index = byteGet64(&cursor);
conf.len = (size_t)byteGet64(&cursor);
conf.base = (void *)cursor;
rv = configurationDecode(&conf, &args->conf);
if (rv != 0) {
return rv;
}
cursor = (uint8_t *)cursor + conf.len;
args->data.len = (size_t)byteGet64(&cursor);
return 0;
}
static void decodeTimeoutNow(const uv_buf_t *buf, struct raft_timeout_now *p)
{
const void *cursor;
cursor = buf->base;
p->version = 0;
p->term = byteGet64(&cursor);
p->last_log_index = byteGet64(&cursor);
p->last_log_term = byteGet64(&cursor);
}
int uvDecodeMessage(uint16_t type,
const uv_buf_t *header,
struct raft_message *message,
size_t *payload_len)
{
unsigned i;
int rv = 0;
memset(message, 0, sizeof(*message));
message->type = (unsigned short)type;
*payload_len = 0;
/* Decode the header. */
switch (type) {
case RAFT_IO_REQUEST_VOTE:
decodeRequestVote(header, &message->request_vote);
break;
case RAFT_IO_REQUEST_VOTE_RESULT:
decodeRequestVoteResult(header,
&message->request_vote_result);
break;
case RAFT_IO_APPEND_ENTRIES:
rv = decodeAppendEntries(header,
&message->append_entries);
for (i = 0; i < message->append_entries.n_entries;
i++) {
*payload_len +=
message->append_entries.entries[i].buf.len;
}
break;
case RAFT_IO_APPEND_ENTRIES_RESULT:
decodeAppendEntriesResult(
header, &message->append_entries_result);
break;
case RAFT_IO_INSTALL_SNAPSHOT:
rv = decodeInstallSnapshot(header,
&message->install_snapshot);
*payload_len += message->install_snapshot.data.len;
break;
case RAFT_IO_TIMEOUT_NOW:
decodeTimeoutNow(header, &message->timeout_now);
break;
default:
rv = RAFT_IOERR;
break;
};
return rv;
}
int uvDecodeEntriesBatch(uint8_t *batch,
size_t offset,
struct raft_entry *entries,
unsigned n,
uint64_t local_data_size)
{
uint8_t *cursor;
assert(batch != NULL);
cursor = batch + offset;
for (size_t i = 0; i < n; i++) {
struct raft_entry *entry = &entries[i];
entry->batch = batch;
entry->buf.base = (entry->buf.len > 0) ? cursor : NULL;
cursor += entry->buf.len;
if (entry->buf.len % 8 != 0) {
/* Add padding */
cursor = cursor + 8 - (entry->buf.len % 8);
}
entry->is_local = false;
entry->local_data = (struct raft_entry_local_data){};
assert(local_data_size <= sizeof(entry->local_data.buf));
assert(local_data_size % 8 == 0);
#ifdef DQLITE_NEXT
memcpy(entry->local_data.buf, cursor, local_data_size);
cursor += local_data_size;
#endif
}
return 0;
}
dqlite-1.16.7/src/raft/uv_encoding.h 0000664 0000000 0000000 00000003660 14652527134 0017302 0 ustar 00root root 0000000 0000000 /* Encoding routines for the libuv-based @raft_io backend. */
#ifndef UV_ENCODING_H_
#define UV_ENCODING_H_
#include <uv.h>
#include "../raft.h"
/* Current disk format version. */
#ifdef DQLITE_NEXT
#define UV__DISK_FORMAT 2
#else
#define UV__DISK_FORMAT 1
#endif
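/* Relative to version 1, version 2 of the format (enabled by DQLITE_NEXT)
 * extends the batch header with an 8-byte field holding the per-entry local
 * data size and appends a fixed-size local data buffer after each entry's
 * payload; see uvSizeofBatchHeader() and uvDecodeEntriesBatch(). */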
int uvEncodeMessage(const struct raft_message *message,
uv_buf_t **bufs,
unsigned *n_bufs);
int uvDecodeMessage(uint16_t type,
const uv_buf_t *header,
struct raft_message *message,
size_t *payload_len);
int uvDecodeBatchHeader(const void *batch,
struct raft_entry **entries,
unsigned *n,
uint64_t *local_data_size);
int uvDecodeEntriesBatch(uint8_t *batch,
size_t offset,
struct raft_entry *entries,
unsigned n,
uint64_t local_data_size);
/**
 * The layout of the memory pointed at by a @batch pointer is the following:
 *
 * [8 bytes] Number of entries in the batch, little endian.
 * [8 bytes] Size of the per-entry local data, little endian. Only present in
 *           disk format version 2 (i.e. when DQLITE_NEXT is defined).
 * [header1] Header data of the first entry of the batch.
 * [ ... ] More headers
 * [headerN] Header data of the last entry of the batch.
 * [data1 ] Payload data of the first entry of the batch.
 * [ ... ] More data
 * [dataN ] Payload data of the last entry of the batch.
 *
 * An entry header is 16 bytes long and has the following layout:
 *
 * [8 bytes] Term in which the entry was created, little endian.
 * [1 byte ] Message type (Either RAFT_COMMAND or RAFT_CHANGE)
 * [3 bytes] Currently unused.
 * [4 bytes] Size of the log entry data, little endian.
 *
 * A payload data section for an entry is simply a sequence of bytes of
 * arbitrary length, possibly padded with extra bytes to reach an 8-byte
 * boundary (which means that all entry data pointers are 8-byte aligned). In
 * format version 2, each entry's local data buffer follows its padded
 * payload.
 */
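/* As a worked example derived from the layout above: a format-1 batch holding
 * two entries whose payloads are 5 and 16 bytes long occupies
 *
 *   8 (count) + 2 * 16 (headers) = 40 bytes of header,
 *   bytes 40..47 for the first payload (5 bytes padded to 8),
 *   bytes 48..63 for the second payload (already 8-byte aligned),
 *
 * for a total of 64 bytes, with both payload pointers 8-byte aligned. */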
size_t uvSizeofBatchHeader(size_t n, bool with_local_data);
void uvEncodeBatchHeader(const struct raft_entry *entries,
unsigned n,
void *buf,
bool with_local_data);
#endif /* UV_ENCODING_H_ */
dqlite-1.16.7/src/raft/uv_finalize.c 0000664 0000000 0000000 00000010477 14652527134 0017314 0 ustar 00root root 0000000 0000000 #include "assert.h"
#include "heap.h"
#include "../lib/queue.h"
#include "uv.h"
#include "uv_os.h"
/* Metadata about an open segment that is no longer in use and that should be
 * closed or removed (if it was never written to). */
struct uvDyingSegment
{
struct uv *uv;
uvCounter counter; /* Segment counter */
size_t used; /* Number of used bytes */
raft_index first_index; /* Index of first entry */
raft_index last_index; /* Index of last entry */
int status; /* Status code of blocking syscalls */
queue queue; /* Link to finalize queue */
};
/* Run all blocking syscalls involved in closing a used open segment.
*
* An open segment is closed by truncating its length to the number of bytes
* that were actually written into it and then renaming it. */
static void uvFinalizeWorkCb(uv_work_t *work)
{
struct uvDyingSegment *segment = work->data;
struct uv *uv = segment->uv;
char filename1[UV__FILENAME_LEN];
char filename2[UV__FILENAME_LEN];
char errmsg[RAFT_ERRMSG_BUF_SIZE];
int rv;
sprintf(filename1, UV__OPEN_TEMPLATE, segment->counter);
sprintf(filename2, UV__CLOSED_TEMPLATE, segment->first_index,
segment->last_index);
tracef("finalize %s into %s", filename1, filename2);
/* If the segment hasn't actually been used (because the writer has been
* closed or aborted before making any write), just remove it. */
if (segment->used == 0) {
tracef("remove unused segment file: %s", filename1);
rv = UvFsRemoveFile(uv->dir, filename1, errmsg);
if (rv != 0) {
goto err;
}
goto sync;
}
/* Truncate and rename the segment.*/
rv = UvFsTruncateAndRenameFile(uv->dir, segment->used, filename1,
filename2, errmsg);
if (rv != 0) {
goto err;
}
sync:
rv = UvFsSyncDir(uv->dir, errmsg);
if (rv != 0) {
goto err;
}
segment->status = 0;
return;
err:
tracef("truncate segment %s: %s", filename1, errmsg);
assert(rv != 0);
segment->status = rv;
}
static int uvFinalizeStart(struct uvDyingSegment *segment);
static void uvFinalizeAfterWorkCb(uv_work_t *work, int status)
{
struct uvDyingSegment *segment = work->data;
struct uv *uv = segment->uv;
tracef("uv finalize after work segment %p cb status:%d",
(void *)segment, status);
queue *head;
int rv;
assert(status == 0); /* We don't cancel worker requests */
uv->finalize_work.data = NULL;
if (segment->status != 0) {
uv->errored = true;
}
RaftHeapFree(segment);
/* If we have no more dismissed segments to close, check if there's a
* barrier to unblock or if we are done closing. */
if (queue_empty(&uv->finalize_reqs)) {
tracef("unblock barrier or close");
if (uv->barrier != NULL && UvBarrierReady(uv)) {
UvBarrierMaybeTrigger(uv->barrier);
}
uvMaybeFireCloseCb(uv);
return;
}
/* Grab a new dismissed segment to close. */
head = queue_head(&uv->finalize_reqs);
segment = QUEUE_DATA(head, struct uvDyingSegment, queue);
queue_remove(&segment->queue);
rv = uvFinalizeStart(segment);
if (rv != 0) {
RaftHeapFree(segment);
uv->errored = true;
}
}
/* Start finalizing an open segment. */
static int uvFinalizeStart(struct uvDyingSegment *segment)
{
struct uv *uv = segment->uv;
int rv;
assert(uv->finalize_work.data == NULL);
assert(segment->counter > 0);
uv->finalize_work.data = segment;
rv = uv_queue_work(uv->loop, &uv->finalize_work, uvFinalizeWorkCb,
uvFinalizeAfterWorkCb);
if (rv != 0) {
ErrMsgPrintf(uv->io->errmsg,
"start to truncate segment file %llu: %s",
segment->counter, uv_strerror(rv));
return RAFT_IOERR;
}
return 0;
}
int UvFinalize(struct uv *uv,
unsigned long long counter,
size_t used,
raft_index first_index,
raft_index last_index)
{
struct uvDyingSegment *segment;
int rv;
if (used > 0) {
assert(first_index > 0);
assert(last_index >= first_index);
}
segment = RaftHeapMalloc(sizeof *segment);
if (segment == NULL) {
return RAFT_NOMEM;
}
segment->uv = uv;
segment->counter = counter;
segment->used = used;
segment->first_index = first_index;
segment->last_index = last_index;
/* If we're already processing a segment, let's put the request in the
* queue and wait. */
if (uv->finalize_work.data != NULL) {
queue_insert_tail(&uv->finalize_reqs, &segment->queue);
return 0;
}
rv = uvFinalizeStart(segment);
if (rv != 0) {
RaftHeapFree(segment);
return rv;
}
return 0;
}
#undef tracef
dqlite-1.16.7/src/raft/uv_fs.c 0000664 0000000 0000000 00000046072 14652527134 0016123 0 ustar 00root root 0000000 0000000 #include "uv_fs.h"
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <sys/vfs.h>
#include <unistd.h>
#include "assert.h"
#include "compress.h"
#include "err.h"
#include "heap.h"
#include "uv_os.h"
int UvFsCheckDir(const char *dir, char *errmsg)
{
struct uv_fs_s req;
int rv;
/* Make sure we have a directory we can write into. */
rv = uv_fs_stat(NULL, &req, dir, NULL);
if (rv != 0) {
switch (rv) {
case UV_ENOENT:
ErrMsgPrintf((char *)errmsg,
"directory '%s' does not exist",
dir);
return RAFT_NOTFOUND;
case UV_EACCES:
ErrMsgPrintf((char *)errmsg,
"can't access directory '%s'",
dir);
return RAFT_UNAUTHORIZED;
case UV_ENOTDIR:
ErrMsgPrintf((char *)errmsg,
"path '%s' is not a directory",
dir);
return RAFT_INVALID;
}
ErrMsgPrintf((char *)errmsg, "can't stat '%s': %s", dir,
uv_strerror(rv));
return RAFT_IOERR;
}
if (!(req.statbuf.st_mode & S_IFDIR)) {
ErrMsgPrintf((char *)errmsg, "path '%s' is not a directory",
dir);
return RAFT_INVALID;
}
if (!(req.statbuf.st_mode & S_IWRITE)) {
ErrMsgPrintf((char *)errmsg, "directory '%s' is not writable",
dir);
return RAFT_INVALID;
}
return 0;
}
int UvFsSyncDir(const char *dir, char *errmsg)
{
uv_file fd;
int rv;
rv = UvOsOpen(dir, UV_FS_O_RDONLY | UV_FS_O_DIRECTORY, 0, &fd);
if (rv != 0) {
UvOsErrMsg(errmsg, "open directory", rv);
return RAFT_IOERR;
}
rv = UvOsFsync(fd);
UvOsClose(fd);
if (rv != 0) {
UvOsErrMsg(errmsg, "fsync directory", rv);
return RAFT_IOERR;
}
return 0;
}
int UvFsFileExists(const char *dir,
const char *filename,
bool *exists,
char *errmsg)
{
uv_stat_t sb;
char path[UV__PATH_SZ];
int rv;
rv = UvOsJoin(dir, filename, path);
if (rv != 0) {
return RAFT_INVALID;
}
rv = UvOsStat(path, &sb);
if (rv != 0) {
if (rv == UV_ENOENT) {
*exists = false;
goto out;
}
UvOsErrMsg(errmsg, "stat", rv);
return RAFT_IOERR;
}
*exists = true;
out:
return 0;
}
/* Get the size of the given file. */
int UvFsFileSize(const char *dir,
const char *filename,
off_t *size,
char *errmsg)
{
uv_stat_t sb;
char path[UV__PATH_SZ];
int rv;
rv = UvOsJoin(dir, filename, path);
if (rv != 0) {
return RAFT_INVALID;
}
rv = UvOsStat(path, &sb);
if (rv != 0) {
UvOsErrMsg(errmsg, "stat", rv);
return RAFT_IOERR;
}
*size = (off_t)sb.st_size;
return 0;
}
int UvFsFileIsEmpty(const char *dir,
const char *filename,
bool *empty,
char *errmsg)
{
off_t size;
int rv;
rv = UvFsFileSize(dir, filename, &size, errmsg);
if (rv != 0) {
return rv;
}
	*empty = (size == 0);
return 0;
}
/* Open a file in a directory. */
static int uvFsOpenFile(const char *dir,
const char *filename,
int flags,
int mode,
uv_file *fd,
char *errmsg)
{
char path[UV__PATH_SZ];
int rv;
rv = UvOsJoin(dir, filename, path);
if (rv != 0) {
return RAFT_INVALID;
}
rv = UvOsOpen(path, flags, mode, fd);
if (rv != 0) {
UvOsErrMsg(errmsg, "open", rv);
return RAFT_IOERR;
}
return 0;
}
int UvFsOpenFileForReading(const char *dir,
const char *filename,
uv_file *fd,
char *errmsg)
{
char path[UV__PATH_SZ];
int flags = O_RDONLY;
int rv;
rv = UvOsJoin(dir, filename, path);
if (rv != 0) {
return RAFT_INVALID;
}
return uvFsOpenFile(dir, filename, flags, 0, fd, errmsg);
}
int UvFsAllocateFile(const char *dir,
const char *filename,
size_t size,
uv_file *fd,
bool fallocate,
char *errmsg)
{
char path[UV__PATH_SZ];
int flags = O_WRONLY | O_CREAT | O_EXCL; /* Common open flags */
int rv = 0;
rv = UvOsJoin(dir, filename, path);
if (rv != 0) {
return RAFT_INVALID;
}
/* Allocate the desired size. */
if (fallocate) {
/* TODO: use RWF_DSYNC instead, if available. */
flags |= O_DSYNC;
rv = uvFsOpenFile(dir, filename, flags, S_IRUSR | S_IWUSR, fd,
errmsg);
if (rv != 0) {
goto err;
}
rv = UvOsFallocate(*fd, 0, (off_t)size);
if (rv == 0) {
return 0;
} else if (rv == UV_ENOSPC) {
ErrMsgPrintf(errmsg,
"not enough space to allocate %zu bytes",
size);
rv = RAFT_NOSPACE;
goto err_after_open;
} else {
UvOsErrMsg(errmsg, "posix_allocate", rv);
rv = RAFT_IOERR;
goto err_after_open;
}
} else {
/* Emulate fallocate, open without O_DSYNC, because we risk
* doing a lot of synced writes. */
rv = uvFsOpenFile(dir, filename, flags, S_IRUSR | S_IWUSR, fd,
errmsg);
if (rv != 0) {
goto err;
}
rv = UvOsFallocateEmulation(*fd, 0, (off_t)size);
if (rv == UV_ENOSPC) {
ErrMsgPrintf(errmsg,
"not enough space to allocate %zu bytes",
size);
rv = RAFT_NOSPACE;
goto err_after_open;
} else if (rv != 0) {
ErrMsgPrintf(errmsg, "fallocate emulation %d", rv);
rv = RAFT_IOERR;
goto err_after_open;
}
rv = UvOsFsync(*fd);
if (rv != 0) {
ErrMsgPrintf(errmsg, "fsync %d", rv);
rv = RAFT_IOERR;
goto err_after_open;
}
/* Now close and reopen the file with O_DSYNC */
rv = UvOsClose(*fd);
if (rv != 0) {
ErrMsgPrintf(errmsg, "close %d", rv);
goto err_unlink;
}
/* TODO: use RWF_DSYNC instead, if available. */
flags = O_WRONLY | O_DSYNC;
rv = uvFsOpenFile(dir, filename, flags, S_IRUSR | S_IWUSR, fd,
errmsg);
if (rv != 0) {
goto err_unlink;
}
}
return 0;
err_after_open:
UvOsClose(*fd);
err_unlink:
UvOsUnlink(path);
err:
assert(rv != 0);
return rv;
}
static int uvFsWriteFile(const char *dir,
const char *filename,
int flags,
struct raft_buffer *bufs,
unsigned n_bufs,
char *errmsg)
{
uv_file fd;
int rv;
size_t size;
unsigned i;
size = 0;
for (i = 0; i < n_bufs; i++) {
size += bufs[i].len;
}
rv = uvFsOpenFile(dir, filename, flags, S_IRUSR | S_IWUSR, &fd, errmsg);
if (rv != 0) {
goto err;
}
rv = UvOsWrite(fd, (const uv_buf_t *)bufs, n_bufs, 0);
if (rv != (int)(size)) {
if (rv < 0) {
UvOsErrMsg(errmsg, "write", rv);
} else {
			ErrMsgPrintf(errmsg,
				     "short write: only %d bytes written", rv);
}
goto err_after_file_open;
}
rv = UvOsFsync(fd);
if (rv != 0) {
UvOsErrMsg(errmsg, "fsync", rv);
goto err_after_file_open;
}
rv = UvOsClose(fd);
if (rv != 0) {
UvOsErrMsg(errmsg, "close", rv);
goto err;
}
return 0;
err_after_file_open:
UvOsClose(fd);
err:
return rv;
}
int UvFsMakeFile(const char *dir,
const char *filename,
struct raft_buffer *bufs,
unsigned n_bufs,
char *errmsg)
{
int rv;
char tmp_filename[UV__FILENAME_LEN + 1] = {0};
char path[UV__PATH_SZ] = {0};
char tmp_path[UV__PATH_SZ] = {0};
/* Create a temp file with the given content
* TODO as of libuv 1.34.0, use `uv_fs_mkstemp` */
size_t sz = sizeof(tmp_filename);
rv = snprintf(tmp_filename, sz, TMP_FILE_FMT, filename);
if (rv < 0 || rv >= (int)sz) {
return rv;
}
int flags = UV_FS_O_WRONLY | UV_FS_O_CREAT | UV_FS_O_EXCL;
rv = uvFsWriteFile(dir, tmp_filename, flags, bufs, n_bufs, errmsg);
if (rv != 0) {
goto err_after_tmp_create;
}
/* Check if the file exists */
bool exists = false;
rv = UvFsFileExists(dir, filename, &exists, errmsg);
if (rv != 0) {
goto err_after_tmp_create;
}
if (exists) {
rv = -1;
goto err_after_tmp_create;
}
/* Rename the temp file. Remark that there is a race between the
* existence check and the rename, there is no `renameat2` equivalent in
* libuv. However, in the current implementation this should pose no
* problems.*/
rv = UvOsJoin(dir, tmp_filename, tmp_path);
if (rv != 0) {
return RAFT_INVALID;
}
rv = UvOsJoin(dir, filename, path);
if (rv != 0) {
return RAFT_INVALID;
}
rv = UvOsRename(tmp_path, path);
if (rv != 0) {
UvOsErrMsg(errmsg, "rename", rv);
goto err_after_tmp_create;
}
rv = UvFsSyncDir(dir, errmsg);
if (rv != 0) {
char ignored[RAFT_ERRMSG_BUF_SIZE];
UvFsRemoveFile(dir, filename, ignored);
return rv;
}
return 0;
err_after_tmp_create:
UvFsRemoveFile(dir, tmp_filename, errmsg);
return rv;
}
int UvFsMakeOrOverwriteFile(const char *dir,
const char *filename,
const struct raft_buffer *buf,
char *errmsg)
{
char path[UV__PATH_SZ];
int flags = UV_FS_O_WRONLY;
int mode = 0;
bool exists = true;
uv_file fd;
int rv;
rv = UvOsJoin(dir, filename, path);
if (rv != 0) {
return RAFT_INVALID;
}
open:
rv = UvOsOpen(path, flags, mode, &fd);
if (rv != 0) {
if (rv == UV_ENOENT && !(flags & UV_FS_O_CREAT)) {
exists = false;
flags |= UV_FS_O_CREAT;
mode = S_IRUSR | S_IWUSR;
goto open;
}
goto err;
}
rv = UvOsWrite(fd, (const uv_buf_t *)buf, 1, 0);
if (rv != (int)(buf->len)) {
if (rv < 0) {
UvOsErrMsg(errmsg, "write", rv);
} else {
			ErrMsgPrintf(errmsg,
				     "short write: only %d bytes written", rv);
}
goto err_after_file_open;
}
if (exists) {
rv = UvOsFdatasync(fd);
if (rv != 0) {
UvOsErrMsg(errmsg, "fsync", rv);
goto err_after_file_open;
}
} else {
rv = UvOsFsync(fd);
if (rv != 0) {
UvOsErrMsg(errmsg, "fsync", rv);
goto err_after_file_open;
}
}
rv = UvOsClose(fd);
if (rv != 0) {
UvOsErrMsg(errmsg, "close", rv);
goto err;
}
if (!exists) {
rv = UvFsSyncDir(dir, errmsg);
if (rv != 0) {
goto err;
}
}
return 0;
err_after_file_open:
UvOsClose(fd);
err:
return RAFT_IOERR;
}
int UvFsReadInto(uv_file fd, struct raft_buffer *buf, char *errmsg)
{
ssize_t rv;
size_t offset = 0;
/* TODO: use uv_fs_read() */
while (offset < buf->len) {
rv = read(fd, (char *)buf->base + offset, buf->len - offset);
if (rv == -1) {
UvOsErrMsg(errmsg, "read", -errno);
return RAFT_IOERR;
}
		/* EOF. This should not be reachable, but make very sure we
		 * don't loop forever. */
if (rv == 0) {
break;
}
assert(rv > 0);
offset += (size_t)rv;
}
if (offset < buf->len) {
ErrMsgPrintf(errmsg, "short read: %zu bytes instead of %zu",
offset, buf->len);
return RAFT_IOERR;
}
return 0;
}
int UvFsReadFile(const char *dir,
const char *filename,
struct raft_buffer *buf,
char *errmsg)
{
uv_stat_t sb;
char path[UV__PATH_SZ];
uv_file fd;
int rv;
rv = UvOsJoin(dir, filename, path);
if (rv != 0) {
return RAFT_INVALID;
}
rv = UvOsStat(path, &sb);
if (rv != 0) {
UvOsErrMsg(errmsg, "stat", rv);
rv = RAFT_IOERR;
goto err;
}
rv = uvFsOpenFile(dir, filename, O_RDONLY, 0, &fd, errmsg);
if (rv != 0) {
goto err;
}
buf->len = (size_t)sb.st_size;
buf->base = RaftHeapMalloc(buf->len);
if (buf->base == NULL) {
ErrMsgOom(errmsg);
rv = RAFT_NOMEM;
goto err_after_open;
}
rv = UvFsReadInto(fd, buf, errmsg);
if (rv != 0) {
goto err_after_buf_alloc;
}
UvOsClose(fd);
return 0;
err_after_buf_alloc:
RaftHeapFree(buf->base);
err_after_open:
UvOsClose(fd);
err:
return rv;
}
int UvFsReadFileInto(const char *dir,
const char *filename,
struct raft_buffer *buf,
char *errmsg)
{
char path[UV__PATH_SZ];
uv_file fd;
int rv;
rv = UvOsJoin(dir, filename, path);
if (rv != 0) {
return RAFT_INVALID;
}
rv = uvFsOpenFile(dir, filename, O_RDONLY, 0, &fd, errmsg);
if (rv != 0) {
goto err;
}
rv = UvFsReadInto(fd, buf, errmsg);
if (rv != 0) {
goto err_after_open;
}
UvOsClose(fd);
return 0;
err_after_open:
UvOsClose(fd);
err:
return rv;
}
int UvFsRemoveFile(const char *dir, const char *filename, char *errmsg)
{
char path[UV__PATH_SZ];
int rv;
rv = UvOsJoin(dir, filename, path);
if (rv != 0) {
return RAFT_INVALID;
}
rv = UvOsUnlink(path);
if (rv != 0) {
UvOsErrMsg(errmsg, "unlink", rv);
return RAFT_IOERR;
}
return 0;
}
int UvFsRenameFile(const char *dir,
const char *filename1,
const char *filename2,
char *errmsg)
{
char path1[UV__PATH_SZ];
char path2[UV__PATH_SZ];
int rv;
rv = UvOsJoin(dir, filename1, path1);
if (rv != 0) {
return RAFT_INVALID;
}
rv = UvOsJoin(dir, filename2, path2);
if (rv != 0) {
return RAFT_INVALID;
}
rv = UvOsRename(path1, path2);
if (rv != 0) {
UvOsErrMsg(errmsg, "rename", rv);
return rv;
}
return 0;
}
int UvFsTruncateAndRenameFile(const char *dir,
size_t size,
const char *filename1,
const char *filename2,
char *errmsg)
{
char path1[UV__PATH_SZ];
char path2[UV__PATH_SZ];
uv_file fd;
int rv;
rv = UvOsJoin(dir, filename1, path1);
if (rv != 0) {
return RAFT_INVALID;
}
rv = UvOsJoin(dir, filename2, path2);
if (rv != 0) {
return RAFT_INVALID;
}
/* Truncate and rename. */
rv = UvOsOpen(path1, UV_FS_O_RDWR, 0, &fd);
if (rv != 0) {
UvOsErrMsg(errmsg, "open", rv);
goto err;
}
rv = UvOsTruncate(fd, (off_t)size);
if (rv != 0) {
UvOsErrMsg(errmsg, "truncate", rv);
goto err_after_open;
}
rv = UvOsFsync(fd);
if (rv != 0) {
UvOsErrMsg(errmsg, "fsync", rv);
goto err_after_open;
}
UvOsClose(fd);
rv = UvOsRename(path1, path2);
if (rv != 0) {
UvOsErrMsg(errmsg, "rename", rv);
goto err;
}
return 0;
err_after_open:
UvOsClose(fd);
err:
return RAFT_IOERR;
}
/* Check if direct I/O is possible on the given fd. */
static int probeDirectIO(int fd, size_t *size, char *errmsg)
{
struct statfs fs_info; /* To check the file system type. */
void *buf; /* Buffer to use for the probe write. */
int rv;
rv = UvOsSetDirectIo(fd);
if (rv != 0) {
if (rv != UV_EINVAL) {
/* UNTESTED: the parameters are ok, so this should never
* happen. */
UvOsErrMsg(errmsg, "fnctl", rv);
return RAFT_IOERR;
}
rv = fstatfs(fd, &fs_info);
if (rv == -1) {
/* UNTESTED: in practice ENOMEM should be the only
* failure mode */
UvOsErrMsg(errmsg, "fstatfs", -errno);
return RAFT_IOERR;
}
switch (fs_info.f_type) {
case 0x01021994: /* TMPFS_MAGIC */
case 0x2fc12fc1: /* ZFS magic */
case 0x24051905: /* UBIFS Support magic */
*size = 0;
return 0;
default:
/* UNTESTED: this is an unsupported file system.
*/
ErrMsgPrintf(errmsg,
"unsupported file system: %llx",
(unsigned long long)fs_info.f_type);
return RAFT_IOERR;
}
}
	/* Try to perform direct I/O, using various buffer sizes. */
*size = 4096;
while (*size >= 512) {
buf = raft_aligned_alloc(*size, *size);
if (buf == NULL) {
ErrMsgOom(errmsg);
return RAFT_NOMEM;
}
memset(buf, 0, *size);
rv = (int)write(fd, buf, *size);
raft_aligned_free(*size, buf);
if (rv > 0) {
/* Since we fallocate'ed the file, we should never fail
* because of lack of disk space, and all bytes should
* have been written. */
assert(rv == (int)(*size));
return 0;
}
assert(rv == -1);
if (errno != EIO && errno != EOPNOTSUPP) {
/* UNTESTED: this should basically fail only because of
* disk errors, since we allocated the file with
* posix_fallocate. */
			/* FIXME: this is a workaround because shiftfs doesn't
			 * return EINVAL in the fcntl call above, for example
			 * when the underlying fs is ZFS. */
if (errno == EINVAL && *size == 4096) {
*size = 0;
return 0;
}
UvOsErrMsg(errmsg, "write", -errno);
return RAFT_IOERR;
}
*size = *size / 2;
}
*size = 0;
return 0;
}
/* Check if fully non-blocking async I/O is possible on the given fd. */
static int probeAsyncIO(int fd, size_t size, bool *ok, char *errmsg)
{
void *buf; /* Buffer to use for the probe write */
aio_context_t ctx = 0; /* KAIO context handle */
struct iocb iocb; /* KAIO request object */
struct iocb *iocbs = &iocb; /* Because the io_submit() API sucks */
struct io_event event; /* KAIO response object */
int n_events;
int rv;
/* Setup the KAIO context handle */
rv = UvOsIoSetup(1, &ctx);
if (rv != 0) {
UvOsErrMsg(errmsg, "io_setup", rv);
/* UNTESTED: in practice this should fail only with ENOMEM */
return RAFT_IOERR;
}
/* Allocate the write buffer */
buf = raft_aligned_alloc(size, size);
if (buf == NULL) {
ErrMsgOom(errmsg);
return RAFT_NOMEM;
}
memset(buf, 0, size);
/* Prepare the KAIO request object */
memset(&iocb, 0, sizeof iocb);
iocb.aio_lio_opcode = IOCB_CMD_PWRITE;
*((void **)(&iocb.aio_buf)) = buf;
iocb.aio_nbytes = size;
iocb.aio_offset = 0;
iocb.aio_fildes = (uint32_t)fd;
iocb.aio_reqprio = 0;
iocb.aio_rw_flags |= RWF_NOWAIT | RWF_DSYNC;
/* Submit the KAIO request */
rv = UvOsIoSubmit(ctx, 1, &iocbs);
if (rv != 0) {
/* UNTESTED: in practice this should fail only with ENOMEM */
raft_aligned_free(size, buf);
UvOsIoDestroy(ctx);
		/* On ZFS 0.8 this is not properly supported yet. Also, when a
		 * binary compiled on a kernel with RWF_NOWAIT support runs on
		 * an older kernel, we might get EINVAL. */
if (errno == EOPNOTSUPP || errno == EINVAL) {
*ok = false;
return 0;
}
UvOsErrMsg(errmsg, "io_submit", rv);
return RAFT_IOERR;
}
/* Fetch the response: will block until done. */
n_events = UvOsIoGetevents(ctx, 1, 1, &event, NULL);
assert(n_events == 1);
if (n_events != 1) {
/* UNTESTED */
UvOsErrMsg(errmsg, "UvOsIoGetevents", n_events);
return RAFT_IOERR;
}
/* Release the write buffer. */
raft_aligned_free(size, buf);
/* Release the KAIO context handle. */
rv = UvOsIoDestroy(ctx);
if (rv != 0) {
UvOsErrMsg(errmsg, "io_destroy", rv);
return RAFT_IOERR;
}
if (event.res > 0) {
assert(event.res == (int)size);
*ok = true;
} else {
/* UNTESTED: this should basically fail only because of disk
* errors, since we allocated the file with posix_fallocate and
* the block size is supposed to be correct. */
*ok = false;
}
return 0;
}
#define UV__FS_PROBE_FALLOCATE_FILE ".probe_fallocate"
/* Leave detection of other error conditions to other probe* functions, only
* bother checking if posix_fallocate returns success. */
static void probeFallocate(const char *dir, bool *fallocate)
{
int flags = O_WRONLY | O_CREAT | O_EXCL; /* Common open flags */
char ignored[RAFT_ERRMSG_BUF_SIZE];
int rv = 0;
int fd = -1;
*fallocate = false;
UvFsRemoveFile(dir, UV__FS_PROBE_FALLOCATE_FILE, ignored);
rv = uvFsOpenFile(dir, UV__FS_PROBE_FALLOCATE_FILE, flags,
S_IRUSR | S_IWUSR, &fd, ignored);
if (rv != 0) {
goto out;
}
rv = UvOsFallocate(fd, 0, (off_t)4096);
if (rv == 0) {
*fallocate = true;
}
close(fd);
out:
UvFsRemoveFile(dir, UV__FS_PROBE_FALLOCATE_FILE, ignored);
}
#define UV__FS_PROBE_FILE ".probe"
#define UV__FS_PROBE_FILE_SIZE 4096
int UvFsProbeCapabilities(const char *dir,
size_t *direct,
bool *async,
bool *fallocate,
char *errmsg)
{
int fd; /* File descriptor of the probe file */
int rv;
char ignored[RAFT_ERRMSG_BUF_SIZE];
probeFallocate(dir, fallocate);
/* Create a temporary probe file. */
UvFsRemoveFile(dir, UV__FS_PROBE_FILE, ignored);
rv = UvFsAllocateFile(dir, UV__FS_PROBE_FILE, UV__FS_PROBE_FILE_SIZE,
&fd, *fallocate, errmsg);
if (rv != 0) {
ErrMsgWrapf(errmsg, "create I/O capabilities probe file");
goto err;
}
UvFsRemoveFile(dir, UV__FS_PROBE_FILE, ignored);
/* Check if we can use direct I/O. */
rv = probeDirectIO(fd, direct, errmsg);
if (rv != 0) {
ErrMsgWrapf(errmsg, "probe Direct I/O");
goto err_after_file_open;
}
/* If direct I/O is not possible, we can't perform fully asynchronous
* I/O, because io_submit might potentially block. */
if (*direct == 0) {
*async = false;
goto out;
}
rv = probeAsyncIO(fd, *direct, async, errmsg);
if (rv != 0) {
ErrMsgWrapf(errmsg, "probe Async I/O");
goto err_after_file_open;
}
out:
close(fd);
return 0;
err_after_file_open:
close(fd);
err:
return rv;
}
dqlite-1.16.7/src/raft/uv_fs.h 0000664 0000000 0000000 00000007003 14652527134 0016117 0 ustar 00root root 0000000 0000000 /* File system related utilities. */
#ifndef UV_FS_H_
#define UV_FS_H_
#include <uv.h>
#include "../raft.h"
#include "err.h"
#define TMP_FILE_PREFIX "tmp-"
#define TMP_FILE_FMT TMP_FILE_PREFIX "%s"
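/* For example, snprintf(buf, sizeof buf, TMP_FILE_FMT, "metadata1") renders
 * the temporary filename "tmp-metadata1". */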
/* Check that the given directory can be used. */
int UvFsCheckDir(const char *dir, char *errmsg);
/* Sync the given directory by calling fsync(). */
int UvFsSyncDir(const char *dir, char *errmsg);
/* Check whether the given file exists. */
int UvFsFileExists(const char *dir,
const char *filename,
bool *exists,
char *errmsg);
/* Get the size of the given file. */
int UvFsFileSize(const char *dir,
const char *filename,
off_t *size,
char *errmsg);
/* Check whether the given file in the given directory is empty. */
int UvFsFileIsEmpty(const char *dir,
const char *filename,
bool *empty,
char *errmsg);
/* Create the given file in the given directory and allocate the given size to
* it, returning its file descriptor. The file must not exist yet. */
int UvFsAllocateFile(const char *dir,
const char *filename,
size_t size,
uv_file *fd,
bool fallocate,
char *errmsg);
/* Create a file and write the given content into it. */
int UvFsMakeFile(const char *dir,
const char *filename,
struct raft_buffer *bufs,
unsigned n_bufs,
char *errmsg);
/* Create or overwrite a file.
*
 * If the file does not exist yet, it gets created, the given content written
* to it, and then fully persisted to disk by fsync()'ing the file and the
* dir.
*
* If the file already exists, it gets overwritten. The assumption is that the
* file size will stay the same and its content will change, so only fdatasync()
* will be used */
int UvFsMakeOrOverwriteFile(const char *dir,
const char *filename,
const struct raft_buffer *buf,
char *errmsg);
/* Open a file for reading. */
int UvFsOpenFileForReading(const char *dir,
const char *filename,
uv_file *fd,
char *errmsg);
/* Read exactly buf->len bytes from the given file descriptor into
buf->base. Fail if less than buf->len bytes are read. */
int UvFsReadInto(uv_file fd, struct raft_buffer *buf, char *errmsg);
/* Read all the content of the given file. */
int UvFsReadFile(const char *dir,
const char *filename,
struct raft_buffer *buf,
char *errmsg);
/* Read exactly buf->len bytes from the given file into buf->base. Fail if less
* than buf->len bytes are read. */
int UvFsReadFileInto(const char *dir,
const char *filename,
struct raft_buffer *buf,
char *errmsg);
/* Synchronously remove a file, calling the unlink() system call. */
int UvFsRemoveFile(const char *dir, const char *filename, char *errmsg);
/* Synchronously truncate a file to the given size and then rename it. */
int UvFsTruncateAndRenameFile(const char *dir,
size_t size,
const char *filename1,
const char *filename2,
char *errmsg);
/* Synchronously rename a file. */
int UvFsRenameFile(const char *dir,
const char *filename1,
const char *filename2,
char *errmsg);
/* Return information about the I/O capabilities of the underlying file
* system.
*
* The @direct parameter will be set to zero if direct I/O is not possible, or
* to the block size to use for direct I/O otherwise.
*
* The @async parameter will be set to true if fully asynchronous I/O is
* possible using the KAIO API. */
int UvFsProbeCapabilities(const char *dir,
size_t *direct,
bool *async,
bool *fallocate,
char *errmsg);
#endif /* UV_FS_H_ */
dqlite-1.16.7/src/raft/uv_ip.c 0000664 0000000 0000000 00000003263 14652527134 0016116 0 ustar 00root root 0000000 0000000 #include <netdb.h>
#include <string.h>
#include <sys/socket.h>
#include "../raft.h"
#include "uv_ip.h"
static const char *strCpyUntil(char *target,
const char *source,
size_t target_size,
char separator)
{
size_t i;
for (i = 0; i < target_size; ++i) {
if (!source[i] || source[i] == separator) {
target[i] = 0;
return source + i;
} else {
target[i] = source[i];
}
}
return NULL;
}
int uvIpAddrSplit(const char *address,
char *host,
size_t host_size,
char *service,
size_t service_size)
{
char colon = ':';
const char *service_ptr = NULL;
if (host) {
service_ptr = strCpyUntil(host, address, host_size, colon);
if (!service_ptr) {
return RAFT_NAMETOOLONG;
}
}
if (service) {
if (!service_ptr) {
service_ptr = strchr(address, colon);
}
if (!service_ptr || *service_ptr == 0 ||
*(++service_ptr) == 0) {
service_ptr = "8080";
}
if (!strCpyUntil(service, service_ptr, service_size, 0)) {
return RAFT_NAMETOOLONG;
}
}
return 0;
}
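/* Usage sketch (illustrative only):
 *
 *   char host[64];
 *   char service[64];
 *   int rv = uvIpAddrSplit("127.0.0.1:9001", host, sizeof(host),
 *                          service, sizeof(service));
 *
 * leaves rv == 0, host == "127.0.0.1" and service == "9001", while an
 * address with no explicit port, such as "127.0.0.1", falls back to the
 * default service "8080". */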
/* Synchronously resolve a hostname to an IP address. */
int uvIpResolveBindAddresses(const char *address, struct addrinfo **ai_result)
{
static struct addrinfo hints = {
.ai_flags = AI_PASSIVE | AI_NUMERICSERV,
.ai_family = AF_INET,
.ai_socktype = SOCK_STREAM,
.ai_protocol = 0};
char hostname[NI_MAXHOST];
char service[NI_MAXSERV];
int rv;
rv = uvIpAddrSplit(address, hostname, sizeof(hostname), service,
sizeof(service));
if (rv != 0) {
return rv;
}
if (hostname[0]) {
rv = getaddrinfo(hostname, service, &hints, ai_result);
} else {
rv = getaddrinfo(NULL, service, &hints, ai_result);
}
if (rv != 0) {
return RAFT_IOERR;
}
return 0;
}
dqlite-1.16.7/src/raft/uv_ip.h 0000664 0000000 0000000 00000000647 14652527134 0016126 0 ustar 00root root 0000000 0000000 /* IP-related utils. */
#ifndef UV_IP_H_
#define UV_IP_H_
#include <stddef.h>
/* Split @address into @host and @service. */
int uvIpAddrSplit(const char *address,
char *host,
size_t host_size,
char *service,
size_t service_size);
struct addrinfo;
/* Synchronously resolve a hostname to an IP address. */
int uvIpResolveBindAddresses(const char *address, struct addrinfo **ai_result);
#endif /* UV_IP_H */
dqlite-1.16.7/src/raft/uv_list.c 0000664 0000000 0000000 00000004617 14652527134 0016465 0 ustar 00root root 0000000 0000000 #include <stdio.h>
#include <string.h>
#include "assert.h"
#include "uv.h"
static const char *uvListIgnored[] = {".", "..", "metadata1", "metadata2",
NULL};
/* Return true if the given filename should be ignored. */
static bool uvListShouldIgnore(const char *filename)
{
const char **cursor = uvListIgnored;
bool result = false;
if (strlen(filename) >= UV__FILENAME_LEN) {
return true;
}
while (*cursor != NULL) {
if (strcmp(filename, *cursor) == 0) {
result = true;
break;
}
cursor++;
}
return result;
}
int UvList(struct uv *uv,
struct uvSnapshotInfo *snapshots[],
size_t *n_snapshots,
struct uvSegmentInfo *segments[],
size_t *n_segments,
char *errmsg)
{
struct uv_fs_s req;
struct uv_dirent_s entry;
int n;
int i;
int rv;
n = uv_fs_scandir(NULL, &req, uv->dir, 0, NULL);
if (n < 0) {
ErrMsgPrintf(errmsg, "scan data directory: %s", uv_strerror(n));
return RAFT_IOERR;
}
*snapshots = NULL;
*n_snapshots = 0;
*segments = NULL;
*n_segments = 0;
rv = 0;
for (i = 0; i < n; i++) {
const char *filename;
bool appended;
rv = uv_fs_scandir_next(&req, &entry);
assert(rv == 0); /* Can't fail in libuv */
filename = entry.name;
		/* If an error occurred while processing a preceding entry or
* if we know that this is not a segment filename, just free it
* and skip to the next one. */
if (uvListShouldIgnore(filename)) {
tracef("ignore %s", filename);
continue;
}
/* Append to the snapshot list if it's a snapshot metadata
* filename and a valid associated snapshot file exists. */
rv = UvSnapshotInfoAppendIfMatch(uv, filename, snapshots,
n_snapshots, &appended);
if (rv != 0) {
goto error;
}
if (appended) {
tracef("snapshot %s", filename);
continue;
}
/* Append to the segment list if it's a segment filename */
rv = uvSegmentInfoAppendIfMatch(entry.name, segments,
n_segments, &appended);
if (rv != 0) {
goto error;
}
if (appended) {
tracef("segment %s", filename);
continue;
}
tracef("ignore %s", filename);
}
rv = uv_fs_scandir_next(&req, &entry);
assert(rv == UV_EOF);
if (*snapshots != NULL) {
UvSnapshotSort(*snapshots, *n_snapshots);
}
if (*segments != NULL) {
uvSegmentSort(*segments, *n_segments);
}
return 0;
error:
uv_fs_req_cleanup(&req);
raft_free(*segments);
*segments = NULL;
raft_free(*snapshots);
*snapshots = NULL;
return rv;
}
#undef tracef
dqlite-1.16.7/src/raft/uv_metadata.c 0000664 0000000 0000000 00000012445 14652527134 0017270 0 ustar 00root root 0000000 0000000 #include "assert.h"
#include "byte.h"
#include "uv.h"
#include "uv_encoding.h"
/* We have metadata1 and metadata2. */
#define METADATA_FILENAME_PREFIX "metadata"
#define METADATA_FILENAME_SIZE (sizeof(METADATA_FILENAME_PREFIX) + 2)
/* Format, version, term, vote */
#define METADATA_CONTENT_SIZE (8 * 4)
/* Encode the content of a metadata file. */
static void uvMetadataEncode(const struct uvMetadata *metadata, void *buf)
{
void *cursor = buf;
bytePut64(&cursor, UV__DISK_FORMAT);
bytePut64(&cursor, metadata->version);
bytePut64(&cursor, metadata->term);
bytePut64(&cursor, metadata->voted_for);
}
/* Decode the content of a metadata file. */
static int uvMetadataDecode(const void *buf,
struct uvMetadata *metadata,
char *errmsg)
{
const void *cursor = buf;
uint64_t format;
format = byteGet64(&cursor);
if (format != UV__DISK_FORMAT) {
ErrMsgPrintf(errmsg, "bad format version %ju", format);
return RAFT_MALFORMED;
}
metadata->version = byteGet64(&cursor);
metadata->term = byteGet64(&cursor);
metadata->voted_for = byteGet64(&cursor);
/* Coherence checks that values make sense */
if (metadata->version == 0) {
ErrMsgPrintf(errmsg, "version is set to zero");
return RAFT_CORRUPT;
}
return 0;
}
/* Render the filename of the metadata file with index @n. */
static void uvMetadataFilename(const unsigned short n, char *filename)
{
sprintf(filename, METADATA_FILENAME_PREFIX "%d", n);
}
/* Read the n'th metadata file (with n equal to 1 or 2) and decode the content
* of the file, populating the given metadata buffer accordingly. */
static int uvMetadataLoadN(const char *dir,
const unsigned short n,
struct uvMetadata *metadata,
char *errmsg)
{
char filename[METADATA_FILENAME_SIZE]; /* Filename of the metadata file
*/
uint8_t content[METADATA_CONTENT_SIZE]; /* Content of metadata file */
off_t size;
struct raft_buffer buf;
bool exists;
int rv;
assert(n == 1 || n == 2);
/* Render the metadata path */
uvMetadataFilename(n, filename);
rv = UvFsFileExists(dir, filename, &exists, errmsg);
if (rv != 0) {
ErrMsgWrapf(errmsg, "check if %s exists", filename);
return rv;
}
memset(metadata, 0, sizeof *metadata);
/* If the file does not exist, just return. */
if (!exists) {
return 0;
}
	/* If the file exists but has fewer bytes than expected, assume that
	 * the server crashed while writing this metadata file, and pretend it
	 * has not been written at all. If it has more bytes than expected,
	 * return an error. */
rv = UvFsFileSize(dir, filename, &size, errmsg);
if (rv != 0) {
ErrMsgWrapf(errmsg, "check size of %s", filename);
return rv;
}
if (size != sizeof content) {
if ((size_t)size < sizeof content) {
rv = UvFsRemoveFile(dir, filename, errmsg);
if (rv != 0) {
return rv;
}
return 0;
}
ErrMsgPrintf(errmsg, "%s has size %jd instead of %zu", filename,
(intmax_t)size, sizeof content);
return RAFT_CORRUPT;
}
/* Read the content of the metadata file. */
buf.base = content;
buf.len = sizeof content;
rv = UvFsReadFileInto(dir, filename, &buf, errmsg);
if (rv != 0) {
ErrMsgWrapf(errmsg, "read content of %s", filename);
return rv;
};
/* Decode the content of the metadata file. */
rv = uvMetadataDecode(content, metadata, errmsg);
if (rv != 0) {
ErrMsgWrapf(errmsg, "decode content of %s", filename);
return rv;
}
return 0;
}
int uvMetadataLoad(const char *dir, struct uvMetadata *metadata, char *errmsg)
{
struct uvMetadata metadata1;
struct uvMetadata metadata2;
int rv;
/* Read the two metadata files (if available). */
rv = uvMetadataLoadN(dir, 1, &metadata1, errmsg);
if (rv != 0) {
return rv;
}
rv = uvMetadataLoadN(dir, 2, &metadata2, errmsg);
if (rv != 0) {
return rv;
}
/* Check the versions. */
if (metadata1.version == 0 && metadata2.version == 0) {
/* Neither metadata file exists: have a brand new server. */
metadata->version = 0;
metadata->term = 0;
metadata->voted_for = 0;
} else if (metadata1.version == metadata2.version) {
/* The two metadata files can't have the same version. */
ErrMsgPrintf(errmsg,
"metadata1 and metadata2 are both at version %llu",
metadata1.version);
return RAFT_CORRUPT;
} else {
		/* Pick the metadata with the greater version. */
if (metadata1.version > metadata2.version) {
*metadata = metadata1;
} else {
*metadata = metadata2;
}
}
return 0;
}
/* Return the metadata file index associated with the given version. */
static unsigned short uvMetadataFileIndex(unsigned long long version)
{
return version % 2 == 1 ? 1 : 2;
}
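/* The two files therefore alternate as the version grows: version 1 goes to
 * metadata1, version 2 to metadata2, version 3 back to metadata1, and so on.
 * Since uvMetadataLoad() picks the file with the greater version, a crash
 * while writing one file always leaves the other, older but consistent, copy
 * intact. */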
int uvMetadataStore(struct uv *uv, const struct uvMetadata *metadata)
{
char filename[METADATA_FILENAME_SIZE]; /* Filename of the metadata file
*/
uint8_t content[METADATA_CONTENT_SIZE]; /* Content of metadata file */
struct raft_buffer buf;
unsigned short n;
int rv;
assert(metadata->version > 0);
/* Encode the given metadata. */
uvMetadataEncode(metadata, content);
/* Render the metadata file name. */
n = uvMetadataFileIndex(metadata->version);
uvMetadataFilename(n, filename);
/* Write the metadata file, creating it if it does not exist. */
buf.base = content;
buf.len = sizeof content;
rv = UvFsMakeOrOverwriteFile(uv->dir, filename, &buf, uv->io->errmsg);
if (rv != 0) {
ErrMsgWrapf(uv->io->errmsg, "persist %s", filename);
return rv;
}
return 0;
}
dqlite-1.16.7/src/raft/uv_os.c 0000664 0000000 0000000 00000007772 14652527134 0016140 0 ustar 00root root 0000000 0000000 #include "uv_os.h"
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/eventfd.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <sys/vfs.h>
#include <unistd.h>
#include "assert.h"
#include "err.h"
#include "syscall.h"
/* Default permissions when creating a directory. */
#define DEFAULT_DIR_PERM 0700
int UvOsOpen(const char *path, int flags, int mode, uv_file *fd)
{
struct uv_fs_s req;
int rv;
rv = uv_fs_open(NULL, &req, path, flags, mode, NULL);
if (rv < 0) {
return rv;
}
*fd = rv;
return 0;
}
int UvOsClose(uv_file fd)
{
struct uv_fs_s req;
return uv_fs_close(NULL, &req, fd, NULL);
}
/* Emulate fallocate(). Mostly taken from glibc's implementation. */
int UvOsFallocateEmulation(int fd, off_t offset, off_t len)
{
ssize_t increment;
struct statfs f;
int rv;
rv = fstatfs(fd, &f);
if (rv != 0) {
return -errno;
}
if (f.f_bsize == 0) {
increment = 512;
} else if (f.f_bsize < 4096) {
increment = (ssize_t)f.f_bsize;
} else {
increment = 4096;
}
for (offset += (len - 1) % increment; len > 0; offset += increment) {
len -= increment;
rv = (int)pwrite(fd, "", 1, offset);
if (rv != 1) {
return -errno;
}
}
return 0;
}
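/* For instance, with a 4096-byte block size, a call with offset 0 and len
 * 8192 issues one-byte writes at offsets 4095 and 8191, touching every block
 * exactly once so that the file system materializes it, mirroring glibc's
 * posix_fallocate fallback. */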
int UvOsFallocate(uv_file fd, off_t offset, off_t len)
{
int rv;
rv = posix_fallocate(fd, offset, len);
if (rv != 0) {
/* From the manual page:
*
* posix_fallocate() returns zero on success, or an error
* number on failure. Note that errno is not set.
*/
return -rv;
}
return 0;
}
int UvOsTruncate(uv_file fd, off_t offset)
{
struct uv_fs_s req;
return uv_fs_ftruncate(NULL, &req, fd, offset, NULL);
}
int UvOsFsync(uv_file fd)
{
struct uv_fs_s req;
return uv_fs_fsync(NULL, &req, fd, NULL);
}
int UvOsFdatasync(uv_file fd)
{
struct uv_fs_s req;
return uv_fs_fdatasync(NULL, &req, fd, NULL);
}
int UvOsStat(const char *path, uv_stat_t *sb)
{
struct uv_fs_s req;
int rv;
rv = uv_fs_stat(NULL, &req, path, NULL);
if (rv != 0) {
return rv;
}
memcpy(sb, &req.statbuf, sizeof *sb);
return 0;
}
int UvOsWrite(uv_file fd,
const uv_buf_t bufs[],
unsigned int nbufs,
int64_t offset)
{
struct uv_fs_s req;
return uv_fs_write(NULL, &req, fd, bufs, nbufs, offset, NULL);
}
int UvOsUnlink(const char *path)
{
struct uv_fs_s req;
return uv_fs_unlink(NULL, &req, path, NULL);
}
int UvOsRename(const char *path1, const char *path2)
{
struct uv_fs_s req;
return uv_fs_rename(NULL, &req, path1, path2, NULL);
}
int UvOsJoin(const char *dir, const char *filename, char *path)
{
if (!UV__DIR_HAS_VALID_LEN(dir) ||
!UV__FILENAME_HAS_VALID_LEN(filename)) {
return -1;
}
strcpy(path, dir);
strcat(path, "/");
strcat(path, filename);
return 0;
}
int UvOsIoSetup(unsigned nr, aio_context_t *ctxp)
{
int rv;
rv = io_setup(nr, ctxp);
if (rv == -1) {
return -errno;
}
return 0;
}
int UvOsIoDestroy(aio_context_t ctx)
{
int rv;
rv = io_destroy(ctx);
if (rv == -1) {
return -errno;
}
return 0;
}
int UvOsIoSubmit(aio_context_t ctx, long nr, struct iocb **iocbpp)
{
int rv;
rv = io_submit(ctx, nr, iocbpp);
if (rv == -1) {
return -errno;
}
assert(rv == nr); /* TODO: can something else be returned? */
return 0;
}
int UvOsIoGetevents(aio_context_t ctx,
long min_nr,
long max_nr,
struct io_event *events,
struct timespec *timeout)
{
int rv;
do {
rv = io_getevents(ctx, min_nr, max_nr, events, timeout);
} while (rv == -1 && errno == EINTR);
if (rv == -1) {
return -errno;
}
assert(rv >= min_nr);
assert(rv <= max_nr);
return rv;
}
int UvOsEventfd(unsigned int initval, int flags)
{
int rv;
/* At the moment only UV_FS_O_NONBLOCK is supported */
assert(flags == UV_FS_O_NONBLOCK);
flags = EFD_NONBLOCK | EFD_CLOEXEC;
rv = eventfd(initval, flags);
if (rv == -1) {
return -errno;
}
return rv;
}
int UvOsSetDirectIo(uv_file fd)
{
int flags; /* Current fcntl flags */
int rv;
flags = fcntl(fd, F_GETFL);
rv = fcntl(fd, F_SETFL, flags | UV_FS_O_DIRECT);
if (rv == -1) {
return -errno;
}
return 0;
}
dqlite-1.16.7/src/raft/uv_os.h 0000664 0000000 0000000 00000005503 14652527134 0016133 0 ustar 00root root 0000000 0000000 /* Operating system related utilities. */
#ifndef UV_OS_H_
#define UV_OS_H_
#include <linux/aio_abi.h>
#include <string.h>
#include <sys/types.h>
#include <time.h>
#include <uv.h>
/* Maximum size of a full file system path string. */
#define UV__PATH_SZ 1024
/* Maximum length of a filename string. */
#define UV__FILENAME_LEN 128
/* Length of path separator. */
#define UV__SEP_LEN 1 /* strlen("/") */
/* True if STR's length is at most LEN. */
#define LEN_AT_MOST_(STR, LEN) (strnlen(STR, LEN + 1) <= LEN)
/* Maximum length of a directory path string. */
#define UV__DIR_LEN (UV__PATH_SZ - UV__SEP_LEN - UV__FILENAME_LEN - 1)
/* True if the given DIR string has at most UV__DIR_LEN chars. */
#define UV__DIR_HAS_VALID_LEN(DIR) LEN_AT_MOST_(DIR, UV__DIR_LEN)
/* True if the given FILENAME string has at most UV__FILENAME_LEN chars. */
#define UV__FILENAME_HAS_VALID_LEN(FILENAME) \
LEN_AT_MOST_(FILENAME, UV__FILENAME_LEN)
/* Portable open() */
int UvOsOpen(const char *path, int flags, int mode, uv_file *fd);
/* Portable close() */
int UvOsClose(uv_file fd);
/* TODO: figure a portable abstraction. */
int UvOsFallocate(uv_file fd, off_t offset, off_t len);
/* Emulation to use in case UvOsFallocate fails with -EOPNOTSUPP.
* This might happen with a libc implementation (e.g. musl) that
* doesn't implement a transparent fallback if fallocate() is
* not supported by the underlying file system. */
int UvOsFallocateEmulation(int fd, off_t offset, off_t len);
/* Portable truncate() */
int UvOsTruncate(uv_file fd, off_t offset);
/* Portable fsync() */
int UvOsFsync(uv_file fd);
/* Portable fdatasync() */
int UvOsFdatasync(uv_file fd);
/* Portable stat() */
int UvOsStat(const char *path, uv_stat_t *sb);
/* Portable write() */
int UvOsWrite(uv_file fd,
const uv_buf_t bufs[],
unsigned int nbufs,
int64_t offset);
/* Portable unlink() */
int UvOsUnlink(const char *path);
/* Portable rename() */
int UvOsRename(const char *path1, const char *path2);
/* Join dir and filename into a full OS path. */
int UvOsJoin(const char *dir, const char *filename, char *path);
/* TODO: figure a portable abstraction. */
int UvOsIoSetup(unsigned nr, aio_context_t *ctxp);
int UvOsIoDestroy(aio_context_t ctx);
int UvOsIoSubmit(aio_context_t ctx, long nr, struct iocb **iocbpp);
int UvOsIoGetevents(aio_context_t ctx,
long min_nr,
long max_nr,
struct io_event *events,
struct timespec *timeout);
int UvOsEventfd(unsigned int initval, int flags);
int UvOsSetDirectIo(uv_file fd);
/* Format an error message caused by a failed system call or stdlib function. */
#define UvOsErrMsg(ERRMSG, SYSCALL, ERRNUM) \
{ \
ErrMsgPrintf(ERRMSG, "%s", uv_strerror(ERRNUM)); \
ErrMsgWrapf(ERRMSG, SYSCALL); \
}
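/* For example, after a system call failed with return value rv < 0:
 *
 *   UvOsErrMsg(errmsg, "open", rv);
 *
 * fills errmsg with something like "open: no such file or directory". */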
#endif /* UV_OS_H_ */
dqlite-1.16.7/src/raft/uv_prepare.c 0000664 0000000 0000000 00000020774 14652527134 0017152 0 ustar 00root root 0000000 0000000 #include <stdio.h>
#include <string.h>
#include "assert.h"
#include "heap.h"
#include "uv.h"
#include "uv_os.h"
/* The happy path for UvPrepare is:
*
* - If there is an unused open segment available, return its fd and counter
* immediately.
*
* - Otherwise, wait for the creation of a new open segment to complete,
* possibly kicking off the creation logic if no segment is being created
* currently.
*
* Possible failure modes are:
*
* - The create file request fails, in that case we fail all pending prepare
* requests and we mark the uv instance as errored.
*
* On close:
*
* - Cancel all pending prepare requests.
* - Remove unused prepared open segments.
* - Wait for any pending internal segment creation and then discard the newly
* created segment.
*/
/* Number of open segments that we try to keep ready for writing. */
#define UV__TARGET_POOL_SIZE 2
/* An open segment being prepared or sitting in the pool */
struct uvIdleSegment
{
struct uv *uv; /* libuv I/O implementation object */
size_t size; /* Segment size */
struct uv_work_s work; /* To execute logic in the threadpool */
int status; /* Result of threadpool callback */
char errmsg[RAFT_ERRMSG_BUF_SIZE]; /* Error of threadpool callback */
unsigned long long counter; /* Segment counter */
char filename[UV__FILENAME_LEN]; /* Filename of the segment */
uv_file fd; /* File descriptor of prepared file */
queue queue; /* Pool */
};
static void uvPrepareWorkCb(uv_work_t *work)
{
struct uvIdleSegment *segment = work->data;
struct uv *uv = segment->uv;
int rv;
rv = UvFsAllocateFile(uv->dir, segment->filename, segment->size,
&segment->fd, uv->fallocate, segment->errmsg);
if (rv != 0) {
goto err;
}
rv = UvFsSyncDir(uv->dir, segment->errmsg);
if (rv != 0) {
goto err_after_allocate;
}
segment->status = 0;
return;
err_after_allocate:
UvOsClose(segment->fd);
err:
assert(rv != 0);
segment->status = rv;
return;
}
/* Flush all pending requests, invoking their callbacks with the given
* status. */
static void uvPrepareFinishAllRequests(struct uv *uv, int status)
{
while (!queue_empty(&uv->prepare_reqs)) {
queue *head;
struct uvPrepare *req;
head = queue_head(&uv->prepare_reqs);
req = QUEUE_DATA(head, struct uvPrepare, queue);
queue_remove(&req->queue);
req->cb(req, status);
}
}
/* Pop the oldest prepared segment in the pool and return its fd and counter
* through the given pointers. */
static void uvPrepareConsume(struct uv *uv, uv_file *fd, uvCounter *counter)
{
queue *head;
struct uvIdleSegment *segment;
/* Pop a segment from the pool. */
head = queue_head(&uv->prepare_pool);
segment = QUEUE_DATA(head, struct uvIdleSegment, queue);
assert(segment->fd >= 0);
queue_remove(&segment->queue);
*fd = segment->fd;
*counter = segment->counter;
RaftHeapFree(segment);
}
/* Finish the oldest pending prepare request using the next available prepared
* segment. */
static void uvPrepareFinishOldestRequest(struct uv *uv)
{
queue *head;
struct uvPrepare *req;
assert(!uv->closing);
assert(!queue_empty(&uv->prepare_reqs));
assert(!queue_empty(&uv->prepare_pool));
/* Pop the head of the prepare requests queue. */
head = queue_head(&uv->prepare_reqs);
req = QUEUE_DATA(head, struct uvPrepare, queue);
queue_remove(&req->queue);
/* Finish the request */
uvPrepareConsume(uv, &req->fd, &req->counter);
req->cb(req, 0);
}
/* Return the number of ready prepared open segments in the pool. */
static unsigned uvPrepareCount(struct uv *uv)
{
queue *head;
unsigned n;
n = 0;
QUEUE_FOREACH(head, &uv->prepare_pool)
{
n++;
}
return n;
}
static void uvPrepareAfterWorkCb(uv_work_t *work, int status);
/* Start creating a new segment file. */
static int uvPrepareStart(struct uv *uv)
{
struct uvIdleSegment *segment;
int rv;
assert(uv->prepare_inflight == NULL);
assert(uvPrepareCount(uv) < UV__TARGET_POOL_SIZE);
segment = RaftHeapMalloc(sizeof *segment);
if (segment == NULL) {
rv = RAFT_NOMEM;
goto err;
}
memset(segment, 0, sizeof *segment);
segment->uv = uv;
segment->counter = uv->prepare_next_counter;
segment->work.data = segment;
segment->fd = -1;
segment->size = uv->block_size * uvSegmentBlocks(uv);
sprintf(segment->filename, UV__OPEN_TEMPLATE, segment->counter);
tracef("create open segment %s", segment->filename);
rv = uv_queue_work(uv->loop, &segment->work, uvPrepareWorkCb,
uvPrepareAfterWorkCb);
if (rv != 0) {
/* UNTESTED: with the current libuv implementation this can't
* fail. */
tracef("can't create segment %s: %s", segment->filename,
uv_strerror(rv));
rv = RAFT_IOERR;
goto err_after_segment_alloc;
}
uv->prepare_inflight = segment;
uv->prepare_next_counter++;
return 0;
err_after_segment_alloc:
RaftHeapFree(segment);
err:
assert(rv != 0);
return rv;
}
static void uvPrepareAfterWorkCb(uv_work_t *work, int status)
{
struct uvIdleSegment *segment = work->data;
struct uv *uv = segment->uv;
int rv;
assert(status == 0);
uv->prepare_inflight =
NULL; /* Reset the creation in-progress marker. */
/* If we are closing, let's discard the segment. All pending requests
* have already been fired with RAFT_CANCELED. */
if (uv->closing) {
assert(queue_empty(&uv->prepare_pool));
assert(queue_empty(&uv->prepare_reqs));
if (segment->status == 0) {
char errmsg[RAFT_ERRMSG_BUF_SIZE];
UvOsClose(segment->fd);
UvFsRemoveFile(uv->dir, segment->filename, errmsg);
}
tracef("canceled creation of %s", segment->filename);
RaftHeapFree(segment);
uvMaybeFireCloseCb(uv);
return;
}
/* If the request has failed, mark all pending requests as failed and
* don't try to create any further segment.
*
* Note that if there's no pending request, we don't set the error
* message, to avoid overwriting previous errors. */
if (segment->status != 0) {
if (!queue_empty(&uv->prepare_reqs)) {
ErrMsgTransferf(segment->errmsg, uv->io->errmsg,
"create segment %s", segment->filename);
uvPrepareFinishAllRequests(uv, segment->status);
}
uv->errored = true;
RaftHeapFree(segment);
return;
}
assert(segment->fd >= 0);
tracef("completed creation of %s", segment->filename);
queue_insert_tail(&uv->prepare_pool, &segment->queue);
/* Let's process any pending request. */
if (!queue_empty(&uv->prepare_reqs)) {
uvPrepareFinishOldestRequest(uv);
}
/* If we are already creating a segment, we're done. */
if (uv->prepare_inflight != NULL) {
return;
}
/* If we already have enough prepared open segments, we're done. There
* can't be any outstanding prepare requests, since if the request queue
* was not empty, we would have called uvPrepareFinishOldestRequest()
* above, thus reducing the pool size and making it smaller than the
* target size. */
if (uvPrepareCount(uv) >= UV__TARGET_POOL_SIZE) {
assert(queue_empty(&uv->prepare_reqs));
return;
}
/* Let's start preparing a new open segment. */
rv = uvPrepareStart(uv);
if (rv != 0) {
uvPrepareFinishAllRequests(uv, rv);
uv->errored = true;
}
}
/* Discard a prepared open segment, closing its file descriptor and removing the
* underlying file. */
static void uvPrepareDiscard(struct uv *uv, uv_file fd, uvCounter counter)
{
char errmsg[RAFT_ERRMSG_BUF_SIZE];
char filename[UV__FILENAME_LEN];
assert(counter > 0);
assert(fd >= 0);
sprintf(filename, UV__OPEN_TEMPLATE, counter);
UvOsClose(fd);
UvFsRemoveFile(uv->dir, filename, errmsg);
}
int UvPrepare(struct uv *uv,
uv_file *fd,
uvCounter *counter,
struct uvPrepare *req,
uvPrepareCb cb)
{
int rv;
assert(!uv->closing);
if (!queue_empty(&uv->prepare_pool)) {
uvPrepareConsume(uv, fd, counter);
goto maybe_start;
}
*fd = -1;
*counter = 0;
req->cb = cb;
queue_insert_tail(&uv->prepare_reqs, &req->queue);
maybe_start:
/* If we are already creating a segment, let's just wait. */
if (uv->prepare_inflight != NULL) {
return 0;
}
rv = uvPrepareStart(uv);
if (rv != 0) {
goto err;
}
return 0;
err:
if (*fd != -1) {
uvPrepareDiscard(uv, *fd, *counter);
} else {
queue_remove(&req->queue);
}
assert(rv != 0);
return rv;
}
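/* Usage sketch (illustrative only, not part of this module): a caller asks
 * for a prepared segment and either gets a file descriptor synchronously,
 * when the pool is non-empty, or is notified later through the callback.
 * The function and callback names are made up. */
__attribute__((unused)) static void examplePrepareCb(struct uvPrepare *req,
						     int status)
{
	if (status == 0) {
		/* req->fd and req->counter identify the new open segment. */
		assert(req->fd >= 0);
	}
}

__attribute__((unused)) static int examplePrepare(struct uv *uv,
						  struct uvPrepare *req)
{
	uv_file fd;
	uvCounter counter;
	int rv = UvPrepare(uv, &fd, &counter, req, examplePrepareCb);
	if (rv == 0 && fd != -1) {
		/* Fast path: a pooled segment was consumed immediately, so
		 * fd and counter are already valid and examplePrepareCb will
		 * not be invoked for this request. */
		uvPrepareDiscard(uv, fd, counter);
	}
	return rv;
}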
void UvPrepareClose(struct uv *uv)
{
assert(uv->closing);
/* Cancel all pending prepare requests. */
uvPrepareFinishAllRequests(uv, RAFT_CANCELED);
/* Remove any unused prepared segment. */
while (!queue_empty(&uv->prepare_pool)) {
queue *head;
struct uvIdleSegment *segment;
head = queue_head(&uv->prepare_pool);
segment = QUEUE_DATA(head, struct uvIdleSegment, queue);
queue_remove(&segment->queue);
uvPrepareDiscard(uv, segment->fd, segment->counter);
RaftHeapFree(segment);
}
}
#undef tracef
dqlite-1.16.7/src/raft/uv_recv.c 0000664 0000000 0000000 00000025410 14652527134 0016443 0 ustar 00root root 0000000 0000000 #include <string.h>
#include "../raft.h"
#include "assert.h"
#include "byte.h"
#include "configuration.h"
#include "err.h"
#include "heap.h"
#include "uv.h"
#include "uv_encoding.h"
/* The happy path for receiving an RPC message is:
*
* - When a peer server successfully establishes a new connection with us, the
* transport invokes our accept callback.
*
* - A new server object is created and added to the servers array. It starts
* reading from the stream handle of the new connection.
*
* - The RPC message preamble is read, which contains the message type and the
* message length.
*
* - The RPC message header is read, whose content depends on the message type.
*
* - Optionally, the RPC message payload is read (for AppendEntries requests).
*
* - The recv callback passed to raft_io->start() gets fired with the received
* message.
*
* Possible failure modes are:
*
* - The peer server disconnects. In this case the read callback will fire with
* UV_EOF, we'll close the stream handle and then release all memory
* associated with the server object.
*
* - The peer server sends us invalid data. In this case we close the stream
* handle and act like above.
*/
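/* Wire format, as parsed by the read state machine below:
 *
 *   preamble[0]   8 bytes  message type (only the low 16 bits are used)
 *   preamble[1]   8 bytes  length of the header section, in bytes
 *   header        preamble[1] bytes, layout depends on the message type
 *   payload       optional (e.g. the entries data of an AppendEntries)
 */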
struct uvServer
{
struct uv *uv; /* libuv I/O implementation object */
raft_id id; /* ID of the remote server */
char *address; /* Address of the other server */
struct uv_stream_s *stream; /* Connection handle */
uv_buf_t buf; /* Sliding buffer for reading incoming data */
uint64_t preamble[2]; /* Static buffer with the request preamble */
uv_buf_t header; /* Dynamic buffer with the request header */
uv_buf_t payload; /* Dynamic buffer with the request payload */
struct raft_message message; /* The message being received */
queue queue; /* Servers queue */
};
/* Initialize a new server object for reading requests from an incoming
* connection. */
static int uvServerInit(struct uvServer *s,
struct uv *uv,
const raft_id id,
const char *address,
struct uv_stream_s *stream)
{
s->uv = uv;
s->id = id;
s->address = RaftHeapMalloc(strlen(address) + 1);
if (s->address == NULL) {
return RAFT_NOMEM;
}
strcpy(s->address, address);
s->stream = stream;
s->stream->data = s;
s->buf.base = NULL;
s->buf.len = 0;
s->preamble[0] = 0;
s->preamble[1] = 0;
s->header.base = NULL;
s->header.len = 0;
s->message.type = 0;
s->payload.base = NULL;
s->payload.len = 0;
queue_insert_tail(&uv->servers, &s->queue);
return 0;
}
static void uvServerDestroy(struct uvServer *s)
{
queue_remove(&s->queue);
if (s->header.base != NULL) {
/* This means we were interrupted while reading the header. */
RaftHeapFree(s->header.base);
switch (s->message.type) {
case RAFT_IO_APPEND_ENTRIES:
RaftHeapFree(s->message.append_entries.entries);
break;
case RAFT_IO_INSTALL_SNAPSHOT:
configurationClose(
&s->message.install_snapshot.conf);
break;
}
}
if (s->payload.base != NULL) {
/* This means we were interrupted while reading the payload. */
RaftHeapFree(s->payload.base);
}
RaftHeapFree(s->address);
RaftHeapFree(s->stream);
}
/* Invoked to initialize the read buffer for the next asynchronous read on the
* socket. */
static void uvServerAllocCb(uv_handle_t *handle,
size_t suggested_size,
uv_buf_t *buf)
{
struct uvServer *s = handle->data;
(void)suggested_size;
assert(!s->uv->closing);
/* If this is the first read of the preamble, or of the header, or of
* the payload, then initialize the read buffer, according to the chunk
* of data that we expect next. */
if (s->buf.len == 0) {
assert(s->buf.base == NULL);
/* Check if we expect the preamble. */
if (s->header.len == 0) {
assert(s->preamble[0] == 0);
assert(s->preamble[1] == 0);
s->buf.base = (char *)s->preamble;
s->buf.len = sizeof s->preamble;
goto out;
}
/* Check if we expect the header. */
if (s->payload.len == 0) {
assert(s->header.len > 0);
assert(s->header.base == NULL);
s->header.base = RaftHeapMalloc(s->header.len);
if (s->header.base == NULL) {
/* Setting all buffer fields to 0 will make
* read_cb fail with ENOBUFS. */
memset(buf, 0, sizeof *buf);
return;
}
s->buf = s->header;
goto out;
}
/* If we get here we should be expecting the payload. */
assert(s->payload.len > 0);
s->payload.base = RaftHeapMalloc(s->payload.len);
if (s->payload.base == NULL) {
/* Setting all buffer fields to 0 will make read_cb fail
* with ENOBUFS. */
memset(buf, 0, sizeof *buf);
return;
}
s->buf = s->payload;
}
out:
*buf = s->buf;
}
/* Callback invoked after the stream handle of this server connection has been
* closed. We can release all resources associated with the server object. */
static void uvServerStreamCloseCb(uv_handle_t *handle)
{
struct uvServer *s = handle->data;
struct uv *uv = s->uv;
uvServerDestroy(s);
RaftHeapFree(s);
uvMaybeFireCloseCb(uv);
}
static void uvServerAbort(struct uvServer *s)
{
struct uv *uv = s->uv;
queue_remove(&s->queue);
queue_insert_tail(&uv->aborting, &s->queue);
uv_close((struct uv_handle_s *)s->stream, uvServerStreamCloseCb);
}
/* Invoke the receive callback. */
static void uvFireRecvCb(struct uvServer *s)
{
s->uv->recv_cb(s->uv->io, &s->message);
/* Reset our state as we'll start reading a new message. We don't need
* to release the payload buffer, since ownership was transferred to the
* user. */
memset(s->preamble, 0, sizeof s->preamble);
raft_free(s->header.base);
s->message.type = 0;
s->header.base = NULL;
s->header.len = 0;
s->payload.base = NULL;
s->payload.len = 0;
}
/* Callback invoked when data has been read from the socket. */
static void uvServerReadCb(uv_stream_t *stream,
ssize_t nread,
const uv_buf_t *buf)
{
struct uvServer *s = stream->data;
int rv;
(void)buf;
assert(!s->uv->closing);
/* If the read was successful, let's check if we have received all the
* data we expected. */
if (nread > 0) {
size_t n = (size_t)nread;
/* We shouldn't have read more data than the pending amount. */
assert(n <= s->buf.len);
/* Advance the read window */
s->buf.base += n;
s->buf.len -= n;
/* If there's more data to read in order to fill the current
* read buffer, just return, we'll be invoked again. */
if (s->buf.len > 0) {
return;
}
if (s->header.len == 0) {
/* If the header buffer is not set, it means that we've
* just completed reading the preamble. */
assert(s->header.base == NULL);
s->header.len = (size_t)byteFlip64(s->preamble[1]);
/* The length of the header must be greater than zero.
*/
if (s->header.len == 0) {
tracef("message has zero length");
goto abort;
}
} else if (s->payload.len == 0) {
/* If the payload buffer is not set, it means we just
* completed reading the message header. */
uint64_t type;
assert(s->header.base != NULL);
type = byteFlip64(s->preamble[0]);
/* Only use first 2 bytes of the type. Normally we would
* check if type doesn't overflow UINT16_MAX, but we
* don't do this to allow future legacy nodes to still
* handle messages that include extra information in the
* 6 unused bytes of the type field of the preamble.
* TODO: This is preparation to add the version of the
* message in the raft preamble. Once this change has
* been active for sufficiently long time, we can start
* encoding the version in some of the remaining bytes
* of s->preamble[0]. */
rv = uvDecodeMessage((uint16_t)type, &s->header,
&s->message, &s->payload.len);
if (rv != 0) {
tracef("decode message: %s",
errCodeToString(rv));
goto abort;
}
s->message.server_id = s->id;
s->message.server_address = s->address;
/* If the message has no payload, we're done. */
if (s->payload.len == 0) {
uvFireRecvCb(s);
}
} else {
/* If we get here it means that we've just completed
* reading the payload. TODO: avoid converting from
* uv_buf_t */
struct raft_buffer payload;
assert(s->payload.base != NULL);
assert(s->payload.len > 0);
switch (s->message.type) {
case RAFT_IO_APPEND_ENTRIES:
payload.base = s->payload.base;
payload.len = s->payload.len;
(void)uvDecodeEntriesBatch(
payload.base, 0,
s->message.append_entries.entries,
s->message.append_entries
.n_entries,
false);
break;
case RAFT_IO_INSTALL_SNAPSHOT:
s->message.install_snapshot.data.base =
s->payload.base;
break;
default:
/* We should never have read a payload
* in the first place */
assert(0);
}
uvFireRecvCb(s);
}
/* Mark that we're done with this chunk. When the alloc callback
* triggers again it will notice that it needs to change the
* read buffer. */
assert(s->buf.len == 0);
s->buf.base = NULL;
return;
}
/* The nread > 0 case above always exits the function, either via the
	 * abort label or with a plain return. */
assert(nread <= 0);
if (nread == 0) {
/* Empty read */
return;
}
if (nread != UV_EOF) {
tracef("receive data: %s", uv_strerror((int)nread));
}
abort:
uvServerAbort(s);
}
/* Start reading incoming requests. */
static int uvServerStart(struct uvServer *s)
{
int rv;
rv = uv_read_start(s->stream, uvServerAllocCb, uvServerReadCb);
if (rv != 0) {
tracef("start reading: %s", uv_strerror(rv));
return RAFT_IOERR;
}
return 0;
}
static int uvAddServer(struct uv *uv,
raft_id id,
const char *address,
struct uv_stream_s *stream)
{
struct uvServer *server;
int rv;
/* Initialize the new connection */
server = RaftHeapMalloc(sizeof *server);
if (server == NULL) {
rv = RAFT_NOMEM;
goto err;
}
rv = uvServerInit(server, uv, id, address, stream);
if (rv != 0) {
goto err_after_server_alloc;
}
/* This will start reading requests. */
rv = uvServerStart(server);
if (rv != 0) {
goto err_after_init_server;
}
return 0;
err_after_init_server:
uvServerDestroy(server);
err_after_server_alloc:
raft_free(server);
err:
assert(rv != 0);
return rv;
}
static void uvRecvAcceptCb(struct raft_uv_transport *transport,
raft_id id,
const char *address,
struct uv_stream_s *stream)
{
struct uv *uv = transport->data;
int rv;
assert(!uv->closing);
rv = uvAddServer(uv, id, address, stream);
if (rv != 0) {
tracef("add server: %s", errCodeToString(rv));
uv_close((struct uv_handle_s *)stream,
(uv_close_cb)RaftHeapFree);
}
}
int UvRecvStart(struct uv *uv)
{
int rv;
rv = uv->transport->listen(uv->transport, uvRecvAcceptCb);
if (rv != 0) {
return rv;
}
return 0;
}
void UvRecvClose(struct uv *uv)
{
while (!queue_empty(&uv->servers)) {
queue *head;
struct uvServer *server;
head = queue_head(&uv->servers);
server = QUEUE_DATA(head, struct uvServer, queue);
uvServerAbort(server);
}
}
#undef tracef
dqlite-1.16.7/src/raft/uv_segment.c 0000664 0000000 0000000 00000072022 14652527134 0017147 0 ustar 00root root 0000000 0000000 #include <fcntl.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include "array.h"
#include "assert.h"
#include "byte.h"
#include "configuration.h"
#include "entry.h"
#include "heap.h"
#include "uv.h"
#include "uv_encoding.h"
/* Check if the given filename matches the one of a closed segment (xxx-yyy), or
* of an open segment (open-xxx), and fill the given info structure if so.
*
* Return true if the filename matched, false otherwise. */
static bool uvSegmentInfoMatch(const char *filename, struct uvSegmentInfo *info)
{
int consumed;
int matched;
size_t n;
size_t filename_len = strnlen(filename, UV__FILENAME_LEN + 1);
assert(filename_len < UV__FILENAME_LEN);
matched = sscanf(filename, UV__CLOSED_TEMPLATE "%n", &info->first_index,
&info->end_index, &consumed);
if (matched == 2 && consumed == (int)filename_len) {
info->is_open = false;
goto match;
}
matched =
sscanf(filename, UV__OPEN_TEMPLATE "%n", &info->counter, &consumed);
if (matched == 1 && consumed == (int)filename_len) {
info->is_open = true;
goto match;
}
return false;
match:
n = sizeof(info->filename) - 1;
strncpy(info->filename, filename, n);
info->filename[n] = '\0';
return true;
}
int uvSegmentInfoAppendIfMatch(const char *filename,
struct uvSegmentInfo *infos[],
size_t *n_infos,
bool *appended)
{
struct uvSegmentInfo info;
bool matched;
int rv;
/* Check if it's a closed or open filename */
matched = uvSegmentInfoMatch(filename, &info);
/* If this is neither a closed or an open segment, return. */
if (!matched) {
*appended = false;
return 0;
}
ARRAY__APPEND(struct uvSegmentInfo, info, infos, n_infos, rv);
if (rv == -1) {
return RAFT_NOMEM;
}
*appended = true;
return 0;
}
/* Compare two segments to decide which one is more recent. */
static int uvSegmentInfoCompare(const void *p1, const void *p2)
{
struct uvSegmentInfo *s1 = (struct uvSegmentInfo *)p1;
struct uvSegmentInfo *s2 = (struct uvSegmentInfo *)p2;
/* Closed segments are less recent than open segments. */
if (s1->is_open && !s2->is_open) {
return 1;
}
if (!s1->is_open && s2->is_open) {
return -1;
}
/* If the segments are open, compare the counter. */
if (s1->is_open) {
assert(s2->is_open);
assert(s1->counter != s2->counter);
return s1->counter < s2->counter ? -1 : 1;
}
/* If the segments are closed, compare the first index. The index ranges
* must be disjoint. */
if (s2->first_index > s1->end_index) {
return -1;
}
return 1;
}
void uvSegmentSort(struct uvSegmentInfo *infos, size_t n_infos)
{
qsort(infos, n_infos, sizeof *infos, uvSegmentInfoCompare);
}
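/* For example, two closed segments covering entries 1-100 and 101-200, plus
 * two open segments with counters 1 and 2, sort as: 1-100, 101-200, open-1,
 * open-2. Closed segments come first, ordered by their disjoint index
 * ranges, then open segments, ordered by counter. */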
int uvSegmentKeepTrailing(struct uv *uv,
struct uvSegmentInfo *segments,
size_t n,
raft_index last_index,
size_t trailing,
char *errmsg)
{
raft_index retain_index;
size_t i;
int rv;
assert(last_index > 0);
assert(n > 0);
if (last_index <= trailing) {
return 0;
}
/* Index of the oldest entry we want to retain. */
retain_index = last_index - trailing + 1;
for (i = 0; i < n; i++) {
struct uvSegmentInfo *segment = &segments[i];
if (segment->is_open) {
break;
}
if (trailing == 0 || segment->end_index < retain_index) {
rv = UvFsRemoveFile(uv->dir, segment->filename, errmsg);
if (rv != 0) {
ErrMsgWrapf(errmsg, "delete closed segment %s",
segment->filename);
return rv;
}
} else {
break;
}
}
return 0;
}
/* Read a segment file and return its format version. */
static int uvReadSegmentFile(struct uv *uv,
const char *filename,
struct raft_buffer *buf,
uint64_t *format)
{
char errmsg[RAFT_ERRMSG_BUF_SIZE];
int rv;
rv = UvFsReadFile(uv->dir, filename, buf, errmsg);
if (rv != 0) {
ErrMsgTransfer(errmsg, uv->io->errmsg, "read file");
return RAFT_IOERR;
}
if (buf->len < 8) {
ErrMsgPrintf(uv->io->errmsg, "file has only %zu bytes",
buf->len);
RaftHeapFree(buf->base);
return RAFT_IOERR;
}
*format = byteFlip64(*(uint64_t *)buf->base);
return 0;
}
/* Consume the content buffer, returning a pointer to the current position and
* advancing the offset of n bytes. Return an error if not enough bytes are
* available. */
static int uvConsumeContent(const struct raft_buffer *content,
size_t *offset,
size_t n,
void **data,
char *errmsg)
{
if (*offset + n > content->len) {
size_t remaining = content->len - *offset;
ErrMsgPrintf(errmsg, "short read: %zu bytes instead of %zu",
remaining, n);
return RAFT_IOERR;
}
if (data != NULL) {
*data = &((uint8_t *)content->base)[*offset];
}
*offset += n;
return 0;
}
/* Load a single batch of entries from a segment.
*
* Set @last to #true if the loaded batch is the last one. */
static int uvLoadEntriesBatch(struct uv *uv,
const struct raft_buffer *content,
struct raft_entry **entries,
unsigned *n_entries,
size_t *offset, /* Offset of last batch */
bool *last)
{
void *checksums; /* CRC32 checksums */
void *batch; /* Entries batch */
unsigned long n; /* Number of entries in the batch */
unsigned max_n; /* Maximum number of entries we expect */
unsigned i; /* Iterate through the entries */
struct raft_buffer header; /* Batch header */
struct raft_buffer data; /* Batch data */
uint32_t crc1; /* Target checksum */
uint32_t crc2; /* Actual checksum */
char errmsg[RAFT_ERRMSG_BUF_SIZE];
size_t start;
int rv;
/* Save the current offset, to provide more information when logging. */
start = *offset;
/* Read the checksums. */
rv = uvConsumeContent(content, offset, sizeof(uint32_t) * 2, &checksums,
errmsg);
if (rv != 0) {
ErrMsgTransfer(errmsg, uv->io->errmsg, "read preamble");
return RAFT_IOERR;
}
/* Read the first 8 bytes of the batch, which contains the number of
* entries in the batch. */
rv =
uvConsumeContent(content, offset, sizeof(uint64_t), &batch, errmsg);
if (rv != 0) {
ErrMsgTransfer(errmsg, uv->io->errmsg, "read preamble");
return RAFT_IOERR;
}
n = (size_t)byteFlip64(*(uint64_t *)batch);
if (n == 0) {
ErrMsgPrintf(uv->io->errmsg,
"entries count in preamble is zero");
rv = RAFT_CORRUPT;
goto err;
}
/* Very optimistic upper bound of the number of entries we should
* expect. This is mainly a protection against allocating too much
* memory. Each entry will consume at least 4 words (for term, type,
* size and payload). */
max_n = UV__MAX_SEGMENT_SIZE / (sizeof(uint64_t) * 4);
if (n > max_n) {
ErrMsgPrintf(uv->io->errmsg,
"entries count %lu in preamble is too high", n);
rv = RAFT_CORRUPT;
goto err;
}
/* Consume the batch header, excluding the first 8 bytes containing the
* number of entries, which we have already read. */
header.len = uvSizeofBatchHeader(n, true);
header.base = batch;
rv = uvConsumeContent(content, offset,
uvSizeofBatchHeader(n, true) - sizeof(uint64_t), NULL,
errmsg);
if (rv != 0) {
ErrMsgTransfer(errmsg, uv->io->errmsg, "read header");
rv = RAFT_IOERR;
goto err;
}
/* Check batch header integrity. */
crc1 = byteFlip32(((uint32_t *)checksums)[0]);
crc2 = byteCrc32(header.base, header.len, 0);
if (crc1 != crc2) {
ErrMsgPrintf(uv->io->errmsg, "header checksum mismatch");
rv = RAFT_CORRUPT;
goto err;
}
/* Decode the batch header, allocating the entries array. */
uint64_t local_data_size = 0;
rv = uvDecodeBatchHeader(header.base, entries, n_entries, &local_data_size);
if (rv != 0) {
goto err;
}
/* Calculate the total size of the batch data. TODO this computation
* should be rolled into the actual parsing part somehow. */
data.len = 0;
for (i = 0; i < n; i++) {
data.len += (*entries)[i].buf.len;
#ifdef DQLITE_NEXT
data.len += sizeof((*entries)[i].local_data);
#endif
}
data.base = (uint8_t *)content->base + *offset;
/* Consume the batch data */
rv = uvConsumeContent(content, offset, data.len, NULL, errmsg);
if (rv != 0) {
ErrMsgTransfer(errmsg, uv->io->errmsg, "read data");
rv = RAFT_IOERR;
goto err_after_header_decode;
}
/* Check batch data integrity. */
crc1 = byteFlip32(((uint32_t *)checksums)[1]);
crc2 = byteCrc32(data.base, data.len, 0);
if (crc1 != crc2) {
tracef("batch is bad");
ErrMsgPrintf(uv->io->errmsg, "data checksum mismatch");
rv = RAFT_CORRUPT;
goto err_after_header_decode;
}
rv = uvDecodeEntriesBatch(content->base, *offset - data.len, *entries,
*n_entries, local_data_size);
if (rv != 0) {
goto err_after_header_decode;
}
*last = *offset == content->len;
return 0;
err_after_header_decode:
RaftHeapFree(*entries);
err:
*entries = NULL;
*n_entries = 0;
assert(rv != 0);
*offset = start;
return rv;
}
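/* On-disk layout of a single batch, as consumed above:
 *
 *   crc32 of the batch header   4 bytes
 *   crc32 of the batch data     4 bytes
 *   number of entries           8 bytes (first word of the batch header)
 *   rest of the batch header    uvSizeofBatchHeader(n, true) - 8 bytes
 *   batch data                  each entry's payload, padded to 8 bytes on
 *                               write, plus its local data when DQLITE_NEXT
 *                               is defined
 */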
/* Append to @entries2 all entries in @entries1. */
static int extendEntries(const struct raft_entry *entries1,
const size_t n_entries1,
struct raft_entry **entries2,
size_t *n_entries2)
{
struct raft_entry *entries; /* To re-allocate the given entries */
size_t i;
entries = raft_realloc(*entries2,
(*n_entries2 + n_entries1) * sizeof *entries);
if (entries == NULL) {
return RAFT_NOMEM;
}
for (i = 0; i < n_entries1; i++) {
entries[*n_entries2 + i] = entries1[i];
}
*entries2 = entries;
*n_entries2 += n_entries1;
return 0;
}
int uvSegmentLoadClosed(struct uv *uv,
struct uvSegmentInfo *info,
struct raft_entry *entries[],
size_t *n)
{
bool empty; /* Whether the file is empty */
uint64_t format; /* Format version */
bool last; /* Whether the last batch was reached */
struct raft_entry *tmp_entries; /* Entries in current batch */
struct raft_buffer buf; /* Segment file content */
size_t offset; /* Content read cursor */
unsigned tmp_n; /* Number of entries in current batch */
unsigned expected_n; /* Number of entries that we expect to find */
int i;
char errmsg[RAFT_ERRMSG_BUF_SIZE];
int rv;
expected_n = (unsigned)(info->end_index - info->first_index + 1);
/* If the segment is completely empty, just bail out. */
rv = UvFsFileIsEmpty(uv->dir, info->filename, &empty, errmsg);
if (rv != 0) {
tracef("stat %s: %s", info->filename, errmsg);
rv = RAFT_IOERR;
goto err;
}
if (empty) {
ErrMsgPrintf(uv->io->errmsg, "file is empty");
rv = RAFT_CORRUPT;
goto err;
}
/* Open the segment file. */
rv = uvReadSegmentFile(uv, info->filename, &buf, &format);
if (rv != 0) {
goto err;
}
if (format != UV__DISK_FORMAT) {
ErrMsgPrintf(uv->io->errmsg, "unexpected format version %ju",
format);
rv = RAFT_CORRUPT;
goto err_after_read;
}
/* Load all batches in the segment. */
*entries = NULL;
*n = 0;
last = false;
offset = sizeof format;
for (i = 1; !last; i++) {
rv = uvLoadEntriesBatch(uv, &buf, &tmp_entries, &tmp_n, &offset,
&last);
if (rv != 0) {
ErrMsgWrapf(uv->io->errmsg,
"entries batch %u starting at byte %zu", i,
offset);
/* Clean up the last allocation from extendEntries. */
goto err_after_extend_entries;
}
rv = extendEntries(tmp_entries, tmp_n, entries, n);
if (rv != 0) {
goto err_after_batch_load;
}
raft_free(tmp_entries);
}
if (*n != expected_n) {
ErrMsgPrintf(uv->io->errmsg, "found %zu entries (expected %u)",
*n, expected_n);
rv = RAFT_CORRUPT;
goto err_after_extend_entries;
}
assert(i > 1); /* At least one batch was loaded. */
assert(*n > 0); /* At least one entry was loaded. */
return 0;
err_after_batch_load:
raft_free(tmp_entries[0].batch);
raft_free(tmp_entries);
err_after_extend_entries:
if (*entries != NULL) {
RaftHeapFree(*entries);
}
err_after_read:
RaftHeapFree(buf.base);
err:
assert(rv != 0);
return rv;
}
/* Check if the content of the segment file contains all zeros from the current
* offset onward. */
static bool uvContentHasOnlyTrailingZeros(const struct raft_buffer *buf,
size_t offset)
{
size_t i;
for (i = offset; i < buf->len; i++) {
if (((char *)buf->base)[i] != 0) {
return false;
}
}
return true;
}
/* Load all entries contained in an open segment. */
static int uvSegmentLoadOpen(struct uv *uv,
struct uvSegmentInfo *info,
struct raft_entry *entries[],
size_t *n,
raft_index *next_index)
{
raft_index first_index; /* Index of first entry in segment */
bool all_zeros; /* Whether the file is zero'ed */
bool empty; /* Whether the segment file is empty */
bool remove = false; /* Whether to remove this segment */
bool last = false; /* Whether the last batch was reached */
uint64_t format; /* Format version */
size_t n_batches = 0; /* Number of loaded batches */
struct raft_entry *tmp_entries; /* Entries in current batch */
struct raft_buffer buf = {0}; /* Segment file content */
size_t offset; /* Content read cursor */
unsigned tmp_n_entries; /* Number of entries in current batch */
int i;
char errmsg[RAFT_ERRMSG_BUF_SIZE];
int rv;
first_index = *next_index;
rv = UvFsFileIsEmpty(uv->dir, info->filename, &empty, errmsg);
if (rv != 0) {
tracef("check if %s is empty: %s", info->filename, errmsg);
rv = RAFT_IOERR;
goto err;
}
if (empty) {
/* Empty segment, let's discard it. */
tracef("remove empty open segment %s", info->filename);
remove = true;
goto done;
}
rv = uvReadSegmentFile(uv, info->filename, &buf, &format);
if (rv != 0) {
goto err;
}
/* Check that the format is the expected one, or perhaps 0, indicating
* that the segment was allocated but never written. */
offset = sizeof format;
if (format != UV__DISK_FORMAT) {
if (format == 0) {
all_zeros = uvContentHasOnlyTrailingZeros(&buf, offset);
if (all_zeros) {
/* This is equivalent to the empty case, let's
* remove the segment. */
tracef("remove zeroed open segment %s",
info->filename);
remove = true;
RaftHeapFree(buf.base);
buf.base = NULL;
goto done;
}
}
ErrMsgPrintf(uv->io->errmsg, "unexpected format version %ju",
format);
rv = RAFT_CORRUPT;
goto err_after_read;
}
/* Load all batches in the segment. */
for (i = 1; !last; i++) {
rv = uvLoadEntriesBatch(uv, &buf, &tmp_entries, &tmp_n_entries,
&offset, &last);
if (rv != 0) {
/* If this isn't a decoding error, just bail out. */
if (rv != RAFT_CORRUPT) {
ErrMsgWrapf(
uv->io->errmsg,
"entries batch %u starting at byte %zu", i,
offset);
goto err_after_read;
}
/* If this is a decoding error, and not an OS error,
* check if the rest of the file is filled with zeros.
* In that case we assume that the server shutdown
* uncleanly and we just truncate this incomplete data.
*/
all_zeros = uvContentHasOnlyTrailingZeros(&buf, offset);
if (!all_zeros) {
tracef("%s has non-zero trail", info->filename);
}
tracef(
"truncate open segment %s at %zu (batch %d), since "
"it has "
"corrupted "
"entries",
info->filename, offset, i);
break;
}
rv = extendEntries(tmp_entries, tmp_n_entries, entries, n);
if (rv != 0) {
goto err_after_batch_load;
}
raft_free(tmp_entries);
n_batches++;
*next_index += tmp_n_entries;
}
if (n_batches == 0) {
RaftHeapFree(buf.base);
buf.base = NULL;
remove = true;
}
done:
/* If the segment has no valid entries in it, we remove it. Otherwise we
* rename it and keep it. */
if (remove) {
rv = UvFsRemoveFile(uv->dir, info->filename, errmsg);
if (rv != 0) {
tracef("unlink %s: %s", info->filename, errmsg);
rv = RAFT_IOERR;
goto err_after_read;
}
} else {
char filename[UV__SEGMENT_FILENAME_BUF_SIZE];
raft_index end_index = *next_index - 1;
/* At least one entry was loaded */
assert(end_index >= first_index);
int nb = snprintf(filename, sizeof(filename),
UV__CLOSED_TEMPLATE, first_index, end_index);
if ((nb < 0) || ((size_t)nb >= sizeof(filename))) {
tracef("snprintf failed: %d", nb);
rv = RAFT_IOERR;
goto err;
}
tracef("finalize %s into %s", info->filename, filename);
rv = UvFsTruncateAndRenameFile(
uv->dir, (size_t)offset, info->filename, filename, errmsg);
if (rv != 0) {
tracef("finalize %s: %s", info->filename, errmsg);
rv = RAFT_IOERR;
goto err;
}
info->is_open = false;
info->first_index = first_index;
info->end_index = end_index;
memset(info->filename, '\0', sizeof(info->filename));
_Static_assert(sizeof(info->filename) >= sizeof(filename),
"Destination buffer too small");
/* info->filename is zeroed out, info->filename is at least as
* large as filename and we checked that nb < sizeof(filename)
* -> we won't overflow and the result will be zero terminated.
*/
memcpy(info->filename, filename, (size_t)nb);
}
return 0;
err_after_batch_load:
raft_free(tmp_entries[0].batch);
raft_free(tmp_entries);
err_after_read:
if (buf.base != NULL) {
RaftHeapFree(buf.base);
}
err:
assert(rv != 0);
return rv;
}
/* Ensure that the write buffer of the given segment is large enough to hold
 * the given number of bytes. */
static int uvEnsureSegmentBufferIsLargeEnough(struct uvSegmentBuffer *b,
size_t size)
{
unsigned n = (unsigned)(size / b->block_size);
void *base;
size_t len;
if (b->arena.len >= size) {
assert(b->arena.base != NULL);
return 0;
}
if (size % b->block_size != 0) {
n++;
}
len = b->block_size * n;
base = raft_aligned_alloc(b->block_size, len);
if (base == NULL) {
return RAFT_NOMEM;
}
memset(base, 0, len);
/* If the current arena is initialized, we need to copy its content,
* since it might have data that we want to retain in the next write. */
if (b->arena.base != NULL) {
assert(b->arena.len >= b->block_size);
memcpy(base, b->arena.base, b->arena.len);
raft_aligned_free(b->block_size, b->arena.base);
}
b->arena.base = base;
b->arena.len = len;
return 0;
}
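/* Worked example: with block_size = 4096 and size = 5000, n starts out as 1,
 * the remainder check bumps it to 2, and a zeroed, block-aligned arena of
 * 8192 bytes is allocated, preserving the content of any previous arena. */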
void uvSegmentBufferInit(struct uvSegmentBuffer *b, size_t block_size)
{
b->block_size = block_size;
b->arena.base = NULL;
b->arena.len = 0;
b->n = 0;
}
void uvSegmentBufferClose(struct uvSegmentBuffer *b)
{
if (b->arena.base != NULL) {
raft_aligned_free(b->block_size, b->arena.base);
}
}
int uvSegmentBufferFormat(struct uvSegmentBuffer *b)
{
int rv;
void *cursor;
size_t n;
assert(b->n == 0);
n = sizeof(uint64_t);
rv = uvEnsureSegmentBufferIsLargeEnough(b, n);
if (rv != 0) {
return rv;
}
b->n = n;
cursor = b->arena.base;
bytePut64(&cursor, UV__DISK_FORMAT);
return 0;
}
int uvSegmentBufferAppend(struct uvSegmentBuffer *b,
const struct raft_entry entries[],
unsigned n_entries)
{
size_t size; /* Total size of the batch */
uint32_t crc1; /* Header checksum */
uint32_t crc2; /* Data checksum */
void *crc1_p; /* Pointer to header checksum slot */
void *crc2_p; /* Pointer to data checksum slot */
void *header; /* Pointer to the header section */
void *cursor;
unsigned i;
int rv;
size = sizeof(uint32_t) * 2; /* CRC checksums */
size += uvSizeofBatchHeader(n_entries, true); /* Batch header */
for (i = 0; i < n_entries; i++) { /* Entries data */
size += bytePad64(entries[i].buf.len);
#ifdef DQLITE_NEXT
size += sizeof(struct raft_entry_local_data);
#endif
}
rv = uvEnsureSegmentBufferIsLargeEnough(b, b->n + size);
if (rv != 0) {
return rv;
}
cursor = b->arena.base + b->n;
/* Placeholder of the checksums */
crc1_p = cursor;
bytePut32(&cursor, 0);
crc2_p = cursor;
bytePut32(&cursor, 0);
/* Batch header */
header = cursor;
uvEncodeBatchHeader(entries, n_entries, cursor, true /* encode local data */);
crc1 = byteCrc32(header, uvSizeofBatchHeader(n_entries, true), 0);
cursor = (uint8_t *)cursor + uvSizeofBatchHeader(n_entries, true);
/* Batch data */
crc2 = 0;
for (i = 0; i < n_entries; i++) {
const struct raft_entry *entry = &entries[i];
assert(entry->buf.len % sizeof(uint64_t) == 0);
memcpy(cursor, entry->buf.base, entry->buf.len);
crc2 = byteCrc32(cursor, entry->buf.len, crc2);
cursor = (uint8_t *)cursor + entry->buf.len;
static_assert(sizeof(entry->local_data.buf) % sizeof(uint64_t) == 0,
"bad size for entry local data");
#ifdef DQLITE_NEXT
size_t local_data_size = sizeof(entry->local_data.buf);
memcpy(cursor, entry->local_data.buf, local_data_size);
crc2 = byteCrc32(cursor, local_data_size, crc2);
cursor = (uint8_t *)cursor + local_data_size;
#endif
}
bytePut32(&crc1_p, crc1);
bytePut32(&crc2_p, crc2);
b->n += size;
return 0;
}
void uvSegmentBufferFinalize(struct uvSegmentBuffer *b, uv_buf_t *out)
{
unsigned n_blocks;
unsigned tail;
n_blocks = (unsigned)(b->n / b->block_size);
if (b->n % b->block_size != 0) {
n_blocks++;
}
/* Set the remainder of the last block to 0 */
tail = (unsigned)(b->n % b->block_size);
if (tail != 0) {
memset(b->arena.base + b->n, 0, b->block_size - tail);
}
out->base = b->arena.base;
out->len = n_blocks * b->block_size;
}
void uvSegmentBufferReset(struct uvSegmentBuffer *b, unsigned retain)
{
assert(b->n > 0);
assert(b->arena.base != NULL);
if (retain == 0) {
b->n = 0;
memset(b->arena.base, 0, b->block_size);
return;
}
memcpy(b->arena.base, b->arena.base + retain * b->block_size,
b->block_size);
b->n = b->n % b->block_size;
}
/* When a corrupted segment is detected, the segment is renamed.
* Upon a restart, raft will not detect the segment anymore and will try
* to start without it. */
#define CORRUPT_FILE_FMT "corrupt-%" PRId64 "-%s"
static void uvMoveCorruptSegment(struct uv *uv, struct uvSegmentInfo *info)
{
char errmsg[RAFT_ERRMSG_BUF_SIZE] = {0};
char new_filename[UV__FILENAME_LEN + 1] = {0};
size_t sz = sizeof(new_filename);
int rv;
struct timespec ts = {0};
/* Ignore errors */
clock_gettime(CLOCK_REALTIME, &ts);
int64_t ns = ts.tv_sec * 1000000000 + ts.tv_nsec;
rv = snprintf(new_filename, sz, CORRUPT_FILE_FMT, ns, info->filename);
if (rv < 0 || rv >= (int)sz) {
tracef("snprintf %d", rv);
return;
}
rv = UvFsRenameFile(uv->dir, info->filename, new_filename, errmsg);
if (rv != 0) {
tracef("%s", errmsg);
return;
}
}
/*
* On startup, raft will try to recover when a corrupt segment is detected.
*
* When a corrupt open segment is encountered, it, and all subsequent open
* segments, are renamed. Not renaming newer, possibly non-corrupt, open
* segments could lead to loading inconsistent data.
*
* When a corrupt closed segment is encountered, it will be renamed when
* it is the last closed segment, in that case all open-segments are renamed
* too.
*/
static void uvRecoverFromCorruptSegment(struct uv *uv,
size_t i_corrupt,
struct uvSegmentInfo *infos,
size_t n_infos)
{
struct uvSegmentInfo *info = &infos[i_corrupt];
if (info->is_open) {
for (size_t i = i_corrupt; i < n_infos; ++i) {
info = &infos[i];
uvMoveCorruptSegment(uv, info);
}
} else {
size_t i_next = i_corrupt + 1;
/* last segment or last closed segment. */
if (i_next == n_infos || infos[i_next].is_open) {
for (size_t i = i_corrupt; i < n_infos; ++i) {
info = &infos[i];
uvMoveCorruptSegment(uv, info);
}
}
}
}
int uvSegmentLoadAll(struct uv *uv,
const raft_index start_index,
struct uvSegmentInfo *infos,
size_t n_infos,
struct raft_entry **entries,
size_t *n_entries)
{
raft_index next_index; /* Next entry to load from disk */
struct raft_entry *tmp_entries; /* Entries in current segment */
size_t tmp_n; /* Number of entries in current segment */
size_t i;
int rv;
assert(start_index >= 1);
assert(n_infos > 0);
*entries = NULL;
*n_entries = 0;
next_index = start_index;
for (i = 0; i < n_infos; i++) {
struct uvSegmentInfo *info = &infos[i];
tracef("load segment %s", info->filename);
if (info->is_open) {
rv = uvSegmentLoadOpen(uv, info, entries, n_entries,
&next_index);
ErrMsgWrapf(uv->io->errmsg, "load open segment %s",
info->filename);
if (rv != 0) {
if (rv == RAFT_CORRUPT && uv->auto_recovery) {
uvRecoverFromCorruptSegment(
uv, i, infos, n_infos);
}
goto err;
}
} else {
assert(info->first_index >= start_index);
assert(info->first_index <= info->end_index);
/* Check that the start index encoded in the name of the
* segment matches what we expect and there are no gaps
* in the sequence. */
if (info->first_index != next_index) {
ErrMsgPrintf(uv->io->errmsg,
"unexpected closed segment %s: "
"first index should "
"have been %llu",
info->filename, next_index);
rv = RAFT_CORRUPT;
goto err;
}
rv =
uvSegmentLoadClosed(uv, info, &tmp_entries, &tmp_n);
if (rv != 0) {
ErrMsgWrapf(uv->io->errmsg,
"load closed segment %s",
info->filename);
if (rv == RAFT_CORRUPT && uv->auto_recovery) {
uvRecoverFromCorruptSegment(
uv, i, infos, n_infos);
}
goto err;
}
assert(tmp_n > 0);
rv = extendEntries(tmp_entries, tmp_n, entries,
n_entries);
if (rv != 0) {
/* TODO: release memory of entries in
* tmp_entries */
goto err;
}
raft_free(tmp_entries);
next_index += tmp_n;
}
}
return 0;
err:
assert(rv != 0);
/* Free any batch that we might have allocated and the entries array as
* well. */
if (*entries != NULL) {
void *batch = NULL;
for (i = 0; i < *n_entries; i++) {
struct raft_entry *entry = &(*entries)[i];
if (entry->batch != batch) {
batch = entry->batch;
raft_free(batch);
}
}
raft_free(*entries);
*entries = NULL;
*n_entries = 0;
}
return rv;
}
/* Write a closed segment */
static int uvWriteClosedSegment(struct uv *uv,
raft_index first_index,
raft_index last_index,
const struct raft_buffer *conf)
{
char filename[UV__FILENAME_LEN];
struct uvSegmentBuffer buf = {0};
struct raft_buffer data;
struct raft_entry entry = {0};
size_t cap;
char errmsg[RAFT_ERRMSG_BUF_SIZE];
int rv;
assert(first_index <= last_index);
/* Render the path */
sprintf(filename, UV__CLOSED_TEMPLATE, first_index, last_index);
/* Make sure that the given encoded configuration fits in the first
* block */
cap = uv->block_size -
(sizeof(uint64_t) /* Format version */ +
sizeof(uint64_t) /* Checksums */ + uvSizeofBatchHeader(1, true /* include local bufs */));
if (conf->len > cap) {
return RAFT_TOOBIG;
}
uvSegmentBufferInit(&buf, uv->block_size);
rv = uvSegmentBufferFormat(&buf);
if (rv != 0) {
return rv;
}
entry.term = 1;
entry.type = RAFT_CHANGE;
entry.buf = *conf;
rv = uvSegmentBufferAppend(&buf, &entry, 1);
if (rv != 0) {
uvSegmentBufferClose(&buf);
return rv;
}
data.base = buf.arena.base;
data.len = buf.n;
rv = UvFsMakeFile(uv->dir, filename, &data, 1, errmsg);
uvSegmentBufferClose(&buf);
if (rv != 0) {
tracef("write segment %s: %s", filename, errmsg);
return RAFT_IOERR;
}
return 0;
}
int uvSegmentCreateFirstClosed(struct uv *uv,
const struct raft_configuration *configuration)
{
return uvSegmentCreateClosedWithConfiguration(uv, 1, configuration);
}
int uvSegmentCreateClosedWithConfiguration(
struct uv *uv,
raft_index index,
const struct raft_configuration *configuration)
{
struct raft_buffer buf;
char filename[UV__FILENAME_LEN];
int rv;
/* Render the path */
sprintf(filename, UV__CLOSED_TEMPLATE, index, index);
/* Encode the given configuration. */
rv = configurationEncode(configuration, &buf);
if (rv != 0) {
goto err;
}
/* Write the file */
rv = uvWriteClosedSegment(uv, index, index, &buf);
if (rv != 0) {
goto err_after_configuration_encode;
}
raft_free(buf.base);
rv = UvFsSyncDir(uv->dir, uv->io->errmsg);
if (rv != 0) {
return RAFT_IOERR;
}
return 0;
err_after_configuration_encode:
raft_free(buf.base);
err:
assert(rv != 0);
return rv;
}
int uvSegmentTruncate(struct uv *uv,
struct uvSegmentInfo *segment,
raft_index index)
{
char filename[UV__FILENAME_LEN];
struct raft_entry *entries;
struct uvSegmentBuffer buf;
struct raft_buffer data;
size_t n;
unsigned m;
char errmsg[RAFT_ERRMSG_BUF_SIZE];
int rv;
assert(!segment->is_open);
tracef("truncate %llu-%llu at %llu", segment->first_index,
segment->end_index, index);
rv = uvSegmentLoadClosed(uv, segment, &entries, &n);
if (rv != 0) {
ErrMsgWrapf(uv->io->errmsg, "load closed segment %s",
segment->filename);
goto out;
}
/* Discard all entries after the truncate index (included) */
assert(index - segment->first_index < n);
m = (unsigned)(index - segment->first_index);
uvSegmentBufferInit(&buf, uv->block_size);
rv = uvSegmentBufferFormat(&buf);
if (rv != 0) {
goto out_after_buffer_init;
}
rv = uvSegmentBufferAppend(&buf, entries, m);
if (rv != 0) {
goto out_after_buffer_init;
}
/* Render the path.
*
* TODO: we should use a temporary file name so in case of crash we
* don't consider this segment as corrupted.
*/
sprintf(filename, UV__CLOSED_TEMPLATE, segment->first_index, index - 1);
data.base = buf.arena.base;
data.len = buf.n;
rv = UvFsMakeFile(uv->dir, filename, &data, 1, errmsg);
if (rv != 0) {
tracef("write %s: %s", filename, errmsg);
rv = RAFT_IOERR;
goto out_after_buffer_init;
}
out_after_buffer_init:
uvSegmentBufferClose(&buf);
entryBatchesDestroy(entries, n);
out:
return rv;
}
#undef tracef
dqlite-1.16.7/src/raft/uv_send.c 0000664 0000000 0000000 00000032370 14652527134 0016440 0 ustar 00root root 0000000 0000000 #include <string.h>
#include "../raft.h"
#include "assert.h"
#include "heap.h"
#include "uv.h"
#include "uv_encoding.h"
/* The happy path for a raft_io_send request is:
*
* - Get the uvClient object whose address matches the one of target server.
* - Encode the message and write it using the uvClient's TCP handle.
* - Once the write completes, fire the send request callback.
*
* Possible failure modes are:
*
* - The uv->clients queue has no client object with a matching address. In this
* case add a new client object to the array, add the send request to the
* queue of pending requests and submit a connection request. Once the
* connection request succeeds, try to write the encoded request to the
* connected stream handle. If the connection request fails, schedule another
* attempt.
*
* - The uv->clients queue has a client object which is not connected. Add the
* send request to the pending queue, and, if there's no connection attempt
* already in progress, start a new one.
*
* - The write request fails (either synchronously or asynchronously). In this
* case we fire the request callback with an error, close the connection
* stream, and start a re-connection attempt.
*/
/* Maximum number of requests that can be buffered. */
#define UV__CLIENT_MAX_PENDING 3
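/* For example, if a connection attempt fails while 5 send requests are
 * parked, the 2 oldest ones are failed with RAFT_NOCONNECTION and only the
 * 3 most recent ones stay queued for the next attempt (see
 * uvClientConnectCb). */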
struct uvClient
{
struct uv *uv; /* libuv I/O implementation object */
struct uv_timer_s timer; /* Schedule connection attempts */
struct raft_uv_connect connect; /* Connection request */
struct uv_stream_s *stream; /* Current connection handle */
struct uv_stream_s *old_stream; /* Connection handle being closed */
unsigned n_connect_attempt; /* Consecutive connection attempts */
raft_id id; /* ID of the other server */
char *address; /* Address of the other server */
queue pending; /* Pending send message requests */
queue queue; /* Clients queue */
bool closing; /* True after calling uvClientAbort */
};
/* Hold state for a single send RPC message request. */
struct uvSend
{
struct uvClient *client; /* Client connected to the target server */
struct raft_io_send *req; /* User request */
uv_buf_t *bufs; /* Encoded raft RPC message to send */
unsigned n_bufs; /* Number of buffers */
uv_write_t write; /* Stream write request */
queue queue; /* Pending send requests queue */
};
/* Free all memory used by the given send request object, including the object
* itself. */
static void uvSendDestroy(struct uvSend *s)
{
if (s->bufs != NULL) {
/* Just release the first buffer. Further buffers are entry or
* snapshot payloads, which we were passed but we don't own. */
RaftHeapFree(s->bufs[0].base);
/* Release the buffers array. */
RaftHeapFree(s->bufs);
}
RaftHeapFree(s);
}
/* Initialize a new client associated with the given server. */
static int uvClientInit(struct uvClient *c,
struct uv *uv,
raft_id id,
const char *address)
{
int rv;
c->uv = uv;
c->timer.data = c;
c->connect.data = NULL; /* Set upon starting a connect request */
c->stream = NULL; /* Set upon successful connection */
c->old_stream = NULL; /* Set after closing the current connection */
c->n_connect_attempt = 0;
c->id = id;
c->address = RaftHeapMalloc(strlen(address) + 1);
if (c->address == NULL) {
return RAFT_NOMEM;
}
rv = uv_timer_init(c->uv->loop, &c->timer);
assert(rv == 0);
strcpy(c->address, address);
queue_init(&c->pending);
c->closing = false;
queue_insert_tail(&uv->clients, &c->queue);
return 0;
}
/* If there's no more pending cleanup, remove the client from the abort queue
* and destroy it. */
static void uvClientMaybeDestroy(struct uvClient *c)
{
struct uv *uv = c->uv;
assert(c->stream == NULL);
if (c->connect.data != NULL) {
return;
}
if (c->timer.data != NULL) {
return;
}
if (c->old_stream != NULL) {
return;
}
while (!queue_empty(&c->pending)) {
queue *head;
struct uvSend *send;
struct raft_io_send *req;
head = queue_head(&c->pending);
send = QUEUE_DATA(head, struct uvSend, queue);
queue_remove(head);
req = send->req;
uvSendDestroy(send);
if (req->cb != NULL) {
req->cb(req, RAFT_CANCELED);
}
}
queue_remove(&c->queue);
assert(c->address != NULL);
RaftHeapFree(c->address);
RaftHeapFree(c);
uvMaybeFireCloseCb(uv);
}
/* Forward declaration. */
static void uvClientConnect(struct uvClient *c);
static void uvClientDisconnectCloseCb(struct uv_handle_s *handle)
{
struct uvClient *c = handle->data;
assert(c->old_stream != NULL);
assert(c->stream == NULL);
assert(handle == (struct uv_handle_s *)c->old_stream);
RaftHeapFree(c->old_stream);
c->old_stream = NULL;
if (c->closing) {
uvClientMaybeDestroy(c);
} else {
uvClientConnect(c); /* Trigger a new connection attempt. */
}
}
/* Close the current connection. */
static void uvClientDisconnect(struct uvClient *c)
{
assert(c->stream != NULL);
assert(c->old_stream == NULL);
c->old_stream = c->stream;
c->stream = NULL;
uv_close((struct uv_handle_s *)c->old_stream,
uvClientDisconnectCloseCb);
}
/* Invoked once an encoded RPC message has been written out. */
static void uvSendWriteCb(struct uv_write_s *write, const int status)
{
struct uvSend *send = write->data;
struct uvClient *c = send->client;
struct raft_io_send *req = send->req;
int cb_status = 0;
/* If the write failed and we're not currently closing, let's consider
* the current stream handle as busted and start disconnecting (unless
* we're already doing so). We'll trigger a new connection attempt once
* the handle is closed. */
if (status != 0) {
cb_status = RAFT_IOERR;
if (!c->closing) {
if (c->stream != NULL) {
uvClientDisconnect(c);
}
} else if (status == UV_ECANCELED) {
cb_status = RAFT_CANCELED;
}
}
uvSendDestroy(send);
if (req->cb != NULL) {
req->cb(req, cb_status);
}
}
static int uvClientSend(struct uvClient *c, struct uvSend *send)
{
int rv;
assert(!c->closing);
send->client = c;
/* If there's no connection available, let's queue the request. */
if (c->stream == NULL) {
tracef("no connection available -> enqueue message");
queue_insert_tail(&c->pending, &send->queue);
return 0;
}
tracef("connection available -> write message");
send->write.data = send;
rv = uv_write(&send->write, c->stream, send->bufs, send->n_bufs,
uvSendWriteCb);
if (rv != 0) {
tracef("write message failed -> rv %d", rv);
/* UNTESTED: what are the error conditions? perhaps ENOMEM */
return RAFT_IOERR;
}
return 0;
}
/* Try to execute all send requests that were blocked in the queue waiting for a
* connection. */
static void uvClientSendPending(struct uvClient *c)
{
int rv;
assert(c->stream != NULL);
tracef("send pending messages");
while (!queue_empty(&c->pending)) {
queue *head;
struct uvSend *send;
head = queue_head(&c->pending);
send = QUEUE_DATA(head, struct uvSend, queue);
queue_remove(head);
rv = uvClientSend(c, send);
if (rv != 0) {
if (send->req->cb != NULL) {
send->req->cb(send->req, rv);
}
uvSendDestroy(send);
}
}
}
static void uvClientTimerCb(uv_timer_t *timer)
{
struct uvClient *c = timer->data;
tracef("timer expired -> attempt to reconnect");
uvClientConnect(c); /* Retry to connect. */
}
/* Return the number of send requests that have been parked in the send queue
* because no connection is available yet. */
static unsigned uvClientPendingCount(struct uvClient *c)
{
queue *head;
unsigned n = 0;
QUEUE_FOREACH(head, &c->pending)
{
n++;
}
return n;
}
static void uvClientConnectCb(struct raft_uv_connect *req,
struct uv_stream_s *stream,
int status)
{
struct uvClient *c = req->data;
unsigned n_pending;
int rv;
tracef("connect attempt completed -> status %s",
errCodeToString(status));
assert(c->connect.data != NULL);
assert(c->stream == NULL);
assert(c->old_stream == NULL);
assert(!uv_is_active((struct uv_handle_s *)&c->timer));
c->connect.data = NULL;
/* If we are closing, bail out, possibly discarding the new connection.
*/
if (c->closing) {
if (status == 0) {
assert(stream != NULL);
c->stream = stream;
c->stream->data = c;
uvClientDisconnect(c);
} else {
uvClientMaybeDestroy(c);
}
return;
}
/* If the connection attempt was successful, we're good. If we have
* pending requests, let's try to execute them. */
if (status == 0) {
assert(stream != NULL);
c->stream = stream;
c->n_connect_attempt = 0;
c->stream->data = c;
uvClientSendPending(c);
return;
}
/* Shrink the queue of pending requests by failing the oldest ones. */
n_pending = uvClientPendingCount(c);
if (n_pending > UV__CLIENT_MAX_PENDING) {
unsigned i;
for (i = 0; i < n_pending - UV__CLIENT_MAX_PENDING; i++) {
tracef("queue full -> evict oldest message");
queue *head;
struct uvSend *old_send;
struct raft_io_send *old_req;
head = queue_head(&c->pending);
old_send = QUEUE_DATA(head, struct uvSend, queue);
queue_remove(head);
old_req = old_send->req;
uvSendDestroy(old_send);
if (old_req->cb != NULL) {
old_req->cb(old_req, RAFT_NOCONNECTION);
}
}
}
/* Let's schedule another attempt. */
rv = uv_timer_start(&c->timer, uvClientTimerCb,
c->uv->connect_retry_delay, 0);
assert(rv == 0);
}
/* Perform a single connection attempt, scheduling a retry if it fails. */
static void uvClientConnect(struct uvClient *c)
{
int rv;
assert(!c->closing);
assert(c->stream == NULL);
assert(c->old_stream == NULL);
assert(!uv_is_active((struct uv_handle_s *)&c->timer));
assert(c->connect.data == NULL);
c->n_connect_attempt++;
c->connect.data = c;
rv = c->uv->transport->connect(c->uv->transport, &c->connect, c->id,
c->address, uvClientConnectCb);
if (rv != 0) {
/* Restart the timer, so we can retry. */
c->connect.data = NULL;
rv = uv_timer_start(&c->timer, uvClientTimerCb,
c->uv->connect_retry_delay, 0);
assert(rv == 0);
}
}
/* Final callback in the close chain of a uvClient object. */
static void uvClientTimerCloseCb(struct uv_handle_s *handle)
{
struct uvClient *c = handle->data;
assert(handle == (struct uv_handle_s *)&c->timer);
c->timer.data = NULL;
uvClientMaybeDestroy(c);
}
/* Start shutting down a client. This happens when the `raft_io` instance
* has been closed or when the address of the client has changed. */
static void uvClientAbort(struct uvClient *c)
{
struct uv *uv = c->uv;
int rv;
assert(c->stream != NULL || c->old_stream != NULL ||
uv_is_active((struct uv_handle_s *)&c->timer) ||
c->connect.data != NULL);
queue_remove(&c->queue);
queue_insert_tail(&uv->aborting, &c->queue);
rv = uv_timer_stop(&c->timer);
assert(rv == 0);
/* If we are connected, let's close the outbound stream handle. This
* will eventually complete all inflight write requests, possibly
* failing them with UV_ECANCELED. */
if (c->stream != NULL) {
uvClientDisconnect(c);
}
/* Closing the timer implicitly stops it, so the timeout callback won't
* be fired. */
uv_close((struct uv_handle_s *)&c->timer, uvClientTimerCloseCb);
c->closing = true;
}
/* Find the client object associated with the given server, or create one if
* there's none yet. */
static int uvGetClient(struct uv *uv,
const raft_id id,
const char *address,
struct uvClient **client)
{
queue *head;
int rv;
/* Check if we already have a client object for this peer server. */
QUEUE_FOREACH(head, &uv->clients)
{
*client = QUEUE_DATA(head, struct uvClient, queue);
if ((*client)->id != id) {
continue;
}
/* Client address has changed, abort connection and create a new
* one. */
if (strcmp((*client)->address, address) != 0) {
uvClientAbort(*client);
break;
}
return 0;
}
/* Allocate and initialize a new client object. */
*client = RaftHeapMalloc(sizeof **client);
if (*client == NULL) {
rv = RAFT_NOMEM;
goto err;
}
rv = uvClientInit(*client, uv, id, address);
if (rv != 0) {
goto err_after_client_alloc;
}
/* Make a first connection attempt right away. */
uvClientConnect(*client);
return 0;
err_after_client_alloc:
RaftHeapFree(*client);
err:
assert(rv != 0);
return rv;
}
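/* Implementation of raft_io->send: encode the given message and pass it to
* the client object associated with the target server. */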
int UvSend(struct raft_io *io,
struct raft_io_send *req,
const struct raft_message *message,
raft_io_send_cb cb)
{
struct uv *uv = io->impl;
struct uvSend *send;
struct uvClient *client;
int rv;
assert(!uv->closing);
/* Allocate a new request object. */
send = RaftHeapMalloc(sizeof *send);
if (send == NULL) {
rv = RAFT_NOMEM;
goto err;
}
send->req = req;
req->cb = cb;
rv = uvEncodeMessage(message, &send->bufs, &send->n_bufs);
if (rv != 0) {
send->bufs = NULL;
goto err_after_send_alloc;
}
/* Get a client object connected to the target server, creating it if it
* doesn't exist yet. */
rv = uvGetClient(uv, message->server_id, message->server_address,
&client);
if (rv != 0) {
goto err_after_send_alloc;
}
rv = uvClientSend(client, send);
if (rv != 0) {
goto err_after_send_alloc;
}
return 0;
err_after_send_alloc:
uvSendDestroy(send);
err:
assert(rv != 0);
return rv;
}
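/* Abort all client objects, queued or connected, when the raft_io instance
* is being closed. */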
void UvSendClose(struct uv *uv)
{
assert(uv->closing);
while (!queue_empty(&uv->clients)) {
queue *head;
struct uvClient *client;
head = queue_head(&uv->clients);
client = QUEUE_DATA(head, struct uvClient, queue);
uvClientAbort(client);
}
}
#undef tracef
dqlite-1.16.7/src/raft/uv_snapshot.c 0000664 0000000 0000000 00000046500 14652527134 0017346 0 ustar 00root root 0000000 0000000 #include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "array.h"
#include "assert.h"
#include "byte.h"
#include "compress.h"
#include "configuration.h"
#include "heap.h"
#include "uv.h"
#include "uv_encoding.h"
#include "uv_os.h"
/* Arbitrary maximum configuration size. Should practically be enough. */
#define UV__META_MAX_CONFIGURATION_SIZE 1024 * 1024
/* Returns true if the filename is a valid snapshot file or snapshot meta
* filename depending on the `meta` switch. If the parse is successful, the
* arguments will contain the parsed values. */
static bool uvSnapshotParseFilename(const char *filename,
bool meta,
raft_term *term,
raft_index *index,
raft_time *timestamp)
{
/* Check if it's a well-formed snapshot filename */
int consumed = 0;
int matched;
size_t filename_len = strlen(filename);
assert(filename_len < UV__FILENAME_LEN);
if (meta) {
matched = sscanf(filename, UV__SNAPSHOT_META_TEMPLATE "%n",
term, index, timestamp, &consumed);
} else {
matched = sscanf(filename, UV__SNAPSHOT_TEMPLATE "%n", term,
index, timestamp, &consumed);
}
if (matched != 3 || consumed != (int)filename_len) {
return false;
}
return true;
}
/* Check if the given filename matches the pattern of a snapshot metadata
* filename (snapshot-xxx-yyy-zzz.meta), and fill the given info structure if
* so.
*
* Return true if the filename matched, false otherwise. */
static bool uvSnapshotInfoMatch(const char *filename,
struct uvSnapshotInfo *info)
{
if (!uvSnapshotParseFilename(filename, true, &info->term, &info->index,
&info->timestamp)) {
return false;
}
/* Allow room for '\0' terminator */
size_t n = sizeof(info->filename) - 1;
strncpy(info->filename, filename, n);
info->filename[n] = '\0';
return true;
}
void uvSnapshotFilenameOf(struct uvSnapshotInfo *info, char *filename)
{
size_t len = strlen(info->filename) - strlen(".meta");
assert(len < UV__FILENAME_LEN);
strcpy(filename, info->filename);
filename[len] = 0;
}
int UvSnapshotInfoAppendIfMatch(struct uv *uv,
const char *filename,
struct uvSnapshotInfo *infos[],
size_t *n_infos,
bool *appended)
{
struct uvSnapshotInfo info;
bool matched;
char snapshot_filename[UV__FILENAME_LEN];
bool exists;
bool is_empty;
char errmsg[RAFT_ERRMSG_BUF_SIZE];
int rv;
/* Check if it's a snapshot metadata filename */
matched = uvSnapshotInfoMatch(filename, &info);
if (!matched) {
*appended = false;
return 0;
}
/* Check if there's actually a valid snapshot file for this snapshot
* metadata. If there's none or it's empty, it means that we aborted
* before finishing the snapshot, or that another thread is still busy
* writing the snapshot. */
uvSnapshotFilenameOf(&info, snapshot_filename);
rv = UvFsFileExists(uv->dir, snapshot_filename, &exists, errmsg);
if (rv != 0) {
tracef("stat %s: %s", snapshot_filename, errmsg);
rv = RAFT_IOERR;
return rv;
}
if (!exists) {
*appended = false;
return 0;
}
/* TODO This check is not strictly needed: snapshot files are created by
* renaming fully written and synced tmp-files. Leaving it here, just to
* be extra-safe. Can probably be removed once more data integrity
* checks are performed at startup. */
rv = UvFsFileIsEmpty(uv->dir, snapshot_filename, &is_empty, errmsg);
if (rv != 0) {
tracef("is_empty %s: %s", snapshot_filename, errmsg);
rv = RAFT_IOERR;
return rv;
}
if (is_empty) {
*appended = false;
return 0;
}
ARRAY__APPEND(struct uvSnapshotInfo, info, infos, n_infos, rv);
if (rv == -1) {
return RAFT_NOMEM;
}
*appended = true;
return 0;
}
static int uvSnapshotIsOrphanInternal(const char *dir,
const char *filename,
bool meta,
bool *orphan)
{
int rv;
*orphan = false;
raft_term term;
raft_index index;
raft_time timestamp;
if (!uvSnapshotParseFilename(filename, meta, &term, &index,
×tamp)) {
return 0;
}
/* filename is a well-formed snapshot filename, check if the sibling
* file exists. */
char sibling_filename[UV__FILENAME_LEN];
if (meta) {
rv = snprintf(sibling_filename, UV__FILENAME_LEN,
UV__SNAPSHOT_TEMPLATE, term, index, timestamp);
} else {
rv = snprintf(sibling_filename, UV__FILENAME_LEN,
UV__SNAPSHOT_META_TEMPLATE, term, index,
timestamp);
}
if (rv >= UV__FILENAME_LEN) {
/* Output truncated */
return -1;
}
bool sibling_exists = false;
char ignored[RAFT_ERRMSG_BUF_SIZE];
rv = UvFsFileExists(dir, sibling_filename, &sibling_exists, ignored);
if (rv != 0) {
return rv;
}
*orphan = !sibling_exists;
return 0;
}
int UvSnapshotIsOrphan(const char *dir, const char *filename, bool *orphan)
{
return uvSnapshotIsOrphanInternal(dir, filename, false, orphan);
}
int UvSnapshotMetaIsOrphan(const char *dir, const char *filename, bool *orphan)
{
return uvSnapshotIsOrphanInternal(dir, filename, true, orphan);
}
/* Compare two snapshots to decide which one is more recent. */
static int uvSnapshotCompare(const void *p1, const void *p2)
{
struct uvSnapshotInfo *s1 = (struct uvSnapshotInfo *)p1;
struct uvSnapshotInfo *s2 = (struct uvSnapshotInfo *)p2;
/* If terms are different, the snapshot with the highest term is the
* most recent. */
if (s1->term != s2->term) {
return s1->term < s2->term ? -1 : 1;
}
/* If the terms are identical and the indexes differ, the snapshot with
* the highest index is the most recent. */
if (s1->index != s2->index) {
return s1->index < s2->index ? -1 : 1;
}
/* If term and index are identical, compare the timestamp. */
return s1->timestamp < s2->timestamp ? -1 : 1;
}
/* Sort the given snapshots. */
void UvSnapshotSort(struct uvSnapshotInfo *infos, size_t n_infos)
{
qsort(infos, n_infos, sizeof *infos, uvSnapshotCompare);
}
/* Parse the metadata file of a snapshot and populate the metadata portion of
* the given snapshot object accordingly. */
static int uvSnapshotLoadMeta(struct uv *uv,
struct uvSnapshotInfo *info,
struct raft_snapshot *snapshot,
char *errmsg)
{
uint64_t header[1 + /* Format version */
1 + /* CRC checksum */
1 + /* Configuration index */
1 /* Configuration length */];
struct raft_buffer buf;
uint64_t format;
uint32_t crc1;
uint32_t crc2;
uv_file fd;
int rv;
snapshot->term = info->term;
snapshot->index = info->index;
rv = UvFsOpenFileForReading(uv->dir, info->filename, &fd, errmsg);
if (rv != 0) {
tracef("open %s: %s", info->filename, errmsg);
rv = RAFT_IOERR;
goto err;
}
buf.base = header;
buf.len = sizeof header;
rv = UvFsReadInto(fd, &buf, errmsg);
if (rv != 0) {
tracef("read %s: %s", info->filename, errmsg);
rv = RAFT_IOERR;
goto err_after_open;
}
format = byteFlip64(header[0]);
if (format != UV__DISK_FORMAT) {
tracef("load %s: unsupported format %ju", info->filename,
format);
rv = RAFT_MALFORMED;
goto err_after_open;
}
crc1 = (uint32_t)byteFlip64(header[1]);
snapshot->configuration_index = byteFlip64(header[2]);
buf.len = (size_t)byteFlip64(header[3]);
if (buf.len > UV__META_MAX_CONFIGURATION_SIZE) {
tracef("load %s: configuration data too big (%zd)",
info->filename, buf.len);
rv = RAFT_CORRUPT;
goto err_after_open;
}
if (buf.len == 0) {
tracef("load %s: no configuration data", info->filename);
rv = RAFT_CORRUPT;
goto err_after_open;
}
buf.base = RaftHeapMalloc(buf.len);
if (buf.base == NULL) {
rv = RAFT_NOMEM;
goto err_after_open;
}
rv = UvFsReadInto(fd, &buf, errmsg);
if (rv != 0) {
tracef("read %s: %s", info->filename, errmsg);
rv = RAFT_IOERR;
goto err_after_buf_malloc;
}
crc2 = byteCrc32(header + 2, sizeof header - sizeof(uint64_t) * 2, 0);
crc2 = byteCrc32(buf.base, buf.len, crc2);
if (crc1 != crc2) {
ErrMsgPrintf(errmsg, "read %s: checksum mismatch",
info->filename);
rv = RAFT_CORRUPT;
goto err_after_buf_malloc;
}
rv = configurationDecode(&buf, &snapshot->configuration);
if (rv != 0) {
goto err_after_buf_malloc;
}
RaftHeapFree(buf.base);
UvOsClose(fd);
return 0;
err_after_buf_malloc:
RaftHeapFree(buf.base);
err_after_open:
close(fd);
err:
assert(rv != 0);
return rv;
}
/* Load the snapshot data file and populate the data portion of the given
* snapshot object accordingly. */
static int uvSnapshotLoadData(struct uv *uv,
struct uvSnapshotInfo *info,
struct raft_snapshot *snapshot,
char *errmsg)
{
char filename[UV__FILENAME_LEN];
struct raft_buffer buf;
int rv;
uvSnapshotFilenameOf(info, filename);
rv = UvFsReadFile(uv->dir, filename, &buf, errmsg);
if (rv != 0) {
tracef("stat %s: %s", filename, errmsg);
goto err;
}
if (IsCompressed(buf.base, buf.len)) {
struct raft_buffer decompressed = {0};
tracef("snapshot decompress start");
rv = Decompress(buf, &decompressed, errmsg);
tracef("snapshot decompress end %d", rv);
if (rv != 0) {
tracef("decompress failed rv:%d", rv);
goto err_after_read_file;
}
RaftHeapFree(buf.base);
buf = decompressed;
}
snapshot->bufs = RaftHeapMalloc(sizeof *snapshot->bufs);
snapshot->n_bufs = 1;
if (snapshot->bufs == NULL) {
rv = RAFT_NOMEM;
goto err_after_read_file;
}
snapshot->bufs[0] = buf;
return 0;
err_after_read_file:
RaftHeapFree(buf.base);
err:
assert(rv != 0);
return rv;
}
int UvSnapshotLoad(struct uv *uv,
struct uvSnapshotInfo *meta,
struct raft_snapshot *snapshot,
char *errmsg)
{
int rv;
rv = uvSnapshotLoadMeta(uv, meta, snapshot, errmsg);
if (rv != 0) {
return rv;
}
rv = uvSnapshotLoadData(uv, meta, snapshot, errmsg);
if (rv != 0) {
return rv;
}
return 0;
}
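/* Hold state for a single snapshot put request. */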
struct uvSnapshotPut
{
struct uv *uv;
size_t trailing;
struct raft_io_snapshot_put *req;
const struct raft_snapshot *snapshot;
struct
{
unsigned long long timestamp;
uint64_t header[4]; /* Format, CRC, configuration index/len */
struct raft_buffer bufs[2]; /* Preamble and configuration */
} meta;
char errmsg[RAFT_ERRMSG_BUF_SIZE];
int status;
struct UvBarrierReq barrier;
};
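/* Hold state for a single snapshot get request. */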
struct uvSnapshotGet
{
struct uv *uv;
struct raft_io_snapshot_get *req;
struct raft_snapshot *snapshot;
struct uv_work_s work;
char errmsg[RAFT_ERRMSG_BUF_SIZE];
int status;
queue queue;
};
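/* Remove all snapshots on disk except the two most recent ones. */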
static int uvSnapshotKeepLastTwo(struct uv *uv,
struct uvSnapshotInfo *snapshots,
size_t n)
{
size_t i;
char errmsg[RAFT_ERRMSG_BUF_SIZE];
int rv;
/* Leave at least two snapshots, for safety. */
if (n <= 2) {
return 0;
}
for (i = 0; i < n - 2; i++) {
struct uvSnapshotInfo *snapshot = &snapshots[i];
char filename[UV__FILENAME_LEN];
rv = UvFsRemoveFile(uv->dir, snapshot->filename, errmsg);
if (rv != 0) {
tracef("unlink %s: %s", snapshot->filename, errmsg);
return RAFT_IOERR;
}
uvSnapshotFilenameOf(snapshot, filename);
rv = UvFsRemoveFile(uv->dir, filename, errmsg);
if (rv != 0) {
tracef("unlink %s: %s", filename, errmsg);
return RAFT_IOERR;
}
}
return 0;
}
/* Remove all segments and snapshots that are not needed anymore, because
* they're past the trailing amount. */
static int uvRemoveOldSegmentsAndSnapshots(struct uv *uv,
raft_index last_index,
size_t trailing,
char *errmsg)
{
struct uvSnapshotInfo *snapshots;
struct uvSegmentInfo *segments;
size_t n_snapshots;
size_t n_segments;
int rv = 0;
rv = UvList(uv, &snapshots, &n_snapshots, &segments, &n_segments,
errmsg);
if (rv != 0) {
goto out;
}
rv = uvSnapshotKeepLastTwo(uv, snapshots, n_snapshots);
if (rv != 0) {
goto out;
}
if (segments != NULL) {
rv = uvSegmentKeepTrailing(uv, segments, n_segments, last_index,
trailing, errmsg);
if (rv != 0) {
goto out;
}
}
rv = UvFsSyncDir(uv->dir, errmsg);
out:
if (snapshots != NULL) {
RaftHeapFree(snapshots);
}
if (segments != NULL) {
RaftHeapFree(segments);
}
return rv;
}
static int makeFileCompressed(const char *dir,
const char *filename,
struct raft_buffer *bufs,
unsigned n_bufs,
char *errmsg)
{
int rv;
struct raft_buffer compressed = {0};
rv = Compress(bufs, n_bufs, &compressed, errmsg);
if (rv != 0) {
ErrMsgWrapf(errmsg, "compress %s", filename);
return RAFT_IOERR;
}
rv = UvFsMakeFile(dir, filename, &compressed, 1, errmsg);
raft_free(compressed.base);
return rv;
}
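/* Executed in the threadpool: write the metadata file first, then the
* (possibly compressed) snapshot data file, and finally sync the data
* directory and remove stale segments and snapshots. */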
static void uvSnapshotPutWorkCb(uv_work_t *work)
{
struct uvSnapshotPut *put = work->data;
struct uv *uv = put->uv;
char metadata[UV__FILENAME_LEN];
char snapshot[UV__FILENAME_LEN];
char errmsg[RAFT_ERRMSG_BUF_SIZE];
int rv;
sprintf(metadata, UV__SNAPSHOT_META_TEMPLATE, put->snapshot->term,
put->snapshot->index, put->meta.timestamp);
rv = UvFsMakeFile(uv->dir, metadata, put->meta.bufs, 2, put->errmsg);
if (rv != 0) {
tracef("snapshot.meta creation failed %d", rv);
ErrMsgWrapf(put->errmsg, "write %s", metadata);
put->status = RAFT_IOERR;
return;
}
sprintf(snapshot, UV__SNAPSHOT_TEMPLATE, put->snapshot->term,
put->snapshot->index, put->meta.timestamp);
tracef("snapshot write start");
if (uv->snapshot_compression) {
rv = makeFileCompressed(uv->dir, snapshot, put->snapshot->bufs,
put->snapshot->n_bufs, put->errmsg);
} else {
rv = UvFsMakeFile(uv->dir, snapshot, put->snapshot->bufs,
put->snapshot->n_bufs, put->errmsg);
}
tracef("snapshot write end %d", rv);
if (rv != 0) {
tracef("snapshot creation failed %d", rv);
ErrMsgWrapf(put->errmsg, "write %s", snapshot);
UvFsRemoveFile(uv->dir, metadata, errmsg);
UvFsRemoveFile(uv->dir, snapshot, errmsg);
put->status = RAFT_IOERR;
return;
}
rv = UvFsSyncDir(uv->dir, put->errmsg);
if (rv != 0) {
put->status = RAFT_IOERR;
return;
}
rv = uvRemoveOldSegmentsAndSnapshots(uv, put->snapshot->index,
put->trailing, put->errmsg);
if (rv != 0) {
put->status = rv;
return;
}
put->status = 0;
return;
}
/* Finish the put request, releasing all associated memory and invoking its
* callback. */
static void uvSnapshotPutFinish(struct uvSnapshotPut *put)
{
struct raft_io_snapshot_put *req = put->req;
int status = put->status;
struct uv *uv = put->uv;
assert(uv->snapshot_put_work.data == NULL);
RaftHeapFree(put->meta.bufs[1].base);
RaftHeapFree(put);
req->cb(req, status);
}
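/* Invoked on the main thread after the snapshot files have been written. */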
static void uvSnapshotPutAfterWorkCb(uv_work_t *work, int status)
{
struct uvSnapshotPut *put = work->data;
struct uv *uv = put->uv;
assert(status == 0);
uv->snapshot_put_work.data = NULL;
uvSnapshotPutFinish(put);
UvUnblock(uv);
}
/* Start processing the given put request. */
static void uvSnapshotPutStart(struct uvSnapshotPut *put)
{
struct uv *uv = put->uv;
int rv;
/* If this is an install request, the barrier callback must have fired.
*/
if (put->trailing == 0) {
assert(put->barrier.data == NULL);
}
uv->snapshot_put_work.data = put;
rv = uv_queue_work(uv->loop, &uv->snapshot_put_work,
uvSnapshotPutWorkCb, uvSnapshotPutAfterWorkCb);
if (rv != 0) {
tracef("store snapshot %lld: %s", put->snapshot->index,
uv_strerror(rv));
uv->errored = true;
}
}
static void uvSnapshotPutBarrierCb(struct UvBarrierReq *barrier)
{
/* Ensure that we don't invoke this callback more than once. */
barrier->cb = NULL;
struct uvSnapshotPut *put = barrier->data;
if (put == NULL) {
return;
}
struct uv *uv = put->uv;
put->barrier.data = NULL;
/* If we're closing, abort the request. */
if (uv->closing) {
put->status = RAFT_CANCELED;
uvSnapshotPutFinish(put);
uvMaybeFireCloseCb(uv);
return;
}
uvSnapshotPutStart(put);
}
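/* Implementation of raft_io->snapshot_put: prepare the metadata buffers and
* submit a barrier request, deferring the actual disk writes to the barrier
* callback. */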
int UvSnapshotPut(struct raft_io *io,
unsigned trailing,
struct raft_io_snapshot_put *req,
const struct raft_snapshot *snapshot,
raft_io_snapshot_put_cb cb)
{
struct uv *uv;
struct uvSnapshotPut *put;
void *cursor;
unsigned crc;
int rv;
raft_index next_index;
uv = io->impl;
if (uv->closing) {
return RAFT_CANCELED;
}
assert(uv->snapshot_put_work.data == NULL);
tracef("put snapshot at %lld, keeping %d", snapshot->index, trailing);
put = RaftHeapMalloc(sizeof *put);
if (put == NULL) {
rv = RAFT_NOMEM;
goto err;
}
put->uv = uv;
put->req = req;
put->snapshot = snapshot;
put->meta.timestamp = uv_now(uv->loop);
put->trailing = trailing;
put->barrier.data = put;
put->barrier.blocking = trailing == 0;
put->barrier.cb = uvSnapshotPutBarrierCb;
req->cb = cb;
/* Prepare the buffers for the metadata file. */
put->meta.bufs[0].base = put->meta.header;
put->meta.bufs[0].len = sizeof put->meta.header;
rv = configurationEncode(&snapshot->configuration, &put->meta.bufs[1]);
if (rv != 0) {
goto err_after_req_alloc;
}
cursor = put->meta.header;
bytePut64(&cursor, UV__DISK_FORMAT);
bytePut64(&cursor, 0);
bytePut64(&cursor, snapshot->configuration_index);
bytePut64(&cursor, put->meta.bufs[1].len);
crc = byteCrc32(&put->meta.header[2], sizeof(uint64_t) * 2, 0);
crc = byteCrc32(put->meta.bufs[1].base, put->meta.bufs[1].len, crc);
cursor = &put->meta.header[1];
bytePut64(&cursor, crc);
/* - If the trailing parameter is set to 0, it means that we're
* restoring a snapshot. Submit a barrier request setting the next
* append index to the snapshot's last index + 1.
* - When we are only writing a snapshot during normal operation, we
* close all current open segments. New writes can continue on newly
* opened segments that will only contain entries that are newer than
* the snapshot, and we don't change append_next_index. */
next_index =
(trailing == 0) ? (snapshot->index + 1) : uv->append_next_index;
rv = UvBarrier(uv, next_index, &put->barrier);
if (rv != 0) {
goto err_after_configuration_encode;
}
return 0;
err_after_configuration_encode:
RaftHeapFree(put->meta.bufs[1].base);
err_after_req_alloc:
RaftHeapFree(put);
err:
assert(rv != 0);
return rv;
}
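/* Executed in the threadpool: load the most recent snapshot from disk. */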
static void uvSnapshotGetWorkCb(uv_work_t *work)
{
struct uvSnapshotGet *get = work->data;
struct uv *uv = get->uv;
struct uvSnapshotInfo *snapshots;
size_t n_snapshots;
struct uvSegmentInfo *segments;
size_t n_segments;
int rv;
get->status = 0;
rv = UvList(uv, &snapshots, &n_snapshots, &segments, &n_segments,
get->errmsg);
if (rv != 0) {
get->status = rv;
goto out;
}
if (snapshots != NULL) {
rv = UvSnapshotLoad(uv, &snapshots[n_snapshots - 1],
get->snapshot, get->errmsg);
if (rv != 0) {
get->status = rv;
}
RaftHeapFree(snapshots);
}
if (segments != NULL) {
RaftHeapFree(segments);
}
out:
return;
}
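/* Invoked on the main thread after the snapshot has been loaded. */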
static void uvSnapshotGetAfterWorkCb(uv_work_t *work, int status)
{
struct uvSnapshotGet *get = work->data;
struct raft_io_snapshot_get *req = get->req;
struct raft_snapshot *snapshot = get->snapshot;
int req_status = get->status;
struct uv *uv = get->uv;
assert(status == 0);
queue_remove(&get->queue);
RaftHeapFree(get);
req->cb(req, snapshot, req_status);
uvMaybeFireCloseCb(uv);
}
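/* Implementation of raft_io->snapshot_get: load the last snapshot
* asynchronously using the threadpool. */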
int UvSnapshotGet(struct raft_io *io,
struct raft_io_snapshot_get *req,
raft_io_snapshot_get_cb cb)
{
struct uv *uv;
struct uvSnapshotGet *get;
int rv;
uv = io->impl;
assert(!uv->closing);
get = RaftHeapMalloc(sizeof *get);
if (get == NULL) {
rv = RAFT_NOMEM;
goto err;
}
get->uv = uv;
get->req = req;
req->cb = cb;
get->snapshot = RaftHeapMalloc(sizeof *get->snapshot);
if (get->snapshot == NULL) {
rv = RAFT_NOMEM;
goto err_after_req_alloc;
}
get->work.data = get;
queue_insert_tail(&uv->snapshot_get_reqs, &get->queue);
rv = uv_queue_work(uv->loop, &get->work, uvSnapshotGetWorkCb,
uvSnapshotGetAfterWorkCb);
if (rv != 0) {
queue_remove(&get->queue);
tracef("get last snapshot: %s", uv_strerror(rv));
rv = RAFT_IOERR;
goto err_after_snapshot_alloc;
}
return 0;
err_after_snapshot_alloc:
RaftHeapFree(get->snapshot);
err_after_req_alloc:
RaftHeapFree(get);
err:
assert(rv != 0);
return rv;
}
#undef tracef
dqlite-1.16.7/src/raft/uv_tcp.c 0000664 0000000 0000000 00000005124 14652527134 0016272 0 ustar 00root root 0000000 0000000 #include "uv_tcp.h"
#include "uv_ip.h"
#include <string.h>
#include "../raft.h"
#include "assert.h"
#include "err.h"
#include "heap.h"
/* Implementation of raft_uv_transport->init. */
static int uvTcpInit(struct raft_uv_transport *transport,
raft_id id,
const char *address)
{
struct UvTcp *t = transport->impl;
assert(id > 0);
assert(address != NULL);
t->id = id;
t->address = address;
return 0;
}
/* Implementation of raft_uv_transport->close. */
static void uvTcpClose(struct raft_uv_transport *transport,
raft_uv_transport_close_cb cb)
{
struct UvTcp *t = transport->impl;
assert(!t->closing);
t->closing = true;
t->close_cb = cb;
UvTcpListenClose(t);
UvTcpConnectClose(t);
UvTcpMaybeFireCloseCb(t);
}
void UvTcpMaybeFireCloseCb(struct UvTcp *t)
{
if (!t->closing) {
return;
}
assert(queue_empty(&t->accepting));
assert(queue_empty(&t->connecting));
if (!queue_empty(&t->aborting)) {
return;
}
if (t->listeners != NULL) {
return;
}
if (t->close_cb != NULL) {
t->close_cb(t->transport);
}
}
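/* Initialize the TCP-based implementation of the raft_uv_transport
* interface. */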
int raft_uv_tcp_init(struct raft_uv_transport *transport,
struct uv_loop_s *loop)
{
struct UvTcp *t;
void *data = transport->data;
int version = transport->version;
if (version != 1) {
ErrMsgPrintf(transport->errmsg, "Invalid version: %d", version);
return RAFT_INVALID;
}
memset(transport, 0, sizeof *transport);
transport->data = data;
transport->version = version;
t = raft_malloc(sizeof *t);
if (t == NULL) {
ErrMsgOom(transport->errmsg);
return RAFT_NOMEM;
}
t->transport = transport;
t->loop = loop;
t->id = 0;
t->address = NULL;
t->bind_address = NULL;
t->listeners = NULL;
t->n_listeners = 0;
t->accept_cb = NULL;
queue_init(&t->accepting);
queue_init(&t->connecting);
queue_init(&t->aborting);
t->closing = false;
t->close_cb = NULL;
transport->impl = t;
transport->init = uvTcpInit;
transport->close = uvTcpClose;
transport->listen = UvTcpListen;
transport->connect = UvTcpConnect;
return 0;
}
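/* Release all memory associated with the transport implementation. */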
void raft_uv_tcp_close(struct raft_uv_transport *transport)
{
struct UvTcp *t = transport->impl;
raft_free(t->bind_address);
raft_free(t);
}
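/* Set an optional address to bind the listening socket to, overriding the
* address passed to init(). */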
int raft_uv_tcp_set_bind_address(struct raft_uv_transport *transport,
const char *address)
{
struct UvTcp *t = transport->impl;
char hostname[NI_MAXHOST];
char service[NI_MAXSERV];
int rv;
rv = uvIpAddrSplit(address, hostname, sizeof(hostname), service,
sizeof(service));
if (rv != 0) {
return RAFT_INVALID;
}
t->bind_address = raft_malloc(strlen(address) + 1);
if (t->bind_address == NULL) {
return RAFT_NOMEM;
}
strcpy(t->bind_address, address);
return 0;
}
dqlite-1.16.7/src/raft/uv_tcp.h 0000664 0000000 0000000 00000003331 14652527134 0016275 0 ustar 00root root 0000000 0000000 #ifndef UV_TCP_H_
#define UV_TCP_H_
#include "../raft.h"
#include "../lib/queue.h"
/* Protocol version. */
#define UV__TCP_HANDSHAKE_PROTOCOL 1
struct UvTcp
{
struct raft_uv_transport *transport; /* Interface object we implement */
struct uv_loop_s *loop; /* Event loop */
raft_id id; /* ID of this raft server */
const char *address; /* Address of this raft server */
unsigned n_listeners; /* Number of listener sockets */
struct uv_tcp_s *listeners; /* Listener sockets */
raft_uv_accept_cb accept_cb; /* Call after accepting a connection */
queue accepting; /* Connections being accepted */
queue connecting; /* Pending connection requests */
queue aborting; /* Connections being aborted */
bool closing; /* True after close() is called */
raft_uv_transport_close_cb
close_cb; /* Call when it's safe to free us */
char *bind_address; /* Optional address:port to bind to */
};
/* Implementation of raft_uv_transport->listen. */
int UvTcpListen(struct raft_uv_transport *transport, raft_uv_accept_cb cb);
/* Stop accepting new connection and close all connections being accepted. */
void UvTcpListenClose(struct UvTcp *t);
/* Implementation of raft_uv_transport->connect. */
int UvTcpConnect(struct raft_uv_transport *transport,
struct raft_uv_connect *req,
raft_id id,
const char *address,
raft_uv_connect_cb cb);
/* Abort all pending connection requests. */
void UvTcpConnectClose(struct UvTcp *t);
/* Fire the transport close callback if the transport is closing and there's no
* more pending callback. */
void UvTcpMaybeFireCloseCb(struct UvTcp *t);
#endif /* UV_TCP_H_ */
dqlite-1.16.7/src/raft/uv_tcp_connect.c 0000664 0000000 0000000 00000025075 14652527134 0020012 0 ustar 00root root 0000000 0000000 #include <string.h>
#include "assert.h"
#include "byte.h"
#include "err.h"
#include "heap.h"
#include "uv_ip.h"
#include "uv_tcp.h"
/* The happy path of a connection request is:
*
* - Create a TCP handle and submit a TCP connect request.
* - Initiate an asynchronous DNS resolve request.
* - Once the name lookup is successful, connect to the first returned IP.
* - Once connected over TCP, submit a write request for the handshake.
* - Once the write completes, fire the connection request callback.
*
* Alternative happy path of a connection request, if the hostname resolves
* to multiple IPs and an earlier connect attempt fails:
* - close the tcp handle and initiate a new connect with the next IP in the
* connect callback
*
* Possible failure modes are:
*
* - The name resolution for the hostname is not successful: close the TCP
* handle and fire the request callback.
*
* - The transport gets closed: close the TCP handle and fire the request
* callback with RAFT_CANCELED.
*
* - Either the TCP connect or the write request fails: close the TCP handle and
* fire the request callback with RAFT_NOCONNECTION.
*/
/* Hold state for a single connection request. */
struct uvTcpConnect
{
struct UvTcp *t; /* Transport implementation */
struct raft_uv_connect *req; /* User request */
uv_buf_t handshake; /* Handshake data */
struct uv_tcp_s *tcp; /* TCP connection socket handle */
struct uv_getaddrinfo_s getaddrinfo; /* DNS resolve request */
const struct addrinfo
*ai_current; /* The current sockaddr to connect to */
struct uv_connect_s connect; /* TCP connection request */
struct uv_write_s write; /* TCP handshake request */
int status; /* Returned to the request callback */
bool resolving; /* Indicate name resolving in progress */
bool retry; /* Indicate tcp connect failure handling */
queue queue; /* Pending connect queue */
};
/* Encode a handshake message into the given buffer. */
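/* The wire format consists of three 64-bit words (protocol version, server
* ID and length of the address buffer), followed by the address string in a
* buffer padded to a multiple of 8 bytes. */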
static int uvTcpEncodeHandshake(raft_id id, const char *address, uv_buf_t *buf)
{
void *cursor;
size_t address_len = bytePad64(strlen(address) + 1);
buf->len = sizeof(uint64_t) + /* Protocol version. */
sizeof(uint64_t) + /* Server ID. */
sizeof(uint64_t) /* Size of the address buffer */;
buf->len += address_len;
buf->base = RaftHeapMalloc(buf->len);
if (buf->base == NULL) {
return RAFT_NOMEM;
}
cursor = buf->base;
bytePut64(&cursor, UV__TCP_HANDSHAKE_PROTOCOL);
bytePut64(&cursor, id);
bytePut64(&cursor, address_len);
strcpy(cursor, address);
return 0;
}
/* Finish the connect request, releasing its memory and firing the connect
* callback. */
static void uvTcpConnectFinish(struct uvTcpConnect *connect)
{
struct uv_stream_s *stream = (struct uv_stream_s *)connect->tcp;
struct raft_uv_connect *req = connect->req;
int status = connect->status;
queue_remove(&connect->queue);
RaftHeapFree(connect->handshake.base);
uv_freeaddrinfo(connect->getaddrinfo.addrinfo);
raft_free(connect);
req->cb(req, stream, status);
}
/* The TCP connection handle has been closed in consequence of an error or
* because the transport is closing. */
static void uvTcpConnectUvCloseCb(struct uv_handle_s *handle)
{
struct uvTcpConnect *connect = handle->data;
struct UvTcp *t = connect->t;
assert(connect->status != 0);
assert(handle == (struct uv_handle_s *)connect->tcp);
RaftHeapFree(connect->tcp);
connect->tcp = NULL;
uvTcpConnectFinish(connect);
UvTcpMaybeFireCloseCb(t);
}
/* Abort a connection request. */
static void uvTcpConnectAbort(struct uvTcpConnect *connect)
{
queue_remove(&connect->queue);
queue_insert_tail(&connect->t->aborting, &connect->queue);
uv_cancel((struct uv_req_s *)&connect->getaddrinfo);
/* Call uv_close on the tcp handle, if there is no getaddrinfo request
* in flight and the handle is not currently closed due to next IP
* connect attempt.
* Data structures may only be freed after the uvGetAddrInfoCb was
* triggered. Tcp handle will be closed in the uvGetAddrInfoCb in this
* case. uvTcpConnectUvCloseCb will be invoked from
* uvTcpTryNextConnectCb in case a next IP connect should be started. */
if (!connect->resolving && !connect->retry) {
uv_close((struct uv_handle_s *)connect->tcp,
uvTcpConnectUvCloseCb);
}
}
/* The handshake TCP write completes. Fire the connect callback. */
static void uvTcpConnectUvWriteCb(struct uv_write_s *write, int status)
{
struct uvTcpConnect *connect = write->data;
struct UvTcp *t = connect->t;
if (t->closing) {
connect->status = RAFT_CANCELED;
return;
}
if (status != 0) {
assert(status !=
UV_ECANCELED); /* t->closing would have been true */
connect->status = RAFT_NOCONNECTION;
uvTcpConnectAbort(connect);
return;
}
uvTcpConnectFinish(connect);
}
/* Helper function to connect to the remote node */
static void uvTcpAsyncConnect(struct uvTcpConnect *connect);
/* The TCP connect failed: we closed the handle and want to try the next IP. */
static void uvTcpTryNextConnectCb(struct uv_handle_s *handle)
{
struct uvTcpConnect *connect = handle->data;
struct UvTcp *t = connect->t;
int rv;
connect->retry = false;
if (t->closing) {
connect->status = RAFT_CANCELED;
/* We are already in close cb for the tcp handle, simply invoke
* final cb
*/
uvTcpConnectUvCloseCb(handle);
return;
}
rv = uv_tcp_init(t->loop, connect->tcp);
assert(rv == 0);
uvTcpAsyncConnect(connect);
}
/* The TCP connection is established. Write the handshake data. */
static void uvTcpConnectUvConnectCb(struct uv_connect_s *req, int status)
{
struct uvTcpConnect *connect = req->data;
struct UvTcp *t = connect->t;
int rv;
if (t->closing) {
connect->status = RAFT_CANCELED;
return;
}
if (status != 0) {
assert(status !=
UV_ECANCELED); /* t->closing would have been true */
connect->ai_current = connect->ai_current->ai_next;
if (connect->ai_current) {
/* For the next connect attempt we need to close the tcp
* handle. To avoid interference with aborting, set a flag
* indicating that a retry is in progress. */
connect->retry = true;
uv_close((struct uv_handle_s *)connect->tcp,
uvTcpTryNextConnectCb);
return;
}
connect->status = RAFT_NOCONNECTION;
ErrMsgPrintf(t->transport->errmsg, "uv_tcp_connect(): %s",
uv_strerror(status));
goto err;
}
rv = uv_write(&connect->write, (struct uv_stream_s *)connect->tcp,
&connect->handshake, 1, uvTcpConnectUvWriteCb);
if (rv != 0) {
/* UNTESTED: what are the error conditions? perhaps ENOMEM */
connect->status = RAFT_NOCONNECTION;
goto err;
}
return;
err:
uvTcpConnectAbort(connect);
}
/* Helper function to connect to the remote node */
static void uvTcpAsyncConnect(struct uvTcpConnect *connect)
{
int rv;
rv = uv_tcp_connect(&connect->connect, connect->tcp,
connect->ai_current->ai_addr,
uvTcpConnectUvConnectCb);
if (rv != 0) {
/* UNTESTED: since parsing succeed, this should fail only
* because of lack of system resources */
ErrMsgPrintf(connect->t->transport->errmsg,
"uv_tcp_connect(): %s", uv_strerror(rv));
connect->status = RAFT_NOCONNECTION;
uvTcpConnectAbort(connect);
}
}
/* The hostname resolve is finished */
static void uvGetAddrInfoCb(uv_getaddrinfo_t *req,
int status,
struct addrinfo *res)
{
struct uvTcpConnect *connect = req->data;
struct UvTcp *t = connect->t;
connect->resolving = false; /* The name resolving phase is over */
if (t->closing) {
connect->status = RAFT_CANCELED;
/* We need to close the tcp handle to abort connection attempt
*/
uv_close((struct uv_handle_s *)connect->tcp,
uvTcpConnectUvCloseCb);
return;
}
if (status < 0) {
ErrMsgPrintf(t->transport->errmsg, "uv_getaddrinfo(): %s",
uv_err_name(status));
connect->status = RAFT_NOCONNECTION;
uvTcpConnectAbort(connect);
return;
}
connect->ai_current = res;
uvTcpAsyncConnect(connect);
}
/* Create a new TCP handle and submit a connection request to the event loop. */
static int uvTcpConnectStart(struct uvTcpConnect *r, const char *address)
{
static struct addrinfo hints = {.ai_flags = 0,
.ai_family = AF_INET,
.ai_socktype = SOCK_STREAM,
.ai_protocol = 0};
struct UvTcp *t = r->t;
char hostname[NI_MAXHOST];
char service[NI_MAXSERV];
int rv;
r->handshake.base = NULL;
/* Initialize the handshake buffer. */
rv = uvTcpEncodeHandshake(t->id, t->address, &r->handshake);
if (rv != 0) {
assert(rv == RAFT_NOMEM);
ErrMsgOom(t->transport->errmsg);
goto err;
}
r->tcp = RaftHeapMalloc(sizeof *r->tcp);
if (r->tcp == NULL) {
ErrMsgOom(t->transport->errmsg);
rv = RAFT_NOMEM;
goto err;
}
rv = uv_tcp_init(r->t->loop, r->tcp);
assert(rv == 0);
r->tcp->data = r;
rv = uvIpAddrSplit(address, hostname, sizeof(hostname), service,
sizeof(service));
if (rv) {
ErrMsgPrintf(
t->transport->errmsg,
"uv_tcp_connect(): Cannot split %s into host and service",
address);
rv = RAFT_NOCONNECTION;
goto err_after_tcp_init;
}
rv = uv_getaddrinfo(r->t->loop, &r->getaddrinfo, &uvGetAddrInfoCb,
hostname, service, &hints);
if (rv) {
ErrMsgPrintf(t->transport->errmsg,
"uv_tcp_connect(): Cannot initiate getaddrinfo %s",
uv_strerror(rv));
rv = RAFT_NOCONNECTION;
goto err_after_tcp_init;
}
r->resolving = true; /* Indicate we are in the name resolving phase */
return 0;
err_after_tcp_init:
uv_close((uv_handle_t *)r->tcp, (uv_close_cb)RaftHeapFree);
err:
RaftHeapFree(r->handshake.base);
return rv;
}
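/* Implementation of raft_uv_transport->connect. */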
int UvTcpConnect(struct raft_uv_transport *transport,
struct raft_uv_connect *req,
raft_id id,
const char *address,
raft_uv_connect_cb cb)
{
struct UvTcp *t = transport->impl;
struct uvTcpConnect *r;
int rv;
(void)id;
assert(!t->closing);
/* Create and initialize a new TCP connection request object */
r = RaftHeapMalloc(sizeof *r);
if (r == NULL) {
rv = RAFT_NOMEM;
ErrMsgOom(transport->errmsg);
goto err;
}
r->t = t;
r->req = req;
r->status = 0;
r->write.data = r;
r->getaddrinfo.data = r;
r->resolving = false;
r->retry = false;
r->connect.data = r;
req->cb = cb;
/* Keep track of the pending request */
queue_insert_tail(&t->connecting, &r->queue);
/* Start connecting */
rv = uvTcpConnectStart(r, address);
if (rv != 0) {
goto err_after_alloc;
}
return 0;
err_after_alloc:
queue_remove(&r->queue);
RaftHeapFree(r);
err:
return rv;
}
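/* Abort all pending connection requests. */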
void UvTcpConnectClose(struct UvTcp *t)
{
while (!queue_empty(&t->connecting)) {
struct uvTcpConnect *connect;
queue *head;
head = queue_head(&t->connecting);
connect = QUEUE_DATA(head, struct uvTcpConnect, queue);
uvTcpConnectAbort(connect);
}
}
dqlite-1.16.7/src/raft/uv_tcp_listen.c 0000664 0000000 0000000 00000026351 14652527134 0017655 0 ustar 00root root 0000000 0000000 #include <string.h>
#include "assert.h"
#include "byte.h"
#include "heap.h"
#include "uv_ip.h"
#include "uv_tcp.h"
/* The happy path of an incoming connection is:
*
* - The connection callback is fired on the listener TCP handle, and the
* incoming connection is uv_accept()'ed. We call uv_read_start() to get
* notified about received handshake data.
*
* - Once the preamble is received, we start waiting for the server address.
*
* - Once the server address is received, we fire the receive callback.
*
* Possible failure modes are:
*
* - The accept process gets canceled in the transport->close() implementation,
* by calling tcp_accept_stop(): the incoming TCP connection handle gets
* closed, preventing any further handshake data notification, and all
* allocated memory gets released in the handle close callback.
*/
/* Hold state for a connection being accepted. */
struct uvTcpHandshake
{
uint64_t preamble[3]; /* Preamble buffer */
uv_buf_t address; /* Address buffer */
size_t nread; /* Number of bytes read */
};
/* Hold handshake data for a new connection being established. */
struct uvTcpIncoming
{
struct UvTcp *t; /* Transport implementation */
struct uv_tcp_s
*listener; /* The tcp handle, which accepted this socket */
struct uv_tcp_s *tcp; /* TCP connection socket handle */
struct uvTcpHandshake handshake; /* Handshake data */
queue queue; /* Pending accept queue */
};
/* Decode the handshake preamble, containing the protocol version, the ID of the
* connecting server and the length of its address. Also, allocate the buffer to
* start reading the server address. */
static int uvTcpDecodePreamble(struct uvTcpHandshake *h)
{
uint64_t protocol;
protocol = byteFlip64(h->preamble[0]);
if (protocol != UV__TCP_HANDSHAKE_PROTOCOL) {
return RAFT_MALFORMED;
}
h->address.len = (size_t)byteFlip64(h->preamble[2]);
h->address.base = RaftHeapMalloc(h->address.len);
if (h->address.base == NULL) {
return RAFT_NOMEM;
}
h->nread = 0;
return 0;
}
/* The accepted TCP client connection has been closed, release all memory
* associated with the accept object. We can get here only if an error occurred
* during the handshake or if raft_uv_transport->close() has been invoked. */
static void uvTcpIncomingCloseCb(struct uv_handle_s *handle)
{
struct uvTcpIncoming *incoming = handle->data;
struct UvTcp *t = incoming->t;
queue_remove(&incoming->queue);
if (incoming->handshake.address.base != NULL) {
RaftHeapFree(incoming->handshake.address.base);
}
RaftHeapFree(incoming->tcp);
RaftHeapFree(incoming);
UvTcpMaybeFireCloseCb(t);
}
/* Close an incoming TCP connection which hasn't completed the handshake yet. */
static void uvTcpIncomingAbort(struct uvTcpIncoming *incoming)
{
struct UvTcp *t = incoming->t;
/* After uv_close() returns we are guaranteed that no more alloc_cb or
* read_cb will be called. */
queue_remove(&incoming->queue);
queue_insert_tail(&t->aborting, &incoming->queue);
uv_close((struct uv_handle_s *)incoming->tcp, uvTcpIncomingCloseCb);
}
/* Read the address part of the handshake. */
static void uvTcpIncomingAllocCbAddress(struct uv_handle_s *handle,
size_t suggested_size,
uv_buf_t *buf)
{
struct uvTcpIncoming *incoming = handle->data;
(void)suggested_size;
assert(!incoming->t->closing);
buf->base =
incoming->handshake.address.base + incoming->handshake.nread;
buf->len = incoming->handshake.address.len - incoming->handshake.nread;
}
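/* Read callback for the address part of the handshake: once the whole
* address has been received, fire the accept callback. */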
static void uvTcpIncomingReadCbAddress(uv_stream_t *stream,
ssize_t nread,
const uv_buf_t *buf)
{
struct uvTcpIncoming *incoming = stream->data;
char *address;
raft_id id;
size_t n;
int rv;
(void)buf;
assert(!incoming->t->closing);
if (nread == 0) {
/* Empty read just ignore it. */
return;
}
if (nread < 0) {
uvTcpIncomingAbort(incoming);
return;
}
/* We shouldn't have read more data than the pending amount. */
n = (size_t)nread;
assert(n <=
incoming->handshake.address.len - incoming->handshake.nread);
/* Advance the read window */
incoming->handshake.nread += n;
/* If there's more data to read in order to fill the current
* read buffer, just return, we'll be invoked again. */
if (incoming->handshake.nread < incoming->handshake.address.len) {
return;
}
/* If we have completed reading the address, let's fire the callback. */
rv = uv_read_stop(stream);
assert(rv == 0);
id = byteFlip64(incoming->handshake.preamble[1]);
address = incoming->handshake.address.base;
queue_remove(&incoming->queue);
incoming->t->accept_cb(incoming->t->transport, id, address,
(struct uv_stream_s *)incoming->tcp);
RaftHeapFree(incoming->handshake.address.base);
RaftHeapFree(incoming);
}
/* Read the preamble of the handshake. */
static void uvTcpIncomingAllocCbPreamble(struct uv_handle_s *handle,
size_t suggested_size,
uv_buf_t *buf)
{
struct uvTcpIncoming *incoming = handle->data;
(void)suggested_size;
buf->base =
(char *)incoming->handshake.preamble + incoming->handshake.nread;
buf->len =
sizeof incoming->handshake.preamble - incoming->handshake.nread;
}
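/* Read callback for the handshake preamble: once it has been fully received,
* decode it and start reading the address. */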
static void uvTcpIncomingReadCbPreamble(uv_stream_t *stream,
ssize_t nread,
const uv_buf_t *buf)
{
struct uvTcpIncoming *incoming = stream->data;
size_t n;
int rv;
(void)buf;
if (nread == 0) {
/* Empty read just ignore it. */
return;
}
if (nread < 0) {
uvTcpIncomingAbort(incoming);
return;
}
/* We shouldn't have read more data than the pending amount. */
n = (size_t)nread;
assert(n <=
sizeof incoming->handshake.preamble - incoming->handshake.nread);
/* Advance the read window */
incoming->handshake.nread += n;
/* If there's more data to read in order to fill the current
* read buffer, just return, we'll be invoked again. */
if (incoming->handshake.nread < sizeof incoming->handshake.preamble) {
return;
}
/* If we have completed reading the preamble, let's parse it. */
rv = uvTcpDecodePreamble(&incoming->handshake);
if (rv != 0) {
uvTcpIncomingAbort(incoming);
return;
}
rv = uv_read_stop(stream);
assert(rv == 0);
rv = uv_read_start((uv_stream_t *)incoming->tcp,
uvTcpIncomingAllocCbAddress,
uvTcpIncomingReadCbAddress);
assert(rv == 0);
}
/* Start reading handshake data for a new incoming connection. */
static int uvTcpIncomingStart(struct uvTcpIncoming *incoming)
{
int rv;
memset(&incoming->handshake, 0, sizeof incoming->handshake);
incoming->tcp = RaftHeapMalloc(sizeof *incoming->tcp);
if (incoming->tcp == NULL) {
return RAFT_NOMEM;
}
incoming->tcp->data = incoming;
rv = uv_tcp_init(incoming->t->loop, incoming->tcp);
assert(rv == 0);
rv = uv_accept((struct uv_stream_s *)incoming->listener,
(struct uv_stream_s *)incoming->tcp);
if (rv != 0) {
rv = RAFT_IOERR;
goto err_after_tcp_init;
}
rv = uv_read_start((uv_stream_t *)incoming->tcp,
uvTcpIncomingAllocCbPreamble,
uvTcpIncomingReadCbPreamble);
assert(rv == 0);
return 0;
err_after_tcp_init:
uv_close((uv_handle_t *)incoming->tcp, (uv_close_cb)RaftHeapFree);
return rv;
}
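/* Evaluate to true if the given pointer falls within the given array. */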
#define IS_IN_ARRAY(elem, array, array_size) \
(const char *)(elem) >= (const char *)(array) && \
(const char *)(elem) < \
(const char *)(array) + array_size * sizeof(*array)
/* Called when there's a new incoming connection: create a new uvTcpIncoming
* object and start receiving handshake data. */
static void uvTcpListenCb(struct uv_stream_s *stream, int status)
{
struct UvTcp *t = stream->data;
struct uvTcpIncoming *incoming;
int rv;
assert(IS_IN_ARRAY(stream, t->listeners, t->n_listeners));
if (status != 0) {
rv = RAFT_IOERR;
goto err;
}
incoming = RaftHeapMalloc(sizeof *incoming);
if (incoming == NULL) {
rv = RAFT_NOMEM;
goto err;
}
incoming->t = t;
incoming->listener = (struct uv_tcp_s *)stream;
incoming->tcp = NULL;
queue_insert_tail(&t->accepting, &incoming->queue);
rv = uvTcpIncomingStart(incoming);
if (rv != 0) {
goto err_after_accept_alloc;
}
return;
err_after_accept_alloc:
queue_remove(&incoming->queue);
RaftHeapFree(incoming);
err:
assert(rv != 0);
}
/* Do bind/listen call on the tcp handle */
static int uvTcpBindListen(struct uv_tcp_s *listener, struct sockaddr *addr)
{
if (uv_tcp_bind(listener, addr, 0) ||
uv_listen((uv_stream_t *)listener, 1, uvTcpListenCb)) {
return RAFT_IOERR;
}
return 0;
}
/* Create a tcp handle and do bind/listen for each IP */
static int uvTcpListenOnMultipleIP(struct raft_uv_transport *transport,
struct addrinfo *addr_infos)
{
struct UvTcp *t;
struct addrinfo *current;
unsigned n_listeners;
int rv;
t = transport->impl;
n_listeners = 0;
for (current = addr_infos; current; current = current->ai_next) {
++n_listeners;
}
current = addr_infos;
t->listeners = raft_malloc(n_listeners * sizeof(*t->listeners));
if (!t->listeners) {
rv = RAFT_NOMEM;
goto err;
}
t->n_listeners = n_listeners;
for (n_listeners = 0; n_listeners < t->n_listeners; ++n_listeners) {
struct uv_tcp_s *listener = &t->listeners[n_listeners];
listener->data = t;
if (uv_tcp_init(t->loop, listener) ||
uvTcpBindListen(listener, current->ai_addr)) {
rv = RAFT_IOERR;
goto err;
}
current = current->ai_next;
}
return 0;
err:
if (t->listeners) {
for (unsigned i = 0; i <= n_listeners; ++i) {
uv_close((struct uv_handle_s *)&t->listeners[i], NULL);
}
raft_free(t->listeners);
t->listeners = NULL;
t->n_listeners = 0;
}
return rv;
}
/* Ignore duplicate entries from glibc getaddrinfo due to
* https://bugzilla.redhat.com/show_bug.cgi?id=496300
* when resolving localhost. */
static bool uvIsAddressDuplication(struct addrinfo *addr_info)
{
struct addrinfo *next = addr_info->ai_next;
/* Check if we have a list of exactly two entries */
if (!next || next->ai_next) {
return false;
}
if (addr_info->ai_addrlen != next->ai_addrlen ||
bcmp(addr_info->ai_addr, next->ai_addr, addr_info->ai_addrlen)) {
return false;
}
return true;
}
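/* Implementation of raft_uv_transport->listen: resolve the bind address and
* start listening on every resulting IP. */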
int UvTcpListen(struct raft_uv_transport *transport, raft_uv_accept_cb cb)
{
struct UvTcp *t;
struct addrinfo *addr_infos;
int rv;
t = transport->impl;
t->accept_cb = cb;
if (t->bind_address == NULL) {
rv = uvIpResolveBindAddresses(t->address, &addr_infos);
} else {
rv = uvIpResolveBindAddresses(t->bind_address, &addr_infos);
}
if (rv != 0 || !addr_infos) {
return rv;
}
if (addr_infos->ai_next && uvIsAddressDuplication(addr_infos)) {
rv = uvTcpListenOnMultipleIP(transport, addr_infos->ai_next);
} else {
rv = uvTcpListenOnMultipleIP(transport, addr_infos);
}
freeaddrinfo(addr_infos);
return rv;
}
/* Close callback for uvTcp->listener. */
static void uvTcpListenCloseCbListener(struct uv_handle_s *handle)
{
struct UvTcp *t = handle->data;
assert(t->closing);
assert(t->n_listeners);
assert(t->listeners);
if (--t->n_listeners == 0) {
raft_free(t->listeners);
t->listeners = NULL;
UvTcpMaybeFireCloseCb(t);
}
}
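/* Abort all connections being accepted and close all listener handles. */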
void UvTcpListenClose(struct UvTcp *t)
{
queue *head;
assert(t->closing);
while (!queue_empty(&t->accepting)) {
struct uvTcpIncoming *incoming;
head = queue_head(&t->accepting);
incoming = QUEUE_DATA(head, struct uvTcpIncoming, queue);
uvTcpIncomingAbort(incoming);
}
if (t->n_listeners) {
for (unsigned i = 0; i < t->n_listeners; ++i) {
uv_close((struct uv_handle_s *)&t->listeners[i],
uvTcpListenCloseCbListener);
}
}
}
dqlite-1.16.7/src/raft/uv_truncate.c 0000664 0000000 0000000 00000010752 14652527134 0017334 0 ustar 00root root 0000000 0000000 #include <string.h>
#include <unistd.h>
#include "assert.h"
#include "byte.h"
#include "heap.h"
#include "uv.h"
#include "uv_encoding.h"
/* Track a truncate request. */
struct uvTruncate
{
struct uv *uv;
struct UvBarrierReq barrier;
raft_index index;
int status;
};
/* Execute a truncate request in a thread. */
static void uvTruncateWorkCb(uv_work_t *work)
{
struct uvTruncate *truncate = work->data;
struct uv *uv = truncate->uv;
tracef("uv truncate work cb");
struct uvSnapshotInfo *snapshots;
struct uvSegmentInfo *segments;
struct uvSegmentInfo *segment;
size_t n_snapshots;
size_t n_segments;
size_t i;
size_t j;
char errmsg[RAFT_ERRMSG_BUF_SIZE];
int rv;
/* Load all segments on disk. */
rv = UvList(uv, &snapshots, &n_snapshots, &segments, &n_segments,
errmsg);
if (rv != 0) {
goto err;
}
if (snapshots != NULL) {
RaftHeapFree(snapshots);
}
assert(segments != NULL);
/* Find the segment that contains the truncate point. */
segment = NULL; /* Suppress warnings. */
for (i = 0; i < n_segments; i++) {
segment = &segments[i];
if (segment->is_open) {
continue;
}
if (truncate->index >= segment->first_index &&
truncate->index <= segment->end_index) {
break;
}
}
assert(i < n_segments);
/* If the truncate index is not the first of the segment, we need to
* truncate it. */
if (truncate->index > segment->first_index) {
rv = uvSegmentTruncate(uv, segment, truncate->index);
if (rv != 0) {
goto err_after_list;
}
}
/* Remove all closed segments past the one containing the truncate
* index. */
for (j = i; j < n_segments; j++) {
segment = &segments[j];
if (segment->is_open) {
continue;
}
rv = UvFsRemoveFile(uv->dir, segment->filename, errmsg);
if (rv != 0) {
tracef("unlink segment %s: %s", segment->filename,
errmsg);
rv = RAFT_IOERR;
goto err_after_list;
}
}
rv = UvFsSyncDir(uv->dir, errmsg);
if (rv != 0) {
tracef("sync data directory: %s", errmsg);
rv = RAFT_IOERR;
goto err_after_list;
}
RaftHeapFree(segments);
truncate->status = 0;
tracef("uv truncate work cb ok");
return;
err_after_list:
RaftHeapFree(segments);
err:
assert(rv != 0);
truncate->status = rv;
}
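/* Invoked on the main thread after the truncation work has completed. */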
static void uvTruncateAfterWorkCb(uv_work_t *work, int status)
{
assert(work != NULL);
struct uvTruncate *truncate = work->data;
assert(truncate != NULL);
struct uv *uv = truncate->uv;
assert(uv != NULL);
tracef("uv truncate after work cb status:%d", status);
assert(status == 0);
if (truncate->status != 0) {
uv->errored = true;
}
tracef("clear truncate work");
uv->truncate_work.data = NULL;
RaftHeapFree(truncate);
UvUnblock(uv);
}
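/* Invoked once the barrier has been reached: all inflight writes have
* completed and the truncation can be safely performed in the threadpool. */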
static void uvTruncateBarrierCb(struct UvBarrierReq *barrier)
{
struct uvTruncate *truncate = barrier->data;
struct uv *uv = truncate->uv;
tracef("uv truncate barrier cb");
int rv;
/* Ensure that we don't invoke this callback more than once. */
barrier->cb = NULL;
/* If we're closing, don't perform truncation at all and abort here. */
if (uv->closing) {
tracef("closing => don't truncate");
RaftHeapFree(truncate);
uvMaybeFireCloseCb(uv);
return;
}
assert(queue_empty(&uv->append_writing_reqs));
assert(queue_empty(&uv->finalize_reqs));
assert(uv->finalize_work.data == NULL);
assert(uv->truncate_work.data == NULL);
tracef("set truncate work");
uv->truncate_work.data = truncate;
rv = uv_queue_work(uv->loop, &uv->truncate_work, uvTruncateWorkCb,
uvTruncateAfterWorkCb);
if (rv != 0) {
tracef("truncate index %lld: %s", truncate->index,
uv_strerror(rv));
tracef("clear truncate work");
uv->truncate_work.data = NULL;
uv->errored = true;
}
}
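/* Implementation of raft_io->truncate: submit a blocking barrier request and
* perform the truncation once it has been reached. */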
int UvTruncate(struct raft_io *io, raft_index index)
{
struct uv *uv;
struct uvTruncate *truncate;
int rv;
uv = io->impl;
tracef("uv truncate %llu", index);
assert(!uv->closing);
/* We should truncate only entries that we were requested to append in
* the first place. */
assert(index > 0);
assert(index < uv->append_next_index);
truncate = RaftHeapMalloc(sizeof *truncate);
if (truncate == NULL) {
rv = RAFT_NOMEM;
goto err;
}
truncate->uv = uv;
truncate->index = index;
truncate->barrier.data = truncate;
truncate->barrier.blocking = true;
truncate->barrier.cb = uvTruncateBarrierCb;
/* Make sure that we wait for any inflight writes to finish and then
* close the current segment. */
rv = UvBarrier(uv, index, &truncate->barrier);
if (rv != 0) {
goto err_after_req_alloc;
}
return 0;
err_after_req_alloc:
RaftHeapFree(truncate);
err:
assert(rv != 0);
return rv;
}
#undef tracef
dqlite-1.16.7/src/raft/uv_work.c 0000664 0000000 0000000 00000002706 14652527134 0016471 0 ustar 00root root 0000000 0000000 #include "assert.h"
#include "heap.h"
#include "uv.h"
struct uvAsyncWork
{
struct uv *uv;
struct raft_io_async_work *req;
struct uv_work_s work;
int status;
queue queue;
};
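/* Executed in the threadpool: run the user-supplied work function. */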
static void uvAsyncWorkCb(uv_work_t *work)
{
struct uvAsyncWork *w = work->data;
assert(w != NULL);
int rv;
rv = w->req->work(w->req);
w->status = rv;
}
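/* Invoked on the main thread after the work function has run. */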
static void uvAsyncAfterWorkCb(uv_work_t *work, int status)
{
struct uvAsyncWork *w = work->data;
struct raft_io_async_work *req = w->req;
int req_status = w->status;
struct uv *uv = w->uv;
assert(status == 0);
queue_remove(&w->queue);
RaftHeapFree(w);
req->cb(req, req_status);
uvMaybeFireCloseCb(uv);
}
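/* Implementation of raft_io->async_work: schedule the request's work function
* to run in the threadpool. */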
int UvAsyncWork(struct raft_io *io,
struct raft_io_async_work *req,
raft_io_async_work_cb cb)
{
struct uv *uv;
struct uvAsyncWork *async_work;
int rv;
uv = io->impl;
assert(!uv->closing);
async_work = RaftHeapMalloc(sizeof *async_work);
if (async_work == NULL) {
rv = RAFT_NOMEM;
goto err;
}
async_work->uv = uv;
async_work->req = req;
async_work->work.data = async_work;
req->cb = cb;
queue_insert_tail(&uv->async_work_reqs, &async_work->queue);
rv = uv_queue_work(uv->loop, &async_work->work, uvAsyncWorkCb,
uvAsyncAfterWorkCb);
if (rv != 0) {
queue_remove(&async_work->queue);
tracef("async work: %s", uv_strerror(rv));
rv = RAFT_IOERR;
goto err_after_req_alloc;
}
return 0;
err_after_req_alloc:
RaftHeapFree(async_work);
err:
assert(rv != 0);
return rv;
}
#undef tracef
dqlite-1.16.7/src/raft/uv_writer.c 0000664 0000000 0000000 00000032661 14652527134 0017026 0 ustar 00root root 0000000 0000000 #include "uv_writer.h"
#include <string.h>
#include <unistd.h>