pax_global_header 0000666 0000000 0000000 00000000064 14750134674 0014525 g ustar 00root root 0000000 0000000 52 comment=08468cc3830185c75f9e7edefd88aa01e5c2f8ab
liburing-2.9/ 0000775 0000000 0000000 00000000000 14750134674 0013212 5 ustar 00root root 0000000 0000000 liburing-2.9/.github/ 0000775 0000000 0000000 00000000000 14750134674 0014552 5 ustar 00root root 0000000 0000000 liburing-2.9/.github/actions/ 0000775 0000000 0000000 00000000000 14750134674 0016212 5 ustar 00root root 0000000 0000000 liburing-2.9/.github/actions/codespell/ 0000775 0000000 0000000 00000000000 14750134674 0020164 5 ustar 00root root 0000000 0000000 liburing-2.9/.github/actions/codespell/stopwords 0000664 0000000 0000000 00000000062 14750134674 0022151 0 ustar 00root root 0000000 0000000 bu
cancelation
cancelations
cant
pring
sring
wont
liburing-2.9/.github/pull_request_template.md 0000664 0000000 0000000 00000005666 14750134674 0021530 0 ustar 00root root 0000000 0000000
----
## git request-pull output:
```
Generate your PR shortlog and diffstat with these commands:
git remote add axboe-tree https://github.com/axboe/liburing
git fetch axboe-tree
git request-pull axboe-tree/master your_fork_URL your_branch_name
Then replace this with the output of `git request-pull` command.
```
----
Click to show/hide pull request guidelines
## Pull Request Guidelines
1. To make it easy for everyone to filter pull requests from email
notifications, use `[GIT PULL]` as a prefix in your PR title.
```
[GIT PULL] Your Pull Request Title
```
2. Follow the commit message format rules below.
3. Follow the Linux kernel coding style (see: https://github.com/torvalds/linux/blob/master/Documentation/process/coding-style.rst).
### Commit message format rules:
1. The first line is the title (no more than 72 chars if possible).
2. Then an empty line.
3. Then a description (may be omitted for truly trivial changes).
4. Then an empty line again (if it has a description).
5. Then a `Signed-off-by` tag with your real name and email. For example:
```
Signed-off-by: Foo Bar <foo.bar@example.com>
```
The description should be word-wrapped at 72 chars. Some things should
not be word-wrapped. They may be some kind of quoted text - long
compiler error messages, oops reports, Link, etc. (things that have a
certain specific format).
Note that all of this goes in the commit message, not in the pull
request text. The pull request text should introduce what this pull
request does, and each commit message should explain the rationale for
why that particular change was made. The git tree is the canonical source
of truth, not GitHub.
Each patch should do one thing, and one thing only. If you find yourself
writing an explanation for why a patch is fixing multiple issues, that's
a good indication that the change should be split into separate patches.
If the commit is a fix for an issue, add a `Fixes` tag with the issue
URL.
Don't use GitHub anonymous email like this as the commit author:
```
123456789+username@users.noreply.github.com
```
Use a real email address!
### Commit message example:
```
src/queue: don't flush SQ ring for new wait interface
If we have IORING_FEAT_EXT_ARG, then timeouts are done through the
syscall instead of by posting an internal timeout. This was done
to be both more efficient, but also to enable multi-threaded use
the wait side. If we touch the SQ state by flushing it, that isn't
safe without synchronization.
Fixes: https://github.com/axboe/liburing/issues/402
Signed-off-by: Jens Axboe <axboe@kernel.dk>
```
----
## By submitting this pull request, I acknowledge that:
1. I have followed the above pull request guidelines.
2. I have the rights to submit this work under the same license.
3. I agree to a Developer Certificate of Origin (see https://developercertificate.org for more information).
liburing-2.9/.github/workflows/ 0000775 0000000 0000000 00000000000 14750134674 0016607 5 ustar 00root root 0000000 0000000 liburing-2.9/.github/workflows/build.yml 0000664 0000000 0000000 00000011543 14750134674 0020435 0 ustar 00root root 0000000 0000000 name: Build test
on:
# Trigger the workflow on push or pull requests.
push:
pull_request:
jobs:
build:
runs-on: ubuntu-24.04
strategy:
fail-fast: false
matrix:
include:
# x86-64 gcc
- arch: x86_64
cc_pkg: gcc-x86-64-linux-gnu
cxx_pkg: g++-x86-64-linux-gnu
cc: x86_64-linux-gnu-gcc
cxx: x86_64-linux-gnu-g++
sanitize: 0
# x86-64 gcc asan
- arch: x86_64
cc_pkg: gcc-x86-64-linux-gnu
cxx_pkg: g++-x86-64-linux-gnu
cc: x86_64-linux-gnu-gcc
cxx: x86_64-linux-gnu-g++
sanitize: 1
# x86-64 clang
- arch: x86_64
cc_pkg: clang
cxx_pkg: clang
cc: clang
cxx: clang++
liburing_extra_flags: -Wshorten-64-to-32
extra_flags: -Wmissing-prototypes -Wstrict-prototypes -Wunreachable-code-loop-increment -Wunreachable-code -Wmissing-variable-declarations -Wextra-semi-stmt
sanitize: 0
# x86 (32-bit) gcc
- arch: i686
cc_pkg: gcc-i686-linux-gnu
cxx_pkg: g++-i686-linux-gnu
cc: i686-linux-gnu-gcc
cxx: i686-linux-gnu-g++
sanitize: 0
# aarch64 gcc
- arch: aarch64
cc_pkg: gcc-aarch64-linux-gnu
cxx_pkg: g++-aarch64-linux-gnu
cc: aarch64-linux-gnu-gcc
cxx: aarch64-linux-gnu-g++
sanitize: 0
# arm (32-bit) gcc
- arch: arm
cc_pkg: gcc-arm-linux-gnueabi
cxx_pkg: g++-arm-linux-gnueabi
cc: arm-linux-gnueabi-gcc
cxx: arm-linux-gnueabi-g++
sanitize: 0
# riscv64
- arch: riscv64
cc_pkg: gcc-riscv64-linux-gnu
cxx_pkg: g++-riscv64-linux-gnu
cc: riscv64-linux-gnu-gcc
cxx: riscv64-linux-gnu-g++
sanitize: 0
# powerpc64
- arch: powerpc64
cc_pkg: gcc-powerpc64-linux-gnu
cxx_pkg: g++-powerpc64-linux-gnu
cc: powerpc64-linux-gnu-gcc
cxx: powerpc64-linux-gnu-g++
sanitize: 0
# powerpc
- arch: powerpc
cc_pkg: gcc-powerpc-linux-gnu
cxx_pkg: g++-powerpc-linux-gnu
cc: powerpc-linux-gnu-gcc
cxx: powerpc-linux-gnu-g++
sanitize: 0
# alpha
- arch: alpha
cc_pkg: gcc-alpha-linux-gnu
cxx_pkg: g++-alpha-linux-gnu
cc: alpha-linux-gnu-gcc
cxx: alpha-linux-gnu-g++
sanitize: 0
# mips64
- arch: mips64
cc_pkg: gcc-mips64-linux-gnuabi64
cxx_pkg: g++-mips64-linux-gnuabi64
cc: mips64-linux-gnuabi64-gcc
cxx: mips64-linux-gnuabi64-g++
sanitize: 0
# mips
- arch: mips
cc_pkg: gcc-mips-linux-gnu
cxx_pkg: g++-mips-linux-gnu
cc: mips-linux-gnu-gcc
cxx: mips-linux-gnu-g++
sanitize: 0
# hppa
- arch: hppa
cc_pkg: gcc-hppa-linux-gnu
cxx_pkg: g++-hppa-linux-gnu
cc: hppa-linux-gnu-gcc
cxx: hppa-linux-gnu-g++
sanitize: 0
env:
FLAGS: -g -O3 -Wall -Wextra -Werror -Wno-sign-compare ${{matrix.extra_flags}}
SANITIZE: ${{matrix.sanitize}}
# Flags for building sources in src/ dir only.
LIBURING_CFLAGS: ${{matrix.liburing_extra_flags}}
steps:
- name: Checkout source
uses: actions/checkout@v4
- name: Install Compilers
run: |
if [[ "${{matrix.cc_pkg}}" == "clang" ]]; then \
wget https://apt.llvm.org/llvm.sh -O /tmp/llvm.sh; \
sudo apt-get purge --auto-remove llvm python3-lldb-14 llvm-14 -y; \
sudo bash /tmp/llvm.sh 17; \
sudo update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-17 400; \
sudo update-alternatives --install /usr/bin/clang clang /usr/bin/clang-17 400; \
else \
sudo apt-get update -y; \
sudo apt-get install -y ${{matrix.cc_pkg}} ${{matrix.cxx_pkg}}; \
fi;
- name: Display compiler versions
run: |
${{matrix.cc}} --version;
${{matrix.cxx}} --version;
- name: Build
if: ${{matrix.sanitize == '0'}}
run: |
./configure --cc=${{matrix.cc}} --cxx=${{matrix.cxx}};
make -j$(nproc) V=1 CPPFLAGS="-Werror" CFLAGS="$FLAGS" CXXFLAGS="$FLAGS";
- name: Build
if: ${{matrix.sanitize == '1'}}
run: |
./configure --cc=${{matrix.cc}} --cxx=${{matrix.cxx}} --enable-sanitizer;
make -j$(nproc) V=1 CPPFLAGS="-Werror" CFLAGS="$FLAGS" CXXFLAGS="$FLAGS";
- name: Test install command
run: |
sudo make install;
liburing-2.9/.github/workflows/codespell.yml 0000664 0000000 0000000 00000000765 14750134674 0021314 0 ustar 00root root 0000000 0000000 name: Codespell
on:
# Trigger the workflow on push or pull requests.
push:
pull_request:
jobs:
test:
runs-on: ubuntu-24.04
steps:
- name: Checkout source
uses: actions/checkout@v4
- name: Install codespell
run: |
sudo apt-get update -y
sudo apt-get install -y codespell
- name: Display codespell version
run: codespell --version
- name: Execute codespell
run: codespell --ignore-words=.github/actions/codespell/stopwords .
liburing-2.9/.github/workflows/shellcheck.yml 0000664 0000000 0000000 00000000543 14750134674 0021441 0 ustar 00root root 0000000 0000000 name: Shellcheck
on:
# Trigger the workflow on push or pull requests.
push:
pull_request:
jobs:
test:
runs-on: ubuntu-22.04
steps:
- name: Checkout source
uses: actions/checkout@v4
- name: Display shellcheck version
run: shellcheck --version
- name: Shellcheck execution
run: shellcheck test/runtest*.sh
liburing-2.9/.gitignore 0000664 0000000 0000000 00000001326 14750134674 0015204 0 ustar 00root root 0000000 0000000 *.rej
*.orig
*~
/*.patch
*.d
*.o
*.o[ls]
/src/liburing.a
/src/liburing.so*
/src/liburing-ffi.a
/src/liburing-ffi.so*
/src/include/liburing/compat.h
/src/include/liburing/io_uring_version.h
/examples/io_uring-close-test
/examples/io_uring-cp
/examples/io_uring-test
/examples/io_uring-udp
/examples/link-cp
/examples/napi-busy-poll-client
/examples/napi-busy-poll-server
/examples/ucontext-cp
/examples/poll-bench
/examples/proxy
/examples/send-zerocopy
/examples/rsrc-update-bench
/examples/kdigest
/examples/reg-wait
/test/*.t
/test/*.dmesg
/test/output/
# Clang's compilation database file and directory.
/.cache
/compile_commands.json
config-host.h
config-host.mak
config.log
liburing.pc
liburing-ffi.pc
cscope.out
liburing-2.9/CHANGELOG 0000664 0000000 0000000 00000012773 14750134674 0014436 0 ustar 00root root 0000000 0000000 liburing-2.9 release
- Add support for ring resizing
- Add support for registered waits
- Test additions and improvements
- Fix bug with certain ring setups with SQE128 set not fully closing
the ring after io_uring_queue_exit(3) had been called.
- Various man page fixes and updates
liburing-2.8 release
- Add support for incrementally/partially consumed provided buffers,
usable with the provided buffer ring support.
- Add support for foo_and_wait_min_timeout(), where it's possible to
define a minimum timeout for waiting to get batches of completions,
but if that fails, extend for a longer timeout without having any
extra context switches.
- Add support for using different clock sources for completion waiting.
- Greatly increased coverage of test cases, test case improvements and
fixes.
- Man page updates
- Don't leak _GNU_SOURCE via pkg-config --cflags
- Support for address sanitizer
- Add examples/kdigest sample program
- Add discard helper, test, and man page
liburing-2.7 release
- Man page updates
- Sync with kernel 6.10
- send/recv bundle support
- accept nowait and CQE_F_MORE
- Add and update test cases
- Fix io_uring_queue_init_mem() returning a value that was too small,
potentially causing memory corruption in userspace by overwriting
64 bytes beyond the returned value. Also add test case for that.
- Add 64-bit length variants of io_uring_prep_{m,f}advise()
- Add BIND/LISTEN support and helpers / man pages
- Add io_uring_enable_rings.3 man page
- Fix bug in io_uring_prep_read_multishot()
- Fixup bundle test cases
- Add fixed-hugepage test case
- Fix io_uring_prep_fixed_fd_install.3 man page
- Note 'len' == 0 requirement in io_uring_prep_send.3 man page
- Fix some test cases for skipping on older kernels
liburing-2.6 release
- Add getsockopt and setsockopt socket commands
- Add test cases to test/hardlink
- Man page fixes
- Add futex support, and test cases
- Add waitid support, and test cases
- Add read multishot, and test cases
- Add support for IORING_SETUP_NO_SQARRAY
- Use IORING_SETUP_NO_SQARRAY as the default
- Add support for IORING_OP_FIXED_FD_INSTALL
- Add io_uring_prep_fixed_fd_install() helper
- Support for napi busy polling
- Improve/add test cases
- Man page fixes
- Add sample 'proxy' example
liburing-2.5 release
- Add support for io_uring_prep_cmd_sock()
- Add support for application allocated ring memory, for placing rings
in huge mem. Available through io_uring_queue_init_mem().
- Add support for registered ring fds
- Various documentation updates
- Various fixes
liburing-2.4 release
- Add io_uring_{major,minor,check}_version() functions.
- Add IO_URING_{MAJOR,MINOR,CHECK}_VERSION() macros.
- FFI support (for non-C/C++ languages integration).
- Add io_uring_prep_msg_ring_cqe_flags() function.
- Deprecate --nolibc configure option.
- CONFIG_NOLIBC is always enabled on x86-64, x86, and aarch64.
- Add support for IORING_REGISTER_USE_REGISTERED_RING and use if available.
- Add io_uring_close_ring_fd() function.
- Add io_uring_prep_msg_ring_fd_alloc function.
- Add io_uring_free_buf_ring() and io_uring_setup_buf_ring() functions.
- Ensure that io_uring_prep_accept_direct(), io_uring_prep_openat_direct(),
io_uring_prep_openat2_direct(), io_uring_prep_msg_ring_fd(), and
io_uring_prep_socket_direct() factor in being called with
IORING_FILE_INDEX_ALLOC for allocating a direct descriptor.
- Add io_uring_prep_sendto() function.
- Add io_uring_prep_cmd_sock() function.
liburing-2.3 release
- Support non-libc build for aarch64.
- Add io_uring_{enter,enter2,register,setup} syscall functions.
- Add sync cancel interface, io_uring_register_sync_cancel().
- Fix return value of io_uring_submit_and_wait_timeout() to match the
man page.
- Improvements to the regression tests
- Add support and test case for passthrough IO
- Add recv and recvmsg multishot helpers and support
- Add documentation and support for IORING_SETUP_DEFER_TASKRUN
- Fix potential missing kernel entry with IORING_SETUP_IOPOLL
- Add support and documentation for zero-copy network transmit
- Various optimizations
- Many cleanups
- Many man page additions and updates
liburing-2.2 release
- Support non-libc builds.
- Optimized syscall handling for x86-64/x86/aarch64.
- Enable non-lib function calls for fast path functions.
- Add support for multishot accept.
- io_uring_register_files() will set RLIMIT_NOFILE if necessary.
- Add support for registered ring fds, io_uring_register_ring_fd(),
reducing the overhead of an io_uring_enter() system call.
- Add support for the message ring opcode.
- Add support for newer request cancelation features.
- Add support for IORING_SETUP_COOP_TASKRUN, which can help reduce the
overhead of io_uring in general. Most applications should set this flag,
see the io_uring_setup.2 man page for details.
- Add support for registering a sparse buffer and file set.
- Add support for a new buffer provide scheme, see
io_uring_register_buf_ring.3 for details.
- Add io_uring_submit_and_wait_timeout() for submitting IO and waiting
for completions with a timeout.
- Add io_uring_prep_{read,write}v2 prep helpers.
- Add io_uring_prep_close_direct() helper.
- Add support for SQE128 and CQE32, which are doubly sized SQE and CQE
rings. This is needed for some cases of the new IORING_OP_URING_CMD,
notably for NVMe passthrough.
- ~5500 lines of man page additions, including adding ~90 new man pages.
- Synced with the 5.19 kernel release, supporting all the features of
5.19 and earlier.
- 24 new regression test cases, and ~7000 lines of new tests in general.
- General optimizations and fixes.
liburing-2.9/CITATION.cff 0000664 0000000 0000000 00000000364 14750134674 0015107 0 ustar 00root root 0000000 0000000 cff-version: 1.2.0
preferred-citation:
type: software
authors:
- family-names: "Axboe"
given-names: "Jens"
email: axboe@kernel.dk
title: "liburing library for io_uring"
year: 2022
url: "https://github.com/axboe/liburing"
license: MIT
liburing-2.9/CONTRIBUTING.md 0000664 0000000 0000000 00000015563 14750134674 0015455 0 ustar 00root root 0000000 0000000 Introduction
============
liburing welcomes contributions, whether they be bug fixes, features, or
documentation additions/updates. However, we do have some rules in place
to govern the sanity of the project, and all contributions should follow
the guidelines in this document. The main reasons for the rules are:
1) Keep the code consistent
2) Keep the git repository consistent
3) Maintain bisectability
Coding style
============
Generally, all the code in liburing should follow the same style. A few
known exceptions exist, like syzbot test cases that got committed rather
than re-writing them in a saner format. Any change you make, please
follow the style of the code around you.
Commit format
=============
Each commit should do one thing, and one thing only. If you find yourself,
in the commit message, adding phrases like "Also do [...]" or "While in
here [...]", then that's a sign that the change should have been split
into multiple commits. If your change includes some refactoring of code to
make your change possible, then that refactoring should be a separate
commit, done first. That means this preparatory commit won't have any
functional changes, and hence should be a no-op. It also means that your
main commit, with the change that you actually care about, will be smaller
and easier to review.
Each commit must stand on its own in terms of what it provides, and how it
works. Lots of changes are just a single commit, but for something a bit
more involved, it's not uncommon to have a pull request contain multiple
commits. Make each commit as simple as possible, and not any simpler. We'd
much rather see 10 simple commits than 2 more complicated ones. If you
stumble across something that needs fixing while making an unrelated
change, then please make that change as a separate commit, explaining why
it's being made.
Each commit in a series must be buildable, it's not enough that the end
result is buildable. See reason 3 in the introduction for why that's the
case.
No fixup commits! Sometimes people post a change and errors are pointed
out in the commit, and the author then does a followup fix for that
error. This isn't acceptable, please squash fixup commits into the
commit that introduced the problem in the first place. This is done by
amending the fix into the original commit that caused the issue. You can
do that with git rebase -i <sha> and arrange the commit order such that
the fixup is right after the original commit, and then use 's' (for
squash) to squash the fixup into the original commit. Don't forget to
edit the commit message while doing that, as git will combine the two
commit messages into one. Or you can do it manually. Once done, force
push your rewritten git history. See reasons 1-3 in the introduction
for why that is.
Commit message
==============
A good commit explains the WHY of a commit - explain the reason for this
commit to exist. Don't explain what the code in the commit does, that should
be readily apparent from just reading the code. If that isn't the case,
then a comment in the code is going to be more useful than a lengthy
explanation in the commit message. liburing commits use the following
format:
Title
Body of commit
Signed-off-by: ```My Identity <my@email.com>```
That is, a descriptive title on the first line, then an empty line, then
the body of the commit message, then an empty line, and finally an SOB
tag. The signed-off-by exists to provide proof of origin, see the
[DCO](https://developercertificate.org/).
Example commit:
```
commit 0fe5c09195c0918f89582dd6ff098a58a0bdf62a
Author: Jens Axboe <axboe@kernel.dk>
Date: Fri Sep 6 15:54:04 2024 -0600
configure: fix ublk_cmd header check
The previous commit is mixing private structures and defines with public
uapi ones. Testing for UBLK_U_CMD_START_DEV is fine, CTRL_CMD_HAS_DATA
is not. And struct ublk_ctrl_cmd_data is not a public struct.
Fixes: 83bc535a3118 ("configure: don't enable ublk if modern commands not available")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
```
Since this change is pretty trivial, a huge explanation need not be given
as to the reasonings for the change. However, for more complicated
changes, better reasonings should be given.
A Fixes line can be added if this commit fixes an issue in a previous
commit. That kind of meta data can be useful down the line for finding
dependencies between commits. Adding the following to your .gitconfig:
```
[pretty]
fixes = Fixes: %h (\"%s\")
```
and running ```git log -1 --pretty=fixes <sha>``` will then generate the correctly
formatted Fixes line for the commit. Likewise, other meta data can be:
Link: https://somesite/somewhere
can be useful to link to a discussion around the issue that led to this
commit, perhaps a bug report. This can be a GitHub issue as well. If a
commit closes/solves a GitHub issue, then:
Closes: https://github.com/axboe/liburing/issues/XXXX
can also be used.
Each commit message should be formatted so each full line is 72-74 chars
wide. For many of us, GitHub is not the primary location, and git log is
often used in a terminal to browse the repo. Breaking lines at 72-74
characters retains readability in an xterm/terminal.
Pull Requests
=============
The git repository itself is the canonical location for information. It's
quite fine to provide a lengthy explanation for a pull request on GitHub,
however please ensure that this doesn't come at the expense of the commit
messages themselves being lacking. The commit messages should stand on
their own and contain everything that you'd otherwise put in the PR
message. If you've worked on projects that send patches before, consider
the PR message similar to the cover letter for a series of patches.
Most contributors seem to use GH for sending patches, which is fine. If
you prefer using email, then patches can also be sent to the io_uring
mailing list: io-uring@vger.kernel.org.
liburing doesn't squash/rebase-on-merge, or other heinous practices
sometimes seen elsewhere. Whatever sha your commit has in your tree is
what it'll have in the upstream tree. Patches are applied directly, and
pull requests are merged with a merge commit. If meta data needs to go
into the merge commit, then it will go into the merge commit message.
This means that you don't need to continually rebase your changes on top
of the master branch.
Testing changes
===============
You should ALWAYS test your changes, no matter how trivial or obviously
correct they may seem. Nobody is infallible, and making mistakes is only
human.
liburing contains a wide variety of functional tests. If you make changes
to liburing, then you should run the test cases. This is done by building
the repo and running ```make runtests```. Note that some of the liburing
tests test for defects in older kernels, and hence it's possible that they
will crash on an outdated kernel that doesn't contain fixes from the
stable kernel tree. If in doubt, building and running the tests in a vm is
encouraged.
liburing-2.9/COPYING 0000664 0000000 0000000 00000063631 14750134674 0014256 0 ustar 00root root 0000000 0000000 GNU LESSER GENERAL PUBLIC LICENSE
Version 2.1, February 1999
Copyright (C) 1991, 1999 Free Software Foundation, Inc.
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
[This is the first released version of the Lesser GPL. It also counts
as the successor of the GNU Library Public License, version 2, hence
the version number 2.1.]
Preamble
The licenses for most software are designed to take away your
freedom to share and change it. By contrast, the GNU General Public
Licenses are intended to guarantee your freedom to share and change
free software--to make sure the software is free for all its users.
This license, the Lesser General Public License, applies to some
specially designated software packages--typically libraries--of the
Free Software Foundation and other authors who decide to use it. You
can use it too, but we suggest you first think carefully about whether
this license or the ordinary General Public License is the better
strategy to use in any particular case, based on the explanations below.
When we speak of free software, we are referring to freedom of use,
not price. Our General Public Licenses are designed to make sure that
you have the freedom to distribute copies of free software (and charge
for this service if you wish); that you receive source code or can get
it if you want it; that you can change the software and use pieces of
it in new free programs; and that you are informed that you can do
these things.
To protect your rights, we need to make restrictions that forbid
distributors to deny you these rights or to ask you to surrender these
rights. These restrictions translate to certain responsibilities for
you if you distribute copies of the library or if you modify it.
For example, if you distribute copies of the library, whether gratis
or for a fee, you must give the recipients all the rights that we gave
you. You must make sure that they, too, receive or can get the source
code. If you link other code with the library, you must provide
complete object files to the recipients, so that they can relink them
with the library after making changes to the library and recompiling
it. And you must show them these terms so they know their rights.
We protect your rights with a two-step method: (1) we copyright the
library, and (2) we offer you this license, which gives you legal
permission to copy, distribute and/or modify the library.
To protect each distributor, we want to make it very clear that
there is no warranty for the free library. Also, if the library is
modified by someone else and passed on, the recipients should know
that what they have is not the original version, so that the original
author's reputation will not be affected by problems that might be
introduced by others.
Finally, software patents pose a constant threat to the existence of
any free program. We wish to make sure that a company cannot
effectively restrict the users of a free program by obtaining a
restrictive license from a patent holder. Therefore, we insist that
any patent license obtained for a version of the library must be
consistent with the full freedom of use specified in this license.
Most GNU software, including some libraries, is covered by the
ordinary GNU General Public License. This license, the GNU Lesser
General Public License, applies to certain designated libraries, and
is quite different from the ordinary General Public License. We use
this license for certain libraries in order to permit linking those
libraries into non-free programs.
When a program is linked with a library, whether statically or using
a shared library, the combination of the two is legally speaking a
combined work, a derivative of the original library. The ordinary
General Public License therefore permits such linking only if the
entire combination fits its criteria of freedom. The Lesser General
Public License permits more lax criteria for linking other code with
the library.
We call this license the "Lesser" General Public License because it
does Less to protect the user's freedom than the ordinary General
Public License. It also provides other free software developers Less
of an advantage over competing non-free programs. These disadvantages
are the reason we use the ordinary General Public License for many
libraries. However, the Lesser license provides advantages in certain
special circumstances.
For example, on rare occasions, there may be a special need to
encourage the widest possible use of a certain library, so that it becomes
a de-facto standard. To achieve this, non-free programs must be
allowed to use the library. A more frequent case is that a free
library does the same job as widely used non-free libraries. In this
case, there is little to gain by limiting the free library to free
software only, so we use the Lesser General Public License.
In other cases, permission to use a particular library in non-free
programs enables a greater number of people to use a large body of
free software. For example, permission to use the GNU C Library in
non-free programs enables many more people to use the whole GNU
operating system, as well as its variant, the GNU/Linux operating
system.
Although the Lesser General Public License is Less protective of the
users' freedom, it does ensure that the user of a program that is
linked with the Library has the freedom and the wherewithal to run
that program using a modified version of the Library.
The precise terms and conditions for copying, distribution and
modification follow. Pay close attention to the difference between a
"work based on the library" and a "work that uses the library". The
former contains code derived from the library, whereas the latter must
be combined with the library in order to run.
GNU LESSER GENERAL PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. This License Agreement applies to any software library or other
program which contains a notice placed by the copyright holder or
other authorized party saying it may be distributed under the terms of
this Lesser General Public License (also called "this License").
Each licensee is addressed as "you".
A "library" means a collection of software functions and/or data
prepared so as to be conveniently linked with application programs
(which use some of those functions and data) to form executables.
The "Library", below, refers to any such software library or work
which has been distributed under these terms. A "work based on the
Library" means either the Library or any derivative work under
copyright law: that is to say, a work containing the Library or a
portion of it, either verbatim or with modifications and/or translated
straightforwardly into another language. (Hereinafter, translation is
included without limitation in the term "modification".)
"Source code" for a work means the preferred form of the work for
making modifications to it. For a library, complete source code means
all the source code for all modules it contains, plus any associated
interface definition files, plus the scripts used to control compilation
and installation of the library.
Activities other than copying, distribution and modification are not
covered by this License; they are outside its scope. The act of
running a program using the Library is not restricted, and output from
such a program is covered only if its contents constitute a work based
on the Library (independent of the use of the Library in a tool for
writing it). Whether that is true depends on what the Library does
and what the program that uses the Library does.
1. You may copy and distribute verbatim copies of the Library's
complete source code as you receive it, in any medium, provided that
you conspicuously and appropriately publish on each copy an
appropriate copyright notice and disclaimer of warranty; keep intact
all the notices that refer to this License and to the absence of any
warranty; and distribute a copy of this License along with the
Library.
You may charge a fee for the physical act of transferring a copy,
and you may at your option offer warranty protection in exchange for a
fee.
2. You may modify your copy or copies of the Library or any portion
of it, thus forming a work based on the Library, and copy and
distribute such modifications or work under the terms of Section 1
above, provided that you also meet all of these conditions:
a) The modified work must itself be a software library.
b) You must cause the files modified to carry prominent notices
stating that you changed the files and the date of any change.
c) You must cause the whole of the work to be licensed at no
charge to all third parties under the terms of this License.
d) If a facility in the modified Library refers to a function or a
table of data to be supplied by an application program that uses
the facility, other than as an argument passed when the facility
is invoked, then you must make a good faith effort to ensure that,
in the event an application does not supply such function or
table, the facility still operates, and performs whatever part of
its purpose remains meaningful.
(For example, a function in a library to compute square roots has
a purpose that is entirely well-defined independent of the
application. Therefore, Subsection 2d requires that any
application-supplied function or table used by this function must
be optional: if the application does not supply it, the square
root function must still compute square roots.)
These requirements apply to the modified work as a whole. If
identifiable sections of that work are not derived from the Library,
and can be reasonably considered independent and separate works in
themselves, then this License, and its terms, do not apply to those
sections when you distribute them as separate works. But when you
distribute the same sections as part of a whole which is a work based
on the Library, the distribution of the whole must be on the terms of
this License, whose permissions for other licensees extend to the
entire whole, and thus to each and every part regardless of who wrote
it.
Thus, it is not the intent of this section to claim rights or contest
your rights to work written entirely by you; rather, the intent is to
exercise the right to control the distribution of derivative or
collective works based on the Library.
In addition, mere aggregation of another work not based on the Library
with the Library (or with a work based on the Library) on a volume of
a storage or distribution medium does not bring the other work under
the scope of this License.
3. You may opt to apply the terms of the ordinary GNU General Public
License instead of this License to a given copy of the Library. To do
this, you must alter all the notices that refer to this License, so
that they refer to the ordinary GNU General Public License, version 2,
instead of to this License. (If a newer version than version 2 of the
ordinary GNU General Public License has appeared, then you can specify
that version instead if you wish.) Do not make any other change in
these notices.
Once this change is made in a given copy, it is irreversible for
that copy, so the ordinary GNU General Public License applies to all
subsequent copies and derivative works made from that copy.
This option is useful when you wish to copy part of the code of
the Library into a program that is not a library.
4. You may copy and distribute the Library (or a portion or
derivative of it, under Section 2) in object code or executable form
under the terms of Sections 1 and 2 above provided that you accompany
it with the complete corresponding machine-readable source code, which
must be distributed under the terms of Sections 1 and 2 above on a
medium customarily used for software interchange.
If distribution of object code is made by offering access to copy
from a designated place, then offering equivalent access to copy the
source code from the same place satisfies the requirement to
distribute the source code, even though third parties are not
compelled to copy the source along with the object code.
5. A program that contains no derivative of any portion of the
Library, but is designed to work with the Library by being compiled or
linked with it, is called a "work that uses the Library". Such a
work, in isolation, is not a derivative work of the Library, and
therefore falls outside the scope of this License.
However, linking a "work that uses the Library" with the Library
creates an executable that is a derivative of the Library (because it
contains portions of the Library), rather than a "work that uses the
library". The executable is therefore covered by this License.
Section 6 states terms for distribution of such executables.
When a "work that uses the Library" uses material from a header file
that is part of the Library, the object code for the work may be a
derivative work of the Library even though the source code is not.
Whether this is true is especially significant if the work can be
linked without the Library, or if the work is itself a library. The
threshold for this to be true is not precisely defined by law.
If such an object file uses only numerical parameters, data
structure layouts and accessors, and small macros and small inline
functions (ten lines or less in length), then the use of the object
file is unrestricted, regardless of whether it is legally a derivative
work. (Executables containing this object code plus portions of the
Library will still fall under Section 6.)
Otherwise, if the work is a derivative of the Library, you may
distribute the object code for the work under the terms of Section 6.
Any executables containing that work also fall under Section 6,
whether or not they are linked directly with the Library itself.
6. As an exception to the Sections above, you may also combine or
link a "work that uses the Library" with the Library to produce a
work containing portions of the Library, and distribute that work
under terms of your choice, provided that the terms permit
modification of the work for the customer's own use and reverse
engineering for debugging such modifications.
You must give prominent notice with each copy of the work that the
Library is used in it and that the Library and its use are covered by
this License. You must supply a copy of this License. If the work
during execution displays copyright notices, you must include the
copyright notice for the Library among them, as well as a reference
directing the user to the copy of this License. Also, you must do one
of these things:
a) Accompany the work with the complete corresponding
machine-readable source code for the Library including whatever
changes were used in the work (which must be distributed under
Sections 1 and 2 above); and, if the work is an executable linked
with the Library, with the complete machine-readable "work that
uses the Library", as object code and/or source code, so that the
user can modify the Library and then relink to produce a modified
executable containing the modified Library. (It is understood
that the user who changes the contents of definitions files in the
Library will not necessarily be able to recompile the application
to use the modified definitions.)
b) Use a suitable shared library mechanism for linking with the
Library. A suitable mechanism is one that (1) uses at run time a
copy of the library already present on the user's computer system,
rather than copying library functions into the executable, and (2)
will operate properly with a modified version of the library, if
the user installs one, as long as the modified version is
interface-compatible with the version that the work was made with.
c) Accompany the work with a written offer, valid for at
least three years, to give the same user the materials
specified in Subsection 6a, above, for a charge no more
than the cost of performing this distribution.
d) If distribution of the work is made by offering access to copy
from a designated place, offer equivalent access to copy the above
specified materials from the same place.
e) Verify that the user has already received a copy of these
materials or that you have already sent this user a copy.
For an executable, the required form of the "work that uses the
Library" must include any data and utility programs needed for
reproducing the executable from it. However, as a special exception,
the materials to be distributed need not include anything that is
normally distributed (in either source or binary form) with the major
components (compiler, kernel, and so on) of the operating system on
which the executable runs, unless that component itself accompanies
the executable.
It may happen that this requirement contradicts the license
restrictions of other proprietary libraries that do not normally
accompany the operating system. Such a contradiction means you cannot
use both them and the Library together in an executable that you
distribute.
7. You may place library facilities that are a work based on the
Library side-by-side in a single library together with other library
facilities not covered by this License, and distribute such a combined
library, provided that the separate distribution of the work based on
the Library and of the other library facilities is otherwise
permitted, and provided that you do these two things:
a) Accompany the combined library with a copy of the same work
based on the Library, uncombined with any other library
facilities. This must be distributed under the terms of the
Sections above.
b) Give prominent notice with the combined library of the fact
that part of it is a work based on the Library, and explaining
where to find the accompanying uncombined form of the same work.
8. You may not copy, modify, sublicense, link with, or distribute
the Library except as expressly provided under this License. Any
attempt otherwise to copy, modify, sublicense, link with, or
distribute the Library is void, and will automatically terminate your
rights under this License. However, parties who have received copies,
or rights, from you under this License will not have their licenses
terminated so long as such parties remain in full compliance.
9. You are not required to accept this License, since you have not
signed it. However, nothing else grants you permission to modify or
distribute the Library or its derivative works. These actions are
prohibited by law if you do not accept this License. Therefore, by
modifying or distributing the Library (or any work based on the
Library), you indicate your acceptance of this License to do so, and
all its terms and conditions for copying, distributing or modifying
the Library or works based on it.
10. Each time you redistribute the Library (or any work based on the
Library), the recipient automatically receives a license from the
original licensor to copy, distribute, link with or modify the Library
subject to these terms and conditions. You may not impose any further
restrictions on the recipients' exercise of the rights granted herein.
You are not responsible for enforcing compliance by third parties with
this License.
11. If, as a consequence of a court judgment or allegation of patent
infringement or for any other reason (not limited to patent issues),
conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot
distribute so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you
may not distribute the Library at all. For example, if a patent
license would not permit royalty-free redistribution of the Library by
all those who receive copies directly or indirectly through you, then
the only way you could satisfy both it and this License would be to
refrain entirely from distribution of the Library.
If any portion of this section is held invalid or unenforceable under any
particular circumstance, the balance of the section is intended to apply,
and the section as a whole is intended to apply in other circumstances.
It is not the purpose of this section to induce you to infringe any
patents or other property right claims or to contest validity of any
such claims; this section has the sole purpose of protecting the
integrity of the free software distribution system which is
implemented by public license practices. Many people have made
generous contributions to the wide range of software distributed
through that system in reliance on consistent application of that
system; it is up to the author/donor to decide if he or she is willing
to distribute software through any other system and a licensee cannot
impose that choice.
This section is intended to make thoroughly clear what is believed to
be a consequence of the rest of this License.
12. If the distribution and/or use of the Library is restricted in
certain countries either by patents or by copyrighted interfaces, the
original copyright holder who places the Library under this License may add
an explicit geographical distribution limitation excluding those countries,
so that distribution is permitted only in or among countries not thus
excluded. In such case, this License incorporates the limitation as if
written in the body of this License.
13. The Free Software Foundation may publish revised and/or new
versions of the Lesser General Public License from time to time.
Such new versions will be similar in spirit to the present version,
but may differ in detail to address new problems or concerns.
Each version is given a distinguishing version number. If the Library
specifies a version number of this License which applies to it and
"any later version", you have the option of following the terms and
conditions either of that version or of any later version published by
the Free Software Foundation. If the Library does not specify a
license version number, you may choose any version ever published by
the Free Software Foundation.
14. If you wish to incorporate parts of the Library into other free
programs whose distribution conditions are incompatible with these,
write to the author to ask for permission. For software which is
copyrighted by the Free Software Foundation, write to the Free
Software Foundation; we sometimes make exceptions for this. Our
decision will be guided by the two goals of preserving the free status
of all derivatives of our free software and of promoting the sharing
and reuse of software generally.
NO WARRANTY
15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
DAMAGES.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Libraries
If you develop a new library, and you want it to be of the greatest
possible use to the public, we recommend making it free software that
everyone can redistribute and change. You can do so by permitting
redistribution under these terms (or, alternatively, under the terms of the
ordinary General Public License).
To apply these terms, attach the following notices to the library. It is
safest to attach them to the start of each source file to most effectively
convey the exclusion of warranty; and each file should have at least the
"copyright" line and a pointer to where the full notice is found.
<one line to give the library's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Also add information on how to contact you by electronic and paper mail.
You should also get your employer (if you work as a programmer) or your
school, if any, to sign a "copyright disclaimer" for the library, if
necessary. Here is a sample; alter the names:
Yoyodyne, Inc., hereby disclaims all copyright interest in the
library `Frob' (a library for tweaking knobs) written by James Random Hacker.
<signature of Ty Coon>, 1 April 1990
Ty Coon, President of Vice
That's all there is to it!
liburing-2.9/COPYING.GPL 0000664 0000000 0000000 00000043254 14750134674 0014676 0 ustar 00root root 0000000 0000000 GNU GENERAL PUBLIC LICENSE
Version 2, June 1991
Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The licenses for most software are designed to take away your
freedom to share and change it. By contrast, the GNU General Public
License is intended to guarantee your freedom to share and change free
software--to make sure the software is free for all its users. This
General Public License applies to most of the Free Software
Foundation's software and to any other program whose authors commit to
using it. (Some other Free Software Foundation software is covered by
the GNU Lesser General Public License instead.) You can apply it to
your programs, too.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
this service if you wish), that you receive source code or can get it
if you want it, that you can change the software or use pieces of it
in new free programs; and that you know you can do these things.
To protect your rights, we need to make restrictions that forbid
anyone to deny you these rights or to ask you to surrender the rights.
These restrictions translate to certain responsibilities for you if you
distribute copies of the software, or if you modify it.
For example, if you distribute copies of such a program, whether
gratis or for a fee, you must give the recipients all the rights that
you have. You must make sure that they, too, receive or can get the
source code. And you must show them these terms so they know their
rights.
We protect your rights with two steps: (1) copyright the software, and
(2) offer you this license which gives you legal permission to copy,
distribute and/or modify the software.
Also, for each author's protection and ours, we want to make certain
that everyone understands that there is no warranty for this free
software. If the software is modified by someone else and passed on, we
want its recipients to know that what they have is not the original, so
that any problems introduced by others will not reflect on the original
authors' reputations.
Finally, any free program is threatened constantly by software
patents. We wish to avoid the danger that redistributors of a free
program will individually obtain patent licenses, in effect making the
program proprietary. To prevent this, we have made it clear that any
patent must be licensed for everyone's free use or not licensed at all.
The precise terms and conditions for copying, distribution and
modification follow.
GNU GENERAL PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. This License applies to any program or other work which contains
a notice placed by the copyright holder saying it may be distributed
under the terms of this General Public License. The "Program", below,
refers to any such program or work, and a "work based on the Program"
means either the Program or any derivative work under copyright law:
that is to say, a work containing the Program or a portion of it,
either verbatim or with modifications and/or translated into another
language. (Hereinafter, translation is included without limitation in
the term "modification".) Each licensee is addressed as "you".
Activities other than copying, distribution and modification are not
covered by this License; they are outside its scope. The act of
running the Program is not restricted, and the output from the Program
is covered only if its contents constitute a work based on the
Program (independent of having been made by running the Program).
Whether that is true depends on what the Program does.
1. You may copy and distribute verbatim copies of the Program's
source code as you receive it, in any medium, provided that you
conspicuously and appropriately publish on each copy an appropriate
copyright notice and disclaimer of warranty; keep intact all the
notices that refer to this License and to the absence of any warranty;
and give any other recipients of the Program a copy of this License
along with the Program.
You may charge a fee for the physical act of transferring a copy, and
you may at your option offer warranty protection in exchange for a fee.
2. You may modify your copy or copies of the Program or any portion
of it, thus forming a work based on the Program, and copy and
distribute such modifications or work under the terms of Section 1
above, provided that you also meet all of these conditions:
a) You must cause the modified files to carry prominent notices
stating that you changed the files and the date of any change.
b) You must cause any work that you distribute or publish, that in
whole or in part contains or is derived from the Program or any
part thereof, to be licensed as a whole at no charge to all third
parties under the terms of this License.
c) If the modified program normally reads commands interactively
when run, you must cause it, when started running for such
interactive use in the most ordinary way, to print or display an
announcement including an appropriate copyright notice and a
notice that there is no warranty (or else, saying that you provide
a warranty) and that users may redistribute the program under
these conditions, and telling the user how to view a copy of this
License. (Exception: if the Program itself is interactive but
does not normally print such an announcement, your work based on
the Program is not required to print an announcement.)
These requirements apply to the modified work as a whole. If
identifiable sections of that work are not derived from the Program,
and can be reasonably considered independent and separate works in
themselves, then this License, and its terms, do not apply to those
sections when you distribute them as separate works. But when you
distribute the same sections as part of a whole which is a work based
on the Program, the distribution of the whole must be on the terms of
this License, whose permissions for other licensees extend to the
entire whole, and thus to each and every part regardless of who wrote it.
Thus, it is not the intent of this section to claim rights or contest
your rights to work written entirely by you; rather, the intent is to
exercise the right to control the distribution of derivative or
collective works based on the Program.
In addition, mere aggregation of another work not based on the Program
with the Program (or with a work based on the Program) on a volume of
a storage or distribution medium does not bring the other work under
the scope of this License.
3. You may copy and distribute the Program (or a work based on it,
under Section 2) in object code or executable form under the terms of
Sections 1 and 2 above provided that you also do one of the following:
a) Accompany it with the complete corresponding machine-readable
source code, which must be distributed under the terms of Sections
1 and 2 above on a medium customarily used for software interchange; or,
b) Accompany it with a written offer, valid for at least three
years, to give any third party, for a charge no more than your
cost of physically performing source distribution, a complete
machine-readable copy of the corresponding source code, to be
distributed under the terms of Sections 1 and 2 above on a medium
customarily used for software interchange; or,
c) Accompany it with the information you received as to the offer
to distribute corresponding source code. (This alternative is
allowed only for noncommercial distribution and only if you
received the program in object code or executable form with such
an offer, in accord with Subsection b above.)
The source code for a work means the preferred form of the work for
making modifications to it. For an executable work, complete source
code means all the source code for all modules it contains, plus any
associated interface definition files, plus the scripts used to
control compilation and installation of the executable. However, as a
special exception, the source code distributed need not include
anything that is normally distributed (in either source or binary
form) with the major components (compiler, kernel, and so on) of the
operating system on which the executable runs, unless that component
itself accompanies the executable.
If distribution of executable or object code is made by offering
access to copy from a designated place, then offering equivalent
access to copy the source code from the same place counts as
distribution of the source code, even though third parties are not
compelled to copy the source along with the object code.
4. You may not copy, modify, sublicense, or distribute the Program
except as expressly provided under this License. Any attempt
otherwise to copy, modify, sublicense or distribute the Program is
void, and will automatically terminate your rights under this License.
However, parties who have received copies, or rights, from you under
this License will not have their licenses terminated so long as such
parties remain in full compliance.
5. You are not required to accept this License, since you have not
signed it. However, nothing else grants you permission to modify or
distribute the Program or its derivative works. These actions are
prohibited by law if you do not accept this License. Therefore, by
modifying or distributing the Program (or any work based on the
Program), you indicate your acceptance of this License to do so, and
all its terms and conditions for copying, distributing or modifying
the Program or works based on it.
6. Each time you redistribute the Program (or any work based on the
Program), the recipient automatically receives a license from the
original licensor to copy, distribute or modify the Program subject to
these terms and conditions. You may not impose any further
restrictions on the recipients' exercise of the rights granted herein.
You are not responsible for enforcing compliance by third parties to
this License.
7. If, as a consequence of a court judgment or allegation of patent
infringement or for any other reason (not limited to patent issues),
conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot
distribute so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you
may not distribute the Program at all. For example, if a patent
license would not permit royalty-free redistribution of the Program by
all those who receive copies directly or indirectly through you, then
the only way you could satisfy both it and this License would be to
refrain entirely from distribution of the Program.
If any portion of this section is held invalid or unenforceable under
any particular circumstance, the balance of the section is intended to
apply and the section as a whole is intended to apply in other
circumstances.
It is not the purpose of this section to induce you to infringe any
patents or other property right claims or to contest validity of any
such claims; this section has the sole purpose of protecting the
integrity of the free software distribution system, which is
implemented by public license practices. Many people have made
generous contributions to the wide range of software distributed
through that system in reliance on consistent application of that
system; it is up to the author/donor to decide if he or she is willing
to distribute software through any other system and a licensee cannot
impose that choice.
This section is intended to make thoroughly clear what is believed to
be a consequence of the rest of this License.
8. If the distribution and/or use of the Program is restricted in
certain countries either by patents or by copyrighted interfaces, the
original copyright holder who places the Program under this License
may add an explicit geographical distribution limitation excluding
those countries, so that distribution is permitted only in or among
countries not thus excluded. In such case, this License incorporates
the limitation as if written in the body of this License.
9. The Free Software Foundation may publish revised and/or new versions
of the General Public License from time to time. Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the Program
specifies a version number of this License which applies to it and "any
later version", you have the option of following the terms and conditions
either of that version or of any later version published by the Free
Software Foundation. If the Program does not specify a version number of
this License, you may choose any version ever published by the Free Software
Foundation.
10. If you wish to incorporate parts of the Program into other free
programs whose distribution conditions are different, write to the author
to ask for permission. For software which is copyrighted by the Free
Software Foundation, write to the Free Software Foundation; we sometimes
make exceptions for this. Our decision will be guided by the two goals
of preserving the free status of all derivatives of our free software and
of promoting the sharing and reuse of software generally.
NO WARRANTY
11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
REPAIR OR CORRECTION.
12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
POSSIBILITY OF SUCH DAMAGES.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
convey the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
<one line to give the program's name and a brief idea of what it does.>
Copyright (C) <year>  <name of author>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
Also add information on how to contact you by electronic and paper mail.
If the program is interactive, make it output a short notice like this
when it starts in an interactive mode:
Gnomovision version 69, Copyright (C) year name of author
Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
This is free software, and you are welcome to redistribute it
under certain conditions; type `show c' for details.
The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License. Of course, the commands you use may
be called something other than `show w' and `show c'; they could even be
mouse-clicks or menu items--whatever suits your program.
You should also get your employer (if you work as a programmer) or your
school, if any, to sign a "copyright disclaimer" for the program, if
necessary. Here is a sample; alter the names:
Yoyodyne, Inc., hereby disclaims all copyright interest in the program
`Gnomovision' (which makes passes at compilers) written by James Hacker.
<signature of Ty Coon>, 1 April 1989
Ty Coon, President of Vice
This General Public License does not permit incorporating your program into
proprietary programs. If your program is a subroutine library, you may
consider it more useful to permit linking proprietary applications with the
library. If this is what you want to do, use the GNU Lesser General
Public License instead of this License.
liburing-2.9/LICENSE 0000664 0000000 0000000 00000002032 14750134674 0014214 0 ustar 00root root 0000000 0000000 Copyright 2020 Jens Axboe
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
liburing-2.9/Makefile 0000664 0000000 0000000 00000005312 14750134674 0014653 0 ustar 00root root 0000000 0000000 include Makefile.common
# Prefer rpmbuild when it is installed, otherwise fall back to the legacy rpm
# binary. The probe sticks to POSIX sh syntax ('command -v', '>/dev/null 2>&1')
# because $(shell ...) runs make's SHELL (/bin/sh by default), and the
# bash-only '>&/dev/null' redirection is not portable there (dash rejects it).
RPMBUILD=$(shell command -v rpmbuild >/dev/null 2>&1 && echo "rpmbuild" || echo "rpm")
INSTALL=install
default: all
all:
@$(MAKE) -C src
@$(MAKE) -C test
@$(MAKE) -C examples
library:
@$(MAKE) -C src
# Every target listed here is a command, not a file. Declaring them phony
# keeps make from skipping one when a same-named file or directory exists in
# the tree (for example the test/ directory).
.PHONY: all install uninstall default clean test library
.PHONY: runtests runtests-loop runtests-parallel
.PHONY: install-tests uninstall-tests
.PHONY: tag-archive create-archive archive srpm
.PHONY: FORCE cscope
runtests: all
@$(MAKE) -C test runtests
runtests-loop: all
@$(MAKE) -C test runtests-loop
runtests-parallel: all
@$(MAKE) -C test runtests-parallel
# (Re)generate config-host.mak whenever the configure script is newer than it.
# If the file does not exist yet, configure is run without arguments.
# Otherwise the "Configured with" line recorded in the existing file is
# extracted with sed (everything up to the first ": " is dropped) and piped
# to sh, which re-runs configure with the previously recorded command line.
# The leading '+' asks make to run this recipe even under 'make -n'.
config-host.mak: configure
	+@if [ ! -e "$@" ]; then \
	  echo "Running configure ..."; \
	  ./configure; \
	else \
	  echo "$@ is out-of-date, running configure"; \
	  sed -n "/.*Configured with/s/[^:]*: //p" "$@" | sh; \
	fi
ifneq ($(MAKECMDGOALS),clean)
include config-host.mak
endif
%.pc: %.pc.in config-host.mak $(SPECFILE)
sed -e "s%@prefix@%$(prefix)%g" \
-e "s%@libdir@%$(libdir)%g" \
-e "s%@includedir@%$(includedir)%g" \
-e "s%@NAME@%$(NAME)%g" \
-e "s%@VERSION@%$(VERSION)%g" \
$< >$@
install: $(NAME).pc $(NAME)-ffi.pc
@$(MAKE) -C src install prefix=$(DESTDIR)$(prefix) \
includedir=$(DESTDIR)$(includedir) \
libdir=$(DESTDIR)$(libdir) \
libdevdir=$(DESTDIR)$(libdevdir) \
relativelibdir=$(relativelibdir)
$(INSTALL) -D -m 644 $(NAME).pc $(DESTDIR)$(libdevdir)/pkgconfig/$(NAME).pc
$(INSTALL) -D -m 644 $(NAME)-ffi.pc $(DESTDIR)$(libdevdir)/pkgconfig/$(NAME)-ffi.pc
$(INSTALL) -m 755 -d $(DESTDIR)$(mandir)/man2
$(INSTALL) -m 644 man/*.2 $(DESTDIR)$(mandir)/man2
$(INSTALL) -m 755 -d $(DESTDIR)$(mandir)/man3
$(INSTALL) -m 644 man/*.3 $(DESTDIR)$(mandir)/man3
$(INSTALL) -m 755 -d $(DESTDIR)$(mandir)/man7
$(INSTALL) -m 644 man/*.7 $(DESTDIR)$(mandir)/man7
uninstall:
@$(MAKE) -C src uninstall prefix=$(DESTDIR)$(prefix) datadir=$(DESTDIR)$(datadir)
@rm -f $(DESTDIR)$(libdevdir)/pkgconfig/$(NAME).pc
@rm -f $(DESTDIR)$(libdevdir)/pkgconfig/$(NAME)-ffi.pc
@rm -rf $(DESTDIR)$(mandir)/man2/io_uring*.2
@rm -rf $(DESTDIR)$(mandir)/man3/io_uring*.3
@rm -rf $(DESTDIR)$(mandir)/man7/io_uring*.7
install-tests:
@$(MAKE) -C test install prefix=$(DESTDIR)$(prefix) datadir=$(DESTDIR)$(datadir)
uninstall-tests:
@$(MAKE) -C test uninstall prefix=$(DESTDIR)$(prefix) datadir=$(DESTDIR)$(datadir)
clean:
@rm -f config-host.mak config-host.h cscope.out $(NAME).pc $(NAME)-ffi.pc test/*.dmesg
@$(MAKE) -C src clean
@$(MAKE) -C test clean
@$(MAKE) -C examples clean
cscope:
@cscope -b -R
tag-archive:
@git tag $(TAG)
create-archive:
@git archive --prefix=$(NAME)-$(VERSION)/ -o $(NAME)-$(VERSION).tar.gz $(TAG)
@echo "The final archive is ./$(NAME)-$(VERSION).tar.gz."
archive: clean tag-archive create-archive
srpm: create-archive
$(RPMBUILD) --define "_sourcedir `pwd`" --define "_srcrpmdir `pwd`" --nodeps -bs $(SPECFILE)
liburing-2.9/Makefile.common 0000664 0000000 0000000 00000000465 14750134674 0016146 0 ustar 00root root 0000000 0000000 TOP := $(dir $(CURDIR)/$(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)))
NAME=liburing
SPECFILE=$(TOP)/$(NAME).spec
VERSION=$(shell awk '/Version:/ { print $$2 }' $(SPECFILE))
VERSION_MAJOR=$(shell echo $(VERSION) | cut -d. -f1)
VERSION_MINOR=$(shell echo $(VERSION) | cut -d. -f2)
TAG = $(NAME)-$(VERSION)
liburing-2.9/Makefile.quiet 0000664 0000000 0000000 00000000355 14750134674 0016003 0 ustar 00root root 0000000 0000000 ifneq ($(findstring $(MAKEFLAGS),s),s)
ifndef V
QUIET_CC = @echo ' ' CC $@;
QUIET_CXX = @echo ' ' CXX $@;
QUIET_LINK = @echo ' ' LINK $@;
QUIET_AR = @echo ' ' AR $@;
QUIET_RANLIB = @echo '' RANLIB $@;
endif
endif
liburing-2.9/README 0000664 0000000 0000000 00000006072 14750134674 0014077 0 ustar 00root root 0000000 0000000 liburing
--------
This is the io_uring library, liburing. liburing provides helpers to set up
and tear down io_uring instances, and also a simplified interface for
applications that don't need (or want) to deal with the full kernel
side implementation.
For more info on io_uring, please see:
https://kernel.dk/io_uring.pdf
Subscribe to io-uring@vger.kernel.org for io_uring related discussions
and development for both kernel and userspace. The list is archived here:
https://lore.kernel.org/io-uring/
kernel version dependency
--------------------------
liburing itself is not tied to any specific kernel release, and hence it's
possible to use the newest liburing release even on older kernels (and vice
versa). Newer features may only be available on more recent kernels,
obviously.
ulimit settings
---------------
io_uring accounts memory it needs under the rlimit memlocked option, which
can be quite low on some setups (64K). The default is usually enough for
most use cases, but bigger rings or things like registered buffers deplete
it quickly. root isn't under this restriction, but regular users are. Going
into detail on how to bump the limit on various systems is beyond the scope
of this little blurb, but check /etc/security/limits.conf for user specific
settings, or /etc/systemd/user.conf and /etc/systemd/system.conf for systemd
setups. This affects 5.11 and earlier; newer kernels are less dependent
on RLIMIT_MEMLOCK as it is only used for registering buffers.
Regression tests
----------------
The bulk of liburing is actually regression/unit tests for both liburing and
the kernel io_uring support. Please note that this suite isn't expected to
pass on older kernels, and may even crash or hang older kernels!
Building liburing
-----------------
#
# Prepare build config (optional).
#
# --cc specifies the C compiler.
# --cxx specifies the C++ compiler.
#
./configure --cc=gcc --cxx=g++;
#
# Build liburing.
#
make -j$(nproc);
#
# Build liburing.pc
#
make liburing.pc
#
# Install liburing (headers, shared/static libs, and manpage).
#
sudo make install;
See './configure --help' for more information about build config options.
FFI support
-----------
By default, the build results in 4 lib files:
2 shared libs:
liburing.so
liburing-ffi.so
2 static libs:
liburing.a
liburing-ffi.a
Languages and applications that can't use 'static inline' functions in
liburing.h should use the FFI variants.
liburing's main public interface lives in liburing.h as 'static inline'
functions. Users wishing to consume liburing purely as a binary dependency
should link against liburing-ffi. It contains definitions for every 'static
inline' function.
License
-------
All software contained within this repo is dual licensed LGPL and MIT, see
COPYING and LICENSE, except for a header coming from the kernel which is
dual licensed GPL with a Linux-syscall-note exception and MIT, see
COPYING.GPL and <https://spdx.org/licenses/Linux-syscall-note.html>.
Jens Axboe 2022-05-19
liburing-2.9/SECURITY.md 0000664 0000000 0000000 00000000316 14750134674 0015003 0 ustar 00root root 0000000 0000000 # Security Policy
## Reporting a Vulnerability
Please report any security issue to axboe@kernel.dk where the issue will be triaged appropriately.
Thank you in advance for helping to keep liburing secure.
liburing-2.9/configure 0000775 0000000 0000000 00000036613 14750134674 0015132 0 ustar 00root root 0000000 0000000 #!/bin/sh
set -e
cc=${CC:-gcc}
cxx=${CXX:-g++}
# Parse the command-line options. Options that carry a value use the
# --name=value form; the text after the first '=' is placed in $optarg.
for opt do
	# expr exits non-zero when the match is empty (no '=' or nothing after
	# it); '|| true' stops that from aborting the script under 'set -e'.
	optarg=$(expr "x$opt" : 'x[^=]*=\(.*\)' || true)
	case "$opt" in
	--help|-h) show_help=yes
	;;
	# optarg is quoted so that a prefix containing whitespace reaches
	# realpath as a single argument.
	--prefix=*) prefix="$(realpath -s "$optarg")"
	;;
	--includedir=*) includedir="$optarg"
	;;
	--libdir=*) libdir="$optarg"
	;;
	--libdevdir=*) libdevdir="$optarg"
	;;
	--mandir=*) mandir="$optarg"
	;;
	--datadir=*) datadir="$optarg"
	;;
	--cc=*) cc="$optarg"
	;;
	--cxx=*) cxx="$optarg"
	;;
	--use-libc) use_libc=yes
	;;
	--enable-sanitizer) use_sanitizer=yes
	;;
	*)
		echo "ERROR: unknown option $opt"
		echo "Try '$0 --help' for more information"
		exit 1
	;;
	esac
done
# Fill in every installation directory that was not given on the command
# line. ${var:=default} assigns only when var is unset or empty, which is the
# same condition 'test -z' checks.
: "${prefix:=/usr}"
: "${includedir:=$prefix/include}"
: "${libdir:=$prefix/lib}"
: "${libdevdir:=$prefix/lib}"
: "${mandir:=$prefix/man}"
: "${datadir:=$prefix/share}"

# relativelibdir stays empty when the runtime and development libraries share
# one directory; otherwise it is libdir followed by a slash.
relativelibdir=""
if test x"$libdir" != x"$libdevdir"; then
	relativelibdir="$libdir/"
fi
if test "$show_help" = "yes"; then
cat <
trap "rm -rf $TMP_DIRECTORY" EXIT INT QUIT TERM
rm -rf config.log
config_host_mak="config-host.mak"
config_host_h="config-host.h"
rm -rf $config_host_mak
rm -rf $config_host_h
# Abort configuration: print the caller's message, point the user at
# config.log, remove the partially written output files and exit with
# status 1.
fatal() {
	# "$@" is quoted so the message is printed verbatim instead of being
	# re-split and glob-expanded by the shell.
	echo "$@"
	echo "Configure failed, check config.log and/or the above output"
	rm -rf $config_host_mak
	rm -rf $config_host_h
	exit 1
}
# Emit one line of the configuration summary: the test name ($1),
# left-justified and padded to 35 columns, followed by its result ($2).
print_config() {
	printf '%-35s%s\n' "$1" "$2"
}
# Default CFLAGS
CFLAGS="-D_GNU_SOURCE -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -include config-host.h"
BUILD_CFLAGS=""
# Start config-host.h with a C comment banner that records the exact
# configure command line.
{
	echo "/*"
	echo " * Automatically generated by configure - do not modify"
	printf " * Configured with:"
	printf " * '%s'" "$0" "$@"
	echo ""
	echo " */"
} > $config_host_h

# Start config-host.mak with the equivalent banner as make comments.
{
	echo "# Automatically generated by configure - do not modify"
	printf "# Configured with:"
	printf " '%s'" "$0" "$@"
	echo
} > $config_host_mak
# Invoke the C++ compiler ($cxx) with the given arguments. The command line
# and everything the compiler prints are appended to config.log. Returns 0
# when the compiler succeeds, otherwise the compiler's exit status.
do_cxx() {
	echo $cxx "$@" >> config.log
	$cxx "$@" >> config.log 2>&1 && return 0
	# Only reached on failure; $? still holds the compiler's status.
	return $?
}
# Invoke the C compiler ($cc) with the given arguments, logging the command
# line and all compiler output to config.log. Returns the compiler's exit
# status if the first run fails, and 0 if it succeeds and no -Werror re-check
# is needed. When $werror is "yes" and the arguments do not already contain
# -Werror, the compile is repeated with -Werror; a failure of that second run
# ends the script through fatal().
# NOTE(review): none of the options parsed by this script set $werror or
# accept --disable-werror, so the re-check appears reachable only if werror
# comes from the environment - confirm before relying on it.
do_cc() {
	# Run the compiler, capturing its output to the log.
	echo $cc "$@" >> config.log
	$cc "$@" >> config.log 2>&1 || return $?
	# Test passed. If this is an --enable-werror build, rerun
	# the test with -Werror and bail out if it fails. This
	# makes warning-generating-errors in configure test code
	# obvious to developers.
	if test "$werror" != "yes"; then
		return 0
	fi
	# Don't bother rerunning the compile if we were already using -Werror
	case "$*" in
	*-Werror*)
		return 0
	;;
	esac
	echo $cc -Werror "$@" >> config.log
	# On success this returns 0 ($? of the compile); on failure control
	# falls through to the error report below.
	$cc -Werror "$@" >> config.log 2>&1 && return $?
	echo "ERROR: configure test passed without -Werror but failed with -Werror."
	echo "This is probably a bug in the configure script. The failing command"
	echo "will be at the bottom of config.log."
	fatal "You can run configure with --disable-werror to bypass this check."
}
# Try to build the current C test program ($TMPC) into $TMPE via do_cc and
# return do_cc's exit status.
#   $1 - extra compiler flags for this test
#   $2 - extra linker flags for this test ($LIBS is appended)
#   $3 - test name, used only for the config.log entry
# The flag variables are expanded unquoted on purpose so that each
# whitespace-separated flag becomes its own compiler argument.
compile_prog() {
	local_cflags="$1"
	local_ldflags="$2 $LIBS"
	echo "Compiling test case $3" >> config.log
	do_cc $CFLAGS $local_cflags -o $TMPE $TMPC $LDFLAGS $local_ldflags
}
# C++ counterpart of compile_prog: try to build $TMPCXX into $TMPE via do_cxx
# and return do_cxx's exit status.
#   $1 - extra compiler flags for this test
#   $2 - extra linker flags for this test ($LIBS is appended)
#   $3 - test name, used only for the config.log entry
# The flag variables are expanded unquoted on purpose so that each
# whitespace-separated flag becomes its own compiler argument.
compile_prog_cxx() {
	local_cflags="$1"
	local_ldflags="$2 $LIBS"
	echo "Compiling test case $3" >> config.log
	do_cxx $CFLAGS $local_cflags -o $TMPE $TMPCXX $LDFLAGS $local_ldflags
}
# Succeed if the shell's 'type' builtin can resolve $1; all output of the
# lookup is discarded, only the exit status is reported.
has() {
	type "$1" >/dev/null 2>&1
}
# Append a "NAME=VALUE" assignment ($1=$2) to the generated config-host.mak.
output_mak() {
	echo "$1=$2" >> $config_host_mak
}
# Record a detected feature in both generated files: "$1=y" goes to
# config-host.mak and "#define $1" goes to config-host.h.
output_sym() {
	output_mak "$1" "y"
	echo "#define $1" >> $config_host_h
}
# Show the setting $1 with value $2 in the configuration summary and also
# store it as "$1=$2" in config-host.mak.
print_and_output_mak() {
	print_config "$1" "$2"
	output_mak "$1" "$2"
}
print_and_output_mak "prefix" "$prefix"
print_and_output_mak "includedir" "$includedir"
print_and_output_mak "libdir" "$libdir"
print_and_output_mak "libdevdir" "$libdevdir"
print_and_output_mak "relativelibdir" "$relativelibdir"
print_and_output_mak "mandir" "$mandir"
print_and_output_mak "datadir" "$datadir"
####################################################
# Check for correct compiler runtime library to link with
libgcc_link_flag="-lgcc"
if $cc -print-libgcc-file-name >/dev/null 2>&1; then
libgcc_link_flag="$($cc $CFLAGS $LDFLAGS -print-libgcc-file-name)"
fi
print_and_output_mak "libgcc_link_flag" "$libgcc_link_flag"
####################################################
##########################################
# check for compiler -Wstringop-overflow
stringop_overflow="no"
cat > $TMPC << EOF
#include
int main(int argc, char **argv)
{
return 0;
}
EOF
if compile_prog "-Werror -Wstringop-overflow=0" "" "stringop_overflow"; then
stringop_overflow="yes"
fi
print_config "stringop_overflow" "$stringop_overflow"
##########################################
# check for compiler -Warray-bounds
array_bounds="no"
cat > $TMPC << EOF
#include
int main(int argc, char **argv)
{
return 0;
}
EOF
if compile_prog "-Werror -Warray-bounds=0" "" "array_bounds"; then
array_bounds="yes"
fi
print_config "array_bounds" "$array_bounds"
##########################################
# check for __kernel_rwf_t
__kernel_rwf_t="no"
cat > $TMPC << EOF
#include
int main(int argc, char **argv)
{
__kernel_rwf_t x;
x = 0;
return x;
}
EOF
if compile_prog "" "" "__kernel_rwf_t"; then
__kernel_rwf_t="yes"
fi
print_config "__kernel_rwf_t" "$__kernel_rwf_t"
##########################################
# check for __kernel_timespec
__kernel_timespec="no"
cat > $TMPC << EOF
#include
#include
int main(int argc, char **argv)
{
struct __kernel_timespec ts;
ts.tv_sec = 0;
ts.tv_nsec = 1;
return 0;
}
EOF
if compile_prog "" "" "__kernel_timespec"; then
__kernel_timespec="yes"
fi
print_config "__kernel_timespec" "$__kernel_timespec"
##########################################
# check for open_how
open_how="no"
cat > $TMPC << EOF
#include
#include
#include
#include
int main(int argc, char **argv)
{
struct open_how how;
how.flags = 0;
how.mode = 0;
how.resolve = 0;
return 0;
}
EOF
if compile_prog "" "" "open_how"; then
open_how="yes"
fi
print_config "open_how" "$open_how"
##########################################
# check for statx
statx="no"
cat > $TMPC << EOF
#include
#include
#include
#include
#include
int main(int argc, char **argv)
{
struct statx x;
return memset(&x, 0, sizeof(x)) != NULL;
}
EOF
if compile_prog "" "" "statx"; then
statx="yes"
fi
print_config "statx" "$statx"
##########################################
# check for glibc statx
glibc_statx="no"
cat > $TMPC << EOF
#include
#include
#include
#include
#include
int main(int argc, char **argv)
{
struct statx x;
return memset(&x, 0, sizeof(x)) != NULL;
}
EOF
if compile_prog "" "" "glibc_statx"; then
glibc_statx="yes"
fi
print_config "glibc_statx" "$glibc_statx"
##########################################
# check for C++
has_cxx="no"
cat > $TMPCXX << EOF
#include
int main(int argc, char **argv)
{
std::cout << "Test";
return 0;
}
EOF
if compile_prog_cxx "" "" "C++"; then
has_cxx="yes"
fi
print_config "C++" "$has_cxx"
##########################################
# check for ucontext support
has_ucontext="no"
cat > $TMPC << EOF
#include
int main(int argc, char **argv)
{
ucontext_t ctx;
getcontext(&ctx);
makecontext(&ctx, 0, 0);
return 0;
}
EOF
if compile_prog "" "" "has_ucontext"; then
has_ucontext="yes"
fi
print_config "has_ucontext" "$has_ucontext"
##########################################
# Check NVME_URING_CMD support
nvme_uring_cmd="no"
cat > $TMPC << EOF
#include
int main(void)
{
struct nvme_uring_cmd *cmd;
return sizeof(struct nvme_uring_cmd);
}
EOF
if compile_prog "" "" "nvme uring cmd"; then
nvme_uring_cmd="yes"
fi
print_config "NVMe uring command support" "$nvme_uring_cmd"
##########################################
# Check futexv support
futexv="no"
cat > $TMPC << EOF
#include
#include
#include
int main(void)
{
struct futex_waitv fw;
memset(&fw, FUTEX_32, sizeof(fw));
return sizeof(struct futex_waitv);
}
EOF
if compile_prog "" "" "futexv"; then
futexv="yes"
fi
print_config "futex waitv support" "$futexv"
##########################################
# Check block discard cmd support
discard_cmd="no"
cat > $TMPC << EOF
#include
int main(void)
{
return BLOCK_URING_CMD_DISCARD;
}
EOF
if compile_prog "" "" "discard command"; then
discard_cmd="yes"
fi
print_config "io_uring discard command support" "$discard_cmd"
##########################################
# Check idtype_t support
has_idtype_t="no"
cat > $TMPC << EOF
#include
int main(void)
{
idtype_t v;
return 0;
}
EOF
if compile_prog "" "" "idtype_t"; then
has_idtype_t="yes"
fi
print_config "has_idtype_t" "$has_idtype_t"
#############################################################################
liburing_nolibc="no"
if test "$use_libc" != "yes"; then
#
# Currently, CONFIG_NOLIBC only supports x86-64, x86 (32-bit), aarch64 and riscv64.
#
cat > $TMPC << EOF
int main(void){
#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || (defined(__riscv) && __riscv_xlen == 64)
return 0;
#else
#error libc is needed
#endif
}
EOF
if compile_prog "" "" "nolibc"; then
liburing_nolibc="yes"
fi
fi
print_config "nolibc" "$liburing_nolibc";
#############################################################################
####################################################
# Most Android devices don't have sys/fanotify.h
has_fanotify="no"
cat > $TMPC << EOF
#include
int main(void)
{
return 0;
}
EOF
if compile_prog "" "" "fanotify"; then
has_fanotify="yes"
fi
print_config "has_fanotify" "$has_fanotify"
####################################################
##########################################
# check for ublk headers
ublk_header="no"
cat > $TMPC << EOF
#include
#include
#include
int main(int argc, char **argv)
{
struct ublksrv_ctrl_cmd cmd = { };
cmd.addr = UBLK_U_CMD_START_DEV;
return cmd.queue_id;
}
EOF
if compile_prog "" "" "ublk_header"; then
ublk_header="yes"
fi
print_config "ublk_header" "$ublk_header"
if test "$liburing_nolibc" = "yes"; then
output_sym "CONFIG_NOLIBC"
fi
if test "$__kernel_rwf_t" = "yes"; then
output_sym "CONFIG_HAVE_KERNEL_RWF_T"
fi
if test "$__kernel_timespec" = "yes"; then
output_sym "CONFIG_HAVE_KERNEL_TIMESPEC"
fi
if test "$open_how" = "yes"; then
output_sym "CONFIG_HAVE_OPEN_HOW"
fi
if test "$statx" = "yes"; then
output_sym "CONFIG_HAVE_STATX"
fi
if test "$glibc_statx" = "yes"; then
output_sym "CONFIG_HAVE_GLIBC_STATX"
fi
if test "$has_cxx" = "yes"; then
output_sym "CONFIG_HAVE_CXX"
fi
if test "$has_ucontext" = "yes"; then
output_sym "CONFIG_HAVE_UCONTEXT"
fi
if test "$stringop_overflow" = "yes"; then
output_sym "CONFIG_HAVE_STRINGOP_OVERFLOW"
fi
if test "$array_bounds" = "yes"; then
output_sym "CONFIG_HAVE_ARRAY_BOUNDS"
fi
if test "$nvme_uring_cmd" = "yes"; then
output_sym "CONFIG_HAVE_NVME_URING"
fi
if test "$has_fanotify" = "yes"; then
output_sym "CONFIG_HAVE_FANOTIFY"
fi
if test "$futexv" = "yes"; then
output_sym "CONFIG_HAVE_FUTEXV"
fi
if test "$ublk_header" = "yes"; then
output_sym "CONFIG_HAVE_UBLK_HEADER"
fi
if test "$use_sanitizer" = "yes"; then
output_sym "CONFIG_USE_SANITIZER"
print_config "use sanitizer" "yes"
else
print_config "use sanitizer" "no"
fi
echo "CC=$cc" >> $config_host_mak
print_config "CC" "$cc"
echo "CXX=$cxx" >> $config_host_mak
print_config "CXX" "$cxx"
# generate io_uring_version.h
# Reset MAKEFLAGS
MAKEFLAGS=
MAKE_PRINT_VARS="include Makefile.common\nprint-%%: ; @echo \$(\$*)\n"
VERSION_MAJOR=$(printf "$MAKE_PRINT_VARS" | make -s --no-print-directory -f - print-VERSION_MAJOR)
VERSION_MINOR=$(printf "$MAKE_PRINT_VARS" | make -s --no-print-directory -f - print-VERSION_MINOR)
io_uring_version_h="src/include/liburing/io_uring_version.h"
cat > $io_uring_version_h << EOF
/* SPDX-License-Identifier: MIT */
#ifndef LIBURING_VERSION_H
#define LIBURING_VERSION_H
#define IO_URING_VERSION_MAJOR $VERSION_MAJOR
#define IO_URING_VERSION_MINOR $VERSION_MINOR
#endif
EOF
# generate compat.h
#
# The header is built up piecewise: the include guard is written first, then
# one fragment is appended per feature flag, and the closing #endif last.
# NOTE(review): the flag variables tested below ($__kernel_rwf_t, $open_how,
# ...) are presumably set to "yes" by feature probes earlier in this script.
# NOTE(review): the bare "#include" lines inside the heredocs carry no header
# name in this copy of the script - verify them against the upstream file.
compat_h="src/include/liburing/compat.h"
cat > $compat_h << EOF
/* SPDX-License-Identifier: MIT */
#ifndef LIBURING_COMPAT_H
#define LIBURING_COMPAT_H
EOF
# Fallback typedef when __kernel_rwf_t was not detected.
if test "$__kernel_rwf_t" != "yes"; then
cat >> $compat_h << EOF
typedef int __kernel_rwf_t;
EOF
fi
# Either define struct __kernel_timespec locally or include the header that
# provides it; both branches define UAPI_LINUX_IO_URING_H_SKIP_LINUX_TIME_TYPES_H.
if test "$__kernel_timespec" != "yes"; then
cat >> $compat_h << EOF
#include
struct __kernel_timespec {
int64_t tv_sec;
long long tv_nsec;
};
/* is not available, so it can't be included */
#define UAPI_LINUX_IO_URING_H_SKIP_LINUX_TIME_TYPES_H 1
EOF
else
cat >> $compat_h << EOF
#include
/* is included above and not needed again */
#define UAPI_LINUX_IO_URING_H_SKIP_LINUX_TIME_TYPES_H 1
EOF
fi
# Local definition of struct open_how when it was not detected, otherwise an
# include only.
if test "$open_how" != "yes"; then
cat >> $compat_h << EOF
#include
struct open_how {
uint64_t flags;
uint64_t mode;
uint64_t resolve;
};
EOF
else cat >> $compat_h << EOF
#include
EOF
fi
# Extra include only when glibc_statx is "no" while statx is "yes".
if [ "$glibc_statx" = "no" ] && [ "$statx" = "yes" ]; then
cat >> $compat_h << EOF
#include
EOF
fi
# Fallback futex_waitv definitions when futexv was not detected.
if test "$futexv" != "yes"; then
cat >> $compat_h << EOF
#include
#define FUTEX_32 2
#define FUTEX_WAITV_MAX 128
struct futex_waitv {
uint64_t val;
uint64_t uaddr;
uint32_t flags;
uint32_t __reserved;
};
EOF
fi
# Fallback idtype_t enum when the type was not detected.
if test "$has_idtype_t" != "yes"; then
cat >> $compat_h << EOF
typedef enum
{
P_ALL, /* Wait for any child. */
P_PID, /* Wait for specified process. */
P_PGID /* Wait for members of process group. */
} idtype_t;
EOF
fi
# Define BLOCK_URING_CMD_DISCARD (guarded by #ifndef) when discard_cmd was not
# detected, otherwise emit an include only.
if test "$discard_cmd" != "yes"; then
cat >> $compat_h << EOF
#include
#ifndef BLOCK_URING_CMD_DISCARD
#define BLOCK_URING_CMD_DISCARD _IO(0x12, 0)
#endif
EOF
else cat >> $compat_h << EOF
#include
EOF
fi
# Close the LIBURING_COMPAT_H include guard.
cat >> $compat_h << EOF
#endif
EOF
liburing-2.9/debian/ 0000775 0000000 0000000 00000000000 14750134674 0014434 5 ustar 00root root 0000000 0000000 liburing-2.9/debian/README.Debian 0000664 0000000 0000000 00000000354 14750134674 0016477 0 ustar 00root root 0000000 0000000 liburing for Debian
The newest Linux IO interface, io_uring, needs a
userspace library to support it. This package,
liburing, is the library for io_uring.
-- Liu Changcheng Thu, 14 Nov 2019 21:35:39 +0800
liburing-2.9/debian/changelog 0000664 0000000 0000000 00000002153 14750134674 0016307 0 ustar 00root root 0000000 0000000 liburing (2.2-1) stable; urgency=low
* Update to 2.2
* Bump up so version to 2
* Drop liburing1-udeb
* Package using dh instead of using dh_* helpers manually
* Add linux header dependency to liburing-dev
* Bump up debhelper-compat level to 13
-- Kefu Chai Sun, 16 Oct 2022 16:30:48 +0800
liburing (0.7-1) stable; urgency=low
* Update to 0.7
* Fix library symlinks
-- Stefan Metzmacher Thu, 23 Jul 2020 00:23:00 +0200
liburing (0.4-2) stable; urgency=low
* Fix /usr/lib/*/liburing.so symlink to /lib/*/liburing.so.1.0.4
-- Stefan Metzmacher Fri, 07 Feb 2020 15:30:00 +0100
liburing (0.4-1) stable; urgency=low
* Package liburing-0.4 using a packaging layout similar to libaio1
-- Stefan Metzmacher Thu, 06 Feb 2020 11:30:00 +0100
liburing (0.2-1ubuntu1) stable; urgency=low
* Initial release.
* commit 4bce856d43ab1f9a64477aa5a8f9f02f53e64b74
* Author: Jens Axboe
* Date: Mon Nov 11 16:00:58 2019 -0700
-- Liu Changcheng Fri, 15 Nov 2019 00:06:46 +0800
liburing-2.9/debian/control 0000664 0000000 0000000 00000002312 14750134674 0016035 0 ustar 00root root 0000000 0000000 Source: liburing
Section: libs
Priority: optional
Maintainer: Liu Changcheng
Build-Depends:
debhelper-compat (= 13)
Standards-Version: 4.1.4
Homepage: https://git.kernel.dk/cgit/liburing/tree/README
Vcs-Git: https://git.kernel.dk/liburing
Vcs-Browser: https://git.kernel.dk/cgit/liburing/
Package: liburing2
Architecture: linux-any
Multi-Arch: same
Pre-Depends: ${misc:Pre-Depends}
Depends: ${misc:Depends}, ${shlibs:Depends}
Description: userspace library for using io_uring
io_uring is a kernel feature to improve development.
The newest Linux IO interface, io_uring, could improve
system performance a lot. liburing is the userspace
library to use the io_uring feature.
.
This package contains the shared library.
Package: liburing-dev
Section: libdevel
Architecture: linux-any
Multi-Arch: same
Depends:
${misc:Depends},
liburing2 (= ${binary:Version}),
linux-libc-dev (>= 5.1)
Description: userspace library for using io_uring
io_uring is a kernel feature to improve development.
The newest Linux IO interface, io_uring, could improve
system performance a lot. liburing is the userspace
library to use the io_uring feature.
.
This package contains the static library and the header files.
liburing-2.9/debian/copyright 0000664 0000000 0000000 00000004175 14750134674 0016376 0 ustar 00root root 0000000 0000000 Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Upstream-Name: liburing
Source: https://git.kernel.dk/cgit/liburing/
Files: *
Copyright: 2019 Jens Axboe
License: GPL-2+ / MIT
Files: debian/*
Copyright: 2019 Changcheng Liu
License: GPL-2+
License: GPL-2+
This package is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
.
This package is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>
.
On Debian systems, the complete text of the GNU General
Public License version 2 can be found in "/usr/share/common-licenses/GPL-2".
License: MIT
Copyright 2020 Jens Axboe
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
liburing-2.9/debian/liburing-dev.install 0000664 0000000 0000000 00000000103 14750134674 0020405 0 ustar 00root root 0000000 0000000 usr/include
usr/lib/*/lib*.so
usr/lib/*/lib*.a
usr/lib/*/pkgconfig
liburing-2.9/debian/liburing-dev.manpages 0000664 0000000 0000000 00000000240 14750134674 0020534 0 ustar 00root root 0000000 0000000 usr/share/man/man2/io_uring_*.2
usr/share/man/man3/io_uring_*.3
usr/share/man/man7/io_uring.7
usr/share/man/man3/IO_URING_*.3
usr/share/man/man3/__io_uring_*.3
liburing-2.9/debian/liburing2.install 0000664 0000000 0000000 00000000024 14750134674 0017715 0 ustar 00root root 0000000 0000000 usr/lib/*/lib*.so.*
liburing-2.9/debian/liburing2.symbols 0000664 0000000 0000000 00000005250 14750134674 0017745 0 ustar 00root root 0000000 0000000 liburing.so.2 liburing2 #MINVER# [47/1887]
LIBURING_2.0@LIBURING_2.0 0.7-1
LIBURING_2.1@LIBURING_2.1 0.7-1
LIBURING_2.2@LIBURING_2.2 0.7-1
LIBURING_2.3@LIBURING_2.3 0.7-1
__io_uring_get_cqe@LIBURING_2.0 0.7-1
__io_uring_sqring_wait@LIBURING_2.0 0.7-1
io_uring_enter2@LIBURING_2.3 0.7-1
io_uring_enter@LIBURING_2.3 0.7-1
io_uring_free_probe@LIBURING_2.0 0.7-1
io_uring_get_events@LIBURING_2.3 0.7-1
io_uring_get_probe@LIBURING_2.0 0.7-1
io_uring_get_probe_ring@LIBURING_2.0 0.7-1
io_uring_get_sqe@LIBURING_2.0 0.7-1
io_uring_mlock_size@LIBURING_2.1 0.7-1
io_uring_mlock_size_params@LIBURING_2.1 0.7-1
io_uring_peek_batch_cqe@LIBURING_2.0 0.7-1
io_uring_queue_exit@LIBURING_2.0 0.7-1
io_uring_queue_init@LIBURING_2.0 0.7-1
io_uring_queue_init_params@LIBURING_2.0 0.7-1
io_uring_queue_mmap@LIBURING_2.0 0.7-1
io_uring_register@LIBURING_2.3 0.7-1
io_uring_register_buf_ring@LIBURING_2.2 0.7-1
io_uring_register_buffers@LIBURING_2.0 0.7-1
io_uring_register_buffers_sparse@LIBURING_2.2 0.7-1
io_uring_register_buffers_tags@LIBURING_2.1 0.7-1
io_uring_register_buffers_update_tag@LIBURING_2.1 0.7-1
io_uring_register_eventfd@LIBURING_2.0 0.7-1
io_uring_register_eventfd_async@LIBURING_2.0 0.7-1
io_uring_register_file_alloc_range@LIBURING_2.3 0.7-1
io_uring_register_files@LIBURING_2.0 0.7-1
io_uring_register_files_sparse@LIBURING_2.2 0.7-1
io_uring_register_files_tags@LIBURING_2.1 0.7-1
io_uring_register_files_update@LIBURING_2.0 0.7-1
io_uring_register_files_update_tag@LIBURING_2.1 0.7-1
io_uring_register_iowq_aff@LIBURING_2.1 0.7-1
io_uring_register_iowq_max_workers@LIBURING_2.1 0.7-1
io_uring_register_personality@LIBURING_2.0 0.7-1
io_uring_register_probe@LIBURING_2.0 0.7-1
io_uring_register_ring_fd@LIBURING_2.2 0.7-1
io_uring_register_sync_cancel@LIBURING_2.3 0.7-1
io_uring_ring_dontfork@LIBURING_2.0 0.7-1
io_uring_setup@LIBURING_2.3 0.7-1
io_uring_submit@LIBURING_2.0 0.7-1
io_uring_submit_and_get_events@LIBURING_2.3 0.7-1
io_uring_submit_and_wait@LIBURING_2.0 0.7-1
io_uring_submit_and_wait_timeout@LIBURING_2.2 0.7-1
io_uring_unregister_buf_ring@LIBURING_2.2 0.7-1
io_uring_unregister_buffers@LIBURING_2.0 0.7-1
io_uring_unregister_eventfd@LIBURING_2.0 0.7-1
io_uring_unregister_files@LIBURING_2.0 0.7-1
io_uring_unregister_iowq_aff@LIBURING_2.1 0.7-1
io_uring_unregister_personality@LIBURING_2.0 0.7-1
io_uring_unregister_ring_fd@LIBURING_2.2 0.7-1
io_uring_wait_cqe_timeout@LIBURING_2.0 0.7-1
io_uring_wait_cqes@LIBURING_2.0 0.7-1
liburing-2.9/debian/patches/ 0000775 0000000 0000000 00000000000 14750134674 0016063 5 ustar 00root root 0000000 0000000 liburing-2.9/debian/patches/series 0000664 0000000 0000000 00000000101 14750134674 0017270 0 ustar 00root root 0000000 0000000 # You must remove unused comment lines for the released package.
liburing-2.9/debian/rules 0000775 0000000 0000000 00000001176 14750134674 0015521 0 ustar 00root root 0000000 0000000 #!/usr/bin/make -f
# Uncomment this to turn on verbose mode.
#export DH_VERBOSE=1
# Build flag tweaks: request the bindnow hardening option and prepend -Wall
# to the maintainer CFLAGS.
DEB_BUILD_MAINT_OPTIONS = hardening=+bindnow
DEB_CFLAGS_MAINT_PREPEND = -Wall
# nocheck is always appended, so the ifeq in override_dh_auto_test below is
# false and "make runtests" is not run during the package build.
DEB_BUILD_OPTIONS += nocheck
include /usr/share/dpkg/default.mk
include /usr/share/dpkg/buildtools.mk
# Catch-all target: hand every goal to dh.
%:
dh $@ --parallel
# liburing ships its own ./configure script; call it directly with the
# install paths, using the multiarch directory for both libdir and libdevdir.
override_dh_auto_configure:
./configure \
--prefix=/usr \
--includedir=/usr/include \
--datadir=/usr/share \
--mandir=/usr/share/man \
--libdir=/usr/lib/$(DEB_HOST_MULTIARCH) \
--libdevdir=/usr/lib/$(DEB_HOST_MULTIARCH) \
--cc=$(CC)
# Run the test suite only when nocheck is absent from DEB_BUILD_OPTIONS.
override_dh_auto_test:
ifeq (,$(filter nocheck,$(DEB_BUILD_OPTIONS)))
$(MAKE) runtests
endif
liburing-2.9/debian/source/ 0000775 0000000 0000000 00000000000 14750134674 0015734 5 ustar 00root root 0000000 0000000 liburing-2.9/debian/source/format 0000664 0000000 0000000 00000000014 14750134674 0017142 0 ustar 00root root 0000000 0000000 3.0 (quilt)
liburing-2.9/debian/source/local-options 0000664 0000000 0000000 00000000054 14750134674 0020441 0 ustar 00root root 0000000 0000000 #abort-on-upstream-changes
#unapply-patches
liburing-2.9/debian/source/options 0000664 0000000 0000000 00000000130 14750134674 0017344 0 ustar 00root root 0000000 0000000 extend-diff-ignore = "(^|/)(config\.log|config-host\.h|config-host\.mak|liburing\.pc)$"
liburing-2.9/debian/watch 0000664 0000000 0000000 00000000272 14750134674 0015466 0 ustar 00root root 0000000 0000000 # Site Directory Pattern Version Script
version=4
https://git.kernel.dk/cgit/liburing/ snapshot\/liburing-([\d\.]+)\.tar\.(?:gz|xz) debian uupdate
liburing-2.9/examples/ 0000775 0000000 0000000 00000000000 14750134674 0015030 5 ustar 00root root 0000000 0000000 liburing-2.9/examples/Makefile 0000664 0000000 0000000 00000002501 14750134674 0016466 0 ustar 00root root 0000000 0000000 CPPFLAGS ?=
# Always build against the in-tree headers, with GNU extensions enabled.
override CPPFLAGS += -D_GNU_SOURCE -I../src/include/
CFLAGS ?= -g -O2 -Wall
LDFLAGS ?=
override LDFLAGS += -L../src/ -luring
include ../Makefile.quiet
# config-host.mak is not included when the goal is "clean".
ifneq ($(MAKECMDGOALS),clean)
include ../config-host.mak
endif
# NOTE(review): LDFLAGS is extended a second time here, so "-L../src/ -luring"
# ends up on the link line twice; only -lpthread is new.
LDFLAGS ?=
override LDFLAGS += -L../src/ -luring -lpthread
# Sanitizer build: the same -fsanitize options go to compile and link flags.
ifeq ($(CONFIG_USE_SANITIZER),y)
override CFLAGS += -fsanitize=address,undefined -fno-omit-frame-pointer -fno-optimize-sibling-calls
override CPPFLAGS += -fsanitize=address,undefined -fno-omit-frame-pointer -fno-optimize-sibling-calls
override LDFLAGS += -fsanitize=address,undefined
endif
# Example programs that are always built.
example_srcs := \
io_uring-close-test.c \
io_uring-cp.c \
io_uring-test.c \
io_uring-udp.c \
link-cp.c \
napi-busy-poll-client.c \
napi-busy-poll-server.c \
poll-bench.c \
reg-wait.c \
send-zerocopy.c \
rsrc-update-bench.c \
proxy.c \
kdigest.c
all_targets :=
# ucontext-cp is only built when CONFIG_HAVE_UCONTEXT is defined ...
ifdef CONFIG_HAVE_UCONTEXT
example_srcs += ucontext-cp.c
endif
# ... but it is listed in all_targets unconditionally, so "clean" removes it
# (and helpers.o) regardless of the configuration.
all_targets += ucontext-cp helpers.o
# Binary names are the source names with the .c/.cc suffix stripped.
example_targets := $(patsubst %.c,%,$(patsubst %.cc,%,$(example_srcs)))
all_targets += $(example_targets)
helpers = helpers.o
all: $(example_targets)
helpers.o: helpers.c
$(QUIET_CC)$(CC) $(CPPFLAGS) $(CFLAGS) -o $@ -c $<
# Every example is linked with helpers.o and is rebuilt when the static
# library ../src/liburing.a changes.
%: %.c $(helpers) ../src/liburing.a
$(QUIET_CC)$(CC) $(CPPFLAGS) $(CFLAGS) -o $@ $< $(helpers) $(LDFLAGS)
clean:
@rm -f $(all_targets)
.PHONY: all clean
liburing-2.9/examples/helpers.c 0000664 0000000 0000000 00000002361 14750134674 0016640 0 ustar 00root root 0000000 0000000 /* SPDX-License-Identifier: MIT */
#include <fcntl.h>
#include <stdint.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "helpers.h"
/*
 * Create a TCP socket bound to the wildcard address on @port and put it in
 * the listening state.
 *
 * @port: port number in host byte order
 * @ipv6: non-zero selects AF_INET6, zero selects AF_INET
 *
 * SO_REUSEADDR is enabled before binding and the listen backlog is 1024.
 *
 * Returns the listening descriptor on success. On failure the failing call
 * is reported with perror(), the socket (if one was created) is closed so
 * the descriptor is not leaked, and -1 is returned.
 */
int setup_listening_socket(int port, int ipv6)
{
	struct sockaddr_in srv_addr = { 0 };
	struct sockaddr_in6 srv_addr6 = { 0 };
	int fd, enable, ret, domain;

	if (ipv6)
		domain = AF_INET6;
	else
		domain = AF_INET;

	fd = socket(domain, SOCK_STREAM, 0);
	if (fd == -1) {
		perror("socket()");
		return -1;
	}

	enable = 1;
	ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int));
	if (ret < 0) {
		perror("setsockopt(SO_REUSEADDR)");
		goto err_close;
	}

	if (ipv6) {
		srv_addr6.sin6_family = AF_INET6;
		srv_addr6.sin6_port = htons(port);
		srv_addr6.sin6_addr = in6addr_any;
		ret = bind(fd, (const struct sockaddr *)&srv_addr6, sizeof(srv_addr6));
	} else {
		srv_addr.sin_family = AF_INET;
		srv_addr.sin_port = htons(port);
		srv_addr.sin_addr.s_addr = htonl(INADDR_ANY);
		ret = bind(fd, (const struct sockaddr *)&srv_addr, sizeof(srv_addr));
	}
	if (ret < 0) {
		perror("bind()");
		goto err_close;
	}

	if (listen(fd, 1024) < 0) {
		perror("listen()");
		goto err_close;
	}

	return fd;

err_close:
	/* perror() has already run, so close() clobbering errno is harmless */
	close(fd);
	return -1;
}
liburing-2.9/examples/helpers.h 0000664 0000000 0000000 00000000230 14750134674 0016636 0 ustar 00root root 0000000 0000000 /* SPDX-License-Identifier: MIT */
#ifndef LIBURING_EX_HELPERS_H
#define LIBURING_EX_HELPERS_H
/*
 * Create a TCP socket bound to the wildcard address on 'port' and put it in
 * the listening state. A non-zero 'ipv6' selects AF_INET6, otherwise AF_INET
 * is used. Returns the socket descriptor, or -1 on failure.
 */
int setup_listening_socket(int port, int ipv6);
#endif
liburing-2.9/examples/io_uring-close-test.c 0000664 0000000 0000000 00000004747 14750134674 0021103 0 ustar 00root root 0000000 0000000 /* SPDX-License-Identifier: MIT */
/*
* Simple app that demonstrates how to setup an io_uring interface, and use it
* via a registered ring fd, without leaving the original fd open.
*
* gcc -Wall -O2 -D_GNU_SOURCE -o io_uring-close-test io_uring-close-test.c -luring
*/
#include
#include
#include
#include
#include
#include
#include
#include "liburing.h"
#define QD 4
/*
 * Read up to QD blocks of 4096 bytes from the start of argv[1] through an
 * io_uring whose ring fd has been registered and then closed, and print how
 * many requests were submitted and completed and how many bytes were read.
 *
 * Returns 1 on a usage error or when setup, allocation, submission or
 * waiting fails; otherwise 0.
 */
int main(int argc, char *argv[])
{
	struct io_uring ring;
	int i, fd, ret, pending, done;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct iovec *iovecs;
	struct stat sb;
	ssize_t fsize;
	off_t offset;
	void *buf;

	if (argc < 2) {
		printf("%s: file\n", argv[0]);
		return 1;
	}

	ret = io_uring_queue_init(QD, &ring, 0);
	if (ret < 0) {
		fprintf(stderr, "queue_init: %s\n", strerror(-ret));
		return 1;
	}

	/* from here on the ring is used through its registered index only */
	ret = io_uring_register_ring_fd(&ring);
	if (ret < 0) {
		fprintf(stderr, "register_ring_fd: %s\n", strerror(-ret));
		return 1;
	}
	ret = io_uring_close_ring_fd(&ring);
	if (ret < 0) {
		fprintf(stderr, "close_ring_fd: %s\n", strerror(-ret));
		return 1;
	}

	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	if (fstat(fd, &sb) < 0) {
		perror("fstat");
		return 1;
	}

	iovecs = calloc(QD, sizeof(struct iovec));
	if (!iovecs) {
		/* previously unchecked: the loop below would dereference NULL */
		perror("calloc");
		return 1;
	}
	for (i = 0; i < QD; i++) {
		if (posix_memalign(&buf, 4096, 4096))
			return 1;
		iovecs[i].iov_base = buf;
		iovecs[i].iov_len = 4096;
	}

	/* queue one read per buffer until the ring is full or the file is covered */
	offset = 0;
	i = 0;
	do {
		sqe = io_uring_get_sqe(&ring);
		if (!sqe)
			break;
		io_uring_prep_readv(sqe, fd, &iovecs[i], 1, offset);
		offset += iovecs[i].iov_len;
		i++;
		if (offset > sb.st_size)
			break;
	} while (1);

	ret = io_uring_submit(&ring);
	if (ret < 0) {
		fprintf(stderr, "io_uring_submit: %s\n", strerror(-ret));
		return 1;
	} else if (ret != i) {
		fprintf(stderr, "io_uring_submit submitted less %d\n", ret);
		return 1;
	}

	done = 0;
	pending = ret;
	fsize = 0;
	for (i = 0; i < pending; i++) {
		ret = io_uring_wait_cqe(&ring, &cqe);
		if (ret < 0) {
			fprintf(stderr, "io_uring_wait_cqe: %s\n", strerror(-ret));
			return 1;
		}

		done++;
		ret = 0;
		/* a short result is only accepted when it lands exactly on EOF */
		if (cqe->res != 4096 && cqe->res + fsize != sb.st_size) {
			fprintf(stderr, "ret=%d, wanted 4096\n", cqe->res);
			ret = 1;
		}
		fsize += cqe->res;
		io_uring_cqe_seen(&ring, cqe);
		if (ret)
			break;
	}

	printf("Submitted=%d, completed=%d, bytes=%lu\n", pending, done,
	       (unsigned long) fsize);
	close(fd);
	io_uring_queue_exit(&ring);
	/* release the read buffers, matching what io_uring-test.c does */
	for (i = 0; i < QD; i++)
		free(iovecs[i].iov_base);
	free(iovecs);
	return 0;
}
liburing-2.9/examples/io_uring-cp.c 0000664 0000000 0000000 00000012316 14750134674 0017412 0 ustar 00root root 0000000 0000000 /* SPDX-License-Identifier: MIT */
/*
* gcc -Wall -O2 -D_GNU_SOURCE -o io_uring-cp io_uring-cp.c -luring
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "liburing.h"
#define QD 64
#define BS (32*1024)
static int infd, outfd;
/*
 * State of one copy chunk. queue_read() allocates it together with its data
 * buffer in a single malloc(), the buffer starting right after the struct.
 */
struct io_data {
/* non-zero while the chunk is being read, 0 once it is queued as a write */
int read;
/*
 * first_offset: file offset of the whole chunk.
 * offset: file offset of the part still outstanding; it is advanced after a
 * short read/write and reset to first_offset by queue_write().
 */
off_t first_offset, offset;
/* length of the whole chunk in bytes */
size_t first_len;
/* part of the buffer still to be transferred; shrinks on short transfers */
struct iovec iov;
};
/*
 * Initialise @ring with @entries submission entries and no setup flags.
 * Returns 0 on success; on failure the reason is printed to stderr and -1
 * is returned.
 */
static int setup_context(unsigned entries, struct io_uring *ring)
{
	int err = io_uring_queue_init(entries, ring, 0);

	if (err >= 0)
		return 0;

	fprintf(stderr, "queue_init: %s\n", strerror(-err));
	return -1;
}
/*
 * Store the size in bytes of the file behind @fd in @size.
 *
 * Regular files report st_size; block devices are queried with the
 * BLKGETSIZE64 ioctl. Returns 0 on success, or -1 when fstat() or the ioctl
 * fails or when @fd is any other kind of file (in which case @size is left
 * untouched).
 */
static int get_file_size(int fd, off_t *size)
{
	struct stat info;
	unsigned long long nbytes;

	if (fstat(fd, &info) < 0)
		return -1;

	if (S_ISREG(info.st_mode)) {
		*size = info.st_size;
		return 0;
	}

	if (!S_ISBLK(info.st_mode))
		return -1;

	if (ioctl(fd, BLKGETSIZE64, &nbytes) != 0)
		return -1;

	*size = nbytes;
	return 0;
}
/*
 * Grab a submission entry and prepare it from the current state of @data:
 * a readv from infd when data->read is set, a writev to outfd otherwise,
 * both covering data->iov at data->offset. @data becomes the entry's user
 * data. The entry is not submitted here. Asserts that an entry is available.
 */
static void queue_prepped(struct io_uring *ring, struct io_data *data)
{
	struct io_uring_sqe *slot = io_uring_get_sqe(ring);

	assert(slot);

	if (!data->read)
		io_uring_prep_writev(slot, outfd, &data->iov, 1, data->offset);
	else
		io_uring_prep_readv(slot, infd, &data->iov, 1, data->offset);

	io_uring_sqe_set_data(slot, data);
}
/*
 * Allocate the bookkeeping and buffer for one chunk of @size bytes at file
 * offset @offset and prepare (but do not submit) a readv for it on @ring.
 *
 * The buffer lives in the same allocation, directly after the io_data.
 * Returns 0 on success, 1 when the allocation fails or no submission entry
 * is free (the allocation is released again in that case).
 */
static int queue_read(struct io_uring *ring, off_t size, off_t offset)
{
	struct io_data *req = malloc(size + sizeof(*req));
	struct io_uring_sqe *slot;

	if (!req)
		return 1;

	slot = io_uring_get_sqe(ring);
	if (!slot) {
		free(req);
		return 1;
	}

	req->read = 1;
	req->first_offset = offset;
	req->offset = offset;
	req->first_len = size;
	req->iov.iov_base = req + 1;
	req->iov.iov_len = size;

	io_uring_prep_readv(slot, infd, &req->iov, 1, offset);
	io_uring_sqe_set_data(slot, req);
	return 0;
}
/*
 * Turn a completed read into the matching write and submit it.
 *
 * The iovec and offset are rewound to cover the whole chunk again, because
 * handling of short reads may have advanced them.
 */
static void queue_write(struct io_uring *ring, struct io_data *data)
{
	data->iov.iov_len = data->first_len;
	data->iov.iov_base = data + 1;
	data->offset = data->first_offset;
	data->read = 0;

	queue_prepped(ring, data);
	io_uring_submit(ring);
}
/*
 * Copy @insize bytes from infd to outfd in chunks of at most BS bytes,
 * keeping at most QD reads plus writes in flight on @ring.
 *
 * Each pass of the outer loop first queues as many reads as fit, then reaps
 * completions: a finished read is turned into a write of the same chunk, a
 * finished write frees its chunk. Once every chunk has had its write queued,
 * the remaining writes are waited for.
 *
 * Returns 0 on success and 1 when waiting/peeking for a completion fails or
 * a request completes with an error other than -EAGAIN.
 */
static int copy_file(struct io_uring *ring, off_t insize)
{
unsigned long reads, writes;
struct io_uring_cqe *cqe;
off_t write_left, offset;
int ret;
/* insize counts bytes not yet queued for reading, write_left bytes whose
 * write has not been queued yet */
write_left = insize;
writes = reads = offset = 0;
while (insize || write_left) {
unsigned long had_reads;
int got_comp;
/*
* Queue up as many reads as we can
*/
had_reads = reads;
while (insize) {
off_t this_size = insize;
if (reads + writes >= QD)
break;
if (this_size > BS)
this_size = BS;
else if (!this_size)
break;
if (queue_read(ring, this_size, offset))
break;
insize -= this_size;
offset += this_size;
reads++;
}
/* only submit when the loop above actually queued something */
if (had_reads != reads) {
ret = io_uring_submit(ring);
if (ret < 0) {
fprintf(stderr, "io_uring_submit: %s\n", strerror(-ret));
break;
}
}
/*
* Queue is full at this point. Find at least one completion.
*/
got_comp = 0;
while (write_left) {
struct io_data *data;
/* block for the first completion, then only drain what is already there */
if (!got_comp) {
ret = io_uring_wait_cqe(ring, &cqe);
got_comp = 1;
} else {
ret = io_uring_peek_cqe(ring, &cqe);
if (ret == -EAGAIN) {
cqe = NULL;
ret = 0;
}
}
if (ret < 0) {
fprintf(stderr, "io_uring_peek_cqe: %s\n",
strerror(-ret));
return 1;
}
if (!cqe)
break;
data = io_uring_cqe_get_data(cqe);
if (cqe->res < 0) {
/* -EAGAIN: resubmit the very same request unchanged */
if (cqe->res == -EAGAIN) {
queue_prepped(ring, data);
io_uring_submit(ring);
io_uring_cqe_seen(ring, cqe);
continue;
}
fprintf(stderr, "cqe failed: %s\n",
strerror(-cqe->res));
return 1;
} else if ((size_t)cqe->res != data->iov.iov_len) {
/* Short read/write, adjust and requeue */
data->iov.iov_base += cqe->res;
data->iov.iov_len -= cqe->res;
data->offset += cqe->res;
queue_prepped(ring, data);
io_uring_submit(ring);
io_uring_cqe_seen(ring, cqe);
continue;
}
/*
* All done. if write, nothing else to do. if read,
* queue up corresponding write.
*/
if (data->read) {
queue_write(ring, data);
write_left -= data->first_len;
reads--;
writes++;
} else {
free(data);
writes--;
}
io_uring_cqe_seen(ring, cqe);
}
}
/* wait out pending writes */
while (writes) {
struct io_data *data;
ret = io_uring_wait_cqe(ring, &cqe);
if (ret) {
fprintf(stderr, "wait_cqe=%d\n", ret);
return 1;
}
if (cqe->res < 0) {
fprintf(stderr, "write res=%d\n", cqe->res);
return 1;
}
data = io_uring_cqe_get_data(cqe);
free(data);
writes--;
io_uring_cqe_seen(ring, cqe);
}
return 0;
}
/*
 * Usage: io_uring-cp infile outfile
 *
 * Opens the input read-only and the output write-only (created or truncated,
 * mode 0644), sets up a ring with QD entries and copies the input to the
 * output. Returns 1 on a usage, open or setup error, otherwise the result
 * of copy_file().
 */
int main(int argc, char *argv[])
{
	const char *src_path, *dst_path;
	struct io_uring ring;
	off_t insize;
	int ret;

	if (argc <= 2) {
		printf("%s: infile outfile\n", argv[0]);
		return 1;
	}
	src_path = argv[1];
	dst_path = argv[2];

	infd = open(src_path, O_RDONLY);
	if (infd < 0) {
		perror("open infile");
		return 1;
	}

	outfd = open(dst_path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (outfd < 0) {
		perror("open outfile");
		return 1;
	}

	/* ring first, then the input size; either failure ends the program */
	if (setup_context(QD, &ring) || get_file_size(infd, &insize))
		return 1;

	ret = copy_file(&ring, insize);

	close(infd);
	close(outfd);
	io_uring_queue_exit(&ring);
	return ret;
}
liburing-2.9/examples/io_uring-test.c 0000664 0000000 0000000 00000004426 14750134674 0017772 0 ustar 00root root 0000000 0000000 /* SPDX-License-Identifier: MIT */
/*
* Simple app that demonstrates how to setup an io_uring interface,
* submit and complete IO against it, and then tear it down.
*
* gcc -Wall -O2 -D_GNU_SOURCE -o io_uring-test io_uring-test.c -luring
*/
#include
#include
#include
#include
#include
#include
#include
#include "liburing.h"
#define QD 4
/*
 * Read up to QD blocks of 4096 bytes from the start of argv[1] (opened with
 * O_DIRECT) through an io_uring, and print how many requests were submitted
 * and completed and how many bytes were read.
 *
 * Returns 1 on a usage error or when setup, allocation, submission or
 * waiting fails; otherwise 0.
 */
int main(int argc, char *argv[])
{
	struct io_uring ring;
	int i, fd, ret, pending, done;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct iovec *iovecs;
	struct stat sb;
	ssize_t fsize;
	off_t offset;
	void *buf;

	if (argc < 2) {
		printf("%s: file\n", argv[0]);
		return 1;
	}

	ret = io_uring_queue_init(QD, &ring, 0);
	if (ret < 0) {
		fprintf(stderr, "queue_init: %s\n", strerror(-ret));
		return 1;
	}

	fd = open(argv[1], O_RDONLY | O_DIRECT);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	if (fstat(fd, &sb) < 0) {
		perror("fstat");
		return 1;
	}

	iovecs = calloc(QD, sizeof(struct iovec));
	if (!iovecs) {
		/* previously unchecked: the loop below would dereference NULL */
		perror("calloc");
		return 1;
	}
	/* 4096-byte aligned buffers, as the file is opened with O_DIRECT */
	for (i = 0; i < QD; i++) {
		if (posix_memalign(&buf, 4096, 4096))
			return 1;
		iovecs[i].iov_base = buf;
		iovecs[i].iov_len = 4096;
	}

	/* queue one read per buffer until the ring is full or the file is covered */
	offset = 0;
	i = 0;
	do {
		sqe = io_uring_get_sqe(&ring);
		if (!sqe)
			break;
		io_uring_prep_readv(sqe, fd, &iovecs[i], 1, offset);
		offset += iovecs[i].iov_len;
		i++;
		if (offset >= sb.st_size)
			break;
	} while (1);

	ret = io_uring_submit(&ring);
	if (ret < 0) {
		fprintf(stderr, "io_uring_submit: %s\n", strerror(-ret));
		return 1;
	} else if (ret != i) {
		fprintf(stderr, "io_uring_submit submitted less %d\n", ret);
		return 1;
	}

	done = 0;
	pending = ret;
	fsize = 0;
	for (i = 0; i < pending; i++) {
		ret = io_uring_wait_cqe(&ring, &cqe);
		if (ret < 0) {
			fprintf(stderr, "io_uring_wait_cqe: %s\n", strerror(-ret));
			return 1;
		}

		done++;
		ret = 0;
		/* a short result is only accepted when it lands exactly on EOF */
		if (cqe->res != 4096 && cqe->res + fsize != sb.st_size) {
			fprintf(stderr, "ret=%d, wanted 4096\n", cqe->res);
			ret = 1;
		}
		fsize += cqe->res;
		io_uring_cqe_seen(&ring, cqe);
		if (ret)
			break;
	}

	printf("Submitted=%d, completed=%d, bytes=%lu\n", pending, done,
	       (unsigned long) fsize);
	close(fd);
	io_uring_queue_exit(&ring);
	for (i = 0; i < QD; i++)
		free(iovecs[i].iov_base);
	free(iovecs);
	return 0;
}
liburing-2.9/examples/io_uring-udp.c 0000664 0000000 0000000 00000021167 14750134674 0017604 0 ustar 00root root 0000000 0000000 /* SPDX-License-Identifier: MIT */
#include
#include
#include
#include
#include
#include
#include
#include
#include "liburing.h"
#define QD 64
#define BUF_SHIFT 12 /* 4k */
#define CQES (QD * 16)
#define BUFFERS CQES
#define CONTROLLEN 0
/*
 * Storage for one in-flight sendmsg: the msghdr handed to the send request
 * and the single iovec that msg.msg_iov points at.
 */
struct sendmsg_ctx {
struct msghdr msg;
struct iovec iov;
};
/* Everything the UDP example needs: the ring, its buffer pool and options. */
struct ctx {
struct io_uring ring;
/* start of the mmap()ed area; the buffer ring lives at its beginning */
struct io_uring_buf_ring *buf_ring;
/* first data buffer, located after BUFFERS io_uring_buf entries */
unsigned char *buffer_base;
/* msghdr given to the multishot recvmsg and to the io_uring_recvmsg_*() calls */
struct msghdr msg;
/* log2 of the size of one buffer (-b option, default BUF_SHIFT) */
int buf_shift;
/* AF_INET, or AF_INET6 when -6 is given */
int af;
/* -v: log every received datagram */
bool verbose;
/* send state per buffer, indexed by buffer id */
struct sendmsg_ctx send[BUFFERS];
/* size in bytes of the area mapped at buf_ring */
size_t buf_ring_size;
};
/* Size in bytes of one pool buffer: 2 to the power of ctx->buf_shift. */
static size_t buffer_size(struct ctx *ctx)
{
	const int shift = ctx->buf_shift;

	return 1U << shift;
}
/* Address of pool buffer number @idx; buffers are laid out back to back. */
static unsigned char *get_buffer(struct ctx *ctx, int idx)
{
	int byte_off = idx << ctx->buf_shift;

	return &ctx->buffer_base[byte_off];
}
/*
 * Map one anonymous area that holds the buffer ring (BUFFERS entries)
 * followed by BUFFERS data buffers of buffer_size() bytes each, register the
 * ring with ctx->ring as buffer group 0 and hand every buffer to the kernel.
 *
 * Returns 0 on success, -1 when the mapping fails, or the error returned by
 * io_uring_register_buf_ring(). On failure nothing stays mapped and the
 * buffer pointers in @ctx are reset to NULL.
 */
static int setup_buffer_pool(struct ctx *ctx)
{
	struct io_uring_buf_reg reg;
	void *mapped;
	int ret, i;

	ctx->buf_ring_size = (sizeof(struct io_uring_buf) + buffer_size(ctx)) * BUFFERS;
	/* fd is -1: the portable value for an anonymous mapping */
	mapped = mmap(NULL, ctx->buf_ring_size, PROT_READ | PROT_WRITE,
		      MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (mapped == MAP_FAILED) {
		fprintf(stderr, "buf_ring mmap: %s\n", strerror(errno));
		return -1;
	}
	ctx->buf_ring = (struct io_uring_buf_ring *)mapped;

	io_uring_buf_ring_init(ctx->buf_ring);

	reg = (struct io_uring_buf_reg) {
		.ring_addr = (unsigned long)ctx->buf_ring,
		.ring_entries = BUFFERS,
		.bgid = 0
	};
	/* the data buffers start right behind the ring entries */
	ctx->buffer_base = (unsigned char *)ctx->buf_ring +
			   sizeof(struct io_uring_buf) * BUFFERS;

	ret = io_uring_register_buf_ring(&ctx->ring, &reg, 0);
	if (ret) {
		fprintf(stderr, "buf_ring init failed: %s\n"
				"NB This requires a kernel version >= 6.0\n",
				strerror(-ret));
		/* the caller does not run cleanup_context() on failure */
		munmap(mapped, ctx->buf_ring_size);
		ctx->buf_ring = NULL;
		ctx->buffer_base = NULL;
		return ret;
	}

	for (i = 0; i < BUFFERS; i++) {
		io_uring_buf_ring_add(ctx->buf_ring, get_buffer(ctx, i), buffer_size(ctx), i,
				      io_uring_buf_ring_mask(BUFFERS), i);
	}
	/* publish all BUFFERS entries added above in one go */
	io_uring_buf_ring_advance(ctx->buf_ring, BUFFERS);

	return 0;
}
/*
 * Create the ring (QD submission entries, QD * 8 completion entries) and the
 * buffer pool, and initialise the msghdr used for receiving.
 *
 * Returns 0 on success or the error from ring or buffer pool setup. If the
 * buffer pool cannot be set up, the ring is torn down again before
 * returning.
 */
static int setup_context(struct ctx *ctx)
{
	struct io_uring_params params;
	int ret;

	memset(&params, 0, sizeof(params));
	params.cq_entries = QD * 8;
	params.flags = IORING_SETUP_SUBMIT_ALL | IORING_SETUP_COOP_TASKRUN |
		       IORING_SETUP_CQSIZE;

	ret = io_uring_queue_init_params(QD, &ctx->ring, &params);
	if (ret < 0) {
		fprintf(stderr, "queue_init failed: %s\n"
				"NB: This requires a kernel version >= 6.0\n",
				strerror(-ret));
		return ret;
	}

	ret = setup_buffer_pool(ctx);
	if (ret)
		io_uring_queue_exit(&ctx->ring);

	/* no msg_name buffer is set; only the lengths are filled in */
	memset(&ctx->msg, 0, sizeof(ctx->msg));
	ctx->msg.msg_namelen = sizeof(struct sockaddr_storage);
	ctx->msg.msg_controllen = CONTROLLEN;
	return ret;
}
/*
 * Create a UDP socket of family @af bound to the wildcard address.
 *
 * @port <= 0 binds to port 0, so the kernel picks one; the chosen port is
 * then looked up with getsockname() and printed to stderr.
 *
 * Returns the socket descriptor, or -1 (after printing the reason and
 * closing the socket) when socket(), bind() or getsockname() fails.
 */
static int setup_sock(int af, int port)
{
	uint16_t nport = 0;
	struct sockaddr_storage bound;
	socklen_t bound_len = sizeof(bound);
	int bound_port;
	int fd, err;

	if (port > 0)
		nport = htons(port);

	fd = socket(af, SOCK_DGRAM, 0);
	if (fd < 0) {
		fprintf(stderr, "sock_init: %s\n", strerror(errno));
		return -1;
	}

	if (af != AF_INET6) {
		struct sockaddr_in addr = {
			.sin_family = af,
			.sin_port = nport,
			.sin_addr = { INADDR_ANY }
		};

		err = bind(fd, (struct sockaddr *) &addr, sizeof(addr));
	} else {
		struct sockaddr_in6 addr6 = {
			.sin6_family = af,
			.sin6_port = nport,
			.sin6_addr = IN6ADDR_ANY_INIT
		};

		err = bind(fd, (struct sockaddr *) &addr6, sizeof(addr6));
	}

	if (err) {
		fprintf(stderr, "sock_bind: %s\n", strerror(errno));
		close(fd);
		return -1;
	}

	/* an explicit port needs no lookup */
	if (port > 0)
		return fd;

	if (getsockname(fd, (struct sockaddr *)&bound, &bound_len)) {
		fprintf(stderr, "getsockname failed\n");
		close(fd);
		return -1;
	}

	/* the port is read through sockaddr_in for both address families */
	bound_port = ntohs(((struct sockaddr_in *)&bound)->sin_port);
	fprintf(stderr, "port bound to %d\n", bound_port);
	return fd;
}
/* Unmap the buffer pool area, then tear down the ring. */
static void cleanup_context(struct ctx *ctx)
{
	void *pool = ctx->buf_ring;
	size_t pool_len = ctx->buf_ring_size;

	munmap(pool, pool_len);
	io_uring_queue_exit(&ctx->ring);
}
/*
 * Fetch a submission entry into *@sqe. If none is free, the pending entries
 * are submitted once and the fetch is retried.
 *
 * Note the inverted sense: returns false on success and true (after printing
 * an error) when no entry could be obtained; *@sqe is NULL in that case.
 */
static bool get_sqe(struct ctx *ctx, struct io_uring_sqe **sqe)
{
	struct io_uring_sqe *slot = io_uring_get_sqe(&ctx->ring);

	if (!slot) {
		io_uring_submit(&ctx->ring);
		slot = io_uring_get_sqe(&ctx->ring);
	}
	*sqe = slot;

	if (slot)
		return false;

	fprintf(stderr, "cannot get sqe\n");
	return true;
}
/*
 * Queue a multishot recvmsg on registered file index @idx that takes its
 * buffers from buffer group 0. The request's user data is BUFFERS + 1, which
 * process_cqe() uses to tell receive completions from send completions.
 *
 * Returns 0 on success, -1 when no submission entry is available.
 */
static int add_recv(struct ctx *ctx, int idx)
{
	struct io_uring_sqe *sqe;
	bool no_entry = get_sqe(ctx, &sqe);

	if (no_entry)
		return -1;

	io_uring_prep_recvmsg_multishot(sqe, idx, &ctx->msg, MSG_TRUNC);
	sqe->buf_group = 0;
	sqe->flags |= IOSQE_FIXED_FILE | IOSQE_BUFFER_SELECT;
	io_uring_sqe_set_data64(sqe, BUFFERS + 1);
	return 0;
}
/* Give pool buffer @idx back to the buffer ring and publish that one entry. */
static void recycle_buffer(struct ctx *ctx, int idx)
{
	unsigned char *buf = get_buffer(ctx, idx);
	size_t len = buffer_size(ctx);

	io_uring_buf_ring_add(ctx->buf_ring, buf, len, idx,
			      io_uring_buf_ring_mask(BUFFERS), 0);
	io_uring_buf_ring_advance(ctx->buf_ring, 1);
}
/*
 * Handle the completion of a send. The user data of a send is the id of the
 * buffer it used, so that buffer is recycled here. A failed send is only
 * reported; the function always returns 0.
 */
static int process_cqe_send(struct ctx *ctx, struct io_uring_cqe *cqe)
{
	int buf_id = cqe->user_data;
	int res = cqe->res;

	if (res < 0)
		fprintf(stderr, "bad send %s\n", strerror(-res));

	recycle_buffer(ctx, buf_id);
	return 0;
}
/*
 * Handle one completion of the multishot recvmsg on registered file @fdidx.
 *
 * A valid datagram is echoed back to its sender with a sendmsg that reuses
 * the receive buffer; the buffer is recycled later, in process_cqe_send().
 * Datagrams with a truncated name or payload are dropped and their buffer
 * is recycled right away.
 *
 * Returns 0 to keep going, -1 on an unrecoverable completion or when no
 * submission entry is available, or the error from re-arming the receive.
 */
static int process_cqe_recv(struct ctx *ctx, struct io_uring_cqe *cqe,
int fdidx)
{
int ret, idx;
struct io_uring_recvmsg_out *o;
struct io_uring_sqe *sqe;
/* without F_MORE the multishot request has ended: queue a new one */
if (!(cqe->flags & IORING_CQE_F_MORE)) {
ret = add_recv(ctx, fdidx);
if (ret)
return ret;
}
/* -ENOBUFS is not treated as an error; there is no buffer to process */
if (cqe->res == -ENOBUFS)
return 0;
if (!(cqe->flags & IORING_CQE_F_BUFFER) || cqe->res < 0) {
fprintf(stderr, "recv cqe bad res %d\n", cqe->res);
if (cqe->res == -EFAULT || cqe->res == -EINVAL)
fprintf(stderr,
"NB: This requires a kernel version >= 6.0\n");
return -1;
}
/* the bits of cqe->flags above bit 15 are used as the buffer index */
idx = cqe->flags >> 16;
o = io_uring_recvmsg_validate(get_buffer(ctx, cqe->flags >> 16),
cqe->res, &ctx->msg);
if (!o) {
fprintf(stderr, "bad recvmsg\n");
return -1;
}
if (o->namelen > ctx->msg.msg_namelen) {
fprintf(stderr, "truncated name\n");
recycle_buffer(ctx, idx);
return 0;
}
if (o->flags & MSG_TRUNC) {
unsigned int r;
r = io_uring_recvmsg_payload_length(o, cqe->res, &ctx->msg);
fprintf(stderr, "truncated msg need %u received %u\n",
o->payloadlen, r);
recycle_buffer(ctx, idx);
return 0;
}
if (ctx->verbose) {
/* the same name bytes are viewed as IPv4 or IPv6 depending on ctx->af */
struct sockaddr_in *addr = io_uring_recvmsg_name(o);
struct sockaddr_in6 *addr6 = (void *)addr;
char buff[INET6_ADDRSTRLEN + 1];
const char *name;
void *paddr;
if (ctx->af == AF_INET6)
paddr = &addr6->sin6_addr;
else
paddr = &addr->sin_addr;
name = inet_ntop(ctx->af, paddr, buff, sizeof(buff));
if (!name)
name = "";
fprintf(stderr, "received %u bytes %d from [%s]:%d\n",
io_uring_recvmsg_payload_length(o, cqe->res, &ctx->msg),
o->namelen, name, (int)ntohs(addr->sin_port));
}
if (get_sqe(ctx, &sqe))
return -1;
/* echo: payload and peer address both point into the receive buffer */
ctx->send[idx].iov = (struct iovec) {
.iov_base = io_uring_recvmsg_payload(o, &ctx->msg),
.iov_len =
io_uring_recvmsg_payload_length(o, cqe->res, &ctx->msg)
};
ctx->send[idx].msg = (struct msghdr) {
.msg_namelen = o->namelen,
.msg_name = io_uring_recvmsg_name(o),
.msg_control = NULL,
.msg_controllen = 0,
.msg_iov = &ctx->send[idx].iov,
.msg_iovlen = 1
};
io_uring_prep_sendmsg(sqe, fdidx, &ctx->send[idx].msg, 0);
/* user data below BUFFERS marks a send and names the buffer to recycle */
io_uring_sqe_set_data64(sqe, idx);
sqe->flags |= IOSQE_FIXED_FILE;
return 0;
}
/*
 * Dispatch one completion: user data below BUFFERS identifies a send (the
 * value is its buffer id), anything else is a receive completion.
 */
static int process_cqe(struct ctx *ctx, struct io_uring_cqe *cqe, int fdidx)
{
	if (cqe->user_data >= BUFFERS)
		return process_cqe_recv(ctx, cqe, fdidx);

	return process_cqe_send(ctx, cqe);
}
int main(int argc, char *argv[])
{
struct ctx ctx;
int ret;
int port = -1;
int sockfd;
int opt;
struct io_uring_cqe *cqes[CQES];
unsigned int count, i;
memset(&ctx, 0, sizeof(ctx));
ctx.verbose = false;
ctx.af = AF_INET;
ctx.buf_shift = BUF_SHIFT;
while ((opt = getopt(argc, argv, "6vp:b:")) != -1) {
switch (opt) {
case '6':
ctx.af = AF_INET6;
break;
case 'p':
port = atoi(optarg);
break;
case 'b':
ctx.buf_shift = atoi(optarg);
break;
case 'v':
ctx.verbose = true;
break;
default:
fprintf(stderr, "Usage: %s [-p port] "
"[-b log2(BufferSize)] [-6] [-v]\n",
argv[0]);
exit(-1);
}
}
sockfd = setup_sock(ctx.af, port);
if (sockfd < 0)
return 1;
if (setup_context(&ctx)) {
close(sockfd);
return 1;
}
ret = io_uring_register_files(&ctx.ring, &sockfd, 1);
if (ret) {
fprintf(stderr, "register files: %s\n", strerror(-ret));
return -1;
}
ret = add_recv(&ctx, 0);
if (ret)
return 1;
while (true) {
ret = io_uring_submit_and_wait(&ctx.ring, 1);
if (ret == -EINTR)
continue;
if (ret < 0) {
fprintf(stderr, "submit and wait failed %d\n", ret);
break;
}
count = io_uring_peek_batch_cqe(&ctx.ring, &cqes[0], CQES);
for (i = 0; i < count; i++) {
ret = process_cqe(&ctx, cqes[i], 0);
if (ret)
goto cleanup;
}
io_uring_cq_advance(&ctx.ring, count);
}
cleanup:
cleanup_context(&ctx);
close(sockfd);
return ret;
}
liburing-2.9/examples/kdigest.c 0000664 0000000 0000000 00000022160 14750134674 0016627 0 ustar 00root root 0000000 0000000 /* SPDX-License-Identifier: MIT */
/*
* Proof-of-concept for doing file digests using the kernel's AF_ALG API.
* Needs a bit of error handling.
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "liburing.h"
#define QD 64
#define WAIT_BATCH (QD / 8)
#define BS (64*1024)
#define BGID 1
#define BID_MASK (QD - 1)
/*
 * Life cycle of a request slot. The order matters: reap_completions() moves
 * a request forward with state++, taking IO_READ to IO_READ_COMPLETE and
 * IO_WRITE to IO_WRITE_COMPLETE.
 */
enum req_state {
IO_INIT = 0,
IO_READ,
IO_READ_COMPLETE,
IO_WRITE,
IO_WRITE_COMPLETE,
};
/* One request slot; struct kdigest holds QD of them. */
struct req {
/* NOTE(review): presumably the file offset this slot reads from - confirm */
off_t offset;
/* where the slot is in its read -> send cycle */
enum req_state state;
/* buffer handed to the buffer ring (iov_base/iov_len) when sending */
struct iovec iov;
};
/* Top-level state of the digest example. */
struct kdigest {
struct io_uring ring;
/* buffer ring that completed reads are added to for the bundle send */
struct io_uring_buf_ring *br;
/* request slots; the send side walks them in order, wrapping at QD */
struct req reqs[QD];
/* heap allocated, aligned QD*BS buffer */
uint8_t *bufs;
};
static int infd, outfd;
/*
 * Store the size in bytes of the file behind @fd in @size.
 *
 * Regular files report st_size; block devices are queried with the
 * BLKGETSIZE64 ioctl. Returns 0 on success, or -1 when fstat() or the ioctl
 * fails or when @fd is any other kind of file (in which case @size is left
 * untouched).
 */
static int get_file_size(int fd, size_t *size)
{
	struct stat info;
	unsigned long long nbytes;

	if (fstat(fd, &info) < 0)
		return -1;

	if (S_ISREG(info.st_mode)) {
		*size = info.st_size;
		return 0;
	}

	if (!S_ISBLK(info.st_mode))
		return -1;

	if (ioctl(fd, BLKGETSIZE64, &nbytes) != 0)
		return -1;

	*size = nbytes;
	return 0;
}
/*
 * Consume every completion currently visible in the CQ ring.
 *
 * Each successful completion drops *inflight by one and moves its request
 * slot to the next state; a completed send additionally subtracts the
 * number of bytes sent from *outsize. On the first failed completion an
 * error is printed, *outsize is zeroed and processing stops (the failed
 * CQE itself is not consumed).
 *
 * Returns 0 on success, 1 if a completion carried an error.
 */
static int reap_completions(struct io_uring *ring, int *inflight,
			    size_t *outsize)
{
	struct io_uring_cqe *cqe;
	unsigned head;
	int consumed = 0;
	int failed = 0;

	io_uring_for_each_cqe(ring, head, cqe) {
		struct req *req = io_uring_cqe_get_data(cqe);

		assert(req->state == IO_READ || req->state == IO_WRITE);

		if (cqe->res < 0) {
			fprintf(stderr, "%s: cqe error %d\n",
				req->state == IO_WRITE ? "send" : "read",
				cqe->res);
			*outsize = 0;
			failed = 1;
			break;
		}

		consumed++;
		(*inflight)--;
		/* IO_READ -> IO_READ_COMPLETE, IO_WRITE -> IO_WRITE_COMPLETE */
		if (++req->state == IO_WRITE_COMPLETE)
			*outsize -= cqe->res;
	}

	io_uring_cq_advance(ring, consumed);
	return failed;
}
/*
 * Bundle variant of send submission: hand every buffer whose read has
 * completed to the outgoing buffer ring, then queue one bundle send that
 * covers all of them.
 *
 * Buffers are added in slot order starting at *write_idx, which keeps the
 * data stream serialized. Only the first slot is left in IO_WRITE; it acts
 * as the barrier in the slot array and is the request attached to the send.
 * Any further slots in the same bundle are marked IO_WRITE_COMPLETE up
 * front, since the first one serves as the stopping point.
 */
static void submit_sends_br(struct kdigest *kdigest, int *write_idx,
			    int *inflight)
{
	struct io_uring_buf_ring *buf_ring = kdigest->br;
	struct req *barrier = NULL;
	struct io_uring_sqe *sqe;
	int added = 0;

	for (;;) {
		struct req *cur = &kdigest->reqs[*write_idx];

		if (cur->state != IO_READ_COMPLETE)
			break;

		io_uring_buf_ring_add(buf_ring, cur->iov.iov_base,
				      cur->iov.iov_len, *write_idx, BID_MASK,
				      added);
		added++;

		if (barrier) {
			cur->state = IO_WRITE_COMPLETE;
		} else {
			cur->state = IO_WRITE;
			barrier = cur;
		}

		*write_idx = (*write_idx + 1) % QD;
	}

	/* nothing was ready to send */
	if (!barrier)
		return;

	/* publish the buffers, then issue a single send for the whole batch */
	io_uring_buf_ring_advance(buf_ring, added);

	sqe = io_uring_get_sqe(&kdigest->ring);
	io_uring_prep_send_bundle(sqe, outfd, 0, MSG_MORE);
	sqe->flags |= IOSQE_BUFFER_SELECT;
	sqe->buf_group = BGID;
	io_uring_sqe_set_data(sqe, barrier);
	(*inflight)++;
}
/*
 * Fallback send submission: one send per completed read, chained with
 * IOSQE_IO_LINK so the data goes out in order. This is the less efficient
 * path - links cost more on the kernel side, and with bundle support the
 * whole batch could be a single send instead.
 */
static void submit_sends_linked(struct kdigest *kdigest, int *write_idx,
				int *inflight)
{
	struct io_uring_sqe *prev = NULL;

	for (;;) {
		struct req *cur = &kdigest->reqs[*write_idx];
		struct io_uring_sqe *sqe;

		if (cur->state != IO_READ_COMPLETE)
			break;

		/* another send follows: link the previous one to it */
		if (prev)
			prev->flags |= IOSQE_IO_LINK;

		cur->state = IO_WRITE;
		sqe = io_uring_get_sqe(&kdigest->ring);
		io_uring_prep_send(sqe, outfd, cur->iov.iov_base,
				   cur->iov.iov_len, MSG_MORE);
		io_uring_sqe_set_data(sqe, cur);
		prev = sqe;

		(*inflight)++;
		*write_idx = (*write_idx + 1) % QD;
	}
}
/*
 * Queue sends for all completed reads, using a bundle send when a buffer
 * ring was set up and linked sends otherwise.
 */
static void submit_sends(struct kdigest *kdigest, int *write_idx, int *inflight)
{
	if (!kdigest->br) {
		submit_sends_linked(kdigest, write_idx, inflight);
		return;
	}

	submit_sends_br(kdigest, write_idx, inflight);
}
/*
 * Stream @insize bytes from infd into outfd.
 *
 * Each pass of the outer loop queues sends for finished reads, queues new
 * reads into free slots, then submits and reaps completions. The loop runs
 * until outsize (bytes still to be sent) reaches zero; reap_completions()
 * also zeroes it on error.
 *
 * Returns 0 on success, 1 on a submit/wait or completion error.
 */
static int digest_file(struct kdigest *kdigest, size_t insize)
{
struct io_uring *ring = &kdigest->ring;
off_t read_off = 0;
/* insize counts bytes not yet queued for reading, outsize bytes not yet sent */
size_t outsize = insize;
int read_idx = 0, write_idx = 0, inflight = 0;
while (outsize) {
struct io_uring_sqe *sqe;
struct req *req;
int to_wait;
/* sends first: this is what frees slots for the reads queued below */
submit_sends(kdigest, &write_idx, &inflight);
/* Queue up any reads. Completions may arrive out of order. */
while (insize && (kdigest->reqs[read_idx].state == IO_INIT
|| kdigest->reqs[read_idx].state == IO_WRITE_COMPLETE)) {
/* the final chunk may be shorter than BS */
size_t this_size = (insize < BS ? insize : BS);
req = &kdigest->reqs[read_idx];
req->state = IO_READ;
req->offset = read_off;
/* slot i always reads into the i-th BS-sized chunk of bufs */
req->iov.iov_base = &kdigest->bufs[read_idx * BS];
req->iov.iov_len = this_size;
sqe = io_uring_get_sqe(ring);
io_uring_prep_read(sqe, infd, req->iov.iov_base,
req->iov.iov_len, read_off);
io_uring_sqe_set_data(sqe, req);
read_off += this_size;
insize -= this_size;
inflight++;
read_idx = (read_idx + 1) % QD;
}
/* wait for about half queue completion before resubmit */
for (to_wait = (inflight >> 1) | 1; to_wait; to_wait--) {
int ret, wait_nr;
/* never wait for more than WAIT_BATCH completions at once */
wait_nr = inflight;
if (wait_nr > WAIT_BATCH)
wait_nr = WAIT_BATCH;
ret = io_uring_submit_and_wait(ring, wait_nr);
if (ret < 0) {
fprintf(stderr, "wait cqe: %s\n",
strerror(-ret));
return 1;
}
if (reap_completions(ring, &inflight, &outsize))
return 1;
}
}
assert(!inflight);
return 0;
}
/*
 * Read the finished digest back from the AF_ALG socket and print it as hex.
 *
 * @kdigest: ring and buffers; the start of the I/O buffer area is reused
 *           to hold the digest bytes
 * @alg:     algorithm name, used only in the output line
 * @file:    input file name, used only in the output line
 *
 * Returns 0 on success and 1 on any failure, including a receive that
 * completed with an error.
 */
static int get_result(struct kdigest *kdigest, const char *alg, const char *file)
{
	struct io_uring *ring = &kdigest->ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int i, ret;

	/* reuse I/O buf block to stash hash result */
	sqe = io_uring_get_sqe(ring);
	if (!sqe) {
		fprintf(stderr, "get sqe failed\n");
		return 1;
	}
	io_uring_prep_recv(sqe, outfd, kdigest->bufs, BS, 0);

	if (io_uring_submit_and_wait(ring, 1) < 0)
		return 1;

	ret = io_uring_peek_cqe(ring, &cqe);
	if (ret < 0) {
		fprintf(stderr, "peek cqe: %s\n", strerror(-ret));
		return 1;
	}

	if (cqe->res < 0) {
		fprintf(stderr, "cqe error: %s\n", strerror(-cqe->res));
		/* ret is still 0 from the successful peek; report the failure */
		ret = 1;
		goto err;
	}

	/* cqe->res is signed and known to be non-negative here */
	fprintf(stdout, "uring %s%s(%s) returned(len=%d): ",
		kdigest->br ? "bundled " : "", alg, file, cqe->res);
	for (i = 0; i < cqe->res; i++)
		fprintf(stdout, "%02x", kdigest->bufs[i]);
	putc('\n', stdout);
	ret = 0;
err:
	io_uring_cqe_seen(ring, cqe);
	return ret;
}
/*
 * Usage: kdigest algorithm infile
 *
 * Binds an AF_ALG "hash" socket for the named algorithm, streams the input
 * file into it through io_uring and prints the resulting digest.
 * Returns 0 on success, 1 on any failure. Error paths return directly
 * without releasing descriptors or buffers.
 */
int main(int argc, char *argv[])
{
const char *alg;
const char *infile;
size_t alg_len, insize;
struct sockaddr_alg sa = {
.salg_family = AF_ALG,
.salg_type = "hash",
};
struct kdigest kdigest = { };
struct io_uring_params p = { };
int sfd, ret;
if (argc < 3) {
fprintf(stderr, "%s: algorithm infile\n", argv[0]);
return 1;
}
alg = argv[1];
infile = argv[2];
alg_len = strlen(alg);
/* the name plus its terminator must fit in salg_name */
if (alg_len >= sizeof(sa.salg_name)) {
fprintf(stderr, "algorithm name too long\n");
return 1;
}
/* +1 for null terminator */
memcpy(sa.salg_name, alg, alg_len + 1);
infd = open(infile, O_RDONLY);
if (infd < 0) {
perror("open infile");
return 1;
}
/* sfd is bound to the algorithm; accept() below yields the data socket */
sfd = socket(AF_ALG, SOCK_SEQPACKET, 0);
if (sfd < 0) {
if (errno == EAFNOSUPPORT)
fprintf(stderr, "kernel AF_ALG support not available. "
"CONFIG_CRYPTO_USER_API_HASH required.\n");
else
perror("AF_ALG socket");
return 1;
}
if (bind(sfd, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
if (errno == ENOENT)
fprintf(stderr, "AF_ALG bind(%s): hash not available. "
"See /proc/crypto hash algorithm list.\n",
alg);
else
fprintf(stderr, "AF_ALG bind(%s): %s\n",
alg, strerror(errno));
return 1;
}
outfd = accept(sfd, NULL, 0);
if (outfd < 0) {
perror("AF_ALG accept");
return 1;
}
/* one contiguous, 4096-byte aligned area holding QD buffers of BS bytes */
if (posix_memalign((void **)&kdigest.bufs, 4096, QD * BS)) {
fprintf(stderr, "failed to alloc I/O bufs\n");
return 1;
}
/*
 * Try the preferred setup flags first; if ring creation fails, retry
 * once with no flags before giving up.
 */
p.flags = IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN;
do {
ret = io_uring_queue_init_params(QD, &kdigest.ring, &p);
if (!ret)
break;
if (!p.flags) {
fprintf(stderr, "queue_init: %s\n", strerror(-ret));
return 1;
}
p.flags = 0;
} while (1);
/* use send bundles, if available */
if (p.features & IORING_FEAT_RECVSEND_BUNDLE) {
kdigest.br = io_uring_setup_buf_ring(&kdigest.ring, QD, BGID, 0, &ret);
if (!kdigest.br) {
fprintf(stderr, "Failed setting up bundle buffer ring: %d\n", ret);
return 1;
}
}
if (get_file_size(infd, &insize))
return 1;
ret = digest_file(&kdigest, insize);
if (ret) {
fprintf(stderr, "%s digest failed\n", alg);
return 1;
}
ret = get_result(&kdigest, alg, infile);
if (ret) {
fprintf(stderr, "failed to retrieve %s digest result\n", alg);
return 1;
}
/* success path: release everything; a failing close() turns ret into 1 */
if (kdigest.br)
io_uring_free_buf_ring(&kdigest.ring, kdigest.br, QD, BGID);
io_uring_queue_exit(&kdigest.ring);
free(kdigest.bufs);
if (close(infd) < 0)
ret |= 1;
if (close(sfd) < 0)
ret |= 1;
if (close(outfd) < 0)
ret |= 1;
return ret;
}
liburing-2.9/examples/link-cp.c 0000664 0000000 0000000 00000006732 14750134674 0016541 0 ustar 00root root 0000000 0000000 /* SPDX-License-Identifier: MIT */
/*
* Very basic proof-of-concept for doing a copy with linked SQEs. Needs a
* bit of error handling and short read love.
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "liburing.h"
/* Ring size and cap on the number of SQEs in flight. */
#define QD 64
/* Bytes copied per read/write pair. */
#define BS (32*1024)
/* Bookkeeping shared by the read and the write of one linked pair. */
struct io_data {
size_t offset; /* file offset of this chunk, same for read and write */
int index; /* completions seen so far; the pair is freed at 2 */
struct iovec iov; /* data buffer used by both the readv and the writev */
};
/* Source and destination file descriptors, opened in main(). */
static int infd, outfd;
/* SQEs queued whose completion has not been handled yet (two per pair). */
static int inflight;
/*
 * Create an io_uring instance with @entries submission entries and default
 * flags. Returns 0 on success; on failure prints the reason and returns -1.
 */
static int setup_context(unsigned entries, struct io_uring *ring)
{
	int err = io_uring_queue_init(entries, ring, 0);

	if (err >= 0)
		return 0;

	fprintf(stderr, "queue_init: %s\n", strerror(-err));
	return -1;
}
/*
 * Report how many bytes @fd holds.
 *
 * Regular files use st_size from fstat(); block devices are queried with
 * the BLKGETSIZE64 ioctl. Returns 0 with *size filled in, or -1 if
 * fstat()/ioctl() fails or the descriptor is of any other type.
 */
static int get_file_size(int fd, off_t *size)
{
	struct stat sb;
	unsigned long long dev_bytes;

	if (fstat(fd, &sb) < 0)
		return -1;

	if (S_ISREG(sb.st_mode)) {
		*size = sb.st_size;
		return 0;
	}

	if (!S_ISBLK(sb.st_mode))
		return -1;

	if (ioctl(fd, BLKGETSIZE64, &dev_bytes) != 0)
		return -1;

	*size = dev_bytes;
	return 0;
}
/*
 * Queue a linked readv -> writev pair that copies @size bytes at @offset.
 *
 * A single allocation holds the struct io_data header followed directly by
 * the data buffer, so the header is always properly aligned and releasing
 * the pair is a plain free() of the header (see handle_cqe()). Both SQEs
 * carry the header as user data.
 *
 * The SQEs are only queued here; the caller is responsible for submitting
 * them and for accounting them in inflight. Exits the program if memory
 * cannot be allocated.
 */
static void queue_rw_pair(struct io_uring *ring, off_t size, off_t offset)
{
	struct io_uring_sqe *sqe;
	struct io_data *data;

	data = malloc(sizeof(*data) + size);
	if (!data) {
		fprintf(stderr, "queue_rw_pair: out of memory\n");
		exit(1);
	}

	data->index = 0;
	data->offset = offset;
	/* the buffer lives right behind the header */
	data->iov.iov_base = data + 1;
	data->iov.iov_len = size;

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_readv(sqe, infd, &data->iov, 1, offset);
	/* the write must not start before the read has filled the buffer */
	sqe->flags |= IOSQE_IO_LINK;
	io_uring_sqe_set_data(sqe, data);

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_writev(sqe, outfd, &data->iov, 1, offset);
	io_uring_sqe_set_data(sqe, data);
}

/*
 * Process one completion of a read/write pair.
 *
 * -ECANCELED means the link was broken before this request ran, so the
 * whole pair is queued again with the same size and offset and submitted
 * right away; without the submit, a retry queued after the last chunk of
 * the file would never reach the kernel. Any other error is fatal for the
 * copy. The pair's allocation is freed once both of its completions have
 * been seen.
 *
 * Returns 0 on success, 1 on an unrecoverable error.
 */
static int handle_cqe(struct io_uring *ring, struct io_uring_cqe *cqe)
{
	struct io_data *data = io_uring_cqe_get_data(cqe);
	int ret = 0;

	data->index++;

	if (cqe->res < 0) {
		if (cqe->res == -ECANCELED) {
			int submitted;

			queue_rw_pair(ring, data->iov.iov_len, data->offset);
			inflight += 2;
			submitted = io_uring_submit(ring);
			if (submitted < 0) {
				printf("submit: %s\n", strerror(-submitted));
				ret = 1;
			}
		} else {
			printf("cqe error: %s\n", strerror(-cqe->res));
			ret = 1;
		}
	}

	/* header and buffer share one allocation that starts at the header */
	if (data->index == 2)
		free(data);

	io_uring_cqe_seen(ring, cqe);
	return ret;
}
static int copy_file(struct io_uring *ring, off_t insize)
{
struct io_uring_cqe *cqe;
off_t this_size;
off_t offset;
offset = 0;
while (insize) {
int has_inflight = inflight;
int depth;
while (insize && inflight < QD) {
this_size = BS;
if (this_size > insize)
this_size = insize;
queue_rw_pair(ring, this_size, offset);
offset += this_size;
insize -= this_size;
inflight += 2;
}
if (has_inflight != inflight)
io_uring_submit(ring);
if (insize)
depth = QD;
else
depth = 1;
while (inflight >= depth) {
int ret;
ret = io_uring_wait_cqe(ring, &cqe);
if (ret < 0) {
printf("wait cqe: %s\n", strerror(-ret));
return 1;
}
if (handle_cqe(ring, cqe))
return 1;
inflight--;
}
}
return 0;
}
/*
 * Usage: link-cp infile outfile
 *
 * Opens both files (the output is created or truncated), sets up a ring of
 * QD entries and copies the input to the output. Returns the result of
 * copy_file(), or 1 if any setup step fails.
 */
int main(int argc, char *argv[])
{
	struct io_uring ring;
	off_t insize;
	int status;

	if (argc < 3) {
		printf("%s: infile outfile\n", argv[0]);
		return 1;
	}

	infd = open(argv[1], O_RDONLY);
	if (infd < 0) {
		perror("open infile");
		return 1;
	}

	outfd = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (outfd < 0) {
		perror("open outfile");
		return 1;
	}

	/* evaluated left to right: the size is only queried once the ring exists */
	if (setup_context(QD, &ring) || get_file_size(infd, &insize))
		return 1;

	status = copy_file(&ring, insize);

	close(infd);
	close(outfd);
	io_uring_queue_exit(&ring);
	return status;
}
liburing-2.9/examples/napi-busy-poll-client.c 0000664 0000000 0000000 00000025576 14750134674 0021342 0 ustar 00root root 0000000 0000000 /* SPDX-License-Identifier: MIT */
/*
* Simple ping/pong client which can use the io_uring NAPI support.
*
* Needs to be run as root because it sets SCHED_FIFO scheduling class,
* but will work without that.
*
* Example:
*
* sudo examples/napi-busy-poll-client -a 192.168.2.2 -n100000 -p4444 \
* -b -t10 -u
*
* send and receive 100k packets, using NAPI.
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
/* Size in bytes of the packet buffer. */
#define MAXBUFLEN 100
/* Size of the buffer holding the port number string. */
#define PORTNOLEN 10
/* Size of the buffer holding the address string. */
#define ADDRLEN 80
/* Number of entries requested for the io_uring instance. */
#define RINGSIZE 1024
/* Yields ch if it is printable, '#' otherwise. Note: ch is evaluated twice. */
#define printable(ch) (isprint((unsigned char)ch) ? ch : '#')
/* Operation type tag stored in the top byte of sqe->user_data. */
enum {
IOURING_RECV,
IOURING_SEND,
IOURING_RECVMSG,
IOURING_SENDMSG
};
/* Per-run client state shared by the submission and completion helpers. */
struct ctx
{
struct io_uring ring;
/* Server address; main() fills saddr6 when -6 is given, saddr otherwise. */
union {
struct sockaddr_in6 saddr6;
struct sockaddr_in saddr;
};
int sockfd; /* UDP socket connected to the server */
int buffer_len; /* set to sizeof(struct timespec) in main() */
int num_pings; /* replies still expected; the main loop stops at 0 */
bool napi_check; /* true once reportNapi() has run */
/* Packet payload; ts aliases the start of buffer. */
union {
char buffer[MAXBUFLEN];
struct timespec ts;
};
int rtt_index; /* number of samples stored in rtt[] */
double *rtt; /* round-trip times in seconds, allocated in main() */
};
/* Command-line settings as parsed in main(). */
struct options
{
int num_pings; /* -n value plus one (the extra ping is the initial NAPI probe) */
__u32 timeout; /* -t: NAPI busy poll timeout */
bool sq_poll; /* -s: set up the ring with SQPOLL */
bool defer_tw; /* -d: set up the ring with DEFER_TASKRUN */
bool busy_loop; /* -b: wait with a zero timeout instead of blocking */
bool prefer_busy_poll; /* -u: prefer NAPI busy poll */
bool ipv6; /* -6: use AF_INET6 */
char port[PORTNOLEN]; /* -p: port number as a string */
char addr[ADDRLEN]; /* -a: server address as a string */
};
/*
 * Long option table. The argument requirement of every entry matches the
 * corresponding short option in the getopt_long() option string used by
 * main(): --sqpoll takes a value (like -s), --prefer takes none (like -u).
 */
static struct option longopts[] =
{
	{"address"  , required_argument, NULL, 'a'},
	{"busy"     , no_argument      , NULL, 'b'},
	{"help"     , no_argument      , NULL, 'h'},
	{"num_pings", required_argument, NULL, 'n'},
	{"port"     , required_argument, NULL, 'p'},
	{"prefer"   , no_argument      , NULL, 'u'},
	{"sqpoll"   , required_argument, NULL, 's'},
	{"timeout"  , required_argument, NULL, 't'},
	{NULL       , 0                , NULL,  0 }
};
/*
 * Print the command-line help for the client to stderr.
 * @name: program name shown in the synopsis (argv[0]).
 */
static void printUsage(const char *name)
{
	fprintf(stderr,
		"Usage: %s [-a|--address ip_address] [-p|--port port-no] [-s|--sqpoll]"
		" [-b|--busy] [-n|--num_pings pings] [-t|--timeout busy-poll-timeout] [-u|--prefer] [-d] [-6] [-h|--help]\n"
		"--address\n"
		"-a : remote or local ipv6 address\n"
		"--busy\n"
		"-b : busy poll io_uring instead of blocking.\n"
		"--num_pings\n"
		"-n : number of pings\n"
		"--port\n"
		"-p : port\n"
		"--sqpoll\n"
		"-s : Configure io_uring to use SQPOLL thread\n"
		"--timeout\n"
		"-t : Configure NAPI busy poll timeout\n"
		"--prefer\n"
		"-u : prefer NAPI busy poll\n"
		"-d : Configure io_uring to use DEFER_TASKRUN\n"
		"-6 : use IPV6\n"
		"--help\n"
		"-h : Display this usage message\n\n",
		name);
}
/*
 * Print "<msg> (-<opt>)" to stderr, substituting '#' for an unprintable
 * option character. Nothing is printed when @msg is NULL or @opt is 0.
 */
static void printError(const char *msg, int opt)
{
	if (!msg || !opt)
		return;

	fprintf(stderr, "%s (-%c)\n", msg, printable(opt));
}
/*
 * Switch the calling process to SCHED_FIFO at the maximum priority.
 * Failure (for example when not running as root) is reported on stderr
 * and otherwise ignored, so the program keeps running with its current
 * scheduling class.
 */
static void setProcessScheduler(void)
{
	struct sched_param param;

	param.sched_priority = sched_get_priority_max(SCHED_FIFO);
	if (sched_setscheduler(0, SCHED_FIFO, &param) < 0)
		fprintf(stderr, "sched_setscheduler() failed: (%d) %s\n",
			errno, strerror(errno));
}
/*
 * Return time1 - time0 in seconds as a double. The result is negative when
 * time1 is earlier than time0.
 */
static double diffTimespec(const struct timespec *time1, const struct timespec *time0)
{
	const double frac = (time1->tv_nsec - time0->tv_nsec) / 1000000000.0;

	return (time1->tv_sec - time0->tv_sec) + frac;
}
/*
 * Pack an operation type and a file descriptor into one 64-bit user_data
 * value: the type goes into bits 56 and up, the descriptor into the low
 * 32 bits.
 */
static uint64_t encodeUserData(char type, int fd)
{
	const uint64_t tag = (uint64_t)type << 56;
	const uint64_t low = (uint32_t)fd;

	return low | tag;
}
/*
 * Split a user_data value built by encodeUserData() back into its parts:
 * *type receives the top byte, *fd the low 32 bits.
 */
static void decodeUserData(uint64_t data, char *type, int *fd)
{
	const uint64_t low = data & 0xffffffffU;
	const uint64_t tag = data >> 56;

	*type = tag;
	*fd = low;
}
/*
 * Return the name of an operation type tag for use in messages, or
 * "Unknown" for any value that is not one of the IOURING_* constants.
 */
static const char *opTypeToStr(char type)
{
	switch (type) {
	case IOURING_RECV:
		return "IOURING_RECV";
	case IOURING_SEND:
		return "IOURING_SEND";
	case IOURING_RECVMSG:
		return "IOURING_RECVMSG";
	case IOURING_SENDMSG:
		return "IOURING_SENDMSG";
	default:
		break;
	}

	return "Unknown";
}
/*
 * Print the NAPI id the socket's incoming traffic is associated with, as
 * reported by SO_INCOMING_NAPI_ID, and record that the check was done.
 * An id of 0, or a failing getsockopt(), is reported as unassigned.
 */
static void reportNapi(struct ctx *ctx)
{
	unsigned int napi_id = 0;
	socklen_t len = sizeof(napi_id);

	if (getsockopt(ctx->sockfd, SOL_SOCKET, SO_INCOMING_NAPI_ID,
		       &napi_id, &len) < 0)
		napi_id = 0;

	/* napi_id is unsigned, so print it as such */
	if (napi_id)
		printf(" napi id: %u\n", napi_id);
	else
		printf(" unassigned napi id\n");

	ctx->napi_check = true;
}
/*
 * Queue a send of the current CLOCK_REALTIME timestamp. The timestamp is
 * written to the start of the packet buffer (ctx->ts shares storage with
 * ctx->buffer) and sizeof(struct timespec) bytes are sent. The SQE is only
 * queued here, not submitted.
 */
static void sendPing(struct ctx *ctx)
{
	struct io_uring_sqe *sqe;

	sqe = io_uring_get_sqe(&ctx->ring);

	clock_gettime(CLOCK_REALTIME, &ctx->ts);
	io_uring_prep_send(sqe, ctx->sockfd, ctx->buffer,
			   sizeof(struct timespec), 0);
	sqe->user_data = encodeUserData(IOURING_SEND, ctx->sockfd);
}
/*
 * Queue a receive of up to MAXBUFLEN bytes into the packet buffer. The SQE
 * is only queued here, not submitted.
 */
static void receivePing(struct ctx *ctx)
{
	struct io_uring_sqe *sqe;

	sqe = io_uring_get_sqe(&ctx->ring);
	io_uring_prep_recv(sqe, ctx->sockfd, ctx->buffer, MAXBUFLEN, 0);
	sqe->user_data = encodeUserData(IOURING_RECV, ctx->sockfd);
}
/*
 * Store one round-trip sample and queue the next ping.
 *
 * On entry the packet buffer holds the timestamp of the ping that was just
 * answered. sendPing() overwrites it with the current time, so the
 * difference between the two is the round-trip time of the answered ping.
 */
static void recordRTT(struct ctx *ctx)
{
	const struct timespec sent = ctx->ts;
	double *sample = &ctx->rtt[ctx->rtt_index++];

	/* queues the next ping and refreshes ctx->ts with "now" */
	sendPing(ctx);

	*sample = diffTimespec(&ctx->ts, &sent);
}
/*
 * Print min/avg/max and mean absolute deviation of the recorded round-trip
 * times, in microseconds, to stdout.
 *
 * With no samples recorded there is nothing to average, so a placeholder
 * line is printed instead of dividing by zero.
 */
static void printStats(struct ctx *ctx)
{
	double minRTT = DBL_MAX;
	double maxRTT = 0.0;
	double avgRTT = 0.0;
	double stddevRTT = 0.0;

	if (ctx->rtt_index <= 0) {
		fprintf(stdout, " rtt(us) min/avg/max/mdev = n/a (no samples)\n");
		return;
	}

	// Calculate min, max, avg.
	for (int i = 0; i < ctx->rtt_index; i++) {
		if (ctx->rtt[i] < minRTT)
			minRTT = ctx->rtt[i];
		if (ctx->rtt[i] > maxRTT)
			maxRTT = ctx->rtt[i];

		avgRTT += ctx->rtt[i];
	}
	avgRTT /= ctx->rtt_index;

	// Calculate the mean absolute deviation from the average.
	for (int i = 0; i < ctx->rtt_index; i++)
		stddevRTT += fabs(ctx->rtt[i] - avgRTT);
	stddevRTT /= ctx->rtt_index;

	fprintf(stdout, " rtt(us) min/avg/max/mdev = %.3f/%.3f/%.3f/%.3f\n",
		minRTT * 1000000, avgRTT * 1000000, maxRTT * 1000000, stddevRTT * 1000000);
}
/*
 * Handle one completion.
 *
 * A finished send queues the matching receive. A finished receive must be
 * exactly sizeof(struct timespec) bytes long (anything else aborts); the
 * first reply triggers the NAPI report and a plain ping, later replies
 * record a round-trip sample (which also queues the next ping). Every
 * reply decrements the number of pings still expected.
 *
 * Returns 0 on success, -1 on a failed request or an unexpected type.
 */
static int completion(struct ctx *ctx, struct io_uring_cqe *cqe)
{
	const int res = cqe->res;
	char type;
	int fd;

	decodeUserData(cqe->user_data, &type, &fd);

	if (res < 0) {
		fprintf(stderr, "unexpected %s failure: (%d) %s\n",
			opTypeToStr(type), -res, strerror(-res));
		return -1;
	}

	if (type == IOURING_SEND) {
		receivePing(ctx);
		return 0;
	}

	if (type != IOURING_RECV) {
		fprintf(stderr, "unexpected %s completion\n",
			opTypeToStr(type));
		return -1;
	}

	if (res != sizeof(struct timespec)) {
		fprintf(stderr, "unexpected ping reply len: %d\n", res);
		abort();
	}

	if (ctx->napi_check) {
		recordRTT(ctx);
	} else {
		reportNapi(ctx);
		sendPing(ctx);
	}

	--ctx->num_pings;
	return 0;
}
/*
 * Ping/pong client entry point.
 *
 * Parses the options, connects a UDP socket to the server, creates the
 * ring (optionally registering NAPI busy polling), then sends pings and
 * processes completions until the requested number of replies has arrived.
 *
 * Setup failures exit with a non-zero status. Returns 0 after a complete
 * run and 1 if a completion failed.
 */
int main(int argc, char *argv[])
{
	struct ctx ctx;
	struct options opt;
	struct __kernel_timespec *tsPtr;
	struct __kernel_timespec ts;
	struct io_uring_params params;
	struct io_uring_napi napi;
	int flag, ret, af;
	int status = 0;

	memset(&opt, 0, sizeof(struct options));
	/* zero the context so unused socket address fields are not garbage */
	memset(&ctx, 0, sizeof(ctx));

	// Process flags.
	while ((flag = getopt_long(argc, argv, ":hs:bua:n:p:t:6d:", longopts, NULL)) != -1) {
		switch (flag) {
		case 'a':
			/* reject instead of overflowing the fixed-size buffer */
			if (strlen(optarg) >= sizeof(opt.addr)) {
				fprintf(stderr, "address too long\n");
				exit(1);
			}
			snprintf(opt.addr, sizeof(opt.addr), "%s", optarg);
			break;
		case 'b':
			opt.busy_loop = true;
			break;
		case 'h':
			printUsage(argv[0]);
			exit(0);
			break;
		case 'n':
			opt.num_pings = atoi(optarg) + 1;
			break;
		case 'p':
			if (strlen(optarg) >= sizeof(opt.port)) {
				fprintf(stderr, "port too long\n");
				exit(1);
			}
			snprintf(opt.port, sizeof(opt.port), "%s", optarg);
			break;
		case 's':
			/* a long option given without a value leaves optarg NULL */
			opt.sq_poll = optarg ? !!atoi(optarg) : true;
			break;
		case 't':
			opt.timeout = atoi(optarg);
			break;
		case 'u':
			opt.prefer_busy_poll = true;
			break;
		case '6':
			opt.ipv6 = true;
			break;
		case 'd':
			opt.defer_tw = !!atoi(optarg);
			break;
		case ':':
			printError("Missing argument", optopt);
			printUsage(argv[0]);
			exit(-1);
			break;
		case '?':
			printError("Unrecognized option", optopt);
			printUsage(argv[0]);
			exit(-1);
			break;

		default:
			fprintf(stderr, "Fatal: Unexpected case in CmdLineProcessor switch()\n");
			exit(-1);
			break;
		}
	}

	if (strlen(opt.addr) == 0) {
		fprintf(stderr, "address option is mandatory\n");
		printUsage(argv[0]);
		exit(1);
	}

	if (opt.ipv6) {
		af = AF_INET6;
		ctx.saddr6.sin6_port = htons(atoi(opt.port));
		ctx.saddr6.sin6_family = AF_INET6;
	} else {
		af = AF_INET;
		ctx.saddr.sin_port = htons(atoi(opt.port));
		ctx.saddr.sin_family = AF_INET;
	}

	if (opt.ipv6)
		ret = inet_pton(af, opt.addr, &ctx.saddr6.sin6_addr);
	else
		ret = inet_pton(af, opt.addr, &ctx.saddr.sin_addr);
	if (ret <= 0) {
		/* optarg belongs to whichever option was parsed last */
		fprintf(stderr, "inet_pton error for %s\n", opt.addr);
		printUsage(argv[0]);
		exit(1);
	}

	// Connect to server.
	fprintf(stdout, "Connecting to %s... (port=%s) to send %d pings\n", opt.addr, opt.port, opt.num_pings - 1);

	if ((ctx.sockfd = socket(af, SOCK_DGRAM, 0)) < 0) {
		fprintf(stderr, "socket() failed: (%d) %s\n", errno, strerror(errno));
		exit(1);
	}

	if (opt.ipv6)
		ret = connect(ctx.sockfd, (struct sockaddr *)&ctx.saddr6, sizeof(struct sockaddr_in6));
	else
		ret = connect(ctx.sockfd, (struct sockaddr *)&ctx.saddr, sizeof(struct sockaddr_in));
	if (ret < 0) {
		fprintf(stderr, "connect() failed: (%d) %s\n", errno, strerror(errno));
		exit(1);
	}

	// Setup ring.
	memset(&params, 0, sizeof(params));
	memset(&ts, 0, sizeof(ts));
	memset(&napi, 0, sizeof(napi));

	params.flags = IORING_SETUP_SINGLE_ISSUER;
	if (opt.defer_tw) {
		params.flags |= IORING_SETUP_DEFER_TASKRUN;
	} else if (opt.sq_poll) {
		params.flags = IORING_SETUP_SQPOLL;
		params.sq_thread_idle = 50;
	} else {
		params.flags |= IORING_SETUP_COOP_TASKRUN;
	}

	ret = io_uring_queue_init_params(RINGSIZE, &ctx.ring, &params);
	if (ret) {
		fprintf(stderr, "io_uring_queue_init_params() failed: (%d) %s\n",
			ret, strerror(-ret));
		exit(1);
	}

	if (opt.timeout || opt.prefer_busy_poll) {
		napi.prefer_busy_poll = opt.prefer_busy_poll;
		napi.busy_poll_to = opt.timeout;

		ret = io_uring_register_napi(&ctx.ring, &napi);
		if (ret) {
			fprintf(stderr, "io_uring_register_napi: %d\n", ret);
			exit(1);
		}
	}

	/* busy loop: wait with a zero timeout; otherwise block without one */
	if (opt.busy_loop)
		tsPtr = &ts;
	else
		tsPtr = NULL;

	// Use realtime scheduler.
	setProcessScheduler();

	// Copy payload.
	clock_gettime(CLOCK_REALTIME, &ctx.ts);

	// Setup context.
	ctx.napi_check = false;
	ctx.buffer_len = sizeof(struct timespec);
	ctx.num_pings = opt.num_pings;
	ctx.rtt_index = 0;
	ctx.rtt = (double *)malloc(sizeof(double) * opt.num_pings);
	if (!ctx.rtt) {
		fprintf(stderr, "Cannot allocate results array\n");
		exit(1);
	}

	// Send initial message to get napi id.
	sendPing(&ctx);

	while (ctx.num_pings != 0) {
		int res;
		unsigned num_completed = 0;
		unsigned head;
		struct io_uring_cqe *cqe;

		do {
			res = io_uring_submit_and_wait_timeout(&ctx.ring, &cqe, 1, tsPtr, NULL);
			if (res >= 0)
				break;
			else if (res == -ETIME)
				continue;
			fprintf(stderr, "submit_and_wait: %d\n", res);
			exit(1);
		} while (1);

		io_uring_for_each_cqe(&ctx.ring, head, cqe) {
			++num_completed;
			if (completion(&ctx, cqe)) {
				/* make the failure visible in the exit status */
				status = 1;
				goto out;
			}
		}

		if (num_completed)
			io_uring_cq_advance(&ctx.ring, num_completed);
	}

	printStats(&ctx);

out:
	// Clean up.
	if (opt.timeout || opt.prefer_busy_poll) {
		ret = io_uring_unregister_napi(&ctx.ring, &napi);
		if (ret)
			fprintf(stderr, "io_uring_unregister_napi: %d\n", ret);
		if (opt.timeout != napi.busy_poll_to ||
		    opt.prefer_busy_poll != napi.prefer_busy_poll) {
			fprintf(stderr, "Expected busy poll to = %d, got %d\n",
				opt.timeout, napi.busy_poll_to);
			fprintf(stderr, "Expected prefer busy poll = %d, got %d\n",
				opt.prefer_busy_poll, napi.prefer_busy_poll);
		}
	} else {
		ret = io_uring_unregister_napi(&ctx.ring, NULL);
		if (ret)
			fprintf(stderr, "io_uring_unregister_napi: %d\n", ret);
	}

	io_uring_queue_exit(&ctx.ring);
	free(ctx.rtt);
	close(ctx.sockfd);
	return status;
}
liburing-2.9/examples/napi-busy-poll-server.c 0000664 0000000 0000000 00000022673 14750134674 0021365 0 ustar 00root root 0000000 0000000 /* SPDX-License-Identifier: MIT */
/*
* Simple ping/pong backend which can use the io_uring NAPI support.
*
* Needs to be run as root because it sets SCHED_FIFO scheduling class,
* but will work without that.
*
* Example:
*
* sudo examples/napi-busy-poll-server -l -a 192.168.2.2 -n100000 \
* -p4444 -t10 -b -u
*
* will respond to 100k packets, using NAPI.
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
/* Size in bytes of the packet buffer. */
#define MAXBUFLEN 100
/* Size of the buffer holding the port number string. */
#define PORTNOLEN 10
/* Size of the buffer holding the address string. */
#define ADDRLEN 80
/* Number of entries requested for the io_uring instance. */
#define RINGSIZE 1024
/* Yields ch if it is printable, '#' otherwise. Note: ch is evaluated twice. */
#define printable(ch) (isprint((unsigned char)ch) ? ch : '#')
/* Operation type tag stored in the top byte of sqe->user_data. */
enum {
IOURING_RECV,
IOURING_SEND,
IOURING_RECVMSG,
IOURING_SENDMSG
};
/* Per-run server state shared by the submission and completion helpers. */
struct ctx
{
struct io_uring ring;
/*
 * Socket address: used for bind() in main(), then passed as msg_name for
 * every recvmsg/sendmsg by receivePing().
 */
union {
struct sockaddr_in6 saddr6;
struct sockaddr_in saddr;
};
struct iovec iov; /* single vector over buffer, referenced by msg */
struct msghdr msg; /* message header reused for recvmsg and the reply */
int sockfd; /* bound UDP socket */
int buffer_len; /* set to sizeof(struct timespec) in main() */
int num_pings; /* replies still to be sent; the main loop stops at 0 */
bool napi_check; /* true once reportNapi() has run */
/* Packet payload; ts aliases the start of buffer. */
union {
char buffer[MAXBUFLEN];
struct timespec ts;
};
};
/* Command-line settings as parsed in main(). */
struct options
{
int num_pings; /* -n value plus one */
__u32 timeout; /* -t: NAPI busy poll timeout */
bool listen; /* -l: set by the parser */
bool defer_tw; /* -d: set up the ring with DEFER_TASKRUN */
bool sq_poll; /* -s: set up the ring with SQPOLL */
bool busy_loop; /* -b: wait with a zero timeout instead of blocking */
bool prefer_busy_poll; /* -u: prefer NAPI busy poll */
bool ipv6; /* -6: use AF_INET6 */
char port[PORTNOLEN]; /* -p: port number as a string */
char addr[ADDRLEN]; /* -a: local address as a string */
};
/* Parsed options; file scope because receivePing() reads opt.ipv6. */
static struct options opt;
/*
 * Long option table. The argument requirement of every entry matches the
 * corresponding short option in the getopt_long() option string used by
 * main(): --sqpoll takes a value (like -s), --prefer takes none (like -u).
 */
static struct option longopts[] =
{
	{"address"  , required_argument, NULL, 'a'},
	{"busy"     , no_argument      , NULL, 'b'},
	{"help"     , no_argument      , NULL, 'h'},
	{"listen"   , no_argument      , NULL, 'l'},
	{"num_pings", required_argument, NULL, 'n'},
	{"port"     , required_argument, NULL, 'p'},
	{"prefer"   , no_argument      , NULL, 'u'},
	{"sqpoll"   , required_argument, NULL, 's'},
	{"timeout"  , required_argument, NULL, 't'},
	{NULL       , 0                , NULL,  0 }
};
/*
 * Print the command-line help for the server to stderr.
 * @name: program name shown in the synopsis (argv[0]).
 */
static void printUsage(const char *name)
{
	fprintf(stderr,
		"Usage: %s [-l|--listen] [-a|--address ip_address] [-p|--port port-no] [-s|--sqpoll]"
		" [-b|--busy] [-n|--num_pings pings] [-t|--timeout busy-poll-timeout] [-u|--prefer] [-d] [-6] [-h|--help]\n"
		"--listen\n"
		"-l : Server mode\n"
		"--address\n"
		"-a : remote or local ipv6 address\n"
		"--busy\n"
		"-b : busy poll io_uring instead of blocking.\n"
		"--num_pings\n"
		"-n : number of pings\n"
		"--port\n"
		"-p : port\n"
		"--sqpoll\n"
		"-s : Configure io_uring to use SQPOLL thread\n"
		"--timeout\n"
		"-t : Configure NAPI busy poll timeout\n"
		"--prefer\n"
		"-u : prefer NAPI busy poll\n"
		"-d : Configure io_uring to use DEFER_TASKRUN\n"
		"-6 : use IPV6\n"
		"--help\n"
		"-h : Display this usage message\n\n",
		name);
}
/*
 * Print "<msg> (-<opt>)" to stderr, substituting '#' for an unprintable
 * option character. Nothing is printed when @msg is NULL or @opt is 0.
 */
static void printError(const char *msg, int opt)
{
	if (!msg || !opt)
		return;

	fprintf(stderr, "%s (-%c)\n", msg, printable(opt));
}
/*
 * Switch the calling process to SCHED_FIFO at the maximum priority.
 * Failure (for example when not running as root) is reported on stderr
 * and otherwise ignored, so the program keeps running with its current
 * scheduling class.
 */
static void setProcessScheduler(void)
{
	struct sched_param param;

	param.sched_priority = sched_get_priority_max(SCHED_FIFO);
	if (sched_setscheduler(0, SCHED_FIFO, &param) < 0)
		fprintf(stderr, "sched_setscheduler() failed: (%d) %s\n",
			errno, strerror(errno));
}
/*
 * Pack an operation type and a file descriptor into one 64-bit user_data
 * value: the type goes into bits 56 and up, the descriptor into the low
 * 32 bits.
 */
static uint64_t encodeUserData(char type, int fd)
{
	const __u64 tag = (__u64)type << 56;
	const uint32_t low = (uint32_t)fd;

	return low | tag;
}
/*
 * Split a user_data value built by encodeUserData() back into its parts:
 * *type receives the top byte, *fd the low 32 bits.
 */
static void decodeUserData(uint64_t data, char *type, int *fd)
{
	const uint64_t low = data & 0xffffffffU;
	const uint64_t tag = data >> 56;

	*type = tag;
	*fd = low;
}
/*
 * Return the name of an operation type tag for use in messages, or
 * "Unknown" for any value that is not one of the IOURING_* constants.
 */
static const char *opTypeToStr(char type)
{
	switch (type) {
	case IOURING_RECV:
		return "IOURING_RECV";
	case IOURING_SEND:
		return "IOURING_SEND";
	case IOURING_RECVMSG:
		return "IOURING_RECVMSG";
	case IOURING_SENDMSG:
		return "IOURING_SENDMSG";
	default:
		break;
	}

	return "Unknown";
}
/*
 * Print the NAPI id the socket's incoming traffic is associated with, as
 * reported by SO_INCOMING_NAPI_ID, and record that the check was done.
 * An id of 0, or a failing getsockopt(), is reported as unassigned.
 */
static void reportNapi(struct ctx *ctx)
{
	unsigned int napi_id = 0;
	socklen_t len = sizeof(napi_id);

	if (getsockopt(ctx->sockfd, SOL_SOCKET, SO_INCOMING_NAPI_ID,
		       &napi_id, &len) < 0)
		napi_id = 0;

	/* napi_id is unsigned, so print it as such */
	if (napi_id)
		printf(" napi id: %u\n", napi_id);
	else
		printf(" unassigned napi id\n");

	ctx->napi_check = true;
}
/*
 * Queue a sendmsg of whatever ctx->msg currently describes. The completion
 * handler calls this after a receive, so the reply goes back to the sender
 * of the packet that was just received. The SQE is only queued here, not
 * submitted.
 */
static void sendPing(struct ctx *ctx)
{
	struct io_uring_sqe *sqe;

	sqe = io_uring_get_sqe(&ctx->ring);
	io_uring_prep_sendmsg(sqe, ctx->sockfd, &ctx->msg, 0);
	sqe->user_data = encodeUserData(IOURING_SENDMSG, ctx->sockfd);
}
/*
 * Reset the message header and queue a recvmsg for the next packet.
 *
 * The header is pointed at the address storage matching the configured
 * address family, so the sender's address is captured, and at a single
 * vector covering the whole packet buffer. The SQE is only queued here,
 * not submitted.
 */
static void receivePing(struct ctx *ctx)
{
	struct msghdr *hdr = &ctx->msg;
	struct io_uring_sqe *sqe;

	memset(hdr, 0, sizeof(struct msghdr));

	if (!opt.ipv6) {
		hdr->msg_name = &ctx->saddr;
		hdr->msg_namelen = sizeof(struct sockaddr_in);
	} else {
		hdr->msg_name = &ctx->saddr6;
		hdr->msg_namelen = sizeof(struct sockaddr_in6);
	}

	ctx->iov.iov_base = ctx->buffer;
	ctx->iov.iov_len = MAXBUFLEN;
	hdr->msg_iov = &ctx->iov;
	hdr->msg_iovlen = 1;

	sqe = io_uring_get_sqe(&ctx->ring);
	io_uring_prep_recvmsg(sqe, ctx->sockfd, hdr, 0);
	sqe->user_data = encodeUserData(IOURING_RECVMSG, ctx->sockfd);
}
/*
 * Handle one completion.
 *
 * A finished sendmsg queues the next receive and counts one ping as done.
 * A finished recvmsg trims the vector to the number of bytes received and
 * queues the reply; the first one also triggers the NAPI report. A failed
 * request or an unexpected operation type aborts the program.
 */
static void completion(struct ctx *ctx, struct io_uring_cqe *cqe)
{
	const int res = cqe->res;
	char type;
	int fd;

	decodeUserData(cqe->user_data, &type, &fd);

	if (res < 0) {
		fprintf(stderr, "unexpected %s failure: (%d) %s\n",
			opTypeToStr(type), -res, strerror(-res));
		abort();
	}

	if (type == IOURING_SENDMSG) {
		receivePing(ctx);
		--ctx->num_pings;
		return;
	}

	if (type == IOURING_RECVMSG) {
		/* echo back exactly what was received */
		ctx->iov.iov_len = res;
		sendPing(ctx);
		if (!ctx->napi_check)
			reportNapi(ctx);
		return;
	}

	fprintf(stderr, "unexpected %s completion\n",
		opTypeToStr(type));
	abort();
}
/*
 * Ping/pong server entry point.
 *
 * Parses the options, binds a UDP socket to the given address and port,
 * creates the ring (optionally registering NAPI busy polling), then echoes
 * every received packet back to its sender until the requested number of
 * replies has been sent.
 *
 * Setup failures exit with a non-zero status; returns 0 after a complete
 * run.
 */
int main(int argc, char *argv[])
{
	int flag;
	struct ctx ctx;
	struct __kernel_timespec *tsPtr;
	struct __kernel_timespec ts;
	struct io_uring_params params;
	struct io_uring_napi napi;
	int ret, af;

	memset(&opt, 0, sizeof(struct options));
	/* zero the context so unused socket address fields are not garbage */
	memset(&ctx, 0, sizeof(ctx));

	// Process flags.
	while ((flag = getopt_long(argc, argv, ":lhs:bua:n:p:t:6d:", longopts, NULL)) != -1) {
		switch (flag) {
		case 'a':
			/* reject instead of overflowing the fixed-size buffer */
			if (strlen(optarg) >= sizeof(opt.addr)) {
				fprintf(stderr, "address too long\n");
				exit(1);
			}
			snprintf(opt.addr, sizeof(opt.addr), "%s", optarg);
			break;
		case 'b':
			opt.busy_loop = true;
			break;
		case 'h':
			printUsage(argv[0]);
			exit(0);
			break;
		case 'l':
			opt.listen = true;
			break;
		case 'n':
			opt.num_pings = atoi(optarg) + 1;
			break;
		case 'p':
			if (strlen(optarg) >= sizeof(opt.port)) {
				fprintf(stderr, "port too long\n");
				exit(1);
			}
			snprintf(opt.port, sizeof(opt.port), "%s", optarg);
			break;
		case 's':
			/* a long option given without a value leaves optarg NULL */
			opt.sq_poll = optarg ? !!atoi(optarg) : true;
			break;
		case 't':
			opt.timeout = atoi(optarg);
			break;
		case 'u':
			opt.prefer_busy_poll = true;
			break;
		case '6':
			opt.ipv6 = true;
			break;
		case 'd':
			opt.defer_tw = !!atoi(optarg);
			break;
		case ':':
			printError("Missing argument", optopt);
			printUsage(argv[0]);
			exit(-1);
			break;
		case '?':
			printError("Unrecognized option", optopt);
			printUsage(argv[0]);
			exit(-1);
			break;

		default:
			fprintf(stderr, "Fatal: Unexpected case in CmdLineProcessor switch()\n");
			exit(-1);
			break;
		}
	}

	if (strlen(opt.addr) == 0) {
		fprintf(stderr, "address option is mandatory\n");
		printUsage(argv[0]);
		exit(1);
	}

	if (opt.ipv6) {
		af = AF_INET6;
		ctx.saddr6.sin6_port = htons(atoi(opt.port));
		ctx.saddr6.sin6_family = AF_INET6;
	} else {
		af = AF_INET;
		ctx.saddr.sin_port = htons(atoi(opt.port));
		ctx.saddr.sin_family = AF_INET;
	}

	if (opt.ipv6)
		ret = inet_pton(AF_INET6, opt.addr, &ctx.saddr6.sin6_addr);
	else
		ret = inet_pton(AF_INET, opt.addr, &ctx.saddr.sin_addr);
	if (ret <= 0) {
		/* optarg belongs to whichever option was parsed last */
		fprintf(stderr, "inet_pton error for %s\n", opt.addr);
		printUsage(argv[0]);
		exit(1);
	}

	// Create and bind the listening socket.
	fprintf(stdout, "Listening %s : %s...\n", opt.addr, opt.port);

	if ((ctx.sockfd = socket(af, SOCK_DGRAM, 0)) < 0) {
		fprintf(stderr, "socket() failed: (%d) %s\n", errno, strerror(errno));
		exit(1);
	}

	if (opt.ipv6)
		ret = bind(ctx.sockfd, (struct sockaddr *)&ctx.saddr6, sizeof(struct sockaddr_in6));
	else
		ret = bind(ctx.sockfd, (struct sockaddr *)&ctx.saddr, sizeof(struct sockaddr_in));
	if (ret < 0) {
		fprintf(stderr, "bind() failed: (%d) %s\n", errno, strerror(errno));
		exit(1);
	}

	// Setup ring.
	memset(&params, 0, sizeof(params));
	memset(&ts, 0, sizeof(ts));
	memset(&napi, 0, sizeof(napi));

	params.flags = IORING_SETUP_SINGLE_ISSUER;
	if (opt.defer_tw) {
		params.flags |= IORING_SETUP_DEFER_TASKRUN;
	} else if (opt.sq_poll) {
		params.flags = IORING_SETUP_SQPOLL;
		params.sq_thread_idle = 50;
	} else {
		params.flags |= IORING_SETUP_COOP_TASKRUN;
	}

	ret = io_uring_queue_init_params(RINGSIZE, &ctx.ring, &params);
	if (ret) {
		fprintf(stderr, "io_uring_queue_init_params() failed: (%d) %s\n",
			ret, strerror(-ret));
		exit(1);
	}

	if (opt.timeout || opt.prefer_busy_poll) {
		napi.prefer_busy_poll = opt.prefer_busy_poll;
		napi.busy_poll_to = opt.timeout;

		ret = io_uring_register_napi(&ctx.ring, &napi);
		if (ret) {
			fprintf(stderr, "io_uring_register_napi: %d\n", ret);
			exit(1);
		}
	}

	/* busy loop: wait with a zero timeout; otherwise block without one */
	if (opt.busy_loop)
		tsPtr = &ts;
	else
		tsPtr = NULL;

	// Use realtime scheduler.
	setProcessScheduler();

	// Copy payload.
	clock_gettime(CLOCK_REALTIME, &ctx.ts);

	// Setup context.
	ctx.napi_check = false;
	ctx.buffer_len = sizeof(struct timespec);
	ctx.num_pings = opt.num_pings;

	// Receive initial message to get napi id.
	receivePing(&ctx);

	while (ctx.num_pings != 0) {
		int res;
		unsigned int num_completed = 0;
		unsigned int head;
		struct io_uring_cqe *cqe;

		do {
			res = io_uring_submit_and_wait_timeout(&ctx.ring, &cqe, 1, tsPtr, NULL);
			if (res >= 0)
				break;
			else if (res == -ETIME)
				continue;
			fprintf(stderr, "submit_and_wait: %d\n", res);
			exit(1);
		} while (1);

		io_uring_for_each_cqe(&ctx.ring, head, cqe) {
			++num_completed;
			completion(&ctx, cqe);
		}

		if (num_completed)
			io_uring_cq_advance(&ctx.ring, num_completed);
	}

	// Clean up.
	if (opt.timeout || opt.prefer_busy_poll) {
		ret = io_uring_unregister_napi(&ctx.ring, &napi);
		if (ret)
			fprintf(stderr, "io_uring_unregister_napi: %d\n", ret);
	}

	io_uring_queue_exit(&ctx.ring);
	close(ctx.sockfd);
	return 0;
}
liburing-2.9/examples/poll-bench.c 0000664 0000000 0000000 00000004121 14750134674 0017215 0 ustar 00root root 0000000 0000000 /* SPDX-License-Identifier: MIT */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "liburing.h"
/* Scratch buffer for the one-byte pipe write and read in the benchmark loop. */
static char buf[4096];
/* How long the benchmark loop runs, in milliseconds. */
static unsigned long runtime_ms = 10000;
/* Return the current gettimeofday() time in whole milliseconds. */
static unsigned long gettimeofday_ms(void)
{
	struct timeval now;
	unsigned long ms;

	gettimeofday(&now, NULL);

	ms = now.tv_sec * 1000;
	ms += now.tv_usec / 1000;
	return ms;
}
/*
 * Poll benchmark.
 *
 * Registers both ends of a pipe as fixed files, then for runtime_ms
 * milliseconds repeatedly queues a batch of POLLIN polls on the read end,
 * makes the pipe readable by writing one byte, drains that byte again and
 * reaps the batch of completions. Prints the number of completed requests
 * per second. Returns 0 on a complete run and 1 on a setup, submit, write
 * or read failure.
 */
int main(void)
{
	const int batch = 32;
	unsigned long deadline;
	unsigned long completed = 0;
	struct io_uring_cqe *cqe;
	struct io_uring_sqe *sqe;
	struct io_uring ring;
	int fds[2];
	int ret, i;

	if (pipe(fds) != 0) {
		perror("pipe");
		return 1;
	}

	/* fall back to a plain ring if SINGLE_ISSUER is rejected */
	ret = io_uring_queue_init(1024, &ring, IORING_SETUP_SINGLE_ISSUER);
	if (ret == -EINVAL) {
		fprintf(stderr, "can't single\n");
		ret = io_uring_queue_init(1024, &ring, 0);
	}
	if (ret) {
		fprintf(stderr, "child: ring setup failed: %d\n", ret);
		return 1;
	}

	if (io_uring_register_files(&ring, fds, 2) < 0) {
		fprintf(stderr, "io_uring_register_files failed\n");
		return 1;
	}

	if (io_uring_register_ring_fd(&ring) < 0) {
		fprintf(stderr, "io_uring_register_ring_fd failed\n");
		return 1;
	}

	deadline = gettimeofday_ms() + runtime_ms;
	do {
		/* fixed file index 0 is the read end of the pipe */
		for (i = 0; i < batch; i++) {
			sqe = io_uring_get_sqe(&ring);
			io_uring_prep_poll_add(sqe, 0, POLLIN);
			sqe->flags |= IOSQE_FIXED_FILE;
			sqe->user_data = 1;
		}

		ret = io_uring_submit(&ring);
		if (ret != batch) {
			fprintf(stderr, "child: sqe submit failed: %d\n", ret);
			return 1;
		}

		/* one byte in, one byte out: triggers all pending polls */
		if (write(fds[1], buf, 1) != 1) {
			fprintf(stderr, "write failed %i\n", errno);
			return 1;
		}
		if (read(fds[0], buf, 1) != 1) {
			fprintf(stderr, "read failed %i\n", errno);
			return 1;
		}

		for (i = 0; i < batch; i++) {
			ret = io_uring_wait_cqe(&ring, &cqe);
			if (ret < 0) {
				fprintf(stderr, "child: wait completion %d\n", ret);
				break;
			}
			io_uring_cqe_seen(&ring, cqe);
			completed++;
		}
	} while (gettimeofday_ms() < deadline);

	fprintf(stderr, "requests/s: %lu\n", completed * 1000UL / runtime_ms);
	return 0;
}
liburing-2.9/examples/proxy.c 0000664 0000000 0000000 00000201160 14750134674 0016355 0 ustar 00root root 0000000 0000000 /* SPDX-License-Identifier: MIT */
/*
* Sample program that can act either as a packet sink, where it just receives
* packets and doesn't do anything with them, or it can act as a proxy where it
* receives packets and then sends them to a new destination. The proxy can
* be unidirectional (-B0), or bi-directional (-B1).
*
* Examples:
*
* Act as a proxy, listening on port 4444, and send data to 192.168.2.6 on port
* 4445. Use multishot receive, DEFER_TASKRUN, and fixed files
*
* ./proxy -m1 -r4444 -H 192.168.2.6 -p4445
*
* Same as above, but utilize send bundles (-C1, requires -u1 send_ring) as well
* with ring provided send buffers, and recv bundles (-c1).
*
* ./proxy -m1 -c1 -u1 -C1 -r4444 -H 192.168.2.6 -p4445
*
* Act as a bi-directional proxy, listening on port 8888, and send data back
* and forth between host and 192.168.2.6 on port 22. Use multishot receive,
* DEFER_TASKRUN, fixed files, and buffers of size 1500.
*
* ./proxy -m1 -B1 -b1500 -r8888 -H 192.168.2.6 -p22
*
* Act as a sink, listening on port 4445, using multishot receive, DEFER_TASKRUN,
* and fixed files:
*
* ./proxy -m1 -s1 -r4445
*
* Run with -h to see a list of options, and their defaults.
*
* (C) 2024 Jens Axboe
*
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "proxy.h"
#include "helpers.h"
/*
* Will go away once/if bundles are upstreamed and we put the generic
* definitions in the kernel header.
*/
#ifndef IORING_RECVSEND_BUNDLE
#define IORING_RECVSEND_BUNDLE (1U << 4)
#endif
#ifndef IORING_FEAT_SEND_BUF_SELECT
#define IORING_FEAT_SEND_BUF_SELECT (1U << 14)
#endif
/*
* File-scope state and tunables. Most of the tunables are presumably set
* from the command line (see the usage examples at the top of the file);
* the option parser is not part of this section, so treat the option letters
* mentioned below as taken from those examples.
*/
/* next buffer group ID to hand out, see setup_buffer_rings() */
static int cur_bgid = 1;
/* connections accepted so far; also the next free slot in conns[] */
static int nr_conns;
/* incremented under thread_lock in handle_connect() */
static int open_conns;
/* alignment used for the non-huge-page receive buffer allocation */
static long page_size;
/* counters reported by show_stats() */
static unsigned long event_loops;
static unsigned long events;
/* use multishot receive (-m in the examples) */
static int recv_mshot = 1;
/* if set, get_sqe() waits on the SQ ring instead of submitting */
static int sqpoll;
/* presumably enables DEFER_TASKRUN ring setup - confirm against ring init */
static int defer_tw = 1;
/* act as a sink: receive data and recycle buffers, never send (-s) */
static int is_sink;
/* use fixed (direct) descriptors, sets IOSQE_FIXED_FILE on requests */
static int fixed_files = 1;
/* destination address and port used by handle_sock() (-H / -p) */
static char *host = "192.168.3.2";
static int send_port = 4445;
/* presumably the port we listen on (-r) */
static int receive_port = 4444;
/* size of each provided buffer (-b) */
static int buf_size = 32;
/* register buffer rings with IOU_PBUF_RING_INC */
static int buf_ring_inc;
/* bi-directional proxy: arm receive on both in_fd and out_fd (-B) */
static int bidi;
/* connect to 'host' as an AF_INET6 address rather than AF_INET */
static int ipv6;
/* not referenced by the code visible here; presumably NAPI and wait tuning */
static int napi;
static int napi_timeout;
static int wait_batch = 1;
static int wait_usec = 1000000;
/* use recvmsg / sendmsg variants rather than recv / send */
static int rcv_msg;
static int snd_msg;
/* use zero-copy send variants */
static int snd_zc;
/* use a provided buffer ring for sends (-u); -1 presumably means "not decided yet" */
static int send_ring = -1;
/* set IORING_RECVSEND_BUNDLE on sends (-C) / receives (-c) */
static int snd_bundle;
static int rcv_bundle;
/* back the receive buffers with a MAP_HUGETLB mapping instead of the heap */
static int use_huge;
/* allocate and report the per-direction packet bucket histograms */
static int ext_stat;
/* enables vlog() output */
static int verbose;
/* entries in each provided buffer ring; index math assumes a power of two */
static int nr_bufs = 256;
/* mask passed to io_uring_buf_ring_add() */
static int br_mask;
/* presumably the SQ/CQ ring size used at ring setup - confirm */
static int ring_size = 128;
/* protects open_conns */
static pthread_mutex_t thread_lock;
static struct timeval last_housekeeping;
/*
* For sendmsg/recvmsg. recvmsg just has a single vec, sendmsg will have
* two vecs - one that is currently submitted and being sent, and one that
* is being prepared. When a new sendmsg is issued, we'll swap which one we
* use. For send, even though we don't pass in the iovec itself, we use the
* vec to serialize the sends to avoid reordering.
*
* The iov array starts out with one entry (init_mvec()) and is doubled on
* demand by send_append_vec().
*/
struct msg_vec {
/* heap allocated iovec array, released by free_mvec() */
struct iovec *iov;
/* length of allocated vec */
int vec_size;
/* length currently being used */
int iov_len;
/* only for send, current index we're processing */
int cur_iov;
};
/*
* Message state for one direction: the msghdr handed to the kernel plus the
* msg_vec pair backing it. msg_vec() returns vecs[vec_index].
*/
struct io_msg {
struct msghdr msg;
struct msg_vec vecs[2];
/* current msg_vec being prepared */
int vec_index;
};
/*
* Per socket stats per connection. For bi-directional, we'll have both
* sends and receives on each socket, this helps track them separately.
* For sink or one directional, each of the two stats will be only sends
* or receives, not both.
*/
struct conn_dir {
/* position of this entry in conn->cd[], set in handle_accept() */
int index;
/* set by close_cd() once this direction should be shut down */
int pending_shutdown;
/* non-zero while a send / receive is outstanding for this direction */
int pending_send;
int pending_recv;
/* bumped for every zero-copy send that is issued */
int snd_notif;
/* buffers queued for sending, incremented in __handle_recv() */
int out_buffers;
/* receive counters shown by __show_stats() */
int rcv, rcv_shrt, rcv_enobufs, rcv_mshot;
/* send counters shown by __show_stats() */
int snd, snd_shrt, snd_enobufs, snd_busy, snd_mshot;
/* next expected buffer ID, -1 until the first completion */
int snd_next_bid;
int rcv_next_bid;
/* nr_bufs + 1 histogram slots each, only allocated if ext_stat is set */
int *rcv_bucket;
int *snd_bucket;
unsigned long in_bytes, out_bytes;
/* only ever have a single recv pending */
struct io_msg io_rcv_msg;
/* one send that is inflight, and one being prepared for the next one */
struct io_msg io_snd_msg;
};
/*
* Bit flags stored in conn->flags.
*/
enum {
/* __show_stats() ignores connections without this flag */
CONN_F_STARTED = 1,
CONN_F_DISCONNECTING = 2,
CONN_F_DISCONNECTED = 4,
/* set by close_cd() together with CONN_F_END_TIME */
CONN_F_PENDING_SHUTDOWN = 8,
/* stats were already printed, don't print them again */
CONN_F_STATS_SHOWN = 16,
/* conn->end_time holds a valid timestamp */
CONN_F_END_TIME = 32,
/* __show_stats() skips connections with this flag */
CONN_F_REAPED = 64,
};
/*
* buffer ring belonging to a connection
*/
struct conn_buf_ring {
/* registered ring, NULL when not set up (see free_buffer_ring()) */
struct io_uring_buf_ring *br;
/* backing memory of buf_size * nr_bufs bytes; only the receive ring allocates it */
void *buf;
/* buffer group ID the ring is registered under */
int bgid;
};
/*
* State for one accepted client. Entries live in the static conns[] array
* and each one gets its own thread and its own ring (see handle_accept()).
*/
struct conn {
struct io_uring ring;
/* receive side buffer ring, new data arrives here */
struct conn_buf_ring in_br;
/* if send_ring is used, outgoing data to send */
struct conn_buf_ring out_br;
/* index of this connection in conns[] */
int tid;
/* accepted descriptor and the descriptor of the outgoing socket, -1 if unset */
int in_fd, out_fd;
/* incremented for every cancel queued by queue_cancel() */
int pending_cancels;
/* CONN_F_* bits */
int flags;
/* cd[0] is used for in_fd, cd[1] for any other fd (see cqe_to_conn_dir()) */
struct conn_dir cd[2];
struct timeval start_time, end_time;
/* connect target, filled in by handle_sock() */
union {
struct sockaddr_in addr;
struct sockaddr_in6 addr6;
};
pthread_t thread;
/* two party barrier the accepting thread waits on after starting 'thread' */
pthread_barrier_t startup_barrier;
};
/* upper bound on accepted connections, handle_accept() fails beyond this */
#define MAX_CONNS 1024
/* connection table, indexed by conn->tid */
static struct conn conns[MAX_CONNS];
/* printf() that only produces output when 'verbose' is set */
#define vlog(str, ...) do { \
if (verbose) \
printf(str, ##__VA_ARGS__); \
} while (0)
/* forward declarations, both are used before they are defined */
static int prep_next_send(struct io_uring *ring, struct conn *c,
struct conn_dir *cd, int fd);
static void *thread_main(void *data);
/*
 * Map a completion back to its connection: the thread ID stored in
 * cqe->user_data (masked with TID_MASK) is the index into conns[].
 */
static struct conn *cqe_to_conn(struct io_uring_cqe *cqe)
{
	struct userdata ud = { .val = cqe->user_data };
	int tid = ud.op_tid & TID_MASK;

	return conns + tid;
}
/*
 * Pick the direction a completion belongs to: cd[0] if the request was
 * issued on in_fd, cd[1] for any other descriptor.
 */
static struct conn_dir *cqe_to_conn_dir(struct conn *c,
					struct io_uring_cqe *cqe)
{
	int idx = cqe_to_fd(cqe) != c->in_fd;

	return c->cd + idx;
}
/*
 * Return the descriptor on the opposite side of 'fd': out_fd if 'fd' is
 * in_fd, in_fd for anything else.
 */
static int other_dir_fd(struct conn *c, int fd)
{
	return fd == c->in_fd ? c->out_fd : c->in_fd;
}
/* msg_vec currently selected by imsg->vec_index */
static struct msg_vec *msg_vec(struct io_msg *imsg)
{
	return imsg->vecs + imsg->vec_index;
}
/* currently active msg_vec on the send side of this direction */
static struct msg_vec *snd_msg_vec(struct conn_dir *cd)
{
	struct io_msg *snd = &cd->io_snd_msg;

	return msg_vec(snd);
}
/*
* Goes from accept new connection -> create socket, connect to end
* point, prepare recv, on receive do send (unless sink). If either ends
* disconnects, we transition to shutdown and then close.
*
* These are the operation codes stored in user_data via encode_userdata().
* The error_handlers[] table lists its entries in the same order, with
* slot 0 unused, so keep the two in sync when adding an operation.
*/
enum {
__ACCEPT = 1,
__SOCK = 2,
__CONNECT = 3,
__RECV = 4,
__RECVMSG = 5,
__SEND = 6,
__SENDMSG = 7,
__SHUTDOWN = 8,
__CANCEL = 9,
__CLOSE = 10,
/* posted to a connection's ring by handle_accept() to pass the accepted fd */
__FD_PASS = 11,
/* used as user_data on the sending side of msg_ring requests */
__NOP = 12,
/* posted to a connection's ring by __close_conn() */
__STOP = 13,
};
/*
* Per-operation error handling entry: a printable name for the operation
* and an optional callback that receives the handler entry itself, the ring
* and the failed CQE.
*/
struct error_handler {
const char *name;
int (*error_fn)(struct error_handler *, struct io_uring *, struct io_uring_cqe *);
};
/* defined further down, needed by the error_handlers[] initializer */
static int recv_error(struct error_handler *err, struct io_uring *ring,
struct io_uring_cqe *cqe);
static int send_error(struct error_handler *err, struct io_uring *ring,
struct io_uring_cqe *cqe);
static int default_error(struct error_handler *err,
struct io_uring __attribute__((__unused__)) *ring,
struct io_uring_cqe *cqe)
{
struct conn *c = cqe_to_conn(cqe);
fprintf(stderr, "%d: %s error %s\n", c->tid, err->name, strerror(-cqe->res));
fprintf(stderr, "fd=%d, bid=%d\n", cqe_to_fd(cqe), cqe_to_bid(cqe));
return 1;
}
/*
* Move error handling out of the normal handling path, cleanly separating
* them. If an opcode doesn't need any error handling, set it to NULL. If
* it wants to stop the connection at that point and not do anything else,
* then the default handler can be used. Only receive has proper error
* handling, as we can get -ENOBUFS which is not a fatal condition. It just
* means we need to wait on buffer replenishing before re-arming the receive.
*/
static struct error_handler error_handlers[] = {
{ .name = "NULL", .error_fn = NULL, },
{ .name = "ACCEPT", .error_fn = default_error, },
{ .name = "SOCK", .error_fn = default_error, },
{ .name = "CONNECT", .error_fn = default_error, },
{ .name = "RECV", .error_fn = recv_error, },
{ .name = "RECVMSG", .error_fn = recv_error, },
{ .name = "SEND", .error_fn = send_error, },
{ .name = "SENDMSG", .error_fn = send_error, },
{ .name = "SHUTDOWN", .error_fn = NULL, },
{ .name = "CANCEL", .error_fn = NULL, },
{ .name = "CLOSE", .error_fn = NULL, },
{ .name = "FD_PASS", .error_fn = default_error, },
{ .name = "NOP", .error_fn = NULL, },
{ .name = "STOP", .error_fn = default_error, },
};
/*
 * Unregister and free one buffer ring and release its backing memory.
 * Does nothing if the ring was never set up (or was already freed).
 */
static void free_buffer_ring(struct io_uring *ring, struct conn_buf_ring *cbr)
{
	if (cbr->br == NULL)
		return;

	io_uring_free_buf_ring(ring, cbr->br, nr_bufs, cbr->bgid);
	cbr->br = NULL;

	/* backing memory is either a heap allocation or a huge page mapping */
	if (!use_huge) {
		free(cbr->buf);
		return;
	}
	munmap(cbr->buf, buf_size * nr_bufs);
}
/* Tear down the receive ring first, then the send ring, of a connection. */
static void free_buffer_rings(struct io_uring *ring, struct conn *c)
{
	struct conn_buf_ring *rings[] = { &c->in_br, &c->out_br };
	int i;

	for (i = 0; i < 2; i++)
		free_buffer_ring(ring, rings[i]);
}
/*
 * Setup a ring provided buffer ring for each connection. If we get -ENOBUFS
 * on receive, for multishot receive we'll wait for half the provided buffers
 * to be returned by pending sends, then re-arm the multishot receive. If
 * this happens too frequently (see enobufs= stat), then the ring size is
 * likely too small. Use -nXX to make it bigger. See recv_enobufs().
 *
 * The alternative here would be to use the older style provided buffers,
 * where you simply setup a buffer group and use SQEs with
 * io_uring_prep_provide_buffers() to add to the pool. But that approach is
 * slower and has been deprecated by using the faster ring provided buffers.
 *
 * Allocates buf_size * nr_bufs bytes of backing memory (huge page mapping if
 * use_huge is set, page aligned heap memory otherwise), registers the ring
 * under c->in_br.bgid and hands all nr_bufs buffers to the kernel.
 *
 * Returns 0 on success, 1 on failure. On failure nothing stays allocated:
 * c->in_br.br and c->in_br.buf are both NULL.
 */
static int setup_recv_ring(struct io_uring *ring, struct conn *c)
{
	struct conn_buf_ring *cbr = &c->in_br;
	/* do the multiplication in size_t so large configurations can't overflow int */
	size_t len = (size_t) buf_size * nr_bufs;
	int br_flags = 0;
	int ret, i;
	void *ptr;

	if (use_huge) {
		cbr->buf = mmap(NULL, len, PROT_READ|PROT_WRITE,
				MAP_PRIVATE|MAP_HUGETLB|MAP_HUGE_2MB|MAP_ANONYMOUS,
				-1, 0);
		if (cbr->buf == MAP_FAILED) {
			perror("mmap");
			cbr->buf = NULL;
			return 1;
		}
	} else {
		if (posix_memalign(&cbr->buf, page_size, len)) {
			perror("posix memalign");
			cbr->buf = NULL;
			return 1;
		}
	}

	if (buf_ring_inc)
		br_flags = IOU_PBUF_RING_INC;

	cbr->br = io_uring_setup_buf_ring(ring, nr_bufs, cbr->bgid, br_flags, &ret);
	if (!cbr->br) {
		fprintf(stderr, "Buffer ring register failed %d\n", ret);
		/*
		 * free_buffer_ring() bails out early when ->br is NULL, so the
		 * backing memory has to be released here or it is leaked.
		 */
		if (use_huge)
			munmap(cbr->buf, len);
		else
			free(cbr->buf);
		cbr->buf = NULL;
		return 1;
	}

	/* buffer ID 'i' covers bytes [i * buf_size, (i + 1) * buf_size) */
	ptr = cbr->buf;
	for (i = 0; i < nr_bufs; i++) {
		vlog("%d: add bid %d, data %p\n", c->tid, i, ptr);
		io_uring_buf_ring_add(cbr->br, ptr, buf_size, i, br_mask, i);
		ptr += buf_size;
	}
	/* publish all of the added buffers to the kernel in one go */
	io_uring_buf_ring_advance(cbr->br, nr_bufs);
	printf("%d: recv buffer ring bgid %d, bufs %d\n", c->tid, cbr->bgid, nr_bufs);
	return 0;
}
/*
 * With 'send_ring' (and kernel support for it), outgoing data is queued in
 * a provided buffer ring and sent in the order it was added. That removes
 * the need to serialize sends by hand and simplifies send handling, as
 * buffers can simply be appended to the outgoing ring at any time.
 *
 * Only the ring itself is registered here, it starts out empty. Returns 0
 * on success and 1 if the registration failed.
 */
static int setup_send_ring(struct io_uring *ring, struct conn *c)
{
	struct conn_buf_ring *cbr = &c->out_br;
	int br_flags = buf_ring_inc ? IOU_PBUF_RING_INC : 0;
	int err;

	cbr->br = io_uring_setup_buf_ring(ring, nr_bufs, cbr->bgid, br_flags, &err);
	if (cbr->br) {
		printf("%d: send buffer ring bgid %d, bufs %d\n", c->tid, cbr->bgid, nr_bufs);
		return 0;
	}

	fprintf(stderr, "Buffer ring register failed %d\n", err);
	return 1;
}
/*
 * Register the receive buffers as fixed buffers so that zero-copy send can
 * reference them by index: registered buffer 'i' is the i'th buf_size sized
 * slice of c->in_br.buf, which matches the buffer IDs of the receive ring.
 * Nothing is registered when sendmsg is used (snd_msg).
 *
 * Returns 0 on success, non-zero on failure (1 if the temporary iovec array
 * could not be allocated, otherwise the io_uring_register_buffers() error).
 */
static int setup_send_zc(struct io_uring *ring, struct conn *c)
{
	struct iovec *iovs;
	void *buf;
	int i, ret;

	if (snd_msg)
		return 0;

	iovs = calloc(nr_bufs, sizeof(*iovs));
	if (!iovs) {
		fprintf(stderr, "failed allocating iovecs for buffer registration\n");
		return 1;
	}

	buf = c->in_br.buf;
	for (i = 0; i < nr_bufs; i++) {
		iovs[i].iov_base = buf;
		iovs[i].iov_len = buf_size;
		buf += buf_size;
	}

	ret = io_uring_register_buffers(ring, iovs, nr_bufs);
	if (ret)
		fprintf(stderr, "failed registering buffers: %d\n", ret);

	/* the iovec array is only needed for the registration call itself */
	free(iovs);
	return ret;
}
/*
 * Setup an input and output buffer ring.
 *
 * Two consecutive buffer group IDs are reserved for the connection. The
 * receive ring is always created. Unless we're a sink, the receive buffers
 * are additionally registered for zero-copy send (snd_zc) and the send ring
 * is created (send_ring).
 *
 * Returns 0 on success, non-zero on failure. If anything fails after the
 * receive ring was created, the receive ring is torn down again so that no
 * failure path leaks it.
 */
static int setup_buffer_rings(struct io_uring *ring, struct conn *c)
{
	int ret;

	/* no locking needed on cur_bgid, parent serializes setup */
	c->in_br.bgid = cur_bgid++;
	c->out_br.bgid = cur_bgid++;
	c->out_br.br = NULL;

	ret = setup_recv_ring(ring, c);
	if (ret)
		return ret;
	if (is_sink)
		return 0;

	if (snd_zc) {
		ret = setup_send_zc(ring, c);
		if (ret)
			goto err_free_recv;
	}
	if (send_ring) {
		ret = setup_send_ring(ring, c);
		if (ret)
			goto err_free_recv;
	}
	return 0;

err_free_recv:
	free_buffer_ring(ring, &c->in_br);
	return ret;
}
/*
 * One histogram entry: 'count' completions handled 'nr_packets' buffers.
 */
struct bucket_stat {
	int nr_packets;
	int count;
};

/*
 * qsort() comparator ordering bucket_stat entries by descending count:
 * returns 1 if p1 has the smaller count, -1 if it has the larger count and
 * 0 if both counts are equal.
 */
static int stat_cmp(const void *p1, const void *p2)
{
	const struct bucket_stat *lhs = p1;
	const struct bucket_stat *rhs = p2;

	return (lhs->count < rhs->count) - (lhs->count > rhs->count);
}
/*
 * Print the "packets per recv/send" histogram for one direction. Bucket 'i'
 * counts the completions that handled 'i' buffers, so there are nr_bufs + 1
 * buckets per histogram. The receive and send histograms are sorted
 * independently by descending count and printed side by side.
 *
 * Does nothing if the buckets were never allocated (ext_stat not set), if
 * the temporary arrays cannot be allocated, or if both histograms are empty.
 */
static void show_buckets(struct conn_dir *cd)
{
	unsigned long snd_total, rcv_total;
	struct bucket_stat *rstat, *sstat;
	int i;

	if (!cd->rcv_bucket || !cd->snd_bucket)
		return;

	rstat = calloc(nr_bufs + 1, sizeof(struct bucket_stat));
	sstat = calloc(nr_bufs + 1, sizeof(struct bucket_stat));
	if (!rstat || !sstat)
		goto out;

	snd_total = rcv_total = 0;
	for (i = 0; i <= nr_bufs; i++) {
		snd_total += cd->snd_bucket[i];
		sstat[i].nr_packets = i;
		sstat[i].count = cd->snd_bucket[i];
		rcv_total += cd->rcv_bucket[i];
		rstat[i].nr_packets = i;
		rstat[i].count = cd->rcv_bucket[i];
	}

	/* nothing recorded: must bail out here, the arrays are freed below */
	if (!snd_total && !rcv_total)
		goto out;

	/* sort all nr_bufs + 1 entries, the last bucket is a valid one too */
	if (snd_total)
		qsort(sstat, nr_bufs + 1, sizeof(struct bucket_stat), stat_cmp);
	if (rcv_total)
		qsort(rstat, nr_bufs + 1, sizeof(struct bucket_stat), stat_cmp);

	printf("\t Packets per recv/send:\n");
	for (i = 0; i <= nr_bufs; i++) {
		double snd_prc = 0.0, rcv_prc = 0.0;

		if (!rstat[i].count && !sstat[i].count)
			continue;
		if (rstat[i].count)
			rcv_prc = 100.0 * (rstat[i].count / (double) rcv_total);
		if (sstat[i].count)
			snd_prc = 100.0 * (sstat[i].count / (double) snd_total);
		printf("\t bucket(%3d/%3d): rcv=%u (%.2f%%) snd=%u (%.2f%%)\n",
				rstat[i].nr_packets, sstat[i].nr_packets,
				rstat[i].count, rcv_prc,
				sstat[i].count, snd_prc);
	}
out:
	free(sstat);
	free(rstat);
}
/*
 * Print the statistics of one connection, at most once. Connections that
 * were never started, already had their stats shown, were reaped, or did
 * not complete a single send or receive are skipped. If no end time was
 * recorded yet, "now" is used as the end time.
 */
static void __show_stats(struct conn *c)
{
	unsigned long elapsed_ms, ops = 0, total_bytes = 0;
	int dir;

	if (c->flags & (CONN_F_STATS_SHOWN | CONN_F_REAPED))
		return;
	if (!(c->flags & CONN_F_STARTED))
		return;
	if (!(c->flags & CONN_F_END_TIME))
		gettimeofday(&c->end_time, NULL);

	elapsed_ms = (c->end_time.tv_sec - c->start_time.tv_sec) * 1000;
	elapsed_ms += (c->end_time.tv_usec - c->start_time.tv_usec) / 1000;

	/* total completed requests, turned into a per-second rate if possible */
	for (dir = 0; dir < 2; dir++)
		ops += c->cd[dir].rcv + c->cd[dir].snd;
	if (!ops)
		return;
	if (elapsed_ms)
		ops = (ops * 1000) / elapsed_ms;

	printf("Conn %d/(in_fd=%d, out_fd=%d): qps=%lu, msec=%lu\n", c->tid,
		c->in_fd, c->out_fd, ops, elapsed_ms);

	for (dir = 0; dir < 2; dir++) {
		struct conn_dir *cd = &c->cd[dir];
		int unused = !cd->in_bytes && !cd->out_bytes &&
			     !cd->snd && !cd->rcv;

		/* a direction that never saw traffic isn't worth printing */
		if (unused)
			continue;

		total_bytes += cd->in_bytes;
		total_bytes += cd->out_bytes;
		printf("\t%3d: rcv=%u (short=%u, enobufs=%d), snd=%u (short=%u,"
			" busy=%u, enobufs=%d)\n", dir, cd->rcv, cd->rcv_shrt,
			cd->rcv_enobufs, cd->snd, cd->snd_shrt, cd->snd_busy,
			cd->snd_enobufs);
		printf("\t : in_bytes=%lu (Kb %lu), out_bytes=%lu (Kb %lu)\n",
			cd->in_bytes, cd->in_bytes >> 10,
			cd->out_bytes, cd->out_bytes >> 10);
		printf("\t : mshot_rcv=%d, mshot_snd=%d\n", cd->rcv_mshot,
			cd->snd_mshot);
		show_buckets(cd);
	}

	if (elapsed_ms) {
		/* bytes -> bits, then bits per msec / 1000 gives Mbit per second */
		unsigned long mbit = total_bytes * 8UL / 1000 / elapsed_ms;

		printf("\tBW=%'luMbit\n", mbit);
	}

	c->flags |= CONN_F_STATS_SHOWN;
}
static void show_stats(void)
{
float events_per_loop = 0.0;
static int stats_shown;
int i;
if (stats_shown)
return;
if (events)
events_per_loop = (float) events / (float) event_loops;
printf("Event loops: %lu, events %lu, events per loop %.2f\n", event_loops,
events, events_per_loop);
for (i = 0; i < MAX_CONNS; i++) {
struct conn *c = &conns[i];
__show_stats(c);
}
stats_shown = 1;
}
/*
* Signal handler (judging by the name, for SIGINT - the registration is not
* in this part of the file): print a newline and the statistics, then exit
* with status 1.
*
* NOTE(review): printf() and exit() are not async-signal-safe. That is
* presumably acceptable for an example program, but don't copy this pattern
* into production code.
*/
static void sig_int(int __attribute__((__unused__)) sig)
{
printf("\n");
show_stats();
exit(1);
}
/*
 * Get a fresh SQE, and don't return until we have one. Normally the first
 * attempt succeeds. If the SQ ring is full:
 *
 * - With SQPOLL we don't control when SQEs are consumed, so wait for the
 *   SQPOLL thread to catch up and free space in the ring.
 *
 * - Without SQPOLL this can still happen if the SQ ring is grossly
 *   undersized and we're handling a lot of CQEs in one event loop iteration
 *   without multishot. Flush what we have with io_uring_submit().
 *
 * The caveat is linked requests: a link chain is only valid within a single
 * submission cycle, so the SQEs of a chain must be allocated upfront.
 */
static struct io_uring_sqe *get_sqe(struct io_uring *ring)
{
	for (;;) {
		struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

		if (sqe)
			return sqe;
		if (sqpoll)
			io_uring_sqring_wait(ring);
		else
			io_uring_submit(ring);
	}
}
/*
* See __encode_userdata() for how we encode sqe->user_data, which is passed
* back as cqe->user_data at completion time.
*
* Convenience wrapper that takes the thread ID from the connection: 'op' is
* one of the __ACCEPT .. __STOP operation codes, 'bid' a buffer ID and 'fd'
* the descriptor the request operates on.
*/
static void encode_userdata(struct io_uring_sqe *sqe, struct conn *c, int op,
int bid, int fd)
{
__encode_userdata(sqe, c->tid, op, bid, fd);
}
/*
* Queue a receive on 'fd' for direction 'cd'. Only prepares the SQE, it does
* not submit it. Asserts that no receive is pending for this direction and
* marks one as pending.
*
* Which request is prepared depends on the global settings: recvmsg vs recv
* (rcv_msg), multishot vs single shot (recv_mshot), optionally as a bundle
* (rcv_bundle). Buffers always come from the connection's receive ring.
*/
static void __submit_receive(struct io_uring *ring, struct conn *c,
struct conn_dir *cd, int fd)
{
struct conn_buf_ring *cbr = &c->in_br;
struct io_uring_sqe *sqe;
vlog("%d: submit receive fd=%d\n", c->tid, fd);
assert(!cd->pending_recv);
cd->pending_recv = 1;
/*
* For both recv and multishot receive, we use the ring provided
* buffers. These are handed to the application ahead of time, and
* are consumed when a receive triggers. Note that the address and
* length of the receive are set to NULL/0, and we assign the
* sqe->buf_group to tell the kernel which buffer group ID to pick
* a buffer from. Finally, IOSQE_BUFFER_SELECT is set to tell the
* kernel that we want a buffer picked for this request, we are not
* passing one in with the request.
*/
sqe = get_sqe(ring);
if (rcv_msg) {
struct io_msg *imsg = &cd->io_rcv_msg;
struct msghdr *msg = &imsg->msg;
/* start from a clean msghdr that points at the active receive vec */
memset(msg, 0, sizeof(*msg));
msg->msg_iov = msg_vec(imsg)->iov;
msg->msg_iovlen = msg_vec(imsg)->iov_len;
if (recv_mshot) {
cd->rcv_mshot++;
io_uring_prep_recvmsg_multishot(sqe, fd, &imsg->msg, 0);
} else {
io_uring_prep_recvmsg(sqe, fd, &imsg->msg, 0);
}
} else {
if (recv_mshot) {
cd->rcv_mshot++;
io_uring_prep_recv_multishot(sqe, fd, NULL, 0, 0);
} else {
io_uring_prep_recv(sqe, fd, NULL, 0, 0);
}
}
/* the completion is tagged __RECV for the recvmsg variants as well */
encode_userdata(sqe, c, __RECV, 0, fd);
/* flags are OR'ed in after the prep call, they are not passed to it */
sqe->buf_group = cbr->bgid;
sqe->flags |= IOSQE_BUFFER_SELECT;
if (fixed_files)
sqe->flags |= IOSQE_FIXED_FILE;
if (rcv_bundle)
sqe->ioprio |= IORING_RECVSEND_BUNDLE;
}
/*
* One directional just arms receive on our in_fd, using cd[0] for its state.
*/
static void submit_receive(struct io_uring *ring, struct conn *c)
{
__submit_receive(ring, c, &c->cd[0], c->in_fd);
}
/*
* Bi-directional arms receive on both in and out fd: cd[0] tracks in_fd and
* cd[1] tracks out_fd.
*/
static void submit_bidi_receive(struct io_uring *ring, struct conn *c)
{
__submit_receive(ring, c, &c->cd[0], c->in_fd);
__submit_receive(ring, c, &c->cd[1], c->out_fd);
}
/*
 * We hit -ENOBUFS, which means that we ran out of buffers in our current
 * provided buffer group. This can happen if there's an imbalance between the
 * receives coming in and the sends being processed, particularly with
 * multishot receive as they can trigger very quickly.
 *
 * As a sink, or when a send is already pending for 'cd', the receive on
 * in_fd is re-armed right away. Otherwise we try to get a send going first:
 * if prep_next_send() reports that a send completion is now expected, the
 * receive is not re-armed here, else it is re-armed immediately.
 */
static void recv_enobufs(struct io_uring *ring, struct conn *c,
			 struct conn_dir *cd, int fd)
{
	int rearm = 1;

	vlog("%d: enobufs hit\n", c->tid);
	cd->rcv_enobufs++;

	if (!is_sink && !cd->pending_send)
		rearm = !prep_next_send(ring, c, cd, fd);

	if (rearm)
		__submit_receive(ring, c, &c->cd[0], c->in_fd);
}
/*
 * Kill this socket - queue a shutdown with a close linked to it. We don't
 * care about the shutdown status, so it is marked as not posting a CQE
 * unless it fails. Only prepares the two SQEs, it does not submit them.
 */
static void queue_shutdown_close(struct io_uring *ring, struct conn *c, int fd)
{
	struct io_uring_sqe *shut_sqe, *close_sqe;

	/*
	 * Grab both SQEs before preparing either of them. If get_sqe() had
	 * to submit in between to free up space, the link would be broken,
	 * as links are not valid across separate submissions.
	 */
	shut_sqe = get_sqe(ring);
	close_sqe = get_sqe(ring);

	io_uring_prep_shutdown(shut_sqe, fd, SHUT_RDWR);
	shut_sqe->flags |= IOSQE_IO_LINK | IOSQE_CQE_SKIP_SUCCESS;
	if (fixed_files)
		shut_sqe->flags |= IOSQE_FIXED_FILE;
	encode_userdata(shut_sqe, c, __SHUTDOWN, 0, fd);

	/* a direct descriptor needs the direct variant of close */
	if (!fixed_files)
		io_uring_prep_close(close_sqe, fd);
	else
		io_uring_prep_close_direct(close_sqe, fd);
	encode_userdata(close_sqe, c, __CLOSE, 0, fd);
}
/*
 * This connection is going away, cancel whatever we still have pending on
 * it (a pending recv, for example). A cancel is always issued for in_fd,
 * and for out_fd too if there is one. Every queued cancel is counted in
 * c->pending_cancels, and the requests are submitted before returning.
 */
static void queue_cancel(struct io_uring *ring, struct conn *c)
{
	int flags = fixed_files ? IORING_ASYNC_CANCEL_FD_FIXED : 0;
	int fds[2] = { c->in_fd, c->out_fd };
	int i;

	for (i = 0; i < 2; i++) {
		struct io_uring_sqe *sqe;

		/* only out_fd is optional, in_fd is cancelled unconditionally */
		if (i == 1 && fds[i] == -1)
			continue;

		sqe = get_sqe(ring);
		io_uring_prep_cancel_fd(sqe, fds[i], flags);
		encode_userdata(sqe, c, __CANCEL, 0, fds[i]);
		c->pending_cancels++;
	}

	io_uring_submit(ring);
}
/* Number of directions (0, 1 or 2) that have a shutdown pending. */
static int pending_shutdown(struct conn *c)
{
	int total = 0;
	int i;

	for (i = 0; i < 2; i++)
		total += c->cd[i].pending_shutdown;
	return total;
}
/*
 * Decide whether a connection is ready to be shut down.
 *
 * Never, unless at least one direction has a shutdown pending. A sink can
 * then go away immediately. A one directional proxy waits until everything
 * received on cd[0] has been sent out on cd[1] (compared by bytes). A
 * bi-directional proxy waits until each side has sent as many times as the
 * other side has received (compared by number of requests).
 */
static bool should_shutdown(struct conn *c)
{
	if (!pending_shutdown(c))
		return false;
	if (is_sink)
		return true;
	if (!bidi)
		return c->cd[0].in_bytes == c->cd[1].out_bytes;

	return c->cd[0].rcv == c->cd[1].snd &&
	       c->cd[1].rcv == c->cd[0].snd;
}
/*
 * Close this connection - send a ring message to the connection with intent
 * to stop. When the client gets the message, it will initiate the stop.
 *
 * The message is posted to the connection's own ring (c->ring) and carries
 * a __STOP user_data. The msg_ring request itself is queued on 'ring',
 * tagged __NOP, and submitted right away.
 */
static void __close_conn(struct io_uring *ring, struct conn *c)
{
	struct io_uring_sqe *sqe;
	uint64_t user_data;

	printf("Client %d: queueing stop\n", c->tid);

	user_data = __raw_encode(c->tid, __STOP, 0, 0);
	/*
	 * io_uring_get_sqe() returns NULL when the SQ ring is full, use the
	 * helper that keeps trying until an SQE is available.
	 */
	sqe = get_sqe(ring);
	io_uring_prep_msg_ring(sqe, c->ring.ring_fd, 0, user_data, 0);
	encode_userdata(sqe, c, __NOP, 0, 0);
	io_uring_submit(ring);
}
/*
 * Mark one direction as needing a shutdown. Unless a send is still pending
 * for it, the connection is also flagged as pending shutdown and its end
 * time is recorded, both only the first time around.
 */
static void close_cd(struct conn *c, struct conn_dir *cd)
{
	cd->pending_shutdown = 1;

	if (cd->pending_send || (c->flags & CONN_F_PENDING_SHUTDOWN))
		return;

	gettimeofday(&c->end_time, NULL);
	c->flags |= CONN_F_PENDING_SHUTDOWN | CONN_F_END_TIME;
}
/*
 * We're done with buffer 'bid', add it back to the ring at 'offset' from the
 * current tail so the kernel is free to use it again. The caller makes the
 * addition visible with io_uring_buf_ring_advance(). Returns the size of
 * the buffer that was added, which is always buf_size.
 */
static int replenish_buffer(struct conn_buf_ring *cbr, int bid, int offset)
{
	void *addr;

	assert(bid < nr_bufs);

	/* buffer IDs index buf_size sized slices of the backing memory */
	addr = cbr->buf + bid * buf_size;
	io_uring_buf_ring_add(cbr->br, addr, buf_size, bid, br_mask, offset);
	return buf_size;
}
/*
 * Starting at '*bid', hand back as many buffers as it takes to cover
 * 'bytes' bytes to our receive ring, so they can be reused for new
 * receives. '*bid' is advanced past the returned buffers, wrapping around
 * at nr_bufs. Returns the number of buffers that were added.
 */
static int replenish_buffers(struct conn *c, int *bid, int bytes)
{
	struct conn_buf_ring *cbr = &c->in_br;
	int added;

	for (added = 0; bytes; added++) {
		int len = replenish_buffer(cbr, *bid, added);

		/* the last buffer may only have been partially filled */
		bytes -= len > bytes ? bytes : len;
		*bid = (*bid + 1) & (nr_bufs - 1);
	}

	io_uring_buf_ring_advance(cbr->br, added);
	return added;
}
/*
* Release the iovec array of a msg_vec. The pointer is cleared afterwards,
* which makes calling this twice on the same msg_vec harmless.
*/
static void free_mvec(struct msg_vec *mvec)
{
free(mvec->iov);
mvec->iov = NULL;
}
/*
* Reset a msg_vec and give it room for a single iovec. send_append_vec()
* grows the array on demand.
*
* NOTE(review): the malloc() result is not checked here.
*/
static void init_mvec(struct msg_vec *mvec)
{
memset(mvec, 0, sizeof(*mvec));
mvec->iov = malloc(sizeof(struct iovec));
mvec->vec_size = 1;
}
static void init_msgs(struct conn_dir *cd)
{
memset(&cd->io_snd_msg, 0, sizeof(cd->io_snd_msg));
memset(&cd->io_rcv_msg, 0, sizeof(cd->io_rcv_msg));
init_mvec(&cd->io_snd_msg.vecs[0]);
init_mvec(&cd->io_snd_msg.vecs[1]);
init_mvec(&cd->io_rcv_msg.vecs[0]);
}
/* Free the vecs that init_msgs() allocated for this direction. */
static void free_msgs(struct conn_dir *cd)
{
	int i;

	for (i = 0; i < 2; i++)
		free_mvec(&cd->io_snd_msg.vecs[i]);
	free_mvec(&cd->io_rcv_msg.vecs[0]);
}
/*
 * Multishot accept completion triggered. Claim the next slot in conns[],
 * initialize it, and start a thread for the new connection. Once that
 * thread has reached the startup barrier (its ring is set up by then), hand
 * it the accepted descriptor: directly through c->in_fd for normal file
 * descriptors, or by passing the fixed descriptor to slot 0 of the
 * connection's ring with a msg_ring request. That request is only prepared
 * on 'ring' here, not submitted.
 *
 * Returns 0 on success, 1 if the connection limit was reached or the thread
 * could not be started.
 */
static int handle_accept(struct io_uring *ring, struct io_uring_cqe *cqe)
{
	struct conn *c;
	int i, ret;

	if (nr_conns == MAX_CONNS) {
		fprintf(stderr, "max clients reached %d\n", nr_conns);
		return 1;
	}

	/* main thread handles this, which is obviously serialized */
	c = &conns[nr_conns];
	c->tid = nr_conns++;
	c->in_fd = -1;
	c->out_fd = -1;

	for (i = 0; i < 2; i++) {
		struct conn_dir *cd = &c->cd[i];

		cd->index = i;
		cd->snd_next_bid = -1;
		cd->rcv_next_bid = -1;
		/* users of the buckets cope with a failed allocation (NULL) */
		if (ext_stat) {
			cd->rcv_bucket = calloc(nr_bufs + 1, sizeof(int));
			cd->snd_bucket = calloc(nr_bufs + 1, sizeof(int));
		}
		init_msgs(cd);
	}

	printf("New client: id=%d, in=%d\n", c->tid, c->in_fd);
	gettimeofday(&c->start_time, NULL);

	ret = pthread_barrier_init(&c->startup_barrier, NULL, 2);
	if (ret) {
		fprintf(stderr, "pthread_barrier_init: %s\n", strerror(ret));
		return 1;
	}
	/*
	 * If the thread can't be started, nobody will ever show up at the
	 * barrier and waiting on it below would hang forever.
	 */
	ret = pthread_create(&c->thread, NULL, thread_main, c);
	if (ret) {
		fprintf(stderr, "pthread_create: %s\n", strerror(ret));
		return 1;
	}

	/*
	 * Wait for thread to have its ring setup, then either assign the fd
	 * if it's non-fixed, or pass the fixed one
	 */
	pthread_barrier_wait(&c->startup_barrier);
	if (!fixed_files) {
		c->in_fd = cqe->res;
	} else {
		struct io_uring_sqe *sqe;
		uint64_t user_data;

		/*
		 * Ring has just been setup, we'll use index 0 as the descriptor
		 * value.
		 */
		user_data = __raw_encode(c->tid, __FD_PASS, 0, 0);
		/* io_uring_get_sqe() may return NULL, get_sqe() never does */
		sqe = get_sqe(ring);
		io_uring_prep_msg_ring_fd(sqe, c->ring.ring_fd, cqe->res, 0,
					  user_data, 0);
		encode_userdata(sqe, c, __NOP, 0, cqe->res);
	}

	return 0;
}
/*
 * Our socket request completed, issue a connect request to the other end.
 * The new socket becomes out_fd, and the destination is built from 'host'
 * and 'send_port' as an IPv6 or IPv4 address depending on 'ipv6'. Only
 * prepares the connect, it does not submit it.
 *
 * Returns 0 on success, 1 if 'host' could not be parsed.
 */
static int handle_sock(struct io_uring *ring, struct io_uring_cqe *cqe)
{
	struct conn *c = cqe_to_conn(cqe);
	struct io_uring_sqe *sqe;
	struct sockaddr *dest;
	socklen_t dest_len;
	int ret;

	vlog("%d: sock: res=%d\n", c->tid, cqe->res);

	c->out_fd = cqe->res;

	if (ipv6) {
		struct sockaddr_in6 *sin6 = &c->addr6;

		memset(sin6, 0, sizeof(*sin6));
		sin6->sin6_family = AF_INET6;
		sin6->sin6_port = htons(send_port);
		ret = inet_pton(AF_INET6, host, &sin6->sin6_addr);
		dest = (struct sockaddr *) sin6;
		dest_len = sizeof(*sin6);
	} else {
		struct sockaddr_in *sin = &c->addr;

		memset(sin, 0, sizeof(*sin));
		sin->sin_family = AF_INET;
		sin->sin_port = htons(send_port);
		ret = inet_pton(AF_INET, host, &sin->sin_addr);
		dest = (struct sockaddr *) sin;
		dest_len = sizeof(*sin);
	}

	/* inet_pton(): 0 means unparsable address, negative means error */
	if (ret <= 0) {
		if (ret)
			perror("inet_pton");
		else
			fprintf(stderr, "host not in right format\n");
		return 1;
	}

	sqe = get_sqe(ring);
	io_uring_prep_connect(sqe, c->out_fd, dest, dest_len);
	encode_userdata(sqe, c, __CONNECT, 0, c->out_fd);
	if (fixed_files)
		sqe->flags |= IOSQE_FIXED_FILE;
	return 0;
}
/*
* Connection to the other end is done, submit a receive to start receiving
* data. If we're a bidirectional proxy, issue a receive on both ends. If not,
* then just a single recv will do.
*
* Also accounts the connection in open_conns, which is protected by
* thread_lock. Always returns 0.
*/
static int handle_connect(struct io_uring *ring, struct io_uring_cqe *cqe)
{
struct conn *c = cqe_to_conn(cqe);
pthread_mutex_lock(&thread_lock);
open_conns++;
pthread_mutex_unlock(&thread_lock);
if (bidi)
submit_bidi_receive(ring, c);
else
submit_receive(ring, c);
return 0;
}
/*
 * Append new segment to our currently active msg_vec. This will be submitted
 * as a sendmsg (with all of it), or as separate sends, later. If we're using
 * send_ring, then we won't hit this path. Instead, outgoing buffers are
 * added directly to our outgoing send buffer ring.
 *
 * The iovec array is doubled in size when it is full. There is no way to
 * report a failure to the caller, so running out of memory while growing it
 * terminates the program instead of writing through a NULL pointer.
 */
static void send_append_vec(struct conn_dir *cd, void *data, int len)
{
	struct msg_vec *mvec = snd_msg_vec(cd);

	if (mvec->iov_len == mvec->vec_size) {
		int new_size = mvec->vec_size << 1;
		struct iovec *iov;

		/* keep the old array reachable until realloc() has succeeded */
		iov = realloc(mvec->iov, new_size * sizeof(struct iovec));
		if (!iov) {
			perror("realloc");
			exit(1);
		}
		mvec->iov = iov;
		mvec->vec_size = new_size;
	}

	mvec->iov[mvec->iov_len].iov_base = data;
	mvec->iov[mvec->iov_len].iov_len = len;
	mvec->iov_len++;
}
/*
 * Queue 'len' bytes at 'data' (buffer ID 'bid') for sending. The data came
 * from a completed receive operation. With send_ring the buffer goes
 * straight into the outgoing buffer ring and is published immediately,
 * otherwise it is appended to the currently active send vec.
 */
static void send_append(struct conn *c, struct conn_dir *cd, void *data,
			int bid, int len)
{
	vlog("%d: send %d (%p, bid %d)\n", c->tid, len, data, bid);

	assert(bid < nr_bufs);

	if (!send_ring) {
		send_append_vec(cd, data, len);
		return;
	}

	/* if using provided buffers for send, add it upfront */
	io_uring_buf_ring_add(c->out_br.br, data, len, bid, br_mask, 0);
	io_uring_buf_ring_advance(c->out_br.br, 1);
}
/*
 * Returns 1 if 'res' is the result of a receive that marks the end of the
 * stream, 0 otherwise. Normally that's a zero sized receive. Multishot
 * recvmsg always hands us its header, so for that case a result that is
 * exactly the size of the header counts as a "zero receive" too.
 */
static int recv_done_res(int res)
{
	int hdr_only = rcv_msg && recv_mshot &&
		       res == sizeof(struct io_uring_recvmsg_out);

	return !res || hdr_only;
}
/*
 * Receive handling for incrementally consumed buffer rings (buf_ring_inc).
 * Nothing is done for an empty receive, or while the kernel flags that it
 * will keep using the buffer (IORING_CQE_F_BUF_MORE); 0 is returned then.
 *
 * Otherwise the whole buffer '*bid' is passed on: back into the receive
 * ring if we're a sink, into the send ring if send_ring is used, or onto
 * the send vec of 'cd'. '*bid' is then advanced, wrapping at nr_bufs, and
 * 1 is returned.
 */
static int recv_inc(struct conn *c, struct conn_dir *cd, int *bid,
		    struct io_uring_cqe *cqe)
{
	struct conn_buf_ring *in_cbr = &c->in_br;
	void *data;

	if (!cqe->res || (cqe->flags & IORING_CQE_F_BUF_MORE))
		return 0;

	data = in_cbr->buf + *bid * buf_size;

	if (!is_sink && !send_ring) {
		send_append(c, cd, data, *bid, buf_size);
	} else {
		/* a sink recycles the buffer, a proxy queues it for sending */
		struct conn_buf_ring *dst = is_sink ? in_cbr : &c->out_br;

		io_uring_buf_ring_add(dst->br, data, buf_size, *bid, br_mask, 0);
		io_uring_buf_ring_advance(dst->br, 1);
	}

	*bid = (*bid + 1) & (nr_bufs - 1);
	return 1;
}
/*
 * Any receive that isn't recvmsg with multishot can be handled the same way.
 * Walk the receive ring entries from '*bid' until 'in_bytes' bytes are
 * accounted for, and queue each buffer for sending: into the send ring if
 * send_ring is used (published in one go at the end), onto the send vec of
 * 'cd' otherwise. '*bid' ends up past the last buffer used, wrapping at
 * nr_bufs. Returns the number of buffers that were queued.
 */
static int recv_bids(struct conn *c, struct conn_dir *cd, int *bid, int in_bytes)
{
	struct conn_buf_ring *out_cbr = &c->out_br;
	struct conn_buf_ring *in_cbr = &c->in_br;
	int nr_packets;

	for (nr_packets = 0; in_bytes; nr_packets++) {
		struct io_uring_buf *buf = &in_cbr->br->bufs[*bid];
		void *data = (void *) (unsigned long) buf->addr;
		int this_bytes = buf->len;

		/* the last buffer may only be partially filled */
		if (this_bytes > in_bytes)
			this_bytes = in_bytes;
		in_bytes -= this_bytes;

		if (!send_ring)
			send_append(c, cd, data, *bid, this_bytes);
		else
			io_uring_buf_ring_add(out_cbr->br, data, this_bytes, *bid,
					      br_mask, nr_packets);

		*bid = (*bid + 1) & (nr_bufs - 1);
	}

	if (send_ring)
		io_uring_buf_ring_advance(out_cbr->br, nr_packets);

	return nr_packets;
}
/*
* Special handling of recvmsg with multishot
*
* Same job as recv_bids(): walk the receive ring entries from '*bid' until
* 'in_bytes' bytes are accounted for and queue the data for sending. The
* difference is that every buffer starts with a struct io_uring_recvmsg_out
* header which must not be sent. Returns the number of buffers queued and
* leaves '*bid' past the last one, wrapping at nr_bufs.
*/
static int recv_mshot_msg(struct conn *c, struct conn_dir *cd, int *bid,
int in_bytes)
{
struct conn_buf_ring *cbr = &c->out_br;
struct conn_buf_ring *in_cbr = &c->in_br;
struct io_uring_buf *buf;
int nr_packets = 0;
while (in_bytes) {
struct io_uring_recvmsg_out *pdu;
int this_bytes;
void *data;
buf = &in_cbr->br->bufs[*bid];
/*
* multishot recvmsg puts a header in front of the data - we
* have to take that into account for the send setup, and
* adjust the actual data read to not take this metadata into
* account. For this use case, namelen and controllen will not
* be set. If they were, they would need to be factored in too.
*/
/* note that this modifies the length stored in the ring entry itself */
buf->len -= sizeof(struct io_uring_recvmsg_out);
in_bytes -= sizeof(struct io_uring_recvmsg_out);
pdu = (void *) (unsigned long) buf->addr;
vlog("pdu namelen %d, controllen %d, payload %d flags %x\n",
pdu->namelen, pdu->controllen, pdu->payloadlen,
pdu->flags);
/* the payload starts right behind the header */
data = (void *) (pdu + 1);
this_bytes = pdu->payloadlen;
if (this_bytes > in_bytes)
this_bytes = in_bytes;
in_bytes -= this_bytes;
if (send_ring)
io_uring_buf_ring_add(cbr->br, data, this_bytes, *bid,
br_mask, nr_packets);
else
send_append(c, cd, data, *bid, this_bytes);
*bid = (*bid + 1) & (nr_bufs - 1);
nr_packets++;
}
/* publish everything that was added to the send ring in one go */
if (send_ring)
io_uring_buf_ring_advance(cbr->br, nr_packets);
return nr_packets;
}
/*
* Handle a receive completion for direction 'cd' of connection 'c'. 'ocd' is
* the opposite direction, which is where received data gets queued for
* sending.
*
* Returns 1 if the CQE makes no sense (a non-zero result without a buffer
* attached), 0 in all other cases, including when the connection starts
* closing because the other end went away.
*/
static int __handle_recv(struct io_uring *ring, struct conn *c,
struct conn_dir *cd, struct io_uring_cqe *cqe)
{
struct conn_dir *ocd = &c->cd[!cd->index];
int bid, nr_packets;
/*
* Not having a buffer attached should only happen if we get a zero
* sized receive, because the other end closed the connection. It
* cannot happen otherwise, as all our receives are using provided
* buffers and hence it's not possible to return a CQE with a non-zero
* result and not have a buffer attached.
*/
if (!(cqe->flags & IORING_CQE_F_BUFFER)) {
cd->pending_recv = 0;
if (!recv_done_res(cqe->res)) {
fprintf(stderr, "no buffer assigned, res=%d\n", cqe->res);
return 1;
}
/*
* Also the target of the gotos further down: try to send what is still
* queued for the other direction, then mark this direction as closing.
*/
start_close:
prep_next_send(ring, c, ocd, other_dir_fd(c, cqe_to_fd(cqe)));
close_cd(c, cd);
return 0;
}
/* anything less than a full buffer counts as a short receive */
if (cqe->res && cqe->res < buf_size)
cd->rcv_shrt++;
/* the buffer ID is stored in the upper bits of the CQE flags */
bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT;
/*
* BIDI will use the same buffer pool and do receive on both CDs,
* so can't reliably check. TODO.
*/
if (!bidi && cd->rcv_next_bid != -1 && bid != cd->rcv_next_bid) {
fprintf(stderr, "recv bid %d, wanted %d\n", bid, cd->rcv_next_bid);
goto start_close;
}
vlog("%d: recv: bid=%d, res=%d, cflags=%x\n", c->tid, bid, cqe->res, cqe->flags);
/*
* If we're a sink, we're done here. Just replenish the buffer back
* to the pool. For proxy mode, we will send the data to the other
* end and the buffer will be replenished once the send is done with
* it.
*/
/* each of these helpers advances 'bid' past the buffers it handled */
if (buf_ring_inc)
nr_packets = recv_inc(c, ocd, &bid, cqe);
else if (is_sink)
nr_packets = replenish_buffers(c, &bid, cqe->res);
else if (rcv_msg && recv_mshot)
nr_packets = recv_mshot_msg(c, ocd, &bid, cqe->res);
else
nr_packets = recv_bids(c, ocd, &bid, cqe->res);
/* histogram of buffers per receive, only kept if it was allocated */
if (cd->rcv_bucket)
cd->rcv_bucket[nr_packets]++;
if (!is_sink) {
ocd->out_buffers += nr_packets;
assert(ocd->out_buffers <= nr_bufs);
}
cd->rcv++;
/* 'bid' now holds the buffer ID the next receive is expected to start at */
cd->rcv_next_bid = bid;
/*
* If IORING_CQE_F_MORE isn't set, then this is either a normal recv
* that needs rearming, or it's a multishot that won't post any further
* completions. Setup a new one for these cases.
*/
if (!(cqe->flags & IORING_CQE_F_MORE)) {
cd->pending_recv = 0;
if (recv_done_res(cqe->res))
goto start_close;
/*
* NOTE(review): the re-arm always uses cd[0] and in_fd, even if this
* completion belongs to cd[1] - confirm that this is intended for
* bi-directional mode.
*/
if (is_sink || !ocd->pending_send)
__submit_receive(ring, c, &c->cd[0], c->in_fd);
}
/*
* Submit a send if we won't get anymore notifications from this
* recv, or if we have nr_bufs / 2 queued up. If BIDI mode, send
* every buffer. We assume this is interactive mode, and hence don't
* delay anything.
*/
if (((!ocd->pending_send && (bidi || (ocd->out_buffers >= nr_bufs / 2))) ||
!(cqe->flags & IORING_CQE_F_MORE)) && !is_sink)
prep_next_send(ring, c, ocd, other_dir_fd(c, cqe_to_fd(cqe)));
/* end-of-stream results carry no payload and are not accounted */
if (!recv_done_res(cqe->res))
cd->in_bytes += cqe->res;
return 0;
}
/*
 * Receive completion entry point: look up the connection and the
 * direction this CQE belongs to, then let __handle_recv() do the work.
 */
static int handle_recv(struct io_uring *ring, struct io_uring_cqe *cqe)
{
	struct conn *conn = cqe_to_conn(cqe);

	return __handle_recv(ring, conn, cqe_to_conn_dir(conn, cqe), cqe);
}
/*
 * Error handler for receive completions. -ENOBUFS is handed to
 * recv_enobufs(); every other error goes to the default error handler.
 */
static int recv_error(struct error_handler *err, struct io_uring *ring,
		      struct io_uring_cqe *cqe)
{
	struct conn *conn = cqe_to_conn(cqe);
	struct conn_dir *dir = cqe_to_conn_dir(conn, cqe);

	/* whatever the error was, this receive is no longer pending */
	dir->pending_recv = 0;

	if (cqe->res == -ENOBUFS) {
		recv_enobufs(ring, conn, dir, other_dir_fd(conn, cqe_to_fd(cqe)));
		return 0;
	}

	return default_error(err, ring, cqe);
}
/*
 * Queue a single send request for direction 'cd' on 'fd', unless one is
 * already pending. Which prep helper is used depends on the global send
 * mode: sendmsg (optionally zerocopy), send with a provided buffer ring,
 * plain send, or zerocopy send from a fixed buffer.
 */
static void submit_send(struct io_uring *ring, struct conn *c,
			struct conn_dir *cd, int fd, void *data, int len,
			int bid, int flags)
{
	struct io_uring_sqe *sqe;

	/* only one send in flight per direction */
	if (cd->pending_send)
		return;
	cd->pending_send = 1;

	flags |= MSG_WAITALL | MSG_NOSIGNAL;
	sqe = get_sqe(ring);

	if (snd_msg && snd_zc) {
		io_uring_prep_sendmsg_zc(sqe, fd, &cd->io_snd_msg.msg, flags);
		/* zerocopy posts an extra notification CQE */
		cd->snd_notif++;
	} else if (snd_msg) {
		io_uring_prep_sendmsg(sqe, fd, &cd->io_snd_msg.msg, flags);
	} else if (send_ring) {
		/* buffer and length are picked from the buffer group */
		io_uring_prep_send(sqe, fd, NULL, 0, flags);
	} else if (snd_zc) {
		io_uring_prep_send_zc(sqe, fd, data, len, flags, 0);
		sqe->ioprio |= IORING_RECVSEND_FIXED_BUF;
		sqe->buf_index = bid;
		cd->snd_notif++;
	} else {
		io_uring_prep_send(sqe, fd, data, len, flags);
	}

	encode_userdata(sqe, c, __SEND, bid, fd);

	if (fixed_files)
		sqe->flags |= IOSQE_FIXED_FILE;
	if (send_ring) {
		sqe->flags |= IOSQE_BUFFER_SELECT;
		sqe->buf_group = c->out_br.bgid;
	}
	if (snd_bundle)
		sqe->ioprio |= IORING_RECVSEND_BUNDLE;
	if (snd_bundle || send_ring)
		cd->snd_mshot++;
}
/*
 * Queue the next send for this direction if there is anything to send.
 * Nothing happens when a send is already pending, when we run as a sink
 * (no sends at all), or when no buffers are waiting to go out.
 *
 * Return 1 if another send completion is expected, 0 if not.
 */
static int prep_next_send(struct io_uring *ring, struct conn *c,
			  struct conn_dir *cd, int fd)
{
	struct io_msg *imsg = &cd->io_snd_msg;
	int bid;

	if (cd->pending_send || is_sink)
		return 0;
	if (!cd->out_buffers)
		return 0;

	/* no send done yet? start from buffer 0 */
	bid = cd->snd_next_bid == -1 ? 0 : cd->snd_next_bid;

	if (send_ring) {
		/*
		 * send_ring mode is easy, there's nothing to do but submit
		 * our next send request. That will empty the entire outgoing
		 * queue.
		 */
		submit_send(ring, c, cd, fd, NULL, 0, bid, 0);
		return 1;
	}

	if (snd_msg) {
		/*
		 * For sendmsg mode, submit our currently prepared iovec, if
		 * we have one, and swap our iovecs so that any further
		 * receives will start preparing that one.
		 */
		struct msg_vec *vec = msg_vec(imsg);

		if (!vec->iov_len)
			return 0;

		imsg->msg.msg_iov = vec->iov;
		imsg->msg.msg_iovlen = vec->iov_len;
		vec->iov_len = 0;
		imsg->vec_index = !imsg->vec_index;

		submit_send(ring, c, cd, fd, NULL, 0, bid, 0);
		return 1;
	} else {
		/*
		 * send without send_ring - submit the next available vec,
		 * if any. If this vec is the last one in the current series,
		 * then swap to the next vec. We flag each send with MSG_MORE,
		 * unless this is the last part of the current vec.
		 */
		struct msg_vec *vec = msg_vec(imsg);
		int flags = snd_zc ? 0 : MSG_MORE;
		struct iovec *iov;

		if (vec->cur_iov == vec->iov_len)
			return 0;

		imsg->msg.msg_iov = vec->iov;
		iov = &vec->iov[vec->cur_iov++];

		if (vec->cur_iov == vec->iov_len) {
			/* series exhausted: reset it and flip to the other vec */
			vec->iov_len = 0;
			vec->cur_iov = 0;
			imsg->vec_index = !imsg->vec_index;
			flags = 0;
		}

		submit_send(ring, c, cd, fd, iov->iov_base, iov->iov_len, bid, flags);
		return 1;
	}
}
/*
 * Send completion handling for the buf_ring_inc mode.
 *
 * A CQE with IORING_CQE_F_BUF_MORE set returns 0 straight away. Once that
 * flag is clear, the bytes of this completion are accounted and buffer
 * 'bid' is put back into the incoming buffer ring.
 *
 * Returns the number of buffers returned to the ring (0 or 1).
 *
 * NOTE(review): bytes reported by completions that carry
 * IORING_CQE_F_BUF_MORE are not added to cd->out_bytes, and that path also
 * skips the pending-shutdown check - confirm both are intended.
 */
static int handle_send_inc(struct conn *c, struct conn_dir *cd, int bid,
			   struct io_uring_cqe *cqe)
{
	struct conn_buf_ring *in_cbr = &c->in_br;
	int ret = 0;
	void *data;

	if (!cqe->res)
		goto out;
	if (cqe->flags & IORING_CQE_F_BUF_MORE)
		return 0;

	assert(cqe->res <= buf_size);
	cd->out_bytes += cqe->res;

	data = in_cbr->buf + bid * buf_size;
	io_uring_buf_ring_add(in_cbr->br, data, buf_size, bid, br_mask, 0);
	io_uring_buf_ring_advance(in_cbr->br, 1);
	/*
	 * The original advanced the local 'bid' here; the value was never
	 * read again, so that dead store has been dropped.
	 */
	ret = 1;
out:
	if (pending_shutdown(c))
		close_cd(c, cd);
	return ret;
}
/*
 * Send completion handling when an outgoing send ring is used. Walk the
 * buffers covered by 'bytes', starting at 'bid', and hand each of them
 * back to the incoming buffer ring. Returns the number of buffers handled.
 */
static int handle_send_ring(struct conn *c, struct conn_dir *cd, int bid,
			    int bytes)
{
	struct conn_buf_ring *in_cbr = &c->in_br;
	struct conn_buf_ring *out_cbr = &c->out_br;
	int nr;

	for (nr = 0; bytes; nr++) {
		int chunk = out_cbr->br->bufs[bid].len;

		/* the last buffer may only be partially covered */
		if (chunk > bytes)
			chunk = bytes;

		cd->out_bytes += chunk;
		vlog("%d: send: bid=%d, len=%d\n", c->tid, bid, chunk);

		/*
		 * This is the provided buffer that the receive consumed and
		 * that we then used for the send. Add it back to the pool so
		 * another receive can pick it; once the send is done, we're
		 * done with it.
		 */
		io_uring_buf_ring_add(in_cbr->br, in_cbr->buf + bid * buf_size,
				      buf_size, bid, br_mask, nr);

		bid = (bid + 1) & (nr_bufs - 1);
		bytes -= chunk;
	}

	cd->snd_next_bid = bid;
	io_uring_buf_ring_advance(in_cbr->br, nr);

	if (pending_shutdown(c))
		close_cd(c, cd);
	return nr;
}
/*
 * Send completion handling for sendmsg, or send without a send ring.
 * Every buffer covered by 'bytes' goes back into the incoming ring so it
 * can be used for receives again. Returns the number of buffers handled.
 */
static int handle_send_buf(struct conn *c, struct conn_dir *cd, int bid,
			   int bytes)
{
	struct conn_buf_ring *in_cbr = &c->in_br;
	int nr;

	for (nr = 0; bytes; nr++) {
		struct io_uring_buf *buf = &in_cbr->br->bufs[bid];
		int chunk = buf->len;

		/* same int vs buf->len comparison as before, just inverted */
		if (bytes <= buf->len)
			chunk = bytes;

		vlog("%d: send: bid=%d, len=%d\n", c->tid, bid, chunk);
		cd->out_bytes += chunk;

		/* each recvmsg mshot package has this overhead */
		if (rcv_msg && recv_mshot)
			cd->out_bytes += sizeof(struct io_uring_recvmsg_out);

		replenish_buffer(in_cbr, bid, nr);

		bid = (bid + 1) & (nr_bufs - 1);
		bytes -= chunk;
	}

	io_uring_buf_ring_advance(in_cbr->br, nr);
	cd->snd_next_bid = bid;
	return nr;
}
/*
 * Process one send completion for direction 'cd' of connection 'c'.
 *
 * Zerocopy notification CQEs only drop the outstanding notification count.
 * Any other send CQE is accounted and its buffers are handed to the
 * mode-specific helper. When no further CQEs are expected for the request
 * (IORING_CQE_F_MORE clear), the next send is prepared, or, if there is
 * none and no notifications are outstanding, a receive is armed on the
 * other direction.
 *
 * Returns 1 if a send_ring completion carries no buffer ID, 0 otherwise.
 */
static int __handle_send(struct io_uring *ring, struct conn *c,
struct conn_dir *cd, struct io_uring_cqe *cqe)
{
struct conn_dir *ocd;
int bid, nr_packets;
/*
 * With a send ring the buffer ID is reported in the CQE flags; otherwise
 * it is taken from the bid field packed into user_data.
 */
if (send_ring) {
if (!(cqe->flags & IORING_CQE_F_BUFFER)) {
fprintf(stderr, "no buffer in send?! %d\n", cqe->res);
return 1;
}
bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT;
} else {
bid = cqe_to_bid(cqe);
}
/*
* CQE notifications only happen with send/sendmsg zerocopy. They
* tell us that the data has been acked, and that hence the buffer
* is now free to reuse. Waiting on an ACK for each packet will slow
* us down tremendously, so do all of our sends and then wait for
* the ACKs to come in. They tend to come in bundles anyway. Once
* all acks are done (cd->snd_notif == 0), then fire off the next
* receive.
*/
if (cqe->flags & IORING_CQE_F_NOTIF) {
cd->snd_notif--;
} else {
/* count sends that moved less than one full buffer */
if (cqe->res && cqe->res < buf_size)
cd->snd_shrt++;
/*
* BIDI will use the same buffer pool and do sends on both CDs,
* so can't reliably check. TODO.
*/
if (!bidi && send_ring && cd->snd_next_bid != -1 &&
bid != cd->snd_next_bid) {
fprintf(stderr, "send bid %d, wanted %d at %lu\n", bid,
cd->snd_next_bid, cd->out_bytes);
/*
 * out_close sits inside the !IORING_CQE_F_MORE block below, so
 * this jump skips the accounting and re-arm logic and only runs
 * the pending-shutdown check.
 */
goto out_close;
}
/*
 * NOTE(review): the helpers wrap buffer IDs with & (nr_bufs - 1), so
 * '<' rather than '<=' may have been intended here - confirm.
 */
assert(bid <= nr_bufs);
vlog("send: got %d, %lu\n", cqe->res, cd->out_bytes);
if (buf_ring_inc)
nr_packets = handle_send_inc(c, cd, bid, cqe);
else if (send_ring)
nr_packets = handle_send_ring(c, cd, bid, cqe->res);
else
nr_packets = handle_send_buf(c, cd, bid, cqe->res);
/* optional statistics: count completions by buffers handled */
if (cd->snd_bucket)
cd->snd_bucket[nr_packets]++;
cd->out_buffers -= nr_packets;
assert(cd->out_buffers >= 0);
cd->snd++;
}
if (!(cqe->flags & IORING_CQE_F_MORE)) {
int do_recv_arm;
cd->pending_send = 0;
/*
* send done - see if the current vec has data to submit, and
* do so if it does. if it doesn't have data yet, nothing to
* do.
*/
do_recv_arm = !prep_next_send(ring, c, cd, cqe_to_fd(cqe));
ocd = &c->cd[!cd->index];
/*
 * Arm a receive on the other direction only if no new send was
 * queued, no notifications are outstanding and no receive is
 * already pending there.
 */
if (!cd->snd_notif && do_recv_arm && !ocd->pending_recv) {
int fd = other_dir_fd(c, cqe_to_fd(cqe));
__submit_receive(ring, c, ocd, fd);
}
out_close:
if (pending_shutdown(c))
close_cd(c, cd);
}
vlog("%d: pending sends %d\n", c->tid, cd->pending_send);
return 0;
}
/*
 * Send completion entry point: look up the connection and the direction
 * this CQE belongs to, then let __handle_send() do the work.
 */
static int handle_send(struct io_uring *ring, struct io_uring_cqe *cqe)
{
	struct conn *conn = cqe_to_conn(cqe);

	return __handle_send(ring, conn, cqe_to_conn_dir(conn, cqe), cqe);
}
/*
 * Error handler for send completions, i.e. CQEs with a negative res.
 * Zerocopy notifications take the normal send path, -ENOBUFS is only
 * counted, and everything else goes to the default error handler.
 */
static int send_error(struct error_handler *err, struct io_uring *ring,
		      struct io_uring_cqe *cqe)
{
	struct conn *conn = cqe_to_conn(cqe);
	struct conn_dir *dir = cqe_to_conn_dir(conn, cqe);

	dir->pending_send = 0;

	/* res can have high bit set */
	if (cqe->flags & IORING_CQE_F_NOTIF)
		return handle_send(ring, cqe);

	if (cqe->res == -ENOBUFS) {
		dir->snd_enobufs++;
		return 0;
	}

	return default_error(err, ring, cqe);
}
/*
 * We don't expect to get here, as the shutdown was marked to skip posting
 * a CQE on success. If it does trigger, that means the shutdown failed and
 * that our close has not been done. Log the shutdown error and issue a new
 * separate close.
 */
static int handle_shutdown(struct io_uring *ring, struct io_uring_cqe *cqe)
{
	struct conn *c = cqe_to_conn(cqe);
	int fd = cqe_to_fd(cqe);
	struct io_uring_sqe *sqe;

	fprintf(stderr, "Got shutdown notification on fd %d\n", fd);
	if (cqe->res < 0)
		fprintf(stderr, "Shutdown got %s\n", strerror(-cqe->res));
	else if (cqe->res == 0)
		fprintf(stderr, "Unexpected success shutdown CQE\n");

	/* the close never ran, queue it on its own */
	sqe = get_sqe(ring);
	if (fixed_files)
		io_uring_prep_close_direct(sqe, fd);
	else
		io_uring_prep_close(sqe, fd);
	encode_userdata(sqe, c, __CLOSE, 0, fd);
	return 0;
}
/*
 * Final stage of a connection: a close has finished. Forget the closed
 * descriptor, and once both descriptors are gone, mark the connection as
 * disconnected, print its stats and release its resources. The main loop
 * reaps it afterwards.
 */
static int handle_close(struct io_uring *ring, struct io_uring_cqe *cqe)
{
	struct conn *c = cqe_to_conn(cqe);
	int fd = cqe_to_fd(cqe);
	int i;

	printf("Closed client: id=%d, in_fd=%d, out_fd=%d\n", c->tid, c->in_fd, c->out_fd);

	if (fd == c->in_fd)
		c->in_fd = -1;
	else if (fd == c->out_fd)
		c->out_fd = -1;

	/* one side is still open, nothing more to do yet */
	if (c->in_fd != -1 || c->out_fd != -1)
		return 0;

	c->flags |= CONN_F_DISCONNECTED;

	pthread_mutex_lock(&thread_lock);
	__show_stats(c);
	open_conns--;
	pthread_mutex_unlock(&thread_lock);

	free_buffer_rings(ring, c);
	for (i = 0; i < 2; i++)
		free_msgs(&c->cd[i]);

	/*
	 * NOTE(review): only the cd[0] buckets are freed here. If cd[1]
	 * gets its own bucket allocations, they are leaked - confirm
	 * against the allocation site.
	 */
	free(c->cd[0].rcv_bucket);
	free(c->cd[0].snd_bucket);
	return 0;
}
/*
 * A cancel request completed. Once the last outstanding cancel for this
 * connection is in, queue shutdown + close for its descriptors and submit
 * right away.
 */
static int handle_cancel(struct io_uring *ring, struct io_uring_cqe *cqe)
{
	struct conn *c = cqe_to_conn(cqe);
	int fd = cqe_to_fd(cqe);

	c->pending_cancels--;
	vlog("%d: got cancel fd %d, refs %d\n", c->tid, fd, c->pending_cancels);

	/* still waiting for more cancels to finish */
	if (c->pending_cancels)
		return 0;

	queue_shutdown_close(ring, c, c->in_fd);
	if (c->out_fd != -1)
		queue_shutdown_close(ring, c, c->out_fd);
	io_uring_submit(ring);
	return 0;
}
/*
 * Kick off a connection. A sink only needs to start receiving. A proxy
 * first needs its outgoing socket, so queue a socket request for it.
 */
static void open_socket(struct conn *c)
{
	struct io_uring_sqe *sqe;
	int domain;

	if (is_sink) {
		pthread_mutex_lock(&thread_lock);
		open_conns++;
		pthread_mutex_unlock(&thread_lock);

		submit_receive(&c->ring, c);
		return;
	}

	domain = ipv6 ? AF_INET6 : AF_INET;

	/*
	 * If fixed_files is set, proxy will use fixed files for any new
	 * file descriptors it instantiates. Fixed files, or fixed
	 * descriptors, are io_uring private file descriptors. They cannot
	 * be accessed outside of io_uring. io_uring holds a fixed reference
	 * to them, which means that we do not need to grab per-request
	 * references to them. Particularly for threaded applications,
	 * grabbing and dropping file references for each operation can be
	 * costly as the file table is shared. This generally shows up as
	 * fget/fput related overhead in any workload profiles.
	 *
	 * Fixed descriptors are passed in via the 'fd' field just like
	 * regular descriptors, and then marked as such by setting the
	 * IOSQE_FIXED_FILE flag in the sqe->flags field. Some helpers do
	 * that automatically, like the one below, others will need it set
	 * manually if they don't have a *direct*() helper.
	 *
	 * For operations that instantiate them, like the opening of a
	 * direct socket, the application may either ask the kernel to find
	 * a free one (as is done below), or the application may manage the
	 * space itself and pass in an index for a currently free slot in
	 * the table. If the kernel is asked to allocate a free direct
	 * descriptor, note that io_uring does not abide by the POSIX
	 * mandated "lowest free must be returned". It may return any free
	 * descriptor of its choosing.
	 */
	sqe = get_sqe(&c->ring);
	if (fixed_files)
		io_uring_prep_socket_direct_alloc(sqe, domain, SOCK_STREAM, 0, 0);
	else
		io_uring_prep_socket(sqe, domain, SOCK_STREAM, 0, 0);
	encode_userdata(sqe, c, __SOCK, 0, 0);
}
/*
 * Start of connection: the incoming descriptor has arrived. Record it
 * and open the socket for this connection.
 */
static int handle_fd_pass(struct io_uring_cqe *cqe)
{
	struct conn *conn = cqe_to_conn(cqe);
	int in_fd = cqe_to_fd(cqe);

	vlog("%d: got fd pass %d\n", conn->tid, in_fd);

	conn->in_fd = in_fd;
	open_socket(conn);
	return 0;
}
/*
 * A stop request arrived for this connection: announce it and queue the
 * cancel on the connection's own ring.
 */
static int handle_stop(struct io_uring_cqe *cqe)
{
	struct conn *conn = cqe_to_conn(cqe);

	printf("Client %d: queueing shutdown\n", conn->tid);
	queue_cancel(&conn->ring, conn);
	return 0;
}
/*
 * Called for each CQE that we receive. Decode the request type it came
 * from and dispatch to the matching handler. Returns the handler's
 * result, or 1 if the request type is unknown.
 */
static int handle_cqe(struct io_uring *ring, struct io_uring_cqe *cqe)
{
	const int op = cqe_to_op(cqe);

	/*
	 * Unlikely, but there's an error in this CQE. If an error handler
	 * is defined, call it, and that will deal with it. If no error
	 * handler is defined, the opcode handler either doesn't care or
	 * will handle it on its own.
	 */
	if (cqe->res < 0) {
		struct error_handler *err = &error_handlers[op];

		if (err->error_fn)
			return err->error_fn(err, ring, cqe);
	}

	switch (op) {
	case __ACCEPT:
		return handle_accept(ring, cqe);
	case __SOCK:
		return handle_sock(ring, cqe);
	case __CONNECT:
		return handle_connect(ring, cqe);
	case __RECV:
	case __RECVMSG:
		return handle_recv(ring, cqe);
	case __SEND:
	case __SENDMSG:
		return handle_send(ring, cqe);
	case __CANCEL:
		return handle_cancel(ring, cqe);
	case __SHUTDOWN:
		return handle_shutdown(ring, cqe);
	case __CLOSE:
		return handle_close(ring, cqe);
	case __FD_PASS:
		return handle_fd_pass(cqe);
	case __STOP:
		return handle_stop(cqe);
	case __NOP:
		return 0;
	default:
		fprintf(stderr, "bad user data %lx\n", (long) cqe->user_data);
		return 1;
	}
}
/*
 * Periodic work done by the main loop: join the threads of connections
 * that have disconnected, start closing connections that should shut
 * down, and print the aggregate bandwidth once at least 900 msec have
 * passed since the last report.
 */
static void house_keeping(struct io_uring *ring)
{
	static unsigned long last_bytes;
	unsigned long total = 0, elapsed, bw;
	int i;

	vlog("House keeping entered\n");

	for (i = 0; i < nr_conns; i++) {
		struct conn *c = &conns[i];

		/* bytes moved in both directions count towards bandwidth */
		total += c->cd[0].in_bytes + c->cd[0].out_bytes;
		total += c->cd[1].in_bytes + c->cd[1].out_bytes;

		if (c->flags & CONN_F_DISCONNECTED) {
			vlog("%d: disconnected\n", i);

			/* join each connection thread exactly once */
			if (!(c->flags & CONN_F_REAPED)) {
				void *ret;

				pthread_join(c->thread, &ret);
				c->flags |= CONN_F_REAPED;
			}
			continue;
		}

		if (!(c->flags & CONN_F_DISCONNECTING) && should_shutdown(c)) {
			__close_conn(ring, c);
			c->flags |= CONN_F_DISCONNECTING;
		}
	}

	elapsed = mtime_since_now(&last_housekeeping);
	if (!total || elapsed < 900)
		return;

	bw = (8 * (total - last_bytes) / 1000UL) / elapsed;
	if (!bw)
		return;

	if (open_conns)
		printf("Bandwidth (threads=%d): %'luMbit\n", open_conns, bw);
	gettimeofday(&last_housekeeping, NULL);
	last_bytes = total;
}
/*
 * Event loop shared between the parent, and the connections. Could be
 * split in two, as they don't handle the same types of events. For the per
 * connection loop, 'c' is valid. For the main loop, it's NULL.
 *
 * Returns 1 if a CQE handler reported failure. The per-connection loop
 * returns 0 once the connection is flagged CONN_F_DISCONNECTED; the main
 * loop only returns on handler failure.
 */
static int __event_loop(struct io_uring *ring, struct conn *c)
{
	struct __kernel_timespec active_ts, idle_ts;
	long long usec = wait_usec;
	int flags;

	idle_ts.tv_sec = 0;
	idle_ts.tv_nsec = 100000000LL;

	/*
	 * Split the configured wait time into seconds and nanoseconds using
	 * a local copy. This loop runs in the parent and in every connection
	 * thread, so the shared wait_usec setting must not be modified here.
	 * Using the remainder also keeps tv_nsec below one second when
	 * wait_usec is an exact multiple of a second.
	 */
	active_ts.tv_sec = usec / 1000000;
	active_ts.tv_nsec = (usec % 1000000) * 1000;

	gettimeofday(&last_housekeeping, NULL);

	flags = 0;
	while (1) {
		struct __kernel_timespec *ts = &idle_ts;
		struct io_uring_cqe *cqe;
		unsigned int head;
		int ret, i, to_wait;

		/*
		 * If wait_batch is set higher than 1, then we'll wait on
		 * that amount of CQEs to be posted each loop. If used with
		 * DEFER_TASKRUN, this can provide a substantial reduction
		 * in context switch rate as the task isn't woken until the
		 * requested number of events can be returned.
		 *
		 * Can be used with -t to set a wait_usec timeout as well.
		 * For example, if an application can deal with 250 usec
		 * of wait latencies, it can set -w8 -t250 which will cause
		 * io_uring to return when either 8 events have been received,
		 * or if 250 usec of waiting has passed.
		 *
		 * If we don't have any open connections, wait on just 1
		 * always.
		 */
		to_wait = 1;
		if (open_conns && !flags) {
			ts = &active_ts;
			to_wait = wait_batch;
		}

		vlog("Submit and wait for %d\n", to_wait);
		ret = io_uring_submit_and_wait_timeout(ring, &cqe, to_wait, ts, NULL);

		if (*ring->cq.koverflow)
			printf("overflow %u\n", *ring->cq.koverflow);
		if (*ring->sq.kflags & IORING_SQ_CQ_OVERFLOW)
			printf("saw overflow\n");

		vlog("Submit and wait: %d\n", ret);

		/*
		 * 'flags' collects the connection flags seen in this batch;
		 * if any are set, the next iteration uses the idle timeout
		 * and waits for a single CQE.
		 */
		i = flags = 0;
		io_uring_for_each_cqe(ring, head, cqe) {
			if (handle_cqe(ring, cqe))
				return 1;
			flags |= cqe_to_conn(cqe)->flags;
			++i;
		}

		vlog("Handled %d events\n", i);

		/*
		 * Advance the CQ ring for seen events when we've processed
		 * all of them in this loop. This can also be done with
		 * io_uring_cqe_seen() in each handler above, which just marks
		 * that single CQE as seen. However, it's more efficient to
		 * mark a batch as seen when we're done with that batch.
		 */
		if (i) {
			io_uring_cq_advance(ring, i);
			events += i;
		}

		event_loops++;
		if (c) {
			if (c->flags & CONN_F_DISCONNECTED)
				break;
		} else {
			house_keeping(ring);
		}
	}

	return 0;
}
/*
 * Main event loop. Arm a multishot accept on the listening socket 'fd',
 * then run the shared event loop to handle incoming connections.
 */
static int parent_loop(struct io_uring *ring, int fd)
{
	struct io_uring_sqe *accept_sqe = get_sqe(ring);

	/*
	 * proxy provides a way to use either multishot receive or not, but
	 * for accept, we always use multishot. A multishot accept request
	 * needs only be armed once, and then it'll trigger a completion and
	 * post a CQE whenever a new connection is accepted. No need to do
	 * anything else, unless the multishot accept terminates. This
	 * happens if it encounters an error. Applications should check for
	 * IORING_CQE_F_MORE in cqe->flags - this tells you if more
	 * completions are expected from this request or not. Non-multishot
	 * never have this set, where multishot will always have this set
	 * unless an error occurs.
	 */
	if (fixed_files)
		io_uring_prep_multishot_accept_direct(accept_sqe, fd, NULL, NULL, 0);
	else
		io_uring_prep_multishot_accept(accept_sqe, fd, NULL, NULL, 0);
	__encode_userdata(accept_sqe, 0, __ACCEPT, 0, fd);

	return __event_loop(ring, NULL);
}
/*
 * Create and configure an io_uring instance according to the global
 * options.
 *
 * @ring:	ring to initialize
 * @nr_files:	size of the sparse fixed file table to register, used only
 *		when fixed_files is set
 *
 * As a side effect this settles a few global options: DEFER_TASKRUN and
 * SQPOLL turn each other off, send_ring is resolved against what the
 * kernel reports, and snd_bundle is cleared if send_ring ends up off.
 *
 * Returns 0 on success, 1 on failure (a message is printed to stderr).
 */
static int init_ring(struct io_uring *ring, int nr_files)
{
	struct io_uring_params params;
	int ret;

	/*
	 * By default, set us up with a big CQ ring. Not strictly needed
	 * here, but it's very important to never overflow the CQ ring.
	 * Events will not be dropped if this happens, but it does slow
	 * the application down in dealing with overflown events.
	 *
	 * Set SINGLE_ISSUER, which tells the kernel that only one thread
	 * is doing IO submissions. This enables certain optimizations in
	 * the kernel.
	 */
	memset(&params, 0, sizeof(params));
	params.flags |= IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_CLAMP;
	params.flags |= IORING_SETUP_CQSIZE;
	params.cq_entries = 1024;

	/*
	 * If use_huge is set, setup the ring with IORING_SETUP_NO_MMAP. This
	 * means that the application allocates the memory for the ring, and
	 * the kernel maps it. The alternative is having the kernel allocate
	 * the memory, and then liburing will mmap it. But we can't really
	 * support huge pages that way. If this fails, then ensure that the
	 * system has huge pages set aside upfront.
	 */
	if (use_huge)
		params.flags |= IORING_SETUP_NO_MMAP;

	/*
	 * DEFER_TASKRUN decouples async event reaping and retrying from
	 * regular system calls. If this isn't set, then io_uring uses
	 * normal task_work for this. task_work is always being run on any
	 * exit to userspace. Real applications do more than just call IO
	 * related system calls, and hence we can be running this work way
	 * too often. Using DEFER_TASKRUN defers any task_work running to
	 * when the application enters the kernel anyway to wait on new
	 * events. It's generally the preferred and recommended way to setup
	 * a ring.
	 */
	if (defer_tw) {
		params.flags |= IORING_SETUP_DEFER_TASKRUN;
		sqpoll = 0;
	}

	/*
	 * SQPOLL offloads any request submission and retry operations to a
	 * dedicated thread. This enables an application to do IO without
	 * ever having to enter the kernel itself. The SQPOLL thread will
	 * stay busy as long as there's work to do, and go to sleep if
	 * sq_thread_idle msecs have passed. If it's running, submitting new
	 * IO just needs to make them visible to the SQPOLL thread, it needs
	 * not enter the kernel. For submission, the application will only
	 * enter the kernel if the SQPOLL has been idle long enough that it
	 * has gone to sleep.
	 *
	 * Waiting on events still need to enter the kernel, if none are
	 * available. The application may also use io_uring_peek_cqe() to
	 * check for new events without entering the kernel, as completions
	 * will be continually produced to the CQ ring by the SQPOLL thread
	 * as they occur.
	 */
	if (sqpoll) {
		params.flags |= IORING_SETUP_SQPOLL;
		params.sq_thread_idle = 1000;
		defer_tw = 0;
	}

	/*
	 * If neither DEFER_TASKRUN or SQPOLL is used, set COOP_TASKRUN. This
	 * avoids heavy signal based notifications, which can force an
	 * application to enter the kernel and process it as soon as they
	 * occur.
	 */
	if (!sqpoll && !defer_tw)
		params.flags |= IORING_SETUP_COOP_TASKRUN;

	/*
	 * The SQ ring size need not be larger than any batch of requests
	 * that need to be prepared before submit. Normally in a loop we'd
	 * only need a few, if any, particularly if multishot is used.
	 */
	ret = io_uring_queue_init_params(ring_size, ring, &params);
	if (ret) {
		fprintf(stderr, "%s\n", strerror(-ret));
		return 1;
	}

	/*
	 * If send serialization is available and no option was given to use
	 * it or not, default it to on. If it was turned on and the kernel
	 * doesn't support it, turn it off.
	 */
	if (params.features & IORING_FEAT_SEND_BUF_SELECT) {
		if (send_ring == -1)
			send_ring = 1;
	} else {
		if (send_ring == 1) {
			fprintf(stderr, "Kernel doesn't support ring provided "
				"buffers for sends, disabled\n");
		}
		send_ring = 0;
	}

	if (!send_ring && snd_bundle) {
		fprintf(stderr, "Can't use send bundle without send_ring\n");
		snd_bundle = 0;
	}

	if (fixed_files) {
		/*
		 * If fixed files are used, we need to allocate a fixed file
		 * table upfront where new direct descriptors can be managed.
		 */
		ret = io_uring_register_files_sparse(ring, nr_files);
		if (ret) {
			fprintf(stderr, "file register: %d\n", ret);
			return 1;
		}

		/*
		 * If fixed files are used, we also register the ring fd. See
		 * the comment near io_uring_prep_socket_direct_alloc() in
		 * open_socket(). This avoids the fget/fput overhead
		 * associated with the io_uring_enter(2) system call itself,
		 * which is used to submit and wait on events.
		 */
		ret = io_uring_register_ring_fd(ring);
		if (ret != 1) {
			fprintf(stderr, "ring register: %d\n", ret);
			return 1;
		}
	}

	if (napi) {
		struct io_uring_napi n = {
			.prefer_busy_poll = napi > 1 ? 1 : 0,
			.busy_poll_to = napi_timeout,
		};

		/* -EINVAL is tolerated: setup continues without NAPI */
		ret = io_uring_register_napi(ring, &n);
		if (ret) {
			fprintf(stderr, "io_uring_register_napi: %d\n", ret);
			if (ret != -EINVAL)
				return 1;
			fprintf(stderr, "NAPI not available, turned off\n");
		}
	}

	return 0;
}
/*
 * Per-connection thread entry point. Sets up the connection's own ring
 * and buffer rings, signals readiness through the startup barrier and
 * then runs the event loop until the connection is done.
 *
 * NOTE(review): if ring or buffer ring setup fails, the thread returns
 * without ever waiting on startup_barrier - confirm the other side of the
 * barrier cannot get stuck in that case.
 */
static void *thread_main(void *data)
{
	struct conn *c = data;

	c->flags |= CONN_F_STARTED;

	/* we need a max of 4 descriptors for each client */
	if (init_ring(&c->ring, 4))
		return NULL;

	if (setup_buffer_rings(&c->ring, c))
		return NULL;

	/*
	 * If we're using fixed files, then we need to wait for the parent
	 * to install the c->in_fd into our direct descriptor table. When
	 * that happens, we'll set things up. If we're not using fixed files,
	 * we can set up the receive or connect now.
	 */
	if (!fixed_files)
		open_socket(c);

	/* we're ready */
	pthread_barrier_wait(&c->startup_barrier);

	__event_loop(&c->ring, c);
	return NULL;
}
static void usage(const char *name)
{
printf("%s:\n", name);
printf("\t-m:\t\tUse multishot receive (%d)\n", recv_mshot);
printf("\t-d:\t\tUse DEFER_TASKRUN (%d)\n", defer_tw);
printf("\t-S:\t\tUse SQPOLL (%d)\n", sqpoll);
printf("\t-f:\t\tUse only fixed files (%d)\n", fixed_files);
printf("\t-a:\t\tUse huge pages for the ring (%d)\n", use_huge);
printf("\t-t:\t\tTimeout for waiting on CQEs (usec) (%d)\n", wait_usec);
printf("\t-w:\t\tNumber of CQEs to wait for each loop (%d)\n", wait_batch);
printf("\t-B:\t\tUse bi-directional mode (%d)\n", bidi);
printf("\t-s:\t\tAct only as a sink (%d)\n", is_sink);
printf("\t-q:\t\tRing size to use (%d)\n", ring_size);
printf("\t-H:\t\tHost to connect to (%s)\n", host);
printf("\t-r:\t\tPort to receive on (%d)\n", receive_port);
printf("\t-p:\t\tPort to connect to (%d)\n", send_port);
printf("\t-6:\t\tUse IPv6 (%d)\n", ipv6);
printf("\t-N:\t\tUse NAPI polling (%d)\n", napi);
printf("\t-T:\t\tNAPI timeout (usec) (%d)\n", napi_timeout);
printf("\t-b:\t\tSend/receive buf size (%d)\n", buf_size);
printf("\t-n:\t\tNumber of provided buffers (pow2) (%d)\n", nr_bufs);
printf("\t-u:\t\tUse provided buffers for send (%d)\n", send_ring);
printf("\t-C:\t\tUse bundles for send (%d)\n", snd_bundle);
printf("\t-z:\t\tUse zerocopy send (%d)\n", snd_zc);
printf("\t-c:\t\tUse bundles for recv (%d)\n", snd_bundle);
printf("\t-M:\t\tUse sendmsg (%d)\n", snd_msg);
printf("\t-M:\t\tUse recvmsg (%d)\n", rcv_msg);
printf("\t-x:\t\tShow extended stats (%d)\n", ext_stat);
printf("\t-V:\t\tIncrease verbosity (%d)\n", verbose);
}
/*
 * Program entry point: parse the options, reject or adjust combinations
 * that can't work, set up the listening socket and the main ring, print
 * the effective configuration and run the parent loop.
 *
 * Returns 1 on setup failure, otherwise the result of parent_loop().
 */
int main(int argc, char *argv[])
{
	struct io_uring ring;
	struct sigaction sa = { };
	const char *optstring;
	int opt, ret, fd;

	setlocale(LC_NUMERIC, "en_US");

	page_size = sysconf(_SC_PAGESIZE);
	if (page_size < 0) {
		perror("sysconf(_SC_PAGESIZE)");
		return 1;
	}

	pthread_mutex_init(&thread_lock, NULL);

	optstring = "m:d:S:s:b:f:H:r:p:n:B:N:T:w:t:M:R:u:c:C:q:a:x:z:i:6Vh?";
	while ((opt = getopt(argc, argv, optstring)) != -1) {
		switch (opt) {
		case 'm':
			recv_mshot = !!atoi(optarg);
			break;
		case 'S':
			sqpoll = !!atoi(optarg);
			break;
		case 'd':
			defer_tw = !!atoi(optarg);
			break;
		case 'b':
			buf_size = atoi(optarg);
			break;
		case 'n':
			nr_bufs = atoi(optarg);
			break;
		case 'u':
			send_ring = !!atoi(optarg);
			break;
		case 'c':
			rcv_bundle = !!atoi(optarg);
			break;
		case 'C':
			snd_bundle = !!atoi(optarg);
			break;
		case 'w':
			wait_batch = atoi(optarg);
			break;
		case 't':
			wait_usec = atoi(optarg);
			break;
		case 's':
			is_sink = !!atoi(optarg);
			break;
		case 'f':
			fixed_files = !!atoi(optarg);
			break;
		case 'H':
			host = strdup(optarg);
			if (!host) {
				perror("strdup");
				return 1;
			}
			break;
		case 'r':
			receive_port = atoi(optarg);
			break;
		case 'p':
			send_port = atoi(optarg);
			break;
		case 'B':
			bidi = !!atoi(optarg);
			break;
		case 'N':
			/*
			 * NOTE(review): this clamps napi to 0/1, so the
			 * 'napi > 1' prefer_busy_poll check in init_ring()
			 * can never be true - confirm which is intended.
			 */
			napi = !!atoi(optarg);
			break;
		case 'T':
			napi_timeout = atoi(optarg);
			break;
		case '6':
			ipv6 = true;
			break;
		case 'M':
			snd_msg = !!atoi(optarg);
			break;
		case 'z':
			snd_zc = !!atoi(optarg);
			break;
		case 'R':
			rcv_msg = !!atoi(optarg);
			break;
		case 'q':
			ring_size = atoi(optarg);
			break;
		case 'i':
			buf_ring_inc = !!atoi(optarg);
			break;
		case 'a':
			use_huge = !!atoi(optarg);
			break;
		case 'x':
			ext_stat = !!atoi(optarg);
			break;
		case 'V':
			verbose++;
			break;
		case 'h':
		default:
			usage(argv[0]);
			return 1;
		}
	}

	if (buf_size <= 0) {
		fprintf(stderr, "Buffer size must be positive\n");
		return 1;
	}

	/*
	 * Buffer IDs are wrapped with '& (nr_bufs - 1)' and br_mask is
	 * derived the same way below, which only works for powers of 2.
	 */
	if (nr_bufs <= 0 || (nr_bufs & (nr_bufs - 1))) {
		fprintf(stderr, "Number of buffers must be a power of 2\n");
		return 1;
	}

	if (bidi && is_sink) {
		fprintf(stderr, "Can't be both bidi proxy and sink\n");
		return 1;
	}
	if (snd_msg && sqpoll) {
		fprintf(stderr, "SQPOLL with msg variants disabled\n");
		snd_msg = 0;
	}
	if (rcv_msg && rcv_bundle) {
		fprintf(stderr, "Can't use bundles with recvmsg\n");
		rcv_msg = 0;
	}
	if (snd_msg && snd_bundle) {
		fprintf(stderr, "Can't use bundles with sendmsg\n");
		snd_msg = 0;
	}
	if (snd_msg && send_ring) {
		fprintf(stderr, "Can't use send ring sendmsg\n");
		snd_msg = 0;
	}
	if (snd_zc && (send_ring || snd_bundle)) {
		fprintf(stderr, "Can't use send zc with bundles or ring\n");
		send_ring = snd_bundle = 0;
	}
	/*
	 * For recvmsg w/multishot, we waste some data at the head of the
	 * packet every time. Adjust the buffer size to account for that,
	 * so we're still handing 'buf_size' actual payload of data.
	 */
	if (rcv_msg && recv_mshot) {
		fprintf(stderr, "Adjusted buf size for recvmsg w/multishot\n");
		buf_size += sizeof(struct io_uring_recvmsg_out);
	}

	br_mask = nr_bufs - 1;

	fd = setup_listening_socket(receive_port, ipv6);
	if (is_sink)
		send_port = -1;

	if (fd == -1)
		return 1;

	atexit(show_stats);
	sa.sa_handler = sig_int;
	sa.sa_flags = SA_RESTART;
	sigaction(SIGINT, &sa, NULL);

	ret = init_ring(&ring, MAX_CONNS * 3);
	if (ret)
		return ret;

	printf("Backend: sqpoll=%d, defer_tw=%d, fixed_files=%d, "
		"is_sink=%d, buf_size=%d, nr_bufs=%d, host=%s, send_port=%d, "
		"receive_port=%d, napi=%d, napi_timeout=%d, huge_page=%d\n",
			sqpoll, defer_tw, fixed_files, is_sink,
			buf_size, nr_bufs, host, send_port, receive_port,
			napi, napi_timeout, use_huge);
	printf(" recv options: recvmsg=%d, recv_mshot=%d, recv_bundle=%d\n",
			rcv_msg, recv_mshot, rcv_bundle);
	printf(" send options: sendmsg=%d, send_ring=%d, send_bundle=%d, "
		"send_zerocopy=%d\n", snd_msg, send_ring, snd_bundle,
			snd_zc);

	return parent_loop(&ring, fd);
}
liburing-2.9/examples/proxy.h 0000664 0000000 0000000 00000004105 14750134674 0016362 0 ustar 00root root 0000000 0000000 /* SPDX-License-Identifier: MIT */
#ifndef LIBURING_PROXY_H
#define LIBURING_PROXY_H
#include
/*
* Generic opcode agnostic encoding to sqe/cqe->user_data
*
* The three 16-bit fields overlay the low 48 bits worth of storage of the
* 64-bit 'val'; 'val' is what gets stored in and read back from user_data.
*/
struct userdata {
union {
struct {
uint16_t op_tid; /* 4 bits op, 12 bits tid */
/* buffer ID of the request */
uint16_t bid;
/* file descriptor of the request, truncated to 16 bits */
uint16_t fd;
};
/* all fields viewed as one 64-bit value */
uint64_t val;
};
};
/* the op is stored above the 12 tid bits in op_tid */
#define OP_SHIFT (12)
/* mask covering the low 12 bits, i.e. the tid part of op_tid */
#define TID_MASK ((1U << 12) - 1)
/*
* Packs the information that we will need at completion time into the
* sqe->user_data field, which is passed back in the completion in
* cqe->user_data. Some apps would need more space than this, and in fact
* I'd love to pack the requested IO size in here, and it's not uncommon to
* see apps use this field as just a cookie to either index a data structure
* at completion time, or even just put the pointer to the associated
* structure into this field.
*/
static inline void __encode_userdata(struct io_uring_sqe *sqe, int tid, int op,
int bid, int fd)
{
struct userdata ud = {
.op_tid = (op << OP_SHIFT) | tid,
.bid = bid,
.fd = fd
};
io_uring_sqe_set_data64(sqe, ud.val);
}
static inline uint64_t __raw_encode(int tid, int op, int bid, int fd)
{
struct userdata ud = {
.op_tid = (op << OP_SHIFT) | tid,
.bid = bid,
.fd = fd
};
return ud.val;
}
/* Extract the request type from a completion's user_data. */
static inline int cqe_to_op(struct io_uring_cqe *cqe)
{
	struct userdata ud;

	ud.val = cqe->user_data;
	return ud.op_tid >> OP_SHIFT;
}
/* Extract the buffer ID from a completion's user_data. */
static inline int cqe_to_bid(struct io_uring_cqe *cqe)
{
	struct userdata ud;

	ud.val = cqe->user_data;
	return ud.bid;
}
/* Extract the file descriptor from a completion's user_data. */
static inline int cqe_to_fd(struct io_uring_cqe *cqe)
{
	struct userdata ud;

	ud.val = cqe->user_data;
	return ud.fd;
}
/*
 * Milliseconds elapsed from 's' to 'e'. The microsecond part is truncated
 * to whole milliseconds.
 */
static unsigned long long mtime_since(const struct timeval *s,
				      const struct timeval *e)
{
	long long sec = e->tv_sec - s->tv_sec;
	long long usec = e->tv_usec - s->tv_usec;

	/* borrow one second if the microsecond difference went negative */
	if (sec > 0 && usec < 0) {
		sec--;
		usec += 1000000;
	}

	return sec * 1000 + usec / 1000;
}
/* Milliseconds elapsed between 'tv' and the current time. */
static unsigned long long mtime_since_now(struct timeval *tv)
{
	struct timeval now;

	gettimeofday(&now, NULL);
	return mtime_since(tv, &now);
}
#endif
liburing-2.9/examples/reg-wait.c 0000664 0000000 0000000 00000011132 14750134674 0016711 0 ustar 00root root 0000000 0000000 /* SPDX-License-Identifier: MIT */
/*
* Sample program that shows how to use registered waits.
*
* (C) 2024 Jens Axboe
*/
#include
#include
#include
#include
#include
#include
#include
#include
static unsigned long long mtime_since(const struct timeval *s,
const struct timeval *e)
{
long long sec, usec;
sec = e->tv_sec - s->tv_sec;
usec = (e->tv_usec - s->tv_usec);
if (sec > 0 && usec < 0) {
sec--;
usec += 1000000;
}
sec *= 1000;
usec /= 1000;
return sec + usec;
}
/* Milliseconds elapsed between 'tv' and the current time. */
static unsigned long long mtime_since_now(struct timeval *tv)
{
	struct timeval now;

	gettimeofday(&now, NULL);
	return mtime_since(tv, &now);
}
static int register_memory(struct io_uring *ring, void *ptr, size_t size)
{
struct io_uring_region_desc rd = {};
struct io_uring_mem_region_reg mr = {};
rd.user_addr = (__u64)(unsigned long)ptr;
rd.size = size;
rd.flags = IORING_MEM_REGION_TYPE_USER;
mr.region_uptr = (__u64)(unsigned long)&rd;
mr.flags = IORING_MEM_REGION_REG_WAIT_ARG;
return io_uring_register_region(ring, &mr);
}
/*
 * Registered wait demo. Creates a disabled ring, registers one page of user
 * memory as the wait-argument region, enables the ring, and then performs two
 * waits that reference wait entries by index:
 *   - index 0: plain 1 second timeout with nothing pending
 *   - index 1: 100 msec timeout with a 10 msec min_wait_usec, with one of two
 *     pipe reads completing
 *
 * Returns 0 on success (or when called with arguments), 1 on any failure.
 */
int main(int argc, char *argv[])
{
	struct io_uring_reg_wait *reg;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe[2];
	struct io_uring ring;
	char b1[8], b2[8];
	unsigned long msec;
	struct timeval tv;
	int ret, fds[2];
	int page_size;

	if (argc > 1) {
		fprintf(stdout, "%s: takes no arguments\n", argv[0]);
		return 0;
	}

	page_size = sysconf(_SC_PAGESIZE);
	if (page_size < 0) {
		fprintf(stderr, "sysconf(_SC_PAGESIZE) failed\n");
		return 1;
	}

	if (pipe(fds) < 0) {
		perror("pipe");
		return 1;
	}

	/* The ring starts disabled; it is enabled once the region is registered */
	ret = io_uring_queue_init(8, &ring, IORING_SETUP_R_DISABLED);
	if (ret) {
		fprintf(stderr, "Queue init: %d\n", ret);
		return 1;
	}

	/*
	 * Setup a region we'll use to pass wait arguments. It should be
	 * page aligned, we're using only first two wait entries here and
	 * the rest of the memory can be reused for other purposes.
	 */
	reg = aligned_alloc(page_size, page_size);
	if (!reg) {
		fprintf(stderr, "allocation failed\n");
		return 1;
	}

	ret = register_memory(&ring, reg, page_size);
	if (ret) {
		if (ret == -EINVAL) {
			fprintf(stderr, "Kernel doesn't support registered waits\n");
			return 1;
		}
		fprintf(stderr, "Registered wait: %d\n", ret);
		return 1;
	}

	ret = io_uring_enable_rings(&ring);
	if (ret) {
		fprintf(stderr, "io_uring_enable_rings failure %i\n", ret);
		return 1;
	}

	/*
	 * Setup two distinct wait regions. Index 0 will be a 1 second wait,
	 * and index 1 is a short wait using min_wait_usec as well. Neither
	 * of these use a signal mask, but sigmask/sigmask_sz can be set as
	 * well for that.
	 */
	reg[0].ts.tv_sec = 1;
	reg[0].ts.tv_nsec = 0;
	reg[0].flags = IORING_REG_WAIT_TS;

	reg[1].ts.tv_sec = 0;
	reg[1].ts.tv_nsec = 100000000LL;
	reg[1].min_wait_usec = 10000;
	reg[1].flags = IORING_REG_WAIT_TS;

	/*
	 * No pending completions. Wait with region 0, which should time
	 * out after 1 second.
	 */
	gettimeofday(&tv, NULL);
	ret = io_uring_submit_and_wait_reg(&ring, cqe, 1, 0);
	if (ret == -EINVAL) {
		fprintf(stderr, "Kernel doesn't support registered waits\n");
		return 1;
	} else if (ret != -ETIME) {
		fprintf(stderr, "Wait should've timed out... %d\n", ret);
		return 1;
	}
	msec = mtime_since_now(&tv);
	if (msec < 900 || msec > 1100) {
		fprintf(stderr, "Wait took an unexpected amount of time: %lu\n",
			msec);
		return 1;
	}

	/*
	 * Now prepare two pipe reads. We'll trigger one completion quickly,
	 * but the other one will never happen. Use min_wait_usec timeout
	 * to abort after 10 msec of time, where the overall timeout is
	 * otherwise 100 msec. Since we're waiting on two events, the min
	 * timeout ends up aborting us.
	 */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read(sqe, fds[0], b1, sizeof(b1), 0);
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read(sqe, fds[0], b2, sizeof(b2), 0);

	/* trigger one read */
	ret = write(fds[1], "Hello", 5);
	if (ret < 0) {
		perror("write");
		return 1;
	}

	/*
	 * This will wait for 2 entries, where 1 is already available.
	 * Since we're using min_wait_usec == 10 msec here with an overall
	 * wait of 100 msec, we expect the wait to abort after 10 msec since
	 * one or more events are available.
	 */
	gettimeofday(&tv, NULL);
	ret = io_uring_submit_and_wait_reg(&ring, cqe, 2, 1);
	msec = mtime_since_now(&tv);
	if (ret != 2) {
		fprintf(stderr, "Should have submitted 2: %d\n", ret);
		return 1;
	}
	/* Timing is only reported, not treated as a failure */
	if (msec < 8 || msec > 12)
		fprintf(stderr, "min_wait_usec should take ~10 msec: %lu\n", msec);

	/*
	 * Cleanup after ourselves. The pipe is closed only after the ring is
	 * torn down, since one read on fds[0] is still outstanding.
	 */
	io_uring_queue_exit(&ring);
	free(reg);
	close(fds[0]);
	close(fds[1]);
	return 0;
}
liburing-2.9/examples/rsrc-update-bench.c 0000664 0000000 0000000 00000004140 14750134674 0020501 0 ustar 00root root 0000000 0000000 /* SPDX-License-Identifier: MIT */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "liburing.h"
/* How long the update loop in main() runs, in milliseconds */
static unsigned long runtime_ms = 10000;
/* Current gettimeofday() time expressed in milliseconds */
static unsigned long gettimeofday_ms(void)
{
	struct timeval now;
	unsigned long ms;

	gettimeofday(&now, NULL);
	ms = now.tv_sec * 1000;
	ms += now.tv_usec / 1000;
	return ms;
}
/*
 * Benchmark of registered file table updates. A sparse table of table_size
 * slots is registered and filled with the pipe's read end, then batches of qd
 * files-update requests are submitted and reaped for runtime_ms milliseconds.
 * The number of completed updates per second is printed at the end.
 */
int main(void)
{
	int table_size = 128;
	int qd = 32;
	unsigned long completed = 0;
	unsigned long deadline;
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int pipe1[2];
	int ret, i;

	if (pipe(pipe1) != 0) {
		perror("pipe");
		return 1;
	}

	ret = io_uring_queue_init(1024, &ring, IORING_SETUP_SINGLE_ISSUER |
					      IORING_SETUP_DEFER_TASKRUN);
	if (ret) {
		fprintf(stderr, "io_uring_queue_init failed: %d\n", ret);
		return 1;
	}

	ret = io_uring_register_ring_fd(&ring);
	if (ret < 0) {
		fprintf(stderr, "io_uring_register_ring_fd failed\n");
		return 1;
	}

	ret = io_uring_register_files_sparse(&ring, table_size);
	if (ret < 0) {
		fprintf(stderr, "io_uring_register_files_sparse failed\n");
		return 1;
	}

	/* populate every slot, one at a time, with pipe1[0] */
	for (i = 0; i < table_size; i++) {
		ret = io_uring_register_files_update(&ring, i, pipe1, 1);
		if (ret < 0) {
			fprintf(stderr, "io_uring_register_files_update failed\n");
			return 1;
		}
	}

	srand(time(NULL));
	deadline = gettimeofday_ms() + runtime_ms;

	do {
		/* each batch touches qd consecutive slots from a random base */
		int base = rand();

		for (i = 0; i < qd; i++) {
			int slot = (base + i) % table_size;

			sqe = io_uring_get_sqe(&ring);
			io_uring_prep_files_update(sqe, pipe1, 1, slot);
		}

		ret = io_uring_submit(&ring);
		if (ret != qd) {
			fprintf(stderr, "child: sqe submit failed: %d\n", ret);
			return 1;
		}

		for (i = 0; i < qd; i++) {
			ret = io_uring_wait_cqe(&ring, &cqe);
			if (ret < 0) {
				/* only abandons the rest of this batch */
				fprintf(stderr, "child: wait completion %d\n", ret);
				break;
			}
			io_uring_cqe_seen(&ring, cqe);
			completed++;
		}
	} while (gettimeofday_ms() < deadline);

	fprintf(stderr, "max updates/s: %lu\n", completed * 1000UL / runtime_ms);

	io_uring_queue_exit(&ring);
	close(pipe1[0]);
	close(pipe1[1]);
	return 0;
}
liburing-2.9/examples/send-zerocopy.c 0000664 0000000 0000000 00000035327 14750134674 0020007 0 ustar 00root root 0000000 0000000 /* SPDX-License-Identifier: MIT */
/* based on linux-kernel/tools/testing/selftests/net/msg_zerocopy.c */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "liburing.h"
/* NOTE(review): no use of ZC_TAG is visible in the reviewed part of this file - confirm where it is consumed */
#define ZC_TAG 0xfffffffULL
/* NOTE(review): presumably an upper bound on requests per submit - no use visible here, confirm */
#define MAX_SUBMIT_NR 512
/* Number of entries in the threads[] array */
#define MAX_THREADS 100
/* Per-thread state for one sender (do_test/do_tx) or one receiver (do_rx) */
struct thread_data {
/* thread handle; NOTE(review): presumably filled by pthread_create - creation is not visible here */
pthread_t thread;
/* the address of this member is what do_rx()/do_test() pass to pthread_exit() */
void *ret;
/* thread index, reported in the connect() error message of do_tx() */
int idx;
/* number of sends/receives counted so far */
unsigned long long packets;
/* total bytes counted so far */
unsigned long long bytes;
/* elapsed transmit time in msec, stored by do_tx() when its send loop ends */
unsigned long long dt_ms;
/* destination address, filled in by setup_sockaddr() from do_test() */
struct sockaddr_storage dst_addr;
/* receive socket, assigned in do_setup_rx() and consumed by do_rx() */
int fd;
};
/* do_tx(): register the ring fd via io_uring_register_ring_fd() */
static bool cfg_reg_ringfd = true;
/* do_tx(): register the socket as a fixed file and issue sends with IOSQE_FIXED_FILE */
static bool cfg_fixed_files = 1;
/* do_tx(): use io_uring_prep_send_zc() instead of io_uring_prep_send() */
static bool cfg_zc = 1;
/* do_tx(): number of send requests prepared and reaped per loop iteration */
static int cfg_nr_reqs = 8;
/* do_tx(): with cfg_zc, set IORING_RECVSEND_FIXED_BUF and a buf_index on each send */
static bool cfg_fixed_buf = 1;
/* NOTE(review): presumably selects a hugetlb-backed payload - no use visible here, confirm */
static bool cfg_hugetlb = 0;
/* do_tx(): add IORING_SETUP_DEFER_TASKRUN to the ring flags */
static bool cfg_defer_taskrun = 0;
/* CPU to pin the task and io-wq to; -1 leaves affinity untouched */
static int cfg_cpu = -1;
/* NOTE(review): presumably selects receiver mode - no use visible here, confirm */
static bool cfg_rx = 0;
/* number of threads[] entries in use (also the listen backlog in do_setup_rx()) */
static unsigned cfg_nr_threads = 1;
/* address family handed to setup_sockaddr() and do_tx() */
static int cfg_family = PF_UNSPEC;
/* socket type; do_rx() compares it against SOCK_STREAM to pick the flush routine */
static int cfg_type = 0;
/* bytes per send; also the expected datagram length on receive */
static int cfg_payload_len;
/* port placed into the socket address by setup_sockaddr() */
static int cfg_port = 8000;
/* how long the tx loop runs, in msec (the receiver adds 400 msec on top) */
static int cfg_runtime_ms = 4200;
/* do_tx(): queue a POLLIN poll request on the socket before sending */
static bool cfg_rx_poll = false;
/* address length passed to bind()/connect() */
static socklen_t cfg_alen;
/* textual address handed to setup_sockaddr(); NULL skips the inet_pton() parse */
static char *str_addr = NULL;
/* 4096-byte aligned static buffer of IP_MAXPACKET bytes */
static char payload_buf[IP_MAXPACKET] __attribute__((aligned(4096)));
/* data that is sent and registered as a buffer; NOTE(review): where it gets pointed is not visible here */
static char *payload;
static struct thread_data threads[MAX_THREADS];
/* do_tx() waits on this right before it starts timing */
static pthread_barrier_t barrier;
/*
 * Set by sigint_handler() and polled by the do_tx() loop.
 * NOTE(review): plain bool shared between a signal handler and threads -
 * consider volatile sig_atomic_t or an atomic type.
 */
static bool should_stop = false;
/*
 * Signal handler: the first delivery only requests a stop, a second one
 * terminates the process immediately.
 */
static void sigint_handler(__attribute__((__unused__)) int sig)
{
	if (!should_stop) {
		should_stop = true;
		return;
	}

	/* kill if should_stop can't unblock threads fast enough */
	_exit(-1);
}
/*
 * Implementation of error(3), prints an error message and exits.
 *
 * @status: exit code passed to exit(3)
 * @errnum: error number to describe after the message, 0 for none. Callers
 *          in this file pass both positive errno values and the negative
 *          -errno codes returned by liburing calls and found in cqe->res, so
 *          the sign is normalized before strerror() is consulted.
 * @format: printf-style format string, followed by its arguments
 *
 * Does not return.
 */
static void t_error(int status, int errnum, const char *format, ...)
{
	va_list args;

	va_start(args, format);
	vfprintf(stderr, format, args);
	va_end(args);

	if (errnum) {
		/* strerror() only knows positive errno values */
		if (errnum < 0)
			errnum = -errnum;
		fprintf(stderr, ": %s", strerror(errnum));
	}
	fprintf(stderr, "\n");

	exit(status);
}
/* Pin the calling thread to cfg_cpu; does nothing when cfg_cpu is -1 */
static void set_cpu_affinity(void)
{
	cpu_set_t cpus;

	if (cfg_cpu != -1) {
		CPU_ZERO(&cpus);
		CPU_SET(cfg_cpu, &cpus);
		if (sched_setaffinity(0, sizeof(cpus), &cpus) != 0)
			t_error(1, errno, "unable to pin cpu\n");
	}
}
/*
 * Restrict the io-wq workers of @ring to cfg_cpu; does nothing when cfg_cpu
 * is -1.
 */
static void set_iowq_affinity(struct io_uring *ring)
{
	cpu_set_t cpus;
	int err;

	if (cfg_cpu != -1) {
		CPU_ZERO(&cpus);
		CPU_SET(cfg_cpu, &cpus);

		err = io_uring_register_iowq_aff(ring, 1, &cpus);
		if (err != 0)
			t_error(1, err, "unabled to set io-wq affinity\n");
	}
}
/* Current gettimeofday() time expressed in milliseconds */
static unsigned long gettimeofday_ms(void)
{
	struct timeval now;
	unsigned long ms;

	gettimeofday(&now, NULL);
	ms = now.tv_sec * 1000;
	ms += now.tv_usec / 1000;
	return ms;
}
/* Set an integer socket option on @fd, exiting through t_error() on failure */
static void do_setsockopt(int fd, int level, int optname, int val)
{
	int rc = setsockopt(fd, level, optname, &val, sizeof(val));

	if (rc != 0)
		t_error(1, errno, "setsockopt %d.%d: %d", level, optname, val);
}
/*
 * Fill @sockaddr for @domain (PF_INET or PF_INET6) with port cfg_port and,
 * when @str_addr is non-NULL, the address parsed from it. Exits through
 * t_error() on a parse failure or any other domain.
 */
static void setup_sockaddr(int domain, const char *str_addr,
			   struct sockaddr_storage *sockaddr)
{
	if (domain == PF_INET) {
		struct sockaddr_in *sin = (void *) sockaddr;

		memset(sin, 0, sizeof(*sin));
		sin->sin_family = AF_INET;
		sin->sin_port = htons(cfg_port);
		if (str_addr &&
		    inet_pton(AF_INET, str_addr, &sin->sin_addr) != 1)
			t_error(1, 0, "ipv4 parse error: %s", str_addr);
	} else if (domain == PF_INET6) {
		struct sockaddr_in6 *sin6 = (void *) sockaddr;

		memset(sin6, 0, sizeof(*sin6));
		sin6->sin6_family = AF_INET6;
		sin6->sin6_port = htons(cfg_port);
		if (str_addr &&
		    inet_pton(AF_INET6, str_addr, &sin6->sin6_addr) != 1)
			t_error(1, 0, "ipv6 parse error: %s", str_addr);
	} else {
		t_error(1, 0, "illegal domain");
	}
}
/*
 * Block in poll() on @fd for @events with no timeout. Returns 1 when one of
 * the requested events was reported, 0 otherwise; a poll() error exits
 * through t_error().
 */
static int do_poll(int fd, int events)
{
	struct pollfd pfd = {
		.fd = fd,
		.events = events,
		.revents = 0,
	};
	int nready;

	nready = poll(&pfd, 1, -1);
	if (nready == -1)
		t_error(1, errno, "poll");
	if (!nready)
		return 0;

	return (pfd.revents & events) != 0;
}
/*
 * Flush all outstanding bytes for the tcp receive queue.
 *
 * Returns 1 when recv() reports 0 bytes, 0 otherwise (including when nothing
 * is queued). Flushed data is accounted in @td as one packet.
 */
static int do_flush_tcp(struct thread_data *td, int fd)
{
	int nbytes;

	/* MSG_TRUNC flushes up to len bytes */
	nbytes = recv(fd, NULL, 1 << 21, MSG_TRUNC | MSG_DONTWAIT);
	if (nbytes == -1) {
		if (errno == EAGAIN)
			return 0;
		t_error(1, errno, "flush");
	}
	if (nbytes == 0)
		return 1;

	td->packets++;
	td->bytes += nbytes;
	return 0;
}
/*
 * Flush all outstanding datagrams. Verify first few bytes of each.
 *
 * Receives one datagram without blocking. Its full length must equal
 * cfg_payload_len and its leading bytes (at most sizeof(buf)) must match the
 * start of payload; either mismatch exits through t_error(). Always returns
 * 0, including when no datagram is queued.
 */
static int do_flush_datagram(struct thread_data *td, int fd)
{
	char buf[64];
	size_t cmp_len;
	long ret;

	/* MSG_TRUNC will return full datagram length */
	ret = recv(fd, buf, sizeof(buf), MSG_DONTWAIT | MSG_TRUNC);
	if (ret == -1 && errno == EAGAIN)
		return 0;
	if (ret == -1)
		t_error(1, errno, "recv");
	/* conversions must match the argument types: long and int */
	if (ret != cfg_payload_len)
		t_error(1, 0, "recv: ret=%ld != %d", ret, cfg_payload_len);

	/* only the part that actually landed in buf can be compared */
	cmp_len = (size_t) ret;
	if (cmp_len > sizeof(buf))
		cmp_len = sizeof(buf);
	if (memcmp(buf, payload, cmp_len))
		t_error(1, 0, "recv: data mismatch");

	td->packets++;
	td->bytes += cfg_payload_len;
	return 0;
}
/*
 * Create and bind the receive socket. For non-stream sockets the bound socket
 * itself becomes threads[0].fd (exactly one thread allowed). For stream
 * sockets it listens, accepts one connection per configured thread into
 * threads[i].fd, and then closes the listening socket.
 */
static void do_setup_rx(int domain, int type, int protocol)
{
	struct sockaddr_storage addr = {};
	unsigned int i;
	int sock;

	sock = socket(domain, type, protocol);
	if (sock == -1)
		t_error(1, errno, "socket r");

	do_setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, 1);

	setup_sockaddr(cfg_family, str_addr, &addr);
	if (bind(sock, (void *)&addr, cfg_alen) != 0)
		t_error(1, errno, "bind");

	if (type != SOCK_STREAM) {
		if (cfg_nr_threads != 1)
			t_error(1, 0, "udp rx cant multithread");
		threads[0].fd = sock;
		return;
	}

	if (listen(sock, cfg_nr_threads) != 0)
		t_error(1, errno, "listen");

	for (i = 0; i < cfg_nr_threads; i++) {
		int conn = accept(sock, NULL, NULL);

		if (conn == -1)
			t_error(1, errno, "accept");
		threads[i].fd = conn;
	}

	if (close(sock) != 0)
		t_error(1, errno, "close listen sock");
}
/*
 * Receiver thread body. Drains td->fd (tcp or datagram flavour depending on
 * cfg_type) and polls for more input until the flush routine asks to stop or
 * cfg_runtime_ms plus a 400 msec grace period has passed, then closes the
 * socket and exits the thread with &td->ret.
 */
static void *do_rx(void *arg)
{
	const int cfg_receiver_wait_ms = 400;
	struct thread_data *td = arg;
	int fd = td->fd;
	uint64_t deadline;

	deadline = gettimeofday_ms() + cfg_runtime_ms + cfg_receiver_wait_ms;

	for (;;) {
		int done;

		if (cfg_type == SOCK_STREAM)
			done = do_flush_tcp(td, fd);
		else
			done = do_flush_datagram(td, fd);
		if (done)
			break;

		do_poll(fd, POLLIN);
		if (gettimeofday_ms() >= deadline)
			break;
	}

	if (close(fd) != 0)
		t_error(1, errno, "close");

	pthread_exit(&td->ret);
	return NULL;
}
static inline struct io_uring_cqe *wait_cqe_fast(struct io_uring *ring)
{
struct io_uring_cqe *cqe;
unsigned head;
int ret;
io_uring_for_each_cqe(ring, head, cqe)
return cqe;
ret = io_uring_wait_cqe(ring, &cqe);
if (ret)
t_error(1, ret, "wait cqe");
return cqe;
}
/*
 * Sender body for one thread.
 *
 * Creates a socket of the given domain/type/protocol, connects it to
 * td->dst_addr, sets up a 512-entry ring (optionally with a fixed file,
 * registered ring fd and a registered payload buffer), waits on the shared
 * barrier and then repeatedly submits cfg_nr_reqs sends and reaps their
 * completions. Successful sends are accounted in td->packets/td->bytes and
 * the elapsed time is stored in td->dt_ms.
 *
 * The loop ends when should_stop is set, when the runtime has elapsed, or
 * when a send reports a connection failure (jump to out_fail).
 */
static void do_tx(struct thread_data *td, int domain, int type, int protocol)
{
/* with defer taskrun, once this many notifications are outstanding submit via io_uring_submit_and_get_events() */
const int notif_slack = 128;
struct io_uring ring;
struct iovec iov;
uint64_t tstart;
int i, fd, ret;
/* sends whose CQE carried F_MORE and whose notification CQE has not been reaped yet */
int compl_cqes = 0;
int ring_flags = IORING_SETUP_COOP_TASKRUN | IORING_SETUP_SINGLE_ISSUER;
unsigned loop = 0;
if (cfg_defer_taskrun)
ring_flags |= IORING_SETUP_DEFER_TASKRUN;
fd = socket(domain, type, protocol);
if (fd == -1)
t_error(1, errno, "socket t");
if (connect(fd, (void *)&td->dst_addr, cfg_alen))
t_error(1, errno, "connect, idx %i", td->idx);
ret = io_uring_queue_init(512, &ring, ring_flags);
if (ret)
t_error(1, ret, "io_uring: queue init");
set_cpu_affinity();
set_iowq_affinity(&ring);
if (cfg_fixed_files) {
/* the socket becomes fixed file index 0, which the send loop below relies on */
ret = io_uring_register_files(&ring, &fd, 1);
if (ret < 0)
t_error(1, ret, "io_uring: files registration");
}
if (cfg_reg_ringfd) {
ret = io_uring_register_ring_fd(&ring);
if (ret < 0)
t_error(1, ret, "io_uring: io_uring_register_ring_fd");
}
/* the payload is registered as buffer 0 regardless of cfg_fixed_buf */
iov.iov_base = payload;
iov.iov_len = cfg_payload_len;
ret = io_uring_register_buffers(&ring, &iov, 1);
if (ret)
t_error(1, ret, "io_uring: buffer registration");
if (cfg_rx_poll) {
struct io_uring_sqe *sqe;
sqe = io_uring_get_sqe(&ring);
io_uring_prep_poll_add(sqe, fd, POLLIN);
ret = io_uring_submit(&ring);
if (ret != 1)
t_error(1, ret, "submit poll");
}
/* start timing only after passing the shared barrier */
pthread_barrier_wait(&barrier);
tstart = gettimeofday_ms();
do {
struct io_uring_sqe *sqe;
struct io_uring_cqe *cqe;
unsigned buf_idx = 0;
unsigned msg_flags = MSG_WAITALL;
/* NOTE(review): the io_uring_get_sqe() result is used without a NULL check */
for (i = 0; i < cfg_nr_reqs; i++) {
sqe = io_uring_get_sqe(&ring);
if (!cfg_zc)
io_uring_prep_send(sqe, fd, payload,
cfg_payload_len, 0);
else {
io_uring_prep_send_zc(sqe, fd, payload,
cfg_payload_len, msg_flags, 0);
if (cfg_fixed_buf) {
sqe->ioprio |= IORING_RECVSEND_FIXED_BUF;
sqe->buf_index = buf_idx;
}
}
sqe->user_data = 1;
if (cfg_fixed_files) {
/* refer to the socket by its registered file index */
sqe->fd = 0;
sqe->flags |= IOSQE_FIXED_FILE;
}
}
if (cfg_defer_taskrun && compl_cqes >= notif_slack)
ret = io_uring_submit_and_get_events(&ring);
else
ret = io_uring_submit(&ring);
if (ret != cfg_nr_reqs)
t_error(1, ret, "submit");
/* reap one non-notification CQE per submitted request */
for (i = 0; i < cfg_nr_reqs; i++) {
cqe = wait_cqe_fast(&ring);
if (cqe->flags & IORING_CQE_F_NOTIF) {
if (cqe->flags & IORING_CQE_F_MORE)
t_error(1, -EINVAL, "F_MORE notif");
/* a notification does not count towards this batch, so undo the loop increment */
compl_cqes--;
i--;
io_uring_cqe_seen(&ring, cqe);
continue;
}
/* F_MORE on a send completion is counted as one pending notification */
if (cqe->flags & IORING_CQE_F_MORE)
compl_cqes++;
if (cqe->res >= 0) {
td->packets++;
td->bytes += cqe->res;
} else if (cqe->res == -ECONNREFUSED || cqe->res == -EPIPE ||
cqe->res == -ECONNRESET) {
/* NOTE(review): this path leaves the current CQE unseen and td->dt_ms unset - confirm intended */
fprintf(stderr, "Connection failure\n");
goto out_fail;
} else if (cqe->res != -EAGAIN) {
t_error(1, cqe->res, "send failed");
}
io_uring_cqe_seen(&ring, cqe);
}
if (should_stop)
break;
/* the clock is only consulted on every 16th iteration */
} while ((++loop % 16 != 0) || gettimeofday_ms() < tstart + cfg_runtime_ms);
td->dt_ms = gettimeofday_ms() - tstart;
out_fail:
shutdown(fd, SHUT_RDWR);
if (close(fd))
t_error(1, errno, "close");
/* consume one CQE for every notification still counted as pending */
while (compl_cqes) {
struct io_uring_cqe *cqe = wait_cqe_fast(&ring);
io_uring_cqe_seen(&ring, cqe);
compl_cqes--;
}
io_uring_queue_exit(&ring);
}
/*
 * Sender thread entry point: resolve the destination address into
 * td->dst_addr, run the transmit loop with protocol 0, then exit the thread
 * with &td->ret.
 */
static void *do_test(void *arg)
{
	struct thread_data *td = arg;

	setup_sockaddr(cfg_family, str_addr, &td->dst_addr);
	do_tx(td, cfg_family, cfg_type, 0);

	pthread_exit(&td->ret);
	return NULL;
}
static void usage(const char *filepath)
{
printf("Usage:\t%s -D [options]\n", filepath);
printf("\t%s