pax_global_header00006660000000000000000000000064145361036270014521gustar00rootroot0000000000000052 comment=c1247bffa8fc36de7380a5cd42673a3b32f74c97 xsimd-12.1.1/000077500000000000000000000000001453610362700127275ustar00rootroot00000000000000xsimd-12.1.1/.clang-format000066400000000000000000000002301453610362700152750ustar00rootroot00000000000000--- BasedOnStyle: WebKit AlignAfterOpenBracket: Align AlignConsecutiveDeclarations: 'false' BreakBeforeBraces: Allman NamespaceIndentation: All ... xsimd-12.1.1/.github/000077500000000000000000000000001453610362700142675ustar00rootroot00000000000000xsimd-12.1.1/.github/toolchains/000077500000000000000000000000001453610362700164325ustar00rootroot00000000000000xsimd-12.1.1/.github/toolchains/clang-aarch64-linux-gnu.cmake000066400000000000000000000001621453610362700236710ustar00rootroot00000000000000set(CMAKE_SYSTEM_PROCESSOR aarch64) set(triple aarch64-linux-gnu) include(${CMAKE_CURRENT_LIST_DIR}/clang.cmake) xsimd-12.1.1/.github/toolchains/clang-arm-linux-gnueabihf.cmake000066400000000000000000000001641453610362700243610ustar00rootroot00000000000000set(CMAKE_SYSTEM_PROCESSOR armv7-a) set(triple arm-linux-gnueabihf) include(${CMAKE_CURRENT_LIST_DIR}/clang.cmake) xsimd-12.1.1/.github/toolchains/clang-riscv64-linux-gnu.cmake000066400000000000000000000001621453610362700237410ustar00rootroot00000000000000set(CMAKE_SYSTEM_PROCESSOR riscv64) set(triple riscv64-linux-gnu) include(${CMAKE_CURRENT_LIST_DIR}/clang.cmake) xsimd-12.1.1/.github/toolchains/clang.cmake000066400000000000000000000005341453610362700205220ustar00rootroot00000000000000set(CMAKE_SYSTEM_NAME Linux) set(CMAKE_C_COMPILER clang) set(CMAKE_C_COMPILER_TARGET ${triple}) set(CMAKE_CXX_COMPILER clang++) set(CMAKE_CXX_COMPILER_TARGET ${triple}) set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) xsimd-12.1.1/.github/toolchains/gcc-aarch64-linux-gnu.cmake000066400000000000000000000001601453610362700233370ustar00rootroot00000000000000set(CMAKE_SYSTEM_PROCESSOR aarch64) set(triple aarch64-linux-gnu) include(${CMAKE_CURRENT_LIST_DIR}/gcc.cmake) xsimd-12.1.1/.github/toolchains/gcc-arm-linux-gnueabihf.cmake000066400000000000000000000001621453610362700240270ustar00rootroot00000000000000set(CMAKE_SYSTEM_PROCESSOR armv7-a) set(triple arm-linux-gnueabihf) include(${CMAKE_CURRENT_LIST_DIR}/gcc.cmake) xsimd-12.1.1/.github/toolchains/gcc-riscv64-linux-gnu.cmake000066400000000000000000000001601453610362700234070ustar00rootroot00000000000000set(CMAKE_SYSTEM_PROCESSOR riscv64) set(triple riscv64-linux-gnu) include(${CMAKE_CURRENT_LIST_DIR}/gcc.cmake) xsimd-12.1.1/.github/toolchains/gcc.cmake000066400000000000000000000004321453610362700201670ustar00rootroot00000000000000set(CMAKE_SYSTEM_NAME Linux) set(CMAKE_C_COMPILER ${triple}-gcc) set(CMAKE_CXX_COMPILER ${triple}-g++) set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) xsimd-12.1.1/.github/workflows/000077500000000000000000000000001453610362700163245ustar00rootroot00000000000000xsimd-12.1.1/.github/workflows/android.yml000066400000000000000000000021151453610362700204660ustar00rootroot00000000000000name: Android build on: [push, pull_request] concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} cancel-in-progress: true jobs: test: runs-on: ubuntu-latest strategy: matrix: target: - armeabi-v7a - arm64-v8a - x86 - x86_64 api: - 16 - 18 steps: - name: Checkout uses: actions/checkout@v3 - name: Build script env: TARGET: ${{ matrix.target }} API: ${{ matrix.api }} run: | mkdir _build NDK="$($ANDROID_HOME/cmdline-tools/latest/bin/sdkmanager --list_installed | sed -E 's/( +[|] +)/|/g;s/ +$//' | grep '^ ndk' | cut -d '|' -f 4 | sort | head -n1)" cd _build && \ cmake .. -DCMAKE_TOOLCHAIN_FILE=$ANDROID_HOME/$NDK/build/cmake/android.toolchain.cmake \ -DANDROID_ABI=$ABI \ -DANDROID_PLATFORM=android-$API \ -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DCMAKE_BUILD_TYPE=Release cmake --build . --verbose xsimd-12.1.1/.github/workflows/benchmark.yml000066400000000000000000000011421453610362700207770ustar00rootroot00000000000000name: benchmark & examples on: [push, pull_request] concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} cancel-in-progress: true jobs: build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - name: Install dependencies run: | sudo apt install g++ cmake - name: Setup run: | mkdir _build cd _build && cmake .. -DBUILD_BENCHMARK=ON -DBUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=Release - name: Build run: cmake --build _build - name: Testing sequential run: cmake --build _build --target xbenchmark xsimd-12.1.1/.github/workflows/cross-rvv.yml000066400000000000000000000045061453610362700210200ustar00rootroot00000000000000name: RISC-V RVV cross-compilation build on: [push, pull_request] concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} cancel-in-progress: true env: LLVM_VERSION: "17" GCC_VERSION: "12" jobs: build: runs-on: ubuntu-22.04 name: 'RISC-V RVV${{ matrix.vector_bits }}' strategy: matrix: vector_bits: - 128 - 256 - 512 steps: - name: Setup GCC run: | sudo apt-get -y -qq update sudo apt-get -y -qq --no-install-suggests --no-install-recommends install gcc-${GCC_VERSION}-riscv64-linux-gnu g++-${GCC_VERSION}-riscv64-linux-gnu sudo update-alternatives --install /usr/bin/riscv64-linux-gnu-gcc riscv64-linux-gnu-gcc /usr/bin/riscv64-linux-gnu-gcc-${GCC_VERSION} 20 sudo update-alternatives --install /usr/bin/riscv64-linux-gnu-g++ riscv64-linux-gnu-g++ /usr/bin/riscv64-linux-gnu-g++-${GCC_VERSION} 20 - name: Setup LLVM run: | # Install latest LLVM stable curl -o llvm.sh https://apt.llvm.org/llvm.sh chmod u+x llvm.sh sudo ./llvm.sh ${LLVM_VERSION} sudo ln -srf $(which clang-${LLVM_VERSION}) /usr/bin/clang sudo ln -srf $(which clang++-${LLVM_VERSION}) /usr/bin/clang++ rm llvm.sh - name: Setup QEMU uses: docker/setup-qemu-action@v3.0.0 with: platforms: riscv64 - name: Setup Ninja run: | sudo apt-get -y -qq install ninja-build - name: Checkout xsimd uses: actions/checkout@v3 - name: Setup run: > cmake -S . -B _build -GNinja -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DCMAKE_BUILD_TYPE=Release -DTARGET_ARCH=generic -DCMAKE_C_FLAGS="-march=rv64gcv_zvl${{ matrix.vector_bits }}b_zba_zbb_zbs -mrvv-vector-bits=zvl" -DCMAKE_CXX_FLAGS="-march=rv64gcv_zvl${{ matrix.vector_bits }}b_zba_zbb_zbs -mrvv-vector-bits=zvl" -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/.github/toolchains/clang-riscv64-linux-gnu.cmake - name: Build run: cmake --build _build - name: Testing xsimd run: > QEMU_CPU="rv64,zba=true,zbb=true,zbs=true,v=true,vlen=${{ matrix.vector_bits }},elen=64,vext_spec=v1.0" QEMU_LD_PREFIX="/usr/riscv64-linux-gnu" ./test/test_xsimd working-directory: ${{ github.workspace }}/_build xsimd-12.1.1/.github/workflows/cross-sve.yml000066400000000000000000000033461453610362700210010ustar00rootroot00000000000000name: Arm-SVE cross-compilation build on: [push, pull_request] concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} cancel-in-progress: true jobs: build: runs-on: ubuntu-20.04 name: 'Arm SVE${{ matrix.vector_bits }}' strategy: matrix: vector_bits: - 128 - 256 - 512 steps: - name: Setup compiler run: | sudo apt-get update || exit 1 sudo apt-get --no-install-suggests --no-install-recommends install g++-10-aarch64-linux-gnu || exit 1 sudo update-alternatives --install /usr/bin/aarch64-linux-gnu-gcc aarch64-linux-gnu-gcc /usr/bin/aarch64-linux-gnu-gcc-10 20 sudo update-alternatives --install /usr/bin/aarch64-linux-gnu-g++ aarch64-linux-gnu-g++ /usr/bin/aarch64-linux-gnu-g++-10 20 - name: Setup QEMU run: | sudo apt-get --no-install-suggests --no-install-recommends install qemu-user - name: Setup Ninja run: | sudo apt-get install ninja-build - name: Checkout xsimd uses: actions/checkout@v3 - name: Setup run: | mkdir _build cd _build && cmake .. -GNinja -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DCMAKE_BUILD_TYPE=Release -DTARGET_ARCH=generic -DCMAKE_C_FLAGS="-march=armv8-a+sve -msve-vector-bits=${{ matrix.vector_bits }}" -DCMAKE_CXX_FLAGS="-march=armv8-a+sve -msve-vector-bits=${{ matrix.vector_bits }}" -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/.github/toolchains/gcc-aarch64-linux-gnu.cmake - name: Build run: cmake --build _build - name: Testing xsimd run: | qemu-aarch64 --cpu max,sve${{ matrix.vector_bits }}=on -L /usr/aarch64-linux-gnu/ ./test/test_xsimd working-directory: ${{ github.workspace }}/_build xsimd-12.1.1/.github/workflows/cross.yml000066400000000000000000000070651453610362700202100ustar00rootroot00000000000000name: Arm cross-compilation build on: [push, pull_request] concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} cancel-in-progress: true jobs: build: runs-on: ubuntu-20.04 name: '${{ matrix.target.arch }}, ${{ matrix.sys.compiler }} ${{ matrix.sys.version }}' strategy: matrix: target: - { platform: 'arm', arch: 'armv7-a', dir: 'arm-linux-gnueabihf', flags: '-mfpu=neon', full: 'ON'} - { platform: 'arm', arch: 'armv7-a', dir: 'arm-linux-gnueabihf', flags: '-mfpu=vfpv3-d16', full: 'OFF' } # no neon - { platform: 'aarch64', arch: 'armv8-a', dir: 'aarch64-linux-gnu', flags: '', full: 'ON' } sys: - { compiler: 'gcc', version: '8' } - { compiler: 'clang', version: 'latest' } steps: - name: Setup compiler if: ${{ matrix.sys.compiler == 'clang' }} run: | LLVM_VERSION=${{ matrix.sys.version }} wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add - || exit 1 if [[ $LLVM_VERSION -eq 'latest' ]]; then sudo add-apt-repository "deb http://apt.llvm.org/focal/ llvm-toolchain-focal main" || exit 1 else sudo add-apt-repository "deb http://apt.llvm.org/focal/ llvm-toolchain-focal-$LLVM_VERSION main" || exit 1 fi sudo apt-get update || exit 1 if [[ $LLVM_VERSION -eq 'latest' ]]; then sudo apt-get --no-install-suggests --no-install-recommends install clang || exit 1 else sudo apt-get --no-install-suggests --no-install-recommends install clang-$LLVM_VERSION || exit 1 fi sudo apt-get --no-install-suggests --no-install-recommends install g++-9-${{ matrix.target.dir }} g++-9-multilib || exit 1 - name: Setup compiler if: ${{ matrix.sys.compiler == 'gcc' }} run: | sudo apt-get update || exit 1 sudo apt-get --no-install-suggests --no-install-recommends install g++-${{ matrix.sys.version }}-${{ matrix.target.dir }} g++-${{ matrix.sys.version }}-multilib || exit 1 sudo update-alternatives --remove-all ${{ matrix.target.dir }}-gcc || true sudo update-alternatives --remove-all ${{ matrix.target.dir }}-g++ || true sudo update-alternatives --install /usr/bin/${{ matrix.target.dir }}-gcc ${{ matrix.target.dir }}-gcc /usr/bin/${{ matrix.target.dir }}-gcc-${{ matrix.sys.version }} 20 sudo update-alternatives --install /usr/bin/${{ matrix.target.dir }}-g++ ${{ matrix.target.dir }}-g++ /usr/bin/${{ matrix.target.dir }}-g++-${{ matrix.sys.version }} 20 - name: Setup QEMU run: | sudo apt-get --no-install-suggests --no-install-recommends install qemu-user - name: Setup Ninja run: | sudo apt-get install ninja-build - name: Checkout xsimd uses: actions/checkout@v3 - name: Setup run: | mkdir _build cd _build && cmake .. -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=${{ matrix.target.full }} -DBUILD_EXAMPLES=${{ matrix.target.full }} -DCMAKE_BUILD_TYPE=Release -DTARGET_ARCH=generic -DCMAKE_C_FLAGS="-march=${{ matrix.target.arch }} ${{ matrix.target.flags }}" -DCMAKE_CXX_FLAGS="-march=${{ matrix.target.arch }} ${{ matrix.target.flags }}" -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/.github/toolchains/${{ matrix.sys.compiler }}-${{ matrix.target.dir }}.cmake - name: Build run: cmake --build _build - name: Testing xsimd run: | qemu-${{ matrix.target.platform }} -L /usr/${{ matrix.target.dir}}/ ./test/test_xsimd working-directory: ${{ github.workspace }}/_build xsimd-12.1.1/.github/workflows/cxx-no-exceptions.yml000066400000000000000000000007161453610362700224460ustar00rootroot00000000000000name: C++ -fno-except compatibility on: [push, pull_request] jobs: build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - name: Install dependencies run: | sudo apt install g++ cmake - name: Setup run: | mkdir _build cd _build && cmake .. -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS=-fno-exceptions - name: Build run: cmake --build _build xsimd-12.1.1/.github/workflows/cxx-versions.yml000066400000000000000000000012011453610362700215110ustar00rootroot00000000000000name: C++ compatibility build on: [push, pull_request] concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} cancel-in-progress: true jobs: build: runs-on: ubuntu-latest strategy: matrix: cxx-version: [11, 14, 17, 20] steps: - uses: actions/checkout@v3 - name: Install dependencies run: | sudo apt install g++ cmake - name: Setup run: | mkdir _build cd _build && cmake .. -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_STANDARD=${{matrix.cxx-version}} - name: Build run: cmake --build _build xsimd-12.1.1/.github/workflows/doxygen.yml000066400000000000000000000005751453610362700205330ustar00rootroot00000000000000name: doc on: [push, pull_request] concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} cancel-in-progress: true jobs: build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - name: Install dependencies run: sudo apt install doxygen python3-breathe python3-sphinx-rtd-theme - name: Render run: make -C docs xsimd-12.1.1/.github/workflows/emscripten.yml000066400000000000000000000012301453610362700212140ustar00rootroot00000000000000name: Emscripten build on: [push, pull_request] concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} cancel-in-progress: true jobs: test: runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v3 - uses: mamba-org/setup-micromamba@v1 with: environment-name: xsimd create-args: >- microsoft::playwright python init-shell: bash - name: Build script shell: bash -el {0} run: | echo "Build script for wasm" playwright install ./test/test_wasm/test_wasm.sh xsimd-12.1.1/.github/workflows/linux.yml000066400000000000000000000131071453610362700202100ustar00rootroot00000000000000name: Linux build on: [push, pull_request] concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} cancel-in-progress: true defaults: run: shell: bash -l {0} jobs: build: runs-on: ubuntu-20.04 name: '${{ matrix.sys.compiler }} ${{ matrix.sys.version }} - ${{ matrix.sys.flags }}' strategy: matrix: sys: - { compiler: 'gcc', version: '7', flags: 'force_no_instr_set' } - { compiler: 'gcc', version: '8', flags: 'enable_xtl_complex' } - { compiler: 'gcc', version: '9', flags: 'avx' } #- { compiler: 'gcc', version: '10', flags: 'avx512' } buggy - { compiler: 'gcc', version: '11', flags: 'avx512' } - { compiler: 'gcc', version: '11', flags: 'i386' } - { compiler: 'gcc', version: '11', flags: 'avx512pf' } - { compiler: 'gcc', version: '11', flags: 'avx512vbmi' } - { compiler: 'gcc', version: '11', flags: 'avx512vnni' } - { compiler: 'clang', version: '8', flags: 'force_no_instr_set' } - { compiler: 'clang', version: '10', flags: 'enable_xtl_complex' } - { compiler: 'clang', version: '12', flags: 'avx' } - { compiler: 'clang', version: '13', flags: 'sse3' } - { compiler: 'clang', version: '14', flags: 'avx512' } steps: - name: Setup compiler if: ${{ matrix.sys.compiler == 'gcc' }} run: | GCC_VERSION=${{ matrix.sys.version }} if [[ $GCC_VERSION == '6' || $GCC_VERSION == '7' || $GCC_VERSION == '8' ]]; then #sudo add-apt-repository ppa:ubuntu-toolchain-r/test sudo apt-get update sudo apt-get --no-install-suggests --no-install-recommends install g++-$GCC_VERSION fi if [[ '${{ matrix.sys.flags }}' -eq 'i386' ]]; then sudo dpkg --add-architecture i386 sudo add-apt-repository ppa:ubuntu-toolchain-r/test sudo apt-get update sudo apt-get --no-install-suggests --no-install-recommends install gcc-$GCC_VERSION-multilib g++-$GCC_VERSION-multilib linux-libc-dev:i386 fi CC=gcc-$GCC_VERSION echo "CC=$CC" >> $GITHUB_ENV CXX=g++-$GCC_VERSION echo "CXX=$CXX" >> $GITHUB_ENV - name: Setup compiler if: ${{ matrix.sys.compiler == 'clang' }} run: | LLVM_VERSION=${{ matrix.sys.version }} #sudo add-apt-repository ppa:ubuntu-toolchain-r/test || exit 1 wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add - || exit 1 if [[ $LLVM_VERSION -ge 13 ]]; then sudo add-apt-repository "deb http://apt.llvm.org/focal/ llvm-toolchain-focal-$LLVM_VERSION main" || exit 1 else sudo add-apt-repository "deb http://apt.llvm.org/focal/ llvm-toolchain-focal main" || exit 1 fi || exit 1 sudo apt-get update || exit 1 sudo apt-get --no-install-suggests --no-install-recommends install clang-$LLVM_VERSION || exit 1 sudo apt-get --no-install-suggests --no-install-recommends install g++-9 g++-9-multilib || exit 1 sudo ln -s /usr/include/asm-generic /usr/include/asm CC=clang-$LLVM_VERSION echo "CC=$CC" >> $GITHUB_ENV CXX=clang++-$LLVM_VERSION echo "CXX=$CXX" >> $GITHUB_ENV - name: Checkout xsimd uses: actions/checkout@v3 - name: Install mamba uses: mamba-org/provision-with-micromamba@main with: environment-file: environment.yml - name: Setup SDE if: startswith(matrix.sys.flags, 'avx512') run: sh install_sde.sh - name: Configure build env: CC: ${{ env.CC }} CXX: ${{ env.CXX }} run: | if [[ '${{ matrix.sys.flags }}' == 'enable_xtl_complex' ]]; then CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DENABLE_XTL_COMPLEX=ON" fi if [[ '${{ matrix.sys.flags }}' == 'avx' ]]; then CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=sandybridge" fi if [[ '${{ matrix.sys.flags }}' == 'sse3' ]]; then CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=nocona" fi if [[ '${{ matrix.sys.flags }}' == 'avx512' ]]; then CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=skylake-avx512" fi if [[ '${{ matrix.sys.flags }}' == 'avx512pf' ]]; then CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=knl" fi if [[ '${{ matrix.sys.flags }}' == 'avx512vbmi' ]]; then CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=cannonlake" fi if [[ '${{ matrix.sys.flags }}' == 'avx512vnni' ]]; then CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=knm" fi if [[ '${{ matrix.sys.flags }}' == 'i386' ]]; then CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DCMAKE_CXX_FLAGS='-m32'" fi if [[ '${{ matrix.sys.flags }}' == 'force_no_instr_set' ]]; then : else CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DXSIMD_ENABLE_WERROR=ON" fi mkdir _build cd _build cmake .. -DBUILD_TESTS=ON \ -DBUILD_BENCHMARK=ON \ -DBUILD_EXAMPLES=ON \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_C_COMPILER=$CC \ -DCMAKE_CXX_COMPILER=$CXX \ $CMAKE_EXTRA_ARGS \ -G Ninja - name: Build run: ninja -C _build - name: Test run: | cd _build cd test if echo '${{ matrix.sys.flags }}' | grep -q 'avx512' ; then ../../sde-external-8.69.1-2021-07-18-lin/sde64 -skx -- ./test_xsimd else ./test_xsimd fi xsimd-12.1.1/.github/workflows/macos.yml000066400000000000000000000014571453610362700201600ustar00rootroot00000000000000name: macOS build on: [push, pull_request] concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} cancel-in-progress: true jobs: build: strategy: matrix: os: - 11 - 12 runs-on: macos-${{ matrix.os }} name: 'macos-${{ matrix.os }}' steps: - uses: actions/checkout@v3 - name: Setup run: | mkdir _build cd _build && cmake .. -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=ON -DBUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" - name: Build run: cmake --build _build --verbose - name: Testing sequential run: cmake --build _build --target xbenchmark --verbose - name: Testing xsimd run: ${{github.workspace}}/_build/test/test_xsimd xsimd-12.1.1/.github/workflows/style-check.yml000066400000000000000000000012451453610362700212640ustar00rootroot00000000000000name: style check on: [push, pull_request] concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} cancel-in-progress: true jobs: formatting-check: name: Format check runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - name: Run clang-format style check for C/C++ programs. uses: jidicula/clang-format-action@v4.11.0 with: clang-format-version: '17' exclude-regex: 'doctest.h' inlining-check: runs-on: ubuntu-latest name: Check inline keyword usage steps: - uses: actions/checkout@v2 - run: sudo apt install clang-tools - run: sh ./test/check_inline_specifier.sh . xsimd-12.1.1/.github/workflows/windows.yml000066400000000000000000000057101453610362700205440ustar00rootroot00000000000000name: Windows build on: [push, pull_request] concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} cancel-in-progress: true jobs: build: name: 'MSVC ${{ matrix.os }}, ${{ matrix.target }} ${{ matrix.sys.set }}' defaults: run: shell: bash {0} strategy: matrix: os: - 2019 - 2022 target: - x86 - x64 sys: - { set: SSE, flags: "/arch:SSE2" } - { set: AVX, flags: "/arch:AVX" } - { set: AVX2, flags: "/arch:AVX2" } - { set: AVX512, flags: "/arch:AVX512" } exclude: # AVX on both platforms has a codegen error # On 2019 in _mm256_rsqrt_ps, on 2022 in _mm256_blend_p* - { sys: { set: AVX } } # On both platforms x86 + AVX512 triggers a compiler crash - { target: x86, sys: { set: AVX512 } } # /arch:SSE2 is not available on x64 platforms (SSE2 is enabled by default) - { target: x64, sys: { set: SSE} } runs-on: windows-${{ matrix.os }} steps: - name: Setup compiler uses: ilammy/msvc-dev-cmd@v1 with: arch: ${{ matrix.target }} - name: Setup Ninja run: | python3 -m pip install --upgrade pip setuptools wheel python3 -m pip install ninja - name: Checkout xsimd uses: actions/checkout@v3 - name: Setup run: | mkdir _build cd _build && cmake .. -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=ON -DBUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS="${{ matrix.sys.flags }}" -G Ninja - name: Build run: | cd _build && cmake --build . - name: Testing xsimd if: ${{ !startsWith(matrix.sys.set, 'AVX512') }} run: | cd _build && ./test/test_xsimd build-windows-mingw: name: 'MSYS2 ${{ matrix.msystem }}' runs-on: windows-2019 defaults: run: shell: msys2 {0} strategy: matrix: # Temporarily remove MINGW64 and UCRT64 builds because # GCC 12 gives an unexpected overflow warning for __builtin_memmove # see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106199 msystem: [ MINGW32, CLANG32, CLANG64 ] #msystem: [ MINGW32, MINGW64, UCRT64, CLANG32, CLANG64 ] fail-fast: false steps: - name: Use MinGW from MSYS2 uses: msys2/setup-msys2@v2 with: msystem: ${{ matrix.msystem }} update: true path-type: minimal pacboy: >- cc:p cmake:p ninja:p doctest:p - name: Checkout xsimd uses: actions/checkout@v2 - name: Configure run: | mkdir _build cd _build cmake .. -DBUILD_TESTS=ON -DBUILD_BENCHMARK=ON -DBUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=Release -G Ninja - name: Build run: ninja -C _build - name: Test run: | cd _build && ./test/test_xsimd xsimd-12.1.1/.gitignore000066400000000000000000000007251453610362700147230ustar00rootroot00000000000000# Generated pkg-config files *.pc # Compiled Object files *.slo *.lo *.o *.obj # Precompiled Headers *.gch *.pch # Compiled Dynamic libraries *.so *.dylib *.dll # Fortran module files *.mod # Compiled Static libraries *.lai *.la *.a *.lib # Executables *.exe *.out *.app # Vim tmp files *.swp # Build folder build/ # Documentation build artefacts docs/CMakeCache.txt docs/xml/ docs/build/ # VSCode / clangd IntelliSense .vscode/ .cache/ # CLion / IDEA .idea/xsimd-12.1.1/CMakeLists.txt000066400000000000000000000156421453610362700154770ustar00rootroot00000000000000############################################################################ # Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and # # Martin Renou # # Copyright (c) QuantStack # # Copyright (c) Serge Guelton # # # # Distributed under the terms of the BSD 3-Clause License. # # # # The full license is in the file LICENSE, distributed with this software. # ############################################################################ cmake_minimum_required(VERSION 3.8) project(xsimd) option(XSIMD_REFACTORING ON) set(XSIMD_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include) # Versioning # ========== file(STRINGS "${XSIMD_INCLUDE_DIR}/xsimd/config/xsimd_config.hpp" xsimd_version_defines REGEX "#define XSIMD_VERSION_(MAJOR|MINOR|PATCH)") foreach(ver ${xsimd_version_defines}) if(ver MATCHES "#define XSIMD_VERSION_(MAJOR|MINOR|PATCH) +([^ ]+)$") set(XSIMD_VERSION_${CMAKE_MATCH_1} "${CMAKE_MATCH_2}" CACHE INTERNAL "") endif() endforeach() set(${PROJECT_NAME}_VERSION ${XSIMD_VERSION_MAJOR}.${XSIMD_VERSION_MINOR}.${XSIMD_VERSION_PATCH}) message(STATUS "xsimd v${${PROJECT_NAME}_VERSION}") # Build # ===== set(XSIMD_HEADERS ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_constants.hpp ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_fma3_avx.hpp ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_fma3_avx2.hpp ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_fma3_sse.hpp ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_fma4.hpp ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_generic.hpp ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_isa.hpp ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_neon.hpp ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_neon64.hpp ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_rvv.hpp ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_scalar.hpp ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_sse2.hpp ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_sse3.hpp ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_sse4_1.hpp ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_sse4_2.hpp ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_ssse3.hpp ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_sve.hpp ${XSIMD_INCLUDE_DIR}/xsimd/config/xsimd_arch.hpp ${XSIMD_INCLUDE_DIR}/xsimd/config/xsimd_config.hpp ${XSIMD_INCLUDE_DIR}/xsimd/config/xsimd_cpuid.hpp ${XSIMD_INCLUDE_DIR}/xsimd/memory/xsimd_aligned_allocator.hpp ${XSIMD_INCLUDE_DIR}/xsimd/memory/xsimd_alignment.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_all_registers.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_api.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_neon_register.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_neon64_register.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_avx2_register.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_avx512f_register.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_avx_register.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_batch.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_batch_constant.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_fma3_avx_register.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_fma3_avx2_register.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_fma3_sse_register.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_fma4_register.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_generic_arch.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_register.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_rvv_register.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_sse2_register.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_sse3_register.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_sse4_1_register.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_sse4_2_register.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_ssse3_register.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_sve_register.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_traits.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_utils.hpp ${XSIMD_INCLUDE_DIR}/xsimd/xsimd.hpp ) add_library(xsimd INTERFACE) target_include_directories(xsimd INTERFACE $ $) OPTION(ENABLE_XTL_COMPLEX "enables support for xcomplex defined in xtl" OFF) OPTION(BUILD_TESTS "xsimd test suite" OFF) if(ENABLE_XTL_COMPLEX) find_package(xtl 0.7.0 REQUIRED) target_compile_features(xsimd INTERFACE cxx_std_14) target_compile_definitions(xsimd INTERFACE XSIMD_ENABLE_XTL_COMPLEX=1) target_link_libraries(xsimd INTERFACE xtl) else() target_compile_features(xsimd INTERFACE cxx_std_11) endif() if(BUILD_TESTS) enable_testing() add_subdirectory(test) endif() OPTION(BUILD_BENCHMARK "xsimd benchmarks" OFF) if(BUILD_BENCHMARK) add_subdirectory(benchmark) endif() OPTION(BUILD_EXAMPLES "xsimd examples" OFF) if(BUILD_EXAMPLES) add_subdirectory(examples) endif() # Installation # ============ OPTION(XSIMD_SKIP_INSTALL "Skip installation or not. By default it is OFF" OFF) if(${XSIMD_SKIP_INSTALL}) return() # skip installation endif () set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") include(JoinPaths) include(GNUInstallDirs) include(CMakePackageConfigHelpers) install(TARGETS xsimd EXPORT ${PROJECT_NAME}-targets) # Makes the project importable from the build directory export(EXPORT ${PROJECT_NAME}-targets FILE "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Targets.cmake") install(DIRECTORY ${XSIMD_INCLUDE_DIR}/xsimd DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) # GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share". set(XSIMD_CMAKECONFIG_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}" CACHE STRING "install path for xsimdConfig.cmake") configure_package_config_file(${PROJECT_NAME}Config.cmake.in "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake" INSTALL_DESTINATION ${XSIMD_CMAKECONFIG_INSTALL_DIR}) # xsimd is header-only and does not depend on the architecture. # Remove CMAKE_SIZEOF_VOID_P from xtensorConfigVersion.cmake so that an xtensorConfig.cmake # generated for a 64 bit target can be used for 32 bit targets and vice versa. set(_XTENSOR_CMAKE_SIZEOF_VOID_P ${CMAKE_SIZEOF_VOID_P}) unset(CMAKE_SIZEOF_VOID_P) write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake VERSION ${${PROJECT_NAME}_VERSION} COMPATIBILITY SameMajorVersion) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake DESTINATION ${XSIMD_CMAKECONFIG_INSTALL_DIR}) install(EXPORT ${PROJECT_NAME}-targets FILE ${PROJECT_NAME}Targets.cmake DESTINATION ${XSIMD_CMAKECONFIG_INSTALL_DIR}) configure_file(${PROJECT_NAME}.pc.in "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.pc" @ONLY) install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.pc" DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/") xsimd-12.1.1/CONTRIBUTING.md000066400000000000000000000016641453610362700151670ustar00rootroot00000000000000# Contributing to xsimd First, thanks for being there! Welcome on board, we will try to make your contributing journey as good an experience as it can be. # Submitting patches Patches should be submitted through Github PR. We di put some effort to setup a decent Continuous Integration coverage, please try to make it green ;-) We use [clang-format](https://clang.llvm.org/docs/ClangFormat.html) to keep the coding style consistent, a ``.clang-format`` file is shipped within the source, feel free to use it! # Extending the API We are open to extending the API, as long as it has been discussed either in an Issue or a PR. The only constraint is to add testing for new functions, and make sure they work on all supported architectures, not only your favorite one! # Licensing We use a shared copyright model that enables all contributors to maintain the copyright on their contributions. Stated otherwise, there's no copyright assignment. xsimd-12.1.1/Changelog.rst000066400000000000000000000143731453610362700153600ustar00rootroot00000000000000.. Copyright (c) Serge Guelton and Johan Mabille Copyright (c) QuantStack Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Changelog ========= 12.1.1 ------ * Update readme with a section on adoption, and a section on the history of the project * Fix/avx512vnni implementation * Fix regression on XSIMD_NO_SUPPORTED_ARCHITECTURE 12.1.0 ------ * Fix various problems with architecture version handling * Specialize xsimd::compress for riscv * Provide stubs for various avx512xx architectures 12.0.0 ------ * Fix sincos implementation to cope with Emscripten * Upgraded minimal version of cmake to remove deprecation warning * Fixed constants::signmask for GCC when using ffast-math * Add RISC-V Vector support * Generic, simple implementation fox xsimd::compress * Disable batch of bools, and suggest using batch_bool instead * Add an option to skip installation 11.2.0 ------ * Provide shuffle operations of floating point batches * Provide a generic implementation of xsimd::swizzle with dynamic indices * Implement rotl, rotr, rotate_left and rotate_right * Let CMake figure out pkgconfig directories * Add missing boolean operators in xsimd_api.hpp * Initial Implementation for the new WASM based instruction set * Provide a generic version for float to uint32_t conversion 11.1.0 ------ * Introduce XSIMD_DEFAULT_ARCH to force default architecture (if any) * Remove C++ requirement on xsimd::exp10 scalar implementation * Improve and test documentation 11.0.0 ------ * Provide a generic reducer * Fix ``find_package(xsimd)`` for xtl enabled xsimd, reloaded * Cleanup benchmark code * Provide avx512f implementation of FMA and variant * Hexadecimal floating points are not a C++11 feature * back to slow implementation of exp10 on Windows * Changed bitwise_cast API * Provide generic signed /unsigned type conversion * Fixed sde location * Feature/incr decr * Cleanup documentation 10.0.0 ------ * Fix potential ABI issue in SVE support * Disable fast exp10 on OSX * Assert on unaligned memory when calling aligned load/store * Fix warning about uninitialized storage * Always forward arch parameter * Do not specialize the behavior of ``simd_return_type`` for char * Support broadcasting of complex batches * Make xsimd compatible with -fno-exceptions * Provide and test comparison operators overloads that accept scalars 9.0.1 ----- * Fix potential ABI issue in SVE support, making ``xsimd::sve`` a type alias to size-dependent type. 9.0.0 ----- * Support fixed size SVE * Fix a bug in SSSE3 ``xsimd::swizzle`` implementation for ``int8`` and ``int16`` * Rename ``xsimd::hadd`` into ``xsimd::reduce_add``, provide ``xsimd::reduce_min`` and ``xsimd::reduce_max`` * Properly report unsupported double for neon on arm32 * Fill holes in xsimd scalar api * Fix ``find_package(xsimd)`` for xtl enabled xsimd * Replace ``xsimd::bool_cast`` by ``xsimd::batch_bool_cast`` * Native ``xsimd::hadd`` for float on arm64 * Properly static_assert when trying to instantiate an ``xsimd::batch`` of xtl complex * Introduce ``xsimd::batch_bool::mask()`` and ``batch_bool::from_mask(...)`` * Flag some function with ``[[nodiscard]]`` * Accept both relative and absolute libdir and include dir in xsimd.pc * Implement ``xsimd::nearbyint_as_int`` for NEON * Add ``xsimd::polar`` * Speedup double -> F32/I32 gathers * Add ``xsimd::slide_left`` and ``xsimd::slide_right`` * Support integral ``xsimd::swizzles`` on AVX 8.1.0 ----- * Add ``xsimd::gather`` and ``xsimd::scatter`` * Add ``xsimd::nearbyint_as_int`` * Add ``xsimd::none`` * Add ``xsimd::reciprocal`` * Remove batch constructor from memory adress, use ``xsimd::batch<...>::load_(un)aligned`` instead * Leave to msvc users the opportunity to manually disable FMA3 on AVX * Provide ``xsimd::insert`` to modify a single value from a vector * Make ``xsimd::pow`` implementation resilient to ``FE_INVALID`` * Reciprocal square root support through ``xsimd::rsqrt`` * NEON: Improve ``xsimd::any`` and ``xsimd::all`` * Provide type utility to explicitly require a batch of given size and type * Implement ``xsimd::swizzle`` on x86, neon and neon64 * Avx support for ``xsimd::zip_lo`` and ``xsimd::zip_hi`` * Only use ``_mm256_unpacklo_epi`` on AVX2 * Provide neon/neon64 conversion function from ``uint(32|64)_t`` to ``(float|double)`` * Provide SSE/AVX/AVX2 conversion function from ``uint32_t`` to ``float`` * Provide AVX2 conversion function from ``(u)int64_t`` to ``double`` * Provide better SSE conversion function from ``uint64_t`` to ``double`` * Provide better SSE conversion function to ``double`` * Support logical xor for ``xsimd::batch_bool`` * Clarify fma support: - FMA3 + SSE -> ``xsimd::fma3`` - FMA3 + AVX -> ``xsimd::fma3`` - FMA3 + AVX2 -> ``xsimd::fma3`` - FMA4 -> ``xsimd::fma4`` * Allow ``xsimd::transform`` to work with complex types * Add missing scalar version of ``xsimd::norm`` and ``xsimd::conj`` 8.0.5 ----- * Fix neon ``xsimd::hadd`` implementation * Detect unsupported architectures and set ``XSIMD_NO_SUPPORTED_ARCHITECTURE`` if needs be 8.0.4 ----- * Provide some conversion operators for ``float`` -> ``uint32`` * Improve code generated for AVX2 signed integer comparisons * Enable detection of avx512cd and avx512dq, and fix avx512bw detection * Enable detection of AVX2+FMA * Pick the best compatible architecture in ``xsimd::dispatch`` * Enables support for FMA when AVX2 is detected on Windows * Add missing includes / forward declaration * Mark all functions inline and noexcept * Assert when using incomplete ``std::initializer_list`` 8.0.3 ----- * Improve CI & testing, no functional change 8.0.2 ----- * Do not use ``_mm256_srai_epi32`` under AVX, it's an AVX2 instruction 8.0.1 ----- * Fix invalid constexpr ``std::make_tuple`` usage in neon64 xsimd-12.1.1/LICENSE000066400000000000000000000031061453610362700137340ustar00rootroot00000000000000Copyright (c) 2016, Johan Mabille, Sylvain Corlay, Wolf Vollprecht and Martin Renou Copyright (c) 2016, QuantStack Copyright (c) 2018, Serge Guelton All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsimd-12.1.1/README.md000066400000000000000000000162061453610362700142130ustar00rootroot00000000000000# ![xsimd](docs/source/xsimd.svg) [![Appveyor](https://ci.appveyor.com/api/projects/status/wori7my48os31nu0?svg=true)](https://ci.appveyor.com/project/xtensor-stack/xsimd) [![Azure](https://dev.azure.com/xtensor-stack/xtensor-stack/_apis/build/status/xtensor-stack.xsimd?branchName=master)](https://dev.azure.com/xtensor-stack/xtensor-stack/_build/latest?definitionId=3&branchName=master) [![Documentation Status](http://readthedocs.org/projects/xsimd/badge/?version=latest)](https://xsimd.readthedocs.io/en/latest/?badge=latest) [![Join the Gitter Chat](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/QuantStack/Lobby?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) C++ wrappers for SIMD intrinsics ## Introduction SIMD (Single Instruction, Multiple Data) is a feature of microprocessors that has been available for many years. SIMD instructions perform a single operation on a batch of values at once, and thus provide a way to significantly accelerate code execution. However, these instructions differ between microprocessor vendors and compilers. `xsimd` provides a unified means for using these features for library authors. Namely, it enables manipulation of batches of numbers with the same arithmetic operators as for single values. It also provides accelerated implementation of common mathematical functions operating on batches. ## Adoption Beyond Xtensor, Xsimd has been adopted by major open-source projects, such as Mozilla Firefox, Apache Arrow, Pythran, and Krita. ## History The XSimd project started with a series of blog articles by Johan Mabille on how to implement wrappers for SIMD intrinsicts. The archives of the blog can be found here: [The C++ Scientist](http://johanmabille.github.io/blog/archives/). The design described in the articles remained close to the actual architecture of XSimd up until Version 8.0. The mathematical functions are a lightweight implementation of the algorithms originally implemented in the now deprecated [boost.SIMD](https://github.com/NumScale/boost.simd) project. ## Requirements `xsimd` requires a C++11 compliant compiler. The following C++ compilers are supported: Compiler | Version ------------------------|------------------------------- Microsoft Visual Studio | MSVC 2015 update 2 and above g++ | 4.9 and above clang | 4.0 and above The following SIMD instruction set extensions are supported: Architecture | Instruction set extensions -------------|----------------------------------------------------- x86 | SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, FMA3+SSE, FMA3+AVX, FMA3+AVX2 x86 | AVX512BW, AVX512CD, AVX512DQ, AVX512F (gcc7 and higher) x86 AMD | FMA4 ARM | NEON, NEON64, SVE128/256/512 (fixed vector size) WebAssembly | WASM RISC-V | RISC-V128/256/512 (fixed vector size) ## Installation ### Install from conda-forge A package for xsimd is available on the mamba (or conda) package manager. ```bash mamba install -c conda-forge xsimd ``` ### Install with Spack A package for xsimd is available on the Spack package manager. ```bash spack install xsimd spack load xsimd ``` ### Install from sources You can directly install it from the sources with cmake: ```bash cmake -D CMAKE_INSTALL_PREFIX=your_install_prefix make install ``` ## Documentation To get started with using `xsimd`, check out the full documentation http://xsimd.readthedocs.io/ ## Dependencies `xsimd` has an optional dependency on the [xtl](https://github.com/xtensor-stack/xtl) library: | `xsimd` | `xtl` (optional) | |---------|------------------| | master | ^0.7.0 | | 12.x | ^0.7.0 | | 11.x | ^0.7.0 | | 10.x | ^0.7.0 | | 9.x | ^0.7.0 | | 8.x | ^0.7.0 | The dependency on `xtl` is required if you want to support vectorization for `xtl::xcomplex`. In this case, you must build your project with C++14 support enabled. ## Usage The version 8 of the library is a complete rewrite and there are some slight differences with 7.x versions. A migration guide will be available soon. In the meanwhile, the following examples show how to use both versions 7 and 8 of the library? ### Explicit use of an instruction set extension Here is an example that computes the mean of two sets of 4 double floating point values, assuming AVX extension is supported: ```cpp #include #include "xsimd/xsimd.hpp" namespace xs = xsimd; int main(int argc, char* argv[]) { xs::batch a = {1.5, 2.5, 3.5, 4.5}; xs::batch b = {2.5, 3.5, 4.5, 5.5}; auto mean = (a + b) / 2; std::cout << mean << std::endl; return 0; } ``` Do not forget to enable AVX extension when building the example. With gcc or clang, this is done with the `-mavx` flag, on MSVC you have to pass the `/arch:AVX` option. This example outputs: ```cpp (2.0, 3.0, 4.0, 5.0) ``` ### Auto detection of the instruction set extension to be used The same computation operating on vectors and using the most performant instruction set available: ```cpp #include #include #include "xsimd/xsimd.hpp" namespace xs = xsimd; using vector_type = std::vector>; void mean(const vector_type& a, const vector_type& b, vector_type& res) { std::size_t size = a.size(); constexpr std::size_t simd_size = xsimd::simd_type::size; std::size_t vec_size = size - size % simd_size; for(std::size_t i = 0; i < vec_size; i += simd_size) { auto ba = xs::load_aligned(&a[i]); auto bb = xs::load_aligned(&b[i]); auto bres = (ba + bb) / 2.; bres.store_aligned(&res[i]); } for(std::size_t i = vec_size; i < size; ++i) { res[i] = (a[i] + b[i]) / 2.; } } ``` ## Building and Running the Tests Building the tests requires [cmake](https://cmake.org). `cmake` is available as a package for most linux distributions. Besides, they can also be installed with the `conda` package manager (even on windows): ```bash conda install -c conda-forge cmake ``` Once `cmake` is installed, you can build and run the tests: ```bash mkdir build cd build cmake ../ -DBUILD_TESTS=ON make xtest ``` In the context of continuous integration with Travis CI, tests are run in a `conda` environment, which can be activated with ```bash cd test conda env create -f ./test-environment.yml source activate test-xsimd cd .. cmake . -DBUILD_TESTS=ON make xtest ``` ## Building the HTML Documentation xsimd's documentation is built with three tools - [doxygen](http://www.doxygen.org) - [sphinx](http://www.sphinx-doc.org) - [breathe](https://breathe.readthedocs.io) While doxygen must be installed separately, you can install breathe by typing ```bash pip install breathe ``` Breathe can also be installed with `conda` ```bash conda install -c conda-forge breathe ``` Finally, build the documentation with ```bash make html ``` from the `docs` subdirectory. ## License We use a shared copyright model that enables all contributors to maintain the copyright on their contributions. This software is licensed under the BSD-3-Clause license. See the [LICENSE](LICENSE) file for details. xsimd-12.1.1/benchmark/000077500000000000000000000000001453610362700146615ustar00rootroot00000000000000xsimd-12.1.1/benchmark/CMakeLists.txt000066400000000000000000000062641453610362700174310ustar00rootroot00000000000000############################################################################ # Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and # # Martin Renou # # Copyright (c) QuantStack # # Copyright (c) Serge Guelton # # # # Distributed under the terms of the BSD 3-Clause License. # # # # The full license is in the file LICENSE, distributed with this software. # ############################################################################ cmake_minimum_required(VERSION 3.1) if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR) project(xsimd-benchmark) find_package(xsimd REQUIRED CONFIG) set(XSIMD_INCLUDE_DIR ${xsimd_INCLUDE_DIRS}) endif () if(NOT CMAKE_BUILD_TYPE) message(STATUS "Setting tests build type to Release") set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE) else() message(STATUS "Tests build type is ${CMAKE_BUILD_TYPE}") endif() include(CheckCXXCompilerFlag) string(TOUPPER "${CMAKE_BUILD_TYPE}" U_CMAKE_BUILD_TYPE) if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Intel") if(NOT CMAKE_CXX_FLAGS MATCHES "-march" AND NOT CMAKE_CXX_FLAGS MATCHES "-arch" AND NOT CMAKE_OSX_ARCHITECTURES) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") endif() if(NOT MSVC) CHECK_CXX_COMPILER_FLAG("-std=c++11" HAS_CPP11_FLAG) if (ENABLE_XTL_COMPLEX) CHECK_CXX_COMPILER_FLAG("-std=c++14" HAS_CPP14_FLAG) if (NOT HAS_CPP14_FLAG) message(FATAL_ERROR "Unsupported compiler -- xsimd requires C++14 support when xtl complex support is enabled") endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") else() CHECK_CXX_COMPILER_FLAG("-std=c++11" HAS_CPP11_FLAG) if (NOT HAS_CPP11_FLAG) message(FATAL_ERROR "Unsupported compiler -- xsimd requires C++11 support!") else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") endif() endif() endif() endif() if(MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc /MP /bigobj") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2") set(CMAKE_EXE_LINKER_FLAGS /MANIFEST:NO) foreach(flag_var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) string(REPLACE "/MD" "-MT" ${flag_var} "${${flag_var}}") endforeach() endif() include_directories(${XSIMD_INCLUDE_DIR}) set(XSIMD_BENCHMARK main.cpp xsimd_benchmark.hpp ) set(XSIMD_BENCHMARK_TARGET benchmark_xsimd) add_executable(${XSIMD_BENCHMARK_TARGET} ${XSIMD_BENCHMARK} ${XSIMD_HEADERS}) if(ENABLE_XTL_COMPLEX) target_link_libraries(benchmark_xsimd PRIVATE xtl) endif() add_custom_target(xbenchmark COMMAND benchmark_xsimd DEPENDS ${XSIMD_BENCHMARK_TARGET}) xsimd-12.1.1/benchmark/main.cpp000066400000000000000000000155201453610362700163140ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #include "xsimd_benchmark.hpp" #include void benchmark_operation() { // std::size_t size = 9984; std::size_t size = 20000; xsimd::run_benchmark_2op(xsimd::add_fn(), std::cout, size, 1000); xsimd::run_benchmark_2op(xsimd::sub_fn(), std::cout, size, 1000); xsimd::run_benchmark_2op(xsimd::mul_fn(), std::cout, size, 1000); xsimd::run_benchmark_2op(xsimd::div_fn(), std::cout, size, 1000); } void benchmark_exp_log() { std::size_t size = 20000; xsimd::run_benchmark_1op(xsimd::exp_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::exp2_fn(), std::cout, size, 100); xsimd::run_benchmark_1op(xsimd::expm1_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::log_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::log2_fn(), std::cout, size, 100); xsimd::run_benchmark_1op(xsimd::log10_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::log1p_fn(), std::cout, size, 1000); } void benchmark_trigo() { std::size_t size = 20000; xsimd::run_benchmark_1op(xsimd::sin_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::cos_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::tan_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::asin_fn(), std::cout, size, 1000, xsimd::init_method::arctrigo); xsimd::run_benchmark_1op(xsimd::acos_fn(), std::cout, size, 1000, xsimd::init_method::arctrigo); xsimd::run_benchmark_1op(xsimd::atan_fn(), std::cout, size, 1000, xsimd::init_method::arctrigo); } void benchmark_hyperbolic() { std::size_t size = 20000; xsimd::run_benchmark_1op(xsimd::sinh_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::cosh_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::tanh_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::asinh_fn(), std::cout, size, 100); xsimd::run_benchmark_1op(xsimd::acosh_fn(), std::cout, size, 100); xsimd::run_benchmark_1op(xsimd::atanh_fn(), std::cout, size, 100); } void benchmark_power() { std::size_t size = 20000; xsimd::run_benchmark_2op(xsimd::pow_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::sqrt_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::cbrt_fn(), std::cout, size, 100); xsimd::run_benchmark_2op(xsimd::hypot_fn(), std::cout, size, 1000); } void benchmark_rounding() { std::size_t size = 20000; xsimd::run_benchmark_1op(xsimd::ceil_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::floor_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::trunc_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::round_fn(), std::cout, size, 100); xsimd::run_benchmark_1op(xsimd::nearbyint_fn(), std::cout, size, 100); xsimd::run_benchmark_1op(xsimd::rint_fn(), std::cout, size, 100); } #ifdef XSIMD_POLY_BENCHMARKS void benchmark_poly_evaluation() { std::size_t size = 20000; xsimd::run_benchmark_1op(xsimd::horner_5_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::estrin_5_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::horner_10_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::estrin_10_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::horner_12_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::estrin_12_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::horner_14_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::estrin_14_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::horner_16_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::estrin_16_fn(), std::cout, size, 1000); } #endif void benchmark_basic_math() { std::size_t size = 20000; xsimd::run_benchmark_2op(xsimd::fmod_fn(), std::cout, size, 1000); xsimd::run_benchmark_2op(xsimd::remainder_fn(), std::cout, size, 1000); xsimd::run_benchmark_2op(xsimd::fdim_fn(), std::cout, size, 1000); xsimd::run_benchmark_3op(xsimd::clip_fn(), std::cout, size, 1000); #if 0 xsimd::run_benchmark_1op_pred(xsimd::isfinite_fn(), std::cout, size, 100); xsimd::run_benchmark_1op_pred(xsimd::isinf_fn(), std::cout, size, 100); xsimd::run_benchmark_1op_pred(xsimd::is_flint_fn(), std::cout, size, 100); xsimd::run_benchmark_1op_pred(xsimd::is_odd_fn(), std::cout, size, 100); xsimd::run_benchmark_1op_pred(xsimd::is_even_fn(), std::cout, size, 100); #endif } int main(int argc, char* argv[]) { const std::map> fn_map = { { "op", { "arithmetic", benchmark_operation } }, { "exp", { "exponential and logarithm", benchmark_exp_log } }, { "trigo", { "trigonometric", benchmark_trigo } }, { "hyperbolic", { "hyperbolic", benchmark_hyperbolic } }, { "power", { "power", benchmark_power } }, { "basic_math", { "basic math", benchmark_basic_math } }, { "rounding", { "rounding", benchmark_rounding } }, #ifdef XSIMD_POLY_BENCHMARKS { "utils", { "polynomial evaluation", benchmark_poly_evaluation } }, #endif }; if (argc > 1) { if (std::string(argv[1]) == "--help" || std::string(argv[1]) == "-h") { std::cout << "Available options:" << std::endl; for (auto const& kv : fn_map) { std::cout << kv.first << ": run benchmark on " << kv.second.first << " functions" << std::endl; } } else { std::cout << "############################" << std::endl << "# " << xsimd::default_arch::name() << std::endl << "############################" << std::endl; for (int i = 1; i < argc; ++i) { fn_map.at(argv[i]).second(); } } } else { std::cout << "############################" << std::endl << "# " << xsimd::default_arch::name() << std::endl << "############################" << std::endl; for (auto const& kv : fn_map) { kv.second.second(); } } return 0; } xsimd-12.1.1/benchmark/xsimd_benchmark.hpp000066400000000000000000000560321453610362700205360ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_BENCHMARK_HPP #define XSIMD_BENCHMARK_HPP #include "xsimd/arch/xsimd_scalar.hpp" #include "xsimd/xsimd.hpp" #include #include #include #include namespace xsimd { using duration_type = std::chrono::duration; template using bench_vector = std::vector>; template void init_benchmark(bench_vector& lhs, bench_vector& rhs, bench_vector& res, size_t size) { lhs.resize(size); rhs.resize(size); res.resize(size); for (size_t i = 0; i < size; ++i) { lhs[i] = T(0.5) + std::sqrt(T(i)) * T(9.) / T(size); rhs[i] = T(10.2) / T(i + 2) + T(0.25); } } template void init_benchmark(bench_vector& op0, bench_vector& op1, bench_vector& op2, bench_vector& res, size_t size) { op0.resize(size); op1.resize(size); op2.resize(size); res.resize(size); for (size_t i = 0; i < size; ++i) { op0[i] = T(0.5) + std::sqrt(T(i)) * T(9.) / T(size); op1[i] = T(10.2) / T(i + 3) + T(0.25); op2[i] = T(20.1) / T(i + 2) + T(0.65); } } template void init_benchmark_arctrigo(bench_vector& lhs, bench_vector& rhs, bench_vector& res, size_t size) { lhs.resize(size); rhs.resize(size); res.resize(size); for (size_t i = 0; i < size; ++i) { lhs[i] = T(-1.) + T(2.) * T(i) / T(size); rhs[i] = T(i) / T(i + 2) + T(0.25); } } enum class init_method { classic, arctrigo }; template duration_type benchmark_scalar(F f, V& lhs, V& res, std::size_t number) { size_t s = lhs.size(); duration_type t_res = duration_type::max(); for (std::size_t count = 0; count < number; ++count) { auto start = std::chrono::steady_clock::now(); for (size_t i = 0; i < s; ++i) { res[i] = f(lhs[i]); } auto end = std::chrono::steady_clock::now(); auto tmp = end - start; t_res = tmp < t_res ? tmp : t_res; } return t_res; } template duration_type benchmark_scalar(F f, V& lhs, V& rhs, V& res, std::size_t number) { size_t s = lhs.size(); duration_type t_res = duration_type::max(); for (std::size_t count = 0; count < number; ++count) { auto start = std::chrono::steady_clock::now(); for (size_t i = 0; i < s; ++i) { res[i] = f(lhs[i], rhs[i]); } auto end = std::chrono::steady_clock::now(); auto tmp = end - start; t_res = tmp < t_res ? tmp : t_res; } return t_res; } template duration_type benchmark_scalar(F f, V& op0, V& op1, V& op2, V& res, std::size_t number) { size_t s = op0.size(); duration_type t_res = duration_type::max(); for (std::size_t count = 0; count < number; ++count) { auto start = std::chrono::steady_clock::now(); for (size_t i = 0; i < s; ++i) { res[i] = f(op0[i], op1[i], op2[i]); } auto end = std::chrono::steady_clock::now(); auto tmp = end - start; t_res = tmp < t_res ? tmp : t_res; } return t_res; } template duration_type benchmark_simd(F f, V& lhs, V& res, std::size_t number) { std::size_t s = lhs.size(); duration_type t_res = duration_type::max(); for (std::size_t count = 0; count < number; ++count) { auto start = std::chrono::steady_clock::now(); for (std::size_t i = 0; i <= (s - B::size); i += B::size) { B blhs = B::load_aligned(&lhs[i]); B bres = f(blhs); bres.store_aligned(&res[i]); } auto end = std::chrono::steady_clock::now(); auto tmp = end - start; t_res = tmp < t_res ? tmp : t_res; } return t_res; } template duration_type benchmark_simd_unrolled(F f, V& lhs, V& res, std::size_t number) { std::size_t s = lhs.size(); std::size_t inc = 4 * B::size; duration_type t_res = duration_type::max(); for (std::size_t count = 0; count < number; ++count) { auto start = std::chrono::steady_clock::now(); for (std::size_t i = 0; i <= (s - inc); i += inc) { size_t j = i + B::size; size_t k = j + B::size; size_t l = k + B::size; B blhs = B::load_aligned(&lhs[i]), blhs2 = B::load_aligned(&lhs[j]), blhs3 = B::load_aligned(&lhs[k]), blhs4 = B::load_aligned(&lhs[l]); B bres = f(blhs); B bres2 = f(blhs2); B bres3 = f(blhs3); B bres4 = f(blhs4); bres.store_aligned(&res[i]); bres2.store_aligned(&res[j]); bres3.store_aligned(&res[k]); bres4.store_aligned(&res[l]); } auto end = std::chrono::steady_clock::now(); auto tmp = end - start; t_res = tmp < t_res ? tmp : t_res; } return t_res; } template duration_type benchmark_simd(F f, V& lhs, V& rhs, V& res, std::size_t number) { std::size_t s = lhs.size(); duration_type t_res = duration_type::max(); for (std::size_t count = 0; count < number; ++count) { auto start = std::chrono::steady_clock::now(); for (std::size_t i = 0; i <= (s - B::size); i += B::size) { B blhs = B::load_aligned(&lhs[i]), brhs = B::load_aligned(&rhs[i]); B bres = f(blhs, brhs); bres.store_aligned(&res[i]); } auto end = std::chrono::steady_clock::now(); auto tmp = end - start; t_res = tmp < t_res ? tmp : t_res; } return t_res; } template duration_type benchmark_simd_unrolled(F f, V& lhs, V& rhs, V& res, std::size_t number) { std::size_t s = lhs.size(); std::size_t inc = 4 * B::size; duration_type t_res = duration_type::max(); for (std::size_t count = 0; count < number; ++count) { auto start = std::chrono::steady_clock::now(); for (std::size_t i = 0; i <= (s - inc); i += inc) { size_t j = i + B::size; size_t k = j + B::size; size_t l = k + B::size; B blhs = B::load_aligned(&lhs[i]), brhs = B::load_aligned(&rhs[i]), blhs2 = B::load_aligned(&lhs[j]), brhs2 = B::load_aligned(&rhs[j]); B blhs3 = B::load_aligned(&lhs[k]), brhs3 = B::load_aligned(&rhs[k]), blhs4 = B::load_aligned(&lhs[l]), brhs4 = B::load_aligned(&rhs[l]); B bres = f(blhs, brhs); B bres2 = f(blhs2, brhs2); B bres3 = f(blhs3, brhs3); B bres4 = f(blhs4, brhs4); bres.store_aligned(&res[i]); bres2.store_aligned(&res[j]); bres3.store_aligned(&res[k]); bres4.store_aligned(&res[l]); } auto end = std::chrono::steady_clock::now(); auto tmp = end - start; t_res = tmp < t_res ? tmp : t_res; } return t_res; } template duration_type benchmark_simd(F f, V& op0, V& op1, V& op2, V& res, std::size_t number) { std::size_t s = op0.size(); duration_type t_res = duration_type::max(); for (std::size_t count = 0; count < number; ++count) { auto start = std::chrono::steady_clock::now(); for (std::size_t i = 0; i <= (s - B::size); i += B::size) { B bop0 = B::load_aligned(&op0[i]), bop1 = B::load_aligned(&op1[i]), bop2 = B::load_aligned(&op2[i]); B bres = f(bop0, bop1, bop2); bres.store_aligned(&res[i]); } auto end = std::chrono::steady_clock::now(); auto tmp = end - start; t_res = tmp < t_res ? tmp : t_res; } return t_res; } template duration_type benchmark_simd_unrolled(F f, V& op0, V& op1, V& op2, V& res, std::size_t number) { std::size_t s = op0.size(); std::size_t inc = 4 * B::size; duration_type t_res = duration_type::max(); for (std::size_t count = 0; count < number; ++count) { auto start = std::chrono::steady_clock::now(); for (std::size_t i = 0; i <= (s - inc); i += inc) { size_t j = i + B::size; size_t k = j + B::size; size_t l = k + B::size; B bop0_i = B::load_aligned(&op0[i]), bop1_i = B::load_aligned(&op1[i]), bop2_i = B::load_aligned(&op2[i]); B bop0_j = B::load_aligned(&op0[j]), bop1_j = B::load_aligned(&op1[j]), bop2_j = B::load_aligned(&op2[j]); B bop0_k = B::load_aligned(&op0[k]), bop1_k = B::load_aligned(&op1[k]), bop2_k = B::load_aligned(&op2[k]); B bop0_l = B::load_aligned(&op0[l]), bop1_l = B::load_aligned(&op1[l]), bop2_l = B::load_aligned(&op2[l]); B bres_i = f(bop0_i, bop1_i, bop2_i); B bres_j = f(bop0_j, bop1_j, bop2_j); B bres_k = f(bop0_k, bop1_k, bop2_k); B bres_l = f(bop0_l, bop1_l, bop2_l); bres_i.store_aligned(&res[i]); bres_j.store_aligned(&res[j]); bres_k.store_aligned(&res[k]); bres_l.store_aligned(&res[l]); } auto end = std::chrono::steady_clock::now(); auto tmp = end - start; t_res = tmp < t_res ? tmp : t_res; } return t_res; } template void run_benchmark_1op(F f, OS& out, std::size_t size, std::size_t iter, init_method init = init_method::classic) { bench_vector f_lhs, f_rhs, f_res; bench_vector d_lhs, d_rhs, d_res; switch (init) { case init_method::classic: init_benchmark(f_lhs, f_rhs, f_res, size); init_benchmark(d_lhs, d_rhs, d_res, size); break; case init_method::arctrigo: init_benchmark_arctrigo(f_lhs, f_rhs, f_res, size); init_benchmark_arctrigo(d_lhs, d_rhs, d_res, size); break; default: init_benchmark(f_lhs, f_rhs, f_res, size); init_benchmark(d_lhs, d_rhs, d_res, size); break; } #ifndef XSIMD_POLY_BENCHMARKS duration_type t_float_scalar = benchmark_scalar(f, f_lhs, f_res, iter); duration_type t_double_scalar = benchmark_scalar(f, d_lhs, d_res, iter); #endif duration_type t_float_vector = benchmark_simd>(f, f_lhs, f_res, iter); duration_type t_float_vector_u = benchmark_simd_unrolled>(f, f_lhs, f_res, iter); #if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 duration_type t_double_vector = benchmark_simd>(f, d_lhs, d_res, iter); duration_type t_double_vector_u = benchmark_simd_unrolled>(f, d_lhs, d_res, iter); #endif out << "============================" << std::endl; out << f.name() << std::endl; #ifndef XSIMD_POLY_BENCHMARKS out << "scalar float : " << t_float_scalar.count() << "ms" << std::endl; #endif out << "vector float : " << t_float_vector.count() << "ms" << std::endl; out << "vector float unr : " << t_float_vector_u.count() << "ms" << std::endl; #ifndef XSIMD_POLY_BENCHMARKS out << "scalar double : " << t_double_scalar.count() << "ms" << std::endl; #endif #if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 out << "vector double : " << t_double_vector.count() << "ms" << std::endl; out << "vector double unr : " << t_double_vector_u.count() << "ms" << std::endl; #endif out << "============================" << std::endl; } template void run_benchmark_2op(F f, OS& out, std::size_t size, std::size_t iter) { bench_vector f_lhs, f_rhs, f_res; bench_vector d_lhs, d_rhs, d_res; init_benchmark(f_lhs, f_rhs, f_res, size); init_benchmark(d_lhs, d_rhs, d_res, size); duration_type t_float_scalar = benchmark_scalar(f, f_lhs, f_rhs, f_res, iter); duration_type t_float_vector = benchmark_simd>(f, f_lhs, f_rhs, f_res, iter); duration_type t_float_vector_u = benchmark_simd_unrolled>(f, f_lhs, f_rhs, f_res, iter); duration_type t_double_scalar = benchmark_scalar(f, d_lhs, d_rhs, d_res, iter); #if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 duration_type t_double_vector = benchmark_simd>(f, d_lhs, d_rhs, d_res, iter); duration_type t_double_vector_u = benchmark_simd_unrolled>(f, d_lhs, d_rhs, d_res, iter); #endif out << "============================" << std::endl; out << f.name() << std::endl; out << "scalar float : " << t_float_scalar.count() << "ms" << std::endl; out << "vector float : " << t_float_vector.count() << "ms" << std::endl; out << "vector float unr : " << t_float_vector_u.count() << "ms" << std::endl; out << "scalar double : " << t_double_scalar.count() << "ms" << std::endl; #if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 out << "vector double : " << t_double_vector.count() << "ms" << std::endl; out << "vector double unr : " << t_double_vector_u.count() << "ms" << std::endl; #endif out << "============================" << std::endl; } template void run_benchmark_3op(F f, OS& out, std::size_t size, std::size_t iter) { bench_vector f_op0, f_op1, f_op2, f_res; bench_vector d_op0, d_op1, d_op2, d_res; init_benchmark(f_op0, f_op1, f_op2, f_res, size); init_benchmark(d_op0, d_op1, d_op2, d_res, size); duration_type t_float_scalar = benchmark_scalar(f, f_op0, f_op1, f_op2, f_res, iter); duration_type t_float_vector = benchmark_simd>(f, f_op0, f_op1, f_op2, f_res, iter); duration_type t_float_vector_u = benchmark_simd_unrolled>(f, f_op0, f_op1, f_op2, f_res, iter); duration_type t_double_scalar = benchmark_scalar(f, d_op0, d_op1, d_op2, d_res, iter); #if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 duration_type t_double_vector = benchmark_simd>(f, d_op0, d_op1, d_op2, d_res, iter); duration_type t_double_vector_u = benchmark_simd_unrolled>(f, d_op0, d_op1, d_op2, d_res, iter); #endif out << "============================" << std::endl; out << f.name() << std::endl; out << "scalar float : " << t_float_scalar.count() << "ms" << std::endl; out << "vector float : " << t_float_vector.count() << "ms" << std::endl; out << "vector float unr : " << t_float_vector_u.count() << "ms" << std::endl; out << "scalar double : " << t_double_scalar.count() << "ms" << std::endl; #if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 out << "vector double : " << t_double_vector.count() << "ms" << std::endl; out << "vector double unr : " << t_double_vector_u.count() << "ms" << std::endl; #endif out << "============================" << std::endl; } #define DEFINE_OP_FUNCTOR_2OP(OP, NAME) \ struct NAME##_fn \ { \ template \ inline T operator()(const T& lhs, const T& rhs) const \ { \ return lhs OP rhs; \ } \ inline std::string name() const \ { \ return #NAME; \ } \ } #define DEFINE_FUNCTOR_1OP(FN) \ struct FN##_fn \ { \ template \ inline T operator()(const T& x) const \ { \ using xsimd::FN; \ return FN(x); \ } \ inline std::string name() const \ { \ return #FN; \ } \ } #define DEFINE_FUNCTOR_1OP_TEMPLATE(NAME, FN, N, ...) \ struct NAME##_##N##_fn \ { \ template \ inline T operator()(const T& x) const \ { \ using xsimd::FN; \ return FN(x); \ } \ inline std::string name() const \ { \ return #FN " " #N; \ } \ } #define DEFINE_FUNCTOR_2OP(FN) \ struct FN##_fn \ { \ template \ inline T operator()(const T& lhs, const T& rhs) const \ { \ using xsimd::FN; \ return FN(lhs, rhs); \ } \ inline std::string name() const \ { \ return #FN; \ } \ } #define DEFINE_FUNCTOR_3OP(FN) \ struct FN##_fn \ { \ template \ inline T operator()(const T& op0, const T& op1, const T& op2) const \ { \ using xsimd::FN; \ return FN(op0, op1, op2); \ } \ inline std::string name() const \ { \ return #FN; \ } \ } DEFINE_OP_FUNCTOR_2OP(+, add); DEFINE_OP_FUNCTOR_2OP(-, sub); DEFINE_OP_FUNCTOR_2OP(*, mul); DEFINE_OP_FUNCTOR_2OP(/, div); DEFINE_FUNCTOR_1OP(exp); DEFINE_FUNCTOR_1OP(exp2); DEFINE_FUNCTOR_1OP(expm1); DEFINE_FUNCTOR_1OP(log); DEFINE_FUNCTOR_1OP(log10); DEFINE_FUNCTOR_1OP(log2); DEFINE_FUNCTOR_1OP(log1p); DEFINE_FUNCTOR_1OP(sin); DEFINE_FUNCTOR_1OP(cos); DEFINE_FUNCTOR_1OP(tan); DEFINE_FUNCTOR_1OP(asin); DEFINE_FUNCTOR_1OP(acos); DEFINE_FUNCTOR_1OP(atan); DEFINE_FUNCTOR_1OP(sinh); DEFINE_FUNCTOR_1OP(cosh); DEFINE_FUNCTOR_1OP(tanh); DEFINE_FUNCTOR_1OP(asinh); DEFINE_FUNCTOR_1OP(acosh); DEFINE_FUNCTOR_1OP(atanh); DEFINE_FUNCTOR_2OP(pow); DEFINE_FUNCTOR_1OP(sqrt); DEFINE_FUNCTOR_1OP(cbrt); DEFINE_FUNCTOR_2OP(hypot); DEFINE_FUNCTOR_1OP(ceil); DEFINE_FUNCTOR_1OP(floor); DEFINE_FUNCTOR_1OP(trunc); DEFINE_FUNCTOR_1OP(round); DEFINE_FUNCTOR_1OP(nearbyint); DEFINE_FUNCTOR_1OP(rint); DEFINE_FUNCTOR_2OP(fmod); DEFINE_FUNCTOR_2OP(remainder); DEFINE_FUNCTOR_2OP(fdim); DEFINE_FUNCTOR_3OP(clip); #if 0 DEFINE_FUNCTOR_1OP(isfinite); DEFINE_FUNCTOR_1OP(isinf); DEFINE_FUNCTOR_1OP(is_flint); DEFINE_FUNCTOR_1OP(is_odd); DEFINE_FUNCTOR_1OP(is_even); #endif DEFINE_FUNCTOR_1OP_TEMPLATE(horner, kernel::horner, 5, 1, 2, 3, 4, 5); DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, kernel::estrin, 5, 1, 2, 3, 4, 5); DEFINE_FUNCTOR_1OP_TEMPLATE(horner, kernel::horner, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10); DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, kernel::estrin, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10); DEFINE_FUNCTOR_1OP_TEMPLATE(horner, kernel::horner, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12); DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, kernel::estrin, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12); DEFINE_FUNCTOR_1OP_TEMPLATE(horner, kernel::horner, 14, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14); DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, kernel::estrin, 14, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14); DEFINE_FUNCTOR_1OP_TEMPLATE(horner, kernel::horner, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, kernel::estrin, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); } #endif xsimd-12.1.1/cmake/000077500000000000000000000000001453610362700140075ustar00rootroot00000000000000xsimd-12.1.1/cmake/JoinPaths.cmake000066400000000000000000000016771453610362700167230ustar00rootroot00000000000000# This module provides function for joining paths # known from from most languages # # Original license: # SPDX-License-Identifier: (MIT OR CC0-1.0) # Explicit permission given to distribute this module under # the terms of the project as described in /LICENSE.rst. # Copyright 2020 Jan Tojnar # https://github.com/jtojnar/cmake-snips # # Modelled after Python’s os.path.join # https://docs.python.org/3.7/library/os.path.html#os.path.join # Windows not supported function(join_paths joined_path first_path_segment) set(temp_path "${first_path_segment}") foreach(current_segment IN LISTS ARGN) if(NOT ("${current_segment}" STREQUAL "")) if(IS_ABSOLUTE "${current_segment}") set(temp_path "${current_segment}") else() set(temp_path "${temp_path}/${current_segment}") endif() endif() endforeach() set(${joined_path} "${temp_path}" PARENT_SCOPE) endfunction() xsimd-12.1.1/docs/000077500000000000000000000000001453610362700136575ustar00rootroot00000000000000xsimd-12.1.1/docs/Doxyfile000066400000000000000000000044661453610362700153770ustar00rootroot00000000000000PROJECT_NAME = "xsimd" XML_OUTPUT = xml INPUT = ../include/xsimd/types/xsimd_api.hpp \ ../include/xsimd/types/xsimd_batch.hpp \ ../include/xsimd/types/xsimd_batch_constant.hpp \ ../include/xsimd/config/xsimd_arch.hpp \ ../include/xsimd/config/xsimd_config.hpp \ ../include/xsimd/memory/xsimd_alignment.hpp \ ../include/xsimd/memory/xsimd_aligned_allocator.hpp \ ../include/xsimd/types/xsimd_generic_arch.hpp \ ../include/xsimd/types/xsimd_traits.hpp \ ../include/xsimd/types/xsimd_avx2_register.hpp \ ../include/xsimd/types/xsimd_avx512bw_register.hpp \ ../include/xsimd/types/xsimd_avx512cd_register.hpp \ ../include/xsimd/types/xsimd_avx512dq_register.hpp \ ../include/xsimd/types/xsimd_avx512f_register.hpp \ ../include/xsimd/types/xsimd_avx_register.hpp \ ../include/xsimd/types/xsimd_fma3_avx_register.hpp \ ../include/xsimd/types/xsimd_fma3_avx2_register.hpp \ ../include/xsimd/types/xsimd_fma3_sse_register.hpp \ ../include/xsimd/types/xsimd_fma4_register.hpp \ ../include/xsimd/types/xsimd_neon64_register.hpp \ ../include/xsimd/types/xsimd_neon_register.hpp \ ../include/xsimd/types/xsimd_rvv_register.hpp \ ../include/xsimd/types/xsimd_sse2_register.hpp \ ../include/xsimd/types/xsimd_sse3_register.hpp \ ../include/xsimd/types/xsimd_sse4_1_register.hpp \ ../include/xsimd/types/xsimd_sse4_2_register.hpp \ ../include/xsimd/types/xsimd_ssse3_register.hpp \ ../include/xsimd/types/xsimd_sve_register.hpp GENERATE_LATEX = NO GENERATE_MAN = NO GENERATE_RTF = NO CASE_SENSE_NAMES = NO GENERATE_HTML = NO GENERATE_XML = YES RECURSIVE = YES QUIET = YES JAVADOC_AUTOBRIEF = YES WARN_IF_UNDOCUMENTED = NO WARN_AS_ERROR = NO ENABLE_PREPROCESSING = YES MACRO_EXPANSION = YES EXPAND_ONLY_PREDEF = YES PREDEFINED = XSIMD_NO_DISCARD= xsimd-12.1.1/docs/Makefile000066400000000000000000000147261453610362700153310ustar00rootroot00000000000000# You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = sphinx-build PAPER = BUILDDIR = build # User-friendly check for sphinx-build ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) endif # Internal variables. PAPEROPT_a4 = -D latex_paper_size=a4 PAPEROPT_letter = -D latex_paper_size=letter ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source # the i18n builder cannot share the environment and doctrees with the others I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext api default: html help: @echo "Please use \`make ' where is one of" @echo " html to make standalone HTML files" @echo " dirhtml to make HTML files named index.html in directories" @echo " singlehtml to make a single large HTML file" @echo " pickle to make pickle files" @echo " json to make JSON files" @echo " htmlhelp to make HTML files and a HTML help project" @echo " qthelp to make HTML files and a qthelp project" @echo " applehelp to make an Apple Help Book" @echo " devhelp to make HTML files and a Devhelp project" @echo " epub to make an epub" @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" @echo " latexpdf to make LaTeX files and run them through pdflatex" @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" @echo " text to make text files" @echo " man to make manual pages" @echo " texinfo to make Texinfo files" @echo " info to make Texinfo files and run them through makeinfo" @echo " gettext to make PO message catalogs" @echo " changes to make an overview of all changed/added/deprecated items" @echo " xml to make Docutils-native XML files" @echo " pseudoxml to make pseudoxml-XML files for display purposes" @echo " linkcheck to check all external links for integrity" @echo " doctest to run all doctests embedded in the documentation (if enabled)" @echo " coverage to run coverage check of the documentation (if enabled)" clean: rm -rf $(BUILDDIR)/* html: doxygen $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." dirhtml: doxygen $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." singlehtml: doxygen $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml @echo @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." pickle: doxygen $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle @echo @echo "Build finished; now you can process the pickle files." json: doxygen $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json @echo @echo "Build finished; now you can process the JSON files." htmlhelp: doxygen $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp @echo @echo "Build finished; now you can run HTML Help Workshop with the" \ ".hhp project file in $(BUILDDIR)/htmlhelp." epub: doxygen $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub @echo @echo "Build finished. The epub file is in $(BUILDDIR)/epub." latex: doxygen $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." @echo "Run \`make' in that directory to run these through (pdf)latex" \ "(use \`make latexpdf' here to do that automatically)." latexpdf: doxygen $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through pdflatex..." $(MAKE) -C $(BUILDDIR)/latex all-pdf @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." latexpdfja: doxygen $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through platex and dvipdfmx..." $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." text: doxygen $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text @echo @echo "Build finished. The text files are in $(BUILDDIR)/text." man: doxygen $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man @echo @echo "Build finished. The manual pages are in $(BUILDDIR)/man." texinfo: doxygen $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." @echo "Run \`make' in that directory to run these through makeinfo" \ "(use \`make info' here to do that automatically)." info: doxygen $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo "Running Texinfo files through makeinfo..." make -C $(BUILDDIR)/texinfo info @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." gettext: doxygen $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale @echo @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." changes: doxygen $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes @echo @echo "The overview file is in $(BUILDDIR)/changes." linkcheck: doxygen $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck @echo @echo "Link check complete; look for any errors in the above output " \ "or in $(BUILDDIR)/linkcheck/output.txt." doctest: doxygen $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest @echo "Testing of doctests in the sources finished, look at the " \ "results in $(BUILDDIR)/doctest/output.txt." coverage: doxygen $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage @echo "Testing of coverage in the sources finished, look at the " \ "results in $(BUILDDIR)/coverage/python.txt." xml: doxygen $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml @echo @echo "Build finished. The XML files are in $(BUILDDIR)/xml." pseudoxml: doxygen $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml @echo @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." xsimd-12.1.1/docs/environment.yml000066400000000000000000000001341453610362700167440ustar00rootroot00000000000000name: xsimd-docs channels: - conda-forge dependencies: - breathe - sphinx_rtd_theme xsimd-12.1.1/docs/make.bat000066400000000000000000000161651453610362700152750ustar00rootroot00000000000000@ECHO OFF REM Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=sphinx-build ) set BUILDDIR=build set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source set I18NSPHINXOPTS=%SPHINXOPTS% source if NOT "%PAPER%" == "" ( set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% ) if "%1" == "" goto help if "%1" == "help" ( :help echo.Please use `make ^` where ^ is one of echo. html to make standalone HTML files echo. dirhtml to make HTML files named index.html in directories echo. singlehtml to make a single large HTML file echo. pickle to make pickle files echo. json to make JSON files echo. htmlhelp to make HTML files and a HTML help project echo. qthelp to make HTML files and a qthelp project echo. devhelp to make HTML files and a Devhelp project echo. epub to make an epub echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter echo. text to make text files echo. man to make manual pages echo. texinfo to make Texinfo files echo. gettext to make PO message catalogs echo. changes to make an overview over all changed/added/deprecated items echo. xml to make Docutils-native XML files echo. pseudoxml to make pseudoxml-XML files for display purposes echo. linkcheck to check all external links for integrity echo. doctest to run all doctests embedded in the documentation if enabled echo. coverage to run coverage check of the documentation if enabled goto end ) if "%1" == "clean" ( for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i del /q /s %BUILDDIR%\* goto end ) REM Check if sphinx-build is available and fallback to Python version if any %SPHINXBUILD% 1>NUL 2>NUL if errorlevel 9009 goto sphinx_python goto sphinx_ok :sphinx_python set SPHINXBUILD=python -m sphinx.__init__ %SPHINXBUILD% 2> nul if errorlevel 9009 ( echo. echo.The 'sphinx-build' command was not found. Make sure you have Sphinx echo.installed, then set the SPHINXBUILD environment variable to point echo.to the full path of the 'sphinx-build' executable. Alternatively you echo.may add the Sphinx directory to PATH. echo. echo.If you don't have Sphinx installed, grab it from echo.http://sphinx-doc.org/ exit /b 1 ) :sphinx_ok if "%1" == "html" ( doxygen %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/html. goto end ) if "%1" == "dirhtml" ( %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. goto end ) if "%1" == "singlehtml" ( %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. goto end ) if "%1" == "pickle" ( %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can process the pickle files. goto end ) if "%1" == "json" ( %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can process the JSON files. goto end ) if "%1" == "htmlhelp" ( %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can run HTML Help Workshop with the ^ .hhp project file in %BUILDDIR%/htmlhelp. goto end ) if "%1" == "qthelp" ( %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can run "qcollectiongenerator" with the ^ .qhcp project file in %BUILDDIR%/qthelp, like this: echo.^> qcollectiongenerator %BUILDDIR%\qthelp\packagename.qhcp echo.To view the help file: echo.^> assistant -collectionFile %BUILDDIR%\qthelp\packagename.ghc goto end ) if "%1" == "devhelp" ( %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp if errorlevel 1 exit /b 1 echo. echo.Build finished. goto end ) if "%1" == "epub" ( %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub if errorlevel 1 exit /b 1 echo. echo.Build finished. The epub file is in %BUILDDIR%/epub. goto end ) if "%1" == "latex" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex if errorlevel 1 exit /b 1 echo. echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. goto end ) if "%1" == "latexpdf" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex cd %BUILDDIR%/latex make all-pdf cd %~dp0 echo. echo.Build finished; the PDF files are in %BUILDDIR%/latex. goto end ) if "%1" == "latexpdfja" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex cd %BUILDDIR%/latex make all-pdf-ja cd %~dp0 echo. echo.Build finished; the PDF files are in %BUILDDIR%/latex. goto end ) if "%1" == "text" ( %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text if errorlevel 1 exit /b 1 echo. echo.Build finished. The text files are in %BUILDDIR%/text. goto end ) if "%1" == "man" ( %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man if errorlevel 1 exit /b 1 echo. echo.Build finished. The manual pages are in %BUILDDIR%/man. goto end ) if "%1" == "texinfo" ( %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo if errorlevel 1 exit /b 1 echo. echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. goto end ) if "%1" == "gettext" ( %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale if errorlevel 1 exit /b 1 echo. echo.Build finished. The message catalogs are in %BUILDDIR%/locale. goto end ) if "%1" == "changes" ( %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes if errorlevel 1 exit /b 1 echo. echo.The overview file is in %BUILDDIR%/changes. goto end ) if "%1" == "linkcheck" ( %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck if errorlevel 1 exit /b 1 echo. echo.Link check complete; look for any errors in the above output ^ or in %BUILDDIR%/linkcheck/output.txt. goto end ) if "%1" == "doctest" ( %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest if errorlevel 1 exit /b 1 echo. echo.Testing of doctests in the sources finished, look at the ^ results in %BUILDDIR%/doctest/output.txt. goto end ) if "%1" == "coverage" ( %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage if errorlevel 1 exit /b 1 echo. echo.Testing of coverage in the sources finished, look at the ^ results in %BUILDDIR%/coverage/python.txt. goto end ) if "%1" == "xml" ( %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml if errorlevel 1 exit /b 1 echo. echo.Build finished. The XML files are in %BUILDDIR%/xml. goto end ) if "%1" == "pseudoxml" ( %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml if errorlevel 1 exit /b 1 echo. echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. goto end ) :end xsimd-12.1.1/docs/source/000077500000000000000000000000001453610362700151575ustar00rootroot00000000000000xsimd-12.1.1/docs/source/_static/000077500000000000000000000000001453610362700166055ustar00rootroot00000000000000xsimd-12.1.1/docs/source/_static/main_stylesheet.css000066400000000000000000000000741453610362700225150ustar00rootroot00000000000000.wy-nav-content{ max-width: 1000px; margin: auto; } xsimd-12.1.1/docs/source/api/000077500000000000000000000000001453610362700157305ustar00rootroot00000000000000xsimd-12.1.1/docs/source/api/aligned_allocator.rst000066400000000000000000000007201453610362700221240ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Alignment manipulation ====================== Aligned memory allocator ------------------------ .. doxygenclass:: xsimd::aligned_allocator :project: xsimd :members: Alignement checker ------------------ .. doxygenfunction:: xsimd::is_aligned :project: xsimd xsimd-12.1.1/docs/source/api/arch.rst000066400000000000000000000012601453610362700173760ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Architecture manipulation ========================= xsimd provides an high level description of the instruction sets it manipulates. The mentioned types are primarily used as template parameters for :ref:`batch `, and when interacting with :cpp:func:`xsimd::dispatch()`. The best available architecture is available at compile time through ``xsimd::best_arch`` which also happens to be ``xsimd::default_arch``. .. doxygengroup:: architectures :project: xsimd :members: xsimd-12.1.1/docs/source/api/arithmetic_index.rst000066400000000000000000000106541453610362700220100ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. raw:: html Arithmetic operators ==================== Binary operations: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`add` | per slot addition | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`sub` | per slot subtraction | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`mul` | per slot multiply | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`div` | per slot division | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`mod` | per slot modulo | +---------------------------------------+----------------------------------------------------+ Unary operations: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`neg` | per slot negate | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`pos` | per slot positive | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`reciprocal` | per slot reciprocal | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`decr` | per slot decrement | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`decr_if` | per slot decrement, based on a mask | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`incr` | per slot increment | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`incr_if` | per slot increment, based on a mask | +---------------------------------------+----------------------------------------------------+ Saturated arithmetic: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`sadd` | per slot saturated addition | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`ssub` | per slot saturated subtraction | +---------------------------------------+----------------------------------------------------+ Fused operations: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`fma` | fused multiply add | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`fms` | fused multiply sub | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`fnma` | fused negate multiply add | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`fnms` | fused negate multiply sub | +---------------------------------------+----------------------------------------------------+ ---- .. doxygengroup:: batch_arithmetic :project: xsimd :content-only: xsimd-12.1.1/docs/source/api/batch_index.rst000066400000000000000000000005121453610362700207300ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Batch types =========== .. toctree:: :maxdepth: 1 xsimd_batch xsimd_batch_bool xsimd_batch_complex xsimd_batch_constant xsimd-12.1.1/docs/source/api/batch_manip.rst000066400000000000000000000016671453610362700207410ustar00rootroot00000000000000.. Copyright (c) 2021, Serge Guelton Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Conditional expression ====================== +------------------------------+-------------------------------------------+ | :cpp:func:`select` | conditional selection with mask | +------------------------------+-------------------------------------------+ ---- .. doxygenfunction:: select(batch_bool const &cond, batch const &true_br, batch const &false_br) noexcept :project: xsimd .. doxygenfunction:: select(batch_bool_constant, Values...> const &cond, batch const &true_br, batch const &false_br) noexcept :project: xsimd In the specific case when one needs to conditionnaly increment or decrement a batch based on a mask, :cpp:func:`incr_if` and :cpp:func:`decr_if` provide specialized version. xsimd-12.1.1/docs/source/api/bitwise_operators_index.rst000066400000000000000000000046051453610362700234220ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. raw:: html Bitwise operators ================= +---------------------------------------+----------------------------------------------------+ | :cpp:func:`bitwise_not` | per slot bitwise not | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`bitwise_or` | per slot bitwise or | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`bitwise_xor` | per slot bitwise xor | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`bitwise_and` | per slot bitwise and | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`bitwise_andnot` | per slot bitwise and not | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`bitwise_lshift` | per slot bitwise and | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`bitwise_rshift` | per slot bitwise and not | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`rotr` | per slot rotate right | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`rotl` | per slot rotate left | +---------------------------------------+----------------------------------------------------+ ---- .. doxygengroup:: batch_bitwise :project: xsimd :content-only: xsimd-12.1.1/docs/source/api/cast_index.rst000077500000000000000000000034001453610362700206030ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. raw:: html Type conversion =============== Cast: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`batch_cast` | ``static_cast`` on batch types | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`bitwise_cast` | ``reinterpret_cast`` on batch types | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`batch_bool_cast` | ``static_cast`` on batch predicate types | +---------------------------------------+----------------------------------------------------+ Conversion: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`to_float` | per slot conversion to floating point | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`to_int` | per slot conversion to integer | +---------------------------------------+----------------------------------------------------+ ---- .. doxygengroup:: batch_conversion :project: xsimd :content-only: xsimd-12.1.1/docs/source/api/comparison_index.rst000066400000000000000000000062741453610362700220340ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. raw:: html Comparison operators ==================== Ordering: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`eq` | per slot equals to comparison | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`neq` | per slot different from comparison | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`gt` | per slot strictly greater than comparison | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`lt` | per slot strictly lower than comparison | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`ge` | per slot greater or equal to comparison | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`le` | per slot lower or equal to comparison | +---------------------------------------+----------------------------------------------------+ Parity check: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`is_even` | per slot check for evenness | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`is_odd` | per slot check for oddness | +---------------------------------------+----------------------------------------------------+ Floating point number check: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`isinf` | per slot check for infinity | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`isnan` | per slot check for NaN | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`isfinite` | per slot check for finite number | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`is_flint` | per slot check for float representing an integer | +---------------------------------------+----------------------------------------------------+ ---- .. doxygengroup:: batch_logical :project: xsimd :content-only: xsimd-12.1.1/docs/source/api/data_transfer.rst000066400000000000000000000113371453610362700213040ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Data transfer ============= From memory: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`load` | load values from memory | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`load_aligned` | load values from aligned memory | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`load_unaligned` | load values from unaligned memory | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`load_as` | load values, forcing a type conversion | +---------------------------------------+----------------------------------------------------+ From a scalar: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`broadcast` | broadcasting a value to all slots | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`broadcast_as` | broadcasting a value, forcing a type conversion | +---------------------------------------+----------------------------------------------------+ To memory: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`store` | store values to memory | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`store_aligned` | store values to aligned memory | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`store_unaligned` | store values to unaligned memory | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`store_as` | store values, forcing a type conversion | +---------------------------------------+----------------------------------------------------+ In place: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`swizzle` | rearrange slots within the batch | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`slide_left` | bitwise shift the whole batch to the left | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`slide_right` | bitwise shift the whole batch to the right | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`rotate_left` | bitwise rotate the whole batch to the left | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`rotate_right` | bitwise rotate the whole batch to the right | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`insert` | modify a single batch slot | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`compress` | pack elements according to a mask | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`expand` | select contiguous elements from the batch | +---------------------------------------+----------------------------------------------------+ Between batches: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`zip_lo` | interleave low halves of two batches | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`zip_hi` | interleave high halves of two batches | +---------------------------------------+----------------------------------------------------+ ---- .. doxygengroup:: batch_data_transfer :project: xsimd :content-only: The following empty types are used for tag dispatching: .. doxygenstruct:: xsimd::aligned_mode :project: xsimd .. doxygenstruct:: xsimd::unaligned_mode :project: xsimd xsimd-12.1.1/docs/source/api/dispatching.rst000066400000000000000000000036461453610362700207700ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. raw:: html .. _Arch Dispatching: Arch Dispatching ================ `xsimd` provides a generic way to dispatch a function call based on the architecture the code was compiled for and the architectures available at runtime. The :cpp:func:`xsimd::dispatch` function takes a functor whose call operator takes an architecture parameter as first operand, followed by any number of arguments ``Args...`` and turn it into a dispatching functor that takes ``Args...`` as arguments. .. doxygenfunction:: xsimd::dispatch :project: xsimd Following code showcases a usage of the :cpp:func:`xsimd::dispatch` function: .. code-block:: c++ #include "sum.hpp" // Create the dispatching function, specifying the architecture we want to // target. auto dispatched = xsimd::dispatch>(sum{}); // Call the appropriate implementation based on runtime information. float res = dispatched(data, 17); This code does *not* require any architecture-specific flags. The architecture specific details follow. The ``sum.hpp`` header contains the function being actually called, in an architecture-agnostic description: .. literalinclude:: ../../../test/doc/sum.hpp The SSE2 and AVX2 version needs to be provided in other compilation units, compiled with the appropriate flags, for instance: .. literalinclude:: ../../../test/doc/sum_avx2.cpp .. literalinclude:: ../../../test/doc/sum_sse2.cpp xsimd-12.1.1/docs/source/api/instr_macros.rst000066400000000000000000000021371453610362700211700ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille and Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. raw:: html Instruction set macros ====================== Each of these macros corresponds to an instruction set supported by XSIMD. They can be used to filter arch-specific code. .. doxygengroup:: xsimd_config_macro :project: xsimd :content-only: Changing Default architecture ***************************** You can change the default instruction set used by xsimd (when none is provided explicitely) by setting the ``XSIMD_DEFAULT_ARCH`` macro to, say, ``xsimd::avx2``. A common usage is to set it to ``xsimd::unsupported`` as a way to detect instantiation of batches with the default architecture. xsimd-12.1.1/docs/source/api/math_index.rst000066400000000000000000000245771453610362700206210ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. raw:: html Mathematical functions ====================== Basic functions: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`abs` | absolute value | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`fabs` | absolute value of floating point values | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`fmod` | remainder of the floating point division operation | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`remainder` | signed remainder of the division operation | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`min` | smaller of two batches | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`max` | larger of two batches | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`fmin` | smaller of two batches of floating point values | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`fmax` | larger of two batches of floating point values | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`fdim` | positive difference | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`clip` | clipping operation | +---------------------------------------+----------------------------------------------------+ Exponential functions: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`exp` | natural exponential function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`exp2` | base 2 exponential function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`exp10` | base 10 exponential function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`expm1` | natural exponential function, minus one | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`log` | natural logarithm function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`log2` | base 2 logarithm function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`log10` | base 10 logarithm function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`log1p` | natural logarithm of one plus function | +---------------------------------------+----------------------------------------------------+ Power functions: +-----------------------------------------+----------------------------------------------------+ | :cpp:func:`pow` | power function | +-----------------------------------------+----------------------------------------------------+ | :cpp:func:`rsqrt` | reciprocal square root function | +-----------------------------------------+----------------------------------------------------+ | :cpp:func:`sqrt` | square root function | +-----------------------------------------+----------------------------------------------------+ | :cpp:func:`cbrt` | cubic root function | +-----------------------------------------+----------------------------------------------------+ | :cpp:func:`hypot` | hypotenuse function | +-----------------------------------------+----------------------------------------------------+ Trigonometric functions: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`sin` | sine function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`cos` | cosine function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`sincos` | sine and cosine function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`tan` | tangent function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`asin` | arc sine function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`acos` | arc cosine function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`atan` | arc tangent function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`atan2` | arc tangent function, determining quadrants | +---------------------------------------+----------------------------------------------------+ Hyperbolic functions: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`sinh` | hyperbolic sine function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`cosh` | hyperbolic cosine function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`tanh` | hyperbolic tangent function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`asinh` | inverse hyperbolic sine function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`acosh` | inverse hyperbolic cosine function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`atanh` | inverse hyperbolic tangent function | +---------------------------------------+----------------------------------------------------+ Error functions: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`erf` | error function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`erfc` | complementary error function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`tgamma` | gamma function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`lgamma` | natural logarithm of the gamma function | +---------------------------------------+----------------------------------------------------+ Nearint operations: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`ceil` | nearest integers not less | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`floor` | nearest integers not greater | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`trunc` | nearest integers not greater in magnitude | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`round` | nearest integers, rounding away from zero | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`nearbyint` | nearest integers using current rounding mode | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`rint` | nearest integers using current rounding mode | +---------------------------------------+----------------------------------------------------+ ---- .. doxygengroup:: batch_math :project: xsimd :content-only: .. doxygengroup:: batch_trigo :project: xsimd :content-only: .. doxygengroup:: batch_rounding :project: xsimd :content-only: .. doxygengroup:: batch_math_extra :project: xsimd :content-only: xsimd-12.1.1/docs/source/api/misc_index.rst000066400000000000000000000034221453610362700206050ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. raw:: html Miscellaneous ============= Sign manipulation: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`sign` | per slot sign extraction | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`signnz` | per slot sign extraction on non null elements | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`bitofsign` | per slot sign bit extraction | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`copysign` | per slot sign copy | +---------------------------------------+----------------------------------------------------+ Stream operation: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`operator<<` | batch pretty-printing | +---------------------------------------+----------------------------------------------------+ ---- .. doxygengroup:: batch_miscellaneous :project: xsimd :content-only: xsimd-12.1.1/docs/source/api/reducer_index.rst000066400000000000000000000032211453610362700213000ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. raw:: html Reduction operators =================== +---------------------------------------+----------------------------------------------------+ | :cpp:func:`reduce` | generic batch reduction | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`reduce_add` | sum of each batch element | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`reduce_max` | max of the batch elements | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`reduce_min` | min of the batch elements | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`haddp` | horizontal sum across batches | +---------------------------------------+----------------------------------------------------+ ---- .. doxygengroup:: batch_reducers :project: xsimd :content-only: xsimd-12.1.1/docs/source/api/type_traits.rst000066400000000000000000000035641453610362700210410ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. raw:: html .. _Type Traits: Type Traits =========== `xsimd` provides a few type traits to interact with scalar and batch types in an uniformeous manner. Type check: +---------------------------------------+----------------------------------------------------+ | :cpp:class:`is_batch` | batch type detection | +---------------------------------------+----------------------------------------------------+ | :cpp:class:`is_batch_bool` | mask batch type detection | +---------------------------------------+----------------------------------------------------+ | :cpp:class:`is_batch_complex` | complex batch type detection | +---------------------------------------+----------------------------------------------------+ Type access: +---------------------------------------+----------------------------------------------------+ | :cpp:class:`scalar_type` | batch element type | +---------------------------------------+----------------------------------------------------+ | :cpp:class:`mask_type` | batch mask type | +---------------------------------------+----------------------------------------------------+ ---- .. doxygengroup:: batch_traits :project: xsimd :content-only: xsimd-12.1.1/docs/source/api/xsimd_batch.rst000066400000000000000000000006061453610362700207510ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Batch of scalars ================ .. _xsimd-batch-ref: .. doxygenclass:: xsimd::batch :project: xsimd :members: .. doxygenstruct:: xsimd::make_sized_batch :project: xsimd :members: xsimd-12.1.1/docs/source/api/xsimd_batch_bool.rst000066400000000000000000000010311453610362700217550ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Batch of conditions =================== .. _xsimd-batch-bool-ref: .. doxygenclass:: xsimd::batch_bool :project: xsimd :members: Logical operators ----------------- .. doxygengroup:: batch_bool_logical :project: xsimd :content-only: Reducers -------- .. doxygengroup:: batch_bool_reducers :project: xsimd :content-only: xsimd-12.1.1/docs/source/api/xsimd_batch_complex.rst000066400000000000000000000014311453610362700224750ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Batch of complex numbers ======================== .. doxygenclass:: xsimd::batch< std::complex< T >, A > :project: xsimd :members: Operations specific to batches of complex numbers ------------------------------------------------- .. doxygengroup:: batch_complex_op :project: xsimd :content-only: XTL complex support ------------------- If the preprocessor token ``XSIMD_ENABLE_XTL_COMPLEX`` is defined, ``xsimd`` provides constructors of ``xsimd::batch< std::complex< T >, A >`` from ``xtl::xcomplex``, similar to those for ``std::complex``. This requires ``xtl`` to be installed. xsimd-12.1.1/docs/source/api/xsimd_batch_constant.rst000066400000000000000000000006241453610362700226620ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Batch of constants ================== .. _xsimd-batch-constant-ref: .. doxygenstruct:: xsimd::batch_constant :project: xsimd :members: .. doxygenfunction:: xsimd::make_batch_constant :project: xsimd xsimd-12.1.1/docs/source/basic_usage.rst000066400000000000000000000032401453610362700201550ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille and Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Basic usage =========== Manipulating abstract batches ----------------------------- Here is an example that computes the mean of two batches, using the best architecture available, based on compile time informations: .. literalinclude:: ../../test/doc/manipulating_abstract_batches.cpp The batch can be a batch of 4 single precision floating point numbers (e.g. on Neon) or a batch of 8 (e.g. on AVX2). Manipulating parametric batches ------------------------------- The previous example can be made fully parametric, both in the batch type and the underlying architecture. This is achieved as described in the following example: .. literalinclude:: ../../test/doc/manipulating_parametric_batches.cpp At its core, a :cpp:class:`xsimd::batch` is bound to the scalar type it contains, and to the instruction set it can use to operate on its values. Explicit use of an instruction set extension -------------------------------------------- Here is an example that loads two batches of 4 double floating point values, and computes their mean, explicitly using the AVX extension: .. literalinclude:: ../../test/doc/explicit_use_of_an_instruction_set.cpp Note that in that case, the instruction set is explicilty specified in the batch type. This example outputs: .. code:: (2.0, 3.0, 4.0, 5.0) .. warning:: If you allow your compiler to generate AVX2 instructions (e.g. through ``-mavx2``) there is nothing preventing it from optimizing the above code using AVX2 instructions. xsimd-12.1.1/docs/source/cmake.svg000066400000000000000000000425311453610362700167650ustar00rootroot00000000000000 image/svg+xml xsimd-12.1.1/docs/source/conda.svg000066400000000000000000000034151453610362700167670ustar00rootroot00000000000000xsimd-12.1.1/docs/source/conf.py000066400000000000000000000015321453610362700164570ustar00rootroot00000000000000#!/usr/bin/env python3 # -*- coding: utf-8 -*- import os import subprocess on_rtd = os.environ.get('READTHEDOCS', None) == 'True' if on_rtd: subprocess.check_call('cd ..; doxygen', shell=True) import sphinx_rtd_theme html_theme = "sphinx_rtd_theme" html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] def setup(app): app.add_css_file("main_stylesheet.css") extensions = ['breathe', 'sphinx_rtd_theme'] breathe_projects = { 'xsimd': '../xml' } templates_path = ['_templates'] html_static_path = ['_static'] source_suffix = '.rst' master_doc = 'index' project = 'xsimd' copyright = '2016, Johan Mabille and Sylvain Corlay' author = 'Johan Mabille and Sylvain Corlay' html_logo = 'quantstack-white.svg' exclude_patterns = [] highlight_language = 'c++' pygments_style = 'sphinx' todo_include_todos = False htmlhelp_basename = 'xsimddoc' xsimd-12.1.1/docs/source/index.rst000066400000000000000000000075061453610362700170300ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille and Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. image:: xsimd.svg :alt: xsimd C++ wrappers for SIMD intrinsics. Introduction ------------ SIMD (Single Instruction, Multiple Data) is a feature of microprocessors that has been available for many years. SIMD instructions perform a single operation on a batch of values at once, and thus provide a way to significantly accelerate code execution. However, these instructions differ between microprocessor vendors and compilers. `xsimd` provides a unified means for using these features for library authors. Namely, it enables manipulation of batches of scalar and complex numbers with the same arithmetic operators and common mathematical functions as for single values. `xsimd` makes it easy to write a single algorithm, generate one version of the algorithm per micro-architecture and pick the best one at runtime, based on the running processor capability. You can find out more about this implementation of C++ wrappers for SIMD intrinsics at the `The C++ Scientist`_. The mathematical functions are a lightweight implementation of the algorithms also used in `boost.SIMD`_. `xsimd` requires a C++11 compliant compiler. The following C++ compilers are supported: +-------------------------+-------------------------------+ | Compiler | Version | +=========================+===============================+ | Microsoft Visual Studio | MSVC 2015 update 2 and above | +-------------------------+-------------------------------+ | g++ | 4.9 and above | +-------------------------+-------------------------------+ | clang | 3.7 and above | +-------------------------+-------------------------------+ The following SIMD instruction set extensions are supported: +--------------+---------------------------------------------------------+ | Architecture | Instruction set extensions | +==============+=========================================================+ | x86 | SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, FMA3, AVX2 | +--------------+---------------------------------------------------------+ | x86 | AVX512 (gcc7 and higher) | +--------------+---------------------------------------------------------+ | x86 AMD | same as above + FMA4 | +--------------+---------------------------------------------------------+ | ARM | ARMv7, ARMv8 | +--------------+---------------------------------------------------------+ | WebAssembly | WASM | +--------------+---------------------------------------------------------+ Licensing --------- We use a shared copyright model that enables all contributors to maintain the copyright on their contributions. This software is licensed under the BSD-3-Clause license. See the LICENSE file for details. .. toctree:: :caption: INSTALLATION :maxdepth: 2 installation .. toctree:: :caption: USAGE :maxdepth: 2 basic_usage vectorized_code .. toctree:: :caption: API REFERENCE :maxdepth: 1 api/instr_macros api/batch_index api/data_transfer api/arithmetic_index api/comparison_index api/bitwise_operators_index api/math_index api/reducer_index api/cast_index api/type_traits api/batch_manip api/misc_index api/aligned_allocator api/arch api/dispatching .. toctree:: :caption: MIGRATION GUIDE :maxdepth: 1 migration_guide .. _The C++ Scientist: http://johanmabille.github.io/blog/archives/ .. _boost.SIMD: https://github.com/NumScale/boost.simd xsimd-12.1.1/docs/source/installation.rst000066400000000000000000000034341453610362700204160ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille and Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. raw:: html Installation ============ `xsimd` is a header-only library, so installing it is just a matter of copying the ``include/xsimd`` directory. However we provide standardized means to install it, with package managers or with cmake. Besides the `xsimd` headers, all these methods place the ``cmake`` project configuration file in the right location so that third-party projects can use cmake's ``find_package`` to locate `xsimd` headers. .. image:: conda.svg Using the conda-forge package ----------------------------- A package for `xsimd` is available for the `mamba `_ (or `conda `_) package manager. .. code:: mamba install -c conda-forge xsimd .. image:: spack.svg Using the Spack package ----------------------- A package for `xsimd` is available on the `Spack `_ package manager. .. code:: spack install xsimd spack load xsimd .. image:: cmake.svg From source with cmake ---------------------- You can install `xsimd` from source with `cmake `_. On Unix platforms, from the source directory: .. code:: mkdir build cd build cmake -DCMAKE_INSTALL_PREFIX=/path/to/prefix .. make install On Windows platforms, from the source directory: .. code:: mkdir build cd build cmake -G "NMake Makefiles" -DCMAKE_INSTALL_PREFIX=/path/to/prefix .. nmake nmake install xsimd-12.1.1/docs/source/migration_guide.rst000066400000000000000000000056031453610362700210630ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille and Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. raw:: html From 7.x to 8.x =============== Version 8.x introduces a lot of API difference compared to version 7.x. This section motivates the version bump and details the most notable changes. Why 8.x ------- Version 8.x introduces a new concept in `xsimd`: all batch types are now parametrized by a type, say ``double``, and an optional architecture, say ``avx512``, as in ``batch``. It is still possible to just require a batch of doubles and let the library pick the most appropriate architecture, as in ``batch``. This new design make it possible to target multiple architecture from the same code, as detailed in the :ref:`Arch Dispatching` section. As a side effect of this (almost full) rewrite of the library code, `xsimd` is now twice as fast to compile, and its source code size as been (roughly) divided by two. The `xsimd` developers also took this as an opportnuity to significantly improve test coverage. Most Notable Changes -------------------- Batch Types *********** The second argument of :cpp:class:`xsimd::batch` is now a type that represents an architecture, instead of an integer. The previous behavior can be emulated through the :cpp:class:`xsimd::make_sized_batch` utility. Batch of Complex Types ********************** Loading a batch of complex from an ``xtl::xcomplex`` now yields an ``xsimd::batch>`` instead of an ``xtl::xcomplex``. It is still possible to store an ``xsimd::batch>`` to an ``xtl::xcomplex``. Loading Batches *************** ``xsimd::batch::load*`` are now static functions. It is no longer supported to update an existing batch through its ``load`` method. The regular assign operator can be used instead. Indexing Batches **************** ``xsimd::batch::operator[](size_t)`` has been replaced with ``xsimd::batch::get(size_t)``. Keep in mind that this method implies a register load *for each call*, so it's wise not to use it in performance-critical section. When needed, do an explicit store of the batch into an array and work from there. Architecture Detection ********************** Many macros have been replaced by more elaborated constructs. ``XSIMD_INSTR_SET_AVAILABLE`` has been replaced by the type alias ``xsimd::default_arch``. Likewise architecture-specific macros like ``XSIMD_X86_INSTR_SET_AVAILABLE`` has been replaced by ``xsimd::upported_architectures::contains()``. Macro like ``XSIMD_WITH_SSE3`` are still defined to ``0`` or ``1`` to guard architecture-specific code. xsimd-12.1.1/docs/source/quantstack-white.svg000066400000000000000000000116361453610362700212030ustar00rootroot00000000000000 image/svg+xmlxsimd-12.1.1/docs/source/spack.svg000066400000000000000000000046711453610362700170110ustar00rootroot00000000000000 xsimd-12.1.1/docs/source/vectorized_code.rst000066400000000000000000000074451453610362700210730ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille and Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Writing vectorized code ======================= Assume that we have a simple function that computes the mean of two vectors, something like: .. literalinclude:: ../../test/doc/writing_vectorized_code.cpp How can we use `xsimd` to take advantage of vectorization? Explicit use of an instruction set ---------------------------------- `xsimd` provides the template class :cpp:class:`xsimd::batch` parametrized by ``T`` and ``A`` types where ``T`` is the type of the values involved in SIMD instructions and ``A`` is the target architecture. If you know which instruction set is available on your machine, you can directly use the corresponding specialization of ``batch``. For instance, assuming the AVX instruction set is available, the previous code can be vectorized the following way: .. literalinclude:: ../../test/doc/explicit_use_of_an_instruction_set_mean.cpp However, if you want to write code that is portable, you cannot rely on the use of ``batch``. Indeed this won't compile on a CPU where only SSE2 instruction set is available for instance. Fortunately, if you don't set the second template parameter, `xsimd` picks the best architecture among the one available, based on the compiler flag you use. Aligned vs unaligned memory --------------------------- In the previous example, you may have noticed the :cpp:func:`xsimd::batch::load_unaligned` and :cpp:func:`xsimd::batch::store_unaligned` functions. These are meant for loading values from contiguous dynamically allocated memory into SIMD registers and reciprocally. When dealing with memory transfer operations, some instructions sets required the memory to be aligned by a given amount, others can handle both aligned and unaligned modes. In that latter case, operating on aligned memory is generally faster than operating on unaligned memory. `xsimd` provides an aligned memory allocator, namely :cpp:class:`xsimd::aligned_allocator` which follows the standard requirements, so it can be used with STL containers. Let's change the previous code so it can take advantage of this allocator: .. literalinclude:: ../../test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp Memory alignment and tag dispatching ------------------------------------ You may need to write code that can operate on any type of vectors or arrays, not only the STL ones. In that case, you cannot make assumption on the memory alignment of the container. `xsimd` provides a tag dispatching mechanism that allows you to easily write such a generic code: .. literalinclude:: ../../test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp Here, the ``Tag`` template parameter can be :cpp:class:`xsimd::aligned_mode` or :cpp:class:`xsimd::unaligned_mode`. Assuming the existence of a ``get_alignment_tag`` meta-function in the code, the previous code can be invoked this way: .. code:: mean(a, b, res, get_alignment_tag()); Writing arch-independent code ----------------------------- If your code may target either SSE2, AVX2 or AVX512 instruction set, `xsimd` make it possible to make your code even more generic by using the architecture as a template parameter: .. literalinclude:: ../../test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp This can be useful to implement runtime dispatching, based on the instruction set detected at runtime. `xsimd` provides a generic machinery :cpp:func:`xsimd::dispatch()` to implement this pattern. Based on the above example, instead of calling ``mean{}(arch, a, b, res, tag)``, one can use ``xsimd::dispatch(mean{})(a, b, res, tag)``. More about this can be found in the :ref:`Arch Dispatching` section. xsimd-12.1.1/docs/source/xsimd.svg000066400000000000000000000055741453610362700170370ustar00rootroot00000000000000 xsimd-12.1.1/environment.yml000066400000000000000000000001201453610362700160070ustar00rootroot00000000000000name: xsimd channels: - conda-forge dependencies: - ninja - xtl - doctest xsimd-12.1.1/examples/000077500000000000000000000000001453610362700145455ustar00rootroot00000000000000xsimd-12.1.1/examples/CMakeLists.txt000066400000000000000000000037541453610362700173160ustar00rootroot00000000000000############################################################################ # Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and # # Martin Renou # # Copyright (c) QuantStack # # Copyright (c) Serge Guelton # # # # Distributed under the terms of the BSD 3-Clause License. # # # # The full license is in the file LICENSE, distributed with this software. # ############################################################################ cmake_minimum_required(VERSION 3.1) if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR) project(xsimd-examples) find_package(xsimd REQUIRED CONFIG) set(XSIMD_INCLUDE_DIR ${xsimd_INCLUDE_DIR}) endif () include_directories(${XSIMD_INCLUDE_DIR}) if(NOT CMAKE_BUILD_TYPE) message(STATUS "Setting examples build type to Release") set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE) else() message(STATUS "Tests build type is ${CMAKE_BUILD_TYPE}") endif() if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Intel") if (NOT CMAKE_CXX_FLAGS MATCHES "-march" AND NOT CMAKE_CXX_FLAGS MATCHES "-arch" AND NOT CMAKE_OSX_ARCHITECTURES) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -mtune=native") endif() if(NOT CMAKE_CXX_COMPILER_ID MATCHES Clang) # We are using clang-cl set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") endif() endif() add_executable(mandelbrot mandelbrot.cpp ${XSIMD_HEADERS}) set_property(TARGET mandelbrot PROPERTY CXX_STANDARD 14) if(ENABLE_XTL_COMPLEX) target_link_libraries(mandelbrot PRIVATE xtl) endif() add_custom_target(xmandelbrot COMMAND mandelbrot DEPENDS mandelbrot) xsimd-12.1.1/examples/mandelbrot.cpp000066400000000000000000000233501453610362700174030ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ // This file is derived from tsimd (MIT License) // https://github.com/ospray/tsimd/blob/master/benchmarks/mandelbrot.cpp // Author Jefferson Amstutz / intel #include #include #include #include #include "pico_bench.hpp" #include // helper function to write the rendered image as PPM file inline void writePPM(const std::string& fileName, const int sizeX, const int sizeY, const int* pixel) { FILE* file = fopen(fileName.c_str(), "wb"); fprintf(file, "P6\n%i %i\n255\n", sizeX, sizeY); unsigned char* out = (unsigned char*)alloca(3 * sizeX); for (int y = 0; y < sizeY; y++) { const unsigned char* in = (const unsigned char*)&pixel[(sizeY - 1 - y) * sizeX]; for (int x = 0; x < sizeX; x++) { out[3 * x + 0] = in[4 * x + 0]; out[3 * x + 1] = in[4 * x + 1]; out[3 * x + 2] = in[4 * x + 2]; } fwrite(out, 3 * sizeX, sizeof(char), file); } fprintf(file, "\n"); fclose(file); } namespace xsimd { template inline batch mandel(const batch_bool& _active, const batch& c_re, const batch& c_im, int maxIters) { using float_batch_type = batch; using int_batch_type = batch; constexpr std::size_t N = float_batch_type::size; float_batch_type z_re = c_re; float_batch_type z_im = c_im; int_batch_type vi(0); for (int i = 0; i < maxIters; ++i) { auto active = _active & ((z_re * z_re + z_im * z_im) <= float_batch_type(4.f)); if (!xsimd::any(active)) { break; } float_batch_type new_re = z_re * z_re - z_im * z_im; float_batch_type new_im = 2.f * z_re * z_im; z_re = c_re + new_re; z_im = c_im + new_im; vi = select(batch_bool_cast(active), vi + 1, vi); } return vi; } template void mandelbrot(float x0, float y0, float x1, float y1, int width, int height, int maxIters, int output[]) { using float_batch_type = batch; using int_batch_type = batch; constexpr std::size_t N = float_batch_type::size; float dx = (x1 - x0) / width; float dy = (y1 - y0) / height; float arange[N]; std::iota(&arange[0], &arange[N], 0.f); // float_batch_type programIndex(&arange[0], xsimd::aligned_mode()); auto programIndex = float_batch_type::load(&arange[0], xsimd::aligned_mode()); // std::iota(programIndex.begin(), programIndex.end(), 0.f); for (int j = 0; j < height; j++) { for (int i = 0; i < width; i += N) { float_batch_type x(x0 + (i + programIndex) * dx); float_batch_type y(y0 + j * dy); auto active = x < float_batch_type(width); int base_index = (j * width + i); auto result = mandel(active, x, y, maxIters); // implement masked store! // xsimd::store_aligned(result, output + base_index, active); int_batch_type prev_data = int_batch_type::load_unaligned(output + base_index); select(batch_bool_cast(active), result, prev_data) .store_aligned(output + base_index); } } } } // namespace xsimd // omp version //////////////////////////////////////////////////////////////// namespace omp { #pragma omp declare simd template inline int mandel(T c_re, T c_im, int count) { T z_re = c_re, z_im = c_im; int i; for (i = 0; i < count; ++i) { if (z_re * z_re + z_im * z_im > 4.f) { break; } T new_re = z_re * z_re - z_im * z_im; T new_im = 2.f * z_re * z_im; z_re = c_re + new_re; z_im = c_im + new_im; } return i; } void mandelbrot(float x0, float y0, float x1, float y1, int width, int height, int maxIterations, int output[]) { float dx = (x1 - x0) / width; float dy = (y1 - y0) / height; for (int j = 0; j < height; j++) { #pragma omp simd for (int i = 0; i < width; ++i) { float x = x0 + i * dx; float y = y0 + j * dy; int index = (j * width + i); output[index] = mandel(x, y, maxIterations); } } } } // namespace omp // scalar version ///////////////////////////////////////////////////////////// namespace scalar { inline int mandel(float c_re, float c_im, int count) { float z_re = c_re, z_im = c_im; int i; for (i = 0; i < count; ++i) { if (z_re * z_re + z_im * z_im > 4.f) { break; } float new_re = z_re * z_re - z_im * z_im; float new_im = 2.f * z_re * z_im; z_re = c_re + new_re; z_im = c_im + new_im; } return i; } void mandelbrot(float x0, float y0, float x1, float y1, int width, int height, int maxIterations, int output[]) { float dx = (x1 - x0) / width; float dy = (y1 - y0) / height; for (int j = 0; j < height; j++) { for (int i = 0; i < width; ++i) { float x = x0 + i * dx; float y = y0 + j * dy; int index = (j * width + i); output[index] = mandel(x, y, maxIterations); } } } } // namespace scalar // run simd version of mandelbrot benchmark for a specific arch template void run_arch( bencher_t& bencher, float x0, float y0, float x1, float y1, int width, int height, int maxIters, std::vector>& buffer) { std::fill(buffer.begin(), buffer.end(), 0); auto stats = bencher([&]() { xsimd::mandelbrot(x0, y0, x1, y1, width, height, maxIters, buffer.data()); }); const float scalar_min = stats.min().count(); std::cout << '\n' << arch::name() << " " << stats << '\n'; auto filename = std::string("mandelbrot_") + std::string(arch::name()) + std::string(".ppm"); writePPM(filename.c_str(), width, height, buffer.data()); } template struct run_archlist; // run simd version of mandelbrot benchmark for a list // of archs template struct run_archlist> { template static void run( bencher_t& bencher, float x0, float y0, float x1, float y1, int width, int height, int maxIters, std::vector>& buffer) { (void)std::initializer_list { (run_arch(bencher, x0, y0, x1, x1, width, height, maxIters, buffer), 0)... }; } }; int main() { using namespace std::chrono; const unsigned int width = 1024; const unsigned int height = 768; const float x0 = -2; const float x1 = 1; const float y0 = -1; const float y1 = 1; const int maxIters = 256; std::vector> buf(width * height); auto bencher = pico_bench::Benchmarker { 64, seconds { 10 } }; std::cout << "starting benchmarks (results in 'ms')... " << '\n'; // scalar run /////////////////////////////////////////////////////////////// std::fill(buf.begin(), buf.end(), 0); auto stats_scalar = bencher([&]() { scalar::mandelbrot(x0, y0, x1, y1, width, height, maxIters, buf.data()); }); const float scalar_min = stats_scalar.min().count(); std::cout << '\n' << "scalar " << stats_scalar << '\n'; writePPM("mandelbrot_scalar.ppm", width, height, buf.data()); // omp run ////////////////////////////////////////////////////////////////// std::fill(buf.begin(), buf.end(), 0); auto stats_omp = bencher([&]() { omp::mandelbrot(x0, y0, x1, y1, width, height, maxIters, buf.data()); }); const float omp_min = stats_omp.min().count(); std::cout << '\n' << "omp " << stats_omp << '\n'; writePPM("mandelbrot_omp.ppm", width, height, buf.data()); run_archlist::run(bencher, x0, y0, x1, y1, width, height, maxIters, buf); return 0; } xsimd-12.1.1/examples/pico_bench.hpp000066400000000000000000000177161453610362700173630ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ // This file is derived from tsimd (MIT License) // https://github.com/ospray/tsimd/blob/master/benchmarks/pico_bench.h // Author Jefferson Amstutz / intel #ifndef PICO_BENCH_H #define PICO_BENCH_H #include #include #include #include #include #include #include #include #include #include namespace pico_bench { /* Statistics on some time measurement value T, e.g. T = * std::chrono::milliseconds T must be some std::chrono::duration type */ template class Statistics { using rep = typename T::rep; std::vector samples; public: std::string time_suffix; Statistics(std::vector s) : samples(s) { std::sort(std::begin(samples), std::end(samples)); } T percentile(const float p) const { return percentile(p, samples); } // Winsorize the data, sets all entries above 100 - limit percentile and // below limit percentile to the value of that percentile void winsorize(const float limit) { winsorize(limit, samples); } T median() const { return percentile(50.0, samples); } T median_abs_dev() const { const auto m = median(); std::vector deviations; deviations.reserve(samples.size()); std::transform(std::begin(samples), std::end(samples), std::back_inserter(deviations), [&m](const T& t) { return T { std::abs((t - m).count()) }; }); std::sort(std::begin(deviations), std::end(deviations)); return percentile(50.0, deviations); } T mean() const { const auto m = std::accumulate(std::begin(samples), std::end(samples), T { 0 }); return m / samples.size(); } T std_dev() const { const auto m = mean(); auto val = std::accumulate( std::begin(samples), std::end(samples), T { 0 }, [&m](const T& p, const T& t) { return T { static_cast(p.count() + std::pow((t - m).count(), 2)) }; }); return T { static_cast(std::sqrt(1.0 / static_cast(samples.size()) * static_cast(val.count()))) }; } T min() const { return samples.front(); } T max() const { return samples.back(); } std::size_t size() const { return samples.size(); } const T& operator[](size_t i) const { return samples[i]; } private: // Winsorize the data, sets all entries above 100 - limit percentile and // below limit percentile to the value of that percentile static void winsorize(const float limit, std::vector& samples) { const auto low = percentile(limit, samples); const auto high = percentile(100.0 - limit, samples); for (auto& t : samples) { if (t < low) { t = low; } else if (t > high) { t = high; } } } static T percentile(const float p, const std::vector& samples) { assert(!samples.empty()); assert(p <= 100.0); assert(p >= 0.0); if (samples.size() == 1) { return samples.front(); } if (p == 100.0) { return samples.back(); } const double rank = p / 100.0 * (static_cast(samples.size()) - 1.0); const double low_r = std::floor(rank); const double dist = rank - low_r; const size_t k = static_cast(low_r); const auto low = samples[k]; const auto high = samples[k + 1]; return T { static_cast(low.count() + (high - low).count() * dist) }; } }; /* Benchmarking measurment using some desired unit of time measurement, * e.g. T = std::chrono::milliseconds. T must be some std::chrono::duration */ template class Benchmarker { const size_t MAX_ITER; const T MAX_RUNTIME; template struct BenchWrapper { Fn fn; BenchWrapper(Fn fn) : fn(fn) { } T operator()() { auto start = std::chrono::high_resolution_clock::now(); fn(); auto end = std::chrono::high_resolution_clock::now(); return std::chrono::duration_cast(end - start); } }; public: using stats_type = Statistics; // Benchmark the functions either max_iter times or until max_runtime // seconds have elapsed max_runtime should be > 0 Benchmarker(const size_t max_iter, const std::chrono::seconds max_runtime) : MAX_ITER(max_iter) , MAX_RUNTIME(std::chrono::duration_cast(max_runtime)) { } // Create a benchmarker that will run the function for the desired number of // iterations, regardless of how long it takes Benchmarker(const size_t max_iter) : MAX_ITER(max_iter) , MAX_RUNTIME(0) { } template typename std::enable_if()())>::value, stats_type>::type operator()(Fn fn) const { return (*this)(BenchWrapper { fn }); } template typename std::enable_if()()), T>::value, stats_type>::type operator()(Fn fn) const { // Do a single un-timed warm up run fn(); T elapsed { 0 }; std::vector samples; for (size_t i = 0; i < MAX_ITER && (MAX_RUNTIME.count() == 0 || elapsed < MAX_RUNTIME); ++i, elapsed += samples.back()) { samples.push_back(fn()); } return stats_type { samples }; } }; } // namespace pico_bench template std::ostream& operator<<(std::ostream& os, const pico_bench::Statistics& stats) { os << "Statistics:\n" << "\tmax: " << stats.max().count() << stats.time_suffix << "\n" << "\tmin: " << stats.min().count() << stats.time_suffix << "\n" << "\tmedian: " << stats.median().count() << stats.time_suffix << "\n" << "\tmedian abs dev: " << stats.median_abs_dev().count() << stats.time_suffix << "\n" << "\tmean: " << stats.mean().count() << stats.time_suffix << "\n" << "\tstd dev: " << stats.std_dev().count() << stats.time_suffix << "\n" << "\t# of samples: " << stats.size(); return os; } #endif xsimd-12.1.1/include/000077500000000000000000000000001453610362700143525ustar00rootroot00000000000000xsimd-12.1.1/include/xsimd/000077500000000000000000000000001453610362700154765ustar00rootroot00000000000000xsimd-12.1.1/include/xsimd/arch/000077500000000000000000000000001453610362700164135ustar00rootroot00000000000000xsimd-12.1.1/include/xsimd/arch/generic/000077500000000000000000000000001453610362700200275ustar00rootroot00000000000000xsimd-12.1.1/include/xsimd/arch/generic/xsimd_generic_arithmetic.hpp000066400000000000000000000233471453610362700256020ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_GENERIC_ARITHMETIC_HPP #define XSIMD_GENERIC_ARITHMETIC_HPP #include #include #include #include "./xsimd_generic_details.hpp" namespace xsimd { namespace kernel { using namespace types; // bitwise_lshift template ::value, void>::type*/> inline batch bitwise_lshift(batch const& self, batch const& other, requires_arch) noexcept { return detail::apply([](T x, T y) noexcept { return x << y; }, self, other); } // bitwise_rshift template ::value, void>::type*/> inline batch bitwise_rshift(batch const& self, batch const& other, requires_arch) noexcept { return detail::apply([](T x, T y) noexcept { return x >> y; }, self, other); } // decr template inline batch decr(batch const& self, requires_arch) noexcept { return self - T(1); } // decr_if template inline batch decr_if(batch const& self, Mask const& mask, requires_arch) noexcept { return select(mask, decr(self), self); } // div template ::value, void>::type> inline batch div(batch const& self, batch const& other, requires_arch) noexcept { return detail::apply([](T x, T y) noexcept -> T { return x / y; }, self, other); } // fma template inline batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return x * y + z; } template inline batch, A> fma(batch, A> const& x, batch, A> const& y, batch, A> const& z, requires_arch) noexcept { auto res_r = fms(x.real(), y.real(), fms(x.imag(), y.imag(), z.real())); auto res_i = fma(x.real(), y.imag(), fma(x.imag(), y.real(), z.imag())); return { res_r, res_i }; } // fms template inline batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return x * y - z; } template inline batch, A> fms(batch, A> const& x, batch, A> const& y, batch, A> const& z, requires_arch) noexcept { auto res_r = fms(x.real(), y.real(), fma(x.imag(), y.imag(), z.real())); auto res_i = fma(x.real(), y.imag(), fms(x.imag(), y.real(), z.imag())); return { res_r, res_i }; } // fnma template inline batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return -x * y + z; } template inline batch, A> fnma(batch, A> const& x, batch, A> const& y, batch, A> const& z, requires_arch) noexcept { auto res_r = -fms(x.real(), y.real(), fma(x.imag(), y.imag(), z.real())); auto res_i = -fma(x.real(), y.imag(), fms(x.imag(), y.real(), z.imag())); return { res_r, res_i }; } // fnms template inline batch fnms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return -x * y - z; } template inline batch, A> fnms(batch, A> const& x, batch, A> const& y, batch, A> const& z, requires_arch) noexcept { auto res_r = -fms(x.real(), y.real(), fms(x.imag(), y.imag(), z.real())); auto res_i = -fma(x.real(), y.imag(), fma(x.imag(), y.real(), z.imag())); return { res_r, res_i }; } // hadd template ::value, void>::type*/> inline T hadd(batch const& self, requires_arch) noexcept { alignas(A::alignment()) T buffer[batch::size]; self.store_aligned(buffer); T res = 0; for (T val : buffer) { res += val; } return res; } // incr template inline batch incr(batch const& self, requires_arch) noexcept { return self + T(1); } // incr_if template inline batch incr_if(batch const& self, Mask const& mask, requires_arch) noexcept { return select(mask, incr(self), self); } // mul template ::value, void>::type*/> inline batch mul(batch const& self, batch const& other, requires_arch) noexcept { return detail::apply([](T x, T y) noexcept -> T { return x * y; }, self, other); } // rotl template inline batch rotl(batch const& self, STy other, requires_arch) noexcept { constexpr auto N = std::numeric_limits::digits; return (self << other) | (self >> (N - other)); } // rotr template inline batch rotr(batch const& self, STy other, requires_arch) noexcept { constexpr auto N = std::numeric_limits::digits; return (self >> other) | (self << (N - other)); } // sadd template inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept { return add(self, other); // no saturated arithmetic on floating point numbers } template ::value, void>::type*/> inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { auto mask = (other >> (8 * sizeof(T) - 1)); auto self_pos_branch = min(std::numeric_limits::max() - other, self); auto self_neg_branch = max(std::numeric_limits::min() - other, self); return other + select(batch_bool(mask.data), self_neg_branch, self_pos_branch); } else { const auto diffmax = std::numeric_limits::max() - self; const auto mindiff = min(diffmax, other); return self + mindiff; } } template inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept { return add(self, other); // no saturated arithmetic on floating point numbers } // ssub template inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept { return sub(self, other); // no saturated arithmetic on floating point numbers } template ::value, void>::type*/> inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { return sadd(self, -other); } else { const auto diff = min(self, other); return self - diff; } } template inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept { return sub(self, other); // no saturated arithmetic on floating point numbers } } } #endif xsimd-12.1.1/include/xsimd/arch/generic/xsimd_generic_complex.hpp000066400000000000000000000075631453610362700251220ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_GENERIC_COMPLEX_HPP #define XSIMD_GENERIC_COMPLEX_HPP #include #include "./xsimd_generic_details.hpp" namespace xsimd { namespace kernel { using namespace types; // real template inline batch real(batch const& self, requires_arch) noexcept { return self; } template inline batch real(batch, A> const& self, requires_arch) noexcept { return self.real(); } // imag template inline batch imag(batch const& /*self*/, requires_arch) noexcept { return batch(T(0)); } template inline batch imag(batch, A> const& self, requires_arch) noexcept { return self.imag(); } // arg template inline real_batch_type_t> arg(batch const& self, requires_arch) noexcept { return atan2(imag(self), real(self)); } // conj template inline complex_batch_type_t> conj(batch const& self, requires_arch) noexcept { return { real(self), -imag(self) }; } // norm template inline real_batch_type_t> norm(batch const& self, requires_arch) noexcept { return { fma(real(self), real(self), imag(self) * imag(self)) }; } // proj template inline complex_batch_type_t> proj(batch const& self, requires_arch) noexcept { using batch_type = complex_batch_type_t>; using real_batch = typename batch_type::real_batch; using real_value_type = typename real_batch::value_type; auto cond = xsimd::isinf(real(self)) || xsimd::isinf(imag(self)); return select(cond, batch_type(constants::infinity(), copysign(real_batch(real_value_type(0)), imag(self))), batch_type(self)); } template inline batch_bool isnan(batch, A> const& self, requires_arch) noexcept { return batch_bool(isnan(self.real()) || isnan(self.imag())); } template inline batch_bool isinf(batch, A> const& self, requires_arch) noexcept { return batch_bool(isinf(self.real()) || isinf(self.imag())); } template inline batch_bool isfinite(batch, A> const& self, requires_arch) noexcept { return batch_bool(isfinite(self.real()) && isfinite(self.imag())); } } } #endif xsimd-12.1.1/include/xsimd/arch/generic/xsimd_generic_details.hpp000066400000000000000000000352371453610362700250770ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_GENERIC_DETAILS_HPP #define XSIMD_GENERIC_DETAILS_HPP #include #include "../../math/xsimd_rem_pio2.hpp" #include "../../types/xsimd_generic_arch.hpp" #include "../../types/xsimd_utils.hpp" #include "../xsimd_constants.hpp" namespace xsimd { // Forward declaration. Should we put them in a separate file? template inline batch abs(batch const& self) noexcept; template inline batch abs(batch, A> const& self) noexcept; template inline bool any(batch_bool const& self) noexcept; template inline batch atan2(batch const& self, batch const& other) noexcept; template inline batch batch_cast(batch const&, batch const& out) noexcept; template inline batch bitofsign(batch const& self) noexcept; template inline batch bitwise_cast(batch const& self) noexcept; template inline batch cos(batch const& self) noexcept; template inline batch cosh(batch const& self) noexcept; template inline batch exp(batch const& self) noexcept; template inline batch fma(batch const& x, batch const& y, batch const& z) noexcept; template inline batch fms(batch const& x, batch const& y, batch const& z) noexcept; template inline batch frexp(const batch& x, const batch, A>& e) noexcept; template inline batch horner(const batch& self) noexcept; template inline batch hypot(const batch& self) noexcept; template inline batch_bool is_even(batch const& self) noexcept; template inline batch_bool is_flint(batch const& self) noexcept; template inline batch_bool is_odd(batch const& self) noexcept; template inline typename batch::batch_bool_type isinf(batch const& self) noexcept; template inline typename batch::batch_bool_type isfinite(batch const& self) noexcept; template inline typename batch::batch_bool_type isnan(batch const& self) noexcept; template inline batch ldexp(const batch& x, const batch, A>& e) noexcept; template inline batch log(batch const& self) noexcept; template inline batch nearbyint(batch const& self) noexcept; template inline batch, A> nearbyint_as_int(const batch& x) noexcept; template inline T reduce_add(batch const&) noexcept; template inline batch select(batch_bool const&, batch const&, batch const&) noexcept; template inline batch, A> select(batch_bool const&, batch, A> const&, batch, A> const&) noexcept; template inline batch sign(batch const& self) noexcept; template inline batch signnz(batch const& self) noexcept; template inline batch sin(batch const& self) noexcept; template inline batch sinh(batch const& self) noexcept; template inline std::pair, batch> sincos(batch const& self) noexcept; template inline batch sqrt(batch const& self) noexcept; template inline batch tan(batch const& self) noexcept; template inline batch, A> to_float(batch const& self) noexcept; template inline batch, A> to_int(batch const& self) noexcept; template inline batch trunc(batch const& self) noexcept; namespace kernel { namespace detail { template inline batch apply(F&& func, batch const& self, batch const& other) noexcept { constexpr std::size_t size = batch::size; alignas(A::alignment()) T self_buffer[size]; alignas(A::alignment()) T other_buffer[size]; self.store_aligned(&self_buffer[0]); other.store_aligned(&other_buffer[0]); for (std::size_t i = 0; i < size; ++i) { self_buffer[i] = func(self_buffer[i], other_buffer[i]); } return batch::load_aligned(self_buffer); } template inline batch apply_transform(F&& func, batch const& self) noexcept { static_assert(batch::size == batch::size, "Source and destination sizes must match"); constexpr std::size_t src_size = batch::size; constexpr std::size_t dest_size = batch::size; alignas(A::alignment()) T self_buffer[src_size]; alignas(A::alignment()) U other_buffer[dest_size]; self.store_aligned(&self_buffer[0]); for (std::size_t i = 0; i < src_size; ++i) { other_buffer[i] = func(self_buffer[i]); } return batch::load_aligned(other_buffer); } } // some generic fast_cast conversion namespace detail { template inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return bitwise_cast(self); } template inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return bitwise_cast(self); } template inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return bitwise_cast(self); } template inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return bitwise_cast(self); } template inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return bitwise_cast(self); } template inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return bitwise_cast(self); } template inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return bitwise_cast(self); } template inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return bitwise_cast(self); } // Provide a generic uint32_t -> float cast only if we have a // non-generic int32_t -> float fast_cast template const&>(), std::declval const&>(), A {}))> inline batch fast_cast(batch const& v, batch const&, requires_arch) noexcept { // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse batch msk_lo(0xFFFF); batch cnst65536f(65536.0f); auto v_lo = batch_cast(v & msk_lo); /* extract the 16 lowest significant bits of self */ auto v_hi = batch_cast(v >> 16); /* 16 most significant bits of v */ auto v_lo_flt = batch_cast(v_lo); /* No rounding */ auto v_hi_flt = batch_cast(v_hi); /* No rounding */ v_hi_flt = cnst65536f * v_hi_flt; /* No rounding */ return v_hi_flt + v_lo_flt; /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */ } // Provide a generic float -> uint32_t cast only if we have a // non-generic float -> int32_t fast_cast template const&>(), std::declval const&>(), A {}))> inline batch fast_cast(batch const& v, batch const&, requires_arch) noexcept { auto is_large = v >= batch(1u << 31); auto small = bitwise_cast(batch_cast(v)); auto large = bitwise_cast( batch_cast(v - batch(1u << 31)) ^ batch(1u << 31)); return bitwise_cast(select(is_large, large, small)); } } namespace detail { // Generic conversion handling machinery. Each architecture must define // conversion function when such conversions exits in the form of // intrinsic. Then we use that information to automatically decide whether // to use scalar or vector conversion when doing load / store / batch_cast struct with_fast_conversion { }; struct with_slow_conversion { }; template struct conversion_type_impl { using type = with_slow_conversion; }; using xsimd::detail::void_t; template struct conversion_type_impl&>(), std::declval&>(), std::declval()))>> { using type = with_fast_conversion; }; template using conversion_type = typename conversion_type_impl::type; } namespace detail { /* origin: boost/simdfunction/horn.hpp*/ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template inline B coef() noexcept { using value_type = typename B::value_type; return B(bit_cast(as_unsigned_integer_t(c))); } template inline B horner(const B&) noexcept { return B(typename B::value_type(0.)); } template inline B horner(const B&) noexcept { return coef(); } template inline B horner(const B& self) noexcept { return fma(self, horner(self), coef()); } /* origin: boost/simdfunction/horn1.hpp*/ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template inline B horner1(const B&) noexcept { return B(1.); } template inline B horner1(const B& x) noexcept { return x + detail::coef(); } template inline B horner1(const B& x) noexcept { return fma(x, horner1(x), detail::coef()); } } } } #endif xsimd-12.1.1/include/xsimd/arch/generic/xsimd_generic_logical.hpp000066400000000000000000000142531453610362700250570ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_GENERIC_LOGICAL_HPP #define XSIMD_GENERIC_LOGICAL_HPP #include "./xsimd_generic_details.hpp" namespace xsimd { namespace kernel { using namespace types; // from mask template inline batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept { alignas(A::alignment()) bool buffer[batch_bool::size]; // This is inefficient but should never be called. It's just a // temporary implementation until arm support is added. for (size_t i = 0; i < batch_bool::size; ++i) buffer[i] = mask & (1ull << i); return batch_bool::load_aligned(buffer); } // ge template inline batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept { return other <= self; } // gt template inline batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { return other < self; } // is_even template inline batch_bool is_even(batch const& self, requires_arch) noexcept { return is_flint(self * T(0.5)); } // is_flint template inline batch_bool is_flint(batch const& self, requires_arch) noexcept { auto frac = select(isnan(self - self), constants::nan>(), self - trunc(self)); return frac == T(0.); } // is_odd template inline batch_bool is_odd(batch const& self, requires_arch) noexcept { return is_even(self - T(1.)); } // isinf template ::value, void>::type> inline batch_bool isinf(batch const&, requires_arch) noexcept { return batch_bool(false); } template inline batch_bool isinf(batch const& self, requires_arch) noexcept { return abs(self) == std::numeric_limits::infinity(); } template inline batch_bool isinf(batch const& self, requires_arch) noexcept { return abs(self) == std::numeric_limits::infinity(); } // isfinite template ::value, void>::type> inline batch_bool isfinite(batch const&, requires_arch) noexcept { return batch_bool(true); } template inline batch_bool isfinite(batch const& self, requires_arch) noexcept { return (self - self) == 0.f; } template inline batch_bool isfinite(batch const& self, requires_arch) noexcept { return (self - self) == 0.; } // isnan template ::value, void>::type> inline batch_bool isnan(batch const&, requires_arch) noexcept { return batch_bool(false); } // le template ::value, void>::type> inline batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { return (self < other) || (self == other); } // neq template inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return !(other == self); } // logical_and template inline batch logical_and(batch const& self, batch const& other, requires_arch) noexcept { return detail::apply([](T x, T y) noexcept { return x && y; }, self, other); } // logical_or template inline batch logical_or(batch const& self, batch const& other, requires_arch) noexcept { return detail::apply([](T x, T y) noexcept { return x || y; }, self, other); } // mask template inline uint64_t mask(batch_bool const& self, requires_arch) noexcept { alignas(A::alignment()) bool buffer[batch_bool::size]; self.store_aligned(buffer); // This is inefficient but should never be called. It's just a // temporary implementation until arm support is added. uint64_t res = 0; for (size_t i = 0; i < batch_bool::size; ++i) if (buffer[i]) res |= 1ul << i; return res; } } } #endif xsimd-12.1.1/include/xsimd/arch/generic/xsimd_generic_math.hpp000066400000000000000000003304061453610362700243770ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_GENERIC_MATH_HPP #define XSIMD_GENERIC_MATH_HPP #include "../xsimd_scalar.hpp" #include "./xsimd_generic_details.hpp" #include "./xsimd_generic_trigo.hpp" #include namespace xsimd { namespace kernel { using namespace types; // abs template ::value, void>::type*/> inline batch abs(batch const& self, requires_arch) noexcept { if (std::is_unsigned::value) return self; else { auto sign = bitofsign(self); auto inv = self ^ sign; return inv - sign; } } template inline batch abs(batch, A> const& z, requires_arch) noexcept { return hypot(z.real(), z.imag()); } // batch_cast template inline batch batch_cast(batch const& self, batch const&, requires_arch) noexcept { return self; } namespace detail { template inline batch batch_cast(batch const& self, batch const& out, requires_arch, with_fast_conversion) noexcept { return fast_cast(self, out, A {}); } template inline batch batch_cast(batch const& self, batch const&, requires_arch, with_slow_conversion) noexcept { static_assert(!std::is_same::value, "there should be no conversion for this type combination"); using batch_type_in = batch; using batch_type_out = batch; static_assert(batch_type_in::size == batch_type_out::size, "compatible sizes"); alignas(A::alignment()) T_in buffer_in[batch_type_in::size]; alignas(A::alignment()) T_out buffer_out[batch_type_out::size]; self.store_aligned(&buffer_in[0]); std::copy(std::begin(buffer_in), std::end(buffer_in), std::begin(buffer_out)); return batch_type_out::load_aligned(buffer_out); } } template inline batch batch_cast(batch const& self, batch const& out, requires_arch) noexcept { return detail::batch_cast(self, out, A {}, detail::conversion_type {}); } // bitofsign template inline batch bitofsign(batch const& self, requires_arch) noexcept { static_assert(std::is_integral::value, "int type implementation"); if (std::is_unsigned::value) return batch(0); else return self >> (T)(8 * sizeof(T) - 1); } template inline batch bitofsign(batch const& self, requires_arch) noexcept { return self & constants::signmask>(); } template inline batch bitofsign(batch const& self, requires_arch) noexcept { return self & constants::signmask>(); } // bitwise_cast template inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return self; } // cbrt /* origin: boost/simd/arch/common/simd/function/cbrt.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template inline batch cbrt(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type z = abs(self); #ifndef XSIMD_NO_DENORMALS auto denormal = z < constants::smallestposval(); z = select(denormal, z * constants::twotonmb(), z); batch_type f = select(denormal, constants::twotonmbo3(), batch_type(1.)); #endif const batch_type CBRT2(bit_cast(0x3fa14518)); const batch_type CBRT4(bit_cast(0x3fcb2ff5)); const batch_type CBRT2I(bit_cast(0x3f4b2ff5)); const batch_type CBRT4I(bit_cast(0x3f214518)); using i_type = as_integer_t; i_type e; batch_type x = frexp(z, e); x = detail::horner(x); auto flag = e >= i_type(0); i_type e1 = abs(e); i_type rem = e1; e1 /= i_type(3); rem -= e1 * i_type(3); e = e1 * sign(e); const batch_type cbrt2 = select(batch_bool_cast(flag), CBRT2, CBRT2I); const batch_type cbrt4 = select(batch_bool_cast(flag), CBRT4, CBRT4I); batch_type fact = select(batch_bool_cast(rem == i_type(1)), cbrt2, batch_type(1.)); fact = select(batch_bool_cast(rem == i_type(2)), cbrt4, fact); x = ldexp(x * fact, e); x -= (x - z / (x * x)) * batch_type(1.f / 3.f); #ifndef XSIMD_NO_DENORMALS x = (x | bitofsign(self)) * f; #else x = x | bitofsign(self); #endif #ifndef XSIMD_NO_INFINITIES return select(self == batch_type(0.) || isinf(self), self, x); #else return select(self == batch_type(0.), self, x); #endif } template inline batch cbrt(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type z = abs(self); #ifndef XSIMD_NO_DENORMALS auto denormal = z < constants::smallestposval(); z = select(denormal, z * constants::twotonmb(), z); batch_type f = select(denormal, constants::twotonmbo3(), batch_type(1.)); #endif const batch_type CBRT2(bit_cast(int64_t(0x3ff428a2f98d728b))); const batch_type CBRT4(bit_cast(int64_t(0x3ff965fea53d6e3d))); const batch_type CBRT2I(bit_cast(int64_t(0x3fe965fea53d6e3d))); const batch_type CBRT4I(bit_cast(int64_t(0x3fe428a2f98d728b))); using i_type = as_integer_t; i_type e; batch_type x = frexp(z, e); x = detail::horner(x); auto flag = e >= typename i_type::value_type(0); i_type e1 = abs(e); i_type rem = e1; e1 /= i_type(3); rem -= e1 * i_type(3); e = e1 * sign(e); const batch_type cbrt2 = select(batch_bool_cast(flag), CBRT2, CBRT2I); const batch_type cbrt4 = select(batch_bool_cast(flag), CBRT4, CBRT4I); batch_type fact = select(batch_bool_cast(rem == i_type(1)), cbrt2, batch_type(1.)); fact = select(batch_bool_cast(rem == i_type(2)), cbrt4, fact); x = ldexp(x * fact, e); x -= (x - z / (x * x)) * batch_type(1. / 3.); x -= (x - z / (x * x)) * batch_type(1. / 3.); #ifndef XSIMD_NO_DENORMALS x = (x | bitofsign(self)) * f; #else x = x | bitofsign(self); #endif #ifndef XSIMD_NO_INFINITIES return select(self == batch_type(0.) || isinf(self), self, x); #else return select(self == batch_type(0.), self, x); #endif } // clip template inline batch clip(batch const& self, batch const& lo, batch const& hi, requires_arch) noexcept { return min(hi, max(self, lo)); } // copysign template ::value, void>::type> inline batch copysign(batch const& self, batch const& other, requires_arch) noexcept { return abs(self) | bitofsign(other); } // erf namespace detail { /* origin: boost/simd/arch/common/detail/generic/erf_kernel.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template struct erf_kernel; template struct erf_kernel> { using batch_type = batch; // computes erf(a0)/a0 // x is sqr(a0) and 0 <= abs(a0) <= 2/3 static inline batch_type erf1(const batch_type& x) noexcept { return detail::horner(x); } // computes erfc(x)*exp(sqr(x)) // x >= 2/3 static inline batch_type erfc2(const batch_type& x) noexcept { return detail::horner(x); } static inline batch_type erfc3(const batch_type& x) noexcept { return (batch_type(1.) - x) * detail::horner(x); } }; template struct erf_kernel> { using batch_type = batch; // computes erf(a0)/a0 // x is sqr(a0) and 0 <= abs(a0) <= 0.65 static inline batch_type erf1(const batch_type& x) noexcept { return detail::horner(x) / detail::horner(x); } // computes erfc(x)*exp(x*x) // 0.65 <= abs(x) <= 2.2 static inline batch_type erfc2(const batch_type& x) noexcept { return detail::horner(x) / detail::horner(x); } // computes erfc(x)*exp(x*x) // 2.2 <= abs(x) <= 6 static inline batch_type erfc3(const batch_type& x) noexcept { return detail::horner(x) / detail::horner(x); } // computes erfc(rx)*exp(rx*rx) // x >= 6 rx = 1/x static inline batch_type erfc4(const batch_type& x) noexcept { return detail::horner(x); } }; } /* origin: boost/simd/arch/common/simd/function/erf.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template inline batch erf(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type x = abs(self); batch_type r1(0.); auto test1 = x < batch_type(2.f / 3.f); if (any(test1)) { r1 = self * detail::erf_kernel::erf1(x * x); if (all(test1)) return r1; } batch_type z = x / (batch_type(1.) + x); z -= batch_type(0.4f); batch_type r2 = batch_type(1.) - exp(-x * x) * detail::erf_kernel::erfc2(z); r2 = select(self < batch_type(0.), -r2, r2); r1 = select(test1, r1, r2); #ifndef XSIMD_NO_INFINITIES r1 = select(xsimd::isinf(self), sign(self), r1); #endif return r1; } template inline batch erf(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type x = abs(self); batch_type xx = x * x; batch_type lim1(0.65); batch_type lim2(2.2); auto test1 = x < lim1; batch_type r1(0.); if (any(test1)) { r1 = self * detail::erf_kernel::erf1(xx); if (all(test1)) return r1; } auto test2 = x < lim2; auto test3 = test2 && !test1; batch_type ex = exp(-xx); if (any(test3)) { batch_type z = batch_type(1.) - ex * detail::erf_kernel::erfc2(x); batch_type r2 = select(self < batch_type(0.), -z, z); r1 = select(test1, r1, r2); if (all(test1 || test3)) return r1; } batch_type z = batch_type(1.) - ex * detail::erf_kernel::erfc3(x); z = select(self < batch_type(0.), -z, z); #ifndef XSIMD_NO_INFINITIES z = select(xsimd::isinf(self), sign(self), z); #endif return select(test2, r1, z); } // erfc template inline batch erfc(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type x = abs(self); auto test0 = self < batch_type(0.); batch_type r1(0.); auto test1 = 3.f * x < 2.f; batch_type z = x / (batch_type(1.) + x); if (any(test1)) { r1 = detail::erf_kernel::erfc3(z); if (all(test1)) return select(test0, batch_type(2.) - r1, r1); } z -= batch_type(0.4f); batch_type r2 = exp(-x * x) * detail::erf_kernel::erfc2(z); r1 = select(test1, r1, r2); #ifndef XSIMD_NO_INFINITIES r1 = select(x == constants::infinity(), batch_type(0.), r1); #endif return select(test0, batch_type(2.) - r1, r1); } template inline batch erfc(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type x = abs(self); batch_type xx = x * x; batch_type lim1(0.65); batch_type lim2(2.2); auto test0 = self < batch_type(0.); auto test1 = x < lim1; batch_type r1(0.); if (any(test1)) { r1 = batch_type(1.) - x * detail::erf_kernel::erf1(xx); if (all(test1)) return select(test0, batch_type(2.) - r1, r1); } auto test2 = x < lim2; auto test3 = test2 && !test1; batch_type ex = exp(-xx); if (any(test3)) { batch_type z = ex * detail::erf_kernel::erfc2(x); r1 = select(test1, r1, z); if (all(test1 || test3)) return select(test0, batch_type(2.) - r1, r1); } batch_type z = ex * detail::erf_kernel::erfc3(x); r1 = select(test2, r1, z); #ifndef XSIMD_NO_INFINITIES r1 = select(x == constants::infinity(), batch_type(0.), r1); #endif return select(test0, batch_type(2.) - r1, r1); } // estrin namespace detail { template struct estrin { B x; template inline B operator()(const Ts&... coefs) noexcept { return eval(coefs...); } private: inline B eval(const B& c0) noexcept { return c0; } inline B eval(const B& c0, const B& c1) noexcept { return fma(x, c1, c0); } template inline B eval(::xsimd::detail::index_sequence, const Tuple& tuple) { return estrin { x * x }(std::get(tuple)...); } template inline B eval(const std::tuple& tuple) noexcept { return eval(::xsimd::detail::make_index_sequence(), tuple); } template inline B eval(const std::tuple& tuple, const B& c0) noexcept { return eval(std::tuple_cat(tuple, std::make_tuple(eval(c0)))); } template inline B eval(const std::tuple& tuple, const B& c0, const B& c1) noexcept { return eval(std::tuple_cat(tuple, std::make_tuple(eval(c0, c1)))); } template inline B eval(const std::tuple& tuple, const B& c0, const B& c1, const Ts&... coefs) noexcept { return eval(std::tuple_cat(tuple, std::make_tuple(eval(c0, c1))), coefs...); } template inline B eval(const B& c0, const B& c1, const Ts&... coefs) noexcept { return eval(std::make_tuple(eval(c0, c1)), coefs...); } }; } template inline batch estrin(const batch& self) noexcept { using batch_type = batch; return detail::estrin { self }(detail::coef()...); } // exp /* origin: boost/simd/arch/common/detail/simd/expo_base.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ namespace detail { enum exp_reduction_tag { exp_tag, exp2_tag, exp10_tag }; template struct exp_reduction_base; template struct exp_reduction_base { static constexpr B maxlog() noexcept { return constants::maxlog(); } static constexpr B minlog() noexcept { return constants::minlog(); } }; template struct exp_reduction_base { static constexpr B maxlog() noexcept { return constants::maxlog10(); } static constexpr B minlog() noexcept { return constants::minlog10(); } }; template struct exp_reduction_base { static constexpr B maxlog() noexcept { return constants::maxlog2(); } static constexpr B minlog() noexcept { return constants::minlog2(); } }; template struct exp_reduction; template struct exp_reduction : exp_reduction_base, exp_tag> { using batch_type = batch; static inline batch_type approx(const batch_type& x) noexcept { batch_type y = detail::horner(x); return ++fma(y, x * x, x); } static inline batch_type reduce(const batch_type& a, batch_type& x) noexcept { batch_type k = nearbyint(constants::invlog_2() * a); x = fnma(k, constants::log_2hi(), a); x = fnma(k, constants::log_2lo(), x); return k; } }; template struct exp_reduction : exp_reduction_base, exp10_tag> { using batch_type = batch; static inline batch_type approx(const batch_type& x) noexcept { return ++(detail::horner(x) * x); } static inline batch_type reduce(const batch_type& a, batch_type& x) noexcept { batch_type k = nearbyint(constants::invlog10_2() * a); x = fnma(k, constants::log10_2hi(), a); x -= k * constants::log10_2lo(); return k; } }; template struct exp_reduction : exp_reduction_base, exp2_tag> { using batch_type = batch; static inline batch_type approx(const batch_type& x) noexcept { batch_type y = detail::horner(x); return ++fma(y, x * x, x * constants::log_2()); } static inline batch_type reduce(const batch_type& a, batch_type& x) noexcept { batch_type k = nearbyint(a); x = (a - k); return k; } }; template struct exp_reduction : exp_reduction_base, exp_tag> { using batch_type = batch; static inline batch_type approx(const batch_type& x) noexcept { batch_type t = x * x; return fnma(t, detail::horner(t), x); } static inline batch_type reduce(const batch_type& a, batch_type& hi, batch_type& lo, batch_type& x) noexcept { batch_type k = nearbyint(constants::invlog_2() * a); hi = fnma(k, constants::log_2hi(), a); lo = k * constants::log_2lo(); x = hi - lo; return k; } static inline batch_type finalize(const batch_type& x, const batch_type& c, const batch_type& hi, const batch_type& lo) noexcept { return batch_type(1.) - (((lo - (x * c) / (batch_type(2.) - c)) - hi)); } }; template struct exp_reduction : exp_reduction_base, exp10_tag> { using batch_type = batch; static inline batch_type approx(const batch_type& x) noexcept { batch_type xx = x * x; batch_type px = x * detail::horner(xx); batch_type x2 = px / (detail::horner1(xx) - px); return ++(x2 + x2); } static inline batch_type reduce(const batch_type& a, batch_type&, batch_type&, batch_type& x) noexcept { batch_type k = nearbyint(constants::invlog10_2() * a); x = fnma(k, constants::log10_2hi(), a); x = fnma(k, constants::log10_2lo(), x); return k; } static inline batch_type finalize(const batch_type&, const batch_type& c, const batch_type&, const batch_type&) noexcept { return c; } }; template struct exp_reduction : exp_reduction_base, exp2_tag> { using batch_type = batch; static inline batch_type approx(const batch_type& x) noexcept { batch_type t = x * x; return fnma(t, detail::horner(t), x); } static inline batch_type reduce(const batch_type& a, batch_type&, batch_type&, batch_type& x) noexcept { batch_type k = nearbyint(a); x = (a - k) * constants::log_2(); return k; } static inline batch_type finalize(const batch_type& x, const batch_type& c, const batch_type&, const batch_type&) noexcept { return batch_type(1.) + x + x * c / (batch_type(2.) - c); } }; template inline batch exp(batch const& self) noexcept { using batch_type = batch; using reducer_t = exp_reduction; batch_type x; batch_type k = reducer_t::reduce(self, x); x = reducer_t::approx(x); x = select(self <= reducer_t::minlog(), batch_type(0.), ldexp(x, to_int(k))); x = select(self >= reducer_t::maxlog(), constants::infinity(), x); return x; } template inline batch exp(batch const& self) noexcept { using batch_type = batch; using reducer_t = exp_reduction; batch_type hi, lo, x; batch_type k = reducer_t::reduce(self, hi, lo, x); batch_type c = reducer_t::approx(x); c = reducer_t::finalize(x, c, hi, lo); c = select(self <= reducer_t::minlog(), batch_type(0.), ldexp(c, to_int(k))); c = select(self >= reducer_t::maxlog(), constants::infinity(), c); return c; } } template inline batch exp(batch const& self, requires_arch) noexcept { return detail::exp(self); } template inline batch, A> exp(batch, A> const& self, requires_arch) noexcept { using batch_type = batch, A>; auto isincos = sincos(self.imag()); return exp(self.real()) * batch_type(std::get<1>(isincos), std::get<0>(isincos)); } // exp10 template inline batch exp10(batch const& self, requires_arch) noexcept { return detail::exp(self); } // exp2 template inline batch exp2(batch const& self, requires_arch) noexcept { return detail::exp(self); } // expm1 namespace detail { /* origin: boost/simd/arch/common/detail/generic/expm1_kernel.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template static inline batch expm1(const batch& a) noexcept { using batch_type = batch; batch_type k = nearbyint(constants::invlog_2() * a); batch_type x = fnma(k, constants::log_2hi(), a); x = fnma(k, constants::log_2lo(), x); batch_type hx = x * batch_type(0.5); batch_type hxs = x * hx; batch_type r = detail::horner(hxs); batch_type t = fnma(r, hx, batch_type(3.)); batch_type e = hxs * ((r - t) / (batch_type(6.) - x * t)); e = fms(x, e, hxs); using i_type = as_integer_t; i_type ik = to_int(k); batch_type two2mk = ::xsimd::bitwise_cast((constants::maxexponent() - ik) << constants::nmb()); batch_type y = batch_type(1.) - two2mk - (e - x); return ldexp(y, ik); } template static inline batch expm1(const batch& a) noexcept { using batch_type = batch; batch_type k = nearbyint(constants::invlog_2() * a); batch_type hi = fnma(k, constants::log_2hi(), a); batch_type lo = k * constants::log_2lo(); batch_type x = hi - lo; batch_type hxs = x * x * batch_type(0.5); batch_type r = detail::horner(hxs); batch_type t = batch_type(3.) - r * batch_type(0.5) * x; batch_type e = hxs * ((r - t) / (batch_type(6) - x * t)); batch_type c = (hi - x) - lo; e = (x * (e - c) - c) - hxs; using i_type = as_integer_t; i_type ik = to_int(k); batch_type two2mk = ::xsimd::bitwise_cast((constants::maxexponent() - ik) << constants::nmb()); batch_type ct1 = batch_type(1.) - two2mk - (e - x); batch_type ct2 = ++(x - (e + two2mk)); batch_type y = select(k < batch_type(20.), ct1, ct2); return ldexp(y, ik); } } template inline batch expm1(batch const& self, requires_arch) noexcept { using batch_type = batch; return select(self < constants::logeps(), batch_type(-1.), select(self > constants::maxlog(), constants::infinity(), detail::expm1(self))); } template inline batch, A> expm1(const batch, A>& z, requires_arch) noexcept { using batch_type = batch, A>; using real_batch = typename batch_type::real_batch; real_batch isin = sin(z.imag()); real_batch rem1 = expm1(z.real()); real_batch re = rem1 + 1.; real_batch si = sin(z.imag() * 0.5); return { rem1 - 2. * re * si * si, re * isin }; } // polar template inline batch, A> polar(const batch& r, const batch& theta, requires_arch) noexcept { auto sincosTheta = sincos(theta); return { r * sincosTheta.second, r * sincosTheta.first }; } // fdim template inline batch fdim(batch const& self, batch const& other, requires_arch) noexcept { return fmax(batch(0), self - other); } // fmod template inline batch fmod(batch const& self, batch const& other, requires_arch) noexcept { return fnma(trunc(self / other), other, self); } // frexp /* origin: boost/simd/arch/common/simd/function/ifrexp.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template inline batch frexp(const batch& self, batch, A>& exp, requires_arch) noexcept { using batch_type = batch; using int_type = as_integer_t; using i_type = batch; i_type m1f = constants::mask1frexp(); i_type r1 = m1f & ::xsimd::bitwise_cast(self); batch_type x = self & ::xsimd::bitwise_cast(~m1f); exp = (r1 >> constants::nmb()) - constants::maxexponentm1(); exp = select(batch_bool_cast(self != batch_type(0.)), exp, i_type(typename i_type::value_type(0))); return select((self != batch_type(0.)), x | ::xsimd::bitwise_cast(constants::mask2frexp()), batch_type(0.)); } // from bool template inline batch from_bool(batch_bool const& self, requires_arch) noexcept { return batch(self.data) & batch(1); } // horner template inline batch horner(const batch& self) noexcept { return detail::horner, Coefs...>(self); } // hypot template inline batch hypot(batch const& self, batch const& other, requires_arch) noexcept { return sqrt(fma(self, self, other * other)); } // ipow template inline batch ipow(batch const& self, ITy other, requires_arch) noexcept { return ::xsimd::detail::ipow(self, other); } // ldexp /* origin: boost/simd/arch/common/simd/function/ldexp.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template inline batch ldexp(const batch& self, const batch, A>& other, requires_arch) noexcept { using batch_type = batch; using itype = as_integer_t; itype ik = other + constants::maxexponent(); ik = ik << constants::nmb(); return self * ::xsimd::bitwise_cast(ik); } // lgamma template inline batch lgamma(batch const& self, requires_arch) noexcept; namespace detail { /* origin: boost/simd/arch/common/detail/generic/gammaln_kernel.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template static inline batch gammalnB(const batch& x) noexcept { return horner, 0x3ed87730, // 4.227843421859038E-001 0x3ea51a64, // 3.224669577325661E-001, 0xbd89f07e, // -6.735323259371034E-002, 0x3ca89ed8, // 2.058355474821512E-002, 0xbbf164fd, // -7.366775108654962E-003, 0x3b3ba883, // 2.863437556468661E-003, 0xbaabeab1, // -1.311620815545743E-003, 0x3a1ebb94 // 6.055172732649237E-004 >(x); } template static inline batch gammalnC(const batch& x) noexcept { return horner, 0xbf13c468, // -5.772156501719101E-001 0x3f528d34, // 8.224670749082976E-001, 0xbecd27a8, // -4.006931650563372E-001, 0x3e8a898b, // 2.705806208275915E-001, 0xbe53c04f, // -2.067882815621965E-001, 0x3e2d4dab, // 1.692415923504637E-001, 0xbe22d329, // -1.590086327657347E-001, 0x3e0c3c4f // 1.369488127325832E-001 >(x); } template static inline batch gammaln2(const batch& x) noexcept { return horner, 0x3daaaa94, // 8.333316229807355E-002f 0xbb358701, // -2.769887652139868E-003f, 0x3a31fd69 // 6.789774945028216E-004f >(x); } template static inline batch gammaln1(const batch& x) noexcept { return horner, 0xc12a0c675418055eull, // -8.53555664245765465627E5 0xc13a45890219f20bull, // -1.72173700820839662146E6, 0xc131bc82f994db51ull, // -1.16237097492762307383E6, 0xc1143d73f89089e5ull, // -3.31612992738871184744E5, 0xc0e2f234355bb93eull, // -3.88016315134637840924E4, 0xc09589018ff36761ull // -1.37825152569120859100E3 >(x) / horner, 0xc13ece4b6a11e14aull, // -2.01889141433532773231E6 0xc1435255892ff34cull, // -2.53252307177582951285E6, 0xc131628671950043ull, // -1.13933444367982507207E6, 0xc10aeb84b9744c9bull, // -2.20528590553854454839E5, 0xc0d0aa0d7b89d757ull, // -1.70642106651881159223E4, 0xc075fd0d1cf312b2ull, // -3.51815701436523470549E2, 0x3ff0000000000000ull // 1.00000000000000000000E0 >(x); } template static inline batch gammalnA(const batch& x) noexcept { return horner, 0x3fb555555555554bull, // 8.33333333333331927722E-2 0xbf66c16c16b0a5a1ull, // -2.77777777730099687205E-3, 0x3f4a019f20dc5ebbull, // 7.93650340457716943945E-4, 0xbf437fbdb580e943ull, // -5.95061904284301438324E-4, 0x3f4a985027336661ull // 8.11614167470508450300E-4 >(x); } /* origin: boost/simd/arch/common/simd/function/gammaln.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template struct lgamma_impl; template struct lgamma_impl> { using batch_type = batch; static inline batch_type compute(const batch_type& a) noexcept { auto inf_result = (a <= batch_type(0.)) && is_flint(a); batch_type x = select(inf_result, constants::nan(), a); batch_type q = abs(x); #ifndef XSIMD_NO_INFINITIES inf_result = (x == constants::infinity()) || inf_result; #endif auto ltza = a < batch_type(0.); batch_type r; batch_type r1 = other(q); if (any(ltza)) { r = select(inf_result, constants::infinity(), negative(q, r1)); if (all(ltza)) return r; } batch_type r2 = select(ltza, r, r1); return select(a == constants::minusinfinity(), constants::nan(), select(inf_result, constants::infinity(), r2)); } private: static inline batch_type negative(const batch_type& q, const batch_type& w) noexcept { batch_type p = floor(q); batch_type z = q - p; auto test2 = z < batch_type(0.5); z = select(test2, z - batch_type(1.), z); z = q * sin(z, trigo_pi_tag()); return -log(constants::invpi() * abs(z)) - w; } static inline batch_type other(const batch_type& x) noexcept { auto xlt650 = (x < batch_type(6.5)); batch_type r0x = x; batch_type r0z = x; batch_type r0s = batch_type(1.); batch_type r1 = batch_type(0.); batch_type p = constants::nan(); if (any(xlt650)) { batch_type z = batch_type(1.); batch_type tx = select(xlt650, x, batch_type(0.)); batch_type nx = batch_type(0.); const batch_type _075 = batch_type(0.75); const batch_type _150 = batch_type(1.50); const batch_type _125 = batch_type(1.25); const batch_type _250 = batch_type(2.50); auto xge150 = (x >= _150); auto txgt250 = (tx > _250); // x >= 1.5 while (any(xge150 && txgt250)) { nx = select(txgt250, nx - batch_type(1.), nx); tx = select(txgt250, x + nx, tx); z = select(txgt250, z * tx, z); txgt250 = (tx > _250); } r0x = select(xge150, x + nx - batch_type(2.), x); r0z = select(xge150, z, r0z); r0s = select(xge150, batch_type(1.), r0s); // x >= 1.25 && x < 1.5 auto xge125 = (x >= _125); auto xge125t = xge125 && !xge150; if (any(xge125)) { r0x = select(xge125t, x - batch_type(1.), r0x); r0z = select(xge125t, z * x, r0z); r0s = select(xge125t, batch_type(-1.), r0s); } // x >= 0.75 && x < 1.5 batch_bool kernelC(false); auto xge075 = (x >= _075); auto xge075t = xge075 && !xge125; if (any(xge075t)) { kernelC = xge075t; r0x = select(xge075t, x - batch_type(1.), x); r0z = select(xge075t, batch_type(1.), r0z); r0s = select(xge075t, batch_type(-1.), r0s); p = gammalnC(r0x); } // tx < 1.5 && x < 0.75 auto txlt150 = (tx < _150) && !xge075; if (any(txlt150)) { auto orig = txlt150; while (any(txlt150)) { z = select(txlt150, z * tx, z); nx = select(txlt150, nx + batch_type(1.), nx); tx = select(txlt150, x + nx, tx); txlt150 = (tx < _150) && !xge075; } r0x = select(orig, r0x + nx - batch_type(2.), r0x); r0z = select(orig, z, r0z); r0s = select(orig, batch_type(-1.), r0s); } p = select(kernelC, p, gammalnB(r0x)); if (all(xlt650)) return fma(r0x, p, r0s * log(abs(r0z))); } r0z = select(xlt650, abs(r0z), x); batch_type m = log(r0z); r1 = fma(r0x, p, r0s * m); batch_type r2 = fma(x - batch_type(0.5), m, constants::logsqrt2pi() - x); r2 += gammaln2(batch_type(1.) / (x * x)) / x; return select(xlt650, r1, r2); } }; template struct lgamma_impl> { using batch_type = batch; static inline batch_type compute(const batch_type& a) noexcept { auto inf_result = (a <= batch_type(0.)) && is_flint(a); batch_type x = select(inf_result, constants::nan(), a); batch_type q = abs(x); #ifndef XSIMD_NO_INFINITIES inf_result = (q == constants::infinity()); #endif auto test = (a < batch_type(-34.)); batch_type r = constants::nan(); if (any(test)) { r = large_negative(q); if (all(test)) return select(inf_result, constants::nan(), r); } batch_type r1 = other(a); batch_type r2 = select(test, r, r1); return select(a == constants::minusinfinity(), constants::nan(), select(inf_result, constants::infinity(), r2)); } private: static inline batch_type large_negative(const batch_type& q) noexcept { batch_type w = lgamma(q); batch_type p = floor(q); batch_type z = q - p; auto test2 = (z < batch_type(0.5)); z = select(test2, z - batch_type(1.), z); z = q * sin(z, trigo_pi_tag()); z = abs(z); return constants::logpi() - log(z) - w; } static inline batch_type other(const batch_type& xx) noexcept { batch_type x = xx; auto test = (x < batch_type(13.)); batch_type r1 = batch_type(0.); if (any(test)) { batch_type z = batch_type(1.); batch_type p = batch_type(0.); batch_type u = select(test, x, batch_type(0.)); auto test1 = (u >= batch_type(3.)); while (any(test1)) { p = select(test1, p - batch_type(1.), p); u = select(test1, x + p, u); z = select(test1, z * u, z); test1 = (u >= batch_type(3.)); } auto test2 = (u < batch_type(2.)); while (any(test2)) { z = select(test2, z / u, z); p = select(test2, p + batch_type(1.), p); u = select(test2, x + p, u); test2 = (u < batch_type(2.)); } z = abs(z); x += p - batch_type(2.); r1 = x * gammaln1(x) + log(z); if (all(test)) return r1; } batch_type r2 = fma(xx - batch_type(0.5), log(xx), constants::logsqrt2pi() - xx); batch_type p = batch_type(1.) / (xx * xx); r2 += gammalnA(p) / xx; return select(test, r1, r2); } }; } template inline batch lgamma(batch const& self, requires_arch) noexcept { return detail::lgamma_impl>::compute(self); } // log /* origin: boost/simd/arch/common/simd/function/log.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template inline batch log(batch const& self, requires_arch) noexcept { using batch_type = batch; using int_type = as_integer_t; using i_type = batch; batch_type x = self; i_type k(0); auto isnez = (self != batch_type(0.)); #ifndef XSIMD_NO_DENORMALS auto test = (self < constants::smallestposval()) && isnez; if (any(test)) { k = select(batch_bool_cast(test), k - i_type(23), k); x = select(test, x * batch_type(8388608ul), x); } #endif i_type ix = ::xsimd::bitwise_cast(x); ix += 0x3f800000 - 0x3f3504f3; k += (ix >> 23) - 0x7f; ix = (ix & i_type(0x007fffff)) + 0x3f3504f3; x = ::xsimd::bitwise_cast(ix); batch_type f = --x; batch_type s = f / (batch_type(2.) + f); batch_type z = s * s; batch_type w = z * z; batch_type t1 = w * detail::horner(w); batch_type t2 = z * detail::horner(w); batch_type R = t2 + t1; batch_type hfsq = batch_type(0.5) * f * f; batch_type dk = to_float(k); batch_type r = fma(dk, constants::log_2hi(), fma(s, (hfsq + R), dk * constants::log_2lo()) - hfsq + f); #ifndef XSIMD_NO_INFINITIES batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); #else batch_type zz = select(isnez, r, constants::minusinfinity()); #endif return select(!(self >= batch_type(0.)), constants::nan(), zz); } template inline batch log(batch const& self, requires_arch) noexcept { using batch_type = batch; using int_type = as_integer_t; using i_type = batch; batch_type x = self; i_type hx = ::xsimd::bitwise_cast(x) >> 32; i_type k(0); auto isnez = (self != batch_type(0.)); #ifndef XSIMD_NO_DENORMALS auto test = (self < constants::smallestposval()) && isnez; if (any(test)) { k = select(batch_bool_cast(test), k - i_type(54), k); x = select(test, x * batch_type(18014398509481984ull), x); } #endif hx += 0x3ff00000 - 0x3fe6a09e; k += (hx >> 20) - 0x3ff; batch_type dk = to_float(k); hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e; x = ::xsimd::bitwise_cast(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast(x))); batch_type f = --x; batch_type hfsq = batch_type(0.5) * f * f; batch_type s = f / (batch_type(2.) + f); batch_type z = s * s; batch_type w = z * z; batch_type t1 = w * detail::horner(w); batch_type t2 = z * detail::horner(w); batch_type R = t2 + t1; batch_type r = fma(dk, constants::log_2hi(), fma(s, (hfsq + R), dk * constants::log_2lo()) - hfsq + f); #ifndef XSIMD_NO_INFINITIES batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); #else batch_type zz = select(isnez, r, constants::minusinfinity()); #endif return select(!(self >= batch_type(0.)), constants::nan(), zz); } template inline batch, A> log(const batch, A>& z, requires_arch) noexcept { return batch, A>(log(abs(z)), atan2(z.imag(), z.real())); } // log2 template inline batch log2(batch const& self, requires_arch) noexcept { using batch_type = batch; using int_type = as_integer_t; using i_type = batch; batch_type x = self; i_type k(0); auto isnez = (self != batch_type(0.)); #ifndef XSIMD_NO_DENORMALS auto test = (self < constants::smallestposval()) && isnez; if (any(test)) { k = select(batch_bool_cast(test), k - i_type(25), k); x = select(test, x * batch_type(33554432ul), x); } #endif i_type ix = ::xsimd::bitwise_cast(x); ix += 0x3f800000 - 0x3f3504f3; k += (ix >> 23) - 0x7f; ix = (ix & i_type(0x007fffff)) + 0x3f3504f3; x = ::xsimd::bitwise_cast(ix); batch_type f = --x; batch_type s = f / (batch_type(2.) + f); batch_type z = s * s; batch_type w = z * z; batch_type t1 = w * detail::horner(w); batch_type t2 = z * detail::horner(w); batch_type R = t1 + t2; batch_type hfsq = batch_type(0.5) * f * f; batch_type dk = to_float(k); batch_type r = fma(fms(s, hfsq + R, hfsq) + f, constants::invlog_2(), dk); #ifndef XSIMD_NO_INFINITIES batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); #else batch_type zz = select(isnez, r, constants::minusinfinity()); #endif return select(!(self >= batch_type(0.)), constants::nan(), zz); } template inline batch log2(batch const& self, requires_arch) noexcept { using batch_type = batch; using int_type = as_integer_t; using i_type = batch; batch_type x = self; i_type hx = ::xsimd::bitwise_cast(x) >> 32; i_type k(0); auto isnez = (self != batch_type(0.)); #ifndef XSIMD_NO_DENORMALS auto test = (self < constants::smallestposval()) && isnez; if (any(test)) { k = select(batch_bool_cast(test), k - i_type(54), k); x = select(test, x * batch_type(18014398509481984ull), x); } #endif hx += 0x3ff00000 - 0x3fe6a09e; k += (hx >> 20) - 0x3ff; hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e; x = ::xsimd::bitwise_cast(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast(x))); batch_type f = --x; batch_type s = f / (batch_type(2.) + f); batch_type z = s * s; batch_type w = z * z; batch_type t1 = w * detail::horner(w); batch_type t2 = z * detail::horner(w); batch_type R = t2 + t1; batch_type hfsq = batch_type(0.5) * f * f; batch_type hi = f - hfsq; hi = hi & ::xsimd::bitwise_cast((constants::allbits() << 32)); batch_type lo = fma(s, hfsq + R, f - hi - hfsq); batch_type val_hi = hi * constants::invlog_2hi(); batch_type val_lo = fma(lo + hi, constants::invlog_2lo(), lo * constants::invlog_2hi()); batch_type dk = to_float(k); batch_type w1 = dk + val_hi; val_lo += (dk - w1) + val_hi; val_hi = w1; batch_type r = val_lo + val_hi; #ifndef XSIMD_NO_INFINITIES batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); #else batch_type zz = select(isnez, r, constants::minusinfinity()); #endif return select(!(self >= batch_type(0.)), constants::nan(), zz); } namespace detail { template inline batch logN_complex_impl(const batch& z, typename batch::value_type base) noexcept { using batch_type = batch; using rv_type = typename batch_type::value_type; return log(z) / batch_type(rv_type(base)); } } template inline batch, A> log2(batch, A> const& self, requires_arch) noexcept { return detail::logN_complex_impl(self, std::log(2)); } // log10 /* origin: FreeBSD /usr/src/lib/msun/src/e_log10f.c */ /* * ==================================================== * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. * * Developed at SunPro, a Sun Microsystems, Inc. business. * Permission to use, copy, modify, and distribute this * software is freely granted, provided that this notice * is preserved. * ==================================================== */ template inline batch log10(batch const& self, requires_arch) noexcept { using batch_type = batch; const batch_type ivln10hi(4.3432617188e-01f), ivln10lo(-3.1689971365e-05f), log10_2hi(3.0102920532e-01f), log10_2lo(7.9034151668e-07f); using int_type = as_integer_t; using i_type = batch; batch_type x = self; i_type k(0); auto isnez = (self != batch_type(0.)); #ifndef XSIMD_NO_DENORMALS auto test = (self < constants::smallestposval()) && isnez; if (any(test)) { k = select(batch_bool_cast(test), k - i_type(25), k); x = select(test, x * batch_type(33554432ul), x); } #endif i_type ix = ::xsimd::bitwise_cast(x); ix += 0x3f800000 - 0x3f3504f3; k += (ix >> 23) - 0x7f; ix = (ix & i_type(0x007fffff)) + 0x3f3504f3; x = ::xsimd::bitwise_cast(ix); batch_type f = --x; batch_type s = f / (batch_type(2.) + f); batch_type z = s * s; batch_type w = z * z; batch_type t1 = w * detail::horner(w); batch_type t2 = z * detail::horner(w); batch_type R = t2 + t1; batch_type dk = to_float(k); batch_type hfsq = batch_type(0.5) * f * f; batch_type hibits = f - hfsq; hibits &= ::xsimd::bitwise_cast(i_type(0xfffff000)); batch_type lobits = fma(s, hfsq + R, f - hibits - hfsq); batch_type r = fma(dk, log10_2hi, fma(hibits, ivln10hi, fma(lobits, ivln10hi, fma(lobits + hibits, ivln10lo, dk * log10_2lo)))); #ifndef XSIMD_NO_INFINITIES batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); #else batch_type zz = select(isnez, r, constants::minusinfinity()); #endif return select(!(self >= batch_type(0.)), constants::nan(), zz); } template inline batch log10(batch const& self, requires_arch) noexcept { using batch_type = batch; const batch_type ivln10hi(4.34294481878168880939e-01), ivln10lo(2.50829467116452752298e-11), log10_2hi(3.01029995663611771306e-01), log10_2lo(3.69423907715893078616e-13); using int_type = as_integer_t; using i_type = batch; batch_type x = self; i_type hx = ::xsimd::bitwise_cast(x) >> 32; i_type k(0); auto isnez = (self != batch_type(0.)); #ifndef XSIMD_NO_DENORMALS auto test = (self < constants::smallestposval()) && isnez; if (any(test)) { k = select(batch_bool_cast(test), k - i_type(54), k); x = select(test, x * batch_type(18014398509481984ull), x); } #endif hx += 0x3ff00000 - 0x3fe6a09e; k += (hx >> 20) - 0x3ff; hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e; x = ::xsimd::bitwise_cast(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast(x))); batch_type f = --x; batch_type dk = to_float(k); batch_type s = f / (batch_type(2.) + f); batch_type z = s * s; batch_type w = z * z; batch_type t1 = w * detail::horner(w); batch_type t2 = z * detail::horner(w); batch_type R = t2 + t1; batch_type hfsq = batch_type(0.5) * f * f; batch_type hi = f - hfsq; hi = hi & ::xsimd::bitwise_cast(constants::allbits() << 32); batch_type lo = f - hi - hfsq + s * (hfsq + R); batch_type val_hi = hi * ivln10hi; batch_type y = dk * log10_2hi; batch_type val_lo = dk * log10_2lo + (lo + hi) * ivln10lo + lo * ivln10hi; batch_type w1 = y + val_hi; val_lo += (y - w1) + val_hi; val_hi = w1; batch_type r = val_lo + val_hi; #ifndef XSIMD_NO_INFINITIES batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); #else batch_type zz = select(isnez, r, constants::minusinfinity()); #endif return select(!(self >= batch_type(0.)), constants::nan(), zz); } template inline batch, A> log10(const batch, A>& z, requires_arch) noexcept { return detail::logN_complex_impl(z, std::log(10)); } // log1p /* origin: boost/simd/arch/common/simd/function/log1p.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template inline batch log1p(batch const& self, requires_arch) noexcept { using batch_type = batch; using int_type = as_integer_t; using i_type = batch; const batch_type uf = self + batch_type(1.); auto isnez = (uf != batch_type(0.)); i_type iu = ::xsimd::bitwise_cast(uf); iu += 0x3f800000 - 0x3f3504f3; i_type k = (iu >> 23) - 0x7f; iu = (iu & i_type(0x007fffff)) + 0x3f3504f3; batch_type f = --(::xsimd::bitwise_cast(iu)); batch_type s = f / (batch_type(2.) + f); batch_type z = s * s; batch_type w = z * z; batch_type t1 = w * detail::horner(w); batch_type t2 = z * detail::horner(w); batch_type R = t2 + t1; batch_type hfsq = batch_type(0.5) * f * f; batch_type dk = to_float(k); /* correction term ~ log(1+x)-log(u), avoid underflow in c/u */ batch_type c = select(batch_bool_cast(k >= i_type(2)), batch_type(1.) - (uf - self), self - (uf - batch_type(1.))) / uf; batch_type r = fma(dk, constants::log_2hi(), fma(s, (hfsq + R), dk * constants::log_2lo() + c) - hfsq + f); #ifndef XSIMD_NO_INFINITIES batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); #else batch_type zz = select(isnez, r, constants::minusinfinity()); #endif return select(!(uf >= batch_type(0.)), constants::nan(), zz); } template inline batch log1p(batch const& self, requires_arch) noexcept { using batch_type = batch; using int_type = as_integer_t; using i_type = batch; const batch_type uf = self + batch_type(1.); auto isnez = (uf != batch_type(0.)); i_type hu = ::xsimd::bitwise_cast(uf) >> 32; hu += 0x3ff00000 - 0x3fe6a09e; i_type k = (hu >> 20) - 0x3ff; /* correction term ~ log(1+x)-log(u), avoid underflow in c/u */ batch_type c = select(batch_bool_cast(k >= i_type(2)), batch_type(1.) - (uf - self), self - (uf - batch_type(1.))) / uf; hu = (hu & i_type(0x000fffff)) + 0x3fe6a09e; batch_type f = ::xsimd::bitwise_cast((hu << 32) | (i_type(0xffffffff) & ::xsimd::bitwise_cast(uf))); f = --f; batch_type hfsq = batch_type(0.5) * f * f; batch_type s = f / (batch_type(2.) + f); batch_type z = s * s; batch_type w = z * z; batch_type t1 = w * detail::horner(w); batch_type t2 = z * detail::horner(w); batch_type R = t2 + t1; batch_type dk = to_float(k); batch_type r = fma(dk, constants::log_2hi(), fma(s, hfsq + R, dk * constants::log_2lo() + c) - hfsq + f); #ifndef XSIMD_NO_INFINITIES batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); #else batch_type zz = select(isnez, r, constants::minusinfinity()); #endif return select(!(uf >= batch_type(0.)), constants::nan(), zz); } template inline batch, A> log1p(batch, A> const& self, requires_arch) noexcept { using batch_type = batch, A>; using real_batch = typename batch_type::real_batch; batch_type u = 1 + self; batch_type logu = log(u); return select(u == batch_type(1.), self, select(u.real() <= real_batch(0.), logu, logu * self / (u - batch_type(1.)))); } // mod template ::value, void>::type> inline batch mod(batch const& self, batch const& other, requires_arch) noexcept { return detail::apply([](T x, T y) noexcept -> T { return x % y; }, self, other); } // nearbyint template ::value, void>::type> inline batch nearbyint(batch const& self, requires_arch) noexcept { return self; } namespace detail { template inline batch nearbyintf(batch const& self) noexcept { using batch_type = batch; batch_type s = bitofsign(self); batch_type v = self ^ s; batch_type t2n = constants::twotonmb(); // Under fast-math, reordering is possible and the compiler optimizes d // to v. That's not what we want, so prevent compiler optimization here. // FIXME: it may be better to emit a memory barrier here (?). #ifdef __FAST_MATH__ volatile batch_type d0 = v + t2n; batch_type d = *(batch_type*)(void*)(&d0) - t2n; #else batch_type d0 = v + t2n; batch_type d = d0 - t2n; #endif return s ^ select(v < t2n, d, v); } } template inline batch nearbyint(batch const& self, requires_arch) noexcept { return detail::nearbyintf(self); } template inline batch nearbyint(batch const& self, requires_arch) noexcept { return detail::nearbyintf(self); } // nearbyint_as_int template ::value, void>::type> inline batch nearbyint_as_int(batch const& self, requires_arch) noexcept { return self; } // nearbyint_as_int template inline batch, A> nearbyint_as_int(batch const& self, requires_arch) noexcept { using U = as_integer_t; return kernel::detail::apply_transform([](float x) noexcept -> U { return std::nearbyintf(x); }, self); } template inline batch, A> nearbyint_as_int(batch const& self, requires_arch) noexcept { using U = as_integer_t; return kernel::detail::apply_transform([](double x) noexcept -> U { return std::nearbyint(x); }, self); } // nextafter namespace detail { template ::value> struct nextafter_kernel { using batch_type = batch; static inline batch_type next(batch_type const& b) noexcept { return b; } static inline batch_type prev(batch_type const& b) noexcept { return b; } }; template struct bitwise_cast_batch; template struct bitwise_cast_batch { using type = batch; }; template struct bitwise_cast_batch { using type = batch; }; template struct nextafter_kernel { using batch_type = batch; using int_batch = typename bitwise_cast_batch::type; using int_type = typename int_batch::value_type; static inline batch_type next(const batch_type& b) noexcept { batch_type n = ::xsimd::bitwise_cast(::xsimd::bitwise_cast(b) + int_type(1)); return select(b == constants::infinity(), b, n); } static inline batch_type prev(const batch_type& b) noexcept { batch_type p = ::xsimd::bitwise_cast(::xsimd::bitwise_cast(b) - int_type(1)); return select(b == constants::minusinfinity(), b, p); } }; } template inline batch nextafter(batch const& from, batch const& to, requires_arch) noexcept { using kernel = detail::nextafter_kernel; return select(from == to, from, select(to > from, kernel::next(from), kernel::prev(from))); } // pow /* origin: boost/simd/arch/common/simd/function/pow.hpp*/ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template inline batch pow(batch const& self, batch const& other, requires_arch) noexcept { using batch_type = batch; const auto zero = batch_type(0.); auto negx = self < zero; auto iszero = self == zero; constexpr T e = static_cast(2.718281828459045); auto adj_self = select(iszero, batch_type(e), abs(self)); batch_type z = exp(other * log(adj_self)); z = select(iszero, zero, z); z = select(is_odd(other) && negx, -z, z); auto invalid = negx && !(is_flint(other) || isinf(other)); return select(invalid, constants::nan(), z); } template inline batch, A> pow(const batch, A>& a, const batch, A>& z, requires_arch) noexcept { using cplx_batch = batch, A>; using real_batch = typename cplx_batch::real_batch; real_batch absa = abs(a); real_batch arga = arg(a); real_batch x = z.real(); real_batch y = z.imag(); real_batch r = pow(absa, x); real_batch theta = x * arga; real_batch ze(0); auto cond = (y == ze); r = select(cond, r, r * exp(-y * arga)); theta = select(cond, theta, theta + y * log(absa)); return select(absa == ze, cplx_batch(ze), cplx_batch(r * cos(theta), r * sin(theta))); } // reciprocal template ::value, void>::type> inline batch reciprocal(batch const& self, requires_arch) noexcept { using batch_type = batch; return div(batch_type(1), self); } // reduce_add template inline std::complex reduce_add(batch, A> const& self, requires_arch) noexcept { return { reduce_add(self.real()), reduce_add(self.imag()) }; } namespace detail { template struct split_high { static constexpr T get(T i, T) { return i >= N ? (i % 2) : i + N; } }; template inline T reduce(Op, batch const& self, std::integral_constant) noexcept { return self.get(0); } template inline T reduce(Op op, batch const& self, std::integral_constant) noexcept { using index_type = as_unsigned_integer_t; batch split = swizzle(self, make_batch_constant, split_high>()); return reduce(op, op(split, self), std::integral_constant()); } } // reduce_max template inline T reduce_max(batch const& self, requires_arch) noexcept { return detail::reduce([](batch const& x, batch const& y) { return max(x, y); }, self, std::integral_constant::size>()); } // reduce_min template inline T reduce_min(batch const& self, requires_arch) noexcept { return detail::reduce([](batch const& x, batch const& y) { return min(x, y); }, self, std::integral_constant::size>()); } // remainder template inline batch remainder(batch const& self, batch const& other, requires_arch) noexcept { return fnma(nearbyint(self / other), other, self); } template inline batch remainder(batch const& self, batch const& other, requires_arch) noexcept { return fnma(nearbyint(self / other), other, self); } template ::value, void>::type> inline batch remainder(batch const& self, batch const& other, requires_arch) noexcept { auto mod = self % other; return select(mod <= other / 2, mod, mod - other); } // select template inline batch, A> select(batch_bool const& cond, batch, A> const& true_br, batch, A> const& false_br, requires_arch) noexcept { return { select(cond, true_br.real(), false_br.real()), select(cond, true_br.imag(), false_br.imag()) }; } // sign template ::value, void>::type> inline batch sign(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type res = select(self > batch_type(0), batch_type(1), batch_type(0)) - select(self < batch_type(0), batch_type(1), batch_type(0)); return res; } namespace detail { template inline batch signf(batch const& self) noexcept { using batch_type = batch; batch_type res = select(self > batch_type(0.f), batch_type(1.f), batch_type(0.f)) - select(self < batch_type(0.f), batch_type(1.f), batch_type(0.f)); #ifdef XSIMD_NO_NANS return res; #else return select(isnan(self), constants::nan(), res); #endif } } template inline batch sign(batch const& self, requires_arch) noexcept { return detail::signf(self); } template inline batch sign(batch const& self, requires_arch) noexcept { return detail::signf(self); } template inline batch, A> sign(const batch, A>& z, requires_arch) noexcept { using batch_type = batch, A>; using real_batch = typename batch_type::real_batch; auto rz = z.real(); auto iz = z.imag(); return select(rz != real_batch(0.), batch_type(sign(rz)), batch_type(sign(iz))); } // signnz template ::value, void>::type> inline batch signnz(batch const& self, requires_arch) noexcept { using batch_type = batch; return (self >> (sizeof(T) * 8 - 1)) | batch_type(1.); } namespace detail { template inline batch signnzf(batch const& self) noexcept { using batch_type = batch; #ifndef XSIMD_NO_NANS return select(isnan(self), constants::nan(), batch_type(1.) | (constants::signmask() & self)); #else return batch_type(1.) | (constants::signmask() & self); #endif } } template inline batch signnz(batch const& self, requires_arch) noexcept { return detail::signnzf(self); } template inline batch signnz(batch const& self, requires_arch) noexcept { return detail::signnzf(self); } // sqrt template inline batch, A> sqrt(batch, A> const& z, requires_arch) noexcept { constexpr T csqrt_scale_factor = std::is_same::value ? 6.7108864e7f : 1.8014398509481984e16; constexpr T csqrt_scale = std::is_same::value ? 1.220703125e-4f : 7.450580596923828125e-9; using batch_type = batch, A>; using real_batch = batch; real_batch x = z.real(); real_batch y = z.imag(); real_batch sqrt_x = sqrt(fabs(x)); real_batch sqrt_hy = sqrt(0.5 * fabs(y)); auto cond = (fabs(x) > real_batch(4.) || fabs(y) > real_batch(4.)); x = select(cond, x * 0.25, x * csqrt_scale_factor); y = select(cond, y * 0.25, y * csqrt_scale_factor); real_batch scale = select(cond, real_batch(2.), real_batch(csqrt_scale)); real_batch r = abs(batch_type(x, y)); auto condxp = x > real_batch(0.); real_batch t0 = select(condxp, xsimd::sqrt(0.5 * (r + x)), xsimd::sqrt(0.5 * (r - x))); real_batch r0 = scale * fabs((0.5 * y) / t0); t0 *= scale; real_batch t = select(condxp, t0, r0); r = select(condxp, r0, t0); batch_type resg = select(y < real_batch(0.), batch_type(t, -r), batch_type(t, r)); real_batch ze(0.); return select(y == ze, select(x == ze, batch_type(ze, ze), select(x < ze, batch_type(ze, sqrt_x), batch_type(sqrt_x, ze))), select(x == ze, select(y > ze, batch_type(sqrt_hy, sqrt_hy), batch_type(sqrt_hy, -sqrt_hy)), resg)); } // tgamma namespace detail { /* origin: boost/simd/arch/common/detail/generic/stirling_kernel.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template struct stirling_kernel; template struct stirling_kernel> { using batch_type = batch; static inline batch_type compute(const batch_type& x) noexcept { return horner(x); } static inline batch_type split_limit() noexcept { return batch_type(bit_cast(uint32_t(0x41d628f6))); } static inline batch_type large_limit() noexcept { return batch_type(bit_cast(uint32_t(0x420c28f3))); } }; template struct stirling_kernel> { using batch_type = batch; static inline batch_type compute(const batch_type& x) noexcept { return horner(x); } static inline batch_type split_limit() noexcept { return batch_type(bit_cast(uint64_t(0x4061e083ba3443d4))); } static inline batch_type large_limit() noexcept { return batch_type(bit_cast(uint64_t(0x4065800000000000))); } }; /* origin: boost/simd/arch/common/simd/function/stirling.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template inline batch stirling(const batch& a) noexcept { using batch_type = batch; const batch_type stirlingsplitlim = stirling_kernel::split_limit(); const batch_type stirlinglargelim = stirling_kernel::large_limit(); batch_type x = select(a >= batch_type(0.), a, constants::nan()); batch_type w = batch_type(1.) / x; w = fma(w, stirling_kernel::compute(w), batch_type(1.)); batch_type y = exp(-x); auto test = (x < stirlingsplitlim); batch_type z = x - batch_type(0.5); z = select(test, z, batch_type(0.5) * z); batch_type v = exp(z * log(abs(x))); y *= v; y = select(test, y, y * v); y *= constants::sqrt_2pi() * w; #ifndef XSIMD_NO_INFINITIES y = select(isinf(x), x, y); #endif return select(x > stirlinglargelim, constants::infinity(), y); } /* origin: boost/simd/arch/common/detail/generic/gamma_kernel.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template struct tgamma_kernel; template struct tgamma_kernel> { using batch_type = batch; static inline batch_type compute(const batch_type& x) noexcept { return horner(x); } }; template struct tgamma_kernel> { using batch_type = batch; static inline batch_type compute(const batch_type& x) noexcept { return horner(x) / horner(x); } }; /* origin: boost/simd/arch/common/simd/function/gamma.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template inline B tgamma_large_negative(const B& a) noexcept { B st = stirling(a); B p = floor(a); B sgngam = select(is_even(p), -B(1.), B(1.)); B z = a - p; auto test2 = z < B(0.5); z = select(test2, z - B(1.), z); z = a * sin(z, trigo_pi_tag()); z = abs(z); return sgngam * constants::pi() / (z * st); } template inline B tgamma_other(const B& a, const BB& test) noexcept { B x = select(test, B(2.), a); #ifndef XSIMD_NO_INFINITIES auto inf_result = (a == constants::infinity()); x = select(inf_result, B(2.), x); #endif B z = B(1.); auto test1 = (x >= B(3.)); while (any(test1)) { x = select(test1, x - B(1.), x); z = select(test1, z * x, z); test1 = (x >= B(3.)); } test1 = (x < B(0.)); while (any(test1)) { z = select(test1, z / x, z); x = select(test1, x + B(1.), x); test1 = (x < B(0.)); } auto test2 = (x < B(2.)); while (any(test2)) { z = select(test2, z / x, z); x = select(test2, x + B(1.), x); test2 = (x < B(2.)); } x = z * tgamma_kernel::compute(x - B(2.)); #ifndef XSIMD_NO_INFINITIES return select(inf_result, a, x); #else return x; #endif } } template inline batch tgamma(batch const& self, requires_arch) noexcept { using batch_type = batch; auto nan_result = (self < batch_type(0.) && is_flint(self)); #ifndef XSIMD_NO_INVALIDS nan_result = isnan(self) || nan_result; #endif batch_type q = abs(self); auto test = (self < batch_type(-33.)); batch_type r = constants::nan(); if (any(test)) { r = detail::tgamma_large_negative(q); if (all(test)) return select(nan_result, constants::nan(), r); } batch_type r1 = detail::tgamma_other(self, test); batch_type r2 = select(test, r, r1); return select(self == batch_type(0.), copysign(constants::infinity(), self), select(nan_result, constants::nan(), r2)); } } } #endif xsimd-12.1.1/include/xsimd/arch/generic/xsimd_generic_memory.hpp000066400000000000000000000647261453610362700247670ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_GENERIC_MEMORY_HPP #define XSIMD_GENERIC_MEMORY_HPP #include #include #include #include "../../types/xsimd_batch_constant.hpp" #include "./xsimd_generic_details.hpp" namespace xsimd { template struct batch_constant; template struct batch_bool_constant; namespace kernel { using namespace types; // compress namespace detail { template inline batch create_compress_swizzle_mask(I bitmask, ::xsimd::detail::index_sequence) { batch swizzle_mask(IT(0)); alignas(A::alignment()) IT mask_buffer[batch::size] = { Is... }; size_t inserted = 0; for (size_t i = 0; i < sizeof...(Is); ++i) if ((bitmask >> i) & 1u) std::swap(mask_buffer[inserted++], mask_buffer[i]); return batch::load_aligned(&mask_buffer[0]); } } template inline batch compress(batch const& x, batch_bool const& mask, kernel::requires_arch) noexcept { using IT = as_unsigned_integer_t; constexpr std::size_t size = batch_bool::size; auto bitmask = mask.mask(); auto z = select(mask, x, batch((T)0)); auto compress_mask = detail::create_compress_swizzle_mask(bitmask, ::xsimd::detail::make_index_sequence()); return swizzle(z, compress_mask); } // expand namespace detail { template inline batch create_expand_swizzle_mask(I bitmask, ::xsimd::detail::index_sequence) { batch swizzle_mask(IT(0)); IT j = 0; (void)std::initializer_list { ((swizzle_mask = insert(swizzle_mask, j, index())), (j += ((bitmask >> Is) & 1u)), true)... }; return swizzle_mask; } } template inline batch expand(batch const& x, batch_bool const& mask, kernel::requires_arch) noexcept { constexpr std::size_t size = batch_bool::size; auto bitmask = mask.mask(); auto swizzle_mask = detail::create_expand_swizzle_mask, A>(bitmask, ::xsimd::detail::make_index_sequence()); auto z = swizzle(x, swizzle_mask); return select(mask, z, batch(T(0))); } // extract_pair template inline batch extract_pair(batch const& self, batch const& other, std::size_t i, requires_arch) noexcept { constexpr std::size_t size = batch::size; assert(i < size && "index in bounds"); alignas(A::alignment()) T self_buffer[size]; self.store_aligned(self_buffer); alignas(A::alignment()) T other_buffer[size]; other.store_aligned(other_buffer); alignas(A::alignment()) T concat_buffer[size]; for (std::size_t j = 0; j < (size - i); ++j) { concat_buffer[j] = other_buffer[i + j]; if (j < i) { concat_buffer[size - 1 - j] = self_buffer[i - 1 - j]; } } return batch::load_aligned(concat_buffer); } // gather namespace detail { template ::type = 0> inline batch gather(U const* src, batch const& index, ::xsimd::index I) noexcept { return insert(batch {}, static_cast(src[index.get(I)]), I); } template ::type = 0> inline batch gather(U const* src, batch const& index, ::xsimd::index I) noexcept { static_assert(N <= batch::size, "Incorrect value in recursion!"); const auto test = gather(src, index, {}); return insert(test, static_cast(src[index.get(I)]), I); } } // namespace detail template inline batch gather(batch const&, T const* src, batch const& index, kernel::requires_arch) noexcept { static_assert(batch::size == batch::size, "Index and destination sizes must match"); return detail::gather::size - 1, T, A>(src, index, {}); } // Gather with runtime indexes and mismatched strides. template inline detail::sizes_mismatch_t> gather(batch const&, U const* src, batch const& index, kernel::requires_arch) noexcept { static_assert(batch::size == batch::size, "Index and destination sizes must match"); return detail::gather::size - 1, T, A>(src, index, {}); } // Gather with runtime indexes and matching strides. template inline detail::stride_match_t> gather(batch const&, U const* src, batch const& index, kernel::requires_arch) noexcept { static_assert(batch::size == batch::size, "Index and destination sizes must match"); return batch_cast(kernel::gather(batch {}, src, index, A {})); } // insert template inline batch insert(batch const& self, T val, index, requires_arch) noexcept { struct index_mask { static constexpr bool get(size_t index, size_t /* size*/) { return index != I; } }; batch tmp(val); return select(make_batch_bool_constant, index_mask>(), self, tmp); } // get template inline T get(batch const& self, ::xsimd::index, requires_arch) noexcept { alignas(A::alignment()) T buffer[batch::size]; self.store_aligned(&buffer[0]); return buffer[I]; } template inline T get(batch_bool const& self, ::xsimd::index, requires_arch) noexcept { alignas(A::alignment()) T buffer[batch_bool::size]; self.store_aligned(&buffer[0]); return buffer[I]; } template inline auto get(batch, A> const& self, ::xsimd::index, requires_arch) noexcept -> typename batch, A>::value_type { alignas(A::alignment()) T buffer[batch, A>::size]; self.store_aligned(&buffer[0]); return buffer[I]; } template inline T get(batch const& self, std::size_t i, requires_arch) noexcept { alignas(A::alignment()) T buffer[batch::size]; self.store_aligned(&buffer[0]); return buffer[i]; } template inline T get(batch_bool const& self, std::size_t i, requires_arch) noexcept { alignas(A::alignment()) bool buffer[batch_bool::size]; self.store_aligned(&buffer[0]); return buffer[i]; } template inline auto get(batch, A> const& self, std::size_t i, requires_arch) noexcept -> typename batch, A>::value_type { using T2 = typename batch, A>::value_type; alignas(A::alignment()) T2 buffer[batch, A>::size]; self.store_aligned(&buffer[0]); return buffer[i]; } // load_aligned namespace detail { template inline batch load_aligned(T_in const* mem, convert, requires_arch, with_fast_conversion) noexcept { using batch_type_in = batch; using batch_type_out = batch; return fast_cast(batch_type_in::load_aligned(mem), batch_type_out(), A {}); } template inline batch load_aligned(T_in const* mem, convert, requires_arch, with_slow_conversion) noexcept { static_assert(!std::is_same::value, "there should be a direct load for this type combination"); using batch_type_out = batch; alignas(A::alignment()) T_out buffer[batch_type_out::size]; std::copy(mem, mem + batch_type_out::size, std::begin(buffer)); return batch_type_out::load_aligned(buffer); } } template inline batch load_aligned(T_in const* mem, convert cvt, requires_arch) noexcept { return detail::load_aligned(mem, cvt, A {}, detail::conversion_type {}); } // load_unaligned namespace detail { template inline batch load_unaligned(T_in const* mem, convert, requires_arch, with_fast_conversion) noexcept { using batch_type_in = batch; using batch_type_out = batch; return fast_cast(batch_type_in::load_unaligned(mem), batch_type_out(), A {}); } template inline batch load_unaligned(T_in const* mem, convert cvt, requires_arch, with_slow_conversion) noexcept { static_assert(!std::is_same::value, "there should be a direct load for this type combination"); return load_aligned(mem, cvt, generic {}, with_slow_conversion {}); } } template inline batch load_unaligned(T_in const* mem, convert cvt, requires_arch) noexcept { return detail::load_unaligned(mem, cvt, generic {}, detail::conversion_type {}); } // rotate_left template inline batch rotate_left(batch const& self, requires_arch) noexcept { struct rotate_generator { static constexpr size_t get(size_t index, size_t size) { return (index - N) % size; } }; return swizzle(self, make_batch_constant, A>, rotate_generator>(), A {}); } template inline batch, A> rotate_left(batch, A> const& self, requires_arch) noexcept { return { rotate_left(self.real()), rotate_left(self.imag()) }; } // rotate_right template inline batch rotate_right(batch const& self, requires_arch) noexcept { struct rotate_generator { static constexpr size_t get(size_t index, size_t size) { return (index + N) % size; } }; return swizzle(self, make_batch_constant, A>, rotate_generator>(), A {}); } template inline batch, A> rotate_right(batch, A> const& self, requires_arch) noexcept { return { rotate_right(self.real()), rotate_right(self.imag()) }; } // Scatter with runtime indexes. namespace detail { template ::type = 0> inline void scatter(batch const& src, U* dst, batch const& index, ::xsimd::index I) noexcept { dst[index.get(I)] = static_cast(src.get(I)); } template ::type = 0> inline void scatter(batch const& src, U* dst, batch const& index, ::xsimd::index I) noexcept { static_assert(N <= batch::size, "Incorrect value in recursion!"); kernel::detail::scatter( src, dst, index, {}); dst[index.get(I)] = static_cast(src.get(I)); } } // namespace detail template inline void scatter(batch const& src, T* dst, batch const& index, kernel::requires_arch) noexcept { static_assert(batch::size == batch::size, "Source and index sizes must match"); kernel::detail::scatter::size - 1, T, A, T, V>( src, dst, index, {}); } template inline detail::sizes_mismatch_t scatter(batch const& src, U* dst, batch const& index, kernel::requires_arch) noexcept { static_assert(batch::size == batch::size, "Source and index sizes must match"); kernel::detail::scatter::size - 1, T, A, U, V>( src, dst, index, {}); } template inline detail::stride_match_t scatter(batch const& src, U* dst, batch const& index, kernel::requires_arch) noexcept { static_assert(batch::size == batch::size, "Source and index sizes must match"); const auto tmp = batch_cast(src); kernel::scatter(tmp, dst, index, A {}); } // shuffle namespace detail { constexpr bool is_swizzle_fst(size_t) { return true; } template constexpr bool is_swizzle_fst(size_t bsize, ITy index, ITys... indices) { return index < bsize && is_swizzle_fst(bsize, indices...); } constexpr bool is_swizzle_snd(size_t) { return true; } template constexpr bool is_swizzle_snd(size_t bsize, ITy index, ITys... indices) { return index >= bsize && is_swizzle_snd(bsize, indices...); } constexpr bool is_zip_lo(size_t) { return true; } template constexpr bool is_zip_lo(size_t bsize, ITy0 index0, ITy1 index1, ITys... indices) { return index0 == (bsize - (sizeof...(indices) + 2)) && index1 == (2 * bsize - (sizeof...(indices) + 2)) && is_zip_lo(bsize, indices...); } constexpr bool is_zip_hi(size_t) { return true; } template constexpr bool is_zip_hi(size_t bsize, ITy0 index0, ITy1 index1, ITys... indices) { return index0 == (bsize / 2 + bsize - (sizeof...(indices) + 2)) && index1 == (bsize / 2 + 2 * bsize - (sizeof...(indices) + 2)) && is_zip_hi(bsize, indices...); } constexpr bool is_select(size_t) { return true; } template constexpr bool is_select(size_t bsize, ITy index, ITys... indices) { return (index < bsize ? index : index - bsize) == (bsize - sizeof...(ITys)) && is_select(bsize, indices...); } } template inline batch shuffle(batch const& x, batch const& y, batch_constant, Indices...>, requires_arch) noexcept { constexpr size_t bsize = sizeof...(Indices); // Detect common patterns XSIMD_IF_CONSTEXPR(detail::is_swizzle_fst(bsize, Indices...)) { return swizzle(x, batch_constant, ((Indices >= bsize) ? 0 /* never happens */ : Indices)...>()); } XSIMD_IF_CONSTEXPR(detail::is_swizzle_snd(bsize, Indices...)) { return swizzle(y, batch_constant, ((Indices >= bsize) ? (Indices - bsize) : 0 /* never happens */)...>()); } XSIMD_IF_CONSTEXPR(detail::is_zip_lo(bsize, Indices...)) { return zip_lo(x, y); } XSIMD_IF_CONSTEXPR(detail::is_zip_hi(bsize, Indices...)) { return zip_hi(x, y); } XSIMD_IF_CONSTEXPR(detail::is_select(bsize, Indices...)) { return select(batch_bool_constant, (Indices < bsize)...>(), x, y); } #if defined(__has_builtin) #if __has_builtin(__builtin_shuffle_vector) #define builtin_shuffle __builtin_shuffle_vector #endif #endif #if defined(builtin_shuffle) return builtin_shuffle(x.data, y.data, Indices...); // FIXME: my experiments show that GCC only correctly optimizes this builtin // starting at GCC 13, where it already has __builtin_shuffle_vector // // #elif __has_builtin(__builtin_shuffle) || GCC >= 6 // typedef ITy integer_vector_type __attribute__((vector_size(sizeof(batch)))); // return __builtin_shuffle(x.data, y.data, integer_vector_type{Indices...}); #else // Use a generic_pattern. It is suboptimal but clang optimizes this // pretty well. batch x_lane = swizzle(x, batch_constant, ((Indices >= bsize) ? (Indices - bsize) : Indices)...>()); batch y_lane = swizzle(y, batch_constant, ((Indices >= bsize) ? (Indices - bsize) : Indices)...>()); batch_bool_constant, (Indices < bsize)...> select_x_lane; return select(select_x_lane, x_lane, y_lane); #endif } // store template inline void store(batch_bool const& self, bool* mem, requires_arch) noexcept { using batch_type = batch; constexpr auto size = batch_bool::size; alignas(A::alignment()) T buffer[size]; kernel::store_aligned(&buffer[0], batch_type(self), A {}); for (std::size_t i = 0; i < size; ++i) mem[i] = bool(buffer[i]); } // store_aligned template inline void store_aligned(T_out* mem, batch const& self, requires_arch) noexcept { static_assert(!std::is_same::value, "there should be a direct store for this type combination"); alignas(A::alignment()) T_in buffer[batch::size]; store_aligned(&buffer[0], self); std::copy(std::begin(buffer), std::end(buffer), mem); } // store_unaligned template inline void store_unaligned(T_out* mem, batch const& self, requires_arch) noexcept { static_assert(!std::is_same::value, "there should be a direct store for this type combination"); return store_aligned(mem, self, generic {}); } // swizzle template inline batch, A> swizzle(batch, A> const& self, batch_constant, Vs...> mask, requires_arch) noexcept { return { swizzle(self.real(), mask), swizzle(self.imag(), mask) }; } template inline batch swizzle(batch const& self, batch mask, requires_arch) noexcept { constexpr size_t size = batch::size; alignas(A::alignment()) T self_buffer[size]; store_aligned(&self_buffer[0], self); alignas(A::alignment()) ITy mask_buffer[size]; store_aligned(&mask_buffer[0], mask); alignas(A::alignment()) T out_buffer[size]; for (size_t i = 0; i < size; ++i) out_buffer[i] = self_buffer[mask_buffer[i]]; return batch::load_aligned(out_buffer); } template inline batch, A> swizzle(batch, A> const& self, batch mask, requires_arch) noexcept { return { swizzle(self.real(), mask), swizzle(self.imag(), mask) }; } // load_complex_aligned namespace detail { template inline batch, A> load_complex(batch const& /*hi*/, batch const& /*lo*/, requires_arch) noexcept { static_assert(std::is_same::value, "load_complex not implemented for the required architecture"); } template inline batch complex_high(batch, A> const& /*src*/, requires_arch) noexcept { static_assert(std::is_same::value, "complex_high not implemented for the required architecture"); } template inline batch complex_low(batch, A> const& /*src*/, requires_arch) noexcept { static_assert(std::is_same::value, "complex_low not implemented for the required architecture"); } } template inline batch, A> load_complex_aligned(std::complex const* mem, convert>, requires_arch) noexcept { using real_batch = batch; T_in const* buffer = reinterpret_cast(mem); real_batch hi = real_batch::load_aligned(buffer), lo = real_batch::load_aligned(buffer + real_batch::size); return detail::load_complex(hi, lo, A {}); } // load_complex_unaligned template inline batch, A> load_complex_unaligned(std::complex const* mem, convert>, requires_arch) noexcept { using real_batch = batch; T_in const* buffer = reinterpret_cast(mem); real_batch hi = real_batch::load_unaligned(buffer), lo = real_batch::load_unaligned(buffer + real_batch::size); return detail::load_complex(hi, lo, A {}); } // store_complex_aligned template inline void store_complex_aligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept { using real_batch = batch; real_batch hi = detail::complex_high(src, A {}); real_batch lo = detail::complex_low(src, A {}); T_out* buffer = reinterpret_cast(dst); lo.store_aligned(buffer); hi.store_aligned(buffer + real_batch::size); } // store_compelx_unaligned template inline void store_complex_unaligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept { using real_batch = batch; real_batch hi = detail::complex_high(src, A {}); real_batch lo = detail::complex_low(src, A {}); T_out* buffer = reinterpret_cast(dst); lo.store_unaligned(buffer); hi.store_unaligned(buffer + real_batch::size); } } } #endif xsimd-12.1.1/include/xsimd/arch/generic/xsimd_generic_rounding.hpp000066400000000000000000000052711453610362700252720ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_GENERIC_ROUNDING_HPP #define XSIMD_GENERIC_ROUNDING_HPP #include "./xsimd_generic_details.hpp" namespace xsimd { namespace kernel { using namespace types; // ceil template inline batch ceil(batch const& self, requires_arch) noexcept { batch truncated_self = trunc(self); return select(truncated_self < self, truncated_self + 1, truncated_self); } // floor template inline batch floor(batch const& self, requires_arch) noexcept { batch truncated_self = trunc(self); return select(truncated_self > self, truncated_self - 1, truncated_self); } // round template inline batch round(batch const& self, requires_arch) noexcept { auto v = abs(self); auto c = ceil(v); auto cp = select(c - 0.5 > v, c - 1, c); return select(v > constants::maxflint>(), self, copysign(cp, self)); } // trunc template ::value, void>::type> inline batch trunc(batch const& self, requires_arch) noexcept { return self; } template inline batch trunc(batch const& self, requires_arch) noexcept { return select(abs(self) < constants::maxflint>(), to_float(to_int(self)), self); } template inline batch trunc(batch const& self, requires_arch) noexcept { return select(abs(self) < constants::maxflint>(), to_float(to_int(self)), self); } } } #endif xsimd-12.1.1/include/xsimd/arch/generic/xsimd_generic_trigo.hpp000066400000000000000000001272471453610362700246010ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_GENERIC_TRIGO_HPP #define XSIMD_GENERIC_TRIGO_HPP #include "./xsimd_generic_details.hpp" #include namespace xsimd { namespace kernel { /* origin: boost/simd/arch/common/detail/simd/trig_base.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ using namespace types; // acos template inline batch acos(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type x = abs(self); auto x_larger_05 = x > batch_type(0.5); x = select(x_larger_05, sqrt(fma(batch_type(-0.5), x, batch_type(0.5))), self); x = asin(x); x = select(x_larger_05, x + x, x); x = select(self < batch_type(-0.5), constants::pi() - x, x); return select(x_larger_05, x, constants::pio2() - x); } template inline batch, A> acos(const batch, A>& z, requires_arch) noexcept { using batch_type = batch, A>; using real_batch = typename batch_type::real_batch; batch_type tmp = asin(z); return { constants::pio2() - tmp.real(), -tmp.imag() }; } // acosh /* origin: boost/simd/arch/common/simd/function/acosh.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template inline batch acosh(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type x = self - batch_type(1.); auto test = x > constants::oneotwoeps(); batch_type z = select(test, self, x + sqrt(x + x + x * x)); batch_type l1pz = log1p(z); return select(test, l1pz + constants::log_2(), l1pz); } template inline batch, A> acosh(const batch, A>& z, requires_arch) noexcept { using batch_type = batch, A>; batch_type w = acos(z); w = batch_type(-w.imag(), w.real()); return w; } // asin template inline batch asin(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type x = abs(self); batch_type sign = bitofsign(self); auto x_larger_05 = x > batch_type(0.5); batch_type z = select(x_larger_05, batch_type(0.5) * (batch_type(1.) - x), x * x); x = select(x_larger_05, sqrt(z), x); batch_type z1 = detail::horner(z); z1 = fma(z1, z * x, x); z = select(x_larger_05, constants::pio2() - (z1 + z1), z1); return z ^ sign; } template inline batch asin(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type x = abs(self); auto small_cond = x < constants::sqrteps(); batch_type ct1 = batch_type(bit_cast(int64_t(0x3fe4000000000000))); batch_type zz1 = batch_type(1.) - x; batch_type vp = zz1 * detail::horner(zz1) / detail::horner1(zz1); zz1 = sqrt(zz1 + zz1); batch_type z = constants::pio4() - zz1; zz1 = fms(zz1, vp, constants::pio_2lo()); z = z - zz1; zz1 = z + constants::pio4(); batch_type zz2 = self * self; z = zz2 * detail::horner(zz2) / detail::horner1(zz2); zz2 = fma(x, z, x); return select(x > batch_type(1.), constants::nan(), select(small_cond, x, select(x > ct1, zz1, zz2)) ^ bitofsign(self)); } template inline batch, A> asin(const batch, A>& z, requires_arch) noexcept { using batch_type = batch, A>; using real_batch = typename batch_type::real_batch; real_batch x = z.real(); real_batch y = z.imag(); batch_type ct(-y, x); batch_type zz(real_batch(1.) - (x - y) * (x + y), -2 * x * y); zz = log(ct + sqrt(zz)); batch_type resg(zz.imag(), -zz.real()); return select(y == real_batch(0.), select(fabs(x) > real_batch(1.), batch_type(constants::pio2(), real_batch(0.)), batch_type(asin(x), real_batch(0.))), resg); } // asinh /* origin: boost/simd/arch/common/simd/function/asinh.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ namespace detail { template ::value, void>::type> inline batch average(const batch& x1, const batch& x2) noexcept { return (x1 & x2) + ((x1 ^ x2) >> 1); } template inline batch averagef(const batch& x1, const batch& x2) noexcept { using batch_type = batch; return fma(x1, batch_type(0.5), x2 * batch_type(0.5)); } template inline batch average(batch const& x1, batch const& x2) noexcept { return averagef(x1, x2); } template inline batch average(batch const& x1, batch const& x2) noexcept { return averagef(x1, x2); } } template inline batch asinh(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type x = abs(self); auto lthalf = x < batch_type(0.5); batch_type x2 = x * x; batch_type bts = bitofsign(self); batch_type z(0.); if (any(lthalf)) { z = detail::horner(x2) * x; if (all(lthalf)) return z ^ bts; } batch_type tmp = select(x > constants::oneosqrteps(), x, detail::average(x, hypot(batch_type(1.), x))); #ifndef XSIMD_NO_NANS return select(isnan(self), constants::nan(), select(lthalf, z, log(tmp) + constants::log_2()) ^ bts); #else return select(lthalf, z, log(tmp) + constants::log_2()) ^ bts; #endif } template inline batch asinh(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type x = abs(self); auto test = x > constants::oneosqrteps(); batch_type z = select(test, x - batch_type(1.), x + x * x / (batch_type(1.) + hypot(batch_type(1.), x))); #ifndef XSIMD_NO_INFINITIES z = select(x == constants::infinity(), x, z); #endif batch_type l1pz = log1p(z); z = select(test, l1pz + constants::log_2(), l1pz); return bitofsign(self) ^ z; } template inline batch, A> asinh(const batch, A>& z, requires_arch) noexcept { using batch_type = batch, A>; batch_type w = asin(batch_type(-z.imag(), z.real())); w = batch_type(w.imag(), -w.real()); return w; } // atan namespace detail { template static inline batch kernel_atan(const batch& x, const batch& recx) noexcept { using batch_type = batch; const auto flag1 = x < constants::tan3pio8(); const auto flag2 = (x >= batch_type(bit_cast((uint32_t)0x3ed413cd))) && flag1; batch_type yy = select(flag1, batch_type(0.), constants::pio2()); yy = select(flag2, constants::pio4(), yy); batch_type xx = select(flag1, x, -recx); xx = select(flag2, (x - batch_type(1.)) / (x + batch_type(1.)), xx); const batch_type z = xx * xx; batch_type z1 = detail::horner(z); z1 = fma(xx, z1 * z, xx); z1 = select(flag2, z1 + constants::pio_4lo(), z1); z1 = select(!flag1, z1 + constants::pio_2lo(), z1); return yy + z1; } template static inline batch kernel_atan(const batch& x, const batch& recx) noexcept { using batch_type = batch; const auto flag1 = x < constants::tan3pio8(); const auto flag2 = (x >= constants::tanpio8()) && flag1; batch_type yy = select(flag1, batch_type(0.), constants::pio2()); yy = select(flag2, constants::pio4(), yy); batch_type xx = select(flag1, x, -recx); xx = select(flag2, (x - batch_type(1.)) / (x + batch_type(1.)), xx); batch_type z = xx * xx; z *= detail::horner(z) / detail::horner1(z); z = fma(xx, z, xx); z = select(flag2, z + constants::pio_4lo(), z); z = z + select(flag1, batch_type(0.), constants::pio_2lo()); return yy + z; } } template inline batch atan(batch const& self, requires_arch) noexcept { using batch_type = batch; const batch_type absa = abs(self); const batch_type x = detail::kernel_atan(absa, batch_type(1.) / absa); return x ^ bitofsign(self); } template inline batch, A> atan(const batch, A>& z, requires_arch) noexcept { using batch_type = batch, A>; using real_batch = typename batch_type::real_batch; real_batch x = z.real(); real_batch y = z.imag(); real_batch x2 = x * x; real_batch one(1.); real_batch a = one - x2 - (y * y); real_batch w = 0.5 * atan2(2. * x, a); real_batch num = y + one; num = x2 + num * num; real_batch den = y - one; den = x2 + den * den; batch_type res = select((x == real_batch(0.)) && (y == real_batch(1.)), batch_type(real_batch(0.), constants::infinity()), batch_type(w, 0.25 * log(num / den))); return res; } // atanh /* origin: boost/simd/arch/common/simd/function/acosh.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template inline batch atanh(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type x = abs(self); batch_type t = x + x; batch_type z = batch_type(1.) - x; auto test = x < batch_type(0.5); batch_type tmp = select(test, x, t) / z; return bitofsign(self) ^ (batch_type(0.5) * log1p(select(test, fma(t, tmp, t), tmp))); } template inline batch, A> atanh(const batch, A>& z, requires_arch) noexcept { using batch_type = batch, A>; batch_type w = atan(batch_type(-z.imag(), z.real())); w = batch_type(w.imag(), -w.real()); return w; } // atan2 template inline batch atan2(batch const& self, batch const& other, requires_arch) noexcept { using batch_type = batch; const batch_type q = abs(self / other); const batch_type z = detail::kernel_atan(q, batch_type(1.) / q); return select(other > batch_type(0.), z, constants::pi() - z) * signnz(self); } // cos namespace detail { template inline batch quadrant(const batch& x) noexcept { return x & batch(3); } template inline batch quadrant(const batch& x) noexcept { return to_float(quadrant(to_int(x))); } template inline batch quadrant(const batch& x) noexcept { using batch_type = batch; batch_type a = x * batch_type(0.25); return (a - floor(a)) * batch_type(4.); } /* origin: boost/simd/arch/common/detail/simd/f_trig_evaluation.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template inline batch cos_eval(const batch& z) noexcept { using batch_type = batch; batch_type y = detail::horner(z); return batch_type(1.) + fma(z, batch_type(-0.5), y * z * z); } template inline batch sin_eval(const batch& z, const batch& x) noexcept { using batch_type = batch; batch_type y = detail::horner(z); return fma(y * z, x, x); } template static inline batch base_tancot_eval(const batch& z) noexcept { using batch_type = batch; batch_type zz = z * z; batch_type y = detail::horner(zz); return fma(y, zz * z, z); } template static inline batch tan_eval(const batch& z, const BB& test) noexcept { using batch_type = batch; batch_type y = base_tancot_eval(z); return select(test, y, -batch_type(1.) / y); } template static inline batch cot_eval(const batch& z, const BB& test) noexcept { using batch_type = batch; batch_type y = base_tancot_eval(z); return select(test, batch_type(1.) / y, -y); } /* origin: boost/simd/arch/common/detail/simd/d_trig_evaluation.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template static inline batch cos_eval(const batch& z) noexcept { using batch_type = batch; batch_type y = detail::horner(z); return batch_type(1.) - y * z; } template static inline batch sin_eval(const batch& z, const batch& x) noexcept { using batch_type = batch; batch_type y = detail::horner(z); return fma(y * z, x, x); } template static inline batch base_tancot_eval(const batch& z) noexcept { using batch_type = batch; batch_type zz = z * z; batch_type num = detail::horner(zz); batch_type den = detail::horner1(zz); return fma(z, (zz * (num / den)), z); } template static inline batch tan_eval(const batch& z, const BB& test) noexcept { using batch_type = batch; batch_type y = base_tancot_eval(z); return select(test, y, -batch_type(1.) / y); } template static inline batch cot_eval(const batch& z, const BB& test) noexcept { using batch_type = batch; batch_type y = base_tancot_eval(z); return select(test, batch_type(1.) / y, -y); } /* origin: boost/simd/arch/common/detail/simd/trig_reduction.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ struct trigo_radian_tag { }; struct trigo_pi_tag { }; template struct trigo_reducer { static inline B reduce(const B& x, B& xr) noexcept { if (all(x <= constants::pio4())) { xr = x; return B(0.); } else if (all(x <= constants::pio2())) { auto test = x > constants::pio4(); xr = x - constants::pio2_1(); xr -= constants::pio2_2(); xr -= constants::pio2_3(); xr = select(test, xr, x); return select(test, B(1.), B(0.)); } else if (all(x <= constants::twentypi())) { B xi = nearbyint(x * constants::twoopi()); xr = fnma(xi, constants::pio2_1(), x); xr -= xi * constants::pio2_2(); xr -= xi * constants::pio2_3(); return quadrant(xi); } else if (all(x <= constants::mediumpi())) { B fn = nearbyint(x * constants::twoopi()); B r = x - fn * constants::pio2_1(); B w = fn * constants::pio2_1t(); B t = r; w = fn * constants::pio2_2(); r = t - w; w = fn * constants::pio2_2t() - ((t - r) - w); t = r; w = fn * constants::pio2_3(); r = t - w; w = fn * constants::pio2_3t() - ((t - r) - w); xr = r - w; return quadrant(fn); } else { static constexpr std::size_t size = B::size; using value_type = typename B::value_type; alignas(B) std::array tmp; alignas(B) std::array txr; alignas(B) std::array args; x.store_aligned(args.data()); for (std::size_t i = 0; i < size; ++i) { double arg = args[i]; if (arg == std::numeric_limits::infinity()) { tmp[i] = 0.; txr[i] = std::numeric_limits::quiet_NaN(); } else { double y[2]; std::int32_t n = ::xsimd::detail::__ieee754_rem_pio2(arg, y); tmp[i] = value_type(n & 3); txr[i] = value_type(y[0]); } } xr = B::load_aligned(&txr[0]); B res = B::load_aligned(&tmp[0]); return res; } } }; template struct trigo_reducer { static inline B reduce(const B& x, B& xr) noexcept { B xi = nearbyint(x * B(2.)); B x2 = x - xi * B(0.5); xr = x2 * constants::pi(); return quadrant(xi); } }; } template inline batch cos(batch const& self, requires_arch) noexcept { using batch_type = batch; const batch_type x = abs(self); batch_type xr = constants::nan(); const batch_type n = detail::trigo_reducer::reduce(x, xr); auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.)); auto swap_bit = fma(batch_type(-2.), tmp, n); auto sign_bit = select((swap_bit ^ tmp) != batch_type(0.), constants::signmask(), batch_type(0.)); const batch_type z = xr * xr; const batch_type se = detail::sin_eval(z, xr); const batch_type ce = detail::cos_eval(z); const batch_type z1 = select(swap_bit != batch_type(0.), se, ce); return z1 ^ sign_bit; } template inline batch, A> cos(batch, A> const& z, requires_arch) noexcept { return { cos(z.real()) * cosh(z.imag()), -sin(z.real()) * sinh(z.imag()) }; } // cosh /* origin: boost/simd/arch/common/simd/function/cosh.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template inline batch cosh(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type x = abs(self); auto test1 = x > (constants::maxlog() - constants::log_2()); batch_type fac = select(test1, batch_type(0.5), batch_type(1.)); batch_type tmp = exp(x * fac); batch_type tmp1 = batch_type(0.5) * tmp; return select(test1, tmp1 * tmp, detail::average(tmp, batch_type(1.) / tmp)); } template inline batch, A> cosh(const batch, A>& z, requires_arch) noexcept { auto x = z.real(); auto y = z.imag(); return { cosh(x) * cos(y), sinh(x) * sin(y) }; } // sin namespace detail { template inline batch sin(batch const& self, Tag = Tag()) noexcept { using batch_type = batch; const batch_type x = abs(self); batch_type xr = constants::nan(); const batch_type n = detail::trigo_reducer::reduce(x, xr); auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.)); auto swap_bit = fma(batch_type(-2.), tmp, n); auto sign_bit = bitofsign(self) ^ select(tmp != batch_type(0.), constants::signmask(), batch_type(0.)); const batch_type z = xr * xr; const batch_type se = detail::sin_eval(z, xr); const batch_type ce = detail::cos_eval(z); const batch_type z1 = select(swap_bit == batch_type(0.), se, ce); return z1 ^ sign_bit; } } template inline batch sin(batch const& self, requires_arch) noexcept { return detail::sin(self); } template inline batch, A> sin(batch, A> const& z, requires_arch) noexcept { return { sin(z.real()) * cosh(z.imag()), cos(z.real()) * sinh(z.imag()) }; } // sincos template inline std::pair, batch> sincos(batch const& self, requires_arch) noexcept { using batch_type = batch; const batch_type x = abs(self); batch_type xr = constants::nan(); const batch_type n = detail::trigo_reducer::reduce(x, xr); auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.)); auto swap_bit = fma(batch_type(-2.), tmp, n); const batch_type z = xr * xr; const batch_type se = detail::sin_eval(z, xr); const batch_type ce = detail::cos_eval(z); auto sin_sign_bit = bitofsign(self) ^ select(tmp != batch_type(0.), constants::signmask(), batch_type(0.)); const batch_type sin_z1 = select(swap_bit == batch_type(0.), se, ce); auto cos_sign_bit = select((swap_bit ^ tmp) != batch_type(0.), constants::signmask(), batch_type(0.)); const batch_type cos_z1 = select(swap_bit != batch_type(0.), se, ce); return std::make_pair(sin_z1 ^ sin_sign_bit, cos_z1 ^ cos_sign_bit); } template inline std::pair, A>, batch, A>> sincos(batch, A> const& z, requires_arch) noexcept { using batch_type = batch, A>; using real_batch = typename batch_type::real_batch; real_batch rcos = cos(z.real()); real_batch rsin = sin(z.real()); real_batch icosh = cosh(z.imag()); real_batch isinh = sinh(z.imag()); return std::make_pair(batch_type(rsin * icosh, rcos * isinh), batch_type(rcos * icosh, -rsin * isinh)); } // sinh namespace detail { /* origin: boost/simd/arch/common/detail/generic/sinh_kernel.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template inline batch sinh_kernel(batch const& self) noexcept { using batch_type = batch; batch_type sqr_self = self * self; return detail::horner(sqr_self) * self; } template inline batch sinh_kernel(batch const& self) noexcept { using batch_type = batch; batch_type sqrself = self * self; return fma(self, (detail::horner(sqrself) / detail::horner1(sqrself)) * sqrself, self); } } /* origin: boost/simd/arch/common/simd/function/sinh.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template inline batch sinh(batch const& a, requires_arch) noexcept { using batch_type = batch; batch_type half(0.5); batch_type x = abs(a); auto lt1 = x < batch_type(1.); batch_type bts = bitofsign(a); batch_type z(0.); if (any(lt1)) { z = detail::sinh_kernel(x); if (all(lt1)) return z ^ bts; } auto test1 = x > (constants::maxlog() - constants::log_2()); batch_type fac = select(test1, half, batch_type(1.)); batch_type tmp = exp(x * fac); batch_type tmp1 = half * tmp; batch_type r = select(test1, tmp1 * tmp, tmp1 - half / tmp); return select(lt1, z, r) ^ bts; } template inline batch, A> sinh(const batch, A>& z, requires_arch) noexcept { auto x = z.real(); auto y = z.imag(); return { sinh(x) * cos(y), cosh(x) * sin(y) }; } // tan template inline batch tan(batch const& self, requires_arch) noexcept { using batch_type = batch; const batch_type x = abs(self); batch_type xr = constants::nan(); const batch_type n = detail::trigo_reducer::reduce(x, xr); auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.)); auto swap_bit = fma(batch_type(-2.), tmp, n); auto test = (swap_bit == batch_type(0.)); const batch_type y = detail::tan_eval(xr, test); return y ^ bitofsign(self); } template inline batch, A> tan(batch, A> const& z, requires_arch) noexcept { using batch_type = batch, A>; using real_batch = typename batch_type::real_batch; real_batch d = cos(2 * z.real()) + cosh(2 * z.imag()); batch_type winf(constants::infinity(), constants::infinity()); real_batch wreal = sin(2 * z.real()) / d; real_batch wimag = sinh(2 * z.imag()); batch_type wres = select(isinf(wimag), batch_type(wreal, real_batch(1.)), batch_type(wreal, wimag / d)); return select(d == real_batch(0.), winf, wres); } // tanh namespace detail { /* origin: boost/simd/arch/common/detail/generic/tanh_kernel.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template struct tanh_kernel; template struct tanh_kernel> { using batch_type = batch; static inline batch_type tanh(const batch_type& x) noexcept { batch_type sqrx = x * x; return fma(detail::horner(sqrx) * sqrx, x, x); } static inline batch_type cotanh(const batch_type& x) noexcept { return batch_type(1.) / tanh(x); } }; template struct tanh_kernel> { using batch_type = batch; static inline batch_type tanh(const batch_type& x) noexcept { batch_type sqrx = x * x; return fma(sqrx * p(sqrx) / q(sqrx), x, x); } static inline batch_type cotanh(const batch_type& x) noexcept { batch_type sqrx = x * x; batch_type qval = q(sqrx); return qval / (x * fma(p(sqrx), sqrx, qval)); } static inline batch_type p(const batch_type& x) noexcept { return detail::horner(x); } static inline batch_type q(const batch_type& x) noexcept { return detail::horner1(x); } }; } /* origin: boost/simd/arch/common/simd/function/tanh.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template inline batch tanh(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type one(1.); batch_type x = abs(self); auto test = x < (batch_type(5.) / batch_type(8.)); batch_type bts = bitofsign(self); batch_type z = one; if (any(test)) { z = detail::tanh_kernel::tanh(x); if (all(test)) return z ^ bts; } batch_type r = fma(batch_type(-2.), one / (one + exp(x + x)), one); return select(test, z, r) ^ bts; } template inline batch, A> tanh(const batch, A>& z, requires_arch) noexcept { using real_batch = typename batch, A>::real_batch; auto x = z.real(); auto y = z.imag(); real_batch two(2); auto d = cosh(two * x) + cos(two * y); return { sinh(two * x) / d, sin(two * y) / d }; } } } #endif xsimd-12.1.1/include/xsimd/arch/xsimd_avx.hpp000066400000000000000000002341331453610362700211340ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX_HPP #define XSIMD_AVX_HPP #include #include #include #include "../types/xsimd_avx_register.hpp" namespace xsimd { namespace kernel { using namespace types; // fwd template inline batch insert(batch const& self, T val, index, requires_arch) noexcept; namespace detail { inline void split_avx(__m256i val, __m128i& low, __m128i& high) noexcept { low = _mm256_castsi256_si128(val); high = _mm256_extractf128_si256(val, 1); } inline void split_avx(__m256 val, __m128& low, __m128& high) noexcept { low = _mm256_castps256_ps128(val); high = _mm256_extractf128_ps(val, 1); } inline void split_avx(__m256d val, __m128d& low, __m128d& high) noexcept { low = _mm256_castpd256_pd128(val); high = _mm256_extractf128_pd(val, 1); } inline __m256i merge_sse(__m128i low, __m128i high) noexcept { return _mm256_insertf128_si256(_mm256_castsi128_si256(low), high, 1); } inline __m256 merge_sse(__m128 low, __m128 high) noexcept { return _mm256_insertf128_ps(_mm256_castps128_ps256(low), high, 1); } inline __m256d merge_sse(__m128d low, __m128d high) noexcept { return _mm256_insertf128_pd(_mm256_castpd128_pd256(low), high, 1); } template inline __m256i fwd_to_sse(F f, __m256i self) noexcept { __m128i self_low, self_high; split_avx(self, self_low, self_high); __m128i res_low = f(self_low); __m128i res_high = f(self_high); return merge_sse(res_low, res_high); } template inline __m256i fwd_to_sse(F f, __m256i self, __m256i other) noexcept { __m128i self_low, self_high, other_low, other_high; split_avx(self, self_low, self_high); split_avx(other, other_low, other_high); __m128i res_low = f(self_low, other_low); __m128i res_high = f(self_high, other_high); return merge_sse(res_low, res_high); } template inline __m256i fwd_to_sse(F f, __m256i self, int32_t other) noexcept { __m128i self_low, self_high; split_avx(self, self_low, self_high); __m128i res_low = f(self_low, other); __m128i res_high = f(self_high, other); return merge_sse(res_low, res_high); } } // abs template inline batch abs(batch const& self, requires_arch) noexcept { __m256 sign_mask = _mm256_set1_ps(-0.f); // -0.f = 1 << 31 return _mm256_andnot_ps(sign_mask, self); } template inline batch abs(batch const& self, requires_arch) noexcept { __m256d sign_mask = _mm256_set1_pd(-0.f); // -0.f = 1 << 31 return _mm256_andnot_pd(sign_mask, self); } // add template ::value, void>::type> inline batch add(batch const& self, batch const& other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return add(batch(s), batch(o)); }, self, other); } template inline batch add(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_add_ps(self, other); } template inline batch add(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_add_pd(self, other); } // all template inline bool all(batch_bool const& self, requires_arch) noexcept { return _mm256_testc_ps(self, batch_bool(true)) != 0; } template inline bool all(batch_bool const& self, requires_arch) noexcept { return _mm256_testc_pd(self, batch_bool(true)) != 0; } template ::value, void>::type> inline bool all(batch_bool const& self, requires_arch) noexcept { return _mm256_testc_si256(self, batch_bool(true)) != 0; } // any template inline bool any(batch_bool const& self, requires_arch) noexcept { return !_mm256_testz_ps(self, self); } template inline bool any(batch_bool const& self, requires_arch) noexcept { return !_mm256_testz_pd(self, self); } template ::value, void>::type> inline bool any(batch_bool const& self, requires_arch) noexcept { return !_mm256_testz_si256(self, self); } // batch_bool_cast template inline batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch) noexcept { return { bitwise_cast(batch(self.data)).data }; } // bitwise_and template inline batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_and_ps(self, other); } template inline batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_and_pd(self, other); } template inline batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_and_ps(self, other); } template inline batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_and_pd(self, other); } template ::value, void>::type> inline batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return bitwise_and(batch(s), batch(o)); }, self, other); } template ::value, void>::type> inline batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return bitwise_and(batch(s), batch(o)); }, self, other); } // bitwise_andnot template inline batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_andnot_ps(other, self); } template inline batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_andnot_pd(other, self); } template inline batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_andnot_ps(other, self); } template inline batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_andnot_pd(other, self); } template ::value, void>::type> inline batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return bitwise_andnot(batch(s), batch(o)); }, self, other); } template ::value, void>::type> inline batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return bitwise_andnot(batch(s), batch(o)); }, self, other); } // bitwise_lshift template ::value, void>::type> inline batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, int32_t o) noexcept { return bitwise_lshift(batch(s), o, sse4_2 {}); }, self, other); } // bitwise_not template ::value, void>::type> inline batch bitwise_not(batch const& self, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s) noexcept { return bitwise_not(batch(s), sse4_2 {}); }, self); } template ::value, void>::type> inline batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s) noexcept { return bitwise_not(batch_bool(s), sse4_2 {}); }, self); } // bitwise_or template inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_or_ps(self, other); } template inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_or_pd(self, other); } template inline batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_or_ps(self, other); } template inline batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_or_pd(self, other); } template ::value, void>::type> inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return bitwise_or(batch(s), batch(o)); }, self, other); } template ::value, void>::type> inline batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return bitwise_or(batch_bool(s), batch_bool(o)); }, self, other); } // bitwise_rshift template ::value, void>::type> inline batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, int32_t o) noexcept { return bitwise_rshift(batch(s), o, sse4_2 {}); }, self, other); } // bitwise_xor template inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_xor_ps(self, other); } template inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_xor_pd(self, other); } template inline batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_xor_ps(self, other); } template inline batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_xor_pd(self, other); } template ::value, void>::type> inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return bitwise_xor(batch(s), batch(o), sse4_2 {}); }, self, other); } template ::value, void>::type> inline batch bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return bitwise_xor(batch_bool(s), batch_bool(o), sse4_2 {}); }, self, other); } // bitwise_cast template ::value, void>::type> inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm256_castsi256_ps(self); } template ::value, void>::type> inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm256_castsi256_pd(self); } template ::type>::value, void>::type> inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return batch(self.data); } template inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm256_castps_pd(self); } template ::value, void>::type> inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm256_castps_si256(self); } template inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm256_castpd_ps(self); } template ::value, void>::type> inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm256_castpd_si256(self); } // bitwise_not template inline batch bitwise_not(batch const& self, requires_arch) noexcept { return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(-1))); } template inline batch bitwise_not(batch const& self, requires_arch) noexcept { return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi32(-1))); } template inline batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept { return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(-1))); } template inline batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept { return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi32(-1))); } // broadcast template ::value, void>::type> inline batch broadcast(T val, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_set1_epi8(val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_set1_epi16(val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_set1_epi32(val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_set1_epi64x(val); } else { assert(false && "unsupported"); return {}; } } template inline batch broadcast(float val, requires_arch) noexcept { return _mm256_set1_ps(val); } template inline batch broadcast(double val, requires_arch) noexcept { return _mm256_set1_pd(val); } // ceil template inline batch ceil(batch const& self, requires_arch) noexcept { return _mm256_ceil_ps(self); } template inline batch ceil(batch const& self, requires_arch) noexcept { return _mm256_ceil_pd(self); } namespace detail { // On clang, _mm256_extractf128_ps is built upon build_shufflevector // which require index parameter to be a constant template inline B get_half_complex_f(const B& real, const B& imag) noexcept { __m128 tmp0 = _mm256_extractf128_ps(real, index); __m128 tmp1 = _mm256_extractf128_ps(imag, index); __m128 tmp2 = _mm_unpackhi_ps(tmp0, tmp1); tmp0 = _mm_unpacklo_ps(tmp0, tmp1); __m256 res = real; res = _mm256_insertf128_ps(res, tmp0, 0); res = _mm256_insertf128_ps(res, tmp2, 1); return res; } template inline B get_half_complex_d(const B& real, const B& imag) noexcept { __m128d tmp0 = _mm256_extractf128_pd(real, index); __m128d tmp1 = _mm256_extractf128_pd(imag, index); __m128d tmp2 = _mm_unpackhi_pd(tmp0, tmp1); tmp0 = _mm_unpacklo_pd(tmp0, tmp1); __m256d res = real; res = _mm256_insertf128_pd(res, tmp0, 0); res = _mm256_insertf128_pd(res, tmp2, 1); return res; } // complex_low template inline batch complex_low(batch, A> const& self, requires_arch) noexcept { return get_half_complex_f<0>(self.real(), self.imag()); } template inline batch complex_low(batch, A> const& self, requires_arch) noexcept { return get_half_complex_d<0>(self.real(), self.imag()); } // complex_high template inline batch complex_high(batch, A> const& self, requires_arch) noexcept { return get_half_complex_f<1>(self.real(), self.imag()); } template inline batch complex_high(batch, A> const& self, requires_arch) noexcept { return get_half_complex_d<1>(self.real(), self.imag()); } } // fast_cast namespace detail { template inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm256_cvtepi32_ps(self); } template inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm256_cvttps_epi32(self); } } // decr_if template ::value, void>::type> inline batch decr_if(batch const& self, batch_bool const& mask, requires_arch) noexcept { return self + batch(mask.data); } // div template inline batch div(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_div_ps(self, other); } template inline batch div(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_div_pd(self, other); } // eq template inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_cmp_ps(self, other, _CMP_EQ_OQ); } template inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_cmp_pd(self, other, _CMP_EQ_OQ); } template inline batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return ~(self != other); } template inline batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return ~(self != other); } template ::value, void>::type> inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return eq(batch(s), batch(o), sse4_2 {}); }, self, other); } template ::value, void>::type> inline batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return ~(self != other); } // floor template inline batch floor(batch const& self, requires_arch) noexcept { return _mm256_floor_ps(self); } template inline batch floor(batch const& self, requires_arch) noexcept { return _mm256_floor_pd(self); } // from_mask template inline batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept { alignas(A::alignment()) static const uint64_t lut32[] = { 0x0000000000000000ul, 0x00000000FFFFFFFFul, 0xFFFFFFFF00000000ul, 0xFFFFFFFFFFFFFFFFul, }; assert(!(mask & ~0xFFul) && "inbound mask"); return _mm256_castsi256_ps(_mm256_setr_epi64x(lut32[mask & 0x3], lut32[(mask >> 2) & 0x3], lut32[(mask >> 4) & 0x3], lut32[mask >> 6])); } template inline batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept { alignas(A::alignment()) static const uint64_t lut64[][4] = { { 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul }, { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul }, { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul }, { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul }, { 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul }, { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul }, { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul }, { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul }, { 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul }, { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul }, { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul }, { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul }, { 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul }, { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul }, { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul }, { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul }, }; assert(!(mask & ~0xFul) && "inbound mask"); return _mm256_castsi256_pd(_mm256_load_si256((const __m256i*)lut64[mask])); } template ::value, void>::type> inline batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept { alignas(A::alignment()) static const uint32_t lut32[] = { 0x00000000, 0x000000FF, 0x0000FF00, 0x0000FFFF, 0x00FF0000, 0x00FF00FF, 0x00FFFF00, 0x00FFFFFF, 0xFF000000, 0xFF0000FF, 0xFF00FF00, 0xFF00FFFF, 0xFFFF0000, 0xFFFF00FF, 0xFFFFFF00, 0xFFFFFFFF, }; alignas(A::alignment()) static const uint64_t lut64[] = { 0x0000000000000000ul, 0x000000000000FFFFul, 0x00000000FFFF0000ul, 0x00000000FFFFFFFFul, 0x0000FFFF00000000ul, 0x0000FFFF0000FFFFul, 0x0000FFFFFFFF0000ul, 0x0000FFFFFFFFFFFFul, 0xFFFF000000000000ul, 0xFFFF00000000FFFFul, 0xFFFF0000FFFF0000ul, 0xFFFF0000FFFFFFFFul, 0xFFFFFFFF00000000ul, 0xFFFFFFFF0000FFFFul, 0xFFFFFFFFFFFF0000ul, 0xFFFFFFFFFFFFFFFFul, }; XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { assert(!(mask & ~0xFFFFFFFFul) && "inbound mask"); return _mm256_setr_epi32(lut32[mask & 0xF], lut32[(mask >> 4) & 0xF], lut32[(mask >> 8) & 0xF], lut32[(mask >> 12) & 0xF], lut32[(mask >> 16) & 0xF], lut32[(mask >> 20) & 0xF], lut32[(mask >> 24) & 0xF], lut32[mask >> 28]); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { assert(!(mask & ~0xFFFFul) && "inbound mask"); return _mm256_setr_epi64x(lut64[mask & 0xF], lut64[(mask >> 4) & 0xF], lut64[(mask >> 8) & 0xF], lut64[(mask >> 12) & 0xF]); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_castps_si256(from_mask(batch_bool {}, mask, avx {})); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_castpd_si256(from_mask(batch_bool {}, mask, avx {})); } } // haddp template inline batch haddp(batch const* row, requires_arch) noexcept { // row = (a,b,c,d,e,f,g,h) // tmp0 = (a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7) __m256 tmp0 = _mm256_hadd_ps(row[0], row[1]); // tmp1 = (c0+c1, c2+c3, d1+d2, d2+d3, c4+c5, c6+c7, d4+d5, d6+d7) __m256 tmp1 = _mm256_hadd_ps(row[2], row[3]); // tmp1 = (a0+a1+a2+a3, b0+b1+b2+b3, c0+c1+c2+c3, d0+d1+d2+d3, // a4+a5+a6+a7, b4+b5+b6+b7, c4+c5+c6+c7, d4+d5+d6+d7) tmp1 = _mm256_hadd_ps(tmp0, tmp1); // tmp0 = (e0+e1, e2+e3, f0+f1, f2+f3, e4+e5, e6+e7, f4+f5, f6+f7) tmp0 = _mm256_hadd_ps(row[4], row[5]); // tmp2 = (g0+g1, g2+g3, h0+h1, h2+h3, g4+g5, g6+g7, h4+h5, h6+h7) __m256 tmp2 = _mm256_hadd_ps(row[6], row[7]); // tmp2 = (e0+e1+e2+e3, f0+f1+f2+f3, g0+g1+g2+g3, h0+h1+h2+h3, // e4+e5+e6+e7, f4+f5+f6+f7, g4+g5+g6+g7, h4+h5+h6+h7) tmp2 = _mm256_hadd_ps(tmp0, tmp2); // tmp0 = (a0+a1+a2+a3, b0+b1+b2+b3, c0+c1+c2+c3, d0+d1+d2+d3, // e4+e5+e6+e7, f4+f5+f6+f7, g4+g5+g6+g7, h4+h5+h6+h7) tmp0 = _mm256_blend_ps(tmp1, tmp2, 0b11110000); // tmp1 = (a4+a5+a6+a7, b4+b5+b6+b7, c4+c5+c6+c7, d4+d5+d6+d7, // e0+e1+e2+e3, f0+f1+f2+f3, g0+g1+g2+g3, h0+h1+h2+h3) tmp1 = _mm256_permute2f128_ps(tmp1, tmp2, 0x21); return _mm256_add_ps(tmp0, tmp1); } template inline batch haddp(batch const* row, requires_arch) noexcept { // row = (a,b,c,d) // tmp0 = (a0+a1, b0+b1, a2+a3, b2+b3) __m256d tmp0 = _mm256_hadd_pd(row[0], row[1]); // tmp1 = (c0+c1, d0+d1, c2+c3, d2+d3) __m256d tmp1 = _mm256_hadd_pd(row[2], row[3]); // tmp2 = (a0+a1, b0+b1, c2+c3, d2+d3) __m256d tmp2 = _mm256_blend_pd(tmp0, tmp1, 0b1100); // tmp1 = (a2+a3, b2+b3, c2+c3, d2+d3) tmp1 = _mm256_permute2f128_pd(tmp0, tmp1, 0x21); return _mm256_add_pd(tmp1, tmp2); } // incr_if template ::value, void>::type> inline batch incr_if(batch const& self, batch_bool const& mask, requires_arch) noexcept { return self - batch(mask.data); } // insert template ::value, void>::type> inline batch insert(batch const& self, T val, index pos, requires_arch) noexcept { #if !defined(_MSC_VER) || _MSC_VER > 1900 XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_insert_epi8(self, val, I); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_insert_epi16(self, val, I); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_insert_epi32(self, val, I); } else { return insert(self, val, pos, generic {}); } #endif return insert(self, val, pos, generic {}); } // isnan template inline batch_bool isnan(batch const& self, requires_arch) noexcept { return _mm256_cmp_ps(self, self, _CMP_UNORD_Q); } template inline batch_bool isnan(batch const& self, requires_arch) noexcept { return _mm256_cmp_pd(self, self, _CMP_UNORD_Q); } // le template inline batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_cmp_ps(self, other, _CMP_LE_OQ); } template inline batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_cmp_pd(self, other, _CMP_LE_OQ); } // load_aligned template ::value, void>::type> inline batch load_aligned(T const* mem, convert, requires_arch) noexcept { return _mm256_load_si256((__m256i const*)mem); } template inline batch load_aligned(float const* mem, convert, requires_arch) noexcept { return _mm256_load_ps(mem); } template inline batch load_aligned(double const* mem, convert, requires_arch) noexcept { return _mm256_load_pd(mem); } namespace detail { // load_complex template inline batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept { using batch_type = batch; __m128 tmp0 = _mm256_extractf128_ps(hi, 0); __m128 tmp1 = _mm256_extractf128_ps(hi, 1); __m128 tmp_real = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0)); __m128 tmp_imag = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1)); batch_type real = _mm256_castps128_ps256(tmp_real); batch_type imag = _mm256_castps128_ps256(tmp_imag); tmp0 = _mm256_extractf128_ps(lo, 0); tmp1 = _mm256_extractf128_ps(lo, 1); tmp_real = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0)); tmp_imag = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1)); real = _mm256_insertf128_ps(real, tmp_real, 1); imag = _mm256_insertf128_ps(imag, tmp_imag, 1); return { real, imag }; } template inline batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept { using batch_type = batch; __m128d tmp0 = _mm256_extractf128_pd(hi, 0); __m128d tmp1 = _mm256_extractf128_pd(hi, 1); batch_type real = _mm256_castpd128_pd256(_mm_unpacklo_pd(tmp0, tmp1)); batch_type imag = _mm256_castpd128_pd256(_mm_unpackhi_pd(tmp0, tmp1)); tmp0 = _mm256_extractf128_pd(lo, 0); tmp1 = _mm256_extractf128_pd(lo, 1); __m256d re_tmp1 = _mm256_insertf128_pd(real, _mm_unpacklo_pd(tmp0, tmp1), 1); __m256d im_tmp1 = _mm256_insertf128_pd(imag, _mm_unpackhi_pd(tmp0, tmp1), 1); real = _mm256_blend_pd(real, re_tmp1, 12); imag = _mm256_blend_pd(imag, im_tmp1, 12); return { real, imag }; } } // load_unaligned template ::value, void>::type> inline batch load_unaligned(T const* mem, convert, requires_arch) noexcept { return _mm256_loadu_si256((__m256i const*)mem); } template inline batch load_unaligned(float const* mem, convert, requires_arch) noexcept { return _mm256_loadu_ps(mem); } template inline batch load_unaligned(double const* mem, convert, requires_arch) noexcept { return _mm256_loadu_pd(mem); } // lt template inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_cmp_ps(self, other, _CMP_LT_OQ); } template inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_cmp_pd(self, other, _CMP_LT_OQ); } template ::value, void>::type> inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return lt(batch(s), batch(o)); }, self, other); } // mask template ::value, void>::type> inline uint64_t mask(batch_bool const& self, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2) { __m128i self_low, self_high; detail::split_avx(self, self_low, self_high); return mask(batch_bool(self_low), sse4_2 {}) | (mask(batch_bool(self_high), sse4_2 {}) << (128 / (8 * sizeof(T)))); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_movemask_ps(_mm256_castsi256_ps(self)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_movemask_pd(_mm256_castsi256_pd(self)); } else { assert(false && "unsupported arch/op combination"); return {}; } } template inline uint64_t mask(batch_bool const& self, requires_arch) noexcept { return _mm256_movemask_ps(self); } template inline uint64_t mask(batch_bool const& self, requires_arch) noexcept { return _mm256_movemask_pd(self); } // max template inline batch max(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_max_ps(self, other); } template inline batch max(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_max_pd(self, other); } template ::value, void>::type> inline batch max(batch const& self, batch const& other, requires_arch) noexcept { return select(self > other, self, other); } // min template inline batch min(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_min_ps(self, other); } template inline batch min(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_min_pd(self, other); } template ::value, void>::type> inline batch min(batch const& self, batch const& other, requires_arch) noexcept { return select(self <= other, self, other); } // mul template inline batch mul(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_mul_ps(self, other); } template inline batch mul(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_mul_pd(self, other); } // nearbyint template inline batch nearbyint(batch const& self, requires_arch) noexcept { return _mm256_round_ps(self, _MM_FROUND_TO_NEAREST_INT); } template inline batch nearbyint(batch const& self, requires_arch) noexcept { return _mm256_round_pd(self, _MM_FROUND_TO_NEAREST_INT); } // nearbyint_as_int template inline batch nearbyint_as_int(batch const& self, requires_arch) noexcept { return _mm256_cvtps_epi32(self); } // neg template ::value, void>::type> inline batch neg(batch const& self, requires_arch) noexcept { return 0 - self; } template batch neg(batch const& self, requires_arch) { return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))); } template inline batch neg(batch const& self, requires_arch) noexcept { return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi64x(0x8000000000000000))); } // neq template inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_cmp_ps(self, other, _CMP_NEQ_UQ); } template inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_cmp_pd(self, other, _CMP_NEQ_UQ); } template ::value, void>::type> inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return ~(self == other); } template inline batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_xor_ps(self, other); } template inline batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_xor_pd(self, other); } template ::value, void>::type> inline batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(self.data), _mm256_castsi256_ps(other.data))); } // reciprocal template inline batch reciprocal(batch const& self, kernel::requires_arch) noexcept { return _mm256_rcp_ps(self); } // reduce_add template inline float reduce_add(batch const& rhs, requires_arch) noexcept { // Warning about _mm256_hadd_ps: // _mm256_hadd_ps(a,b) gives // (a0+a1,a2+a3,b0+b1,b2+b3,a4+a5,a6+a7,b4+b5,b6+b7). Hence we can't // rely on a naive use of this method // rhs = (x0, x1, x2, x3, x4, x5, x6, x7) // tmp = (x4, x5, x6, x7, x0, x1, x2, x3) __m256 tmp = _mm256_permute2f128_ps(rhs, rhs, 1); // tmp = (x4+x0, x5+x1, x6+x2, x7+x3, x0+x4, x1+x5, x2+x6, x3+x7) tmp = _mm256_add_ps(rhs, tmp); // tmp = (x4+x0+x5+x1, x6+x2+x7+x3, -, -, -, -, -, -) tmp = _mm256_hadd_ps(tmp, tmp); // tmp = (x4+x0+x5+x1+x6+x2+x7+x3, -, -, -, -, -, -, -) tmp = _mm256_hadd_ps(tmp, tmp); return _mm_cvtss_f32(_mm256_extractf128_ps(tmp, 0)); } template inline double reduce_add(batch const& rhs, requires_arch) noexcept { // rhs = (x0, x1, x2, x3) // tmp = (x2, x3, x0, x1) __m256d tmp = _mm256_permute2f128_pd(rhs, rhs, 1); // tmp = (x2+x0, x3+x1, -, -) tmp = _mm256_add_pd(rhs, tmp); // tmp = (x2+x0+x3+x1, -, -, -) tmp = _mm256_hadd_pd(tmp, tmp); return _mm_cvtsd_f64(_mm256_extractf128_pd(tmp, 0)); } template ::value, void>::type> inline T reduce_add(batch const& self, requires_arch) noexcept { __m128i low, high; detail::split_avx(self, low, high); batch blow(low), bhigh(high); return reduce_add(blow) + reduce_add(bhigh); } // reduce_max template ::type> inline T reduce_max(batch const& self, requires_arch) noexcept { constexpr auto mask = detail::shuffle(1, 0); batch step = _mm256_permute2f128_si256(self, self, mask); batch acc = max(self, step); __m128i low = _mm256_castsi256_si128(acc); return reduce_max(batch(low)); } // reduce_min template ::type> inline T reduce_min(batch const& self, requires_arch) noexcept { constexpr auto mask = detail::shuffle(1, 0); batch step = _mm256_permute2f128_si256(self, self, mask); batch acc = min(self, step); __m128i low = _mm256_castsi256_si128(acc); return reduce_min(batch(low)); } // rsqrt template inline batch rsqrt(batch const& val, requires_arch) noexcept { return _mm256_rsqrt_ps(val); } template inline batch rsqrt(batch const& val, requires_arch) noexcept { return _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(val))); } // sadd template ::value, void>::type> inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { auto mask = (other >> (8 * sizeof(T) - 1)); auto self_pos_branch = min(std::numeric_limits::max() - other, self); auto self_neg_branch = max(std::numeric_limits::min() - other, self); return other + select(batch_bool(mask.data), self_neg_branch, self_pos_branch); } else { const auto diffmax = std::numeric_limits::max() - self; const auto mindiff = min(diffmax, other); return self + mindiff; } } // select template inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { return _mm256_blendv_ps(false_br, true_br, cond); } template inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { return _mm256_blendv_pd(false_br, true_br, cond); } template ::value, void>::type> inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { __m128i cond_low, cond_hi; detail::split_avx(cond, cond_low, cond_hi); __m128i true_low, true_hi; detail::split_avx(true_br, true_low, true_hi); __m128i false_low, false_hi; detail::split_avx(false_br, false_low, false_hi); __m128i res_low = select(batch_bool(cond_low), batch(true_low), batch(false_low), sse4_2 {}); __m128i res_hi = select(batch_bool(cond_hi), batch(true_hi), batch(false_hi), sse4_2 {}); return detail::merge_sse(res_low, res_hi); } template ::value, void>::type> inline batch select(batch_bool_constant, Values...> const&, batch const& true_br, batch const& false_br, requires_arch) noexcept { return select(batch_bool { Values... }, true_br, false_br, avx2 {}); } template inline batch select(batch_bool_constant, Values...> const&, batch const& true_br, batch const& false_br, requires_arch) noexcept { constexpr auto mask = batch_bool_constant, Values...>::mask(); return _mm256_blend_ps(false_br, true_br, mask); } template inline batch select(batch_bool_constant, Values...> const&, batch const& true_br, batch const& false_br, requires_arch) noexcept { constexpr auto mask = batch_bool_constant, Values...>::mask(); return _mm256_blend_pd(false_br, true_br, mask); } // set template inline batch set(batch const&, requires_arch, Values... values) noexcept { static_assert(sizeof...(Values) == batch::size, "consistent init"); return _mm256_setr_ps(values...); } template inline batch set(batch const&, requires_arch, Values... values) noexcept { static_assert(sizeof...(Values) == batch::size, "consistent init"); return _mm256_setr_pd(values...); } template ::value, void>::type> inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3) noexcept { return _mm256_set_epi64x(v3, v2, v1, v0); } template ::value, void>::type> inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept { return _mm256_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7); } template ::value, void>::type> inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept { return _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); } template ::value, void>::type> inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept { return _mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31); } template ::value, void>::type> inline batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept { return set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data; } template inline batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept { static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); return _mm256_castsi256_ps(set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data); } template inline batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept { static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); return _mm256_castsi256_pd(set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data); } // shuffle template inline batch shuffle(batch const& x, batch const& y, batch_constant, I0, I1, I2, I3, I4, I5, I6, I7> mask, requires_arch) noexcept { constexpr uint32_t smask = detail::mod_shuffle(I0, I1, I2, I3); // shuffle within lane if (I4 == (I0 + 4) && I5 == (I1 + 4) && I6 == (I2 + 4) && I7 == (I3 + 4) && I0 < 4 && I1 < 4 && I2 >= 8 && I2 < 12 && I3 >= 8 && I3 < 12) return _mm256_shuffle_ps(x, y, smask); // shuffle within opposite lane if (I4 == (I0 + 4) && I5 == (I1 + 4) && I6 == (I2 + 4) && I7 == (I3 + 4) && I2 < 4 && I3 < 4 && I0 >= 8 && I0 < 12 && I1 >= 8 && I1 < 12) return _mm256_shuffle_ps(y, x, smask); return shuffle(x, y, mask, generic {}); } template inline batch shuffle(batch const& x, batch const& y, batch_constant, I0, I1, I2, I3> mask, requires_arch) noexcept { constexpr uint32_t smask = (I0 & 0x1) | ((I1 & 0x1) << 1) | ((I2 & 0x1) << 2) | ((I3 & 0x1) << 3); // shuffle within lane if (I0 < 2 && I1 >= 4 && I1 < 6 && I2 >= 2 && I2 < 4 && I3 >= 6) return _mm256_shuffle_pd(x, y, smask); // shuffle within opposite lane if (I1 < 2 && I0 >= 4 && I0 < 6 && I3 >= 2 && I3 < 4 && I2 >= 6) return _mm256_shuffle_pd(y, x, smask); return shuffle(x, y, mask, generic {}); } // slide_left template inline batch slide_left(batch const& x, requires_arch) noexcept { constexpr unsigned BitCount = N * 8; if (BitCount == 0) { return x; } if (BitCount >= 256) { return batch(T(0)); } if (BitCount > 128) { constexpr unsigned M = (BitCount - 128) / 8; __m128i low = _mm256_castsi256_si128(x); auto y = _mm_slli_si128(low, M); __m256i zero = _mm256_setzero_si256(); return _mm256_insertf128_si256(zero, y, 1); } if (BitCount == 128) { __m128i low = _mm256_castsi256_si128(x); __m256i zero = _mm256_setzero_si256(); return _mm256_insertf128_si256(zero, low, 1); } // shifting by [0, 128[ bits constexpr unsigned M = BitCount / 8; __m128i low = _mm256_castsi256_si128(x); auto ylow = _mm_slli_si128(low, M); auto zlow = _mm_srli_si128(low, 16 - M); __m128i high = _mm256_extractf128_si256(x, 1); auto yhigh = _mm_slli_si128(high, M); __m256i res = _mm256_castsi128_si256(ylow); return _mm256_insertf128_si256(res, _mm_or_si128(yhigh, zlow), 1); } // slide_right template inline batch slide_right(batch const& x, requires_arch) noexcept { constexpr unsigned BitCount = N * 8; if (BitCount == 0) { return x; } if (BitCount >= 256) { return batch(T(0)); } if (BitCount > 128) { constexpr unsigned M = (BitCount - 128) / 8; __m128i high = _mm256_extractf128_si256(x, 1); __m128i y = _mm_srli_si128(high, M); __m256i zero = _mm256_setzero_si256(); return _mm256_insertf128_si256(zero, y, 0); } if (BitCount == 128) { __m128i high = _mm256_extractf128_si256(x, 1); return _mm256_castsi128_si256(high); } // shifting by [0, 128[ bits constexpr unsigned M = BitCount / 8; __m128i low = _mm256_castsi256_si128(x); auto ylow = _mm_srli_si128(low, M); __m128i high = _mm256_extractf128_si256(x, 1); auto yhigh = _mm_srli_si128(high, M); auto zhigh = _mm_slli_si128(high, 16 - M); __m256i res = _mm256_castsi128_si256(_mm_or_si128(ylow, zhigh)); return _mm256_insertf128_si256(res, yhigh, 1); } // sqrt template inline batch sqrt(batch const& val, requires_arch) noexcept { return _mm256_sqrt_ps(val); } template inline batch sqrt(batch const& val, requires_arch) noexcept { return _mm256_sqrt_pd(val); } // ssub template ::value, void>::type> inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { return sadd(self, -other); } else { const auto diff = min(self, other); return self - diff; } } // store_aligned template ::value, void>::type> inline void store_aligned(T* mem, batch const& self, requires_arch) noexcept { return _mm256_store_si256((__m256i*)mem, self); } template ::value, void>::type> inline void store_aligned(T* mem, batch_bool const& self, requires_arch) noexcept { return _mm256_store_si256((__m256i*)mem, self); } template inline void store_aligned(float* mem, batch const& self, requires_arch) noexcept { return _mm256_store_ps(mem, self); } template inline void store_aligned(double* mem, batch const& self, requires_arch) noexcept { return _mm256_store_pd(mem, self); } // store_unaligned template ::value, void>::type> inline void store_unaligned(T* mem, batch const& self, requires_arch) noexcept { return _mm256_storeu_si256((__m256i*)mem, self); } template ::value, void>::type> inline void store_unaligned(T* mem, batch_bool const& self, requires_arch) noexcept { return _mm256_storeu_si256((__m256i*)mem, self); } template inline void store_unaligned(float* mem, batch const& self, requires_arch) noexcept { return _mm256_storeu_ps(mem, self); } template inline void store_unaligned(double* mem, batch const& self, requires_arch) noexcept { return _mm256_storeu_pd(mem, self); } // sub template ::value, void>::type> inline batch sub(batch const& self, batch const& other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return sub(batch(s), batch(o)); }, self, other); } template inline batch sub(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_sub_ps(self, other); } template inline batch sub(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_sub_pd(self, other); } // swizzle (dynamic mask) template inline batch swizzle(batch const& self, batch mask, requires_arch) noexcept { // duplicate low and high part of input __m256 hi = _mm256_castps128_ps256(_mm256_extractf128_ps(self, 1)); __m256 hi_hi = _mm256_insertf128_ps(self, _mm256_castps256_ps128(hi), 0); __m256 low = _mm256_castps128_ps256(_mm256_castps256_ps128(self)); __m256 low_low = _mm256_insertf128_ps(self, _mm256_castps256_ps128(low), 1); // normalize mask batch half_mask = mask % 4; // permute within each lane __m256 r0 = _mm256_permutevar_ps(low_low, half_mask); __m256 r1 = _mm256_permutevar_ps(hi_hi, half_mask); // mask to choose the right lane batch_bool blend_mask = mask >= 4; // blend the two permutes return _mm256_blendv_ps(r0, r1, batch_bool_cast(blend_mask)); } template inline batch swizzle(batch const& self, batch mask, requires_arch) noexcept { // duplicate low and high part of input __m256d hi = _mm256_castpd128_pd256(_mm256_extractf128_pd(self, 1)); __m256d hi_hi = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(hi), 0); __m256d low = _mm256_castpd128_pd256(_mm256_castpd256_pd128(self)); __m256d low_low = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(low), 1); // normalize mask batch half_mask = -(mask & 1); // permute within each lane __m256d r0 = _mm256_permutevar_pd(low_low, half_mask); __m256d r1 = _mm256_permutevar_pd(hi_hi, half_mask); // mask to choose the right lane batch_bool blend_mask = mask >= 2; // blend the two permutes return _mm256_blendv_pd(r0, r1, batch_bool_cast(blend_mask)); } template = 0> inline batch swizzle(batch const& self, batch const& mask, requires_arch) noexcept { return bitwise_cast( swizzle(bitwise_cast(self), mask)); } template = 0> inline batch swizzle(batch const& self, batch const& mask, requires_arch) noexcept { return bitwise_cast( swizzle(bitwise_cast(self), mask)); } // swizzle (constant mask) template inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch) noexcept { // duplicate low and high part of input __m256 hi = _mm256_castps128_ps256(_mm256_extractf128_ps(self, 1)); __m256 hi_hi = _mm256_insertf128_ps(self, _mm256_castps256_ps128(hi), 0); __m256 low = _mm256_castps128_ps256(_mm256_castps256_ps128(self)); __m256 low_low = _mm256_insertf128_ps(self, _mm256_castps256_ps128(low), 1); // normalize mask batch_constant, (V0 % 4), (V1 % 4), (V2 % 4), (V3 % 4), (V4 % 4), (V5 % 4), (V6 % 4), (V7 % 4)> half_mask; // permute within each lane __m256 r0 = _mm256_permutevar_ps(low_low, (batch)half_mask); __m256 r1 = _mm256_permutevar_ps(hi_hi, (batch)half_mask); // mask to choose the right lane batch_bool_constant, (V0 >= 4), (V1 >= 4), (V2 >= 4), (V3 >= 4), (V4 >= 4), (V5 >= 4), (V6 >= 4), (V7 >= 4)> blend_mask; // blend the two permutes constexpr auto mask = blend_mask.mask(); return _mm256_blend_ps(r0, r1, mask); } template inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3>, requires_arch) noexcept { // duplicate low and high part of input __m256d hi = _mm256_castpd128_pd256(_mm256_extractf128_pd(self, 1)); __m256d hi_hi = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(hi), 0); __m256d low = _mm256_castpd128_pd256(_mm256_castpd256_pd128(self)); __m256d low_low = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(low), 1); // normalize mask batch_constant, (V0 % 2) * -1, (V1 % 2) * -1, (V2 % 2) * -1, (V3 % 2) * -1> half_mask; // permute within each lane __m256d r0 = _mm256_permutevar_pd(low_low, (batch)half_mask); __m256d r1 = _mm256_permutevar_pd(hi_hi, (batch)half_mask); // mask to choose the right lane batch_bool_constant, (V0 >= 2), (V1 >= 2), (V2 >= 2), (V3 >= 2)> blend_mask; // blend the two permutes constexpr auto mask = blend_mask.mask(); return _mm256_blend_pd(r0, r1, mask); } template = 0> inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3, V4, V5, V6, V7> const& mask, requires_arch) noexcept { return bitwise_cast( swizzle(bitwise_cast(self), mask)); } template = 0> inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3> const& mask, requires_arch) noexcept { return bitwise_cast( swizzle(bitwise_cast(self), mask)); } // trunc template inline batch trunc(batch const& self, requires_arch) noexcept { return _mm256_round_ps(self, _MM_FROUND_TO_ZERO); } template inline batch trunc(batch const& self, requires_arch) noexcept { return _mm256_round_pd(self, _MM_FROUND_TO_ZERO); } // zip_hi template ::value, void>::type> inline batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2) { // extract high word __m128i self_hi = _mm256_extractf128_si256(self, 1); __m128i other_hi = _mm256_extractf128_si256(other, 1); // interleave __m128i res_lo, res_hi; XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { res_lo = _mm_unpacklo_epi8(self_hi, other_hi); res_hi = _mm_unpackhi_epi8(self_hi, other_hi); } else { res_lo = _mm_unpacklo_epi16(self_hi, other_hi); res_hi = _mm_unpackhi_epi16(self_hi, other_hi); } // fuse return _mm256_castps_si256( _mm256_insertf128_ps( _mm256_castsi256_ps(_mm256_castsi128_si256(res_lo)), _mm_castsi128_ps(res_hi), 1)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { auto lo = _mm256_unpacklo_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other)); auto hi = _mm256_unpackhi_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other)); return _mm256_castps_si256(_mm256_permute2f128_ps(lo, hi, 0x31)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { auto lo = _mm256_unpacklo_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other)); auto hi = _mm256_unpackhi_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other)); return _mm256_castpd_si256(_mm256_permute2f128_pd(lo, hi, 0x31)); } else { assert(false && "unsupported arch/op combination"); return {}; } } template inline batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { auto lo = _mm256_unpacklo_ps(self, other); auto hi = _mm256_unpackhi_ps(self, other); return _mm256_permute2f128_ps(lo, hi, 0x31); } template inline batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { auto lo = _mm256_unpacklo_pd(self, other); auto hi = _mm256_unpackhi_pd(self, other); return _mm256_permute2f128_pd(lo, hi, 0x31); } // zip_lo template ::value, void>::type> inline batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2) { // extract low word __m128i self_lo = _mm256_extractf128_si256(self, 0); __m128i other_lo = _mm256_extractf128_si256(other, 0); // interleave __m128i res_lo, res_hi; XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { res_lo = _mm_unpacklo_epi8(self_lo, other_lo); res_hi = _mm_unpackhi_epi8(self_lo, other_lo); } else { res_lo = _mm_unpacklo_epi16(self_lo, other_lo); res_hi = _mm_unpackhi_epi16(self_lo, other_lo); } // fuse return _mm256_castps_si256( _mm256_insertf128_ps( _mm256_castsi256_ps(_mm256_castsi128_si256(res_lo)), _mm_castsi128_ps(res_hi), 1)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { auto lo = _mm256_unpacklo_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other)); auto hi = _mm256_unpackhi_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other)); return _mm256_castps_si256(_mm256_insertf128_ps(lo, _mm256_castps256_ps128(hi), 1)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { auto lo = _mm256_unpacklo_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other)); auto hi = _mm256_unpackhi_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other)); return _mm256_castpd_si256(_mm256_insertf128_pd(lo, _mm256_castpd256_pd128(hi), 1)); } else { assert(false && "unsupported arch/op combination"); return {}; } } template inline batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { auto lo = _mm256_unpacklo_ps(self, other); auto hi = _mm256_unpackhi_ps(self, other); return _mm256_insertf128_ps(lo, _mm256_castps256_ps128(hi), 1); } template inline batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { auto lo = _mm256_unpacklo_pd(self, other); auto hi = _mm256_unpackhi_pd(self, other); return _mm256_insertf128_pd(lo, _mm256_castpd256_pd128(hi), 1); } } } #endif xsimd-12.1.1/include/xsimd/arch/xsimd_avx2.hpp000066400000000000000000001212151453610362700212120ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX2_HPP #define XSIMD_AVX2_HPP #include #include #include "../types/xsimd_avx2_register.hpp" namespace xsimd { namespace kernel { using namespace types; // abs template ::value, void>::type> inline batch abs(batch const& self, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_abs_epi8(self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_abs_epi16(self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_abs_epi32(self); } else { return abs(self, avx {}); } } return self; } // add template ::value, void>::type> inline batch add(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_add_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_add_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_add_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_add_epi64(self, other); } else { return add(self, other, avx {}); } } // bitwise_and template ::value, void>::type> inline batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_and_si256(self, other); } template ::value, void>::type> inline batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_and_si256(self, other); } // bitwise_andnot template ::value, void>::type> inline batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_andnot_si256(other, self); } template ::value, void>::type> inline batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_andnot_si256(other, self); } // bitwise_not template ::value, void>::type> inline batch bitwise_not(batch const& self, requires_arch) noexcept { return _mm256_xor_si256(self, _mm256_set1_epi32(-1)); } template ::value, void>::type> inline batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept { return _mm256_xor_si256(self, _mm256_set1_epi32(-1)); } // bitwise_lshift template ::value, void>::type> inline batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_slli_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_slli_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_slli_epi64(self, other); } else { return bitwise_lshift(self, other, avx {}); } } template ::value, void>::type> inline batch bitwise_lshift(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_sllv_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_sllv_epi64(self, other); } else { return bitwise_lshift(self, other, avx {}); } } // bitwise_or template ::value, void>::type> inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_or_si256(self, other); } template ::value, void>::type> inline batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_or_si256(self, other); } // bitwise_rshift template ::value, void>::type> inline batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { __m256i sign_mask = _mm256_set1_epi16((0xFF00 >> other) & 0x00FF); __m256i cmp_is_negative = _mm256_cmpgt_epi8(_mm256_setzero_si256(), self); __m256i res = _mm256_srai_epi16(self, other); return _mm256_or_si256( detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return bitwise_and(batch(s), batch(o), sse4_2 {}); }, sign_mask, cmp_is_negative), _mm256_andnot_si256(sign_mask, res)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_srai_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_srai_epi32(self, other); } else { return bitwise_rshift(self, other, avx {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_srli_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_srli_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_srli_epi64(self, other); } else { return bitwise_rshift(self, other, avx {}); } } } template ::value, void>::type> inline batch bitwise_rshift(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_srav_epi32(self, other); } else { return bitwise_rshift(self, other, avx {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_srlv_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_srlv_epi64(self, other); } else { return bitwise_rshift(self, other, avx {}); } } } // bitwise_xor template ::value, void>::type> inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_xor_si256(self, other); } template ::value, void>::type> inline batch bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_xor_si256(self, other); } // complex_low template inline batch complex_low(batch, A> const& self, requires_arch) noexcept { __m256d tmp0 = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 1, 1, 0)); __m256d tmp1 = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(1, 2, 0, 0)); return _mm256_blend_pd(tmp0, tmp1, 10); } // complex_high template inline batch complex_high(batch, A> const& self, requires_arch) noexcept { __m256d tmp0 = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 3, 1, 2)); __m256d tmp1 = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(3, 2, 2, 0)); return _mm256_blend_pd(tmp0, tmp1, 10); } // fast_cast namespace detail { template inline batch fast_cast(batch const& x, batch const&, requires_arch) noexcept { // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx // adapted to avx __m256i xH = _mm256_srli_epi64(x, 32); xH = _mm256_or_si256(xH, _mm256_castpd_si256(_mm256_set1_pd(19342813113834066795298816.))); // 2^84 __m256i mask = _mm256_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000); __m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52 __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52 return _mm256_add_pd(f, _mm256_castsi256_pd(xL)); } template inline batch fast_cast(batch const& x, batch const&, requires_arch) noexcept { // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx // adapted to avx __m256i xH = _mm256_srai_epi32(x, 16); xH = _mm256_and_si256(xH, _mm256_setr_epi16(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF)); xH = _mm256_add_epi64(xH, _mm256_castpd_si256(_mm256_set1_pd(442721857769029238784.))); // 3*2^67 __m256i mask = _mm256_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000); __m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52 __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52 return _mm256_add_pd(f, _mm256_castsi256_pd(xL)); } } // eq template ::value, void>::type> inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_cmpeq_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_cmpeq_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_cmpeq_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_cmpeq_epi64(self, other); } else { return eq(self, other, avx {}); } } // gather template = 0, detail::enable_sized_integral_t = 0> inline batch gather(batch const&, T const* src, batch const& index, kernel::requires_arch) noexcept { // scatter for this one is AVX512F+AVX512VL return _mm256_i32gather_epi32(reinterpret_cast(src), index, sizeof(T)); } template = 0, detail::enable_sized_integral_t = 0> inline batch gather(batch const&, T const* src, batch const& index, kernel::requires_arch) noexcept { // scatter for this one is AVX512F+AVX512VL return _mm256_i64gather_epi64(reinterpret_cast(src), index, sizeof(T)); } template = 0> inline batch gather(batch const&, float const* src, batch const& index, kernel::requires_arch) noexcept { // scatter for this one is AVX512F+AVX512VL return _mm256_i32gather_ps(src, index, sizeof(float)); } template = 0> inline batch gather(batch const&, double const* src, batch const& index, requires_arch) noexcept { // scatter for this one is AVX512F+AVX512VL return _mm256_i64gather_pd(src, index, sizeof(double)); } // gather: handmade conversions template = 0> inline batch gather(batch const&, double const* src, batch const& index, requires_arch) noexcept { const batch low(_mm256_i32gather_pd(src, _mm256_castsi256_si128(index.data), sizeof(double))); const batch high(_mm256_i32gather_pd(src, _mm256_extractf128_si256(index.data, 1), sizeof(double))); return detail::merge_sse(_mm256_cvtpd_ps(low.data), _mm256_cvtpd_ps(high.data)); } template = 0> inline batch gather(batch const&, double const* src, batch const& index, requires_arch) noexcept { const batch low(_mm256_i32gather_pd(src, _mm256_castsi256_si128(index.data), sizeof(double))); const batch high(_mm256_i32gather_pd(src, _mm256_extractf128_si256(index.data, 1), sizeof(double))); return detail::merge_sse(_mm256_cvtpd_epi32(low.data), _mm256_cvtpd_epi32(high.data)); } // lt template ::value, void>::type> inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_cmpgt_epi8(other, self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_cmpgt_epi16(other, self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_cmpgt_epi32(other, self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_cmpgt_epi64(other, self); } else { return lt(self, other, avx {}); } } else { return lt(self, other, avx {}); } } // load_complex template inline batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept { using batch_type = batch; batch_type real = _mm256_castpd_ps( _mm256_permute4x64_pd( _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0))), _MM_SHUFFLE(3, 1, 2, 0))); batch_type imag = _mm256_castpd_ps( _mm256_permute4x64_pd( _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1))), _MM_SHUFFLE(3, 1, 2, 0))); return { real, imag }; } template inline batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept { using batch_type = batch; batch_type real = _mm256_permute4x64_pd(_mm256_unpacklo_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0)); batch_type imag = _mm256_permute4x64_pd(_mm256_unpackhi_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0)); return { real, imag }; } // mask template ::value, void>::type> inline uint64_t mask(batch_bool const& self, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return 0xFFFFFFFF & (uint64_t)_mm256_movemask_epi8(self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { uint64_t mask8 = 0xFFFFFFFF & (uint64_t)_mm256_movemask_epi8(self); return detail::mask_lut(mask8) | (detail::mask_lut(mask8 >> 8) << 4) | (detail::mask_lut(mask8 >> 16) << 8) | (detail::mask_lut(mask8 >> 24) << 12); } else { return mask(self, avx {}); } } // max template ::value, void>::type> inline batch max(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_max_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_max_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_max_epi32(self, other); } else { return max(self, other, avx {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_max_epu8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_max_epu16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_max_epu32(self, other); } else { return max(self, other, avx {}); } } } // min template ::value, void>::type> inline batch min(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_min_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_min_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_min_epi32(self, other); } else { return min(self, other, avx {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_min_epu8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_min_epu16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_min_epu32(self, other); } else { return min(self, other, avx {}); } } } // mul template ::value, void>::type> inline batch mul(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { __m256i mask_hi = _mm256_set1_epi32(0xFF00FF00); __m256i res_lo = _mm256_mullo_epi16(self, other); __m256i other_hi = _mm256_srli_epi16(other, 8); __m256i self_hi = _mm256_and_si256(self, mask_hi); __m256i res_hi = _mm256_mullo_epi16(self_hi, other_hi); __m256i res = _mm256_blendv_epi8(res_lo, res_hi, mask_hi); return res; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_mullo_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_mullo_epi32(self, other); } else { return mul(self, other, avx {}); } } // reduce_add template ::value, void>::type> inline T reduce_add(batch const& self, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { __m256i tmp1 = _mm256_hadd_epi32(self, self); __m256i tmp2 = _mm256_hadd_epi32(tmp1, tmp1); __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1); __m128i tmp4 = _mm_add_epi32(_mm256_castsi256_si128(tmp2), tmp3); return _mm_cvtsi128_si32(tmp4); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { __m256i tmp1 = _mm256_shuffle_epi32(self, 0x0E); __m256i tmp2 = _mm256_add_epi64(self, tmp1); __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1); __m128i res = _mm_add_epi64(_mm256_castsi256_si128(tmp2), tmp3); #if defined(__x86_64__) return _mm_cvtsi128_si64(res); #else __m128i m; _mm_storel_epi64(&m, res); int64_t i; std::memcpy(&i, &m, sizeof(i)); return i; #endif } else { return reduce_add(self, avx {}); } } // rotate_right template inline batch rotate_right(batch const& self, requires_arch) noexcept { return _mm256_alignr_epi8(self, self, N); } template inline batch rotate_right(batch const& self, requires_arch) noexcept { return bitwise_cast(rotate_right(bitwise_cast(self), avx2 {})); } // sadd template ::value, void>::type> inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_adds_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_adds_epi16(self, other); } else { return sadd(self, other, avx {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_adds_epu8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_adds_epu16(self, other); } else { return sadd(self, other, avx {}); } } } // select template ::value, void>::type> inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_blendv_epi8(false_br, true_br, cond); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_blendv_epi8(false_br, true_br, cond); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_blendv_epi8(false_br, true_br, cond); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_blendv_epi8(false_br, true_br, cond); } else { return select(cond, true_br, false_br, avx {}); } } template ::value, void>::type> inline batch select(batch_bool_constant, Values...> const&, batch const& true_br, batch const& false_br, requires_arch) noexcept { constexpr int mask = batch_bool_constant, Values...>::mask(); // FIXME: for some reason mask here is not considered as an immediate, // but it's okay for _mm256_blend_epi32 // case 2: return _mm256_blend_epi16(false_br, true_br, mask); XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_blend_epi32(false_br, true_br, mask); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { constexpr int imask = detail::interleave(mask); return _mm256_blend_epi32(false_br, true_br, imask); } else { return select(batch_bool { Values... }, true_br, false_br, avx2 {}); } } // slide_left template inline batch slide_left(batch const& x, requires_arch) noexcept { constexpr unsigned BitCount = N * 8; if (BitCount == 0) { return x; } if (BitCount >= 256) { return batch(T(0)); } if (BitCount > 128) { constexpr unsigned M = (BitCount - 128) / 8; auto y = _mm256_bslli_epi128(x, M); return _mm256_permute2x128_si256(y, y, 0x28); } if (BitCount == 128) { return _mm256_permute2x128_si256(x, x, 0x28); } // shifting by [0, 128[ bits constexpr unsigned M = BitCount / 8; auto y = _mm256_bslli_epi128(x, M); auto z = _mm256_bsrli_epi128(x, 16 - M); auto w = _mm256_permute2x128_si256(z, z, 0x28); return _mm256_or_si256(y, w); } // slide_right template inline batch slide_right(batch const& x, requires_arch) noexcept { constexpr unsigned BitCount = N * 8; if (BitCount == 0) { return x; } if (BitCount >= 256) { return batch(T(0)); } if (BitCount > 128) { constexpr unsigned M = (BitCount - 128) / 8; auto y = _mm256_bsrli_epi128(x, M); return _mm256_permute2x128_si256(y, y, 0x81); } if (BitCount == 128) { return _mm256_permute2x128_si256(x, x, 0x81); } // shifting by [0, 128[ bits constexpr unsigned M = BitCount / 8; auto y = _mm256_bsrli_epi128(x, M); auto z = _mm256_bslli_epi128(x, 16 - M); auto w = _mm256_permute2x128_si256(z, z, 0x81); return _mm256_or_si256(y, w); } // ssub template ::value, void>::type> inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_subs_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_subs_epi16(self, other); } else { return ssub(self, other, avx {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_subs_epu8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_subs_epu16(self, other); } else { return ssub(self, other, avx {}); } } } // sub template ::value, void>::type> inline batch sub(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_sub_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_sub_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_sub_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_sub_epi64(self, other); } else { return sub(self, other, avx {}); } } // swizzle (dynamic mask) template inline batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return _mm256_permutevar8x32_ps(self, mask); } template inline batch swizzle(batch const& self, batch mask, requires_arch) noexcept { batch broadcaster = { 0, 1, 0, 1, 0, 1, 0, 1 }; constexpr uint64_t comb = 0x0000000100000001ul * 2; return bitwise_cast(swizzle(bitwise_cast(self), bitwise_cast(mask * comb) + broadcaster, avx2 {})); } template inline batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, avx2 {})); } template inline batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, avx2 {})); } template inline batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return _mm256_permutevar8x32_epi32(self, mask); } template inline batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, avx2 {})); } // swizzle (constant mask) template inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch) noexcept { return _mm256_permutevar8x32_ps(self, (batch)mask); } template inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3>, requires_arch) noexcept { constexpr auto mask = detail::shuffle(V0, V1, V2, V3); return _mm256_permute4x64_pd(self, mask); } template inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3>, requires_arch) noexcept { constexpr auto mask = detail::shuffle(V0, V1, V2, V3); return _mm256_permute4x64_epi64(self, mask); } template inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3> mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, avx2 {})); } template inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch) noexcept { return _mm256_permutevar8x32_epi32(self, (batch)mask); } template inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, avx2 {})); } // zip_hi template ::value, void>::type> inline batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { auto lo = _mm256_unpacklo_epi8(self, other); auto hi = _mm256_unpackhi_epi8(self, other); return _mm256_permute2f128_si256(lo, hi, 0x31); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { auto lo = _mm256_unpacklo_epi16(self, other); auto hi = _mm256_unpackhi_epi16(self, other); return _mm256_permute2f128_si256(lo, hi, 0x31); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { auto lo = _mm256_unpacklo_epi32(self, other); auto hi = _mm256_unpackhi_epi32(self, other); return _mm256_permute2f128_si256(lo, hi, 0x31); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { auto lo = _mm256_unpacklo_epi64(self, other); auto hi = _mm256_unpackhi_epi64(self, other); return _mm256_permute2f128_si256(lo, hi, 0x31); } else { assert(false && "unsupported arch/op combination"); return {}; } } // zip_lo template ::value, void>::type> inline batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { auto lo = _mm256_unpacklo_epi8(self, other); auto hi = _mm256_unpackhi_epi8(self, other); return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { auto lo = _mm256_unpacklo_epi16(self, other); auto hi = _mm256_unpackhi_epi16(self, other); return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { auto lo = _mm256_unpacklo_epi32(self, other); auto hi = _mm256_unpackhi_epi32(self, other); return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { auto lo = _mm256_unpacklo_epi64(self, other); auto hi = _mm256_unpackhi_epi64(self, other); return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1); } else { assert(false && "unsupported arch/op combination"); return {}; } } } } #endif xsimd-12.1.1/include/xsimd/arch/xsimd_avx512bw.hpp000066400000000000000000000626341453610362700217220ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX512BW_HPP #define XSIMD_AVX512BW_HPP #include #include #include "../types/xsimd_avx512bw_register.hpp" namespace xsimd { namespace kernel { using namespace types; namespace detail { template inline batch_bool compare_int_avx512bw(batch const& self, batch const& other) noexcept { using register_type = typename batch_bool::register_type; if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return (register_type)_mm512_cmp_epi8_mask(self, other, Cmp); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return (register_type)_mm512_cmp_epi16_mask(self, other, Cmp); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return (register_type)_mm512_cmp_epi32_mask(self, other, Cmp); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return (register_type)_mm512_cmp_epi64_mask(self, other, Cmp); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return (register_type)_mm512_cmp_epu8_mask(self, other, Cmp); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return (register_type)_mm512_cmp_epu16_mask(self, other, Cmp); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return (register_type)_mm512_cmp_epu32_mask(self, other, Cmp); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return (register_type)_mm512_cmp_epu64_mask(self, other, Cmp); } } } } // abs template ::value, void>::type> inline batch abs(batch const& self, requires_arch) noexcept { if (std::is_unsigned::value) { return self; } XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_abs_epi8(self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_abs_epi16(self); } else { return abs(self, avx512dq {}); } } // add template ::value, void>::type> inline batch add(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_add_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_add_epi16(self, other); } else { return add(self, other, avx512dq {}); } } // bitwise_lshift template ::value, void>::type> inline batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept { #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_sllv_epi16(self, _mm512_set1_epi16(other)); #else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_slli_epi16(self, other); #endif } else { return bitwise_lshift(self, other, avx512dq {}); } } // bitwise_rshift template ::value, void>::type> inline batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { __m512i sign_mask = _mm512_set1_epi16((0xFF00 >> other) & 0x00FF); __m512i zeros = _mm512_setzero_si512(); __mmask64 cmp_is_negative_mask = _mm512_cmpgt_epi8_mask(zeros, self); __m512i cmp_sign_mask = _mm512_mask_blend_epi8(cmp_is_negative_mask, zeros, sign_mask); #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) __m512i res = _mm512_srav_epi16(self, _mm512_set1_epi16(other)); #else __m512i res = _mm512_srai_epi16(self, other); #endif return _mm512_or_si512(cmp_sign_mask, _mm512_andnot_si512(sign_mask, res)); #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_srav_epi16(self, _mm512_set1_epi16(other)); #else } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_srai_epi16(self, other); #endif } else { return bitwise_rshift(self, other, avx512dq {}); } } else { #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_srlv_epi16(self, _mm512_set1_epi16(other)); #else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_srli_epi16(self, other); #endif } else { return bitwise_rshift(self, other, avx512dq {}); } } } // eq template ::value, void>::type> inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { return detail::compare_int_avx512bw(self, other); } // ge template ::value, void>::type> inline batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept { return detail::compare_int_avx512bw(self, other); } // gt template ::value, void>::type> inline batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { return detail::compare_int_avx512bw(self, other); } // le template ::value, void>::type> inline batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { return detail::compare_int_avx512bw(self, other); } // lt template ::value, void>::type> inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return detail::compare_int_avx512bw(self, other); } // max template ::value, void>::type> inline batch max(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_max_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_max_epi16(self, other); } else { return max(self, other, avx512dq {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_max_epu8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_max_epu16(self, other); } else { return max(self, other, avx512dq {}); } } } // min template ::value, void>::type> inline batch min(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_min_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_min_epi16(self, other); } else { return min(self, other, avx512dq {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_min_epu8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_min_epu16(self, other); } else { return min(self, other, avx512dq {}); } } } // mul template ::value, void>::type> inline batch mul(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { __m512i upper = _mm512_and_si512(_mm512_mullo_epi16(self, other), _mm512_srli_epi16(_mm512_set1_epi16(-1), 8)); __m512i lower = _mm512_slli_epi16(_mm512_mullo_epi16(_mm512_srli_epi16(self, 8), _mm512_srli_epi16(other, 8)), 8); return _mm512_or_si512(upper, lower); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_mullo_epi16(self, other); } else { return mul(self, other, avx512dq {}); } } // neq template ::value, void>::type> inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return detail::compare_int_avx512bw(self, other); } // rotate_right template inline batch rotate_right(batch const& self, requires_arch) noexcept { return _mm512_alignr_epi8(self, self, N); } template inline batch rotate_right(batch const& self, requires_arch) noexcept { return bitwise_cast(rotate_right(bitwise_cast(self), avx2 {})); } // sadd template ::value, void>::type> inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_adds_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_adds_epi16(self, other); } else { return sadd(self, other, avx512dq {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_adds_epu8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_adds_epu16(self, other); } else { return sadd(self, other, avx512dq {}); } } } // select template ::value, void>::type> inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_mask_blend_epi8(cond, false_br.data, true_br.data); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_mask_blend_epi16(cond, false_br.data, true_br.data); } else { return select(cond, true_br, false_br, avx512dq {}); } } // slide_left namespace detail { template constexpr std::array make_slide_perm_hi(::xsimd::detail::index_sequence) { return { (Is == 0 ? 8 : Is - 1)... }; } template constexpr std::array make_slide_left_pattern(::xsimd::detail::index_sequence) { return { (Is >= N ? Is - N : 0)... }; } template constexpr std::array make_slide_left_mask(::xsimd::detail::index_sequence) { return { (Is >= N ? 0xFFFF : 0x0000)... }; } } template inline batch slide_left(batch const& x, requires_arch) noexcept { constexpr unsigned BitCount = N * 8; if (BitCount == 0) { return x; } if (BitCount >= 512) { return batch(T(0)); } batch xx; if (N & 1) { alignas(A::alignment()) uint64_t buffer[8]; _mm512_store_epi64(&buffer[0], x); for (int i = 7; i > 0; --i) buffer[i] = (buffer[i] << 8) | (buffer[i - 1] >> 56); buffer[0] = buffer[0] << 8; xx = _mm512_load_epi64(&buffer[0]); alignas(A::alignment()) auto slide_perm = detail::make_slide_perm_hi(::xsimd::detail::make_index_sequence<512 / 64>()); __m512i xl = _mm512_slli_epi64(x, 8); __m512i xr = _mm512_srli_epi64(x, 56); xr = _mm512_permutex2var_epi64(xr, _mm512_load_epi64(slide_perm.data()), _mm512_setzero_si512()); xx = _mm512_or_si512(xr, xl); if (N == 1) return xx; } else { xx = x; } alignas(A::alignment()) auto slide_pattern = detail::make_slide_left_pattern(::xsimd::detail::make_index_sequence<512 / 16>()); alignas(A::alignment()) auto slide_mask = detail::make_slide_left_mask(::xsimd::detail::make_index_sequence<512 / 16>()); return _mm512_and_si512(_mm512_permutexvar_epi16(_mm512_load_epi32(slide_pattern.data()), xx), _mm512_load_epi32(slide_mask.data())); } // slide_right namespace detail { template constexpr std::array make_slide_perm_low(::xsimd::detail::index_sequence) { return { (Is + 1)... }; } template constexpr std::array make_slide_right_pattern(::xsimd::detail::index_sequence) { return { (Is < (32 - N) ? Is + N : 0)... }; } template constexpr std::array make_slide_right_mask(::xsimd::detail::index_sequence) { return { (Is < 32 - N ? 0xFFFF : 0x0000)... }; } } template inline batch slide_right(batch const& x, requires_arch) noexcept { constexpr unsigned BitCount = N * 8; if (BitCount == 0) { return x; } if (BitCount >= 512) { return batch(T(0)); } batch xx; if (N & 1) { alignas(A::alignment()) auto slide_perm = detail::make_slide_perm_low(::xsimd::detail::make_index_sequence<512 / 64>()); __m512i xr = _mm512_srli_epi64(x, 8); __m512i xl = _mm512_slli_epi64(x, 56); xl = _mm512_permutex2var_epi64(xl, _mm512_load_epi64(slide_perm.data()), _mm512_setzero_si512()); xx = _mm512_or_si512(xr, xl); if (N == 1) return xx; } else { xx = x; } alignas(A::alignment()) auto slide_pattern = detail::make_slide_right_pattern(::xsimd::detail::make_index_sequence<512 / 16>()); alignas(A::alignment()) auto slide_mask = detail::make_slide_right_mask(::xsimd::detail::make_index_sequence<512 / 16>()); return _mm512_and_si512(_mm512_permutexvar_epi16(_mm512_load_epi32(slide_pattern.data()), xx), _mm512_load_epi32(slide_mask.data())); } // ssub template ::value, void>::type> inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_subs_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_subs_epi16(self, other); } else { return ssub(self, other, avx512dq {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_subs_epu8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_subs_epu16(self, other); } else { return ssub(self, other, avx512dq {}); } } } // sub template ::value, void>::type> inline batch sub(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_sub_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_sub_epi16(self, other); } else { return sub(self, other, avx512dq {}); } } // swizzle (dynamic version) template inline batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return _mm512_permutexvar_epi16(mask, self); } template inline batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, avx512bw {})); } template inline batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return _mm512_shuffle_epi8(self, mask); } template inline batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, avx512bw {})); } // swizzle (static version) template inline batch swizzle(batch const& self, batch_constant, Vs...> mask, requires_arch) noexcept { return swizzle(self, (batch)mask, avx512bw {}); } template inline batch swizzle(batch const& self, batch_constant, Vs...> mask, requires_arch) noexcept { return swizzle(self, (batch)mask, avx512bw {}); } template inline batch swizzle(batch const& self, batch_constant, Vs...> mask, requires_arch) noexcept { return swizzle(self, (batch)mask, avx512bw {}); } template inline batch swizzle(batch const& self, batch_constant, Vs...> mask, requires_arch) noexcept { return swizzle(self, (batch)mask, avx512bw {}); } // zip_hi template ::value, void>::type> inline batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { __m512i lo, hi; XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { lo = _mm512_unpacklo_epi8(self, other); hi = _mm512_unpackhi_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { lo = _mm512_unpacklo_epi16(self, other); hi = _mm512_unpackhi_epi16(self, other); } else { return zip_hi(self, other, avx512f {}); } return _mm512_inserti32x4( _mm512_inserti32x4( _mm512_inserti32x4(hi, _mm512_extracti32x4_epi32(lo, 2), 0), _mm512_extracti32x4_epi32(lo, 3), 2), _mm512_extracti32x4_epi32(hi, 2), 1); } // zip_lo template ::value, void>::type> inline batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { __m512i lo, hi; XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { lo = _mm512_unpacklo_epi8(self, other); hi = _mm512_unpackhi_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { lo = _mm512_unpacklo_epi16(self, other); hi = _mm512_unpackhi_epi16(self, other); } else { return zip_lo(self, other, avx512f {}); } return _mm512_inserti32x4( _mm512_inserti32x4( _mm512_inserti32x4(lo, _mm512_extracti32x4_epi32(hi, 0), 1), _mm512_extracti32x4_epi32(hi, 1), 3), _mm512_extracti32x4_epi32(lo, 1), 2); } } } #endif xsimd-12.1.1/include/xsimd/arch/xsimd_avx512cd.hpp000066400000000000000000000017231453610362700216700ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX512CD_HPP #define XSIMD_AVX512CD_HPP #include "../types/xsimd_avx512cd_register.hpp" namespace xsimd { namespace kernel { // Nothing there yet. } } #endif xsimd-12.1.1/include/xsimd/arch/xsimd_avx512dq.hpp000066400000000000000000000220361453610362700217060ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX512_DQHPP #define XSIMD_AVX512_D_HPP #include "../types/xsimd_avx512dq_register.hpp" namespace xsimd { namespace kernel { using namespace types; // bitwise_and template inline batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_and_ps(self, other); } template inline batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_and_pd(self, other); } // bitwise_andnot template inline batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_andnot_ps(other, self); } template inline batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_andnot_pd(other, self); } // bitwise_not template inline batch bitwise_not(batch const& self, requires_arch) noexcept { return _mm512_xor_ps(self, _mm512_castsi512_ps(_mm512_set1_epi32(-1))); } template inline batch bitwise_not(batch const& self, requires_arch) noexcept { return _mm512_xor_pd(self, _mm512_castsi512_pd(_mm512_set1_epi32(-1))); } // bitwise_or template inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_or_ps(self, other); } template inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_or_pd(self, other); } template inline batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return register_type(self.data | other.data); } // bitwise_xor template inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_xor_ps(self, other); } template inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_xor_pd(self, other); } // haddp template inline batch haddp(batch const* row, requires_arch) noexcept { // The following folds over the vector once: // tmp1 = [a0..8, b0..8] // tmp2 = [a8..f, b8..f] #define XSIMD_AVX512_HADDP_STEP1(I, a, b) \ batch res##I; \ { \ auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \ auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \ res##I = _mm512_add_ps(tmp1, tmp2); \ } XSIMD_AVX512_HADDP_STEP1(0, row[0], row[2]); XSIMD_AVX512_HADDP_STEP1(1, row[4], row[6]); XSIMD_AVX512_HADDP_STEP1(2, row[1], row[3]); XSIMD_AVX512_HADDP_STEP1(3, row[5], row[7]); XSIMD_AVX512_HADDP_STEP1(4, row[8], row[10]); XSIMD_AVX512_HADDP_STEP1(5, row[12], row[14]); XSIMD_AVX512_HADDP_STEP1(6, row[9], row[11]); XSIMD_AVX512_HADDP_STEP1(7, row[13], row[15]); #undef XSIMD_AVX512_HADDP_STEP1 // The following flds the code and shuffles so that hadd_ps produces the correct result // tmp1 = [a0..4, a8..12, b0..4, b8..12] (same for tmp3) // tmp2 = [a5..8, a12..16, b5..8, b12..16] (same for tmp4) // tmp5 = [r1[0], r1[2], r2[0], r2[2], r1[4], r1[6] ... #define XSIMD_AVX512_HADDP_STEP2(I, a, b, c, d) \ batch halfx##I; \ { \ auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \ auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \ \ auto resx1 = _mm512_add_ps(tmp1, tmp2); \ \ auto tmp3 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \ auto tmp4 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \ \ auto resx2 = _mm512_add_ps(tmp3, tmp4); \ \ auto tmp5 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(2, 0, 2, 0)); \ auto tmp6 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(3, 1, 3, 1)); \ \ auto resx3 = _mm512_add_ps(tmp5, tmp6); \ \ halfx##I = _mm256_hadd_ps(_mm512_extractf32x8_ps(resx3, 0), \ _mm512_extractf32x8_ps(resx3, 1)); \ } XSIMD_AVX512_HADDP_STEP2(0, res0, res1, res2, res3); XSIMD_AVX512_HADDP_STEP2(1, res4, res5, res6, res7); #undef XSIMD_AVX512_HADDP_STEP2 auto concat = _mm512_castps256_ps512(halfx0); concat = _mm512_insertf32x8(concat, halfx1, 1); return concat; } // ldexp template inline batch ldexp(const batch& self, const batch, A>& other, requires_arch) noexcept { return _mm512_scalef_pd(self, _mm512_cvtepi64_pd(other)); } // mul template inline batch mul(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_mullo_epi64(self, other); } template inline batch mul(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_mullo_epi64(self, other); } // nearbyint_as_int template inline batch nearbyint_as_int(batch const& self, requires_arch) noexcept { return _mm512_cvtpd_epi64(self); } // reduce_add template inline float reduce_add(batch const& rhs, requires_arch) noexcept { __m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1); __m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0); __m256 res1 = _mm256_add_ps(tmp1, tmp2); return reduce_add(batch(res1), avx2 {}); } // convert namespace detail { template inline batch fast_cast(batch const& x, batch const&, requires_arch) noexcept { return _mm512_cvtepi64_pd(self); } template inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm512_cvttpd_epi64(self); } } } } #endif xsimd-12.1.1/include/xsimd/arch/xsimd_avx512er.hpp000066400000000000000000000016451453610362700217130ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX512ER_HPP #define XSIMD_AVX512ER_HPP #include #include #include "../types/xsimd_avx512er_register.hpp" #endif xsimd-12.1.1/include/xsimd/arch/xsimd_avx512f.hpp000066400000000000000000003135461453610362700215400ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX512F_HPP #define XSIMD_AVX512F_HPP #include #include #include #include "../types/xsimd_avx512f_register.hpp" namespace xsimd { namespace kernel { using namespace types; namespace detail { inline void split_avx512(__m512 val, __m256& low, __m256& high) noexcept { low = _mm512_castps512_ps256(val); high = _mm512_extractf32x8_ps(val, 1); } inline void split_avx512(__m512d val, __m256d& low, __m256d& high) noexcept { low = _mm512_castpd512_pd256(val); high = _mm512_extractf64x4_pd(val, 1); } inline void split_avx512(__m512i val, __m256i& low, __m256i& high) noexcept { low = _mm512_castsi512_si256(val); high = _mm512_extracti64x4_epi64(val, 1); } inline __m512i merge_avx(__m256i low, __m256i high) noexcept { return _mm512_inserti64x4(_mm512_castsi256_si512(low), high, 1); } inline __m512 merge_avx(__m256 low, __m256 high) noexcept { return _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castpd256_pd512(_mm256_castps_pd(low)), _mm256_castps_pd(high), 1)); } inline __m512d merge_avx(__m256d low, __m256d high) noexcept { return _mm512_insertf64x4(_mm512_castpd256_pd512(low), high, 1); } template __m512i fwd_to_avx(F f, __m512i self) { __m256i self_low, self_high; split_avx512(self, self_low, self_high); __m256i res_low = f(self_low); __m256i res_high = f(self_high); return merge_avx(res_low, res_high); } template __m512i fwd_to_avx(F f, __m512i self, __m512i other) { __m256i self_low, self_high, other_low, other_high; split_avx512(self, self_low, self_high); split_avx512(other, other_low, other_high); __m256i res_low = f(self_low, other_low); __m256i res_high = f(self_high, other_high); return merge_avx(res_low, res_high); } template __m512i fwd_to_avx(F f, __m512i self, int32_t other) { __m256i self_low, self_high; split_avx512(self, self_low, self_high); __m256i res_low = f(self_low, other); __m256i res_high = f(self_high, other); return merge_avx(res_low, res_high); } } namespace detail { inline uint32_t morton(uint16_t x, uint16_t y) noexcept { static const unsigned short MortonTable256[256] = { 0x0000, 0x0001, 0x0004, 0x0005, 0x0010, 0x0011, 0x0014, 0x0015, 0x0040, 0x0041, 0x0044, 0x0045, 0x0050, 0x0051, 0x0054, 0x0055, 0x0100, 0x0101, 0x0104, 0x0105, 0x0110, 0x0111, 0x0114, 0x0115, 0x0140, 0x0141, 0x0144, 0x0145, 0x0150, 0x0151, 0x0154, 0x0155, 0x0400, 0x0401, 0x0404, 0x0405, 0x0410, 0x0411, 0x0414, 0x0415, 0x0440, 0x0441, 0x0444, 0x0445, 0x0450, 0x0451, 0x0454, 0x0455, 0x0500, 0x0501, 0x0504, 0x0505, 0x0510, 0x0511, 0x0514, 0x0515, 0x0540, 0x0541, 0x0544, 0x0545, 0x0550, 0x0551, 0x0554, 0x0555, 0x1000, 0x1001, 0x1004, 0x1005, 0x1010, 0x1011, 0x1014, 0x1015, 0x1040, 0x1041, 0x1044, 0x1045, 0x1050, 0x1051, 0x1054, 0x1055, 0x1100, 0x1101, 0x1104, 0x1105, 0x1110, 0x1111, 0x1114, 0x1115, 0x1140, 0x1141, 0x1144, 0x1145, 0x1150, 0x1151, 0x1154, 0x1155, 0x1400, 0x1401, 0x1404, 0x1405, 0x1410, 0x1411, 0x1414, 0x1415, 0x1440, 0x1441, 0x1444, 0x1445, 0x1450, 0x1451, 0x1454, 0x1455, 0x1500, 0x1501, 0x1504, 0x1505, 0x1510, 0x1511, 0x1514, 0x1515, 0x1540, 0x1541, 0x1544, 0x1545, 0x1550, 0x1551, 0x1554, 0x1555, 0x4000, 0x4001, 0x4004, 0x4005, 0x4010, 0x4011, 0x4014, 0x4015, 0x4040, 0x4041, 0x4044, 0x4045, 0x4050, 0x4051, 0x4054, 0x4055, 0x4100, 0x4101, 0x4104, 0x4105, 0x4110, 0x4111, 0x4114, 0x4115, 0x4140, 0x4141, 0x4144, 0x4145, 0x4150, 0x4151, 0x4154, 0x4155, 0x4400, 0x4401, 0x4404, 0x4405, 0x4410, 0x4411, 0x4414, 0x4415, 0x4440, 0x4441, 0x4444, 0x4445, 0x4450, 0x4451, 0x4454, 0x4455, 0x4500, 0x4501, 0x4504, 0x4505, 0x4510, 0x4511, 0x4514, 0x4515, 0x4540, 0x4541, 0x4544, 0x4545, 0x4550, 0x4551, 0x4554, 0x4555, 0x5000, 0x5001, 0x5004, 0x5005, 0x5010, 0x5011, 0x5014, 0x5015, 0x5040, 0x5041, 0x5044, 0x5045, 0x5050, 0x5051, 0x5054, 0x5055, 0x5100, 0x5101, 0x5104, 0x5105, 0x5110, 0x5111, 0x5114, 0x5115, 0x5140, 0x5141, 0x5144, 0x5145, 0x5150, 0x5151, 0x5154, 0x5155, 0x5400, 0x5401, 0x5404, 0x5405, 0x5410, 0x5411, 0x5414, 0x5415, 0x5440, 0x5441, 0x5444, 0x5445, 0x5450, 0x5451, 0x5454, 0x5455, 0x5500, 0x5501, 0x5504, 0x5505, 0x5510, 0x5511, 0x5514, 0x5515, 0x5540, 0x5541, 0x5544, 0x5545, 0x5550, 0x5551, 0x5554, 0x5555 }; uint32_t z = MortonTable256[y >> 8] << 17 | MortonTable256[x >> 8] << 16 | MortonTable256[y & 0xFF] << 1 | MortonTable256[x & 0xFF]; return z; } template inline batch_bool compare_int_avx512f(batch const& self, batch const& other) noexcept { using register_type = typename batch_bool::register_type; if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { // shifting to take sign into account uint64_t mask_low0 = _mm512_cmp_epi32_mask((batch(self.data) & batch(0x000000FF)) << 24, (batch(other.data) & batch(0x000000FF)) << 24, Cmp); uint64_t mask_low1 = _mm512_cmp_epi32_mask((batch(self.data) & batch(0x0000FF00)) << 16, (batch(other.data) & batch(0x0000FF00)) << 16, Cmp); uint64_t mask_high0 = _mm512_cmp_epi32_mask((batch(self.data) & batch(0x00FF0000)) << 8, (batch(other.data) & batch(0x00FF0000)) << 8, Cmp); uint64_t mask_high1 = _mm512_cmp_epi32_mask((batch(self.data) & batch(0xFF000000)), (batch(other.data) & batch(0xFF000000)), Cmp); uint64_t mask = 0; for (unsigned i = 0; i < 16; ++i) { mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0); mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1); mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2); mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3); } return (register_type)mask; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { // shifting to take sign into account uint16_t mask_low = _mm512_cmp_epi32_mask((batch(self.data) & batch(0x0000FFFF)) << 16, (batch(other.data) & batch(0x0000FFFF)) << 16, Cmp); uint16_t mask_high = _mm512_cmp_epi32_mask((batch(self.data) & batch(0xFFFF0000)), (batch(other.data) & batch(0xFFFF0000)), Cmp); return static_cast(morton(mask_low, mask_high)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return (register_type)_mm512_cmp_epi32_mask(self, other, Cmp); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return (register_type)_mm512_cmp_epi64_mask(self, other, Cmp); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { uint64_t mask_low0 = _mm512_cmp_epu32_mask((batch(self.data) & batch(0x000000FF)), (batch(other.data) & batch(0x000000FF)), Cmp); uint64_t mask_low1 = _mm512_cmp_epu32_mask((batch(self.data) & batch(0x0000FF00)), (batch(other.data) & batch(0x0000FF00)), Cmp); uint64_t mask_high0 = _mm512_cmp_epu32_mask((batch(self.data) & batch(0x00FF0000)), (batch(other.data) & batch(0x00FF0000)), Cmp); uint64_t mask_high1 = _mm512_cmp_epu32_mask((batch(self.data) & batch(0xFF000000)), (batch(other.data) & batch(0xFF000000)), Cmp); uint64_t mask = 0; for (unsigned i = 0; i < 16; ++i) { mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0); mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1); mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2); mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3); } return (register_type)mask; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { uint16_t mask_low = _mm512_cmp_epu32_mask((batch(self.data) & batch(0x0000FFFF)), (batch(other.data) & batch(0x0000FFFF)), Cmp); uint16_t mask_high = _mm512_cmp_epu32_mask((batch(self.data) & batch(0xFFFF0000)), (batch(other.data) & batch(0xFFFF0000)), Cmp); return static_cast(morton(mask_low, mask_high)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return (register_type)_mm512_cmp_epu32_mask(self, other, Cmp); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return (register_type)_mm512_cmp_epu64_mask(self, other, Cmp); } } } } // abs template inline batch abs(batch const& self, requires_arch) noexcept { __m512 self_asf = (__m512)self; __m512i self_asi = *reinterpret_cast<__m512i*>(&self_asf); __m512i res_asi = _mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF), self_asi); return *reinterpret_cast<__m512*>(&res_asi); } template inline batch abs(batch const& self, requires_arch) noexcept { __m512d self_asd = (__m512d)self; __m512i self_asi = *reinterpret_cast<__m512i*>(&self_asd); __m512i res_asi = _mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF), self_asi); return *reinterpret_cast<__m512d*>(&res_asi); } template ::value, void>::type> inline batch abs(batch const& self, requires_arch) noexcept { if (std::is_unsigned::value) { return self; } XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return detail::fwd_to_avx([](__m256i s) noexcept { return abs(batch(s)); }, self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return detail::fwd_to_avx([](__m256i s) noexcept { return abs(batch(s)); }, self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_abs_epi32(self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_abs_epi64(self); } else { assert(false && "unsupported arch/op combination"); return {}; } } // add template ::value, void>::type> inline batch add(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept { return add(batch(s), batch(o)); }, self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept { return add(batch(s), batch(o)); }, self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_add_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_add_epi64(self, other); } else { assert(false && "unsupported arch/op combination"); return {}; } } template inline batch add(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_add_ps(self, other); } template inline batch add(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_add_pd(self, other); } // all template inline bool all(batch_bool const& self, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return self.data == register_type(-1); } // any template inline bool any(batch_bool const& self, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return self.data != register_type(0); } // batch_bool_cast template inline batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch) noexcept { return self.data; } // bitwise_and template inline batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { #if defined(_MSC_VER) return _mm512_and_ps(self, other); #else return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(self), _mm512_castps_si512(other))); #endif } template inline batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other))); } template ::value, void>::type> inline batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_and_si512(self, other); } template inline batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return register_type(self.data & other.data); } // bitwise_andnot template inline batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_castsi512_ps(_mm512_andnot_si512(_mm512_castps_si512(other), _mm512_castps_si512(self))); } template inline batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_castsi512_pd(_mm512_andnot_si512(_mm512_castpd_si512(other), _mm512_castpd_si512(self))); } template ::value, void>::type> inline batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_andnot_si512(other, self); } template inline batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return register_type(self.data & ~other.data); } // bitwise_lshift template ::value, void>::type> inline batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) __m512i tmp = _mm512_sllv_epi32(self, _mm512_set1_epi32(other)); #else __m512i tmp = _mm512_slli_epi32(self, other); #endif return _mm512_and_si512(_mm512_set1_epi8(0xFF << other), tmp); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return detail::fwd_to_avx([](__m256i s, int32_t o) noexcept { return bitwise_lshift(batch(s), o, avx2 {}); }, self, other); #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_sllv_epi32(self, _mm512_set1_epi32(other)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_sllv_epi64(self, _mm512_set1_epi64(other)); #else } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_slli_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_slli_epi64(self, other); #endif } else { assert(false && "unsupported arch/op combination"); return {}; } } // bitwise_not template ::value, void>::type> inline batch bitwise_not(batch const& self, requires_arch) noexcept { return _mm512_xor_si512(self, _mm512_set1_epi32(-1)); } template inline batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return register_type(~self.data); } template inline batch bitwise_not(batch const& self, requires_arch) noexcept { return _mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(self), _mm512_set1_epi32(-1))); } template inline batch bitwise_not(batch const& self, requires_arch) noexcept { return _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(self), _mm512_set1_epi32(-1))); } // bitwise_or template inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_castsi512_ps(_mm512_or_si512(_mm512_castps_si512(self), _mm512_castps_si512(other))); } template inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_castsi512_pd(_mm512_or_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other))); } template inline batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return register_type(self.data | other.data); } template ::value, void>::type> inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_or_si512(self, other); } // bitwise_rshift template ::value, void>::type> inline batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept { if (std::is_signed::value) { #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_srav_epi32(self, _mm512_set1_epi32(other)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_srav_epi64(self, _mm512_set1_epi64(other)); #else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_srai_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_srai_epi64(self, other); #endif } else { return detail::fwd_to_avx([](__m256i s, int32_t o) noexcept { return bitwise_rshift(batch(s), o, avx2 {}); }, self, other); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) __m512i tmp = _mm512_srlv_epi32(self, _mm512_set1_epi32(other)); #else __m512i tmp = _mm512_srli_epi32(self, other); #endif return _mm512_and_si512(_mm512_set1_epi8(0xFF >> other), tmp); #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_srlv_epi32(self, _mm512_set1_epi32(other)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_srlv_epi64(self, _mm512_set1_epi64(other)); #else } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_srli_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_srli_epi64(self, other); #endif } else { return detail::fwd_to_avx([](__m256i s, int32_t o) noexcept { return bitwise_rshift(batch(s), o, avx2 {}); }, self, other); } } } // bitwise_xor template inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(self), _mm512_castps_si512(other))); } template inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other))); } template inline batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return register_type(self.data | other.data); } template ::value, void>::type> inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_xor_si512(self, other); } // bitwise_cast template ::value, void>::type> inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm512_castsi512_ps(self); } template ::value, void>::type> inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm512_castsi512_pd(self); } template ::type>::value, void>::type> inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return batch(self.data); } template inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm512_castps_pd(self); } template ::value, void>::type> inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm512_castps_si512(self); } template inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm512_castpd_ps(self); } template ::value, void>::type> inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm512_castpd_si512(self); } // broadcast template ::value, void>::type> inline batch broadcast(T val, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_set1_epi8(val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_set1_epi16(val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_set1_epi32(val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_set1_epi64(val); } else { assert(false && "unsupported"); return {}; } } template inline batch broadcast(float val, requires_arch) noexcept { return _mm512_set1_ps(val); } template batch inline broadcast(double val, requires_arch) noexcept { return _mm512_set1_pd(val); } // ceil template inline batch ceil(batch const& self, requires_arch) noexcept { return _mm512_roundscale_ps(self, _MM_FROUND_TO_POS_INF); } template inline batch ceil(batch const& self, requires_arch) noexcept { return _mm512_roundscale_pd(self, _MM_FROUND_TO_POS_INF); } // compress template inline batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_compress_ps(mask.mask(), self); } template inline batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_compress_pd(mask.mask(), self); } template inline batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_compress_epi32(mask.mask(), self); } template inline batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_compress_epi32(mask.mask(), self); } template inline batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_compress_epi64(mask.mask(), self); } template inline batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_compress_epi64(mask.mask(), self); } // convert namespace detail { template inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm512_cvtepi32_ps(self); } template inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm512_cvttps_epi32(self); } template inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm512_cvtepu32_ps(self); } template batch fast_cast(batch const& self, batch const&, requires_arch) { return _mm512_cvttps_epu32(self); } } namespace detail { // complex_low template inline batch complex_low(batch, A> const& self, requires_arch) noexcept { __m512i idx = _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); return _mm512_permutex2var_ps(self.real(), idx, self.imag()); } template inline batch complex_low(batch, A> const& self, requires_arch) noexcept { __m512i idx = _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11); return _mm512_permutex2var_pd(self.real(), idx, self.imag()); } // complex_high template inline batch complex_high(batch, A> const& self, requires_arch) noexcept { __m512i idx = _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); return _mm512_permutex2var_ps(self.real(), idx, self.imag()); } template inline batch complex_high(batch, A> const& self, requires_arch) noexcept { __m512i idx = _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15); return _mm512_permutex2var_pd(self.real(), idx, self.imag()); } } // div template inline batch div(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_div_ps(self, other); } template inline batch div(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_div_pd(self, other); } // eq template inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_cmp_ps_mask(self, other, _CMP_EQ_OQ); } template inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_cmp_pd_mask(self, other, _CMP_EQ_OQ); } template ::value, void>::type> inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { return detail::compare_int_avx512f(self, other); } template inline batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return register_type(~self.data ^ other.data); } // expand template inline batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_expand_ps(mask.mask(), self); } template inline batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_expand_pd(mask.mask(), self); } template inline batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_expand_epi32(mask.mask(), self); } template inline batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_expand_epi32(mask.mask(), self); } template inline batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_expand_epi64(mask.mask(), self); } template inline batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_expand_epi64(mask.mask(), self); } // floor template inline batch floor(batch const& self, requires_arch) noexcept { return _mm512_roundscale_ps(self, _MM_FROUND_TO_NEG_INF); } template inline batch floor(batch const& self, requires_arch) noexcept { return _mm512_roundscale_pd(self, _MM_FROUND_TO_NEG_INF); } // fnma template inline batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return _mm512_fnmadd_ps(x, y, z); } template inline batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return _mm512_fnmadd_pd(x, y, z); } // fma template inline batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return _mm512_fmadd_ps(x, y, z); } template inline batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return _mm512_fmadd_pd(x, y, z); } // fms template inline batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return _mm512_fmsub_ps(x, y, z); } template inline batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return _mm512_fmsub_pd(x, y, z); } // from bool template inline batch from_bool(batch_bool const& self, requires_arch) noexcept { return select(self, batch(1), batch(0)); } // from_mask template inline batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept { return static_cast::register_type>(mask); } // gather template = 0, detail::enable_sized_integral_t = 0> inline batch gather(batch const&, T const* src, batch const& index, kernel::requires_arch) noexcept { return _mm512_i32gather_epi32(index, static_cast(src), sizeof(T)); } template = 0, detail::enable_sized_integral_t = 0> inline batch gather(batch const&, T const* src, batch const& index, kernel::requires_arch) noexcept { return _mm512_i64gather_epi64(index, static_cast(src), sizeof(T)); } template = 0> inline batch gather(batch const&, float const* src, batch const& index, kernel::requires_arch) noexcept { return _mm512_i32gather_ps(index, src, sizeof(float)); } template = 0> inline batch gather(batch const&, double const* src, batch const& index, kernel::requires_arch) noexcept { return _mm512_i64gather_pd(index, src, sizeof(double)); } // gather: handmade conversions template = 0> inline batch gather(batch const&, double const* src, batch const& index, requires_arch) noexcept { const batch low(_mm512_i32gather_pd(_mm512_castsi512_si256(index.data), src, sizeof(double))); const batch high(_mm512_i32gather_pd(_mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castsi512_pd(index.data), 1)), src, sizeof(double))); return detail::merge_avx(_mm512_cvtpd_ps(low.data), _mm512_cvtpd_ps(high.data)); } template = 0> inline batch gather(batch const&, double const* src, batch const& index, requires_arch) noexcept { const batch low(_mm512_i32gather_pd(_mm512_castsi512_si256(index.data), src, sizeof(double))); const batch high(_mm512_i32gather_pd(_mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castsi512_pd(index.data), 1)), src, sizeof(double))); return detail::merge_avx(_mm512_cvtpd_epi32(low.data), _mm512_cvtpd_epi32(high.data)); } // ge template inline batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_cmp_ps_mask(self, other, _CMP_GE_OQ); } template inline batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_cmp_pd_mask(self, other, _CMP_GE_OQ); } template ::value, void>::type> inline batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept { return detail::compare_int_avx512f(self, other); } // gt template inline batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_cmp_ps_mask(self, other, _CMP_GT_OQ); } template inline batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_cmp_pd_mask(self, other, _CMP_GT_OQ); } template ::value, void>::type> inline batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { return detail::compare_int_avx512f(self, other); } // haddp template inline batch haddp(batch const* row, requires_arch) noexcept { // The following folds over the vector once: // tmp1 = [a0..8, b0..8] // tmp2 = [a8..f, b8..f] #define XSIMD_AVX512_HADDP_STEP1(I, a, b) \ batch res##I; \ { \ auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \ auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \ res##I = _mm512_add_ps(tmp1, tmp2); \ } XSIMD_AVX512_HADDP_STEP1(0, row[0], row[2]); XSIMD_AVX512_HADDP_STEP1(1, row[4], row[6]); XSIMD_AVX512_HADDP_STEP1(2, row[1], row[3]); XSIMD_AVX512_HADDP_STEP1(3, row[5], row[7]); XSIMD_AVX512_HADDP_STEP1(4, row[8], row[10]); XSIMD_AVX512_HADDP_STEP1(5, row[12], row[14]); XSIMD_AVX512_HADDP_STEP1(6, row[9], row[11]); XSIMD_AVX512_HADDP_STEP1(7, row[13], row[15]); #undef XSIMD_AVX512_HADDP_STEP1 // The following flds the code and shuffles so that hadd_ps produces the correct result // tmp1 = [a0..4, a8..12, b0..4, b8..12] (same for tmp3) // tmp2 = [a5..8, a12..16, b5..8, b12..16] (same for tmp4) // tmp5 = [r1[0], r1[2], r2[0], r2[2], r1[4], r1[6] ... #define XSIMD_AVX512_HADDP_STEP2(I, a, b, c, d) \ batch halfx##I; \ { \ auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \ auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \ \ auto resx1 = _mm512_add_ps(tmp1, tmp2); \ \ auto tmp3 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \ auto tmp4 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \ \ auto resx2 = _mm512_add_ps(tmp3, tmp4); \ \ auto tmp5 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(2, 0, 2, 0)); \ auto tmp6 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(3, 1, 3, 1)); \ \ auto resx3 = _mm512_add_ps(tmp5, tmp6); \ \ halfx##I = _mm256_hadd_ps(_mm256_insertf128_ps(_mm256_castps128_ps256(_mm512_extractf32x4_ps(resx3, 0)), _mm512_extractf32x4_ps(resx3, 1), 1), \ _mm256_insertf128_ps(_mm256_castps128_ps256(_mm512_extractf32x4_ps(resx3, 2)), _mm512_extractf32x4_ps(resx3, 3), 1)); \ } XSIMD_AVX512_HADDP_STEP2(0, res0, res1, res2, res3); XSIMD_AVX512_HADDP_STEP2(1, res4, res5, res6, res7); #undef XSIMD_AVX512_HADDP_STEP2 auto concat = _mm512_castps256_ps512(halfx0); concat = _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(concat), _mm256_castps_pd(halfx1), 1)); return concat; } template inline batch haddp(batch const* row, requires_arch) noexcept { #define step1(I, a, b) \ batch res##I; \ { \ auto tmp1 = _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \ auto tmp2 = _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \ res##I = _mm512_add_pd(tmp1, tmp2); \ } step1(1, row[0], row[2]); step1(2, row[4], row[6]); step1(3, row[1], row[3]); step1(4, row[5], row[7]); #undef step1 auto tmp5 = _mm512_shuffle_f64x2(res1, res2, _MM_SHUFFLE(2, 0, 2, 0)); auto tmp6 = _mm512_shuffle_f64x2(res1, res2, _MM_SHUFFLE(3, 1, 3, 1)); auto resx1 = _mm512_add_pd(tmp5, tmp6); auto tmp7 = _mm512_shuffle_f64x2(res3, res4, _MM_SHUFFLE(2, 0, 2, 0)); auto tmp8 = _mm512_shuffle_f64x2(res3, res4, _MM_SHUFFLE(3, 1, 3, 1)); auto resx2 = _mm512_add_pd(tmp7, tmp8); auto tmpx = _mm512_shuffle_pd(resx1, resx2, 0b00000000); auto tmpy = _mm512_shuffle_pd(resx1, resx2, 0b11111111); return _mm512_add_pd(tmpx, tmpy); } // isnan template inline batch_bool isnan(batch const& self, requires_arch) noexcept { return _mm512_cmp_ps_mask(self, self, _CMP_UNORD_Q); } template inline batch_bool isnan(batch const& self, requires_arch) noexcept { return _mm512_cmp_pd_mask(self, self, _CMP_UNORD_Q); } // ldexp template inline batch ldexp(const batch& self, const batch, A>& other, requires_arch) noexcept { return _mm512_scalef_ps(self, _mm512_cvtepi32_ps(other)); } template inline batch ldexp(const batch& self, const batch, A>& other, requires_arch) noexcept { // FIXME: potential data loss here when converting other elements to // int32 before converting them back to double. __m512d adjusted_index = _mm512_cvtepi32_pd(_mm512_cvtepi64_epi32(other)); return _mm512_scalef_pd(self, adjusted_index); } // le template inline batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_cmp_ps_mask(self, other, _CMP_LE_OQ); } template inline batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_cmp_pd_mask(self, other, _CMP_LE_OQ); } template ::value, void>::type> inline batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { return detail::compare_int_avx512f(self, other); } // load_aligned template ::value, void>::type> inline batch load_aligned(T const* mem, convert, requires_arch) noexcept { return _mm512_load_si512((__m512i const*)mem); } template inline batch load_aligned(float const* mem, convert, requires_arch) noexcept { return _mm512_load_ps(mem); } template inline batch load_aligned(double const* mem, convert, requires_arch) noexcept { return _mm512_load_pd(mem); } // load_complex namespace detail { template inline batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept { __m512i real_idx = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30); __m512i imag_idx = _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31); auto real = _mm512_permutex2var_ps(hi, real_idx, lo); auto imag = _mm512_permutex2var_ps(hi, imag_idx, lo); return { real, imag }; } template inline batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept { __m512i real_idx = _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14); __m512i imag_idx = _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15); auto real = _mm512_permutex2var_pd(hi, real_idx, lo); auto imag = _mm512_permutex2var_pd(hi, imag_idx, lo); return { real, imag }; } } // load_unaligned template ::value, void>::type> inline batch load_unaligned(T const* mem, convert, requires_arch) noexcept { return _mm512_loadu_si512((__m512i const*)mem); } template inline batch load_unaligned(float const* mem, convert, requires_arch) noexcept { return _mm512_loadu_ps(mem); } template inline batch load_unaligned(double const* mem, convert, requires_arch) noexcept { return _mm512_loadu_pd(mem); } // lt template inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_cmp_ps_mask(self, other, _CMP_LT_OQ); } template inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_cmp_pd_mask(self, other, _CMP_LT_OQ); } template ::value, void>::type> inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return detail::compare_int_avx512f(self, other); } // mask template inline uint64_t mask(batch_bool const& self, requires_arch) noexcept { return self.data; } // max template inline batch max(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_max_ps(self, other); } template inline batch max(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_max_pd(self, other); } template ::value, void>::type> inline batch max(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_max_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_max_epi64(self, other); } else { return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept { return max(batch(s), batch(o)); }, self, other); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_max_epu32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_max_epu64(self, other); } else { return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept { return max(batch(s), batch(o)); }, self, other); } } } // min template inline batch min(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_min_ps(self, other); } template inline batch min(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_min_pd(self, other); } template ::value, void>::type> inline batch min(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_min_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_min_epi64(self, other); } else { return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept { return min(batch(s), batch(o)); }, self, other); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_min_epu32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_min_epu64(self, other); } else { return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept { return min(batch(s), batch(o)); }, self, other); } } } // mul template inline batch mul(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_mul_ps(self, other); } template inline batch mul(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_mul_pd(self, other); } template ::value, void>::type> inline batch mul(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_mullo_epi32(self, other); } else { return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept { return mul(batch(s), batch(o)); }, self, other); } } // nearbyint template inline batch nearbyint(batch const& self, requires_arch) noexcept { return _mm512_roundscale_round_ps(self, _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_CUR_DIRECTION); } template inline batch nearbyint(batch const& self, requires_arch) noexcept { return _mm512_roundscale_round_pd(self, _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_CUR_DIRECTION); } // nearbyint_as_int template inline batch nearbyint_as_int(batch const& self, requires_arch) noexcept { return _mm512_cvtps_epi32(self); } // neg template inline batch neg(batch const& self, requires_arch) noexcept { return 0 - self; } // neq template inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_cmp_ps_mask(self, other, _CMP_NEQ_UQ); } template inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_cmp_pd_mask(self, other, _CMP_NEQ_UQ); } template ::value, void>::type> inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return ~(self == other); } template inline batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return register_type(self.data ^ other.data); } // reciprocal template inline batch reciprocal(batch const& self, kernel::requires_arch) noexcept { return _mm512_rcp14_ps(self); } template inline batch reciprocal(batch const& self, kernel::requires_arch) noexcept { return _mm512_rcp14_pd(self); } // reduce_add template inline float reduce_add(batch const& rhs, requires_arch) noexcept { __m128 tmp1 = _mm512_extractf32x4_ps(rhs, 0); __m128 tmp2 = _mm512_extractf32x4_ps(rhs, 1); __m128 tmp3 = _mm512_extractf32x4_ps(rhs, 2); __m128 tmp4 = _mm512_extractf32x4_ps(rhs, 3); __m128 res1 = _mm_add_ps(tmp1, tmp2); __m128 res2 = _mm_add_ps(tmp3, tmp4); __m128 res3 = _mm_add_ps(res1, res2); return reduce_add(batch(res3), sse4_2 {}); } template inline double reduce_add(batch const& rhs, requires_arch) noexcept { __m256d tmp1 = _mm512_extractf64x4_pd(rhs, 1); __m256d tmp2 = _mm512_extractf64x4_pd(rhs, 0); __m256d res1 = _mm256_add_pd(tmp1, tmp2); return reduce_add(batch(res1), avx2 {}); } template ::value, void>::type> inline T reduce_add(batch const& self, requires_arch) noexcept { __m256i low, high; detail::split_avx512(self, low, high); batch blow(low), bhigh(high); return reduce_add(blow, avx2 {}) + reduce_add(bhigh, avx2 {}); } // reduce_max template ::type> inline T reduce_max(batch const& self, requires_arch) noexcept { constexpr batch_constant, 5, 6, 7, 8, 0, 0, 0, 0> mask; batch step = _mm512_permutexvar_epi64((batch)mask, self); batch acc = max(self, step); __m256i low = _mm512_castsi512_si256(acc); return reduce_max(batch(low)); } // reduce_min template ::type> inline T reduce_min(batch const& self, requires_arch) noexcept { constexpr batch_constant, 5, 6, 7, 8, 0, 0, 0, 0> mask; batch step = _mm512_permutexvar_epi64((batch)mask, self); batch acc = min(self, step); __m256i low = _mm512_castsi512_si256(acc); return reduce_min(batch(low)); } // rsqrt template inline batch rsqrt(batch const& val, requires_arch) noexcept { return _mm512_rsqrt14_ps(val); } template inline batch rsqrt(batch const& val, requires_arch) noexcept { return _mm512_rsqrt14_pd(val); } // sadd template ::value, void>::type> inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { auto mask = other < 0; auto self_pos_branch = min(std::numeric_limits::max() - other, self); auto self_neg_branch = max(std::numeric_limits::min() - other, self); return other + select(mask, self_neg_branch, self_pos_branch); } else { const auto diffmax = std::numeric_limits::max() - self; const auto mindiff = min(diffmax, other); return self + mindiff; } } // scatter template ::value || std::is_same::value, void>::type> inline void scatter(batch const& src, T* dst, batch const& index, kernel::requires_arch) noexcept { _mm512_i32scatter_epi32(dst, index, src, sizeof(T)); } template ::value || std::is_same::value, void>::type> inline void scatter(batch const& src, T* dst, batch const& index, kernel::requires_arch) noexcept { _mm512_i64scatter_epi64(dst, index, src, sizeof(T)); } template inline void scatter(batch const& src, float* dst, batch const& index, kernel::requires_arch) noexcept { _mm512_i32scatter_ps(dst, index, src, sizeof(float)); } template inline void scatter(batch const& src, double* dst, batch const& index, kernel::requires_arch) noexcept { _mm512_i64scatter_pd(dst, index, src, sizeof(double)); } // select template inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { return _mm512_mask_blend_ps(cond, false_br, true_br); } template inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { return _mm512_mask_blend_pd(cond, false_br, true_br); } template ::value, void>::type> inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { alignas(avx2::alignment()) uint8_t buffer[64]; // FIXME: ultra inefficient for (int i = 0; i < 64; ++i) buffer[i] = cond.data & (1ull << i) ? 0xFF : 0; __m256i cond_low = batch::load_aligned(&buffer[0]); __m256i cond_hi = batch::load_aligned(&buffer[32]); __m256i true_low, true_hi; detail::split_avx512(true_br, true_low, true_hi); __m256i false_low, false_hi; detail::split_avx512(false_br, false_low, false_hi); __m256i res_low = select(batch_bool(cond_low), batch(true_low), batch(false_low), avx2 {}); __m256i res_hi = select(batch_bool(cond_hi), batch(true_hi), batch(false_hi), avx2 {}); return detail::merge_avx(res_low, res_hi); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { __m256i cond_low = _mm512_maskz_cvtepi32_epi16((uint64_t)cond.data & 0xFFFF, _mm512_set1_epi32(~0)); __m256i cond_hi = _mm512_maskz_cvtepi32_epi16((uint64_t)cond.data >> 16, _mm512_set1_epi32(~0)); __m256i true_low, true_hi; detail::split_avx512(true_br, true_low, true_hi); __m256i false_low, false_hi; detail::split_avx512(false_br, false_low, false_hi); __m256i res_low = select(batch_bool(cond_low), batch(true_low), batch(false_low), avx2 {}); __m256i res_hi = select(batch_bool(cond_hi), batch(true_hi), batch(false_hi), avx2 {}); return detail::merge_avx(res_low, res_hi); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_mask_blend_epi32(cond, false_br, true_br); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_mask_blend_epi64(cond, false_br, true_br); } else { assert(false && "unsupported arch/type combination"); return {}; } } template ::value, void>::type> inline batch select(batch_bool_constant, Values...> const&, batch const& true_br, batch const& false_br, requires_arch) noexcept { return select(batch_bool { Values... }, true_br, false_br, avx512f {}); } namespace detail { template using enable_signed_integer_t = typename std::enable_if::value && std::is_signed::value, int>::type; template using enable_unsigned_integer_t = typename std::enable_if::value && std::is_unsigned::value, int>::type; } // set template inline batch set(batch const&, requires_arch, float v0, float v1, float v2, float v3, float v4, float v5, float v6, float v7, float v8, float v9, float v10, float v11, float v12, float v13, float v14, float v15) noexcept { return _mm512_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); } template inline batch set(batch const&, requires_arch, double v0, double v1, double v2, double v3, double v4, double v5, double v6, double v7) noexcept { return _mm512_setr_pd(v0, v1, v2, v3, v4, v5, v6, v7); } template ::value, void>::type> inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept { return _mm512_set_epi64(v7, v6, v5, v4, v3, v2, v1, v0); } template ::value, void>::type> inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept { return _mm512_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); } template = 0> inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept { #if defined(__clang__) || __GNUC__ return __extension__(__m512i)(__v32hi) { v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 }; #else return _mm512_set_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31); #endif } template = 0> inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept { #if defined(__clang__) || __GNUC__ return __extension__(__m512i)(__v32hu) { v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 }; #else return _mm512_set_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31); #endif } template = 0> inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31, T v32, T v33, T v34, T v35, T v36, T v37, T v38, T v39, T v40, T v41, T v42, T v43, T v44, T v45, T v46, T v47, T v48, T v49, T v50, T v51, T v52, T v53, T v54, T v55, T v56, T v57, T v58, T v59, T v60, T v61, T v62, T v63) noexcept { #if defined(__clang__) || __GNUC__ return __extension__(__m512i)(__v64qi) { v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63 }; #else return _mm512_set_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63); #endif } template = 0> inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31, T v32, T v33, T v34, T v35, T v36, T v37, T v38, T v39, T v40, T v41, T v42, T v43, T v44, T v45, T v46, T v47, T v48, T v49, T v50, T v51, T v52, T v53, T v54, T v55, T v56, T v57, T v58, T v59, T v60, T v61, T v62, T v63) noexcept { #if defined(__clang__) || __GNUC__ return __extension__(__m512i)(__v64qu) { v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63 }; #else return _mm512_set_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63); #endif } template inline batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept { static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); using register_type = typename batch_bool::register_type; register_type r = 0; unsigned shift = 0; (void)std::initializer_list { (r |= register_type(values ? 1 : 0) << (shift++))... }; return r; } // shuffle template inline batch shuffle(batch const& x, batch const& y, batch_constant, I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15> mask, requires_arch) noexcept { constexpr uint32_t smask = (I0 & 0x3) | ((I1 & 0x3) << 2) | ((I2 & 0x3) << 4) | ((I3 & 0x3) << 6); // shuffle within lane if ((I4 == I0 + 4) && (I5 == I1 + 4) && (I6 == I2 + 4) && (I7 == I3 + 4) && (I8 == I0 + 8) && (I9 == I1 + 8) && (I10 == I2 + 8) && (I11 == I3 + 8) && (I12 == I0 + 12) && (I13 == I1 + 12) && (I14 == I2 + 12) && (I15 == I3 + 12) && I0 < 4 && I1 < 4 && I2 >= 16 && I2 < 20 && I3 >= 16 && I3 < 20) return _mm512_shuffle_ps(x, y, smask); // shuffle within opposite lane if ((I4 == I0 + 4) && (I5 == I1 + 4) && (I6 == I2 + 4) && (I7 == I3 + 4) && (I8 == I0 + 8) && (I9 == I1 + 8) && (I10 == I2 + 8) && (I11 == I3 + 8) && (I12 == I0 + 12) && (I13 == I1 + 12) && (I14 == I2 + 12) && (I15 == I3 + 12) && I2 < 4 && I3 < 4 && I0 >= 16 && I0 < 20 && I1 >= 16 && I1 < 20) return _mm512_shuffle_ps(y, x, smask); return shuffle(x, y, mask, generic {}); } template inline batch shuffle(batch const& x, batch const& y, batch_constant, I0, I1, I2, I3, I4, I5, I6, I7> mask, requires_arch) noexcept { constexpr uint32_t smask = (I0 & 0x1) | ((I1 & 0x1) << 1) | ((I2 & 0x1) << 2) | ((I3 & 0x1) << 3) | ((I4 & 0x1) << 4) | ((I5 & 0x1) << 5) | ((I6 & 0x1) << 6) | ((I7 & 0x1) << 7); // shuffle within lane if (I0 < 2 && I1 >= 8 && I1 < 10 && I2 >= 2 && I2 < 4 && I3 >= 10 && I3 < 12 && I4 >= 4 && I4 < 6 && I5 >= 12 && I5 < 14 && I6 >= 6 && I6 < 8 && I7 >= 14) return _mm512_shuffle_pd(x, y, smask); // shuffle within opposite lane if (I1 < 2 && I0 >= 8 && I0 < 10 && I3 >= 2 && I3 < 4 && I2 >= 10 && I2 < 12 && I5 >= 4 && I5 < 6 && I4 >= 12 && I4 < 14 && I7 >= 6 && I7 < 8 && I6 >= 14) return _mm512_shuffle_pd(y, x, smask); return shuffle(x, y, mask, generic {}); } // slide_left template inline batch slide_left(batch const&, requires_arch) noexcept { static_assert(N == 0xDEAD, "not implemented yet"); return {}; } // slide_right template inline batch slide_right(batch const&, requires_arch) noexcept { static_assert(N == 0xDEAD, "not implemented yet"); return {}; } // sqrt template inline batch sqrt(batch const& val, requires_arch) noexcept { return _mm512_sqrt_ps(val); } template inline batch sqrt(batch const& val, requires_arch) noexcept { return _mm512_sqrt_pd(val); } // ssub template ::value, void>::type> inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { return sadd(self, -other); } else { const auto diff = min(self, other); return self - diff; } } // store template inline void store(batch_bool const& self, bool* mem, requires_arch) noexcept { using register_type = typename batch_bool::register_type; constexpr auto size = batch_bool::size; for (std::size_t i = 0; i < size; ++i) mem[i] = self.data & (register_type(1) << i); } // store_aligned template ::value, void>::type> inline void store_aligned(T* mem, batch const& self, requires_arch) noexcept { return _mm512_store_si512((__m512i*)mem, self); } template ::value, void>::type> inline void store_aligned(T* mem, batch_bool const& self, requires_arch) noexcept { return _mm512_store_si512((__m512i*)mem, self); } template inline void store_aligned(float* mem, batch const& self, requires_arch) noexcept { return _mm512_store_ps(mem, self); } template inline void store_aligned(double* mem, batch const& self, requires_arch) noexcept { return _mm512_store_pd(mem, self); } // store_unaligned template ::value, void>::type> inline void store_unaligned(T* mem, batch const& self, requires_arch) noexcept { return _mm512_storeu_si512((__m512i*)mem, self); } template ::value, void>::type> inline void store_unaligned(T* mem, batch_bool const& self, requires_arch) noexcept { return _mm512_storeu_si512((__m512i*)mem, self); } template inline void store_unaligned(float* mem, batch const& self, requires_arch) noexcept { return _mm512_storeu_ps(mem, self); } template inline void store_unaligned(double* mem, batch const& self, requires_arch) noexcept { return _mm512_storeu_pd(mem, self); } // sub template ::value, void>::type> inline batch sub(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept { return sub(batch(s), batch(o)); }, self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept { return sub(batch(s), batch(o)); }, self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_sub_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_sub_epi64(self, other); } else { assert(false && "unsupported arch/op combination"); return {}; } } template inline batch sub(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_sub_ps(self, other); } template inline batch sub(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_sub_pd(self, other); } // swizzle (dynamic version) template inline batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return _mm512_permutexvar_ps(mask, self); } template inline batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return _mm512_permutexvar_pd(mask, self); } template inline batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return _mm512_permutexvar_epi64(mask, self); } template inline batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, avx512f {})); } template inline batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return _mm512_permutexvar_epi32(mask, self); } template inline batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, avx512f {})); } // swizzle (constant version) template inline batch swizzle(batch const& self, batch_constant, Vs...> mask, requires_arch) noexcept { return swizzle(self, (batch)mask, avx512f {}); } template inline batch swizzle(batch const& self, batch_constant, Vs...> mask, requires_arch) noexcept { return swizzle(self, (batch)mask, avx512f {}); } template inline batch swizzle(batch const& self, batch_constant, Vs...> mask, requires_arch) noexcept { return swizzle(self, (batch)mask, avx512f {}); } template inline batch swizzle(batch const& self, batch_constant, Vs...> mask, requires_arch) noexcept { return swizzle(self, (batch)mask, avx512f {}); } template inline batch swizzle(batch const& self, batch_constant, Vs...> mask, requires_arch) noexcept { return swizzle(self, (batch)mask, avx512f {}); } template inline batch swizzle(batch const& self, batch_constant, Vs...> mask, requires_arch) noexcept { return swizzle(self, (batch)mask, avx512f {}); } namespace detail { template struct is_pair_of_contiguous_indices; template struct is_pair_of_contiguous_indices : std::true_type { }; template struct is_pair_of_contiguous_indices : std::conditional<(Idx0 % 2 == 0) && (Idx0 + 1 == Idx1), is_pair_of_contiguous_indices, std::false_type>::type { }; template struct fold_batch_constant { using type = batch_constant, I0 / 2, I2 / 2, I4 / 2, I6 / 2, I8 / 2, I10 / 2, I12 / 2, I14 / 2, I16 / 2, I18 / 2, I20 / 2, I22 / 2, I24 / 2, I26 / 2, I28 / 2, I30 / 2>; }; } template ::value, void>::type> inline batch swizzle(batch const& self, batch_constant, Idx...>, requires_arch) noexcept { constexpr typename detail::fold_batch_constant::type mask32; return _mm512_permutexvar_epi32(static_cast>(mask32), self); } template inline batch swizzle(batch const& self, batch_constant, (uint16_t)1, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1>, requires_arch) noexcept { // FIXME: this sequence is very inefficient, but it's here to catch // a pattern generated by detail::reduce from xsimd_generic_math.hpp. // The whole pattern is actually decently folded by GCC and Clang, // so bare with it. constexpr batch_constant, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0> mask32; auto tmp = _mm512_permutexvar_epi32(static_cast>(mask32), self); alignas(A::alignment()) uint16_t buffer[32]; _mm512_store_si512((__m512i*)&buffer[0], tmp); buffer[0] = buffer[1]; return _mm512_load_si512(&buffer[0]); } template inline batch swizzle(batch const& self, batch_constant, Vs...> mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, avx512f {})); } // trunc template inline batch trunc(batch const& self, requires_arch) noexcept { return _mm512_roundscale_round_ps(self, _MM_FROUND_TO_ZERO, _MM_FROUND_CUR_DIRECTION); } template inline batch trunc(batch const& self, requires_arch) noexcept { return _mm512_roundscale_round_pd(self, _MM_FROUND_TO_ZERO, _MM_FROUND_CUR_DIRECTION); } // zip_hi template ::value, void>::type> inline batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { __m512i lo, hi; XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { assert(false && "not implemented yet"); return {}; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { assert(false && "not implemented yet"); return {}; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { lo = _mm512_unpacklo_epi32(self, other); hi = _mm512_unpackhi_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { lo = _mm512_unpacklo_epi64(self, other); hi = _mm512_unpackhi_epi64(self, other); } else { assert(false && "unsupported arch/op combination"); return {}; } return _mm512_inserti32x4( _mm512_inserti32x4( _mm512_inserti32x4(hi, _mm512_extracti32x4_epi32(lo, 2), 0), _mm512_extracti32x4_epi32(lo, 3), 2), _mm512_extracti32x4_epi32(hi, 2), 1); } template inline batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { auto lo = _mm512_unpacklo_ps(self, other); auto hi = _mm512_unpackhi_ps(self, other); return _mm512_insertf32x4( _mm512_insertf32x4( _mm512_insertf32x4(hi, _mm512_extractf32x4_ps(lo, 2), 0), _mm512_extractf32x4_ps(lo, 3), 2), _mm512_extractf32x4_ps(hi, 2), 1); } template inline batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { auto lo = _mm512_castpd_ps(_mm512_unpacklo_pd(self, other)); auto hi = _mm512_castpd_ps(_mm512_unpackhi_pd(self, other)); return _mm512_castps_pd(_mm512_insertf32x4( _mm512_insertf32x4( _mm512_insertf32x4(hi, _mm512_extractf32x4_ps(lo, 2), 0), _mm512_extractf32x4_ps(lo, 3), 2), _mm512_extractf32x4_ps(hi, 2), 1)); } // zip_lo template ::value, void>::type> inline batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { __m512i lo, hi; XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { assert(false && "not implemented yet"); return {}; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { assert(false && "not implemented yet"); return {}; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { lo = _mm512_unpacklo_epi32(self, other); hi = _mm512_unpackhi_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { lo = _mm512_unpacklo_epi64(self, other); hi = _mm512_unpackhi_epi64(self, other); } else { assert(false && "unsupported arch/op combination"); return {}; } return _mm512_inserti32x4( _mm512_inserti32x4( _mm512_inserti32x4(lo, _mm512_extracti32x4_epi32(hi, 0), 1), _mm512_extracti32x4_epi32(hi, 1), 3), _mm512_extracti32x4_epi32(lo, 1), 2); } template inline batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { auto lo = _mm512_unpacklo_ps(self, other); auto hi = _mm512_unpackhi_ps(self, other); return _mm512_insertf32x4( _mm512_insertf32x4( _mm512_insertf32x4(lo, _mm512_extractf32x4_ps(hi, 0), 1), _mm512_extractf32x4_ps(hi, 1), 3), _mm512_extractf32x4_ps(lo, 1), 2); } template inline batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { auto lo = _mm512_castpd_ps(_mm512_unpacklo_pd(self, other)); auto hi = _mm512_castpd_ps(_mm512_unpackhi_pd(self, other)); return _mm512_castps_pd(_mm512_insertf32x4( _mm512_insertf32x4( _mm512_insertf32x4(lo, _mm512_extractf32x4_ps(hi, 0), 1), _mm512_extractf32x4_ps(hi, 1), 3), _mm512_extractf32x4_ps(lo, 1), 2)); } } } #endif xsimd-12.1.1/include/xsimd/arch/xsimd_avx512ifma.hpp000066400000000000000000000016531453610362700222200ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX512VBMI_HPP #define XSIMD_AVX512VBMI_HPP #include #include #include "../types/xsimd_avx512vbmi_register.hpp" #endif xsimd-12.1.1/include/xsimd/arch/xsimd_avx512pf.hpp000066400000000000000000000016451453610362700217120ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX512PF_HPP #define XSIMD_AVX512PF_HPP #include #include #include "../types/xsimd_avx512pf_register.hpp" #endif xsimd-12.1.1/include/xsimd/arch/xsimd_avx512vbmi.hpp000066400000000000000000000016531453610362700222410ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX512VBMI_HPP #define XSIMD_AVX512VBMI_HPP #include #include #include "../types/xsimd_avx512vbmi_register.hpp" #endif xsimd-12.1.1/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp000066400000000000000000000017101453610362700237070ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX512VNNI_AVX512_BW_HPP #define XSIMD_AVX512VNNI_AVX512_BW_HPP #include #include #include "../types/xsimd_avx512vnni_avx512bw_register.hpp" #endif xsimd-12.1.1/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi.hpp000066400000000000000000000017141453610362700242400ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX512VNNI_AVX512VBMI_HPP #define XSIMD_AVX512VNNI_AVX512VBMI_HPP #include #include #include "../types/xsimd_avx512vnni_avx512vbmi_register.hpp" #endif xsimd-12.1.1/include/xsimd/arch/xsimd_avxvnni.hpp000066400000000000000000000016421453610362700220240ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVXVNNI_HPP #define XSIMD_AVXVNNI_HPP #include #include #include "../types/xsimd_avxvnni_register.hpp" #endif xsimd-12.1.1/include/xsimd/arch/xsimd_constants.hpp000066400000000000000000000332321453610362700223470ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_NUMERICAL_CONSTANT_HPP #define XSIMD_NUMERICAL_CONSTANT_HPP #include #include "../types/xsimd_utils.hpp" namespace xsimd { namespace constants { #define XSIMD_DEFINE_CONSTANT(NAME, SINGLE, DOUBLE) \ template \ inline T NAME() noexcept \ { \ return T(NAME()); \ } \ template <> \ inline float NAME() noexcept \ { \ return SINGLE; \ } \ template <> \ inline double NAME() noexcept \ { \ return DOUBLE; \ } #define XSIMD_DEFINE_CONSTANT_HEX(NAME, SINGLE, DOUBLE) \ template \ inline T NAME() noexcept \ { \ return T(NAME()); \ } \ template <> \ inline float NAME() noexcept \ { \ return bit_cast((uint32_t)SINGLE); \ } \ template <> \ inline double NAME() noexcept \ { \ return bit_cast((uint64_t)DOUBLE); \ } // Under fast-math, GCC might replace signmask (minus zero) by zero #if defined(__FAST_MATH__) && defined(__GNUC__) && !defined(__clang__) #pragma GCC push_options #pragma GCC optimize("signed-zeros") #endif XSIMD_DEFINE_CONSTANT(infinity, (std::numeric_limits::infinity()), (std::numeric_limits::infinity())) XSIMD_DEFINE_CONSTANT(invlog_2, 1.442695040888963407359924681001892137426645954152986f, 1.442695040888963407359924681001892137426645954152986) XSIMD_DEFINE_CONSTANT_HEX(invlog_2hi, 0x3fb8b000, 0x3ff7154765200000) XSIMD_DEFINE_CONSTANT_HEX(invlog_2lo, 0xb9389ad4, 0x3de705fc2eefa200) XSIMD_DEFINE_CONSTANT(invlog10_2, 3.32192809488736234787031942949f, 3.32192809488736234787031942949) XSIMD_DEFINE_CONSTANT_HEX(invpi, 0x3ea2f983, 0x3fd45f306dc9c883) XSIMD_DEFINE_CONSTANT(log_2, 0.6931471805599453094172321214581765680755001343602553f, 0.6931471805599453094172321214581765680755001343602553) XSIMD_DEFINE_CONSTANT_HEX(log_2hi, 0x3f318000, 0x3fe62e42fee00000) XSIMD_DEFINE_CONSTANT_HEX(log_2lo, 0xb95e8083, 0x3dea39ef35793c76) XSIMD_DEFINE_CONSTANT_HEX(log10_2hi, 0x3e9a0000, 0x3fd3440000000000) XSIMD_DEFINE_CONSTANT_HEX(log10_2lo, 0x39826a14, 0x3ed3509f79fef312) XSIMD_DEFINE_CONSTANT_HEX(logeps, 0xc17f1402, 0xc04205966f2b4f12) XSIMD_DEFINE_CONSTANT_HEX(logpi, 0x3f928682, 0x3ff250d048e7a1bd) XSIMD_DEFINE_CONSTANT_HEX(logsqrt2pi, 0x3f6b3f8e, 0x3fed67f1c864beb5) XSIMD_DEFINE_CONSTANT(maxflint, 16777216.0f, 9007199254740992.0) XSIMD_DEFINE_CONSTANT(maxlog, 88.3762626647949f, 709.78271289338400) XSIMD_DEFINE_CONSTANT(maxlog2, 127.0f, 1023.) XSIMD_DEFINE_CONSTANT(maxlog10, 38.23080825805664f, 308.2547155599167) XSIMD_DEFINE_CONSTANT_HEX(mediumpi, 0x43490fdb, 0x412921fb54442d18) XSIMD_DEFINE_CONSTANT(minlog, -88.3762626647949f, -708.3964185322641) XSIMD_DEFINE_CONSTANT(minlog2, -127.0f, -1023.) XSIMD_DEFINE_CONSTANT(minlog10, -37.89999771118164f, -308.2547155599167) XSIMD_DEFINE_CONSTANT(minusinfinity, (-infinity()), (-infinity())) XSIMD_DEFINE_CONSTANT_HEX(nan, 0xffffffff, 0xffffffffffffffff) XSIMD_DEFINE_CONSTANT_HEX(oneosqrteps, 0x453504f3, 0x4190000000000000) XSIMD_DEFINE_CONSTANT_HEX(oneotwoeps, 0x4a800000, 0x4320000000000000) XSIMD_DEFINE_CONSTANT_HEX(pi, 0x40490fdb, 0x400921fb54442d18) XSIMD_DEFINE_CONSTANT_HEX(pio_2lo, 0xb33bbd2e, 0x3c91a62633145c07) XSIMD_DEFINE_CONSTANT_HEX(pio_4lo, 0xb2bbbd2e, 0x3c81a62633145c07) XSIMD_DEFINE_CONSTANT_HEX(pio2, 0x3fc90fdb, 0x3ff921fb54442d18) XSIMD_DEFINE_CONSTANT_HEX(pio2_1, 0x3fc90f80, 0x3ff921fb54400000) XSIMD_DEFINE_CONSTANT_HEX(pio2_1t, 0x37354443, 0x3dd0b4611a626331) XSIMD_DEFINE_CONSTANT_HEX(pio2_2, 0x37354400, 0x3dd0b4611a600000) XSIMD_DEFINE_CONSTANT_HEX(pio2_2t, 0x2e85a308, 0x3ba3198a2e037073) XSIMD_DEFINE_CONSTANT_HEX(pio2_3, 0x2e85a300, 0x3ba3198a2e000000) XSIMD_DEFINE_CONSTANT_HEX(pio2_3t, 0x248d3132, 0x397b839a252049c1) XSIMD_DEFINE_CONSTANT_HEX(pio4, 0x3f490fdb, 0x3fe921fb54442d18) XSIMD_DEFINE_CONSTANT_HEX(signmask, 0x80000000, 0x8000000000000000) XSIMD_DEFINE_CONSTANT(smallestposval, std::numeric_limits::min(), std::numeric_limits::min()) XSIMD_DEFINE_CONSTANT_HEX(sqrt_2pi, 0x40206c99, 0x40040d931ff62704) XSIMD_DEFINE_CONSTANT_HEX(sqrteps, 0x39b504f3, 0x3e50000000000000) XSIMD_DEFINE_CONSTANT_HEX(tanpio8, 0x3ed413cd, 0x3fda827999fcef31) XSIMD_DEFINE_CONSTANT_HEX(tan3pio8, 0x401a827a, 0x4003504f333f9de6) XSIMD_DEFINE_CONSTANT_HEX(twentypi, 0x427b53d1, 0x404f6a7a2955385e) XSIMD_DEFINE_CONSTANT_HEX(twoopi, 0x3f22f983, 0x3fe45f306dc9c883) XSIMD_DEFINE_CONSTANT(twotonmb, 8388608.0f, 4503599627370496.0) XSIMD_DEFINE_CONSTANT_HEX(twotonmbo3, 0x3ba14518, 0x3ed428a2f98d7286) #if defined(__FAST_MATH__) && defined(__GNUC__) && !defined(__clang__) #pragma GCC pop_options #endif #undef XSIMD_DEFINE_CONSTANT #undef XSIMD_DEFINE_CONSTANT_HEX template constexpr T allbits() noexcept; template constexpr as_integer_t mask1frexp() noexcept; template constexpr as_integer_t mask2frexp() noexcept; template constexpr as_integer_t maxexponent() noexcept; template constexpr as_integer_t maxexponentm1() noexcept; template constexpr int32_t nmb() noexcept; template constexpr T zero() noexcept; template constexpr T minvalue() noexcept; template constexpr T maxvalue() noexcept; /************************** * allbits implementation * **************************/ namespace detail { template ::value> struct allbits_impl { static constexpr T get_value() noexcept { return T(~0); } }; template struct allbits_impl { static constexpr T get_value() noexcept { return nan(); } }; } template inline constexpr T allbits() noexcept { return T(detail::allbits_impl::get_value()); } /***************************** * mask1frexp implementation * *****************************/ template inline constexpr as_integer_t mask1frexp() noexcept { return as_integer_t(mask1frexp()); } template <> inline constexpr int32_t mask1frexp() noexcept { return 0x7f800000; } template <> inline constexpr int64_t mask1frexp() noexcept { return 0x7ff0000000000000; } /***************************** * mask2frexp implementation * *****************************/ template inline constexpr as_integer_t mask2frexp() noexcept { return as_integer_t(mask2frexp()); } template <> inline constexpr int32_t mask2frexp() noexcept { return 0x3f000000; } template <> inline constexpr int64_t mask2frexp() noexcept { return 0x3fe0000000000000; } /****************************** * maxexponent implementation * ******************************/ template inline constexpr as_integer_t maxexponent() noexcept { return as_integer_t(maxexponent()); } template <> inline constexpr int32_t maxexponent() noexcept { return 127; } template <> inline constexpr int64_t maxexponent() noexcept { return 1023; } /****************************** * maxexponent implementation * ******************************/ template inline constexpr as_integer_t maxexponentm1() noexcept { return as_integer_t(maxexponentm1()); } template <> inline constexpr int32_t maxexponentm1() noexcept { return 126; } template <> inline constexpr int64_t maxexponentm1() noexcept { return 1022; } /********************** * nmb implementation * **********************/ template inline constexpr int32_t nmb() noexcept { return nmb(); } template <> inline constexpr int32_t nmb() noexcept { return 23; } template <> inline constexpr int32_t nmb() noexcept { return 52; } /*********************** * zero implementation * ***********************/ template inline constexpr T zero() noexcept { return T(typename T::value_type(0)); } /*************************** * minvalue implementation * ***************************/ namespace detail { template struct minvalue_impl { static constexpr T get_value() noexcept { return std::numeric_limits::min(); } }; template struct minvalue_common { static constexpr T get_value() noexcept { return std::numeric_limits::min(); } }; template <> struct minvalue_impl : minvalue_common { }; template <> struct minvalue_impl : minvalue_common { }; template <> struct minvalue_impl : minvalue_common { }; template <> struct minvalue_impl : minvalue_common { }; template <> struct minvalue_impl : minvalue_common { }; template <> struct minvalue_impl : minvalue_common { }; template <> struct minvalue_impl : minvalue_common { }; template <> struct minvalue_impl : minvalue_common { }; template <> struct minvalue_impl { inline static float get_value() noexcept { return bit_cast((uint32_t)0xff7fffff); } }; template <> struct minvalue_impl { inline static double get_value() noexcept { return bit_cast((uint64_t)0xffefffffffffffff); } }; } template constexpr T minvalue() noexcept { return T(detail::minvalue_impl::get_value()); } /*************************** * maxvalue implementation * ***************************/ template constexpr T maxvalue() noexcept { return T(std::numeric_limits::max()); } } } #endif xsimd-12.1.1/include/xsimd/arch/xsimd_fma3_avx.hpp000066400000000000000000000056621453610362700220450ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_FMA3_AVX_HPP #define XSIMD_FMA3_AVX_HPP #include "../types/xsimd_fma3_avx_register.hpp" namespace xsimd { namespace kernel { using namespace types; // fnma template inline batch fnma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm256_fnmadd_ps(x, y, z); } template inline batch fnma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm256_fnmadd_pd(x, y, z); } // fnms template inline batch fnms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm256_fnmsub_ps(x, y, z); } template inline batch fnms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm256_fnmsub_pd(x, y, z); } // fma template inline batch fma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm256_fmadd_ps(x, y, z); } template inline batch fma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm256_fmadd_pd(x, y, z); } // fms template inline batch fms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm256_fmsub_ps(x, y, z); } template inline batch fms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm256_fmsub_pd(x, y, z); } } } #endif xsimd-12.1.1/include/xsimd/arch/xsimd_fma3_avx2.hpp000066400000000000000000000030701453610362700221160ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_FMA3_AVX2_HPP #define XSIMD_FMA3_AVX2_HPP #include "../types/xsimd_fma3_avx2_register.hpp" // Allow inclusion of xsimd_fma3_avx.hpp #ifdef XSIMD_FMA3_AVX_HPP #undef XSIMD_FMA3_AVX_HPP #define XSIMD_FORCE_FMA3_AVX_HPP #endif // Disallow inclusion of ./xsimd_fma3_avx_register.hpp #ifndef XSIMD_FMA3_AVX_REGISTER_HPP #define XSIMD_FMA3_AVX_REGISTER_HPP #define XSIMD_FORCE_FMA3_AVX_REGISTER_HPP #endif // Include ./xsimd_fma3_avx.hpp but s/avx/avx2 #define avx avx2 #include "./xsimd_fma3_avx.hpp" #undef avx #undef XSIMD_FMA3_AVX_HPP // Carefully restore guards #ifdef XSIMD_FORCE_FMA3_AVX_HPP #define XSIMD_FMA3_AVX_HPP #undef XSIMD_FORCE_FMA3_AVX_HPP #endif #ifdef XSIMD_FORCE_FMA3_AVX_REGISTER_HPP #undef XSIMD_FMA3_AVX_REGISTER_HPP #undef XSIMD_FORCE_FMA3_AVX_REGISTER_HPP #endif #endif xsimd-12.1.1/include/xsimd/arch/xsimd_fma3_sse.hpp000066400000000000000000000056611453610362700220400ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_FMA3_SSE_HPP #define XSIMD_FMA3_SSE_HPP #include "../types/xsimd_fma3_sse_register.hpp" namespace xsimd { namespace kernel { using namespace types; // fnma template inline batch fnma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm_fnmadd_ps(x, y, z); } template inline batch fnma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm_fnmadd_pd(x, y, z); } // fnms template inline batch fnms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm_fnmsub_ps(x, y, z); } template inline batch fnms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm_fnmsub_pd(x, y, z); } // fma template inline batch fma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm_fmadd_ps(x, y, z); } template inline batch fma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm_fmadd_pd(x, y, z); } // fms template inline batch fms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm_fmsub_ps(x, y, z); } template inline batch fms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm_fmsub_pd(x, y, z); } } } #endif xsimd-12.1.1/include/xsimd/arch/xsimd_fma4.hpp000066400000000000000000000060351453610362700211630ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_FMA4_HPP #define XSIMD_FMA4_HPP #include "../types/xsimd_fma4_register.hpp" namespace xsimd { namespace kernel { using namespace types; // fnma template inline batch fnma(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) noexcept { return _mm_nmacc_ps(x, y, z); } template inline batch fnma(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) noexcept { return _mm_nmacc_pd(x, y, z); } // fnms template inline batch fnms(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) noexcept { return _mm_nmsub_ps(x, y, z); } template inline batch fnms(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) noexcept { return _mm_nmsub_pd(x, y, z); } // fma template inline batch fma(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) noexcept { return _mm_macc_ps(x, y, z); } template inline batch fma(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) noexcept { return _mm_macc_pd(x, y, z); } // fms template inline batch fms(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) noexcept { return _mm_msub_ps(x, y, z); } template inline batch fms(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) noexcept { return _mm_msub_pd(x, y, z); } } } #endif xsimd-12.1.1/include/xsimd/arch/xsimd_generic.hpp000066400000000000000000000022211453610362700217410ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_GENERIC_HPP #define XSIMD_GENERIC_HPP #include "./generic/xsimd_generic_arithmetic.hpp" #include "./generic/xsimd_generic_complex.hpp" #include "./generic/xsimd_generic_logical.hpp" #include "./generic/xsimd_generic_math.hpp" #include "./generic/xsimd_generic_memory.hpp" #include "./generic/xsimd_generic_rounding.hpp" #include "./generic/xsimd_generic_trigo.hpp" #endif xsimd-12.1.1/include/xsimd/arch/xsimd_generic_fwd.hpp000066400000000000000000000052131453610362700226050ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_GENERIC_FWD_HPP #define XSIMD_GENERIC_FWD_HPP #include "../types/xsimd_batch_constant.hpp" #include namespace xsimd { namespace kernel { // forward declaration template ::value, void>::type> inline batch abs(batch const& self, requires_arch) noexcept; template ::value, void>::type> inline batch bitwise_lshift(batch const& self, batch const& other, requires_arch) noexcept; template ::value, void>::type> inline batch bitwise_rshift(batch const& self, batch const& other, requires_arch) noexcept; template inline batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept; template ::value, void>::type> inline batch mul(batch const& self, batch const& other, requires_arch) noexcept; template ::value, void>::type> inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept; template ::value, void>::type> inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept; template ::value, void>::type> inline T hadd(batch const& self, requires_arch) noexcept; } } #endif xsimd-12.1.1/include/xsimd/arch/xsimd_isa.hpp000066400000000000000000000050041453610362700211030ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_ISA_HPP #define XSIMD_ISA_HPP #include "../config/xsimd_arch.hpp" #include "./xsimd_generic_fwd.hpp" #if XSIMD_WITH_SSE2 #include "./xsimd_sse2.hpp" #endif #if XSIMD_WITH_SSE3 #include "./xsimd_sse3.hpp" #endif #if XSIMD_WITH_SSSE3 #include "./xsimd_ssse3.hpp" #endif #if XSIMD_WITH_SSE4_1 #include "./xsimd_sse4_1.hpp" #endif #if XSIMD_WITH_SSE4_2 #include "./xsimd_sse4_2.hpp" #endif #if XSIMD_WITH_FMA3_SSE #include "./xsimd_fma3_sse.hpp" #endif #if XSIMD_WITH_FMA4 #include "./xsimd_fma4.hpp" #endif #if XSIMD_WITH_AVX #include "./xsimd_avx.hpp" #endif #if XSIMD_WITH_FMA3_AVX #include "./xsimd_fma3_avx.hpp" #endif #if XSIMD_WITH_AVXVNNI #include "./xsimd_avxvnni.hpp" #endif #if XSIMD_WITH_AVX2 #include "./xsimd_avx2.hpp" #endif #if XSIMD_WITH_FMA3_AVX2 #include "./xsimd_fma3_avx2.hpp" #endif #if XSIMD_WITH_AVX512F #include "./xsimd_avx512f.hpp" #endif #if XSIMD_WITH_AVX512BW #include "./xsimd_avx512bw.hpp" #endif #if XSIMD_WITH_AVX512ER #include "./xsimd_avx512er.hpp" #endif #if XSIMD_WITH_AVX512PF #include "./xsimd_avx512pf.hpp" #endif #if XSIMD_WITH_AVX512IFMA #include "./xsimd_avx512ifma.hpp" #endif #if XSIMD_WITH_AVX512VBMI #include "./xsimd_avx512vbmi.hpp" #endif #if XSIMD_WITH_AVX512VNNI_AVX512BW #include "./xsimd_avx512vnni_avx512bw.hpp" #endif #if XSIMD_WITH_AVX512VNNI_AVX512VBMI #include "./xsimd_avx512vnni_avx512vbmi.hpp" #endif #if XSIMD_WITH_NEON #include "./xsimd_neon.hpp" #endif #if XSIMD_WITH_NEON64 #include "./xsimd_neon64.hpp" #endif #if XSIMD_WITH_SVE #include "./xsimd_sve.hpp" #endif #if XSIMD_WITH_RVV #include "./xsimd_rvv.hpp" #endif #if XSIMD_WITH_WASM #include "./xsimd_wasm.hpp" #endif // Must come last to have access to all conversion specializations. #include "./xsimd_generic.hpp" #endif xsimd-12.1.1/include/xsimd/arch/xsimd_neon.hpp000066400000000000000000003472371453610362700213070ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_NEON_HPP #define XSIMD_NEON_HPP #include #include #include #include #include "../types/xsimd_neon_register.hpp" #include "../types/xsimd_utils.hpp" // Wrap intrinsics so we can pass them as function pointers // - OP: intrinsics name prefix, e.g., vorrq // - RT: type traits to deduce intrinsics return types #define WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ namespace wrap \ { \ inline RT OP##_u8(uint8x16_t a, uint8x16_t b) noexcept \ { \ return ::OP##_u8(a, b); \ } \ inline RT OP##_s8(int8x16_t a, int8x16_t b) noexcept \ { \ return ::OP##_s8(a, b); \ } \ inline RT OP##_u16(uint16x8_t a, uint16x8_t b) noexcept \ { \ return ::OP##_u16(a, b); \ } \ inline RT OP##_s16(int16x8_t a, int16x8_t b) noexcept \ { \ return ::OP##_s16(a, b); \ } \ inline RT OP##_u32(uint32x4_t a, uint32x4_t b) noexcept \ { \ return ::OP##_u32(a, b); \ } \ inline RT OP##_s32(int32x4_t a, int32x4_t b) noexcept \ { \ return ::OP##_s32(a, b); \ } \ } #define WRAP_BINARY_INT(OP, RT) \ WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ namespace wrap \ { \ inline RT OP##_u64(uint64x2_t a, uint64x2_t b) noexcept \ { \ return ::OP##_u64(a, b); \ } \ inline RT OP##_s64(int64x2_t a, int64x2_t b) noexcept \ { \ return ::OP##_s64(a, b); \ } \ } #define WRAP_BINARY_FLOAT(OP, RT) \ namespace wrap \ { \ inline RT OP##_f32(float32x4_t a, float32x4_t b) noexcept \ { \ return ::OP##_f32(a, b); \ } \ } #define WRAP_UNARY_INT_EXCLUDING_64(OP) \ namespace wrap \ { \ inline uint8x16_t OP##_u8(uint8x16_t a) noexcept \ { \ return ::OP##_u8(a); \ } \ inline int8x16_t OP##_s8(int8x16_t a) noexcept \ { \ return ::OP##_s8(a); \ } \ inline uint16x8_t OP##_u16(uint16x8_t a) noexcept \ { \ return ::OP##_u16(a); \ } \ inline int16x8_t OP##_s16(int16x8_t a) noexcept \ { \ return ::OP##_s16(a); \ } \ inline uint32x4_t OP##_u32(uint32x4_t a) noexcept \ { \ return ::OP##_u32(a); \ } \ inline int32x4_t OP##_s32(int32x4_t a) noexcept \ { \ return ::OP##_s32(a); \ } \ } #define WRAP_UNARY_INT(OP) \ WRAP_UNARY_INT_EXCLUDING_64(OP) \ namespace wrap \ { \ inline uint64x2_t OP##_u64(uint64x2_t a) noexcept \ { \ return ::OP##_u64(a); \ } \ inline int64x2_t OP##_s64(int64x2_t a) noexcept \ { \ return ::OP##_s64(a); \ } \ } #define WRAP_UNARY_FLOAT(OP) \ namespace wrap \ { \ inline float32x4_t OP##_f32(float32x4_t a) noexcept \ { \ return ::OP##_f32(a); \ } \ } // Dummy identity caster to ease coding inline uint8x16_t vreinterpretq_u8_u8(uint8x16_t arg) noexcept { return arg; } inline int8x16_t vreinterpretq_s8_s8(int8x16_t arg) noexcept { return arg; } inline uint16x8_t vreinterpretq_u16_u16(uint16x8_t arg) noexcept { return arg; } inline int16x8_t vreinterpretq_s16_s16(int16x8_t arg) noexcept { return arg; } inline uint32x4_t vreinterpretq_u32_u32(uint32x4_t arg) noexcept { return arg; } inline int32x4_t vreinterpretq_s32_s32(int32x4_t arg) noexcept { return arg; } inline uint64x2_t vreinterpretq_u64_u64(uint64x2_t arg) noexcept { return arg; } inline int64x2_t vreinterpretq_s64_s64(int64x2_t arg) noexcept { return arg; } inline float32x4_t vreinterpretq_f32_f32(float32x4_t arg) noexcept { return arg; } namespace xsimd { template struct batch_bool_constant; namespace kernel { using namespace types; namespace detail { template