pax_global_header00006660000000000000000000000064141010123450014501gustar00rootroot0000000000000052 comment=b948624424c1ce297e0d7a3380a114f4c245b190 xsimd-7.6.0/000077500000000000000000000000001410101234500126375ustar00rootroot00000000000000xsimd-7.6.0/.appveyor.yml000066400000000000000000000031471410101234500153120ustar00rootroot00000000000000build: false os: Visual Studio 2015 platform: - x64 environment: global: MINICONDA: C:\xsimd-conda matrix: - JOB: "AVX2" CXXFLAGS: "/arch:AVX2" VCVARSALL: "C:\\Program Files (x86)\\Microsoft Visual Studio 14.0\\VC\\vcvarsall.bat" RUNTEST: ".\\test_xsimd" - JOB: "AVX512" CXXFLAGS: "/arch:AVX512" APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 VCVARSALL: "C:\\Program Files (x86)\\Microsoft Visual Studio\\2017\\Community\\VC\\Auxiliary\\Build\\vcvarsall.bat" RUNTEST: "ECHO" - JOB: "AVX2" CXXFLAGS: "/arch:AVX2" APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2019 VCVARSALL: "C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Community\\VC\\Auxiliary\\Build\\vcvarsall.bat" init: - "ECHO %MINICONDA%" - call "%VCVARSALL%" %PLATFORM% - ps: if($env:Platform -eq "x64"){Start-FileDownload 'http://repo.continuum.io/miniconda/Miniconda3-latest-Windows-x86_64.exe' C:\Miniconda.exe; echo "Done"} - ps: if($env:Platform -eq "x86"){Start-FileDownload 'http://repo.continuum.io/miniconda/Miniconda3-latest-Windows-x86.exe' C:\Miniconda.exe; echo "Done"} - cmd: C:\Miniconda.exe /S /D=C:\xsimd-conda - "set PATH=%MINICONDA%;%MINICONDA%\\Scripts;%MINICONDA%\\Library\\bin;%PATH%" install: - conda config --set always_yes yes --set changeps1 no - conda update -q conda - conda info -a - conda install cmake xtl -c conda-forge - cmake -G "NMake Makefiles" -D DOWNLOAD_GTEST=1 -D ENABLE_XTL_COMPLEX=1 -D CMAKE_INSTALL_PREFIX=%MINICONDA%\\LIBRARY -D CMAKE_BUILD_TYPE=Release . 
- nmake test_xsimd - cd test build_script: - "%RUNTEST%" xsimd-7.6.0/.azure-pipelines/000077500000000000000000000000001410101234500160315ustar00rootroot00000000000000xsimd-7.6.0/.azure-pipelines/azure-pipelines-linux-clang.yml000066400000000000000000000045771410101234500241240ustar00rootroot00000000000000jobs: - job: 'Linux_0' strategy: matrix: clang_4_armv7: llvm_version: '4.0' cross_compile: 1 arm_arch_dir: 'arm-linux-gnueabi' arm_arch_target: 'armv7-a' clang_4_armv8: llvm_version: '4.0' cross_compile: 1 arm_arch_dir: 'arm-linux-gnueabi' arm_arch_target: 'armv8-a' clang_5: llvm_version: '5.0' clang_6: llvm_version: '6.0' clang_7: llvm_version: '7' clang_8: llvm_version: '8' clang_9: llvm_version: '9' clang_10_avx512: llvm_version: '10' avx512: 1 pool: vmImage: ubuntu-16.04 variables: CC: clang-$(llvm_version) CXX: clang++-$(llvm_version) timeoutInMinutes: 30 steps: - script: | sudo add-apt-repository ppa:ubuntu-toolchain-r/test if [[ $(llvm_version) == '4.0' || $(llvm_version) == '5.0' ]]; then sudo apt-get update sudo apt-get --no-install-suggests --no-install-recommends install clang-$(llvm_version) if [[ $(cross_compile) == 1 ]]; then if [[ $(arm_arch_dir) == 'aarch64-linux-gnu' ]]; then sudo apt-get --no-install-suggests --no-install-recommends install g++-4.9-aarch64-linux-gnu gcc-4.9-aarch64-linux-gnu else sudo apt-get --no-install-suggests --no-install-recommends install g++-4.9-arm-linux-gnueabi fi sudo apt-get --no-install-suggests --no-install-recommends install g++-4.9-multilib gcc-4.9-multilib sudo apt-get --no-install-suggests --no-install-recommends install qemu qemu-system-arm else sudo apt-get --no-install-suggests --no-install-recommends install gcc-4.9 fi else LLVM_VERSION=$(llvm_version) get -O - http://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add - sudo add-apt-repository "deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial-$LLVM_VERSION main" sudo apt-get update sudo apt-get --no-install-suggests --no-install-recommends install clang-$(llvm_version) fi displayName: Install build toolchain - bash: echo "##vso[task.prependpath]$CONDA/bin" displayName: Add conda to PATH - template: unix-build.yml xsimd-7.6.0/.azure-pipelines/azure-pipelines-linux-gcc.yml000066400000000000000000000033471410101234500235660ustar00rootroot00000000000000jobs: - job: 'Linux_1' strategy: matrix: gcc_4: gcc_version: '4.9' gcc_5: gcc_version: '5' gcc_6_force_no_instr_set: gcc_version: '6' force_no_instr_set: 1 gcc_6_enable_xtl_complex: gcc_version: '6' enable_xtl_complex: 1 gcc_7_avx512: gcc_version: '7' avx512: 1 gcc_4_aarch64: cross_compile: 1 gcc_version: '4.9' arm_arch_dir: 'aarch64-linux-gnu' arm_arch_target: 'armv8-a' gcc_8_enable_fallback: gcc_version: '8' enable_xtl_complex: 1 enable_fallback: 1 gcc_9: gcc_version: '9' pool: vmImage: ubuntu-16.04 variables: CC: gcc-$(gcc_version) CXX: g++-$(gcc_version) timeoutInMinutes: 30 steps: - script: | if [[ $(gcc_version) == '4.9' || $(gcc_version) == '6' || $(gcc_version) == '7' || $(gcc_version) == '8' ]]; then sudo add-apt-repository ppa:ubuntu-toolchain-r/test sudo apt-get update sudo apt-get --no-install-suggests --no-install-recommends install g++-$(gcc_version) fi if [[ $(cross_compile) == 1 ]]; then if [[ $(arm_arch_dir) == 'aarch64-linux-gnu' ]]; then sudo apt-get --no-install-suggests --no-install-recommends install g++-4.9-aarch64-linux-gnu gcc-4.9-aarch64-linux-gnu sudo apt-get --no-install-suggests --no-install-recommends install qemu qemu-system-aarch64 fi fi displayName: Install build toolchain - bash: echo 
"##vso[task.prependpath]$CONDA/bin" displayName: Add conda to PATH - template: unix-build.yml xsimd-7.6.0/.azure-pipelines/azure-pipelines-win.yml000066400000000000000000000055531410101234500224730ustar00rootroot00000000000000jobs: # Configure, build, install, and test job - job: 'Windows_clangcl' pool: vmImage: 'vs2017-win2016' timeoutInMinutes: 360 steps: # Install Chocolatey (https://chocolatey.org/install#install-with-powershellexe) - powershell: | Set-ExecutionPolicy Bypass -Scope Process -Force iex ((New-Object System.Net.WebClient).DownloadString('https://chocolatey.org/install.ps1')) Write-Host "##vso[task.setvariable variable=PATH]$env:PATH" choco --version displayName: "Install Chocolatey" # Install Miniconda - script: | choco install miniconda3 --yes set PATH=C:\tools\miniconda3\Scripts;C:\tools\miniconda3;C:\tools\miniconda3\Library\bin;%PATH% echo '##vso[task.setvariable variable=PATH]%PATH%' set LIB=C:\tools\miniconda3\Library\lib;%LIB% echo '##vso[task.setvariable variable=LIB]%LIB%' conda --version displayName: "Install Miniconda" # Configure Miniconda - script: | conda config --set always_yes yes conda config --append channels conda-forge conda info displayName: "Configure Miniconda" # Create conda enviroment # Note: conda activate doesn't work here, because it creates a new shell! - script: | conda install cmake=3.14.0 ^ ninja ^ python=3.6 conda list displayName: "Install conda packages" # Install LLVM # Note: LLVM distributed by conda is too old - script: | choco install llvm --yes set PATH=C:\Program Files\LLVM\bin;%PATH% echo '##vso[task.setvariable variable=PATH]%PATH%' clang-cl --version displayName: "Install LLVM" # Configure - script: | setlocal EnableDelayedExpansion call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" x86_amd64 mkdir build & cd build cmake -G Ninja ^ -DCMAKE_BUILD_TYPE=Release ^ -DCMAKE_C_COMPILER=clang-cl ^ -DCMAKE_CXX_COMPILER=clang-cl ^ -DDOWNLOAD_GTEST=ON ^ $(Build.SourcesDirectory) displayName: "Configure xsimd" workingDirectory: $(Build.BinariesDirectory) # Build - script: | call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" x86_amd64 cmake --build . 
^ --config Release ^ --target test_xsimd ^ -- -v displayName: "Build xsimd" workingDirectory: $(Build.BinariesDirectory)/build # Test - script: | setlocal EnableDelayedExpansion cd test .\test_xsimd displayName: "Test xsimd" workingDirectory: $(Build.BinariesDirectory)/build/test xsimd-7.6.0/.azure-pipelines/unix-build.yml000066400000000000000000000051731410101234500206420ustar00rootroot00000000000000steps: - script: | conda config --set always_yes yes --set changeps1 no conda update -q conda conda create -n xsimd source activate xsimd if [[ $(enable_xtl_complex) == 1 ]]; then conda install xtl -c conda-forge fi if test "x$(avx512)" = "x1" ; then sh $(Build.SourcesDirectory)/install_sde.sh fi displayName: Install dependencies workingDirectory: $(Build.BinariesDirectory) - script: | source activate xsimd mkdir build cd build if [[ $(cross_compile) == 1 ]]; then if [[ $(arm_arch_dir) == 'aarch64-linux-gnu' ]]; then # this arch is using gcc as a cross compiler CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DCROSS_COMPILE_ARM=ON -DARM_ARCH_DIRECTORY=$(arm_arch_dir) -DARM_GCC_VER=4.9.3 -DTARGET_ARCH=$(arm_arch_target) -DCMAKE_LINKER=aarch64-linux-gnu-ld -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++-4.9 -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc-4.9"; else # while this one uses clang CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DCROSS_COMPILE_ARM=ON -DARM_ARCH_DIRECTORY=$(arm_arch_dir) -DARM_GCC_VER=4.9.3 -DTARGET_ARCH=$(arm_arch_target)"; fi fi if [[ $(enable_xtl_complex) == 1 ]]; then CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DENABLE_XTL_COMPLEX=ON"; fi if [[ $(enable_fallback) == 1 ]]; then CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DENABLE_FALLBACK=ON"; fi if [[ $(avx512) == 1 ]]; then CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=skylake-avx512" fi if [[ $(force_no_instr_set) == 1 ]]; then CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DXSIMD_FORCE_X86_INSTR_SET=0 -DXSIMD_FORCE_X86_AMD_INSTR_SET=0"; fi cmake -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX $CMAKE_EXTRA_ARGS -DDOWNLOAD_GTEST=ON -DXSIMD_ENABLE_WERROR=1 $(Build.SourcesDirectory) displayName: Configure xsimd workingDirectory: $(Build.BinariesDirectory) - script: | source activate xsimd make -j2 test_xsimd displayName: Build xsimd workingDirectory: $(Build.BinariesDirectory)/build - script: | source activate xsimd if [[ $(avx512) == 1 ]]; then ../../sde-external-8.56.0-2020-07-05-lin/sde64 -skx -- ./test_xsimd elif [[ $(cross_compile) == 1 ]]; then echo $(arm_arch_dir); if [[ $(arm_arch_dir) == 'aarch64-linux-gnu' ]]; then qemu-aarch64 -L /usr/aarch64-linux-gnu/ ./test_xsimd else qemu-arm -L /usr/arm-linux-gnueabi/ ./test_xsimd fi else ./test_xsimd fi displayName: Test xsimd workingDirectory: $(Build.BinariesDirectory)/build/test xsimd-7.6.0/.gitignore000066400000000000000000000006161410101234500146320ustar00rootroot00000000000000# Generated pkg-config files *.pc # Compiled Object files *.slo *.lo *.o *.obj # Precompiled Headers *.gch *.pch # Compiled Dynamic libraries *.so *.dylib *.dll # Fortran module files *.mod # Compiled Static libraries *.lai *.la *.a *.lib # Executables *.exe *.out *.app # Vim tmp files *.swp # Build folder build/ # Documentation build artefacts docs/CMakeCache.txt docs/xml/ docs/build/ xsimd-7.6.0/CMakeLists.txt000066400000000000000000000213221410101234500153770ustar00rootroot00000000000000############################################################################ # Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and # # Martin Renou # # Copyright (c) QuantStack # # # # Distributed under the terms of the BSD 3-Clause License. 
# # # # The full license is in the file LICENSE, distributed with this software. # ############################################################################ cmake_minimum_required(VERSION 3.1) project(xsimd) set(XSIMD_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include) # Versionning # =========== file(STRINGS "${XSIMD_INCLUDE_DIR}/xsimd/config/xsimd_config.hpp" xsimd_version_defines REGEX "#define XSIMD_VERSION_(MAJOR|MINOR|PATCH)") foreach(ver ${xsimd_version_defines}) if(ver MATCHES "#define XSIMD_VERSION_(MAJOR|MINOR|PATCH) +([^ ]+)$") set(XSIMD_VERSION_${CMAKE_MATCH_1} "${CMAKE_MATCH_2}" CACHE INTERNAL "") endif() endforeach() set(${PROJECT_NAME}_VERSION ${XSIMD_VERSION_MAJOR}.${XSIMD_VERSION_MINOR}.${XSIMD_VERSION_PATCH}) message(STATUS "xsimd v${${PROJECT_NAME}_VERSION}") # Build # ===== set(XSIMD_HEADERS ${XSIMD_INCLUDE_DIR}/xsimd/xsimd.hpp ${XSIMD_INCLUDE_DIR}/xsimd/config/xsimd_align.hpp ${XSIMD_INCLUDE_DIR}/xsimd/config/xsimd_config.hpp ${XSIMD_INCLUDE_DIR}/xsimd/config/xsimd_include.hpp ${XSIMD_INCLUDE_DIR}/xsimd/config/xsimd_instruction_set.hpp ${XSIMD_INCLUDE_DIR}/xsimd/math/xsimd_basic_math.hpp ${XSIMD_INCLUDE_DIR}/xsimd/math/xsimd_error.hpp ${XSIMD_INCLUDE_DIR}/xsimd/math/xsimd_exp_reduction.hpp ${XSIMD_INCLUDE_DIR}/xsimd/math/xsimd_exponential.hpp ${XSIMD_INCLUDE_DIR}/xsimd/math/xsimd_fp_manipulation.hpp ${XSIMD_INCLUDE_DIR}/xsimd/math/xsimd_fp_sign.hpp ${XSIMD_INCLUDE_DIR}/xsimd/math/xsimd_gamma.hpp ${XSIMD_INCLUDE_DIR}/xsimd/math/xsimd_horner.hpp ${XSIMD_INCLUDE_DIR}/xsimd/math/xsimd_hyperbolic.hpp ${XSIMD_INCLUDE_DIR}/xsimd/math/xsimd_invtrigo.hpp ${XSIMD_INCLUDE_DIR}/xsimd/math/xsimd_logarithm.hpp ${XSIMD_INCLUDE_DIR}/xsimd/math/xsimd_math.hpp ${XSIMD_INCLUDE_DIR}/xsimd/math/xsimd_math_complex.hpp ${XSIMD_INCLUDE_DIR}/xsimd/math/xsimd_numerical_constant.hpp ${XSIMD_INCLUDE_DIR}/xsimd/math/xsimd_power.hpp ${XSIMD_INCLUDE_DIR}/xsimd/math/xsimd_rem_pio2.hpp ${XSIMD_INCLUDE_DIR}/xsimd/math/xsimd_rounding.hpp ${XSIMD_INCLUDE_DIR}/xsimd/math/xsimd_trigo_reduction.hpp ${XSIMD_INCLUDE_DIR}/xsimd/math/xsimd_trigonometric.hpp ${XSIMD_INCLUDE_DIR}/xsimd/memory/xsimd_aligned_allocator.hpp ${XSIMD_INCLUDE_DIR}/xsimd/memory/xsimd_aligned_stack_buffer.hpp ${XSIMD_INCLUDE_DIR}/xsimd/memory/xsimd_alignment.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_avx_complex.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_avx_conversion.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_avx_double.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_avx_float.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_avx_int8.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_avx_int16.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_avx_int32.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_avx_int64.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_avx_int_base.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_avx512_bool.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_avx512_complex.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_avx512_conversion.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_avx512_double.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_avx512_float.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_avx512_int8.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_avx512_int16.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_avx512_int32.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_avx512_int64.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_avx512_int_base.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_neon_bool.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_neon_complex.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_neon_conversion.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_neon_double.hpp 
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_neon_float.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_neon_int8.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_neon_int16.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_neon_int32.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_neon_int64.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_neon_uint8.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_neon_uint16.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_neon_uint32.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_neon_uint64.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_neon_utils.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_base.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_base_bool.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_complex_base.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_fallback.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_int_conversion.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_sse_complex.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_sse_conversion.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_sse_double.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_sse_float.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_sse_int8.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_sse_int16.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_sse_int32.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_sse_int64.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_sse_int_base.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_traits.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_types_include.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_utils.hpp
)

add_library(xsimd INTERFACE)

target_include_directories(xsimd INTERFACE
    $<BUILD_INTERFACE:${XSIMD_INCLUDE_DIR}>
    $<INSTALL_INTERFACE:include>)

target_compile_features(xsimd INTERFACE cxx_std_11)

OPTION(ENABLE_FALLBACK "build tests/benchmarks with fallback implementation" OFF)
OPTION(ENABLE_XTL_COMPLEX "enables support for xcomplex defined in xtl" OFF)
OPTION(BUILD_TESTS "xsimd test suite" OFF)
OPTION(DOWNLOAD_GTEST "build gtest from downloaded sources" OFF)

if(DOWNLOAD_GTEST OR GTEST_SRC_DIR)
    set(BUILD_TESTS ON)
endif()

if(ENABLE_FALLBACK)
    add_definitions(-DXSIMD_ENABLE_FALLBACK=1)
endif()

if(ENABLE_XTL_COMPLEX)
    add_definitions(-DXSIMD_ENABLE_XTL_COMPLEX=1)
    find_package(xtl 0.4.11 REQUIRED)
    target_link_libraries(xsimd INTERFACE xtl)
endif()

if(BUILD_TESTS)
    enable_testing()
    add_subdirectory(test)
endif()

OPTION(BUILD_BENCHMARK "xsimd benchmarks" OFF)
if(BUILD_BENCHMARK)
    add_subdirectory(benchmark)
endif()

OPTION(BUILD_EXAMPLES "xsimd examples" OFF)
if(BUILD_EXAMPLES)
    add_subdirectory(examples)
endif()

# Installation
# ============

include(GNUInstallDirs)
include(CMakePackageConfigHelpers)

install(TARGETS xsimd EXPORT ${PROJECT_NAME}-targets)

# Makes the project importable from the build directory
export(EXPORT ${PROJECT_NAME}-targets
       FILE "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Targets.cmake")

install(DIRECTORY ${XSIMD_INCLUDE_DIR}/xsimd
        DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})

# GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share".
set(XSIMD_CMAKECONFIG_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}" CACHE STRING "install path for xsimdConfig.cmake")

configure_package_config_file(${PROJECT_NAME}Config.cmake.in
                              "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake"
                              INSTALL_DESTINATION ${XSIMD_CMAKECONFIG_INSTALL_DIR})

# xsimd is header-only and does not depend on the architecture.
# Remove CMAKE_SIZEOF_VOID_P from xtensorConfigVersion.cmake so that an xtensorConfig.cmake
# generated for a 64 bit target can be used for 32 bit targets and vice versa.
set(_XTENSOR_CMAKE_SIZEOF_VOID_P ${CMAKE_SIZEOF_VOID_P}) unset(CMAKE_SIZEOF_VOID_P) write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake VERSION ${${PROJECT_NAME}_VERSION} COMPATIBILITY SameMajorVersion) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake DESTINATION ${XSIMD_CMAKECONFIG_INSTALL_DIR}) install(EXPORT ${PROJECT_NAME}-targets FILE ${PROJECT_NAME}Targets.cmake DESTINATION ${XSIMD_CMAKECONFIG_INSTALL_DIR}) configure_file(${PROJECT_NAME}.pc.in "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.pc" @ONLY) install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.pc" DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/") xsimd-7.6.0/LICENSE000066400000000000000000000030441410101234500136450ustar00rootroot00000000000000Copyright (c) 2016, Johan Mabille, Sylvain Corlay, Wolf Vollprecht and Martin Renou Copyright (c) 2016, QuantStack All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsimd-7.6.0/README.md000066400000000000000000000150241410101234500141200ustar00rootroot00000000000000# ![xsimd](docs/source/xsimd.svg) [![Appveyor](https://ci.appveyor.com/api/projects/status/wori7my48os31nu0?svg=true)](https://ci.appveyor.com/project/xtensor-stack/xsimd) [![Azure](https://dev.azure.com/xtensor-stack/xtensor-stack/_apis/build/status/xtensor-stack.xsimd?branchName=master)](https://dev.azure.com/xtensor-stack/xtensor-stack/_build/latest?definitionId=3&branchName=master) [![Documentation Status](http://readthedocs.org/projects/xsimd/badge/?version=latest)](https://xsimd.readthedocs.io/en/latest/?badge=latest) [![Join the Gitter Chat](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/QuantStack/Lobby?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) C++ wrappers for SIMD intrinsics ## Introduction SIMD (Single Instruction, Multiple Data) is a feature of microprocessors that has been available for many years. SIMD instructions perform a single operation on a batch of values at once, and thus provide a way to significantly accelerate code execution. 
However, these instructions differ between microprocessor vendors and compilers.

`xsimd` provides a unified means for using these features for library authors. Namely, it enables manipulation of batches of numbers with the same arithmetic operators as for single values. It also provides accelerated implementations of common mathematical functions operating on batches.

You can find out more about this implementation of C++ wrappers for SIMD intrinsics at [The C++ Scientist](http://johanmabille.github.io/blog/archives/). The mathematical functions are a lightweight implementation of the algorithms used in [boost.SIMD](https://github.com/NumScale/boost.simd).

`xsimd` requires a C++11 compliant compiler. The following C++ compilers are supported:

Compiler                 | Version
-------------------------|-------------------------------
Microsoft Visual Studio  | MSVC 2015 update 2 and above
g++                      | 4.9 and above
clang                    | 4.0 and above

The following SIMD instruction set extensions are supported:

Architecture | Instruction set extensions
-------------|-----------------------------------------------------
x86          | SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, FMA3, AVX2
x86          | AVX512 (gcc7 and higher)
x86 AMD      | same as above + SSE4A, FMA4, XOP
ARM          | ARMv7, ARMv8

## Installation

### Install from conda-forge

A package for xsimd is available on the mamba (or conda) package manager.

```bash
mamba install -c conda-forge xsimd
```

### Install with Spack

A package for xsimd is available on the Spack package manager.

```bash
spack install xsimd
spack load xsimd
```

### Install from sources

You can directly install it from the sources with cmake:

```bash
cmake -D CMAKE_INSTALL_PREFIX=your_install_prefix
make install
```

## Documentation

To get started with using `xsimd`, check out the full documentation

http://xsimd.readthedocs.io/

## Usage

### Explicit use of an instruction set extension

Here is an example that computes the mean of two sets of 4 double floating point values, assuming the AVX extension is supported:

```cpp
#include <iostream>
#include "xsimd/xsimd.hpp"

namespace xs = xsimd;

int main(int argc, char* argv[])
{
    xs::batch<double, 4> a(1.5, 2.5, 3.5, 4.5);
    xs::batch<double, 4> b(2.5, 3.5, 4.5, 5.5);
    auto mean = (a + b) / 2;
    std::cout << mean << std::endl;
    return 0;
}
```

Do not forget to enable the AVX extension when building the example. With gcc or clang, this is done with the `-march=native` flag; on MSVC you have to pass the `/arch:AVX` option.

This example outputs:

```cpp
(2.0, 3.0, 4.0, 5.0)
```

### Auto detection of the instruction set extension to be used

The same computation operating on vectors and using the most performant instruction set available:

```cpp
#include <cstddef>
#include <vector>
#include "xsimd/xsimd.hpp"

namespace xs = xsimd;

using vector_type = std::vector<double, xsimd::aligned_allocator<double, XSIMD_DEFAULT_ALIGNMENT>>;

void mean(const vector_type& a, const vector_type& b, vector_type& res)
{
    std::size_t size = a.size();
    constexpr std::size_t simd_size = xsimd::simd_type<double>::size;
    std::size_t vec_size = size - size % simd_size;

    for(std::size_t i = 0; i < vec_size; i += simd_size)
    {
        auto ba = xs::load_aligned(&a[i]);
        auto bb = xs::load_aligned(&b[i]);
        auto bres = (ba + bb) / 2.;
        bres.store_aligned(&res[i]);
    }
    for(std::size_t i = vec_size; i < size; ++i)
    {
        res[i] = (a[i] + b[i]) / 2.;
    }
}
```
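As a quick usage sketch (this driver is not part of the xsimd sources, and the vector length and fill values below are arbitrary), the `mean` function above can be called like any ordinary function once the snippet defining `vector_type` and `mean` is in scope, compiled with the same flags as before:

```cpp
#include <cstddef>
#include <iostream>

// Hypothetical driver: assumes the vector_type alias and the mean() function
// from the previous snippet are defined in this translation unit.
int main()
{
    std::size_t n = 1000;
    vector_type a(n, 1.0), b(n, 3.0), res(n);

    mean(a, b, res);  // batched loop for the bulk, scalar loop for the tail

    // every element of res is (1.0 + 3.0) / 2 = 2
    std::cout << res.front() << ", " << res.back() << std::endl;
    return 0;
}
```

Because `vector_type` relies on `xsimd::aligned_allocator`, the buffers satisfy the alignment assumed by `load_aligned` and `store_aligned`.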
We also implement STL algorithms to work optimally on batches. Using `xsimd::transform` the loop from the example becomes:

```cpp
#include <cstddef>
#include <vector>
#include "xsimd/xsimd.hpp"
#include "xsimd/stl/algorithms.hpp"

namespace xs = xsimd;
using vector_type = std::vector<double, xsimd::aligned_allocator<double, XSIMD_DEFAULT_ALIGNMENT>>;

void mean(const vector_type& a, const vector_type& b, vector_type& res)
{
    xsimd::transform(a.begin(), a.end(), b.begin(), res.begin(),
                     [](const auto& x, const auto& y) { return (x + y) / 2.; });
}
```

## Building and Running the Tests

Building the tests requires the [GTest](https://github.com/google/googletest) testing framework and [cmake](https://cmake.org).

gtest and cmake are available as packages for most Linux distributions. They can also be installed with the `conda` package manager (even on Windows):

```bash
conda install -c conda-forge gtest cmake
```

Once `gtest` and `cmake` are installed, you can build and run the tests:

```bash
mkdir build
cd build
cmake ../ -DBUILD_TESTS=ON
make xtest
```

In the context of continuous integration with Travis CI, tests are run in a `conda` environment, which can be activated with

```bash
cd test
conda env create -f ./test-environment.yml
source activate test-xsimd
cd ..
cmake . -DBUILD_TESTS=ON
make xtest
```

## Building the HTML Documentation

xsimd's documentation is built with three tools

- [doxygen](http://www.doxygen.org)
- [sphinx](http://www.sphinx-doc.org)
- [breathe](https://breathe.readthedocs.io)

While doxygen must be installed separately, you can install breathe by typing

```bash
pip install breathe
```

Breathe can also be installed with `conda`

```bash
conda install -c conda-forge breathe
```

Finally, build the documentation with

```bash
make html
```

from the `docs` subdirectory.

## License

We use a shared copyright model that enables all contributors to maintain the copyright on their contributions.

This software is licensed under the BSD-3-Clause license. See the [LICENSE](LICENSE) file for details.
xsimd-7.6.0/azure-pipelines.yml000066400000000000000000000003341410101234500164760ustar00rootroot00000000000000trigger:
- master
- 7.x

jobs:
- template: ./.azure-pipelines/azure-pipelines-win.yml
- template: ./.azure-pipelines/azure-pipelines-linux-gcc.yml
- template: ./.azure-pipelines/azure-pipelines-linux-clang.yml
xsimd-7.6.0/benchmark/000077500000000000000000000000001410101234500145715ustar00rootroot00000000000000xsimd-7.6.0/benchmark/CMakeLists.txt000066400000000000000000000051251410101234500173340ustar00rootroot00000000000000############################################################################
# Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         #
# Martin Renou                                                             #
# Copyright (c) QuantStack                                                 #
#                                                                          #
# Distributed under the terms of the BSD 3-Clause License.                 #
#                                                                          #
# The full license is in the file LICENSE, distributed with this software. #
############################################################################

cmake_minimum_required(VERSION 3.1)

if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
    project(xsimd-benchmark)

    find_package(xsimd REQUIRED CONFIG)
    set(XSIMD_INCLUDE_DIR ${xsimd_INCLUDE_DIR})
endif ()

if(NOT CMAKE_BUILD_TYPE)
    message(STATUS "Setting tests build type to Release")
    set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build."
FORCE) else() message(STATUS "Tests build type is ${CMAKE_BUILD_TYPE}") endif() include(CheckCXXCompilerFlag) string(TOUPPER "${CMAKE_BUILD_TYPE}" U_CMAKE_BUILD_TYPE) if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Intel") if(NOT CMAKE_CXX_FLAGS MATCHES "-march") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wunused-parameter -Wextra -Wreorder -Wconversion") if(NOT MSVC) CHECK_CXX_COMPILER_FLAG("-std=c++11" HAS_CPP11_FLAG) if (HAS_CPP11_FLAG) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") else() message(FATAL_ERROR "Unsupported compiler -- xsimd requires C++11 support!") endif() endif() endif() if(MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc /MP /bigobj") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2") set(CMAKE_EXE_LINKER_FLAGS /MANIFEST:NO) foreach(flag_var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) string(REPLACE "/MD" "-MT" ${flag_var} "${${flag_var}}") endforeach() endif() include_directories(${XSIMD_INCLUDE_DIR}) set(XSIMD_BENCHMARK main.cpp xsimd_benchmark.hpp ) set(XSIMD_BENCHMARK_TARGET benchmark_xsimd) add_executable(${XSIMD_BENCHMARK_TARGET} ${XSIMD_BENCHMARK} ${XSIMD_HEADERS}) add_custom_target(xbenchmark COMMAND benchmark_xsimd DEPENDS ${XSIMD_BENCHMARK_TARGET}) xsimd-7.6.0/benchmark/main.cpp000066400000000000000000000144701410101234500162270ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #include "xsimd_benchmark.hpp" #include void benchmark_operation() { //std::size_t size = 9984; std::size_t size = 20000; xsimd::run_benchmark_2op(xsimd::add_fn(), std::cout, size, 1000); xsimd::run_benchmark_2op(xsimd::sub_fn(), std::cout, size, 1000); xsimd::run_benchmark_2op(xsimd::mul_fn(), std::cout, size, 1000); xsimd::run_benchmark_2op(xsimd::div_fn(), std::cout, size, 1000); } void benchmark_exp_log() { std::size_t size = 20000; xsimd::run_benchmark_1op(xsimd::exp_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::exp2_fn(), std::cout, size, 100); xsimd::run_benchmark_1op(xsimd::expm1_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::log_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::log2_fn(), std::cout, size, 100); xsimd::run_benchmark_1op(xsimd::log10_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::log1p_fn(), std::cout, size, 1000); } void benchmark_trigo() { std::size_t size = 20000; xsimd::run_benchmark_1op(xsimd::sin_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::cos_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::tan_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::asin_fn(), std::cout, size, 1000, xsimd::init_method::arctrigo); xsimd::run_benchmark_1op(xsimd::acos_fn(), std::cout, size, 1000, xsimd::init_method::arctrigo); xsimd::run_benchmark_1op(xsimd::atan_fn(), std::cout, size, 1000, xsimd::init_method::arctrigo); } void benchmark_hyperbolic() { std::size_t size = 20000; xsimd::run_benchmark_1op(xsimd::sinh_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::cosh_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::tanh_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::asinh_fn(), std::cout, size, 100); xsimd::run_benchmark_1op(xsimd::acosh_fn(), std::cout, size, 100); xsimd::run_benchmark_1op(xsimd::atanh_fn(), std::cout, size, 100); } void benchmark_power() { std::size_t size = 20000; xsimd::run_benchmark_2op(xsimd::pow_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::sqrt_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::cbrt_fn(), std::cout, size, 100); xsimd::run_benchmark_2op(xsimd::hypot_fn(), std::cout, size, 1000); } void benchmark_rounding() { std::size_t size = 20000; xsimd::run_benchmark_1op(xsimd::ceil_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::floor_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::trunc_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::round_fn(), std::cout, size, 100); xsimd::run_benchmark_1op(xsimd::nearbyint_fn(), std::cout, size, 100); xsimd::run_benchmark_1op(xsimd::rint_fn(), std::cout, size, 100); } #ifdef XSIMD_POLY_BENCHMARKS void benchmark_poly_evaluation() { std::size_t size = 20000; xsimd::run_benchmark_1op(xsimd::horner_5_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::estrin_5_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::horner_10_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::estrin_10_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::horner_12_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::estrin_12_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::horner_14_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::estrin_14_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::horner_16_fn(), std::cout, size, 1000); 
xsimd::run_benchmark_1op(xsimd::estrin_16_fn(), std::cout, size, 1000); } #endif void benchmark_basic_math() { std::size_t size = 20000; xsimd::run_benchmark_2op(xsimd::fmod_fn(), std::cout, size, 1000); xsimd::run_benchmark_2op(xsimd::remainder_fn(), std::cout, size, 1000); xsimd::run_benchmark_2op(xsimd::fdim_fn(), std::cout, size, 1000); xsimd::run_benchmark_3op(xsimd::clip_fn(), std::cout, size, 1000); #if 0 xsimd::run_benchmark_1op_pred(xsimd::isfinite_fn(), std::cout, size, 100); xsimd::run_benchmark_1op_pred(xsimd::isinf_fn(), std::cout, size, 100); xsimd::run_benchmark_1op_pred(xsimd::is_flint_fn(), std::cout, size, 100); xsimd::run_benchmark_1op_pred(xsimd::is_odd_fn(), std::cout, size, 100); xsimd::run_benchmark_1op_pred(xsimd::is_even_fn(), std::cout, size, 100); #endif } int main(int argc, char* argv[]) { const std::map> fn_map = { {"op", {"arithmetic", benchmark_operation}}, {"exp", {"exponential and logarithm", benchmark_exp_log}}, {"trigo", {"trigonometric", benchmark_trigo}}, {"hyperbolic", {"hyperbolic", benchmark_hyperbolic}}, {"power", {"power", benchmark_power}}, {"basic_math", {"basic math", benchmark_basic_math}}, {"rounding", {"rounding", benchmark_rounding}}, #ifdef XSIMD_POLY_BENCHMARKS {"utils", {"polynomial evaluation", benchmark_poly_evaluation}}, #endif }; if (argc > 1) { if (std::string(argv[1]) == "--help" || std::string(argv[1]) == "-h") { std::cout << "Available options:" << std::endl; for(auto const& kv : fn_map) { std::cout << kv.first << ": run benchmark on " << kv.second.first << " functions" << std::endl; } } else { for (int i = 1; i < argc; ++i) { fn_map.at(argv[i]).second(); } } } else { for(auto const& kv : fn_map) { kv.second.second(); } } return 0; } xsimd-7.6.0/benchmark/xsimd_benchmark.hpp000066400000000000000000000707511410101234500204520ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_BENCHMARK_HPP #define XSIMD_BENCHMARK_HPP #include #include #include #include #include "xsimd/xsimd.hpp" namespace xsimd { template std::string batch_name(); template <> inline std::string batch_name>() { return "sse/neon float"; } template <> inline std::string batch_name>() { return "sse/neon double"; } template <> inline std::string batch_name>() { return "avx float"; } template <> inline std::string batch_name>() { return "avx double"; } template <> inline std::string batch_name>() { return "fallback float"; } template <> inline std::string batch_name>() { return "fallback double"; } using duration_type = std::chrono::duration; template using bench_vector = std::vector>; template void init_benchmark(bench_vector& lhs, bench_vector& rhs, bench_vector& res, size_t size) { lhs.resize(size); rhs.resize(size); res.resize(size); for (size_t i = 0; i < size; ++i) { lhs[i] = T(0.5) + std::sqrt(T(i)) * T(9.) / T(size); rhs[i] = T(10.2) / T(i + 2) + T(0.25); } } template void init_benchmark(bench_vector& op0, bench_vector& op1, bench_vector& op2, bench_vector& res, size_t size) { op0.resize(size); op1.resize(size); op2.resize(size); res.resize(size); for (size_t i = 0; i < size; ++i) { op0[i] = T(0.5) + std::sqrt(T(i)) * T(9.) 
/ T(size); op1[i] = T(10.2) / T(i + 2) + T(0.25); op2[i] = T(20.1) / T(i + 5) + T(0.65); } } template void init_benchmark_arctrigo(bench_vector& lhs, bench_vector& rhs, bench_vector& res, size_t size) { lhs.resize(size); rhs.resize(size); res.resize(size); for (size_t i = 0; i < size; ++i) { lhs[i] = T(-1.) + T(2.) * T(i) / T(size); rhs[i] = T(i) / T(i + 2) + T(0.25); } } enum class init_method { classic, arctrigo }; template duration_type benchmark_scalar(F f, V& lhs, V& res, std::size_t number) { size_t s = lhs.size(); duration_type t_res = duration_type::max(); for (std::size_t count = 0; count < number; ++count) { auto start = std::chrono::steady_clock::now(); for (size_t i = 0; i < s; ++i) { res[i] = f(lhs[i]); } auto end = std::chrono::steady_clock::now(); auto tmp = end - start; t_res = tmp < t_res ? tmp : t_res; } return t_res; } template duration_type benchmark_scalar(F f, V& lhs, V& rhs, V& res, std::size_t number) { size_t s = lhs.size(); duration_type t_res = duration_type::max(); for (std::size_t count = 0; count < number; ++count) { auto start = std::chrono::steady_clock::now(); for (size_t i = 0; i < s; ++i) { res[i] = f(lhs[i], rhs[i]); } auto end = std::chrono::steady_clock::now(); auto tmp = end - start; t_res = tmp < t_res ? tmp : t_res; } return t_res; } template duration_type benchmark_scalar(F f, V& op0, V& op1, V& op2, V& res, std::size_t number) { size_t s = op0.size(); duration_type t_res = duration_type::max(); for (std::size_t count = 0; count < number; ++count) { auto start = std::chrono::steady_clock::now(); for (size_t i = 0; i < s; ++i) { res[i] = f(op0[i], op1[i], op2[i]); } auto end = std::chrono::steady_clock::now(); auto tmp = end - start; t_res = tmp < t_res ? tmp : t_res; } return t_res; } template duration_type benchmark_simd(F f, V& lhs, V& res, std::size_t number) { std::size_t s = lhs.size(); duration_type t_res = duration_type::max(); for (std::size_t count = 0; count < number; ++count) { auto start = std::chrono::steady_clock::now(); for (std::size_t i = 0; i <= (s - B::size); i += B::size) { B blhs(&lhs[i], aligned_mode()); B bres = f(blhs); bres.store_aligned(&res[i]); } auto end = std::chrono::steady_clock::now(); auto tmp = end - start; t_res = tmp < t_res ? tmp : t_res; } return t_res; } template duration_type benchmark_simd_unrolled(F f, V& lhs, V& res, std::size_t number) { std::size_t s = lhs.size(); std::size_t inc = 4 * B::size; duration_type t_res = duration_type::max(); for (std::size_t count = 0; count < number; ++count) { auto start = std::chrono::steady_clock::now(); for (std::size_t i = 0; i <= (s - inc); i += inc) { size_t j = i + B::size; size_t k = j + B::size; size_t l = k + B::size; B blhs(&lhs[i], aligned_mode()), blhs2(&lhs[j], aligned_mode()), blhs3(&lhs[k], aligned_mode()), blhs4(&lhs[l], aligned_mode()); B bres = f(blhs); B bres2 = f(blhs2); B bres3 = f(blhs3); B bres4 = f(blhs4); bres.store_aligned(&res[i]); bres2.store_aligned(&res[j]); bres3.store_aligned(&res[k]); bres4.store_aligned(&res[l]); } auto end = std::chrono::steady_clock::now(); auto tmp = end - start; t_res = tmp < t_res ? 
tmp : t_res; } return t_res; } template duration_type benchmark_simd(F f, V& lhs, V& rhs, V& res, std::size_t number) { std::size_t s = lhs.size(); duration_type t_res = duration_type::max(); for (std::size_t count = 0; count < number; ++count) { auto start = std::chrono::steady_clock::now(); for (std::size_t i = 0; i <= (s - B::size); i += B::size) { B blhs(&lhs[i], aligned_mode()), brhs(&rhs[i], aligned_mode()); B bres = f(blhs, brhs); bres.store_aligned(&res[i]); } auto end = std::chrono::steady_clock::now(); auto tmp = end - start; t_res = tmp < t_res ? tmp : t_res; } return t_res; } template duration_type benchmark_simd_unrolled(F f, V& lhs, V& rhs, V& res, std::size_t number) { std::size_t s = lhs.size(); std::size_t inc = 4 * B::size; duration_type t_res = duration_type::max(); for (std::size_t count = 0; count < number; ++count) { auto start = std::chrono::steady_clock::now(); for (std::size_t i = 0; i <= (s - inc); i += inc) { size_t j = i + B::size; size_t k = j + B::size; size_t l = k + B::size; B blhs(&lhs[i], aligned_mode()), brhs(&rhs[i], aligned_mode()), blhs2(&lhs[j], aligned_mode()), brhs2(&rhs[j], aligned_mode()); B blhs3(&lhs[k], aligned_mode()), brhs3(&rhs[k], aligned_mode()), blhs4(&lhs[l], aligned_mode()), brhs4(&rhs[l], aligned_mode()); B bres = f(blhs, brhs); B bres2 = f(blhs2, brhs2); B bres3 = f(blhs3, brhs3); B bres4 = f(blhs4, brhs4); bres.store_aligned(&res[i]); bres2.store_aligned(&res[j]); bres3.store_aligned(&res[k]); bres4.store_aligned(&res[l]); } auto end = std::chrono::steady_clock::now(); auto tmp = end - start; t_res = tmp < t_res ? tmp : t_res; } return t_res; } template duration_type benchmark_simd(F f, V& op0, V& op1, V& op2, V& res, std::size_t number) { std::size_t s = op0.size(); duration_type t_res = duration_type::max(); for (std::size_t count = 0; count < number; ++count) { auto start = std::chrono::steady_clock::now(); for (std::size_t i = 0; i <= (s - B::size); i += B::size) { B bop0(&op0[i], aligned_mode()), bop1(&op1[i], aligned_mode()), bop2(&op2[i], aligned_mode()); B bres = f(bop0, bop1, bop2); bres.store_aligned(&res[i]); } auto end = std::chrono::steady_clock::now(); auto tmp = end - start; t_res = tmp < t_res ? tmp : t_res; } return t_res; } template duration_type benchmark_simd_unrolled(F f, V& op0, V& op1, V& op2, V& res, std::size_t number) { std::size_t s = op0.size(); std::size_t inc = 4 * B::size; duration_type t_res = duration_type::max(); for (std::size_t count = 0; count < number; ++count) { auto start = std::chrono::steady_clock::now(); for (std::size_t i = 0; i <= (s - inc); i += inc) { size_t j = i + B::size; size_t k = j + B::size; size_t l = k + B::size; B bop0_i(&op0[i], aligned_mode()), bop1_i(&op1[i], aligned_mode()), bop2_i(&op2[i], aligned_mode()); B bop0_j(&op0[j], aligned_mode()), bop1_j(&op1[j], aligned_mode()), bop2_j(&op2[j], aligned_mode()); B bop0_k(&op0[k], aligned_mode()), bop1_k(&op1[k], aligned_mode()), bop2_k(&op2[k], aligned_mode()); B bop0_l(&op0[l], aligned_mode()), bop1_l(&op1[l], aligned_mode()), bop2_l(&op2[l], aligned_mode()); B bres_i = f(bop0_i, bop1_i, bop2_i); B bres_j = f(bop0_j, bop1_j, bop2_j); B bres_k = f(bop0_k, bop1_k, bop2_k); B bres_l = f(bop0_l, bop1_l, bop2_l); bres_i.store_aligned(&res[i]); bres_j.store_aligned(&res[j]); bres_k.store_aligned(&res[k]); bres_l.store_aligned(&res[l]); } auto end = std::chrono::steady_clock::now(); auto tmp = end - start; t_res = tmp < t_res ? 
tmp : t_res; } return t_res; } template void run_benchmark_1op(F f, OS& out, std::size_t size, std::size_t iter, init_method init = init_method::classic) { bench_vector f_lhs, f_rhs, f_res; bench_vector d_lhs, d_rhs, d_res; switch (init) { case init_method::classic: init_benchmark(f_lhs, f_rhs, f_res, size); init_benchmark(d_lhs, d_rhs, d_res, size); break; case init_method::arctrigo: init_benchmark_arctrigo(f_lhs, f_rhs, f_res, size); init_benchmark_arctrigo(d_lhs, d_rhs, d_res, size); break; default: init_benchmark(f_lhs, f_rhs, f_res, size); init_benchmark(d_lhs, d_rhs, d_res, size); break; } #ifndef XSIMD_POLY_BENCHMARKS duration_type t_float_scalar = benchmark_scalar(f, f_lhs, f_res, iter); duration_type t_double_scalar = benchmark_scalar(f, d_lhs, d_res, iter); #endif #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION duration_type t_float_sse = benchmark_simd>(f, f_lhs, f_res, iter); duration_type t_float_sse_u = benchmark_simd_unrolled>(f, f_lhs, f_res, iter); duration_type t_double_sse = benchmark_simd>(f, d_lhs, d_res, iter); duration_type t_double_sse_u = benchmark_simd_unrolled>(f, d_lhs, d_res, iter); #endif #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION duration_type t_float_avx = benchmark_simd>(f, f_lhs, f_res, iter); duration_type t_float_avx_u = benchmark_simd_unrolled>(f, f_lhs, f_res, iter); duration_type t_double_avx = benchmark_simd>(f, d_lhs, d_res, iter); duration_type t_double_avx_u = benchmark_simd_unrolled>(f, d_lhs, d_res, iter); #endif #if defined(XSIMD_ARM_INSTR_SET) duration_type t_float_neon = benchmark_simd>(f, f_lhs, f_res, iter); duration_type t_float_neon_u = benchmark_simd_unrolled>(f, f_lhs, f_res, iter); duration_type t_double_neon = benchmark_simd>(f, d_lhs, d_res, iter); duration_type t_double_neon_u = benchmark_simd_unrolled>(f, d_lhs, d_res, iter); #endif #if defined(XSIMD_ENABLE_FALLBACK) duration_type t_float_fallback = benchmark_simd>(f, f_lhs, f_res, iter); duration_type t_float_fallback_u = benchmark_simd_unrolled>(f, f_lhs, f_res, iter); duration_type t_double_fallback = benchmark_simd>(f, d_lhs, d_res, iter); duration_type t_double_fallback_u = benchmark_simd_unrolled>(f, d_lhs, d_res, iter); #endif out << "============================" << std::endl; out << f.name() << std::endl; #ifndef XSIMD_POLY_BENCHMARKS out << "scalar float : " << t_float_scalar.count() << "ms" << std::endl; #endif #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION out << "sse float : " << t_float_sse.count() << "ms" << std::endl; out << "sse float unr : " << t_float_sse_u.count() << "ms" << std::endl; #endif #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION out << "avx float : " << t_float_avx.count() << "ms" << std::endl; out << "avx float unr : " << t_float_avx_u.count() << "ms" << std::endl; #endif #if defined(XSIMD_ARM_INSTR_SET) out << "neon float : " << t_float_neon.count() << "ms" << std::endl; out << "neon float unr : " << t_float_neon_u.count() << "ms" << std::endl; #endif #if defined(XSIMD_ENABLE_FALLBACK) out << "flbk float : " << t_float_fallback.count() << "ms" << std::endl; out << "flbk float unr : " << t_float_fallback_u.count() << "ms" << std::endl; #endif #ifndef XSIMD_POLY_BENCHMARKS out << "scalar double : " << t_double_scalar.count() << "ms" << std::endl; #endif #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION out << "sse double : " << t_double_sse.count() << "ms" << std::endl; out << "sse double unr : " << t_double_sse_u.count() << "ms" << std::endl; #endif #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION out << "avx double : " << 
t_double_avx.count() << "ms" << std::endl; out << "avx double unr : " << t_double_avx_u.count() << "ms" << std::endl; #endif #if defined(XSIMD_ARM_INSTR_SET) out << "neon double : " << t_double_neon.count() << "ms" << std::endl; out << "neon double unr: " << t_double_neon_u.count() << "ms" << std::endl; #endif #if defined(XSIMD_ENABLE_FALLBACK) out << "flbk double : " << t_double_fallback.count() << "ms" << std::endl; out << "flbk double unr: " << t_double_fallback_u.count() << "ms" << std::endl; #endif out << "============================" << std::endl; } template void run_benchmark_2op(F f, OS& out, std::size_t size, std::size_t iter) { bench_vector f_lhs, f_rhs, f_res; bench_vector d_lhs, d_rhs, d_res; init_benchmark(f_lhs, f_rhs, f_res, size); init_benchmark(d_lhs, d_rhs, d_res, size); duration_type t_float_scalar = benchmark_scalar(f, f_lhs, f_rhs, f_res, iter); #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION duration_type t_float_sse = benchmark_simd>(f, f_lhs, f_rhs, f_res, iter); duration_type t_float_sse_u = benchmark_simd_unrolled>(f, f_lhs, f_rhs, f_res, iter); #endif #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION duration_type t_float_avx = benchmark_simd>(f, f_lhs, f_rhs, f_res, iter); duration_type t_float_avx_u = benchmark_simd_unrolled>(f, f_lhs, f_rhs, f_res, iter); #endif duration_type t_double_scalar = benchmark_scalar(f, d_lhs, d_rhs, d_res, iter); #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION duration_type t_double_sse = benchmark_simd>(f, d_lhs, d_rhs, d_res, iter); duration_type t_double_sse_u = benchmark_simd_unrolled>(f, d_lhs, d_rhs, d_res, iter); #endif #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION duration_type t_double_avx = benchmark_simd>(f, d_lhs, d_rhs, d_res, iter); duration_type t_double_avx_u = benchmark_simd_unrolled>(f, d_lhs, d_rhs, d_res, iter); #endif #if defined(XSIMD_ARM_INSTR_SET) duration_type t_float_neon = benchmark_simd>(f, f_lhs, f_rhs, f_res, iter); duration_type t_float_neon_u = benchmark_simd_unrolled>(f, f_lhs, f_rhs, f_res, iter); duration_type t_double_neon = benchmark_simd>(f, d_lhs, d_rhs, d_res, iter); duration_type t_double_neon_u = benchmark_simd_unrolled>(f, d_lhs, d_rhs, d_res, iter); #endif #if defined(XSIMD_ENABLE_FALLBACK) duration_type t_float_fallback = benchmark_simd>(f, f_lhs, f_rhs, f_res, iter); duration_type t_float_fallback_u = benchmark_simd_unrolled>(f, f_lhs, f_rhs, f_res, iter); duration_type t_double_fallback = benchmark_simd>(f, d_lhs, d_rhs, d_res, iter); duration_type t_double_fallback_u = benchmark_simd_unrolled>(f, d_lhs, d_rhs, d_res, iter); #endif out << "============================" << std::endl; out << f.name() << std::endl; out << "scalar float : " << t_float_scalar.count() << "ms" << std::endl; #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION out << "sse float : " << t_float_sse.count() << "ms" << std::endl; out << "sse float unr : " << t_float_sse_u.count() << "ms" << std::endl; #endif #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION out << "avx float : " << t_float_avx.count() << "ms" << std::endl; out << "avx float unr : " << t_float_avx_u.count() << "ms" << std::endl; #endif #if defined(XSIMD_ARM_INSTR_SET) out << "neon float : " << t_float_neon.count() << "ms" << std::endl; out << "neon float unr : " << t_float_neon_u.count() << "ms" << std::endl; #endif #if defined(XSIMD_ENABLE_FALLBACK) out << "flbk float : " << t_float_fallback.count() << "ms" << std::endl; out << "flbk float unr : " << t_float_fallback_u.count() << "ms" << std::endl; #endif out << "scalar double : " << 
t_double_scalar.count() << "ms" << std::endl; #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION out << "sse double : " << t_double_sse.count() << "ms" << std::endl; out << "sse double unr : " << t_double_sse_u.count() << "ms" << std::endl; #endif #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION out << "avx double : " << t_double_avx.count() << "ms" << std::endl; out << "avx double unr : " << t_double_avx_u.count() << "ms" << std::endl; #endif #if defined(XSIMD_ARM_INSTR_SET) out << "neon double : " << t_double_neon.count() << "ms" << std::endl; out << "neon double unr: " << t_double_neon_u.count() << "ms" << std::endl; #endif #if defined(XSIMD_ENABLE_FALLBACK) out << "flbk double : " << t_double_fallback.count() << "ms" << std::endl; out << "flbk double unr: " << t_double_fallback_u.count() << "ms" << std::endl; #endif out << "============================" << std::endl; } template void run_benchmark_3op(F f, OS& out, std::size_t size, std::size_t iter) { bench_vector f_op0, f_op1, f_op2, f_res; bench_vector d_op0, d_op1, d_op2, d_res; init_benchmark(f_op0, f_op1, f_op2, f_res, size); init_benchmark(d_op0, d_op1, d_op2, d_res, size); duration_type t_float_scalar = benchmark_scalar(f, f_op0, f_op1, f_op2, f_res, iter); #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION duration_type t_float_sse = benchmark_simd>(f, f_op0, f_op1, f_op2, f_res, iter); duration_type t_float_sse_u = benchmark_simd_unrolled>(f, f_op0, f_op1, f_op2, f_res, iter); #endif #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION duration_type t_float_avx = benchmark_simd>(f, f_op0, f_op1, f_op2, f_res, iter); duration_type t_float_avx_u = benchmark_simd_unrolled>(f, f_op0, f_op1, f_op2, f_res, iter); #endif duration_type t_double_scalar = benchmark_scalar(f, d_op0, d_op1, d_op2, d_res, iter); #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION duration_type t_double_sse = benchmark_simd>(f, d_op0, d_op1, d_op2, d_res, iter); duration_type t_double_sse_u = benchmark_simd_unrolled>(f, d_op0, d_op1, d_op2, d_res, iter); #endif #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION duration_type t_double_avx = benchmark_simd>(f, d_op0, d_op1, d_op2, d_res, iter); duration_type t_double_avx_u = benchmark_simd_unrolled>(f, d_op0, d_op1, d_op2, d_res, iter); #endif #if defined(XSIMD_ARM_INSTR_SET) duration_type t_float_neon = benchmark_simd>(f, f_op0, f_op1, f_op2, f_res, iter); duration_type t_float_neon_u = benchmark_simd_unrolled>(f, f_op0, f_op1, f_op2, f_res, iter); duration_type t_double_neon = benchmark_simd>(f, d_op0, d_op1, d_op2, d_res, iter); duration_type t_double_neon_u = benchmark_simd_unrolled>(f, d_op0, d_op1, d_op2, d_res, iter); #endif #if defined(XSIMD_ENABLE_FALLBACK) duration_type t_float_fallback = benchmark_simd>(f, f_op0, f_op1, f_op2, f_res, iter); duration_type t_float_fallback_u = benchmark_simd_unrolled>(f, f_op0, f_op1, f_op2, f_res, iter); duration_type t_double_fallback = benchmark_simd>(f, d_op0, d_op1, d_op2, d_res, iter); duration_type t_double_fallback_u = benchmark_simd_unrolled>(f, d_op0, d_op1, d_op2, d_res, iter); #endif out << "============================" << std::endl; out << f.name() << std::endl; out << "scalar float : " << t_float_scalar.count() << "ms" << std::endl; #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION out << "sse float : " << t_float_sse.count() << "ms" << std::endl; out << "sse float unr : " << t_float_sse_u.count() << "ms" << std::endl; #endif #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION out << "avx float : " << t_float_avx.count() << "ms" << std::endl; out << "avx float unr 
: " << t_float_avx_u.count() << "ms" << std::endl; #endif #if defined(XSIMD_ARM_INSTR_SET) out << "neon float : " << t_float_neon.count() << "ms" << std::endl; out << "neon float unr : " << t_float_neon_u.count() << "ms" << std::endl; #endif #if defined(XSIMD_ENABLE_FALLBACK) out << "flbk float : " << t_float_fallback.count() << "ms" << std::endl; out << "flbk float unr : " << t_float_fallback_u.count() << "ms" << std::endl; #endif out << "scalar double : " << t_double_scalar.count() << "ms" << std::endl; #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION out << "sse double : " << t_double_sse.count() << "ms" << std::endl; out << "sse double unr : " << t_double_sse_u.count() << "ms" << std::endl; #endif #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION out << "avx double : " << t_double_avx.count() << "ms" << std::endl; out << "avx double unr : " << t_double_avx_u.count() << "ms" << std::endl; #endif #if defined(XSIMD_ARM_INSTR_SET) out << "neon double : " << t_double_neon.count() << "ms" << std::endl; out << "neon double unr: " << t_double_neon_u.count() << "ms" << std::endl; #endif #if defined(XSIMD_ENABLE_FALLBACK) out << "flbk double : " << t_double_fallback.count() << "ms" << std::endl; out << "flbk double unr: " << t_double_fallback_u.count() << "ms" << std::endl; #endif out << "============================" << std::endl; } #define DEFINE_OP_FUNCTOR_2OP(OP, NAME)\ struct NAME##_fn {\ template \ inline T operator()(const T& lhs, const T& rhs) const { return lhs OP rhs; }\ inline std::string name() const { return #NAME; }\ } #define DEFINE_FUNCTOR_1OP(FN)\ struct FN##_fn {\ template \ inline T operator()(const T& x) const { using xsimd::FN; return FN(x); }\ inline std::string name() const { return #FN; }\ } #define DEFINE_FUNCTOR_1OP_TEMPLATE(FN, N, ...)\ struct FN##_##N##_fn {\ template \ inline T operator()(const T& x) const { using xsimd::FN; return FN(x); }\ inline std::string name() const { return #FN " " #N ; }\ } #define DEFINE_FUNCTOR_2OP(FN)\ struct FN##_fn{\ template \ inline T operator()(const T&lhs, const T& rhs) const { using xsimd::FN; return FN(lhs, rhs); }\ inline std::string name() const { return #FN; }\ } #define DEFINE_FUNCTOR_3OP(FN)\ struct FN##_fn{\ template \ inline T operator()(const T& op0, const T& op1, const T& op2) const { using xsimd::FN; return FN(op0, op1, op2); }\ inline std::string name() const { return #FN; }\ } DEFINE_OP_FUNCTOR_2OP(+, add); DEFINE_OP_FUNCTOR_2OP(-, sub); DEFINE_OP_FUNCTOR_2OP(*, mul); DEFINE_OP_FUNCTOR_2OP(/, div); DEFINE_FUNCTOR_1OP(exp); DEFINE_FUNCTOR_1OP(exp2); DEFINE_FUNCTOR_1OP(expm1); DEFINE_FUNCTOR_1OP(log); DEFINE_FUNCTOR_1OP(log10); DEFINE_FUNCTOR_1OP(log2); DEFINE_FUNCTOR_1OP(log1p); DEFINE_FUNCTOR_1OP(sin); DEFINE_FUNCTOR_1OP(cos); DEFINE_FUNCTOR_1OP(tan); DEFINE_FUNCTOR_1OP(asin); DEFINE_FUNCTOR_1OP(acos); DEFINE_FUNCTOR_1OP(atan); DEFINE_FUNCTOR_1OP(sinh); DEFINE_FUNCTOR_1OP(cosh); DEFINE_FUNCTOR_1OP(tanh); DEFINE_FUNCTOR_1OP(asinh); DEFINE_FUNCTOR_1OP(acosh); DEFINE_FUNCTOR_1OP(atanh); DEFINE_FUNCTOR_2OP(pow); DEFINE_FUNCTOR_1OP(sqrt); DEFINE_FUNCTOR_1OP(cbrt); DEFINE_FUNCTOR_2OP(hypot); DEFINE_FUNCTOR_1OP(ceil); DEFINE_FUNCTOR_1OP(floor); DEFINE_FUNCTOR_1OP(trunc); DEFINE_FUNCTOR_1OP(round); DEFINE_FUNCTOR_1OP(nearbyint); DEFINE_FUNCTOR_1OP(rint); DEFINE_FUNCTOR_2OP(fmod); DEFINE_FUNCTOR_2OP(remainder); DEFINE_FUNCTOR_2OP(fdim); DEFINE_FUNCTOR_3OP(clip); #if 0 DEFINE_FUNCTOR_1OP(isfinite); DEFINE_FUNCTOR_1OP(isinf); DEFINE_FUNCTOR_1OP(is_flint); DEFINE_FUNCTOR_1OP(is_odd); DEFINE_FUNCTOR_1OP(is_even); #endif #ifdef 
XSIMD_POLY_BENCHMARKS DEFINE_FUNCTOR_1OP_TEMPLATE(horner, 5, 1, 2, 3, 4, 5); DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, 5, 1, 2, 3, 4, 5); DEFINE_FUNCTOR_1OP_TEMPLATE(horner, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10); DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10); DEFINE_FUNCTOR_1OP_TEMPLATE(horner, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12); DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12); DEFINE_FUNCTOR_1OP_TEMPLATE(horner, 14, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14); DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, 14, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14); DEFINE_FUNCTOR_1OP_TEMPLATE(horner, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); #endif } #endif xsimd-7.6.0/docs/000077500000000000000000000000001410101234500135675ustar00rootroot00000000000000xsimd-7.6.0/docs/Doxyfile000066400000000000000000000005331410101234500152760ustar00rootroot00000000000000PROJECT_NAME = "xsimd" XML_OUTPUT = xml INPUT = ../include source/api/xsimd_batch.hpp GENERATE_LATEX = NO GENERATE_MAN = NO GENERATE_RTF = NO CASE_SENSE_NAMES = NO GENERATE_HTML = NO GENERATE_XML = YES RECURSIVE = YES QUIET = YES JAVADOC_AUTOBRIEF = YES WARN_IF_UNDOCUMENTED = NO xsimd-7.6.0/docs/Makefile000066400000000000000000000147261410101234500152410ustar00rootroot00000000000000# You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = sphinx-build PAPER = BUILDDIR = build # User-friendly check for sphinx-build ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) endif # Internal variables. 
PAPEROPT_a4 = -D latex_paper_size=a4 PAPEROPT_letter = -D latex_paper_size=letter ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source # the i18n builder cannot share the environment and doctrees with the others I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext api default: html help: @echo "Please use \`make ' where is one of" @echo " html to make standalone HTML files" @echo " dirhtml to make HTML files named index.html in directories" @echo " singlehtml to make a single large HTML file" @echo " pickle to make pickle files" @echo " json to make JSON files" @echo " htmlhelp to make HTML files and a HTML help project" @echo " qthelp to make HTML files and a qthelp project" @echo " applehelp to make an Apple Help Book" @echo " devhelp to make HTML files and a Devhelp project" @echo " epub to make an epub" @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" @echo " latexpdf to make LaTeX files and run them through pdflatex" @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" @echo " text to make text files" @echo " man to make manual pages" @echo " texinfo to make Texinfo files" @echo " info to make Texinfo files and run them through makeinfo" @echo " gettext to make PO message catalogs" @echo " changes to make an overview of all changed/added/deprecated items" @echo " xml to make Docutils-native XML files" @echo " pseudoxml to make pseudoxml-XML files for display purposes" @echo " linkcheck to check all external links for integrity" @echo " doctest to run all doctests embedded in the documentation (if enabled)" @echo " coverage to run coverage check of the documentation (if enabled)" clean: rm -rf $(BUILDDIR)/* html: doxygen $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." dirhtml: doxygen $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." singlehtml: doxygen $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml @echo @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." pickle: doxygen $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle @echo @echo "Build finished; now you can process the pickle files." json: doxygen $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json @echo @echo "Build finished; now you can process the JSON files." htmlhelp: doxygen $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp @echo @echo "Build finished; now you can run HTML Help Workshop with the" \ ".hhp project file in $(BUILDDIR)/htmlhelp." epub: doxygen $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub @echo @echo "Build finished. The epub file is in $(BUILDDIR)/epub." latex: doxygen $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." @echo "Run \`make' in that directory to run these through (pdf)latex" \ "(use \`make latexpdf' here to do that automatically)." latexpdf: doxygen $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through pdflatex..." $(MAKE) -C $(BUILDDIR)/latex all-pdf @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 
latexpdfja: doxygen $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through platex and dvipdfmx..." $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." text: doxygen $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text @echo @echo "Build finished. The text files are in $(BUILDDIR)/text." man: doxygen $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man @echo @echo "Build finished. The manual pages are in $(BUILDDIR)/man." texinfo: doxygen $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." @echo "Run \`make' in that directory to run these through makeinfo" \ "(use \`make info' here to do that automatically)." info: doxygen $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo "Running Texinfo files through makeinfo..." make -C $(BUILDDIR)/texinfo info @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." gettext: doxygen $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale @echo @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." changes: doxygen $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes @echo @echo "The overview file is in $(BUILDDIR)/changes." linkcheck: doxygen $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck @echo @echo "Link check complete; look for any errors in the above output " \ "or in $(BUILDDIR)/linkcheck/output.txt." doctest: doxygen $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest @echo "Testing of doctests in the sources finished, look at the " \ "results in $(BUILDDIR)/doctest/output.txt." coverage: doxygen $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage @echo "Testing of coverage in the sources finished, look at the " \ "results in $(BUILDDIR)/coverage/python.txt." xml: doxygen $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml @echo @echo "Build finished. The XML files are in $(BUILDDIR)/xml." pseudoxml: doxygen $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml @echo @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." xsimd-7.6.0/docs/environment.yml000066400000000000000000000001171410101234500166550ustar00rootroot00000000000000name: xsimd-docs channels: - conda-forge dependencies: - breathe==4.25.1 xsimd-7.6.0/docs/make.bat000066400000000000000000000161651410101234500152050ustar00rootroot00000000000000@ECHO OFF REM Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=sphinx-build ) set BUILDDIR=build set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source set I18NSPHINXOPTS=%SPHINXOPTS% source if NOT "%PAPER%" == "" ( set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% ) if "%1" == "" goto help if "%1" == "help" ( :help echo.Please use `make ^` where ^ is one of echo. html to make standalone HTML files echo. dirhtml to make HTML files named index.html in directories echo. singlehtml to make a single large HTML file echo. pickle to make pickle files echo. json to make JSON files echo. htmlhelp to make HTML files and a HTML help project echo. qthelp to make HTML files and a qthelp project echo. devhelp to make HTML files and a Devhelp project echo. epub to make an epub echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter echo. text to make text files echo. man to make manual pages echo. 
texinfo to make Texinfo files echo. gettext to make PO message catalogs echo. changes to make an overview over all changed/added/deprecated items echo. xml to make Docutils-native XML files echo. pseudoxml to make pseudoxml-XML files for display purposes echo. linkcheck to check all external links for integrity echo. doctest to run all doctests embedded in the documentation if enabled echo. coverage to run coverage check of the documentation if enabled goto end ) if "%1" == "clean" ( for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i del /q /s %BUILDDIR%\* goto end ) REM Check if sphinx-build is available and fallback to Python version if any %SPHINXBUILD% 1>NUL 2>NUL if errorlevel 9009 goto sphinx_python goto sphinx_ok :sphinx_python set SPHINXBUILD=python -m sphinx.__init__ %SPHINXBUILD% 2> nul if errorlevel 9009 ( echo. echo.The 'sphinx-build' command was not found. Make sure you have Sphinx echo.installed, then set the SPHINXBUILD environment variable to point echo.to the full path of the 'sphinx-build' executable. Alternatively you echo.may add the Sphinx directory to PATH. echo. echo.If you don't have Sphinx installed, grab it from echo.http://sphinx-doc.org/ exit /b 1 ) :sphinx_ok if "%1" == "html" ( doxygen %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/html. goto end ) if "%1" == "dirhtml" ( %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. goto end ) if "%1" == "singlehtml" ( %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. goto end ) if "%1" == "pickle" ( %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can process the pickle files. goto end ) if "%1" == "json" ( %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can process the JSON files. goto end ) if "%1" == "htmlhelp" ( %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can run HTML Help Workshop with the ^ .hhp project file in %BUILDDIR%/htmlhelp. goto end ) if "%1" == "qthelp" ( %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can run "qcollectiongenerator" with the ^ .qhcp project file in %BUILDDIR%/qthelp, like this: echo.^> qcollectiongenerator %BUILDDIR%\qthelp\packagename.qhcp echo.To view the help file: echo.^> assistant -collectionFile %BUILDDIR%\qthelp\packagename.ghc goto end ) if "%1" == "devhelp" ( %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp if errorlevel 1 exit /b 1 echo. echo.Build finished. goto end ) if "%1" == "epub" ( %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub if errorlevel 1 exit /b 1 echo. echo.Build finished. The epub file is in %BUILDDIR%/epub. goto end ) if "%1" == "latex" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex if errorlevel 1 exit /b 1 echo. echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. goto end ) if "%1" == "latexpdf" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex cd %BUILDDIR%/latex make all-pdf cd %~dp0 echo. echo.Build finished; the PDF files are in %BUILDDIR%/latex. 
goto end ) if "%1" == "latexpdfja" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex cd %BUILDDIR%/latex make all-pdf-ja cd %~dp0 echo. echo.Build finished; the PDF files are in %BUILDDIR%/latex. goto end ) if "%1" == "text" ( %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text if errorlevel 1 exit /b 1 echo. echo.Build finished. The text files are in %BUILDDIR%/text. goto end ) if "%1" == "man" ( %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man if errorlevel 1 exit /b 1 echo. echo.Build finished. The manual pages are in %BUILDDIR%/man. goto end ) if "%1" == "texinfo" ( %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo if errorlevel 1 exit /b 1 echo. echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. goto end ) if "%1" == "gettext" ( %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale if errorlevel 1 exit /b 1 echo. echo.Build finished. The message catalogs are in %BUILDDIR%/locale. goto end ) if "%1" == "changes" ( %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes if errorlevel 1 exit /b 1 echo. echo.The overview file is in %BUILDDIR%/changes. goto end ) if "%1" == "linkcheck" ( %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck if errorlevel 1 exit /b 1 echo. echo.Link check complete; look for any errors in the above output ^ or in %BUILDDIR%/linkcheck/output.txt. goto end ) if "%1" == "doctest" ( %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest if errorlevel 1 exit /b 1 echo. echo.Testing of doctests in the sources finished, look at the ^ results in %BUILDDIR%/doctest/output.txt. goto end ) if "%1" == "coverage" ( %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage if errorlevel 1 exit /b 1 echo. echo.Testing of coverage in the sources finished, look at the ^ results in %BUILDDIR%/coverage/python.txt. goto end ) if "%1" == "xml" ( %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml if errorlevel 1 exit /b 1 echo. echo.Build finished. The XML files are in %BUILDDIR%/xml. goto end ) if "%1" == "pseudoxml" ( %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml if errorlevel 1 exit /b 1 echo. echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. goto end ) :end xsimd-7.6.0/docs/source/000077500000000000000000000000001410101234500150675ustar00rootroot00000000000000xsimd-7.6.0/docs/source/_static/000077500000000000000000000000001410101234500165155ustar00rootroot00000000000000xsimd-7.6.0/docs/source/_static/main_stylesheet.css000066400000000000000000000000741410101234500224250ustar00rootroot00000000000000.wy-nav-content{ max-width: 1000px; margin: auto; } xsimd-7.6.0/docs/source/api/000077500000000000000000000000001410101234500156405ustar00rootroot00000000000000xsimd-7.6.0/docs/source/api/aligned_allocator.rst000066400000000000000000000006701410101234500220400ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Aligned memory allocator ======================== .. doxygenclass:: xsimd::aligned_allocator :project: xsimd :members: Comparison operators -------------------- .. doxygengroup:: allocator_comparison :project: xsimd :content-only: xsimd-7.6.0/docs/source/api/available_wrappers.rst000066400000000000000000000251141410101234500222400ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. 
The full license is in the file LICENSE, distributed with this software. .. raw:: html Available wrappers ================== The :ref:`batch ` and :ref:`batch_bool ` generic template classes are not implemented by default, only full specializations of these templates are available depending on the instruction set macros defined according to the instruction sets provided by the compiler. Fallback implementation ----------------------- You may optionally enable a fallback implementation, which translates batch and batch_bool variants that do not exist in hardware into scalar loops. This is done by setting the XSIMD_ENABLE_FALLBACK preprocessor flag before including any xsimd header. This scalar fallback enables you to test the correctness of your computations without having matching hardware available, but you should be aware that it is only intended for use in validation scenarios. It has generally speaking not been tuned for performance, and its run-time characteristics may vary enormously from one compiler to another. Enabling it in performance-conscious production code is therefore strongly discouraged. x86 architecture ---------------- Depending on the value of XSIMD_X86_INSTR_SET, the following wrappers are available: - XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION +--------------------------------+-------------------------------------+ | batch | batch_bool | +================================+=====================================+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch, 4> | batch_bool, 4> | +--------------------------------+-------------------------------------+ | batch, 2> | batch_bool, 2> | +--------------------------------+-------------------------------------+ - XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION In addition to the wrappers defined above, the following wrappers are available: +--------------------------------+-------------------------------------+ | batch | batch_bool | +================================+=====================================+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch | batch_bool | 
+--------------------------------+-------------------------------------+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch, 8> | batch_bool, 8> | +--------------------------------+-------------------------------------+ | batch, 4> | batch_bool, 4> | +--------------------------------+-------------------------------------+ - XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX512_VERSION In addition to the wrappers defined above, the following wrappers are available: +--------------------------------+-------------------------------------+ | batch | batch_bool | +================================+=====================================+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch, 16> | batch_bool, 16> | +--------------------------------+-------------------------------------+ | batch, 8> | batch_bool, 8> | +--------------------------------+-------------------------------------+ ARM architecture ---------------- Depending on the value of XSIMD_ARM_INSTR_SET, the following wrappers are available: - XSIMD_ARM_INSTR_SET >= XSIMD_ARM7_NEON_VERSION +--------------------------------+-------------------------------------+ | batch | batch_bool | +================================+=====================================+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch, 4> | batch_bool, 4> | +--------------------------------+-------------------------------------+ - XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION In addition to the wrappers defined above, the following wrappers are available: +--------------------------------+-------------------------------------+ | 
batch | batch_bool | +================================+=====================================+ | batch | batch_bool | +--------------------------------+-------------------------------------+ | batch, 2> | batch_bool, 2> | +--------------------------------+-------------------------------------+ .. warning:: Support for ``std::complex`` on ARM is still experimental. You may experience accuracy errors with ``std::complex``. XTL complex support ------------------- If the preprocessor token ``XSIMD_ENABLE_XTL_COMPLEX`` is defined, ``xsimd`` provides batches for ``xtl::xcomplex``, similar to those for ``std::complex``. This requires ``xtl`` to be installed. xsimd-7.6.0/docs/source/api/basic_functions.rst000066400000000000000000000041471410101234500215510ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Basic functions =============== .. _abs-function-reference: .. doxygenfunction:: abs(const simd_base& ) :project: xsimd .. _fabs-function-reference: .. doxygenfunction:: fabs(const simd_base& ) :project: xsimd .. _fmod-function-reference: .. doxygenfunction:: fmod(const simd_base&, const simd_base&) :project: xsimd .. _remainder-func-ref: .. doxygenfunction:: remainder(const simd_base&, const simd_base&) :project: xsimd .. _fma-function-reference: .. doxygenfunction:: fma(const simd_base&, const simd_base&, const simd_base&) :project: xsimd .. _fms-function-reference: .. doxygenfunction:: fms(const simd_base&, const simd_base&, const simd_base&) :project: xsimd .. _fnma-function-reference: .. doxygenfunction:: fnma(const simd_base&, const simd_base&, const simd_base&) :project: xsimd .. _fnms-function-reference: .. doxygenfunction:: fnms(const simd_base&, const simd_base&, const simd_base&) :project: xsimd .. _min-function-reference: .. doxygenfunction:: min(const simd_base&, const simd_base&) :project: xsimd .. _max-function-reference: .. doxygenfunction:: max(const simd_base&, const simd_base&) :project: xsimd .. _fmin-function-reference: .. doxygenfunction:: fmin(const simd_base&, const simd_base&) :project: xsimd .. _fmax-function-reference: .. doxygenfunction:: fmax(const simd_base&, const simd_base&) :project: xsimd .. _fdim-function-reference: .. doxygenfunction:: fdim(const batch&, const batch&) :project: xsimd .. _sadd-function-reference: .. doxygenfunction:: sadd(const simd_base&, const simd_base&) :project: xsimd .. _ssub-function-reference: .. doxygenfunction:: ssub(const simd_base&, const simd_base&) :project: xsimd .. _clip-function-reference: .. doxygenfunction:: clip(const simd_base&, const simd_base&, const simd_base&) :project: xsimd xsimd-7.6.0/docs/source/api/batch_index.rst000066400000000000000000000005441410101234500206450ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Wrapper types ============= .. toctree:: xsimd_base xsimd_batch_bool xsimd_batch xsimd_complex_bool_base xsimd_complex_base available_wrappers xsimd-7.6.0/docs/source/api/classification_functions.rst000066400000000000000000000007271410101234500234630ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. 
Classification functions ======================== .. _isfinite-func-ref: .. doxygenfunction:: isfinite :project: xsimd .. _isinf-func-ref: .. doxygenfunction:: isinf :project: xsimd .. _isnan-func-ref: .. doxygenfunction:: isnan(const simd_base&) :project: xsimd xsimd-7.6.0/docs/source/api/data_transfer.rst000066400000000000000000000007251410101234500212130ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Data transfer ============= Data transfer instructions -------------------------- .. doxygengroup:: data_transfer :project: xsimd :content-only: Generic load and store ---------------------- .. doxygengroup:: generic_load_store :project: xsimd :content-only: xsimd-7.6.0/docs/source/api/error_functions.rst000066400000000000000000000010221410101234500216060ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Error and gamma functions ========================= .. _erf-function-reference: .. doxygenfunction:: erf :project: xsimd .. _erfc-function-reference: .. doxygenfunction:: erfc :project: xsimd .. _tgamma-func-ref: .. doxygenfunction:: tgamma :project: xsimd .. _lgamma-func-ref: .. doxygenfunction:: lgamma :project: xsimd xsimd-7.6.0/docs/source/api/exponential_functions.rst000066400000000000000000000015641410101234500230160ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Exponential functions ===================== .. _exp-function-reference: .. doxygenfunction:: exp :project: xsimd .. _exp2-function-reference: .. doxygenfunction:: exp2 :project: xsimd .. _exp10-func-ref: .. doxygenfunction:: exp10(const simd_base&) :project: xsimd .. _expm1-func-ref: .. doxygenfunction:: expm1(const simd_base&) :project: xsimd .. _log-function-reference: .. doxygenfunction:: log :project: xsimd .. _log2-function-reference: .. doxygenfunction:: log2(const simd_base&) :project: xsimd .. _log10-func-ref: .. doxygenfunction:: log10 :project: xsimd .. _log1p-func-ref: .. doxygenfunction:: log1p(const simd_base&) :project: xsimd xsimd-7.6.0/docs/source/api/hyperbolic_functions.rst000066400000000000000000000012511410101234500226210ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Hyperbolic functions ==================== .. _sinh-function-reference: .. doxygenfunction:: sinh :project: xsimd .. _cosh-function-reference: .. doxygenfunction:: cosh :project: xsimd .. _tanh-function-reference: .. doxygenfunction:: tanh(const simd_base&) :project: xsimd .. _asinh-func-ref: .. doxygenfunction:: asinh :project: xsimd .. _acosh-func-ref: .. doxygenfunction:: acosh :project: xsimd .. _atanh-func-ref: .. doxygenfunction:: atanh :project: xsimd xsimd-7.6.0/docs/source/api/instr_macros.rst000066400000000000000000000122121410101234500210730ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille and Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. 
raw:: html Instruction set macros ====================== `xsimd` defines different macros depending on the symbols defined by the compiler options. x86 architecture ---------------- If one of the following symbols is detected, XSIMD_X86_INSTR_SET is set to the corresponding version and XSIMD_X86_INSTR_SET_AVAILABLE is defined. +-------------------+-----------------------------+ | Symbol | Version | +===================+=============================+ | __SSE__ | XSIMD_X86_SSE_VERSION | +-------------------+-----------------------------+ | _M_IX86_FP >= 1 | XSIMD_X86_SSE_VERSION | +-------------------+-----------------------------+ | __SSE2__ | XSIMD_X86_SSE2_VERSION | +-------------------+-----------------------------+ | _M_X64 | XSIMD_X86_SSE2_VERSION | +-------------------+-----------------------------+ | _M_IX86_FP >= 2 | XSIMD_X86_SSE2_VERSION | +-------------------+-----------------------------+ | __SSE3__ | XSIMD_X86_SSE3_VERSION | +-------------------+-----------------------------+ | __SSSE3__ | XSIMD_X86_SSSE3_VERSION | +-------------------+-----------------------------+ | __SSE4_1__ | XSIMD_X86_SSE4_1_VERSION | +-------------------+-----------------------------+ | __SSE4_2__ | XSIMD_X86_SSE4_2_VERSION | +-------------------+-----------------------------+ | __AVX__ | XSIMD_X86_AVX_VERSION | +-------------------+-----------------------------+ | __FMA__ | XSIMD_X86_FMA3_VERSION | +-------------------+-----------------------------+ | __AVX2__ | XSIMD_X86_AVX2_VERSION | +-------------------+-----------------------------+ | __AVX512__ | XSIMD_X86_AVX512_VERSION | +-------------------+-----------------------------+ | __KNCNI__ | XSIMD_X86_AVX512_VERSION | +-------------------+-----------------------------+ | __AVX512F__ | XSIMD_X86_AVX512_VERSION | +-------------------+-----------------------------+ x86_AMD architecture -------------------- If one of the following symbols is detected, XSIMD_X86_AMD_INSTR_SET is set to the corresponding version and XSIMD_X86_AMD_SET_AVAILABLE is defined. +-------------------+-----------------------------+ | Symbol | Version | +===================+=============================+ | __SSE4A__ | XSIMD_X86_AMD_SSE4A_VERSION | +-------------------+-----------------------------+ | __FMA__ | XSIMD_X86_AMD_FMA4_VERSION | +-------------------+-----------------------------+ | __XOP__ | XSIMD_X86_AMD_XOP_VERSION | +-------------------+-----------------------------+ If one of the previous symbol is defined, other x86 instruction sets not specific to AMD should be available too; thus XSIMD_X86_INSTR_SET and XSIMD_X86_INSTR_SET_AVAILABLE should be defined. In that case, XSIMD_X86_AMD_INSTR_SET is set to the maximum of XSIMD_X86_INSTR_SET and the current value of XSIMD_X86_AMD_INSTR_SET. PPC architecture ---------------- If one of the following symbols is detected, XSIMD_PPC_INSTR_SET is set to the corresponding version and XSIMD_PPC_INSTR_AVAILABLE is defined. 
+-------------------+-----------------------------+ | Symbol | Version | +===================+=============================+ | __ALTIVEC__ | XSIMD_PPC_VMX_VERSION | +-------------------+-----------------------------+ | __VEC__ | XSIMD_PPC_VMX_VERSION | +-------------------+-----------------------------+ | __VSX__ | XSIMD_PPC_VSX_VERSION | +-------------------+-----------------------------+ | __VECTOR4DOUBLE__ | XSIMD_PPC_QPX_VERSION | +-------------------+-----------------------------+ ARM architecture ---------------- If one of the following condition is detected, XSIMD_ARM_INSTR_SET is set to the corresponding version and XSIMD_ARM_INSTR_AVAILABLE is defined. +-------------------+-----------------------------+ | Symbol | Version | +===================+=============================+ | __ARM_ARCH == 7 | XSIMD_ARM7_NEON_VERSION | +-------------------+-----------------------------+ | __ARM_ARCH == 8 | XSIMD_ARM8_32_NEON_VERSION | | && ! __aarch64__ | | +-------------------+-----------------------------+ | __ARM_ARCH == 8 | XSIMD_ARM8_64_NEON_VERSION | | && __aarch64__ | | +-------------------+-----------------------------+ Generic instruction set ----------------------- If XSIMD_*_INSTR_SET_AVAILABLE has been defined as explained above, XSIMD_INSTR_SET is set to XSIMD_*_INSTR_SET and XSIMD_INSTR_SET_AVAILABLE is defined. xsimd-7.6.0/docs/source/api/math_index.rst000066400000000000000000000274661410101234500205310ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. raw:: html Mathematical functions ====================== .. toctree:: basic_functions +---------------------------------------+----------------------------------------------------+ | :ref:`abs ` | absolute value | +---------------------------------------+----------------------------------------------------+ | :ref:`fabs ` | absolute value | +---------------------------------------+----------------------------------------------------+ | :ref:`fmod ` | remainder of the floating point division operation | +---------------------------------------+----------------------------------------------------+ | :ref:`remainder ` | signed remainder of the division operation | +---------------------------------------+----------------------------------------------------+ | :ref:`fma ` | fused multiply-add operation | +---------------------------------------+----------------------------------------------------+ | :ref:`fms ` | fused multiply-sub operation | +---------------------------------------+----------------------------------------------------+ | :ref:`fnma ` | fused negated multiply-add operation | +---------------------------------------+----------------------------------------------------+ | :ref:`fnms ` | fused negated multiply-sub operation | +---------------------------------------+----------------------------------------------------+ | :ref:`min ` | smaller of two batches | +---------------------------------------+----------------------------------------------------+ | :ref:`max ` | larger of two batches | +---------------------------------------+----------------------------------------------------+ | :ref:`fmin ` | smaller of two batches of floating point values | +---------------------------------------+----------------------------------------------------+ | :ref:`fmax ` | larger of two batches of floating point values | 
+---------------------------------------+----------------------------------------------------+ | :ref:`fdim ` | positive difference | +---------------------------------------+----------------------------------------------------+ | :ref:`sadd ` | saturated addition | +---------------------------------------+----------------------------------------------------+ | :ref:`ssub ` | saturated subtraction | +---------------------------------------+----------------------------------------------------+ | :ref:`clip ` | clipping operation | +---------------------------------------+----------------------------------------------------+ .. toctree:: exponential_functions +---------------------------------------+----------------------------------------------------+ | :ref:`exp ` | natural exponential function | +---------------------------------------+----------------------------------------------------+ | :ref:`exp2 ` | base 2 exponential function | +---------------------------------------+----------------------------------------------------+ | :ref:`exp10 ` | base 10 exponential function | +---------------------------------------+----------------------------------------------------+ | :ref:`expm1 ` | natural exponential function, minus one | +---------------------------------------+----------------------------------------------------+ | :ref:`log ` | natural logarithm function | +---------------------------------------+----------------------------------------------------+ | :ref:`log2 ` | base 2 logarithm function | +---------------------------------------+----------------------------------------------------+ | :ref:`log10 ` | base 10 logarithm function | +---------------------------------------+----------------------------------------------------+ | :ref:`log1p ` | natural logarithm of one plus function | +---------------------------------------+----------------------------------------------------+ .. toctree:: power_functions +---------------------------------------+----------------------------------------------------+ | :ref:`pow ` | power function | +---------------------------------------+----------------------------------------------------+ | :ref:`sqrt ` | square root function | +---------------------------------------+----------------------------------------------------+ | :ref:`cbrt ` | cubic root function | +---------------------------------------+----------------------------------------------------+ | :ref:`hypot ` | hypotenuse function | +---------------------------------------+----------------------------------------------------+ .. 
toctree:: trigonometric_functions +---------------------------------------+----------------------------------------------------+ | :ref:`sin ` | sine function | +---------------------------------------+----------------------------------------------------+ | :ref:`cos ` | cosine function | +---------------------------------------+----------------------------------------------------+ | :ref:`sincos ` | sine and cosine function | +---------------------------------------+----------------------------------------------------+ | :ref:`tan ` | tangent function | +---------------------------------------+----------------------------------------------------+ | :ref:`asin ` | arc sine function | +---------------------------------------+----------------------------------------------------+ | :ref:`acos ` | arc cosine function | +---------------------------------------+----------------------------------------------------+ | :ref:`atan ` | arc tangent function | +---------------------------------------+----------------------------------------------------+ | :ref:`atan2 ` | arc tangent function, determining quadrants | +---------------------------------------+----------------------------------------------------+ .. toctree:: hyperbolic_functions +---------------------------------------+----------------------------------------------------+ | :ref:`sinh ` | hyperbolic sine function | +---------------------------------------+----------------------------------------------------+ | :ref:`cosh ` | hyperbolic cosine function | +---------------------------------------+----------------------------------------------------+ | :ref:`tanh ` | hyperbolic tangent function | +---------------------------------------+----------------------------------------------------+ | :ref:`asinh ` | inverse hyperbolic sine function | +---------------------------------------+----------------------------------------------------+ | :ref:`acosh ` | inverse hyperbolic cosine function | +---------------------------------------+----------------------------------------------------+ | :ref:`atanh ` | inverse hyperbolic tangent function | +---------------------------------------+----------------------------------------------------+ .. toctree:: error_functions +---------------------------------------+----------------------------------------------------+ | :ref:`erf ` | error function | +---------------------------------------+----------------------------------------------------+ | :ref:`erfc ` | complementary error function | +---------------------------------------+----------------------------------------------------+ | :ref:`tgamma ` | gamma function | +---------------------------------------+----------------------------------------------------+ | :ref:`lgamma ` | natural logarithm of the gamma function | +---------------------------------------+----------------------------------------------------+ .. 
toctree:: nearint_operations +---------------------------------------+----------------------------------------------------+ | :ref:`ceil ` | nearest integers not less | +---------------------------------------+----------------------------------------------------+ | :ref:`floor ` | nearest integers not greater | +---------------------------------------+----------------------------------------------------+ | :ref:`trunc ` | nearest integers not greater in magnitude | +---------------------------------------+----------------------------------------------------+ | :ref:`round ` | nearest integers, rounding away from zero | +---------------------------------------+----------------------------------------------------+ | :ref:`nearbyint ` | nearest integers using current rounding mode | +---------------------------------------+----------------------------------------------------+ | :ref:`rint ` | nearest integers using current rounding mode | +---------------------------------------+----------------------------------------------------+ .. toctree:: classification_functions +---------------------------------------+----------------------------------------------------+ | :ref:`isfinite ` | Checks for finite values | +---------------------------------------+----------------------------------------------------+ | :ref:`isinf ` | Checks for infinite values | +---------------------------------------+----------------------------------------------------+ | :ref:`isnan ` | Checks for NaN values | +---------------------------------------+----------------------------------------------------+ xsimd-7.6.0/docs/source/api/nearint_operations.rst000066400000000000000000000014751410101234500223040ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Nearest integer floating point operations ========================================= .. _ceil-function-reference: .. doxygenfunction:: ceil(const simd_base&) :project: xsimd .. _floor-func-ref: .. doxygenfunction:: floor(const simd_base&) :project: xsimd .. _trunc-func-ref: .. doxygenfunction:: trunc(const simd_base&) :project: xsimd .. _round-func-ref: .. doxygenfunction:: round(const simd_base&) :project: xsimd .. _nearbyint-func-ref: .. doxygenfunction:: nearbyint(const simd_base&) :project: xsimd .. _rint-function-reference: .. doxygenfunction:: rint(const simd_base&) :project: xsimd xsimd-7.6.0/docs/source/api/power_functions.rst000066400000000000000000000011021410101234500216100ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Power functions =============== .. _pow-function-reference: .. doxygenfunction:: pow(const simd_base&, const simd_base&) :project: xsimd .. _sqrt-function-reference: .. doxygenfunction:: sqrt(const simd_base&) :project: xsimd .. _cbrt-function-reference: .. doxygenfunction:: cbrt :project: xsimd .. _hypot-func-ref: .. doxygenfunction:: hypot :project: xsimd xsimd-7.6.0/docs/source/api/trigonometric_functions.rst000066400000000000000000000020261410101234500233470ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Trigonometric functions ======================= .. 
_sin-function-reference: .. doxygenfunction:: sin(const simd_base&) :project: xsimd .. _cos-function-reference: .. doxygenfunction:: cos(const simd_base&) :project: xsimd .. _sincos-func-ref: .. doxygenfunction:: sincos(const simd_base&, batch_type_t&, batch_type_t&) :project: xsimd .. _tan-function-reference: .. doxygenfunction:: tan(const simd_base&) :project: xsimd .. _asin-function-reference: .. doxygenfunction:: asin(const simd_base&) :project: xsimd .. _acos-function-reference: .. doxygenfunction:: acos(const simd_base&) :project: xsimd .. _atan-function-reference: .. doxygenfunction:: atan(const simd_base&) :project: xsimd .. _atan2-func-ref: .. doxygenfunction:: atan2(const simd_base&, const simd_base&) :project: xsimd xsimd-7.6.0/docs/source/api/xsimd_base.rst000066400000000000000000000020341410101234500205070ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. simd_batch ========== .. doxygenclass:: xsimd::simd_batch :project: xsimd :members: Arithmetic operators -------------------- .. doxygengroup:: simd_batch_arithmetic :project: xsimd :content-only: Comparison operators -------------------- .. doxygengroup:: simd_batch_comparison :project: xsimd :content-only: Bitwise operators ----------------- .. doxygengroup:: simd_batch_bitwise :project: xsimd :content-only: Reducers -------- .. doxygengroup:: simd_batch_reducers :project: xsimd :content-only: Miscellaneous ------------- .. doxygengroup:: simd_batch_miscellaneous :project: xsimd :content-only: Other operators --------------- .. doxygenfunction:: xsimd::operator!(const simd_batch&) :project: xsimd .. doxygenfunction:: xsimd::operator<<(std::ostream&, const simd_batch&) :project: xsimd xsimd-7.6.0/docs/source/api/xsimd_base_bool.rst000066400000000000000000000013731410101234500215270ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. simd_batch_bool =============== .. doxygenclass:: xsimd::simd_batch_bool :project: xsimd :members: Bitwise operators ----------------- .. doxygengroup:: simd_batch_bool_bitwise :project: xsimd :content-only: Logical operators ----------------- .. doxygengroup:: simd_batch_bool_logical :project: xsimd :content-only: Comparison operators -------------------- .. doxygengroup:: simd_batch_bool_comparison :project: xsimd :content-only: Reducers -------- .. doxygengroup:: simd_batch_bool_reducers :project: xsimd :content-only: xsimd-7.6.0/docs/source/api/xsimd_batch.hpp000066400000000000000000000101601410101234500206340ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) 2016, Johan Mabille and Sylvain Corlay * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ // This file is for generating the documentation namespace xsimd { /** * @class batch_bool * @brief Batch of boolean values. * * The batch_bool class represents a batch of boolean values, that can be used * in operations involving batches of integer or floating point values. 
The boolean * values are stored as integer or floating point values, depending on the type of * batch they are dedicated to. * * @tparam T The value type used for encoding boolean. * @tparam N The number of scalar in the batch. */ template class batch_bool : public simd_batch_bool> { public: /** * Builds an uninitialized batch of boolean values. */ batch_bool(); /** * Initializes all the values of the batch to \c b. */ explicit batch_bool(bool b); /** * Initializes a batch of booleans with the specified boolean values. */ batch_bool(bool b0, ..., bool bn); /** * Initializes a batch of boolean with the specified SIMD value. */ batch_bool(const simd_data& rhs); /** * Assigns the specified SIMD value. */ batch_bool& operator=(const simd_data& rhs); /** * Converts \c this to a SIMD value. */ operator simd_data() const; }; template class batch : public simd_batch> { public: /** * Builds an uninitialized batch. */ batch(); /** * Initializes all the values of the batch to \c b. */ explicit batch(T f); /** * Initializes a batch with the specified scalar values. */ batch(T f0, ..., T f3); /* * Initializes a batch to the values pointed by \c src; \c src * does not need to be aligned. */ explicit batch(const T* src); /** * Initializes a batch to the N contiguous values pointed by \c src; \c src * is not required to be aligned. */ batch(const T* src, aligned_mode); /** * Initializes a batch to the values pointed by \c src; \c src * does not need to be aligned. */ batch(const T* src, unaligned_mode); /** * Initializes a batch with the specified SIMD value. */ batch(const simd_data& rhs); /** * Assigns the specified SIMD value to the batch. */ batch& operator=(const simd_data& rhs); /** * Converts \c this to a SIMD value. */ operator simd_data() const; /** * Loads the N contiguous values pointed by \c src into the batch. * \c src must be aligned. */ batch& load_aligned(const T* src); /** * Loads the N contiguous values pointed by \c src into the batch. * \c src is not required to be aligned. */ batch& load_unaligned(const T* src); /** * Stores the N values of the batch into a contiguous array * pointed by \c dst. \c dst must be aligned. */ void store_aligned(T* dst) const; /** * Stores the N values of the batch into a contiguous array * pointed by \c dst. \c dst is not required to be aligned. */ void store_unaligned(T* dst) const; /** * Return the i-th scalar in the batch. */ T operator[](std::size_t i) const; }; } xsimd-7.6.0/docs/source/api/xsimd_batch.rst000066400000000000000000000004441410101234500206610ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. batch ===== .. _xsimd-batch-ref: .. doxygenclass:: xsimd::batch :project: xsimd :members: xsimd-7.6.0/docs/source/api/xsimd_batch_bool.rst000066400000000000000000000004701410101234500216730ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. batch_bool ========== .. _xsimd-batch-bool-ref: .. doxygenclass:: xsimd::batch_bool :project: xsimd :members: xsimd-7.6.0/docs/source/api/xsimd_complex_base.rst000066400000000000000000000016341410101234500222430ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. 
The full license is in the file LICENSE, distributed with this software.

simd_complex_batch
==================

.. doxygenclass:: xsimd::simd_complex_batch
   :project: xsimd
   :members:

Arithmetic operators
--------------------

.. doxygengroup:: simd_complex_batch_arithmetic
   :project: xsimd
   :content-only:

Comparison operators
--------------------

.. doxygengroup:: simd_complex_batch_comparison
   :project: xsimd
   :content-only:

Reducers
--------

.. doxygengroup:: simd_complex_batch_reducers
   :project: xsimd
   :content-only:

Miscellaneous
-------------

.. doxygengroup:: simd_complex_batch_miscellaneous
   :project: xsimd
   :content-only:

Other operators
---------------

.. doxygenfunction:: xsimd::operator<<(std::ostream&, const simd_complex_batch&)
   :project: xsimd

xsimd-7.6.0/docs/source/api/xsimd_complex_bool_base.rst

.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay

   Distributed under the terms of the BSD 3-Clause License.

   The full license is in the file LICENSE, distributed with this software.

simd_complex_batch_bool
=======================

.. doxygenclass:: xsimd::simd_complex_batch_bool
   :project: xsimd
   :members:

xsimd-7.6.0/docs/source/basic_usage.rst

.. Copyright (c) 2016, Johan Mabille and Sylvain Corlay

   Distributed under the terms of the BSD 3-Clause License.

   The full license is in the file LICENSE, distributed with this software.

Basic usage
===========

Explicit use of an instruction set extension
--------------------------------------------

Here is an example that computes the mean of two sets of 4 double floating point values, assuming the AVX extension is supported:

.. code::

    #include <iostream>
    #include "xsimd/xsimd.hpp"

    namespace xs = xsimd;

    int main(int argc, char* argv[])
    {
        xs::batch<double, 4> a(1.5, 2.5, 3.5, 4.5);
        xs::batch<double, 4> b(2.5, 3.5, 4.5, 5.5);
        auto mean = (a + b) / 2;
        std::cout << mean << std::endl;
        return 0;
    }

This example outputs:

.. code::

    (2.0, 3.0, 4.0, 5.0)

Auto detection of the instruction set extension to be used
-----------------------------------------------------------

The same computation operating on vectors and using the most performant instruction set available:

.. code::

    #include <cstddef>
    #include <vector>
    #include "xsimd/xsimd.hpp"

    namespace xs = xsimd;
    using vector_type = std::vector<double, xsimd::aligned_allocator<double, XSIMD_DEFAULT_ALIGNMENT>>;

    void mean(const vector_type& a, const vector_type& b, vector_type& res)
    {
        std::size_t size = a.size();
        constexpr std::size_t simd_size = xsimd::simd_type<double>::size;
        std::size_t vec_size = size - size % simd_size;

        for(std::size_t i = 0; i < vec_size; i += simd_size)
        {
            auto ba = xs::load_aligned(&a[i]);
            auto bb = xs::load_aligned(&b[i]);
            auto bres = (ba + bb) / 2;
            bres.store_aligned(&res[i]);
        }
        for(std::size_t i = vec_size; i < size; ++i)
        {
            res[i] = (a[i] + b[i]) / 2;
        }
    }

xsimd-7.6.0/docs/source/cmake.svg (SVG logo asset; image content not reproduced)

xsimd-7.6.0/docs/source/conda.svg (SVG logo asset; image content not reproduced)

xsimd-7.6.0/docs/source/conf.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import subprocess

on_rtd = os.environ.get('READTHEDOCS', None) == 'True'

if on_rtd:
    subprocess.call('cd ..; doxygen', shell=True)

import sphinx_rtd_theme

html_theme = "sphinx_rtd_theme"
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]

def setup(app):
    app.add_stylesheet("main_stylesheet.css")

extensions = ['breathe']
breathe_projects = { 'xsimd': '../xml' }
templates_path = ['_templates']
html_static_path = ['_static']
source_suffix = '.rst'
master_doc = 'index'
project = 'xsimd'
copyright = '2016, Johan Mabille and Sylvain Corlay'
author = 'Johan Mabille and Sylvain Corlay'
html_logo = 'quantstack-white.svg'
exclude_patterns = []
highlight_language = 'c++'
pygments_style = 'sphinx'
todo_include_todos = False
htmlhelp_basename = 'xsimddoc'

xsimd-7.6.0/docs/source/index.rst

.. Copyright (c) 2016, Johan Mabille and Sylvain Corlay

   Distributed under the terms of the BSD 3-Clause License.

   The full license is in the file LICENSE, distributed with this software.

.. image:: xsimd.svg
   :alt: xsimd

C++ wrappers for SIMD intrinsics.

Introduction
------------

SIMD (Single Instruction, Multiple Data) is a feature of microprocessors that has been available for many years. SIMD instructions perform a single operation on a batch of values at once, and thus provide a way to significantly accelerate code execution.

However, these instructions differ between microprocessor vendors and compilers.

`xsimd` provides a unified means for using these features for library authors. Namely, it enables manipulation of batches of numbers with the same arithmetic operators as for single values. It also provides accelerated implementation of common mathematical functions operating on batches.

You can find out more about this implementation of C++ wrappers for SIMD intrinsics at `The C++ Scientist`_. The mathematical functions are a lightweight implementation of the algorithms also used in `boost.SIMD`_.

`xsimd` requires a C++11 compliant compiler.
The following C++ compilers are supported: +-------------------------+-------------------------------+ | Compiler | Version | +=========================+===============================+ | Microsoft Visual Studio | MSVC 2015 update 2 and above | +-------------------------+-------------------------------+ | g++ | 4.9 and above | +-------------------------+-------------------------------+ | clang | 3.7 and above | +-------------------------+-------------------------------+ The following SIMD instruction set extensions are supported: +--------------+----------------------------------------------------+ | Architecture | Instruction set extensions | +==============+====================================================+ | x86 | SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, FMA3, AVX2 | +--------------+----------------------------------------------------+ | x86 | AVX512 (gcc7 and higher) | +--------------+----------------------------------------------------+ | x86 AMD | same as above + SSE4A, FMA4, XOP | +--------------+----------------------------------------------------+ | ARM | ARMv7, ARMv8 | +--------------+----------------------------------------------------+ Licensing --------- We use a shared copyright model that enables all contributors to maintain the copyright on their contributions. This software is licensed under the BSD-3-Clause license. See the LICENSE file for details. .. toctree:: :caption: INSTALLATION :maxdepth: 2 installation .. toctree:: :caption: USAGE :maxdepth: 2 basic_usage vectorized_code .. toctree:: :caption: API REFERENCE :maxdepth: 2 api/instr_macros api/batch_index api/data_transfer api/batch_manip api/math_index api/aligned_allocator .. _The C++ Scientist: http://johanmabille.github.io/blog/archives/ .. _boost.SIMD: https://github.com/NumScale/boost.simd xsimd-7.6.0/docs/source/installation.rst000066400000000000000000000031531410101234500203240ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille and Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. raw:: html Installation ============ Although ``xsimd`` is a header-only library, we provide standardized means to install it, with package managers or with cmake. Besides the xsimd headers, all these methods place the ``cmake`` project configuration file in the right location so that third-party projects can use cmake's ``find_package`` to locate xsimd headers. .. image:: conda.svg Using the conda-forge package ----------------------------- A package for xsimd is available for the mamba (or conda) package manager. .. code:: mamba install -c conda-forge xsimd .. image:: spack.svg Using the Spack package ----------------------- A package for xsimd is available on the Spack package manager. .. code:: spack install xsimd spack load xsimd .. image:: cmake.svg From source with cmake ---------------------- You can also install ``xsimd`` from source with cmake. On Unix platforms, from the source directory: .. code:: mkdir build cd build cmake -DCMAKE_INSTALL_PREFIX=/path/to/prefix .. make install On Windows platforms, from the source directory: .. code:: mkdir build cd build cmake -G "NMake Makefiles" -DCMAKE_INSTALL_PREFIX=/path/to/prefix .. 
nmake nmake install xsimd-7.6.0/docs/source/quantstack-white.svg000066400000000000000000000116361410101234500211130ustar00rootroot00000000000000 image/svg+xmlxsimd-7.6.0/docs/source/spack.svg000066400000000000000000000046711410101234500167210ustar00rootroot00000000000000 xsimd-7.6.0/docs/source/vectorized_code.rst000066400000000000000000000147761410101234500210100ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille and Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Writing vectorized code ======================= Assume that we have a simple function that computes the mean of two vectors, something like: .. code:: #include #include void mean(const std:vector& a, const std::vector& b, std::vector& res) { std::size_t size = res.size(); for(std::size_t i = 0; i < size; ++i) { res[i] = (a[i] + b[i]) / 2; } } How can we used `xsimd` to take advantage of vectorization ? Explicit use of an instruction set ---------------------------------- `xsimd` provides the template class ``batch`` where ``N`` is the number of scalar values of type ``T`` involved in SIMD instructions. If you know which intruction set is available on your machine, you can directly use the corresponding specialization of ``batch``. For instance, assuming the AVX instruction set is available, the previous code can be vectorized the following way: .. code:: #include #include #include "xsimd/xsimd.hpp" void mean(const std::vector& a, const std::vector& b, std::vector& res) { using b_type = xsimd::batch; std::size_t inc = b_type::size; std::size_t size = res.size(); // size for which the vectorization is possible std::size_t vec_size = size - size % inc; for(std::size_t i = 0; i < vec_size; i +=inc) { b_type avec(&a[i]); b_type bvec(&b[i]); b_type rvec = (avec + bvec) / 2; rvec.store_unaligned(&res[i]); } // Remaining part that cannot be vectorize for(std::size_t i = vec_size; i < size; ++i) { res[i] = (a[i] + b[i]) / 2; } } However, if you want to write code that is portable, you cannot rely on the use of ``batch``. Indeed this won't compile on a CPU where only SSE2 instruction set is available for instance. To solve this, `xsimd` provides an auto-detection mechanism so you can use the most performant SIMD instruction set available on your hardware. Auto detecting the instruction set ---------------------------------- Using the auto detection mechanism does not require a lot of change: .. code:: #include #include #include "xsimd/xsimd.hpp" void mean(const std::vector& a, const std::vector& b, std::vector& res) { using b_type = xsimd::simd_type; std::size_t inc = b_type::size; std::size_t size = res.size(); // size for which the vectorization is possible std::size_t vec_size = size - size % inc; for(std::size_t i = 0; i < vec_size; i += inc) { b_type avec = xsimd::load_unaligned(&a[i]); b_type bvec = xsimd::load_unaligned(&b[i]); b_type rvec = (avec + bvec) / 2; xsimd::store_unaligned(&res[i], rvec); // or rvec.store_unaligned(&res[i]); } // Remaining part that cannot be vectorize for(std::size_t i = vec_size; i < size; ++i) { res[i] = (a[i] + b[i]) / 2; } } Aligned vs unaligned memory --------------------------- In the previous example, you may have noticed the ``load_unaligned/store_unaligned`` functions. These are meant for loading values from contiguous dynamically allocated memory into SIMD registers and reciprocally. 
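As a minimal sketch of both directions of such a transfer (the element type
``double``, the function name and the buffer arguments are illustrative;
``xsimd::simd_type<double>`` is the auto-detected batch type introduced above):

.. code::

    #include "xsimd/xsimd.hpp"

    void halve(const double* src, double* dst)
    {
        using b_type = xsimd::simd_type<double>;
        // src and dst are assumed to hold at least b_type::size elements;
        // the unaligned variants make no assumption about their addresses.
        b_type v = xsimd::load_unaligned(src);  // memory -> SIMD register
        b_type w = v * 0.5;                     // operate on the register
        w.store_unaligned(dst);                 // SIMD register -> memory
    }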
When dealing with memory transfer operations, some instructions sets required the memory to be aligned by a given amount, others can handle both aligned and unaligned modes. In that latter case, operating on aligned memory is always faster than operating on unaligned memory. `xsimd` provides an aligned memory allocator which follows the standard requirements, so it can be used with STL containers. Let's change the previous code so it can take advantage of this allocator: .. code:: #include #include #include "xsimd/xsimd.hpp" using vector_type = std::vector; void mean(const vector_type& a, const vector_type& b, vector_type& res) { using b_type = xsimd::simd_type; std::size_t inc = b_type::size; std::size_t size = res.size(); // size for which the vectorization is possible std::size_t vec_size = size - size % inc; for(std::size_t i = 0; i < vec_size; i += inc) { b_type avec = xsimd::load_aligned(&a[i]); b_type bvec = xsimd::load_aligned(&b[i]); b_type rvec = (avec + bvec) / 2; xsimd::store_unaligned(&res[i], rvec); // or rvec.store_unaligned(&res[i]); } // Remaining part that cannot be vectorize for(std::size_t i = vec_size; i < size; ++i) { res[i] = (a[i] + b[i]) / 2; } } Memory alignment and tag dispatching ------------------------------------ You may need to write code that can operate on any type of vectors or arrays, not only the STL ones. In that case, you cannot make assumption on the memory alignment of the container. `xsimd` provides a tag dispatching mechanism that allows you to easily write such a generic code: .. code:: #include #include #include "xsimd/xsimd.hpp" template void mean(const C& a, const C& b, C& res) { using b_type = xsimd::simd_type; std::size_t inc = b_type::size; std::size_t size = res.size(); // size for which the vectorization is possible std::size_t vec_size = size - size % inc; for(std::size_t i = 0; i < vec_size; i += inc) { b_type avec = xsimd::load_simd(&a[i], Tag()); b_type bvec = xsimd::load_simd(&b[i], Tag()); b_type rvec = (avec + bvec) / 2; xsimd::store_simd(&res[i], rvec, Tag()); } // Remaining part that cannot be vectorize for(std::size_t i = vec_size; i < size; ++i) { res[i] = (a[i] + b[i]) / 2; } } Here, the ``Tag`` template parameter can be ``xsimd::aligned_mode`` or ``xsimd::unaligned_mode``. Assuming the existence of a ``get_alignment_tag`` metafunction in the code, the previous code can be invoked this way: .. code:: mean>(a, b, res); xsimd-7.6.0/docs/source/xsimd.svg000066400000000000000000000055741410101234500167470ustar00rootroot00000000000000 xsimd-7.6.0/examples/000077500000000000000000000000001410101234500144555ustar00rootroot00000000000000xsimd-7.6.0/examples/CMakeLists.txt000066400000000000000000000030131410101234500172120ustar00rootroot00000000000000############################################################################ # Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and # # Martin Renou # # Copyright (c) QuantStack # # # # Distributed under the terms of the BSD 3-Clause License. # # # # The full license is in the file LICENSE, distributed with this software. 
# ############################################################################ cmake_minimum_required(VERSION 3.1) if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR) project(xsimd-examples) find_package(xsimd REQUIRED CONFIG) set(XSIMD_INCLUDE_DIR ${xsimd_INCLUDE_DIR}) endif () include_directories(${XSIMD_INCLUDE_DIR}) if(NOT CMAKE_BUILD_TYPE) message(STATUS "Setting examples build type to Release") set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE) else() message(STATUS "Tests build type is ${CMAKE_BUILD_TYPE}") endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -march=native -mtune=native -std=c++14") if(NOT CMAKE_CXX_COMPILER_ID MATCHES Clang) # We are using clang-cl set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") endif() add_executable(mandelbrot mandelbrot.cpp ${XSIMD_HEADERS}) add_custom_target(xmandelbrot COMMAND mandelbrot DEPENDS mandelbrot) xsimd-7.6.0/examples/mandelbrot.cpp000066400000000000000000000325071410101234500173170ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ // This file is derived from tsimd (MIT License) // https://github.com/ospray/tsimd/blob/master/benchmarks/mandelbrot.cpp // Author Jefferson Amstutz / intel #include #include #include #include #include "pico_bench.hpp" #define XSIMD_ENABLE_FALLBACK #include // helper function to write the rendered image as PPM file inline void writePPM(const std::string &fileName, const int sizeX, const int sizeY, const int *pixel) { FILE* file = fopen(fileName.c_str(), "wb"); fprintf(file, "P6\n%i %i\n255\n", sizeX, sizeY); unsigned char* out = (unsigned char*)alloca(3 * sizeX); for (int y = 0; y < sizeY; y++) { const unsigned char* in = (const unsigned char*) &pixel[(sizeY - 1 - y) * sizeX]; for (int x = 0; x < sizeX; x++) { out[3 * x + 0] = in[4 * x + 0]; out[3 * x + 1] = in[4 * x + 1]; out[3 * x + 2] = in[4 * x + 2]; } fwrite(out, 3 * sizeX, sizeof(char), file); } fprintf(file, "\n"); fclose(file); } namespace xsimd { template inline batch mandel(const batch_bool &_active, const batch &c_re, const batch &c_im, int maxIters) { batch z_re = c_re; batch z_im = c_im; batch vi(0); for (int i = 0; i < maxIters; ++i) { auto active = _active & ((z_re * z_re + z_im * z_im) <= batch(4.f)); if (!xsimd::any(active)) { break; } batch new_re = z_re * z_re - z_im * z_im; batch new_im = 2.f * z_re * z_im; z_re = c_re + new_re; z_im = c_im + new_im; vi = select(bool_cast(active), vi + 1, vi); } return vi; } template void mandelbrot(float x0, float y0, float x1, float y1, int width, int height, int maxIters, int output[]) { float dx = (x1 - x0) / width; float dy = (y1 - y0) / height; float arange[N]; std::iota(&arange[0], &arange[N], 0.f); batch programIndex(&arange[0], xsimd::aligned_mode()); // std::iota(programIndex.begin(), programIndex.end(), 0.f); for (int j = 0; j < height; j++) { for (int i = 0; i < width; i += N) { batch x(x0 + (i + programIndex) * dx); batch y(y0 + j * dy); auto active = x < batch(width); int base_index = (j * width + i); auto result = mandel(active, x, y, maxIters); // implement masked store! 
// xsimd::store_aligned(result, output + base_index, active); batch prev_data(output + base_index); select(bool_cast(active), result, prev_data) .store_aligned(output + base_index); } } } } // namespace xsimd // omp version //////////////////////////////////////////////////////////////// namespace omp { #pragma omp declare simd template inline int mandel(T c_re, T c_im, int count) { T z_re = c_re, z_im = c_im; int i; for (i = 0; i < count; ++i) { if (z_re * z_re + z_im * z_im > 4.f) { break; } T new_re = z_re * z_re - z_im * z_im; T new_im = 2.f * z_re * z_im; z_re = c_re + new_re; z_im = c_im + new_im; } return i; } void mandelbrot(float x0, float y0, float x1, float y1, int width, int height, int maxIterations, int output[]) { float dx = (x1 - x0) / width; float dy = (y1 - y0) / height; for (int j = 0; j < height; j++) { #pragma omp simd for (int i = 0; i < width; ++i) { float x = x0 + i * dx; float y = y0 + j * dy; int index = (j * width + i); output[index] = mandel(x, y, maxIterations); } } } } // namespace omp // scalar version ///////////////////////////////////////////////////////////// namespace scalar { inline int mandel(float c_re, float c_im, int count) { float z_re = c_re, z_im = c_im; int i; for (i = 0; i < count; ++i) { if (z_re * z_re + z_im * z_im > 4.f) { break; } float new_re = z_re * z_re - z_im * z_im; float new_im = 2.f * z_re * z_im; z_re = c_re + new_re; z_im = c_im + new_im; } return i; } void mandelbrot(float x0, float y0, float x1, float y1, int width, int height, int maxIterations, int output[]) { float dx = (x1 - x0) / width; float dy = (y1 - y0) / height; for (int j = 0; j < height; j++) { for (int i = 0; i < width; ++i) { float x = x0 + i * dx; float y = y0 + j * dy; int index = (j * width + i); output[index] = mandel(x, y, maxIterations); } } } } // namespace scalar int main() { using namespace std::chrono; const unsigned int width = 1024; const unsigned int height = 768; const float x0 = -2; const float x1 = 1; const float y0 = -1; const float y1 = 1; const int maxIters = 256; std::vector> buf(width * height); auto bencher = pico_bench::Benchmarker{64, seconds{10}}; std::cout << "starting benchmarks (results in 'ms')... 
" << '\n'; // scalar run /////////////////////////////////////////////////////////////// std::fill(buf.begin(), buf.end(), 0); auto stats_scalar = bencher([&]() { scalar::mandelbrot(x0, y0, x1, y1, width, height, maxIters, buf.data()); }); const float scalar_min = stats_scalar.min().count(); std::cout << '\n' << "scalar " << stats_scalar << '\n'; writePPM("mandelbrot_scalar.ppm", width, height, buf.data()); // omp run ////////////////////////////////////////////////////////////////// std::fill(buf.begin(), buf.end(), 0); auto stats_omp = bencher([&]() { omp::mandelbrot(x0, y0, x1, y1, width, height, maxIters, buf.data()); }); const float omp_min = stats_omp.min().count(); std::cout << '\n' << "omp " << stats_omp << '\n'; writePPM("mandelbrot_omp.ppm", width, height, buf.data()); // xsimd_1 run ////////////////////////////////////////////////////////////// std::fill(buf.begin(), buf.end(), 0); auto stats_1 = bencher([&]() { xsimd::mandelbrot<1>(x0, y0, x1, y1, width, height, maxIters, buf.data()); }); const float xsimd1_min = stats_1.min().count(); std::cout << '\n' << "xsimd_1 " << stats_1 << '\n'; writePPM("mandelbrot_xsimd1.ppm", width, height, buf.data()); // xsimd_4 run ////////////////////////////////////////////////////////////// std::fill(buf.begin(), buf.end(), 0); auto stats_4 = bencher([&]() { xsimd::mandelbrot<4>(x0, y0, x1, y1, width, height, maxIters, buf.data()); }); const float xsimd4_min = stats_4.min().count(); std::cout << '\n' << "xsimd_4 " << stats_4 << '\n'; writePPM("mandelbrot_xsimd4.ppm", width, height, buf.data()); // xsimd_8 run ////////////////////////////////////////////////////////////// std::fill(buf.begin(), buf.end(), 0); auto stats_8 = bencher([&]() { xsimd::mandelbrot<8>(x0, y0, x1, y1, width, height, maxIters, buf.data()); }); const float xsimd8_min = stats_8.min().count(); std::cout << '\n' << "xsimd_8 " << stats_8 << '\n'; writePPM("mandelbrot_xsimd8.ppm", width, height, buf.data()); // xsimd_16 run ///////////////////////////////////////////////////////////// std::fill(buf.begin(), buf.end(), 0); auto stats_16 = bencher([&]() { xsimd::mandelbrot<16>(x0, y0, x1, y1, width, height, maxIters, buf.data()); }); const float xsimd16_min = stats_16.min().count(); std::cout << '\n' << "xsimd_16 " << stats_16 << '\n'; writePPM("mandelbrot_xsimd16.ppm", width, height, buf.data()); // conclusions ////////////////////////////////////////////////////////////// std::cout << '\n' << "Conclusions: " << '\n'; // scalar // std::cout << '\n' << "--> scalar was " << omp_min / scalar_min << "x the speed of omp"; std::cout << '\n' << "--> scalar was " << xsimd1_min / scalar_min << "x the speed of xsimd_1"; std::cout << '\n' << "--> scalar was " << xsimd4_min / scalar_min << "x the speed of xsimd_4"; std::cout << '\n' << "--> scalar was " << xsimd8_min / scalar_min << "x the speed of xsimd_8"; std::cout << '\n' << "--> scalar was " << xsimd16_min / scalar_min << "x the speed of xsimd_16" << '\n'; // omp // std::cout << '\n' << "--> omp was " << scalar_min / omp_min << "x the speed of scalar"; std::cout << '\n' << "--> omp was " << xsimd1_min / omp_min << "x the speed of xsimd_1"; std::cout << '\n' << "--> omp was " << xsimd4_min / omp_min << "x the speed of xsimd_4"; std::cout << '\n' << "--> omp was " << xsimd8_min / omp_min << "x the speed of xsimd_8"; std::cout << '\n' << "--> omp was " << xsimd16_min / omp_min << "x the speed of xsimd_16" << '\n'; // xsimd1 // std::cout << '\n' << "--> xsimd1 was " << scalar_min / xsimd1_min << "x the speed of scalar"; std::cout << '\n' 
<< "--> xsimd1 was " << omp_min / xsimd1_min << "x the speed of omp"; std::cout << '\n' << "--> xsimd1 was " << xsimd4_min / xsimd1_min << "x the speed of xsimd_4"; std::cout << '\n' << "--> xsimd1 was " << xsimd8_min / xsimd1_min << "x the speed of xsimd_8"; std::cout << '\n' << "--> xsimd1 was " << xsimd16_min / xsimd1_min << "x the speed of xsimd_16" << '\n'; // xsimd4 // std::cout << '\n' << "--> xsimd4 was " << scalar_min / xsimd4_min << "x the speed of scalar"; std::cout << '\n' << "--> xsimd4 was " << omp_min / xsimd4_min << "x the speed of omp"; std::cout << '\n' << "--> xsimd4 was " << xsimd1_min / xsimd4_min << "x the speed of xsimd_1"; std::cout << '\n' << "--> xsimd4 was " << xsimd8_min / xsimd4_min << "x the speed of xsimd_8"; std::cout << '\n' << "--> xsimd4 was " << xsimd16_min / xsimd4_min << "x the speed of xsimd_16" << '\n'; // xsimd8 // std::cout << '\n' << "--> xsimd8 was " << scalar_min / xsimd8_min << "x the speed of scalar"; std::cout << '\n' << "--> xsimd8 was " << omp_min / xsimd8_min << "x the speed of omp"; std::cout << '\n' << "--> xsimd8 was " << xsimd1_min / xsimd8_min << "x the speed of xsimd_1"; std::cout << '\n' << "--> xsimd8 was " << xsimd4_min / xsimd8_min << "x the speed of xsimd_4"; std::cout << '\n' << "--> xsimd8 was " << xsimd16_min / xsimd8_min << "x the speed of xsimd_16" << '\n'; // xsimd16 // std::cout << '\n' << "--> xsimd16 was " << scalar_min / xsimd16_min << "x the speed of scalar"; std::cout << '\n' << "--> xsimd16 was " << omp_min / xsimd16_min << "x the speed of omp"; std::cout << '\n' << "--> xsimd16 was " << xsimd1_min / xsimd16_min << "x the speed of xsimd_1"; std::cout << '\n' << "--> xsimd16 was " << xsimd4_min / xsimd16_min << "x the speed of xsimd_4"; std::cout << '\n' << "--> xsimd16 was " << xsimd8_min / xsimd16_min << "x the speed of xsimd_8" << '\n'; std::cout << '\n' << "wrote output images to 'mandelbrot_[type].ppm'" << '\n'; return 0; } xsimd-7.6.0/examples/pico_bench.hpp000066400000000000000000000175611410101234500172710ustar00rootroot00000000000000/**************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ // This file is derived from tsimd (MIT License) // https://github.com/ospray/tsimd/blob/master/benchmarks/pico_bench.h // Author Jefferson Amstutz / intel #ifndef PICO_BENCH_H #define PICO_BENCH_H #include #include #include #include #include #include #include #include #include #include namespace pico_bench { /* Statistics on some time measurement value T, e.g. 
T = * std::chrono::milliseconds T must be some std::chrono::duration type */ template class Statistics { using rep = typename T::rep; std::vector samples; public: std::string time_suffix; Statistics(std::vector s) : samples(s) { std::sort(std::begin(samples), std::end(samples)); } T percentile(const float p) const { return percentile(p, samples); } // Winsorize the data, sets all entries above 100 - limit percentile and // below limit percentile to the value of that percentile void winsorize(const float limit) { winsorize(limit, samples); } T median() const { return percentile(50.0, samples); } T median_abs_dev() const { const auto m = median(); std::vector deviations; deviations.reserve(samples.size()); std::transform(std::begin(samples), std::end(samples), std::back_inserter(deviations), [&m](const T& t) { return T{ std::abs((t - m).count()) }; }); std::sort(std::begin(deviations), std::end(deviations)); return percentile(50.0, deviations); } T mean() const { const auto m = std::accumulate(std::begin(samples), std::end(samples), T{ 0 }); return m / samples.size(); } T std_dev() const { const auto m = mean(); auto val = std::accumulate( std::begin(samples), std::end(samples), T{ 0 }, [&m](const T& p, const T& t) { return T{ static_cast(p.count() + std::pow((t - m).count(), 2)) }; }); return T{ static_cast(std::sqrt(1.0 / static_cast(samples.size()) * static_cast(val.count()))) }; } T min() const { return samples.front(); } T max() const { return samples.back(); } std::size_t size() const { return samples.size(); } const T& operator[](size_t i) const { return samples[i]; } private: // Winsorize the data, sets all entries above 100 - limit percentile and // below limit percentile to the value of that percentile static void winsorize(const float limit, std::vector& samples) { const auto low = percentile(limit, samples); const auto high = percentile(100.0 - limit, samples); for (auto& t : samples) { if (t < low) { t = low; } else if (t > high) { t = high; } } } static T percentile(const float p, const std::vector& samples) { assert(!samples.empty()); assert(p <= 100.0); assert(p >= 0.0); if (samples.size() == 1) { return samples.front(); } if (p == 100.0) { return samples.back(); } const double rank = p / 100.0 * (static_cast(samples.size()) - 1.0); const double low_r = std::floor(rank); const double dist = rank - low_r; const size_t k = static_cast(low_r); const auto low = samples[k]; const auto high = samples[k + 1]; return T{ static_cast(low.count() + (high - low).count() * dist) }; } }; /* Benchmarking measurment using some desired unit of time measurement, * e.g. T = std::chrono::milliseconds. 
T must be some std::chrono::duration */ template class Benchmarker { const size_t MAX_ITER; const T MAX_RUNTIME; template struct BenchWrapper { Fn fn; BenchWrapper(Fn fn) : fn(fn) { } T operator()() { auto start = std::chrono::high_resolution_clock::now(); fn(); auto end = std::chrono::high_resolution_clock::now(); return std::chrono::duration_cast(end - start); } }; public: using stats_type = Statistics; // Benchmark the functions either max_iter times or until max_runtime // seconds have elapsed max_runtime should be > 0 Benchmarker(const size_t max_iter, const std::chrono::seconds max_runtime) : MAX_ITER(max_iter) , MAX_RUNTIME(std::chrono::duration_cast(max_runtime)) { } // Create a benchmarker that will run the function for the desired number of // iterations, regardless of how long it takes Benchmarker(const size_t max_iter) : MAX_ITER(max_iter) , MAX_RUNTIME(0) { } template typename std::enable_if()())>::value, stats_type>::type operator()(Fn fn) const { return (*this)(BenchWrapper{ fn }); } template typename std::enable_if()()), T>::value, stats_type>::type operator()(Fn fn) const { // Do a single un-timed warm up run fn(); T elapsed{ 0 }; std::vector samples; for (size_t i = 0; i < MAX_ITER && (MAX_RUNTIME.count() == 0 || elapsed < MAX_RUNTIME); ++i, elapsed += samples.back()) { samples.push_back(fn()); } return stats_type{ samples }; } }; } // namespace pico_bench template std::ostream& operator<<(std::ostream& os, const pico_bench::Statistics& stats) { os << "Statistics:\n" << "\tmax: " << stats.max().count() << stats.time_suffix << "\n" << "\tmin: " << stats.min().count() << stats.time_suffix << "\n" << "\tmedian: " << stats.median().count() << stats.time_suffix << "\n" << "\tmedian abs dev: " << stats.median_abs_dev().count() << stats.time_suffix << "\n" << "\tmean: " << stats.mean().count() << stats.time_suffix << "\n" << "\tstd dev: " << stats.std_dev().count() << stats.time_suffix << "\n" << "\t# of samples: " << stats.size(); return os; } #endif xsimd-7.6.0/include/000077500000000000000000000000001410101234500142625ustar00rootroot00000000000000xsimd-7.6.0/include/xsimd/000077500000000000000000000000001410101234500154065ustar00rootroot00000000000000xsimd-7.6.0/include/xsimd/config/000077500000000000000000000000001410101234500166535ustar00rootroot00000000000000xsimd-7.6.0/include/xsimd/config/xsimd_align.hpp000066400000000000000000000042531410101234500216660ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_ALIGN_HPP #define XSIMD_ALIGN_HPP #include "xsimd_instruction_set.hpp" /************************************************ * Platform checks for aligned malloc functions * ************************************************/ #if ((defined __QNXNTO__) || (defined _GNU_SOURCE) || ((defined _XOPEN_SOURCE) && (_XOPEN_SOURCE >= 600))) \ && (defined _POSIX_ADVISORY_INFO) && (_POSIX_ADVISORY_INFO > 0) #define XSIMD_HAS_POSIX_MEMALIGN 1 #else #define XSIMD_HAS_POSIX_MEMALIGN 0 #endif #if defined(XSIMD_X86_INSTR_SET_AVAILABLE) #define XSIMD_HAS_MM_MALLOC 1 #else #define XSIMD_HAS_MM_MALLOC 0 #endif /******************** * Stack allocation * ********************/ #ifndef XSIMD_ALLOCA #if defined(__linux__) #define XSIMD_ALLOCA alloca #elif defined(_MSC_VER) #define XSIMD_ALLOCA _alloca #endif #endif /********************* * Default alignment * *********************/ #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX512_VERSION #define XSIMD_DEFAULT_ALIGNMENT 64 #elif XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION #define XSIMD_DEFAULT_ALIGNMENT 32 #elif XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION #define XSIMD_DEFAULT_ALIGNMENT 16 #elif XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION #define XSIMD_DEFAULT_ALIGNMENT 32 #elif XSIMD_ARM_INSTR_SET >= XSIMD_ARM7_NEON_VERSION #define XSIMD_DEFAULT_ALIGNMENT 16 #else // Set the default to the requirements of posix_memalign #define XSIMD_DEFAULT_ALIGNMENT sizeof(void*) #endif #endif xsimd-7.6.0/include/xsimd/config/xsimd_config.hpp000066400000000000000000000024651410101234500220440ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_CONFIG_HPP #define XSIMD_CONFIG_HPP #include "xsimd_align.hpp" #define XSIMD_VERSION_MAJOR 7 #define XSIMD_VERSION_MINOR 6 #define XSIMD_VERSION_PATCH 0 #ifndef XSIMD_DEFAULT_ALLOCATOR #if XSIMD_X86_INSTR_SET_AVAILABLE #define XSIMD_DEFAULT_ALLOCATOR(T) xsimd::aligned_allocator #else #define XSIMD_DEFAULT_ALLOCATOR(T) std::allocator #endif #endif #ifndef XSIMD_STACK_ALLOCATION_LIMIT #define XSIMD_STACK_ALLOCATION_LIMIT 20000 #endif #if defined(__LP64__) || defined(_WIN64) #define XSIMD_64_BIT_ABI #else #define XSIMD_32_BIT_ABI #endif #endif xsimd-7.6.0/include/xsimd/config/xsimd_include.hpp000066400000000000000000000051141410101234500222140ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_INCLUDE_HPP #define XSIMD_INCLUDE_HPP #include "xsimd_instruction_set.hpp" // X86 intruction sets #if XSIMD_X86_INSTR_SET >= XSIMD_X86_FMA3_VERSION // FMA3 and later #ifdef __GNUC__ #include // x86intrin.h includes header files for whatever instruction // sets are specified on the compiler command line, such as: // xopintrin.h, fma4intrin.h #else #include // MS version of immintrin.h covers AVX, AVX2 and FMA3 #endif // __GNUC__ #elif XSIMD_X86_INSTR_SET == XSIMD_X86_AVX_VERSION #include // AVX #elif XSIMD_X86_INSTR_SET == XSIMD_X86_SSE4_2_VERSION #include // SSE4.2 #elif XSIMD_X86_INSTR_SET == XSIMD_X86_SSE4_1_VERSION #include // SSE4.1 #elif XSIMD_X86_INSTR_SET == XSIMD_X86_SSSE3_VERSION #include // SSSE3 #elif XSIMD_X86_INSTR_SET == XSIMD_X86_SSE3_VERSION #include // SSE3 #elif XSIMD_X86_INSTR_SET == XSIMD_X86_SSE2_VERSION #include // SSE2 #elif XSIMD_X86_INSTR_SET == XSIMD_X86_SSE_VERSION #include // SSE #endif // XSIMD_X86_INSTR_SET // AMD instruction sets #if XSIMD_X86_AMD_INSTR_SET >= XSIMD_X86_AMD_FMA4_VERSION #ifdef _MSC_VER #include #else #include #if XSIMD_X86_AMD_INSTR_SET >= XSIMD_X86_AMD_XOP_VERSION #include #else #include #endif #endif // _MSC_VER #elif XSIMD_X86_AMD_INSTR_SET == XSIMD_X86_AMD_SSE4A_VERSION #include #endif // XSIMD_X86_AMD_INSTR_SET #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM7_NEON_VERSION #include #endif // TODO: add ALTIVEC instruction set #endif xsimd-7.6.0/include/xsimd/config/xsimd_instruction_set.hpp000066400000000000000000000241121410101234500240240ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ // Simplified version of boost.predef #ifndef XSIMD_INSTRUCTION_SET_HPP #define XSIMD_INSTRUCTION_SET_HPP /****************** * VERSION NUMBER * ******************/ // major number can be in [0, 99] // minor number can be in [0, 99] // patch number can be in [0, 999999] #define XSIMD_VERSION_NUMBER(major, minor, patch) \ ((((major) % 100) * 10000000) + (((minor) % 100) * 100000) + ((patch) % 100000)) #define XSIMD_VERSION_NUMBER_NOT_AVAILABLE \ XSIMD_VERSION_NUMBER(0, 0, 0) #define XSIMD_VERSION_NUMBER_AVAILABLE \ XSIMD_VERSION_NUMBER(0, 0, 1) /************************* * CLEAR INSTRUCTION SET * *************************/ #undef XSIMD_X86_INSTR_SET #undef XSIMD_X86_INSTR_SET_AVAILABLE #undef XSIMD_X86_AMD_INSTR_SET #undef XSIMD_X86_AMD_INSTR_SET_AVAILABLE #undef XSIMD_PPC_INSTR_SET #undef XSIMD_PPC_INSTR_SET_AVAILABLE #undef XSIMD_ARM_INSTR_SET #undef XSIMD_ARM_INSTR_SET_AVAILABLE /********************** * USER CONFIGURATION * **********************/ #ifdef XSIMD_FORCE_X86_INSTR_SET #define XSIMD_X86_INSTR_SET XSIMD_FORCE_X86_INSTR_SET #define XSIMD_X86_INSTR_SET_AVAILABLE XSIMD_VERSION_NUMBER_AVAILABLE #ifndef XSIMD_SKIP_ON_WERROR #ifdef _MSC_VER #pragma message("Warning: Forcing X86 instruction set") #else #warning "Forcing X86 instruction set" #endif #endif #elif defined(XSIMD_FORCE_X86_AMD_INSTR_SET) #define XSIMD_X86_AMD_INSTR_SET XSIMD_FORCE_X86_AMD_INSTR_SET #define XSIMD_X86_AMD_INSTR_SET_AVAILABLE XSIMD_VERSION_NUMBER_AVAILABLE #ifndef XSIMD_SKIP_ON_WERROR #ifdef _MSC_VER #pragma message("Warning: Forcing X86 AMD instruction set") #else #warning "Forcing X86 AMD instruction set" #endif #endif #elif defined(XSIMD_FORCE_PPC_INSTR_SET) #define XSIMD_PPC_INSTR_SET XSIMD_FORCE_PPC_INSTR_SET #define XSIMD_PPC_INSTR_SET_AVAILABLE XSIMD_VERSION_NUMBER_AVAILABLE #ifndef XSIMD_SKIP_ON_WERROR #ifdef _MSC_VER #pragma message("Warning: Forcing PPC instruction set") #else #warning "Forcing PPC instruction set" #endif #endif #elif defined(XSIMD_FORCE_ARM_INSTR_SET) #define XSIMD_ARM_INSTR_SET XSIMD_FORCE_ARM_INSTR_SET #define XSIMD_ARM_INSTR_SET_AVAILABLE XSIMD_VERSION_NUMBER_AVAILABLE #ifndef XSIMD_SKIP_ON_WERROR #ifdef _MSC_VER #pragma message("Warning: Forcing ARM instruction set") #else #warning "Forcing ARM instruction set" #endif #endif #endif /*********************** * X86 INSTRUCTION SET * ***********************/ #define XSIMD_X86_SSE_VERSION XSIMD_VERSION_NUMBER(1, 0, 0) #define XSIMD_X86_SSE2_VERSION XSIMD_VERSION_NUMBER(2, 0, 0) #define XSIMD_X86_SSE3_VERSION XSIMD_VERSION_NUMBER(3, 0, 0) #define XSIMD_X86_SSSE3_VERSION XSIMD_VERSION_NUMBER(3, 1, 0) #define XSIMD_X86_SSE4_1_VERSION XSIMD_VERSION_NUMBER(4, 1, 0) #define XSIMD_X86_SSE4_2_VERSION XSIMD_VERSION_NUMBER(4, 2, 0) #define XSIMD_X86_AVX_VERSION XSIMD_VERSION_NUMBER(5, 0, 0) #define XSIMD_X86_FMA3_VERSION XSIMD_VERSION_NUMBER(5, 2, 0) #define XSIMD_X86_AVX2_VERSION XSIMD_VERSION_NUMBER(5, 3, 0) #define XSIMD_X86_AVX512_VERSION XSIMD_VERSION_NUMBER(6, 0, 0) #define XSIMD_X86_MIC_VERSION XSIMD_VERSION_NUMBER(9, 0, 0) #if !defined(XSIMD_X86_INSTR_SET) && defined(__MIC__) #define XSIMD_X86_INSTR_SET XSIMD_X86_MIC_VERSION #endif // AVX512 instructions are supported starting with gcc 6 // see https://www.gnu.org/software/gcc/gcc-6/changes.html #if !defined(XSIMD_X86_INSTR_SET) && (defined(__AVX512__) || defined(__KNCNI__) || defined(__AVX512F__)\ && (defined(__clang__) || (!defined(__GNUC__) || __GNUC__ >= 6))) #define XSIMD_X86_INSTR_SET 
XSIMD_X86_AVX512_VERSION #if defined(__AVX512VL__) #define XSIMD_AVX512VL_AVAILABLE 1 #endif #if defined(__AVX512DQ__) #define XSIMD_AVX512DQ_AVAILABLE 1 #endif #if defined(__AVX512BW__) #define XSIMD_AVX512BW_AVAILABLE 1 #endif #if __GNUC__ == 6 #define XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY 1 #endif #endif #if !defined(XSIMD_X86_INSTR_SET) && defined(__AVX2__) #define XSIMD_X86_INSTR_SET XSIMD_X86_AVX2_VERSION #endif #if !defined(XSIMD_X86_INSTR_SET) && defined(__FMA__) #define XSIMD_X86_INSTR_SET XSIMD_X86_FMA3_VERSION #endif #if !defined(XSIMD_X86_INSTR_SET) && defined(__AVX__) #define XSIMD_X86_INSTR_SET XSIMD_X86_AVX_VERSION #endif #if !defined(XSIMD_X86_INSTR_SET) && defined(__SSE4_2__) #define XSIMD_X86_INSTR_SET XSIMD_X86_SSE4_2_VERSION #endif #if !defined(XSIMD_X86_INSTR_SET) && defined(__SSE4_1__) #define XSIMD_X86_INSTR_SET XSIMD_X86_SSE4_1_VERSION #endif #if !defined(XSIMD_X86_INSTR_SET) && defined(__SSSE3__) #define XSIMD_X86_INSTR_SET XSIMD_X86_SSSE3_VERSION #endif #if !defined(XSIMD_X86_INSTR_SET) && defined(__SSE3__) #define XSIMD_X86_INSTR_SET XSIMD_X86_SSE3_VERSION #endif #if !defined(XSIMD_X86_INSTR_SET) && (defined(__SSE2__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)) #define XSIMD_X86_INSTR_SET XSIMD_X86_SSE2_VERSION #endif #if !defined(XSIMD_X86_INSTR_SET) && (defined(__SSE__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)) #define XSIMD_X86_INSTR_SET XSIMD_X86_SSE_VERSION #endif #if !(defined XSIMD_X86_INSTR_SET) #define XSIMD_X86_INSTR_SET XSIMD_VERSION_NUMBER_NOT_AVAILABLE #else #define XSIMD_X86_INSTR_SET_AVAILABLE XSIMD_VERSION_NUMBER_AVAILABLE #endif /*************************** * X86_AMD INSTRUCTION SET * ***************************/ #define XSIMD_X86_AMD_SSE4A_VERSION XSIMD_VERSION_NUMBER(4, 0, 0) #define XSIMD_X86_AMD_FMA4_VERSION XSIMD_VERSION_NUMBER(5, 1, 0) #define XSIMD_X86_AMD_XOP_VERSION XSIMD_VERSION_NUMBER(5, 1, 1) #if !defined(XSIMD_X86_AMD_INSTR_SET) && defined(__XOP__) #define XSIMD_X86_AMD_INSTR_SET XSIMD_X86_AMD_XOP_VERSION #endif #if !defined(XSIMD_X86_AMD_INSTR_SET) && defined(__FMA4__) #define XSIMD_X86_AMD_INSTR_SET XSIMD_X86_AMD_FMA4_VERSION #endif #if !defined(XSIMD_X86_AMD_INSTR_SET) && defined(__SSE4A__) #define XSIMD_X86_AMD_INSTR_SET XSIMD_X86_AMD_SSE4A_VERSION #endif #if !defined(XSIMD_X86_AMD_INSTR_SET) #define XSIMD_X86_AMD_INSTR_SET XSIMD_VERSION_NUMBER_NOT_AVAILABLE #else // X86_AMD implies X86 #if XSIMD_X86_INSTR_SET > XSIMD_X86_AMD_INSTR_SET #undef XSIMD_X86_AMD_INSTR_SET #define XSIMD_X86_AMD_INSTR_SET XSIMD_X86_INSTR_SET #endif #define XSIMD_X86_AMD_INSTR_SET_AVAILABLE XSIMD_VERSION_NUMBER_AVAILABLE #endif /*********************** * PPC INSTRUCTION SET * ***********************/ // We haven't implemented any support for PPC, so we should // not enable detection for this instructoin set /*#define XSIMD_PPC_VMX_VERSION XSIMD_VERSION_NUMBER(1, 0, 0) #define XSIMD_PPC_VSX_VERSION XSIMD_VERSION_NUMBER(1, 1, 0) #define XSIMD_PPC_QPX_VERSION XSIMD_VERSION_NUMBER(2, 0, 0) #if !defined(XSIMD_PPC_INSTR_SET) && defined(__VECTOR4DOUBLE__) #define XSIMD_PPC_INSTR_SET XSIMD_PPC_QPX_VERSION #endif #if !defined(XSIMD_PPC_INSTR_SET) && defined(__VSX__) #define XSIMD_PPC_INSTR_SET XSIMD_PPC_VSX_VERSION #endif #if !defined(XSIMD_PPC_INSTR_SET) && (defined(__ALTIVEC__) || defined(__VEC__)) #define XSIMD_PPC_INSTR_SET XSIMD_PPC_VMX_VERSION #endif #if !defined(XSIMD_PPC_INSTR_SET) #define XSIMD_PPC_INSTR_SET XSIMD_VERSION_NUMBER_NOT_AVAILABLE #else #define XSIMD_PPC_INSTR_SET_AVAILABLE 
XSIMD_VERSION_NUMBER_AVAILABLE #endif*/ /*********************** * ARM INSTRUCTION SET * ***********************/ #define XSIMD_ARM7_NEON_VERSION XSIMD_VERSION_NUMBER(7, 0, 0) #define XSIMD_ARM8_32_NEON_VERSION XSIMD_VERSION_NUMBER(8, 0, 0) #define XSIMD_ARM8_64_NEON_VERSION XSIMD_VERSION_NUMBER(8, 1, 0) // TODO __ARM_FEATURE_FMA #if !defined(XSIMD_ARM_INSTR_SET) && (defined(__ARM_NEON)) #if __ARM_ARCH >= 8 #if defined(__aarch64__) #define XSIMD_ARM_INSTR_SET XSIMD_ARM8_64_NEON_VERSION #else #define XSIMD_ARM_INSTR_SET XSIMD_ARM8_32_NEON_VERSION #endif #elif __ARM_ARCH >= 7 #define XSIMD_ARM_INSTR_SET XSIMD_ARM7_NEON_VERSION #elif defined(XSIMD_ENABLE_FALLBACK) #warning "NEON instruction set not supported, using fallback mode." #else static_assert(false, "NEON instruction set not supported."); #endif #endif #if !defined(XSIMD_ARM_INSTR_SET) #define XSIMD_ARM_INSTR_SET XSIMD_VERSION_NUMBER_NOT_AVAILABLE #else #define XSIMD_ARM_INSTR_SET_AVAILABLE XSIMD_VERSION_NUMBER_AVAILABLE #endif /*************************** * GENERIC INSTRUCTION SET * ***************************/ #undef XSIMD_INSTR_SET #undef XSIMD_INSTR_SET_AVAILABLE #if defined(XSIMD_X86_AMD_AVAILABLE) #if XSIMD_X86_INSTR_SET > XSIMD_X86_AMD_INSTR_SET #define XSIMD_INSTR_SET XSIMD_X86_INSTR_SET #else #define XSIMD_INSTR_SET XSIMD_X86_AMD_INSTR_SET #endif #endif #if !defined(XSIMD_INSTR_SET) && defined(XSIMD_X86_INSTR_SET_AVAILABLE) #define XSIMD_INSTR_SET XSIMD_X86_INSTR_SET #endif #if !defined(XSIMD_INSTR_SET) && defined(XSIMD_PPC_INSTR_SET_AVAILABLE) #define XSIMD_INSTR_SET XSIMD_PPC_INSTR_SET #endif #if !defined(XSIMD_INSTR_SET) && defined(XSIMD_ARM_INSTR_SET_AVAILABLE) #define XSIMD_INSTR_SET XSIMD_ARM_INSTR_SET #endif #if !defined(XSIMD_INSTR_SET) #define XSIMD_INSTR_SET XSIMD_VERSION_NUMBER_NOT_AVAILABLE #elif XSIMD_INSTR_SET != XSIMD_VERSION_NUMBER_NOT_AVAILABLE #define XSIMD_INSTR_SET_AVAILABLE XSIMD_VERSION_NUMBER_AVAILABLE #endif #endif xsimd-7.6.0/include/xsimd/math/000077500000000000000000000000001410101234500163375ustar00rootroot00000000000000xsimd-7.6.0/include/xsimd/math/xsimd_basic_math.hpp000066400000000000000000000325431410101234500223550ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_BASIC_MATH_HPP #define XSIMD_BASIC_MATH_HPP #include "xsimd_numerical_constant.hpp" #include "xsimd_rounding.hpp" #include "../types/xsimd_traits.hpp" namespace xsimd { /******************** * Basic operations * ********************/ template batch_type_t fmod(const simd_base& x, const simd_base& y); template batch_type_t remainder(const simd_base& x, const simd_base& y); template batch_type_t fdim(const simd_base& x, const simd_base& y); template batch_type_t clip(const simd_base& x, const simd_base& lo, const simd_base& hi); template batch_type_t nextafter(const simd_base& from, const simd_base& to); /**************************** * Classification functions * ****************************/ template typename simd_batch_traits::batch_bool_type isfinite(const simd_base& x); template typename simd_batch_traits::batch_bool_type isinf(const simd_base& x); template typename simd_batch_traits::batch_bool_type is_flint(const simd_base& x); template typename simd_batch_traits::batch_bool_type is_odd(const simd_base& x); template typename simd_batch_traits::batch_bool_type is_even(const simd_base& x); /*********************************** * Basic operations implementation * ***********************************/ namespace detail { template struct get_batch_value_type { using type = T; }; template struct get_batch_value_type> { using type = T; }; template using get_batch_value_type_t = typename get_batch_value_type::type; template >::value> struct remainder_kernel { using batch_type = B; using size_type = std::size_t; static constexpr size_type double_size = batch_type::size; static inline batch_type fmod(const batch_type& x, const batch_type& y) { return fnma(trunc(x / y), y, x); } static inline batch_type remainder(const batch_type& x, const batch_type& y) { return fnma(nearbyint(x / y), y, x); } template static inline batch_type to_double(const IB& b, size_type offset) { batch_type res; for (size_type i = 0; i < double_size; ++i) { res[i] = b[i + offset]; } return res; } template static inline void to_int(const batch_type& src, IB& dst, size_type offset) { for (size_type i = 0; i < double_size; ++i) { dst[i + offset] = src[i]; } } }; template <> struct remainder_kernel { using size_type = std::size_t; static inline double fmod(double x, double y) { return std::fmod(x, y); } static inline double remainder(double x, double y) { return std::remainder(x, y); } template static inline double to_double(const IB& b, size_type offset) { double res = b[offset]; return res; } template static inline void to_int(double src, IB& dst, size_type offset) { dst[offset] = src; } }; template struct remainder_kernel { using batch_type = B; using double_batch = typename simd_traits::type; using double_kernel = remainder_kernel; using size_type = std::size_t; static constexpr size_type int_size = B::size; static constexpr size_type double_size = simd_traits::size; static inline batch_type fmod(const batch_type& x, const batch_type& y) { batch_type res; for (size_type i = 0; i < int_size; i += double_size) { double_batch tmp = double_kernel::fmod(double_kernel::to_double(x, i), double_kernel::to_double(y, i)); double_kernel::to_int(tmp, res, i); } return res; } static inline batch_type remainder(const batch_type& x, const batch_type& y) { batch_type res; for (size_type i = 0; i < int_size; i += double_size) { double_batch tmp = double_kernel::remainder(double_kernel::to_double(x, i), double_kernel::to_double(y, i)); 
double_kernel::to_int(tmp, res, i); } return res; } }; } /** * @brief Computes the floating-point remainder of the division operation \c x/y. * * The floating-point remainder of the division operation \c x/y calculated by this * function is exactly the value x - n*y, where \c n is \c x/y with its fractional * part truncated. The returned value has the same sign as \c x and is less than \c y in magnitude. * @param x batch of floating point values. * @param y batch of floating point values. * @return the floating-point remainder of the division. */ template inline batch_type_t fmod(const simd_base& x, const simd_base& y) { return detail::remainder_kernel::fmod(x(), y()); } /** * @brief Computes the IEEE remainder of the floating point division operation \c x/y. * * The IEEE floating-point remainder of the division operation \c x/y calculated by this * function is exactly the value x - n*y, where the value n is the integral value * nearest the exact value \c x/y. When |n-x/y| = 0.5, the value n is chosen to be even. * In contrast to fmod, the returned value is not guaranteed to have the same sign as \c x. * If the returned value is 0, it will have the same sign as \c x. * @param x batch of floating point values. * @param y batch of floating point values. * @return the IEEE remainder remainder of the floating point division. */ template inline batch_type_t remainder(const simd_base& x, const simd_base& y) { return detail::remainder_kernel::remainder(x(), y()); } /** * Computes the positive difference between \c x and \c y, that is, * max(0, x-y). * @param x batch of floating point values. * @param y batch of floating point values. * @return the positive difference. */ template inline batch_type_t fdim(const simd_base& x, const simd_base& y) { return fmax(batch_type_t(0.), x - y); } /** * Clips the values of the batch \c x between those of the batches \c lo and \c hi. * @param x batch of floating point values. * @param lo batch of floating point values. * @param hi batch of floating point values. * @return the result of the clipping. */ template inline batch_type_t clip(const simd_base& x, const simd_base& lo, const simd_base& hi) { return min(hi, max(x, lo)); } // TODO move scalar version? 
template ::value>::type> inline T clip(const T& x, const T& lo, const T& hi) { return std::min(hi, std::max(x, lo)); } namespace detail { template ::value> struct nextafter_kernel { using batch_type = batch; static inline batch_type next(const batch_type& b) noexcept { return b; } static inline batch_type prev(const batch_type& b) noexcept { return b; } }; template struct bitwise_cast_batch; template struct bitwise_cast_batch { using type = batch; }; template struct bitwise_cast_batch { using type = batch; }; template struct nextafter_kernel { using batch_type = batch; using int_batch = typename bitwise_cast_batch::type; using int_type = typename int_batch::value_type; static inline batch_type next(const batch_type& b) noexcept { batch_type n = bitwise_cast(bitwise_cast(b) + int_type(1)); return select(b == infinity(), b, n); } static inline batch_type prev(const batch_type& b) noexcept { batch_type p = bitwise_cast(bitwise_cast(b) - int_type(1)); return select(b == minusinfinity(), b, p); } }; } template inline batch_type_t nextafter(const simd_base& from, const simd_base& to) { using kernel = detail::nextafter_kernel::value_type, batch_type_t::size>; return select(from == to, from, select(to > from, kernel::next(from()), kernel::prev(from()))); } /******************************************* * Classification functions implementation * *******************************************/ /** * Determines if the scalars in the given batch \c x are finite values, * i.e. they are different from infinite or NaN. * @param x batch of floating point values. * @return a batch of booleans. */ template inline typename simd_batch_traits::batch_bool_type isfinite(const simd_base& x) { return (x - x) == batch_type_t(0.); } namespace detail { template ::value> struct isinf_kernel { using batch_type = batch; using batch_bool_type = typename simd_batch_traits::batch_bool_type; static inline batch_bool_type run(const batch_type& x) { return abs(x) == ::xsimd::infinity>(); } }; template struct isinf_kernel { using batch_type = batch; using batch_bool_type = typename simd_batch_traits::batch_bool_type; static inline batch_bool_type run(const batch_type&) { return batch_bool_type(false); } }; } /** * Determines if the scalars in the given batch \c x are positive * or negative infinity. * @param x batch of floating point values. * @return a batch of booleans. */ template inline typename simd_batch_traits::batch_bool_type isinf(const simd_base& x) { using kernel_type = detail::isinf_kernel::value_type, batch_type_t::size>; return kernel_type::run(x()); //return abs(x) == infinity>(); } template inline typename simd_batch_traits::batch_bool_type is_flint(const simd_base& x) { using b_type = batch_type_t; b_type frac = select(xsimd::isnan(x - x), nan(), x - trunc(x)); return frac == b_type(0.); } template inline typename simd_batch_traits::batch_bool_type is_odd(const simd_base& x) { return is_even(x - batch_type_t(1.)); } template inline typename simd_batch_traits::batch_bool_type is_even(const simd_base& x) { return is_flint(x * batch_type_t(0.5)); } } #endif xsimd-7.6.0/include/xsimd/math/xsimd_error.hpp000066400000000000000000000345031410101234500214120ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_ERROR_HPP #define XSIMD_ERROR_HPP #include "xsimd_basic_math.hpp" #include "xsimd_exponential.hpp" #include "xsimd_fp_sign.hpp" #include "xsimd_horner.hpp" namespace xsimd { /** * Computes the error function of the batch \c x. * @param x batch of floating point values. * @return the error function of \c x. */ template batch_type_t erf(const simd_base& x); /** * Computes the complementary error function of the batch \c x. * @param x batch of floating point values. * @return the error function of \c x. */ template batch_type_t erfc(const simd_base& x); /********************** * erf implementation * **********************/ namespace detail { /* origin: boost/simd/arch/common/detail/generic/erf_kernel.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template struct erf_kernel; template struct erf_kernel { // computes erf(a0)/a0 // x is sqr(a0) and 0 <= abs(a0) <= 2/3 static inline B erf1(const B& x) { return horner(x); } // computes erfc(x)*exp(sqr(x)) // x >= 2/3 static inline B erfc2(const B& x) { return horner(x); } static inline B erfc3(const B& x) { return (B(1.) - x) * horner(x); } }; template struct erf_kernel { // computes erf(a0)/a0 // x is sqr(a0) and 0 <= abs(a0) <= 0.65 static inline B erf1(const B& x) { return horner(x) / horner(x); } // computes erfc(x)*exp(x*x) // 0.65 <= abs(x) <= 2.2 static inline B erfc2(const B& x) { return horner(x) / horner(x); } // computes erfc(x)*exp(x*x) // 2.2 <= abs(x) <= 6 static inline B erfc3(const B& x) { return horner(x) / horner(x); } // computes erfc(rx)*exp(rx*rx) // x >= 6 rx = 1/x static inline B erfc4(const B& x) { return horner(x); } }; /* origin: boost/simd/arch/common/simd/function/erf.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template struct erf_impl; template struct erf_impl { static inline B compute(const B& a) { B x = abs(a); B r1 = B(0.); auto test1 = x < B(2.f / 3.f); if (any(test1)) { r1 = a * erf_kernel::erf1(x * x); if (all(test1)) return r1; } B z = x / (B(1.) + x); z -= B(0.4f); B r2 = B(1.) - exp(-x * x) * erf_kernel::erfc2(z); r2 = select(a < B(0.), -r2, r2); r1 = select(test1, r1, r2); #ifndef XSIMD_NO_INFINITIES r1 = select(xsimd::isinf(a), sign(a), r1); #endif return r1; } }; template struct erf_impl { static inline B compute(const B& a) { B x = abs(a); B xx = x * x; B lim1 = B(0.65); B lim2 = B(2.2); auto test1 = x < lim1; B r1 = B(0.); if (any(test1)) { r1 = a * erf_kernel::erf1(xx); if (all(test1)) return r1; } auto test2 = x < lim2; auto test3 = test2 && !test1; B ex = exp(-xx); if (any(test3)) { B z = B(1.) - ex * erf_kernel::erfc2(x); B r2 = select(a < B(0.), -z, z); r1 = select(test1, r1, r2); if (all(test1 || test3)) return r1; } B z = B(1.) 
- ex * erf_kernel::erfc3(x); z = select(a < B(0.), -z, z); #ifndef XSIMD_NO_INFINITIES z = select(xsimd::isinf(a), sign(a), z); #endif return select(test2, r1, z); } }; } template inline batch_type_t erf(const simd_base& x) { return detail::erf_impl>::compute(x()); } /*********************** * erfc implementation * ***********************/ namespace detail { template struct erfc_impl; template struct erfc_impl { static inline B compute(const B& a) { B x = abs(a); auto test0 = a < B(0.); B r1 = B(0.); auto test1 = x < B(2.f / 3.f); B z = x / (B(1.) + x); if (any(test1)) { r1 = erf_kernel::erfc3(z); if (all(test1)) return select(test0, B(2.) - r1, r1); } z -= B(0.4f); B r2 = exp(-x * x) * erf_kernel::erfc2(z); r1 = select(test1, r1, r2); #ifndef XSIMD_NO_INFINITIES r1 = select(x == infinity(), B(0.), r1); #endif return select(test0, B(2.) - r1, r1); } }; template struct erfc_impl { static inline B compute(const B& a) { B x = abs(a); B xx = x * x; B lim1 = B(0.65); B lim2 = B(2.2); auto test0 = a < B(0.); auto test1 = x < lim1; B r1 = B(0.); if (any(test1)) { r1 = B(1.) - x * erf_kernel::erf1(xx); if (all(test1)) return select(test0, B(2.) - r1, r1); } auto test2 = x < lim2; auto test3 = test2 && !test1; B ex = exp(-xx); if (any(test3)) { B z = ex * erf_kernel::erfc2(x); r1 = select(test1, r1, z); if (all(test1 || test3)) return select(test0, B(2.) - r1, r1); } B z = ex * erf_kernel::erfc3(x); r1 = select(test2, r1, z); #ifndef XSIMD_NO_INFINITIES r1 = select(x == infinity(), B(0.), r1); #endif return select(test0, B(2.) - r1, r1); } }; } template inline batch_type_t erfc(const simd_base& x) { return detail::erfc_impl>::compute(x()); } } #endif xsimd-7.6.0/include/xsimd/math/xsimd_estrin.hpp000066400000000000000000000062131410101234500215620ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_ESTRIN_HPP #define XSIMD_ESTRIN_HPP #include #include "../types/xsimd_types_include.hpp" namespace xsimd { namespace detail { template inline T coef() noexcept { using value_type = typename T::value_type; return T(caster_t(as_unsigned_integer_t(c)).f); } template struct estrin { T x; template inline T operator()(const Ts&... coefs) noexcept { return eval(coefs...); } private: inline T eval(const T& c0) noexcept { return c0; } inline T eval(const T& c0, const T& c1) noexcept { return fma(x, c1, c0); } template inline T eval(index_sequence, const Tuple& tuple) { return estrin{x * x}(std::get(tuple)...); } template inline T eval(const std::tuple& tuple) noexcept { return eval(make_index_sequence(), tuple); } template inline T eval(const std::tuple& tuple, const T& c0) noexcept { return eval(std::tuple_cat(tuple, std::make_tuple(eval(c0)))); } template inline T eval(const std::tuple& tuple, const T& c0, const T& c1) noexcept { return eval(std::tuple_cat(tuple, std::make_tuple(eval(c0, c1)))); } template inline T eval(const std::tuple& tuple, const T& c0, const T& c1, const Ts&... coefs) noexcept { return eval(std::tuple_cat(tuple, std::make_tuple(eval(c0, c1))), coefs...); } template inline T eval(const T& c0, const T& c1, const Ts&... 
coefs) noexcept { return eval(std::make_tuple(eval(c0, c1)), coefs...); } }; } /********** * estrin * **********/ template inline T estrin(const T& x) noexcept { return detail::estrin{x}(detail::coef()...); } } #endif xsimd-7.6.0/include/xsimd/math/xsimd_exp_reduction.hpp000066400000000000000000000217071410101234500231330ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_EXP_REDUCTION_HPP #define XSIMD_EXP_REDUCTION_HPP #include "xsimd_horner.hpp" #include "xsimd_numerical_constant.hpp" #include "xsimd_rounding.hpp" namespace xsimd { struct exp_tag { }; struct exp2_tag { }; struct exp10_tag { }; namespace detail { /********************** * exp_reduction_base * **********************/ template struct exp_reduction_base; template struct exp_reduction_base { static constexpr B maxlog() noexcept { return xsimd::maxlog(); } static constexpr B minlog() noexcept { return xsimd::minlog(); } }; template struct exp_reduction_base { static constexpr B maxlog() noexcept { return xsimd::maxlog2(); } static constexpr B minlog() noexcept { return xsimd::minlog2(); } }; template struct exp_reduction_base { static constexpr B maxlog() noexcept { return xsimd::maxlog10(); } static constexpr B minlog() noexcept { return xsimd::minlog10(); } }; /***************** * exp_reduction * *****************/ template struct exp_reduction; /* origin: boost/simd/arch/common/detail/generic/f_expo_reduction.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template struct exp_reduction : exp_reduction_base { static inline B approx(const B& x) { B y = horner(x); return ++fma(y, x * x, x); } static inline B reduce(const B& a, B& x) { B k = nearbyint(invlog_2() * a); x = fnma(k, log_2hi(), a); x = fnma(k, log_2lo(), x); return k; } }; template struct exp_reduction : exp_reduction_base { static inline B approx(const B& x) { B y = horner(x); return ++fma(y, x * x, x * log_2()); } static inline B reduce(const B& a, B& x) { B k = nearbyint(a); x = (a - k); return k; } }; template struct exp_reduction : exp_reduction_base { static inline B approx(const B& x) { return ++(horner(x) * x); } static inline B reduce(const B& a, B& x) { B k = nearbyint(invlog10_2() * a); x = fnma(k, log10_2hi(), a); x -= k * log10_2lo(); return k; } }; /* origin: boost/simd/arch/common/detail/generic/d_expo_reduction.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. 
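/* Illustrative aside (not from the original source): the detail::estrin
 * evaluator defined just above pairs coefficients as (c_i + c_{i+1}*x) and then
 * combines the pairs with powers of x^2, which shortens the dependency chain
 * relative to Horner's rule and lets the pairs be computed in parallel.  A
 * scalar sketch for a degree-5 polynomial; estrin_degree5 is an illustrative
 * name, with std::fma standing in for the batch fma used above. */
#include <cmath>

// p(x) = c0 + c1 x + ... + c5 x^5 evaluated Estrin-style.
inline double estrin_degree5(double x, const double c[6])
{
    double x2 = x * x;
    double x4 = x2 * x2;
    double p01 = std::fma(c[1], x, c[0]);   // c0 + c1 x
    double p23 = std::fma(c[3], x, c[2]);   // c2 + c3 x
    double p45 = std::fma(c[5], x, c[4]);   // c4 + c5 x
    return std::fma(p45, x4, std::fma(p23, x2, p01));
}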
* (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template struct exp_reduction : exp_reduction_base { static inline B approx(const B& x) { B t = x * x; return fnma(t, horner(t), x); } static inline B reduce(const B& a, B& hi, B& lo, B& x) { B k = nearbyint(invlog_2() * a); hi = fnma(k, log_2hi(), a); lo = k * log_2lo(); x = hi - lo; return k; } static inline B finalize(const B& x, const B& c, const B& hi, const B& lo) { return B(1.) - (((lo - (x * c) / (B(2.) - c)) - hi)); } }; template struct exp_reduction : exp_reduction_base { static inline B approx(const B& x) { B t = x * x; return fnma(t, horner(t), x); } static inline B reduce(const B& a, B&, B&, B& x) { B k = nearbyint(a); x = (a - k) * log_2(); return k; } static inline B finalize(const B& x, const B& c, const B&, const B&) { return B(1.) + x + x * c / (B(2.) - c); } }; template struct exp_reduction : exp_reduction_base { static inline B approx(const B& x) { B xx = x * x; B px = x * horner(xx); B x2 = px / (horner1(xx) - px); return ++(x2 + x2); } static inline B reduce(const B& a, B&, B&, B& x) { B k = nearbyint(invlog10_2() * a); x = fnma(k, log10_2hi(), a); x = fnma(k, log10_2lo(), x); return k; } static inline B finalize(const B&, const B& c, const B&, const B&) { return c; } }; } } #endif xsimd-7.6.0/include/xsimd/math/xsimd_exponential.hpp000066400000000000000000000207751410101234500226150ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_EXPONENTIAL_HPP #define XSIMD_EXPONENTIAL_HPP #include "xsimd_exp_reduction.hpp" #include "xsimd_fp_manipulation.hpp" namespace xsimd { /** * Computes the natural exponential of the batch \c x. * @param x batch of floating point values. * @return the natural exponential of \c x. */ template batch_type_t exp(const simd_base& x); /** * Computes the base 2 exponential of the batch \c x. * @param x batch of floating point values. * @return the base 2 exponential of \c x. */ template batch_type_t exp2(const simd_base& x); /** * Computes the base 10 exponential of the batch \c x. * @param x batch of floating point values. * @return the base 10 exponential of \c x. */ template batch_type_t exp10(const simd_base& x); /** * Computes the natural exponential of the batch \c x, minus one. * @param x batch of floating point values. * @return the natural exponential of \c x, minus one. */ template batch_type_t expm1(const simd_base& x); /****************************** * exponential implementation * ******************************/ namespace detail { template struct exp_kernel; template struct exp_kernel { /* origin: boost/simd/arch/common/detail/simd/expo_base.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. 
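/* Illustrative aside (not from the original source): exp_reduction above
 * rewrites exp(a) as 2^k * exp(r) with k = nearbyint(a / log 2), subtracting
 * k * log 2 against a hi/lo split of the constant (log_2hi / log_2lo) so the
 * reduced argument r keeps full precision.  A scalar sketch of the same
 * reduction -- a single constant is used here for brevity, and the residual
 * exp(r) is delegated to std::exp instead of the polynomial above. */
#include <cmath>

inline double exp_reduced_sketch(double a)
{
    const double ln2 = 0.6931471805599453;                // log(2)
    double k = std::nearbyint(a / ln2);                   // exponent of the 2^k factor
    double r = a - k * ln2;                               // reduced argument, |r| <= ~log(2)/2
    return std::ldexp(std::exp(r), static_cast<int>(k));  // reassemble 2^k * exp(r)
}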
* (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ static inline B compute(const B& a) { using reducer_t = exp_reduction; B x; B k = reducer_t::reduce(a, x); x = reducer_t::approx(x); x = select(a <= reducer_t::minlog(), B(0.), ldexp(x, to_int(k))); x = select(a >= reducer_t::maxlog(), infinity(), x); return x; } }; template struct exp_kernel { /* origin: boost/simd/arch/common/detail/simd/expo_base.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ static inline B compute(const B& a) { using reducer_t = exp_reduction; B hi, lo, x; B k = reducer_t::reduce(a, hi, lo, x); B c = reducer_t::approx(x); c = reducer_t::finalize(x, c, hi, lo); c = select(a <= reducer_t::minlog(), B(0.), ldexp(c, to_int(k))); c = select(a >= reducer_t::maxlog(), infinity(), c); return c; } }; } template inline batch_type_t exp(const simd_base& x) { return detail::exp_kernel, exp_tag>::compute(x()); } template inline batch_type_t exp2(const simd_base& x) { return detail::exp_kernel, exp2_tag>::compute(x()); } template inline batch_type_t exp10(const simd_base& x) { return detail::exp_kernel, exp10_tag>::compute(x()); } /************************ * expm1 implementation * ************************/ namespace detail { template struct expm1_kernel; template inline B expm1_real_impl(const B& x) { return select(x < logeps(), B(-1.), select(x > maxlog(), infinity(), expm1_kernel::compute_impl(x))); } template struct expm1_kernel { /* origin: boost/simd/arch/common/detail/generic/expm1_kernel.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ static inline B compute_impl(const B& a) { B k = nearbyint(invlog_2() * a); B x = fnma(k, log_2hi(), a); x = fnma(k, log_2lo(), x); B hx = x * B(0.5); B hxs = x * hx; B r = horner(hxs); B t = fnma(r, hx, B(3.)); B e = hxs * ((r - t) / (B(6.) - x * t)); e = fms(x, e, hxs); using i_type = as_integer_t; i_type ik = to_int(k); B two2mk = bitwise_cast((maxexponent() - ik) << nmb()); B y = B(1.) - two2mk - (e - x); return ldexp(y, ik); } static inline B compute(const B& a) { return expm1_real_impl(a); } }; template struct expm1_kernel { /* origin: boost/simd/arch/common/detail/generic/expm1_kernel.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ static inline B compute_impl(const B& a) { B k = nearbyint(invlog_2() * a); B hi = fnma(k, log_2hi(), a); B lo = k * log_2lo(); B x = hi - lo; B hxs = x * x * B(0.5); B r = horner(hxs); B t = B(3.) - r * B(0.5) * x; B e = hxs * ((r - t) / (B(6) - x * t)); B c = (hi - x) - lo; e = (x * (e - c) - c) - hxs; using i_type = as_integer_t; i_type ik = to_int(k); B two2mk = bitwise_cast((maxexponent() - ik) << nmb()); B ct1 = B(1.) 
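/* Illustrative aside (not from the original source): the expm1 kernels above
 * exist because exp(x) - 1 cancels catastrophically for small |x| -- exp(x)
 * rounds to a value within half an ulp of 1, and the subtraction throws most
 * of the significant digits away.  A scalar demonstration; the figures quoted
 * in the comments assume IEEE double precision. */
#include <cmath>
#include <cstdio>

inline void expm1_cancellation_demo()
{
    double x = 1e-12;
    double naive = std::exp(x) - 1.0;  // ~1.00009e-12: only four or so correct digits
    double good = std::expm1(x);       // ~1.0000000000005e-12: accurate to the last place
    std::printf("exp(x)-1 = %.17g\nexpm1(x) = %.17g\n", naive, good);
}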
- two2mk - (e - x); B ct2 = ++(x - (e + two2mk)); B y = select(k < B(20.), ct1, ct2); return ldexp(y, ik); } static inline B compute(const B& a) { return expm1_real_impl(a); } }; } template inline batch_type_t expm1(const simd_base& x) { return detail::expm1_kernel>::compute(x()); } } #endif xsimd-7.6.0/include/xsimd/math/xsimd_fp_manipulation.hpp000066400000000000000000000055641410101234500234530ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_FP_MANIPULATION_HPP #define XSIMD_FP_MANIPULATION_HPP #include "xsimd_numerical_constant.hpp" namespace xsimd { template batch ldexp(const batch& x, const batch, N>& e); template batch frexp(const batch& arg, batch, N>& exp); /******************************************************** * Floating point manipulation functions implementation * ********************************************************/ /* origin: boost/simd/arch/common/simd/function/ldexp.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template inline batch ldexp(const batch& x, const batch, N>& e) { using btype = batch; using itype = as_integer_t; itype ik = e + maxexponent(); ik = ik << nmb(); return x * bitwise_cast(ik); } /* origin: boost/simd/arch/common/simd/function/ifrexp.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template inline batch frexp(const batch& arg, batch, N>& exp) { using b_type = batch; using i_type = batch, N>; i_type m1f = mask1frexp(); i_type r1 = m1f & bitwise_cast(arg); b_type x = arg & bitwise_cast(~m1f); exp = (r1 >> nmb()) - maxexponentm1(); exp = select(bool_cast(arg != b_type(0.)), exp, zero()); return select((arg != b_type(0.)), x | bitwise_cast(mask2frexp()), b_type(0.)); } } #endif xsimd-7.6.0/include/xsimd/math/xsimd_fp_sign.hpp000066400000000000000000000103171410101234500217030ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
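/* Illustrative aside (not from the original xsimd_fp_manipulation.hpp): ldexp
 * above builds the scale factor 2^e directly, by adding the exponent bias
 * (maxexponent) to e, shifting it into the exponent field (nmb = number of
 * mantissa bits) and bit-casting the result back to floating point before the
 * multiply.  A scalar sketch of the same trick for IEEE doubles, valid while
 * e + 1023 stays within the normal exponent range; ldexp_bits is an
 * illustrative name. */
#include <cstdint>
#include <cstring>

inline double ldexp_bits(double x, int e)
{
    std::uint64_t bits = static_cast<std::uint64_t>(e + 1023) << 52;  // biased exponent in place
    double pow2e;
    std::memcpy(&pow2e, &bits, sizeof(pow2e));  // the scalar analogue of bitwise_cast
    return x * pow2e;                           // x * 2^e
}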
* ****************************************************************************/ #ifndef XSIMD_FP_SIGN_HPP #define XSIMD_FP_SIGN_HPP #include #include "xsimd_numerical_constant.hpp" namespace xsimd { template batch_type_t bitofsign(const simd_base& x); template batch_type_t copysign(const simd_base& x1, const simd_base& x2); template batch_type_t sign(const simd_base& x); template batch_type_t signnz(const simd_base& x); /************************** * fp_sign implementation * **************************/ template inline batch_type_t bitofsign(const simd_base& x) { return x() & minuszero>(); } template inline batch_type_t copysign(const simd_base& x1, const simd_base& x2) { return abs(x1) | bitofsign(x2); } /*********************** * sign implementation * ***********************/ namespace detail { /* origin: boost/simd/arch/common/simd/function/sign.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template ::value> struct sign_impl { static inline B compute(const B& a) { return select(a > zero(), B(1), zero()) - select(a < zero(), B(1), zero()); } }; template struct sign_impl { static inline B compute(const B& a) { B r = select(a > B(0.), B(1.), B(0.)) - select(a < B(0.), B(1.), B(0.)); #ifdef XSIMD_NO_NANS return r; #else return select(xsimd::isnan(a), nan(), r); #endif } }; } template inline batch_type_t sign(const simd_base& x) { return detail::sign_impl>::compute(x()); } /************************* * signnz implementation * *************************/ namespace detail { /* origin: boost/simd/arch/common/simd/function/signnz.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template ::value> struct signnz_impl { static inline B compute(const B& x) { using value_type = typename B::value_type; return (x >> (sizeof(value_type) * 8 - 1)) | B(1.); } }; template struct signnz_impl { static inline B compute(const B& x) { #ifndef XSIMD_NO_NANS return select(xsimd::isnan(x), nan(), B(1.) | (signmask() & x)); #else return B(1.) | (signmask() & x); #endif } }; } template inline batch_type_t signnz(const simd_base& x) { return detail::signnz_impl>::compute(x()); } } #endif xsimd-7.6.0/include/xsimd/math/xsimd_gamma.hpp000066400000000000000000000551341410101234500213460ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_GAMMA_HPP #define XSIMD_GAMMA_HPP #include "xsimd_basic_math.hpp" #include "xsimd_exponential.hpp" #include "xsimd_horner.hpp" #include "xsimd_logarithm.hpp" #include "xsimd_trigonometric.hpp" namespace xsimd { /** * Computes the gamma function of the batch \c x. * @param x batch of floating point values. * @return the gamma function of \c x. 
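/* Illustrative aside (not from the original xsimd_fp_sign.hpp): bitofsign and
 * copysign above operate purely on the sign bit -- minuszero<B>() is a mask
 * with only that bit set, so x & minuszero extracts the sign and
 * abs(x) | sign transplants it.  A scalar equivalent on the raw bit pattern
 * of a double; copysign_bits is an illustrative name. */
#include <cstdint>
#include <cstring>

inline double copysign_bits(double magnitude, double sign_source)
{
    std::uint64_t m, s;
    std::memcpy(&m, &magnitude, sizeof(m));
    std::memcpy(&s, &sign_source, sizeof(s));
    const std::uint64_t sign_mask = 0x8000000000000000ull;  // bit pattern of -0.0
    std::uint64_t r = (m & ~sign_mask) | (s & sign_mask);   // |magnitude| with the other sign
    double out;
    std::memcpy(&out, &r, sizeof(out));
    return out;
}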
*/ template batch_type_t tgamma(const simd_base& x); /** * Computes the natural logarithm of the gamma function of the batch \c x. * @param x batch of floating point values. * @return the natural logarithm of the gamma function of \c x. */ template batch_type_t lgamma(const simd_base& x); /************************* * tgamma implementation * *************************/ namespace detail { /* origin: boost/simd/arch/common/detail/generic/stirling_kernel.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template struct stirling_kernel; template struct stirling_kernel { static inline B compute(const B& x) { return horner(x); } static inline B split_limit() { return B(detail::caster32_t(uint32_t(0x41d628f6)).f); } static inline B large_limit() { return B(detail::caster32_t(uint32_t(0x420c28f3)).f); } }; template struct stirling_kernel { static inline B compute(const B& x) { return horner(x); } static inline B split_limit() { return B(detail::caster64_t(uint64_t(0x4061e083ba3443d4)).f); } static inline B large_limit() { return B(detail::caster64_t(uint64_t(0x4065800000000000)).f); } }; /* origin: boost/simd/arch/common/simd/function/stirling.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template inline B stirling(const B& a) { const B stirlingsplitlim = stirling_kernel::split_limit(); const B stirlinglargelim = stirling_kernel::large_limit(); B x = select(a >= B(0.), a, nan()); B w = B(1.) / x; w = fma(w, stirling_kernel::compute(w), B(1.)); B y = exp(-x); auto test = (x < stirlingsplitlim); B z = x - B(0.5); z = select(test, z, B(0.5) * z); B v = exp(z * log(abs(x))); y *= v; y = select(test, y, y * v); y *= sqrt_2pi() * w; #ifndef XSIMD_NO_INFINITIES y = select(xsimd::isinf(x), x, y); #endif return select(x > stirlinglargelim, infinity(), y); } } /* origin: boost/simd/arch/common/detail/generic/gamma_kernel.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ namespace detail { template struct tgamma_kernel; template struct tgamma_kernel { static inline B compute(const B& x) { return horner(x); } }; template struct tgamma_kernel { static inline B compute(const B& x) { return horner(x) / horner(x); } }; } /* origin: boost/simd/arch/common/simd/function/gamma.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. 
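/* Illustrative aside (not from the original source): detail::stirling above is
 * Stirling's approximation Gamma(x) ~ sqrt(2*pi) * x^(x - 1/2) * e^(-x) * w,
 * where w = 1 + 1/(12x) + ... comes from stirling_kernel and, past
 * split_limit(), the power x^(x - 1/2) is taken in two halves so it cannot
 * overflow on its own.  A scalar sketch keeping only the first correction
 * term; the omitted terms contribute a relative error of roughly 1/(288 x^2). */
#include <cmath>

inline double stirling_gamma_sketch(double x)  // x > 0, reasonably large
{
    const double pi = 3.141592653589793;
    double w = 1.0 + 1.0 / (12.0 * x);         // leading series correction
    return std::sqrt(2.0 * pi) * std::pow(x, x - 0.5) * std::exp(-x) * w;
}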
* (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ namespace detail { template B tgamma_large_negative(const B& a) { B st = stirling(a); B p = floor(a); B sgngam = select(is_even(p), -B(1.), B(1.)); B z = a - p; auto test2 = z < B(0.5); z = select(test2, z - B(1.), z); z = a * trigo_kernel::sin(z, trigo_pi_tag()); z = abs(z); return sgngam * pi() / (z * st); } template B tgamma_other(const B& a, const BB& test) { B x = select(test, B(2.), a); #ifndef XSIMD_NO_INFINITIES auto inf_result = (a == infinity()); x = select(inf_result, B(2.), x); #endif B z = B(1.); auto test1 = (x >= B(3.)); while (any(test1)) { x = select(test1, x - B(1.), x); z = select(test1, z * x, z); test1 = (x >= B(3.)); } test1 = (x < B(0.)); while (any(test1)) { z = select(test1, z / x, z); x = select(test1, x + B(1.), x); test1 = (x < B(0.)); } auto test2 = (x < B(2.)); while (any(test2)) { z = select(test2, z / x, z); x = select(test2, x + B(1.), x); test2 = (x < B(2.)); } x = z * tgamma_kernel::compute(x - B(2.)); #ifndef XSIMD_NO_INFINITIES return select(inf_result, a, x); #else return x; #endif } template inline B tgamma_impl(const B& a) { auto nan_result = (a < B(0.) && is_flint(a)); #ifndef XSIMD_NO_INVALIDS nan_result = xsimd::isnan(a) || nan_result; #endif B q = abs(a); auto test = (a < B(-33.)); B r = nan(); if (any(test)) { r = tgamma_large_negative(q); if (all(test)) return select(nan_result, nan(), r); } B r1 = tgamma_other(a, test); B r2 = select(test, r, r1); return select(a == B(0.), copysign(infinity(), a), select(nan_result, nan(), r2)); } } template inline batch_type_t tgamma(const simd_base& x) { return detail::tgamma_impl(x()); } /************************* * lgamma implementation * *************************/ /* origin: boost/simd/arch/common/detail/generic/gammaln_kernel.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ namespace detail { template struct lgamma_kernel; template struct lgamma_kernel { static inline B gammalnB(const B& x) { return horner(x); } static inline B gammalnC(const B& x) { return horner(x); } static inline B gammaln2(const B& x) { return horner(x); } }; template struct lgamma_kernel { static inline B gammaln1(const B& x) { return horner(x) / horner(x); } static inline B gammalnA(const B& x) { return horner(x); } }; } /* origin: boost/simd/arch/common/simd/function/gammaln.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. 
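/* Illustrative aside (not from the original source): tgamma_large_negative
 * above is Euler's reflection formula Gamma(x) * Gamma(1 - x) = pi / sin(pi*x):
 * the positive half goes through Stirling and the sign comes from the parity
 * of floor(x).  A scalar sketch of the reflection step, with std::tgamma
 * standing in for the positive-argument evaluation. */
#include <cmath>

// Valid for negative, non-integer x.
inline double gamma_by_reflection(double x)
{
    const double pi = 3.141592653589793;
    return pi / (std::sin(pi * x) * std::tgamma(1.0 - x));
}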
* (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ namespace detail { template struct lgamma_impl; template struct lgamma_impl { static inline B compute(const B& a) { auto inf_result = (a <= B(0.)) && is_flint(a); B x = select(inf_result, nan(), a); B q = abs(x); #ifndef XSIMD_NO_INFINITIES inf_result = (x == infinity()) || inf_result; #endif auto ltza = a < B(0.); B r; B r1 = other(q); if (any(ltza)) { r = select(inf_result, infinity(), negative(q, r1)); if (all(ltza)) return r; } B r2 = select(ltza, r, r1); return select(a == minusinfinity(), nan(), select(inf_result, infinity(), r2)); } private: static inline B negative(const B& q, const B& w) { B p = floor(q); B z = q - p; auto test2 = z < B(0.5); z = select(test2, z - B(1.), z); z = q * trigo_kernel::sin(z, trigo_pi_tag()); return -log(invpi() * abs(z)) - w; } static inline B other(const B& x) { auto xlt650 = (x < B(6.5)); B r0x = x; B r0z = x; B r0s = B(1.); B r1 = B(0.); B p = nan(); if (any(xlt650)) { B z = B(1.); B tx = select(xlt650, x, B(0.)); B nx = B(0.); const B _075 = B(0.75); const B _150 = B(1.50); const B _125 = B(1.25); const B _250 = B(2.50); auto xge150 = (x >= _150); auto txgt250 = (tx > _250); // x >= 1.5 while (any(xge150 && txgt250)) { nx = select(txgt250, nx - B(1.), nx); tx = select(txgt250, x + nx, tx); z = select(txgt250, z * tx, z); txgt250 = (tx > _250); } r0x = select(xge150, x + nx - B(2.), x); r0z = select(xge150, z, r0z); r0s = select(xge150, B(1.), r0s); // x >= 1.25 && x < 1.5 auto xge125 = (x >= _125); auto xge125t = xge125 && !xge150; if (any(xge125)) { r0x = select(xge125t, x - B(1.), r0x); r0z = select(xge125t, z * x, r0z); r0s = select(xge125t, B(-1.), r0s); } // x >= 0.75 && x < 1.5 auto kernelC = as_logical_t(false); auto xge075 = (x >= _075); auto xge075t = xge075 && !xge125; if (any(xge075t)) { kernelC = xge075t; r0x = select(xge075t, x - B(1.), x); r0z = select(xge075t, B(1.), r0z); r0s = select(xge075t, B(-1.), r0s); p = lgamma_kernel::gammalnC(r0x); } // tx < 1.5 && x < 0.75 auto txlt150 = (tx < _150) && !xge075; if (any(txlt150)) { auto orig = txlt150; while (any(txlt150)) { z = select(txlt150, z * tx, z); nx = select(txlt150, nx + B(1.), nx); tx = select(txlt150, x + nx, tx); txlt150 = (tx < _150) && !xge075; } r0x = select(orig, r0x + nx - B(2.), r0x); r0z = select(orig, z, r0z); r0s = select(orig, B(-1.), r0s); } p = select(kernelC, p, lgamma_kernel::gammalnB(r0x)); if (all(xlt650)) return fma(r0x, p, r0s * log(abs(r0z))); } r0z = select(xlt650, abs(r0z), x); B m = log(r0z); r1 = fma(r0x, p, r0s * m); B r2 = fma(x - B(0.5), m, logsqrt2pi() - x); r2 += lgamma_kernel::gammaln2(B(1.) 
/ (x * x)) / x; return select(xlt650, r1, r2); } }; template struct lgamma_impl { static inline B compute(const B& a) { auto inf_result = (a <= B(0.)) && is_flint(a); B x = select(inf_result, nan(), a); B q = abs(x); #ifndef XSIMD_NO_INFINITIES inf_result = (q == infinity()); #endif auto test = (a < B(-34.)); B r = nan(); if (any(test)) { r = large_negative(q); if (all(test)) return select(inf_result, nan(), r); } B r1 = other(a); B r2 = select(test, r, r1); return select(a == minusinfinity(), nan(), select(inf_result, infinity(), r2)); } private: static inline B large_negative(const B& q) { B w = lgamma(q); B p = floor(q); B z = q - p; auto test2 = (z < B(0.5)); z = select(test2, z - B(1.), z); z = q * trigo_kernel::sin(z, trigo_pi_tag()); z = abs(z); return logpi() - log(z) - w; } static inline B other(const B& xx) { B x = xx; auto test = (x < B(13.)); B r1 = B(0.); if (any(test)) { B z = B(1.); B p = B(0.); B u = select(test, x, B(0.)); auto test1 = (u >= B(3.)); while (any(test1)) { p = select(test1, p - B(1.), p); u = select(test1, x + p, u); z = select(test1, z * u, z); test1 = (u >= B(3.)); } auto test2 = (u < B(2.)); while (any(test2)) { z = select(test2, z / u, z); p = select(test2, p + B(1.), p); u = select(test2, x + p, u); test2 = (u < B(2.)); } z = abs(z); x += p - B(2.); r1 = x * lgamma_kernel::gammaln1(x) + log(z); if (all(test)) return r1; } B r2 = fma(xx - B(0.5), log(xx), logsqrt2pi() - xx); B p = B(1.) / (xx * xx); r2 += lgamma_kernel::gammalnA(p) / xx; return select(test, r1, r2); } }; } template inline batch_type_t lgamma(const simd_base& x) { return detail::lgamma_impl>::compute(x()); } } #endif xsimd-7.6.0/include/xsimd/math/xsimd_horner.hpp000066400000000000000000000046611410101234500215600ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_HORNER_HPP #define XSIMD_HORNER_HPP #include "../types/xsimd_types_include.hpp" #include "xsimd_estrin.hpp" namespace xsimd { /********** * horner * **********/ /* origin: boost/simdfunction/horn.hpp*/ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template inline T horner(const T&) noexcept { return T(0.); } template inline T horner(const T&) noexcept { return detail::coef(); } template inline T horner(const T& x) noexcept { return fma(x, horner(x), detail::coef()); } /*********** * horner1 * ***********/ /* origin: boost/simdfunction/horn1.hpp*/ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. 
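/* Illustrative aside (not from the original xsimd_horner.hpp): horner above
 * unrolls the recurrence p = fma(x, p, c_i) at compile time, with the
 * coefficients passed as template parameters (bit patterns decoded by
 * detail::coef); horner1 is the variant whose leading coefficient is an
 * implicit 1.  A runtime scalar sketch of the same rule; horner_eval is an
 * illustrative name. */
#include <cmath>

// p(x) = c[0] + x*(c[1] + x*(c[2] + ...)) -- one fused multiply-add per coefficient.
inline double horner_eval(double x, const double* c, int degree)
{
    double p = c[degree];              // leading coefficient
    for (int i = degree - 1; i >= 0; --i)
        p = std::fma(p, x, c[i]);      // p = p*x + c[i]
    return p;
}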
* (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template inline T horner1(const T&) noexcept { return T(1.); } template inline T horner1(const T& x) noexcept { return x + detail::coef(); } template inline T horner1(const T& x) noexcept { return fma(x, horner1(x), detail::coef()); } } #endif xsimd-7.6.0/include/xsimd/math/xsimd_hyperbolic.hpp000066400000000000000000000414311410101234500224170ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_HYPERBOLIC_HPP #define XSIMD_HYPERBOLIC_HPP #include #include "xsimd_exponential.hpp" #include "xsimd_fp_sign.hpp" #include "xsimd_logarithm.hpp" #include "xsimd_power.hpp" namespace xsimd { template batch_type_t average(const simd_base& x1, const simd_base& x2); /** * Computes the hyperbolic sine of the batch \c x. * @param x batch of floating point values. * @return the hyperbolic sine of \c x. */ template batch_type_t sinh(const simd_base& x); /** * Computes the hyperbolic cosine of the batch \c x. * @param x batch of floating point values. * @return the hyperbolic cosine of \c x. */ template batch_type_t cosh(const simd_base& x); /** * Computes the hyperbolic tangent of the batch \c x. * @param x batch of floating point values. * @return the hyperbolic tangent of \c x. */ template batch_type_t tanh(const simd_base& x); /** * Computes the inverse hyperbolic sine of the batch \c x. * @param x batch of floating point values. * @return the inverse hyperbolic sine of \c x. */ template batch_type_t asinh(const simd_base& x); /** * Computes the inverse hyperbolic cosine of the batch \c x. * @param x batch of floating point values. * @return the inverse hyperbolic cosine of \c x. */ template batch_type_t acosh(const simd_base& x); /** * Computes the inverse hyperbolic tangent of the batch \c x. * @param x batch of floating point values. * @return the inverse hyperbolic tangent of \c x. */ template batch_type_t atanh(const simd_base& x); /*************************** * average implementation * ***************************/ namespace detail { /* origin: boost/simd/arch/common/simd/function/average.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template ::value> struct average_impl { static inline B compute(const B& x1, const B& x2) { return (x1 & x2) + ((x1 ^ x2) >> 1); } }; template struct average_impl { static inline B compute(const B& x1, const B& x2) { return fma(x1, B(0.5), x2 * B(0.5)); } }; } template inline batch_type_t average(const simd_base& x1, const simd_base& x2) { return detail::average_impl>::compute(x1(), x2()); } /*********************** * sinh implementation * ***********************/ namespace detail { /* origin: boost/simd/arch/common/detail/generic/sinh_kernel.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. 
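/* Illustrative aside (not from the original source): the integer branch of
 * average_impl above computes (x & y) + ((x ^ y) >> 1) rather than
 * (x + y) / 2.  Since x + y == 2*(x & y) + (x ^ y), the shared bits are
 * counted once and the differing bits contribute half each, so the sum can
 * never overflow.  A scalar check; the helper names are illustrative only. */
#include <cassert>
#include <cstdint>

inline std::uint32_t average_no_overflow(std::uint32_t x, std::uint32_t y)
{
    return (x & y) + ((x ^ y) >> 1);   // floor((x + y) / 2) without forming x + y
}

inline void average_no_overflow_check()
{
    std::uint32_t big = 0xFFFFFFF0u;
    assert(average_no_overflow(big, big) == big);  // x + y would wrap here
    assert(average_no_overflow(7u, 4u) == 5u);     // floor(11 / 2)
}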
* (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template struct sinh_kernel_impl; template struct sinh_kernel_impl { static inline B compute(const B& x) { B sqrx = x * x; return horner(sqrx) * x; } }; template struct sinh_kernel_impl { static inline B compute(const B& x) { B sqrx = x * x; return fma(x, (horner(sqrx) / horner1(sqrx)) * sqrx, x); } }; template struct sinh_kernel { static inline B compute(const B& a) { using b_type = B; b_type half = b_type(0.5); b_type x = abs(a); auto lt1 = x < b_type(1.); b_type bts = bitofsign(a); b_type z(0.); if (any(lt1)) { z = sinh_kernel_impl::compute(x); if (all(lt1)) return z ^ bts; } auto test1 = x >(maxlog() - log_2()); b_type fac = select(test1, half, b_type(1.)); b_type tmp = exp(x * fac); b_type tmp1 = half * tmp; b_type r = select(test1, tmp1 * tmp, tmp1 - half / tmp); return select(lt1, z, r) ^ bts; } }; } /* origin: boost/simd/arch/common/simd/function/sinh.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template inline batch_type_t sinh(const simd_base& a) { return detail::sinh_kernel>::compute(a()); } /*********************** * cosh implementation * ***********************/ /* origin: boost/simd/arch/common/simd/function/cosh.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ namespace detail { template struct cosh_kernel { static inline B compute(const B& a) { using b_type = B; b_type x = abs(a); auto test1 = x > (maxlog() - log_2()); b_type fac = select(test1, b_type(0.5), b_type(1.)); b_type tmp = exp(x * fac); b_type tmp1 = b_type(0.5) * tmp; return select(test1, tmp1 * tmp, average(tmp, b_type(1.) / tmp)); } }; } template inline batch_type_t cosh(const simd_base& a) { return detail::cosh_kernel>::compute(a()); } /*********************** * tanh implementation * ***********************/ namespace detail { /* origin: boost/simd/arch/common/detail/generic/tanh_kernel.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template struct tanh_kernel_impl; template struct tanh_kernel_impl { static inline B tanh(const B& x) { B sqrx = x * x; return fma(horner(sqrx) * sqrx, x, x); } static inline B cotanh(const B& x) { return B(1.) / tanh(x); } }; template struct tanh_kernel_impl { static inline B tanh(const B& x) { B sqrx = x * x; return fma(sqrx * p(sqrx) / q(sqrx), x, x); } static inline B cotanh(const B& x) { B sqrx = x * x; B qval = q(sqrx); return qval / (x * fma(p(sqrx), sqrx, qval)); } static inline B p(const B& x) { return horner(x); } static inline B q(const B& x) { return horner1(x); } }; template struct tanh_kernel { static inline B compute(const B& a) { using b_type = B; b_type one(1.); b_type x = abs(a); auto test = x < (b_type(5.) 
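/* Illustrative aside (not from the original source): cosh_kernel above guards
 * the top of the range -- once |x| exceeds maxlog - log(2), exp(x) would
 * overflow even though cosh(x) ~ exp(x)/2 is still representable, so it forms
 * t = exp(x/2) and returns (t/2)*t instead.  A scalar sketch of both branches;
 * the 709.78 cut-off below is approximately log(DBL_MAX). */
#include <cmath>

inline double cosh_sketch(double x)
{
    x = std::fabs(x);
    const double maxlog = 709.78;                 // ~log(DBL_MAX)
    const double ln2 = 0.6931471805599453;
    if (x > maxlog - ln2)
    {
        double t = std::exp(0.5 * x);
        return (0.5 * t) * t;                     // exp(x)/2 without forming exp(x)
    }
    double t = std::exp(x);
    return 0.5 * (t + 1.0 / t);                   // (e^x + e^-x) / 2
}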
/ b_type(8.)); b_type bts = bitofsign(a); b_type z = one; if (any(test)) { z = tanh_kernel_impl::tanh(x); if (all(test)) return z ^ bts; } b_type r = fma(b_type(-2.), one / (one + exp(x + x)), one); return select(test, z, r) ^ bts; } }; } /* origin: boost/simd/arch/common/simd/function/tanh.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template inline batch_type_t tanh(const simd_base& a) { return detail::tanh_kernel>::compute(a()); } /*********************** * sinh implementation * ***********************/ namespace detail { /* origin: boost/simd/arch/common/simd/function/asinh.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template struct asinh_kernel; template struct asinh_kernel { static inline B compute(const B& a) { B x = abs(a); auto lthalf = x < B(0.5); B x2 = x * x; B bts = bitofsign(a); B z(0.); if (any(lthalf)) { z = horner(x2) * x; if (all(lthalf)) return z ^ bts; } B tmp = select(x > oneosqrteps(), x, average(x, hypot(B(1.), x))); #ifndef XSIMD_NO_NANS return select(xsimd::isnan(a), nan(), select(lthalf, z, log(tmp) + log_2()) ^ bts); #else return select(lthalf, z, log(tmp) + log_2()) ^ bts; #endif } }; template struct asinh_kernel { static inline B compute(const B& a) { B x = abs(a); auto test = x > oneosqrteps(); B z = select(test, x - B(1.), x + x * x / (B(1.) + hypot(B(1.), x))); #ifndef XSIMD_NO_INFINITIES z = select(x == infinity(), x, z); #endif B l1pz = log1p(z); z = select(test, l1pz + log_2(), l1pz); return bitofsign(a) ^ z; } }; } template inline batch_type_t asinh(const simd_base& x) { return detail::asinh_kernel>::compute(x()); } /************************ * acosh implementation * ************************/ /* origin: boost/simd/arch/common/simd/function/acosh.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ namespace detail { template struct acosh_kernel { static inline B compute(const B& a) { using b_type = B; b_type x = a - b_type(1.); auto test = x > oneotwoeps(); b_type z = select(test, a, x + sqrt(x + x + x * x)); b_type l1pz = log1p(z); return select(test, l1pz + log_2(), l1pz); } }; } template inline batch_type_t acosh(const simd_base& a) { return detail::acosh_kernel>::compute(a()); } /************************ * atanh implementation * ************************/ /* origin: boost/simd/arch/common/simd/function/acosh.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ namespace detail { template struct atanh_kernel { static inline B compute(const B& a) { using b_type = B; b_type x = abs(a); b_type t = x + x; b_type z = b_type(1.) 
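/* Illustrative aside (not from the original source): for |x| >= 5/8 the
 * tanh_kernel above uses tanh(x) = 1 - 2/(1 + e^(2x)) on the absolute value
 * and XORs the sign bit back in (the `^ bts` above); smaller arguments go
 * through a rational polynomial.  A scalar sketch of the large-argument
 * branch, with copysign playing the role of the sign-bit XOR. */
#include <cmath>

inline double tanh_large_sketch(double a)
{
    double x = std::fabs(a);
    double r = std::fma(-2.0, 1.0 / (1.0 + std::exp(x + x)), 1.0);  // 1 - 2/(1 + e^(2x))
    return std::copysign(r, a);
}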
- x; auto test = x < b_type(0.5); b_type tmp = select(test, x, t) / z; return bitofsign(a) ^ (b_type(0.5) * log1p(select(test, fma(t, tmp, t), tmp))); } }; } template inline batch_type_t atanh(const simd_base& a) { return detail::atanh_kernel>::compute(a()); } } #endif xsimd-7.6.0/include/xsimd/math/xsimd_invtrigo.hpp000066400000000000000000000201761410101234500221230ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_INVTRIGO_HPP #define XSIMD_INVTRIGO_HPP #include "xsimd_fp_sign.hpp" #include "xsimd_horner.hpp" #include "xsimd_numerical_constant.hpp" namespace xsimd { namespace detail { template struct invtrigo_kernel_impl; /* origin: boost/simd/arch/common/detail/simd/f_invtrig.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template struct invtrigo_kernel_impl { static inline B asin(const B& a) { B x = abs(a); B sign = bitofsign(a); auto x_larger_05 = x > B(0.5); B z = select(x_larger_05, B(0.5) * (B(1.) - x), x * x); x = select(x_larger_05, sqrt(z), x); B z1 = horner(z); z1 = fma(z1, z * x, x); z = select(x_larger_05, pio2() - (z1 + z1), z1); return z ^ sign; } static inline B kernel_atan(const B& x, const B& recx) { const auto flag1 = x < tan3pio8(); const auto flag2 = (x >= B(detail::caster32_t(0x3ed413cd).f)) && flag1; B yy = select(flag1, B(0.), pio2()); yy = select(flag2, pio4(), yy); B xx = select(flag1, x, -recx); xx = select(flag2, (x - B(1.)) / (x + B(1.)), xx); const B z = xx * xx; B z1 = horner(z); z1 = fma(xx, z1 * z, xx); z1 = select(flag2, z1 + pio_4lo(), z1); z1 = select(!flag1, z1 + pio_2lo(), z1); return yy + z1; } }; /* origin: boost/simd/arch/common/detail/simd/d_invtrig.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template struct invtrigo_kernel_impl { static inline B asin(const B& a) { B x = abs(a); auto small_cond = x < sqrteps(); B ct1 = B(detail::caster64_t(int64_t(0x3fe4000000000000)).f); B zz1 = B(1.) 
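/* Illustrative aside (not from the original source): atanh_kernel above
 * rewrites atanh(x) = 0.5*log((1 + x)/(1 - x)) as 0.5*log1p(2x/(1 - x)) so
 * that small |x| keeps full precision, then restores the sign bit.  A scalar
 * sketch; atanh_sketch is an illustrative name. */
#include <cmath>

inline double atanh_sketch(double a)  // |a| < 1
{
    double x = std::fabs(a);
    double r = 0.5 * std::log1p((x + x) / (1.0 - x));
    return std::copysign(r, a);
}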
- x; B vp = zz1 * horner(zz1) / horner1(zz1); zz1 = sqrt(zz1 + zz1); B z = pio4() - zz1; zz1 = fms(zz1, vp, pio_2lo()); z = z - zz1; zz1 = z + pio4(); B zz2 = a * a; z = zz2 * horner(zz2) / horner1(zz2); zz2 = fma(x, z, x); return select(x > B(1.), nan(), select(small_cond, x, select(x > ct1, zz1, zz2)) ^ bitofsign(a)); } static inline B kernel_atan(const B& x, const B& recx) { const auto flag1 = x < tan3pio8(); const auto flag2 = (x >= tanpio8()) && flag1; B yy = select(flag1, B(0.), pio2()); yy = select(flag2, pio4(), yy); B xx = select(flag1, x, -recx); xx = select(flag2, (x - B(1.)) / (x + B(1.)), xx); B z = xx * xx; z *= horner(z) / horner1(z); z = fma(xx, z, xx); z = select(flag2, z + pio_4lo(), z); z = z + select(flag1, B(0.), pio_2lo()); return yy + z; } }; template struct invtrigo_kernel { static inline B asin(const B& a) { return invtrigo_kernel_impl::asin(a); } static inline B acos(const B& a) { B x = abs(a); auto x_larger_05 = x > B(0.5); x = select(x_larger_05, sqrt(fma(B(-0.5), x, B(0.5))), a); x = asin(x); x = select(x_larger_05, x + x, x); x = select(a < B(-0.5), pi() - x, x); return select(x_larger_05, x, pio2() - x); } static inline B atan(const B& a) { const B absa = abs(a); const B x = kernel_atan(absa, B(1.) / absa); return x ^ bitofsign(a); } static inline B acot(const B& a) { const B absa = abs(a); const B x = kernel_atan(B(1.) / absa, absa); return x ^ bitofsign(a); } static inline B atan2(const B& y, const B& x) { const B q = abs(y / x); const B z = kernel_atan(q, B(1.) / q); return select(x > B(0.), z, pi() - z) * signnz(y); } static inline B kernel_atan(const B& x, const B& recx) { return invtrigo_kernel_impl::kernel_atan(x, recx); } }; } } #endif xsimd-7.6.0/include/xsimd/math/xsimd_logarithm.hpp000066400000000000000000000543021410101234500222460ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_LOGARITHM_HPP #define XSIMD_LOGARITHM_HPP #include "xsimd_numerical_constant.hpp" namespace xsimd { /** * Computes the natural logarithm of the batch \c x. * @param x batch of floating point values. * @return the natural logarithm of \c x. */ template batch_type_t log(const simd_base& x); /** * Computes the base 2 logarithm of the batch \c x. * @param x batch of floating point values. * @return the base 2 logarithm of \c x. */ template batch_type_t log2(const simd_base& x); /** * Computes the base 10 logarithm of the batch \c x. * @param x batch of floating point values. * @return the base 10 logarithm of \c x. */ template batch_type_t log10(const simd_base& x); /** * Computes the natural logarithm of one plus the batch \c x. * @param x batch of floating point values. * @return the natural logarithm of one plus \c x. */ template batch_type_t log1p(const simd_base& x); /********************** * log implementation * **********************/ namespace detail { template struct log_kernel; template struct log_kernel { /* origin: boost/simd/arch/common/simd/function/log.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. 
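/* Illustrative aside (not from the original source): invtrigo_kernel::atan2
 * above reduces everything to one call of the absolute-value atan kernel and
 * fixes the quadrant afterwards: z = atan(|y/x|), kept as-is when x > 0 and
 * mapped to pi - z otherwise, then multiplied by the sign of y (signnz, so
 * that y = +/-0 still contributes +/-1).  A scalar sketch; atan2_sketch is an
 * illustrative name. */
#include <cmath>

inline double atan2_sketch(double y, double x)
{
    const double pi = 3.141592653589793;
    double z = std::atan(std::fabs(y / x));   // first-quadrant angle
    double r = (x > 0.0) ? z : pi - z;        // quadrant correction from the sign of x
    return std::copysign(1.0, y) * r;         // final sign follows y
}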
* (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ static inline B compute(const B& a) { using i_type = as_integer_t; B x = a; i_type k(0); auto isnez = (a != B(0.)); #ifndef XSIMD_NO_DENORMALS auto test = (a < smallestposval()) && isnez; if (any(test)) { k = select(bool_cast(test), k - i_type(23), k); x = select(test, x * B(8388608ul), x); } #endif i_type ix = bitwise_cast(x); ix += 0x3f800000 - 0x3f3504f3; k += (ix >> 23) - 0x7f; ix = (ix & i_type(0x007fffff)) + 0x3f3504f3; x = bitwise_cast(ix); B f = --x; B s = f / (B(2.) + f); B z = s * s; B w = z * z; B t1 = w * horner(w); B t2 = z * horner(w); B R = t2 + t1; B hfsq = B(0.5) * f * f; B dk = to_float(k); B r = fma(dk, log_2hi(), fma(s, (hfsq + R), dk * log_2lo()) - hfsq + f); #ifndef XSIMD_NO_INFINITIES B zz = select(isnez, select(a == infinity(), infinity(), r), minusinfinity()); #else B zz = select(isnez, r, minusinfinity()); #endif return select(!(a >= B(0.)), nan(), zz); } }; template struct log_kernel { /* origin: boost/simd/arch/common/simd/function/log.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ static inline B compute(const B& a) { using i_type = as_integer_t; B x = a; i_type hx = bitwise_cast(x) >> 32; i_type k = zero(); auto isnez = (a != B(0.)); #ifndef XSIMD_NO_DENORMALS auto test = (a < smallestposval()) && isnez; if (any(test)) { k = select(bool_cast(test), k - i_type(54), k); x = select(test, x * B(18014398509481984ull), x); } #endif hx += 0x3ff00000 - 0x3fe6a09e; k += (hx >> 20) - 0x3ff; B dk = to_float(k); hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e; x = bitwise_cast(hx << 32 | (i_type(0xffffffff) & bitwise_cast(x))); B f = --x; B hfsq = B(0.5) * f * f; B s = f / (B(2.) + f); B z = s * s; B w = z * z; B t1 = w * horner(w); B t2 = z * horner(w); B R = t2 + t1; B r = fma(dk, log_2hi(), fma(s, (hfsq + R), dk * log_2lo()) - hfsq + f); #ifndef XSIMD_NO_INFINITIES B zz = select(isnez, select(a == infinity(), infinity(), r), minusinfinity()); #else B zz = select(isnez, r, minusinfinity()); #endif return select(!(a >= B(0.)), nan(), zz); } }; } template inline batch_type_t log(const simd_base& x) { return detail::log_kernel>::compute(x()); } /*********************** * log2 implementation * ***********************/ namespace detail { template struct log2_kernel; template struct log2_kernel { /* origin: FreeBSD /usr/src/lib/msun/src/e_log2f.c */ /* * ==================================================== * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. * * Developed at SunPro, a Sun Microsystems, Inc. business. * Permission to use, copy, modify, and distribute this * software is freely granted, provided that this notice * is preserved. * ==================================================== */ static inline B compute(const B& a) { using i_type = as_integer_t; B x = a; i_type k(0); auto isnez = (a != B(0.)); #ifndef XSIMD_NO_DENORMALS auto test = (a < smallestposval()) && isnez; if (any(test)) { k = select(bool_cast(test), k - i_type(25), k); x = select(test, x * B(33554432ul), x); } #endif i_type ix = bitwise_cast(x); ix += 0x3f800000 - 0x3f3504f3; k += (ix >> 23) - 0x7f; ix = (ix & i_type(0x007fffff)) + 0x3f3504f3; x = bitwise_cast(ix); B f = --x; B s = f / (B(2.) 
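/* Illustrative aside (not from the original source): log_kernel above adjusts
 * the exponent bits so the significand m lands in [sqrt(2)/2, sqrt(2)), then
 * evaluates log(a) = k*log(2) + log(m) with f = m - 1 small and a series in
 * s = f/(2 + f).  A scalar sketch of the same reduction, using std::frexp for
 * the exponent split and std::log1p for the residual. */
#include <cmath>

inline double log_reduced_sketch(double a)   // a > 0
{
    int k;
    double m = std::frexp(a, &k);            // a = m * 2^k, m in [0.5, 1)
    if (m < 0.7071067811865476)              // push m into [sqrt(2)/2, sqrt(2))
    {
        m *= 2.0;
        k -= 1;
    }
    double f = m - 1.0;                      // |f| <= sqrt(2) - 1
    return k * 0.6931471805599453 + std::log1p(f);
}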
+ f); B z = s * s; B w = z * z; B t1 = w * horner(w); B t2 = z * horner(w); B R = t1 + t2; B hfsq = B(0.5) * f * f; B dk = to_float(k); B r = fma(fms(s, hfsq + R, hfsq) + f, invlog_2(), dk); #ifndef XSIMD_NO_INFINITIES B zz = select(isnez, select(a == infinity(), infinity(), r), minusinfinity()); #else B zz = select(isnez, r, minusinfinity()); #endif return select(!(a >= B(0.)), nan(), zz); } }; template struct log2_kernel { /* origin: FreeBSD /usr/src/lib/msun/src/e_log2f.c */ /* * ==================================================== * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. * * Developed at SunPro, a Sun Microsystems, Inc. business. * Permission to use, copy, modify, and distribute this * software is freely granted, provided that this notice * is preserved. * ==================================================== */ static inline B compute(const B& a) { using i_type = as_integer_t; B x = a; i_type hx = bitwise_cast(x) >> 32; i_type k = zero(); auto isnez = (a != B(0.)); #ifndef XSIMD_NO_DENORMALS auto test = (a < smallestposval()) && isnez; if (any(test)) { k = select(bool_cast(test), k - i_type(54), k); x = select(test, x * B(18014398509481984ull), x); } #endif hx += 0x3ff00000 - 0x3fe6a09e; k += (hx >> 20) - 0x3ff; hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e; x = bitwise_cast(hx << 32 | (i_type(0xffffffff) & bitwise_cast(x))); B f = --x; B s = f / (B(2.) + f); B z = s * s; B w = z * z; B t1 = w * horner(w); B t2 = z * horner(w); B R = t2 + t1; B hfsq = B(0.5) * f * f; B hi = f - hfsq; hi = hi & bitwise_cast((allbits() << 32)); B lo = fma(s, hfsq + R, f - hi - hfsq); B val_hi = hi * invlog_2hi(); B val_lo = fma(lo + hi, invlog_2lo(), lo * invlog_2hi()); B dk = to_float(k); B w1 = dk + val_hi; val_lo += (dk - w1) + val_hi; val_hi = w1; B r = val_lo + val_hi; #ifndef XSIMD_NO_INFINITIES B zz = select(isnez, select(a == infinity(), infinity(), r), minusinfinity()); #else B zz = select(isnez, r, minusinfinity()); #endif return select(!(a >= B(0.)), nan(), zz); } }; } template inline batch_type_t log2(const simd_base& x) { return detail::log2_kernel>::compute(x()); } /************************ * log10 implementation * ************************/ namespace detail { template struct log10_kernel; template struct log10_kernel { /* origin: FreeBSD /usr/src/lib/msun/src/e_log10f.c */ /* * ==================================================== * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. * * Developed at SunPro, a Sun Microsystems, Inc. business. * Permission to use, copy, modify, and distribute this * software is freely granted, provided that this notice * is preserved. * ==================================================== */ static inline B compute(const B& a) { const B ivln10hi(4.3432617188e-01f), ivln10lo(-3.1689971365e-05f), log10_2hi(3.0102920532e-01f), log10_2lo(7.9034151668e-07f); using i_type = as_integer_t; B x = a; i_type k(0); auto isnez = (a != B(0.)); #ifndef XSIMD_NO_DENORMALS auto test = (a < smallestposval()) && isnez; if (any(test)) { k = select(bool_cast(test), k - i_type(25), k); x = select(test, x * B(33554432ul), x); } #endif i_type ix = bitwise_cast(x); ix += 0x3f800000 - 0x3f3504f3; k += (ix >> 23) - 0x7f; ix = (ix & i_type(0x007fffff)) + 0x3f3504f3; x = bitwise_cast(ix); B f = --x; B s = f / (B(2.) 
+ f); B z = s * s; B w = z * z; B t1 = w * horner(w); B t2 = z * horner(w); B R = t2 + t1; B dk = to_float(k); B hfsq = B(0.5) * f * f; B hibits = f - hfsq; hibits &= bitwise_cast(i_type(0xfffff000)); B lobits = fma(s, hfsq + R, f - hibits - hfsq); B r = fma(dk, log10_2hi, fma(hibits, ivln10hi, fma(lobits, ivln10hi, fma(lobits + hibits, ivln10lo, dk * log10_2lo)))); #ifndef XSIMD_NO_INFINITIES B zz = select(isnez, select(a == infinity(), infinity(), r), minusinfinity()); #else B zz = select(isnez, r, minusinfinity()); #endif return select(!(a >= B(0.)), nan(), zz); } }; template struct log10_kernel { /* origin: FreeBSD /usr/src/lib/msun/src/e_log10f.c */ /* * ==================================================== * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. * * Developed at SunPro, a Sun Microsystems, Inc. business. * Permission to use, copy, modify, and distribute this * software is freely granted, provided that this notice * is preserved. * ==================================================== */ static inline B compute(const B& a) { const B ivln10hi(4.34294481878168880939e-01), ivln10lo(2.50829467116452752298e-11), log10_2hi(3.01029995663611771306e-01), log10_2lo(3.69423907715893078616e-13); using i_type = as_integer_t; B x = a; i_type hx = bitwise_cast(x) >> 32; i_type k = zero(); auto isnez = (a != B(0.)); #ifndef XSIMD_NO_DENORMALS auto test = (a < smallestposval()) && isnez; if (any(test)) { k = select(bool_cast(test), k - i_type(54), k); x = select(test, x * B(18014398509481984ull), x); } #endif hx += 0x3ff00000 - 0x3fe6a09e; k += (hx >> 20) - 0x3ff; hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e; x = bitwise_cast(hx << 32 | (i_type(0xffffffff) & bitwise_cast(x))); B f = --x; B dk = to_float(k); B s = f / (B(2.) + f); B z = s * s; B w = z * z; B t1 = w * horner(w); B t2 = z * horner(w); B R = t2 + t1; B hfsq = B(0.5) * f * f; B hi = f - hfsq; hi = hi & bitwise_cast(allbits() << 32); B lo = f - hi - hfsq + s * (hfsq + R); B val_hi = hi * ivln10hi; B y = dk * log10_2hi; B val_lo = dk * log10_2lo + (lo + hi) * ivln10lo + lo * ivln10hi; B w1 = y + val_hi; val_lo += (y - w1) + val_hi; val_hi = w1; B r = val_lo + val_hi; #ifndef XSIMD_NO_INFINITIES B zz = select(isnez, select(a == infinity(), infinity(), r), minusinfinity()); #else B zz = select(isnez, r, minusinfinity()); #endif return select(!(a >= B(0.)), nan(), zz); } }; } template inline batch_type_t log10(const simd_base& x) { return detail::log10_kernel>::compute(x()); } /************************ * log1p implementation * ************************/ namespace detail { template struct log1p_kernel; template struct log1p_kernel { /* origin: boost/simd/arch/common/simd/function/log1p.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ static inline B compute(const B& a) { using i_type = as_integer_t; const B uf = a + B(1.); auto isnez = (uf != B(0.)); i_type iu = bitwise_cast(uf); iu += 0x3f800000 - 0x3f3504f3; i_type k = (iu >> 23) - 0x7f; iu = (iu & i_type(0x007fffff)) + 0x3f3504f3; B f = --(bitwise_cast(iu)); B s = f / (B(2.) + f); B z = s * s; B w = z * z; B t1 = w * horner(w); B t2 = z * horner(w); B R = t2 + t1; B hfsq = B(0.5) * f * f; B dk = to_float(k); /* correction term ~ log(1+x)-log(u), avoid underflow in c/u */ B c = select(bool_cast(k >= i_type(2)), B(1.) 
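/* Illustrative aside (not from the original source): log10_kernel above keeps
 * 1/ln(10) and log10(2) as hi/lo pairs (ivln10hi/ivln10lo, log10_2hi/log10_2lo,
 * the constants visible just above) and also splits its reduced argument into
 * hi and lo words, so the dominant products round exactly and only the small
 * cross terms see rounding.  The sketch below only shows the simplest use of
 * such a pair: the two doubles together carry 1/ln(10) to roughly twice
 * working precision. */
#include <cmath>

inline double times_inv_ln10(double x)
{
    const double ivln10_hi = 4.34294481878168880939e-01;  // leading bits of 1/ln(10)
    const double ivln10_lo = 2.50829467116452752298e-11;  // remainder: 1/ln(10) - ivln10_hi
    return std::fma(x, ivln10_lo, x * ivln10_hi);          // x/ln(10) via the split constant
}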
- (uf - a), a - (uf - B(1.))) / uf; B r = fma(dk, log_2hi(), fma(s, (hfsq + R), dk * log_2lo() + c) - hfsq + f); #ifndef XSIMD_NO_INFINITIES B zz = select(isnez, select(a == infinity(), infinity(), r), minusinfinity()); #else B zz = select(isnez, r, minusinfinity()); #endif return select(!(uf >= B(0.)), nan(), zz); } }; template struct log1p_kernel { /* origin: boost/simd/arch/common/simd/function/log1p.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ static inline B compute(const B& a) { using i_type = as_integer_t; const B uf = a + B(1.); auto isnez = (uf != B(0.)); i_type hu = bitwise_cast(uf) >> 32; hu += 0x3ff00000 - 0x3fe6a09e; i_type k = (hu >> 20) - 0x3ff; /* correction term ~ log(1+x)-log(u), avoid underflow in c/u */ B c = select(bool_cast(k >= i_type(2)), B(1.) - (uf - a), a - (uf - B(1.))) / uf; hu = (hu & i_type(0x000fffff)) + 0x3fe6a09e; B f = bitwise_cast((hu << 32) | (i_type(0xffffffff) & bitwise_cast(uf))); f = --f; B hfsq = B(0.5) * f * f; B s = f / (B(2.) + f); B z = s * s; B w = z * z; B t1 = w * horner(w); B t2 = z * horner(w); B R = t2 + t1; B dk = to_float(k); B r = fma(dk, log_2hi(), fma(s, hfsq + R, dk * log_2lo() + c) - hfsq + f); #ifndef XSIMD_NO_INFINITIES B zz = select(isnez, select(a == infinity(), infinity(), r), minusinfinity()); #else B zz = select(isnez, r, minusinfinity()); #endif return select(!(uf >= B(0.)), nan(), zz); } }; } template inline batch_type_t log1p(const simd_base& x) { return detail::log1p_kernel>::compute(x()); } } #endif xsimd-7.6.0/include/xsimd/math/xsimd_math.hpp000066400000000000000000000021151410101234500212040ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_MATH_HPP #define XSIMD_MATH_HPP #include "xsimd_basic_math.hpp" #include "xsimd_error.hpp" #include "xsimd_exponential.hpp" #include "xsimd_fp_manipulation.hpp" #include "xsimd_gamma.hpp" #include "xsimd_hyperbolic.hpp" #include "xsimd_logarithm.hpp" #include "xsimd_power.hpp" #include "xsimd_rounding.hpp" #include "xsimd_trigonometric.hpp" #include "xsimd/types/xsimd_scalar.hpp" #endif xsimd-7.6.0/include/xsimd/math/xsimd_math_complex.hpp000066400000000000000000000751271410101234500227500ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
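/* Illustrative aside (not from the original source): the log1p kernels above
 * add a correction c ~ log(1 + a) - log(u) (their own comment) because
 * u = 1 + a is rounded and log(u) alone loses the low bits of a.  The classic
 * scalar formulation below compensates for the same rounding differently, by
 * rescaling log(u) with the ratio of the exact increment x to the rounded
 * increment u - 1; log1p_sketch is an illustrative name. */
#include <cmath>

inline double log1p_sketch(double x)  // x > -1
{
    double u = 1.0 + x;
    if (u == 1.0)
        return x;                         // x below half an ulp of 1: log(1 + x) ~ x
    return std::log(u) * x / (u - 1.0);   // repair the rounding committed in u
}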
* ****************************************************************************/ #ifndef XSIMD_MATH_COMPLEX_HPP #define XSIMD_MATH_COMPLEX_HPP #include "../types/xsimd_complex_base.hpp" #include "xsimd_basic_math.hpp" #include "xsimd_exponential.hpp" #include "xsimd_hyperbolic.hpp" #include "xsimd_logarithm.hpp" #include "xsimd_power.hpp" #include "xsimd_trigonometric.hpp" namespace xsimd { template real_batch_type_t real(const simd_base& z); template real_batch_type_t imag(const simd_base& z); template real_batch_type_t arg(const simd_base& z); template batch_type_t conj(const simd_base& z); template real_batch_type_t norm(const simd_base& rhs); template batch_type_t proj(const simd_base& rhs); namespace detail { /******** * sign * ********/ template inline batch sign_complex_impl(const batch& z) { using b_type = batch; using r_type = typename b_type::real_batch; auto rz = z.real(); auto iz = z.imag(); return select(rz != r_type(0.), b_type(sign(rz)), b_type(sign(iz))); } template struct sign_impl, N>, false> { using batch_type = batch, N>; static inline batch_type compute(const batch_type& z) { return sign_complex_impl(z); } }; #ifdef XSIMD_ENABLE_XTL_COMPLEX template struct sign_impl, N>, false> { using batch_type = batch, N>; static inline batch_type compute(const batch_type& z) { return sign_complex_impl(z); } }; #endif /******* * exp * *******/ template inline batch exp_complex_impl(const batch& z) { using b_type = batch; using r_type = typename b_type::real_batch; r_type icos, isin; sincos(z.imag(), isin, icos); return exp(z.real()) * batch(icos, isin); } template struct exp_kernel, N>, exp_tag, std::complex> { using batch_type = batch, N>; static inline batch_type compute(const batch_type& z) { return exp_complex_impl(z); } }; #ifdef XSIMD_ENABLE_XTL_COMPLEX template struct exp_kernel, N>, exp_tag, xtl::xcomplex> { using batch_type = batch, N>; static inline batch_type compute(const batch_type& z) { return exp_complex_impl(z); } }; #endif /********* * expm1 * *********/ template inline batch expm1_complex_impl(const batch& z) { using b_type = batch; using r_type = typename b_type::real_batch; r_type isin = sin(z.imag()); r_type rem1 = expm1(z.real()); r_type re = rem1 + r_type(1.); r_type si = sin(z.imag() * r_type(0.5)); return batch(rem1 - r_type(2.) 
* re * si * si, re * isin); } template struct expm1_kernel, N>, std::complex> { using batch_type = batch, N>; static inline batch_type compute(const batch_type& z) { return expm1_complex_impl(z); } }; #ifdef XSIMD_ENABLE_XTL_COMPLEX template struct expm1_kernel, N>, xtl::xcomplex> { using batch_type = batch, N>; static inline batch_type compute(const batch_type& z) { return expm1_complex_impl(z); } }; #endif /******* * log * *******/ template inline batch log_complex_impl(const batch& z) { return batch(log(abs(z)), atan2(z.imag(), z.real())); } template struct log_kernel, N>, std::complex> { using batch_type = batch, N>; static inline batch_type compute(const batch_type& z) { return log_complex_impl(z); } }; #ifdef XSIMD_ENABLE_XTL_COMPLEX template struct log_kernel, N>, xtl::xcomplex> { using batch_type = batch, N>; static inline batch_type compute(const batch_type& z) { return log_complex_impl(z); } }; #endif /********************* * logN_complex_impl * *********************/ template inline batch logN_complex_impl(const batch& z, typename batch::real_value_type base) { using b_type = batch; using rv_type = typename b_type::real_value_type; return log(z) / b_type(rv_type(base)); } /******** * log2 * ********/ template struct log2_kernel, N>, std::complex> { using batch_type = batch, N>; static inline batch_type compute(const batch_type& z) { return logN_complex_impl(z, std::log(2)); } }; #ifdef XSIMD_ENABLE_XTL_COMPLEX template struct log2_kernel, N>, xtl::xcomplex> { using batch_type = batch, N>; static inline batch_type compute(const batch_type& z) { return logN_complex_impl(z, std::log(2)); } }; #endif /********* * log10 * *********/ template struct log10_kernel, N>, std::complex> { using batch_type = batch, N>; static inline batch_type compute(const batch_type& z) { return logN_complex_impl(z, std::log(10)); } }; #ifdef XSIMD_ENABLE_XTL_COMPLEX template struct log10_kernel, N>, xtl::xcomplex> { using batch_type = batch, N>; static inline batch_type compute(const batch_type& z) { return logN_complex_impl(z, std::log(10)); } }; #endif /********* * log1p * *********/ template inline batch log1p_complex_impl(const batch& z) { using b_type = batch; using r_type = typename b_type::real_batch; b_type u = b_type(1.) 
+ z; b_type logu = log(u); return select(u == b_type(1.), z, select(u.real() <= r_type(0.), logu, logu * z / (u - b_type(1.)))); } template struct log1p_kernel, N>, std::complex> { using batch_type = batch, N>; static inline batch_type compute(const batch_type& z) { return log1p_complex_impl(z); } }; #ifdef XSIMD_ENABLE_XTL_COMPLEX template struct log1p_kernel, N>, xtl::xcomplex> { using batch_type = batch, N>; static inline batch_type compute(const batch_type& z) { return log1p_complex_impl(z); } }; #endif /******* * pow * *******/ template inline batch pow_complex_impl(const batch& a, const batch& z) { using cplx_batch = batch; using real_batch = typename cplx_batch::real_batch; real_batch absa = abs(a); real_batch arga = arg(a); real_batch x = z.real(); real_batch y = z.imag(); real_batch r = pow(absa, x); real_batch theta = x * arga; real_batch ze = zero(); auto cond = (y == ze); r = select(cond, r, r * exp(-y * arga)); theta = select(cond, theta, theta + y * log(absa)); return select(absa == ze, cplx_batch(ze), cplx_batch(r * cos(theta), r * sin(theta))); } template struct pow_kernel, N>> { using batch_type = batch, N>; static inline batch_type compute(const batch_type& a, const batch_type& z) { return pow_complex_impl(a, z); } }; #ifdef XSIMD_ENABLE_XTL_COMPLEX template struct pow_kernel, N>> { using batch_type = batch, N>; static inline batch_type compute(const batch_type& a, const batch_type& z) { return pow_complex_impl(a, z); } }; #endif /********* * trigo * *********/ template inline void sincos_complex_impl(const batch& z, batch& si, batch& co) { using b_type = batch; using r_type = typename b_type::real_batch; r_type rcos = cos(z.real()); r_type rsin = sin(z.real()); r_type icosh = cosh(z.imag()); r_type isinh = sinh(z.imag()); si = b_type(rsin * icosh, rcos * isinh); co = b_type(rcos * icosh, -rsin * isinh); } template inline batch sin_complex_impl(const batch& z) { return batch(sin(z.real()) * cosh(z.imag()), cos(z.real()) * sinh(z.imag())); } template inline batch cos_complex_impl(const batch& z) { return batch(cos(z.real()) * cosh(z.imag()), -sin(z.real()) * sinh(z.imag())); } template inline batch tan_complex_impl(const batch& z) { using b_type = batch; using r_type = typename b_type::real_batch; r_type d = cos(2 * z.real()) + cosh(2 * z.imag()); b_type winf(infinity(), infinity()); r_type wreal = sin(2 * z.real()) / d; r_type wimag = sinh(2 * z.imag()); b_type wres = select(xsimd::isinf(wimag), b_type(wreal, r_type(1.)), b_type(wreal, wimag / d)); return select(d == r_type(0.), winf, wres); } template struct trigo_kernel, N>> { using batch_type = batch, N>; static inline batch_type sin(const batch_type& z) { return sin_complex_impl(z); } static inline batch_type cos(const batch_type& z) { return cos_complex_impl(z); } static inline batch_type tan(const batch_type& z) { return tan_complex_impl(z); } static inline void sincos(const batch_type& z, batch_type& si, batch_type& co) { return sincos_complex_impl(z, si, co); } }; #ifdef XSIMD_ENABLE_XTL_COMPLEX template struct trigo_kernel, N>> { using batch_type = batch, N>; static inline batch_type sin(const batch_type& z) { return sin_complex_impl(z); } static inline batch_type cos(const batch_type& z) { return cos_complex_impl(z); } static inline batch_type tan(const batch_type& z) { return tan_complex_impl(z); } static inline void sincos(const batch_type& z, batch_type& si, batch_type& co) { return sincos_complex_impl(z, si, co); } }; #endif /************ * invtrigo * ************/ template batch asin_complex_impl(const 
batch& z) { using b_type = batch; using r_type = typename b_type::real_batch; r_type x = z.real(); r_type y = z.imag(); b_type ct(-y, x); b_type zz(r_type(1.) - (x - y) * (x + y), -2 * x * y); zz = log(ct + sqrt(zz)); b_type resg(zz.imag(), -zz.real()); return select(y == r_type(0.), select(fabs(x) > r_type(1.), b_type(pio2(), r_type(0.)), b_type(asin(x), r_type(0.))), resg); } template batch acos_complex_impl(const batch& z) { using b_type = batch; using r_type = typename b_type::real_batch; b_type tmp = asin_complex_impl(z); return b_type(pio2() - tmp.real(), -tmp.imag()); } template batch atan_complex_impl(const batch& z) { using b_type = batch; using r_type = typename b_type::real_batch; r_type x = z.real(); r_type y = z.imag(); r_type x2 = x * x; r_type one = r_type(1.); r_type a = one - x2 - (y * y); r_type w = 0.5 * atan2(2. * x, a); r_type num = y + one; num = x2 + num * num; r_type den = y - one; den = x2 + den * den; b_type res = select((x == r_type(0.)) && (y == r_type(1.)), b_type(r_type(0.), infinity()), b_type(w, 0.25 * log(num / den))); return res; } template struct invtrigo_kernel, N>> { using batch_type = batch, N>; static inline batch_type asin(const batch_type& z) { return asin_complex_impl(z); } static inline batch_type acos(const batch_type& z) { return acos_complex_impl(z); } static inline batch_type atan(const batch_type& z) { return atan_complex_impl(z); } }; #ifdef XSIMD_ENABLE_XTL_COMPLEX template struct invtrigo_kernel, N>> { using batch_type = batch, N>; static inline batch_type asin(const batch_type& z) { return asin_complex_impl(z); } static inline batch_type acos(const batch_type& z) { return acos_complex_impl(z); } static inline batch_type atan(const batch_type& z) { return atan_complex_impl(z); } }; #endif /******** * sinh * ********/ template inline batch sinh_complex_impl(const batch& z) { auto x = z.real(); auto y = z.imag(); return batch(sinh(x) * cos(y), cosh(x) * sin(y)); } template struct sinh_kernel, N>> { using batch_type = batch, N>; static inline batch_type compute(const batch_type& z) { return sinh_complex_impl(z); } }; #ifdef XSIMD_ENABLE_XTL_COMPLEX template struct sinh_kernel, N>> { using batch_type = batch, N>; static inline batch_type compute(const batch_type& z) { return sinh_complex_impl(z); } }; #endif /******** * cosh * ********/ template inline batch cosh_complex_impl(const batch& z) { auto x = z.real(); auto y = z.imag(); return batch(cosh(x) * cos(y), sinh(x) * sin(y)); } template struct cosh_kernel, N >> { using batch_type = batch, N>; static inline batch_type compute(const batch_type& z) { return cosh_complex_impl(z); } }; #ifdef XSIMD_ENABLE_XTL_COMPLEX template struct cosh_kernel, N>> { using batch_type = batch, N>; static inline batch_type compute(const batch_type& z) { return cosh_complex_impl(z); } }; #endif /******** * tanh * ********/ template inline batch tanh_complex_impl(const batch& z) { using rvt = typename batch::real_value_type; using real_batch = typename batch::real_batch; auto x = z.real(); auto y = z.imag(); real_batch two = real_batch(rvt(2)); auto d = cosh(two * x) + cos(two * y); return batch(sinh(two * x) / d, sin(two * y) / d); } template struct tanh_kernel, N >> { using batch_type = batch, N>; static inline batch_type compute(const batch_type& z) { return tanh_complex_impl(z); } }; #ifdef XSIMD_ENABLE_XTL_COMPLEX template struct tanh_kernel, N>> { using batch_type = batch, N>; static inline batch_type compute(const batch_type& z) { return tanh_complex_impl(z); } }; #endif /********* * asinh * *********/ 
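// --------------------------------------------------------------------------
// Illustrative usage sketch, not part of the original xsimd sources.  It
// shows what the complex kernels above provide through the public free
// functions.  Assumed context: a translation unit that includes <complex>
// and "xsimd/xsimd.hpp", built for an SSE-class target so that a width-4
// std::complex<float> batch exists; adjust the width for other targets.
// --------------------------------------------------------------------------
inline void xsimd_complex_math_demo()
{
    using cbatch = xsimd::batch<std::complex<float>, 4>;
    cbatch z(std::complex<float>(0.5f, -1.0f));   // broadcast a single value

    cbatch e = xsimd::exp(z);      // exp_kernel: exp(re) * (cos(im) + i*sin(im))
    cbatch l = xsimd::log(z);      // log_kernel: log(abs(z)) + i*atan2(im, re)
    cbatch p = xsimd::pow(z, z);   // pow_kernel via pow_complex_impl
    cbatch s = xsimd::sin(z);      // trigo_kernel::sin
    auto   m = xsimd::abs(z);      // real batch of magnitudes: hypot(re, im)
    (void)e; (void)l; (void)p; (void)s; (void)m;
}
// (the asinh / acosh / atanh kernels follow below)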
template inline batch asinh_complex_impl(const batch& z) { using b_type = batch; b_type w = asin(b_type(-z.imag(), z.real())); w = b_type(w.imag(), -w.real()); return w; } template struct asinh_kernel, N >> { using batch_type = batch, N>; static inline batch_type compute(const batch_type& z) { return asinh_complex_impl(z); } }; #ifdef XSIMD_ENABLE_XTL_COMPLEX template struct asinh_kernel, N>> { using batch_type = batch, N>; static inline batch_type compute(const batch_type& z) { return asinh_complex_impl(z); } }; #endif /********* * acosh * *********/ template inline batch acosh_complex_impl(const batch& z) { using b_type = batch; b_type w = acos(z); w = b_type(-w.imag(), w.real()); return w; } template struct acosh_kernel, N >> { using batch_type = batch, N>; static inline batch_type compute(const batch_type& z) { return acosh_complex_impl(z); } }; #ifdef XSIMD_ENABLE_XTL_COMPLEX template struct acosh_kernel, N>> { using batch_type = batch, N>; static inline batch_type compute(const batch_type& z) { return acosh_complex_impl(z); } }; #endif /********* * atanh * *********/ template inline batch atanh_complex_impl(const batch& z) { using b_type = batch; b_type w = atan(b_type(-z.imag(), z.real())); w = b_type(w.imag(), -w.real()); return w; } template struct atanh_kernel, N >> { using batch_type = batch, N>; static inline batch_type compute(const batch_type& z) { return atanh_complex_impl(z); } }; #ifdef XSIMD_ENABLE_XTL_COMPLEX template struct atanh_kernel, N>> { using batch_type = batch, N>; static inline batch_type compute(const batch_type& z) { return atanh_complex_impl(z); } }; #endif } namespace detail { template T csqrt_scale_factor() noexcept; template T csqrt_scale() noexcept; template <> inline float csqrt_scale_factor() noexcept { return 6.7108864e7f; } template <> inline float csqrt_scale() noexcept { return 1.220703125e-4f; } template <> inline double csqrt_scale_factor() noexcept { return 1.8014398509481984e16; } template <> inline double csqrt_scale() noexcept { return 7.450580596923828125e-9; } } namespace detail { template struct complex_batch_kernel { using batch_type = batch; using batch_bool_type = typename simd_batch_traits::batch_bool_type; using real_batch = typename batch_type::real_batch; static real_batch abs(const batch_type& z) { return hypot(z.real(), z.imag()); } static batch_type fma(const batch_type& a, const batch_type& b, const batch_type& c) { real_batch res_r = ::xsimd::fms(a.real(), b.real(), ::xsimd::fms(a.imag(), b.imag(), c.real())); real_batch res_i = ::xsimd::fma(a.real(), b.imag(), ::xsimd::fma(a.imag(), b.real(), c.imag())); return {res_r, res_i}; } static batch_type fms(const batch_type& a, const batch_type& b, const batch_type& c) { real_batch res_r = ::xsimd::fms(a.real(), b.real(), ::xsimd::fma(a.imag(), b.imag(), c.real())); real_batch res_i = ::xsimd::fma(a.real(), b.imag(), ::xsimd::fms(a.imag(), b.real(), c.imag())); return {res_r, res_i}; } static batch_type fnma(const batch_type& a, const batch_type& b, const batch_type& c) { real_batch res_r = - ::xsimd::fms(a.real(), b.real(), ::xsimd::fma(a.imag(), b.imag(), c.real())); real_batch res_i = - ::xsimd::fma(a.real(), b.imag(), ::xsimd::fms(a.imag(), b.real(), c.imag())); return {res_r, res_i}; } static batch_type fnms(const batch_type& a, const batch_type& b, const batch_type& c) { real_batch res_r = - ::xsimd::fms(a.real(), b.real(), ::xsimd::fms(a.imag(), b.imag(), c.real())); real_batch res_i = - ::xsimd::fma(a.real(), b.imag(), ::xsimd::fma(a.imag(), b.real(), c.imag())); return 
{res_r, res_i}; } static batch_type sqrt(const batch_type& z) { using rvt = typename real_batch::value_type; real_batch x = z.real(); real_batch y = z.imag(); real_batch sqrt_x = xsimd::sqrt(fabs(x)); real_batch sqrt_hy = xsimd::sqrt(0.5 * fabs(y)); auto cond = (fabs(x) > real_batch(4.) || fabs(y) > real_batch(4.)); x = select(cond, x * 0.25, x * detail::csqrt_scale_factor()); y = select(cond, y * 0.25, y * detail::csqrt_scale_factor()); real_batch scale = select(cond, real_batch(2.), real_batch(detail::csqrt_scale())); real_batch r = abs(batch_type(x, y)); auto condxp = x > real_batch(0.); real_batch t0 = select(condxp, xsimd::sqrt(0.5 * (r + x)), xsimd::sqrt(0.5 * (r - x))); real_batch r0 = scale * fabs((0.5 * y) / t0); t0 *= scale; real_batch t = select(condxp, t0, r0); r = select(condxp, r0, t0); batch_type resg = select(y < real_batch(0.), batch_type(t, -r), batch_type(t, r)); real_batch ze(0.); return select(y == ze, select(x == ze, batch_type(ze, ze), select(x < ze, batch_type(ze, sqrt_x), batch_type(sqrt_x, ze))), select(x == ze, select(y > ze, batch_type(sqrt_hy, sqrt_hy), batch_type(sqrt_hy, -sqrt_hy)), resg)); } static batch_bool_type isnan(const batch_type& z) { return batch_bool_type(xsimd::isnan(z.real()) || xsimd::isnan(z.imag())); } }; template struct batch_kernel, N> : public complex_batch_kernel, N> { }; #ifdef XSIMD_ENABLE_XTL_COMPLEX template struct batch_kernel, N> : public complex_batch_kernel, N> { }; #endif } namespace detail { template struct real_imag_kernel { using return_type = typename B::real_batch; static return_type real(const B& z) { return z.real(); } static return_type imag(const B& z) { return z.imag(); } }; template struct real_imag_kernel { using return_type = B; static return_type real(const B& z) { return z; } static return_type imag(const B&) { return B(typename B::value_type(0)); } }; } template inline real_batch_type_t real(const simd_base& z) { using batch_type = batch_type_t; constexpr bool is_cplx = detail::is_complex::value; return detail::real_imag_kernel::real(z()); } template inline real_batch_type_t imag(const simd_base& z) { using batch_type = batch_type_t; constexpr bool is_cplx = detail::is_complex::value; return detail::real_imag_kernel::imag(z()); } template inline real_batch_type_t arg(const simd_base& z) { return atan2(imag(z), real(z)); } template inline batch_type_t conj(const simd_base& z) { return X(z().real(), -z().imag()); } template inline real_batch_type_t norm(const simd_base& rhs) { return real(rhs) * real(rhs) + imag(rhs) * imag(rhs); } template inline batch_type_t proj(const simd_base& rhs) { using batch_type = batch_type_t; using real_batch = typename simd_batch_traits::real_batch; auto cond = xsimd::isinf(rhs().real()) || xsimd::isinf(rhs().imag()); return select(cond, batch_type(infinity(), copysign(real_batch(0.), rhs().imag())), rhs()); } } #endif xsimd-7.6.0/include/xsimd/math/xsimd_numerical_constant.hpp000066400000000000000000000277211410101234500241550ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_NUMERICAL_CONSTANT_HPP #define XSIMD_NUMERICAL_CONSTANT_HPP #include #include "../types/xsimd_types_include.hpp" namespace xsimd { #define XSIMD_DEFINE_CONSTANT(NAME, SINGLE, DOUBLE) \ template \ constexpr T NAME() noexcept \ { \ return T(NAME()); \ } \ template <> \ constexpr float NAME() noexcept \ { \ return SINGLE; \ } \ template <> \ constexpr double NAME() noexcept \ { \ return DOUBLE; \ } #define XSIMD_DEFINE_CONSTANT_HEX(NAME, SINGLE, DOUBLE) \ template \ constexpr T NAME() noexcept \ { \ return T(NAME()); \ } \ template <> \ constexpr float NAME() noexcept \ { \ return detail::caster32_t(uint32_t(SINGLE)).f; \ } \ template <> \ constexpr double NAME() noexcept \ { \ return detail::caster64_t(uint64_t(DOUBLE)).f; \ } XSIMD_DEFINE_CONSTANT(infinity, (std::numeric_limits::infinity()), (std::numeric_limits::infinity())) XSIMD_DEFINE_CONSTANT(invlog_2, 1.442695040888963407359924681001892137426645954152986f, 1.442695040888963407359924681001892137426645954152986) XSIMD_DEFINE_CONSTANT_HEX(invlog_2hi, 0x3fb8b000, 0x3ff7154765200000) XSIMD_DEFINE_CONSTANT_HEX(invlog_2lo, 0xb9389ad4, 0x3de705fc2eefa200) XSIMD_DEFINE_CONSTANT(invlog10_2, 3.32192809488736234787031942949f, 3.32192809488736234787031942949) XSIMD_DEFINE_CONSTANT_HEX(invpi, 0x3ea2f983, 0x3fd45f306dc9c883) XSIMD_DEFINE_CONSTANT(log_2, 0.6931471805599453094172321214581765680755001343602553f, 0.6931471805599453094172321214581765680755001343602553) XSIMD_DEFINE_CONSTANT_HEX(log_2hi, 0x3f318000, 0x3fe62e42fee00000) XSIMD_DEFINE_CONSTANT_HEX(log_2lo, 0xb95e8083, 0x3dea39ef35793c76) XSIMD_DEFINE_CONSTANT_HEX(log10_2hi, 0x3e9a0000, 0x3fd3440000000000) XSIMD_DEFINE_CONSTANT_HEX(log10_2lo, 0x39826a14, 0x3ed3509f79fef312) XSIMD_DEFINE_CONSTANT_HEX(logeps, 0xc17f1402, 0xc04205966f2b4f12) XSIMD_DEFINE_CONSTANT_HEX(logpi, 0x3f928682, 0x3ff250d048e7a1bd) XSIMD_DEFINE_CONSTANT_HEX(logsqrt2pi, 0x3f6b3f8e, 0x3fed67f1c864beb5) XSIMD_DEFINE_CONSTANT(maxflint, 16777216.0f, 9007199254740992.0) XSIMD_DEFINE_CONSTANT(maxlog, 88.3762626647949f, 709.78271289338400) XSIMD_DEFINE_CONSTANT(maxlog2, 127.0f, 1023.) XSIMD_DEFINE_CONSTANT(maxlog10, 38.23080825805664f, 308.2547155599167) XSIMD_DEFINE_CONSTANT_HEX(mediumpi, 0x43490fdb, 0x412921fb54442d18) XSIMD_DEFINE_CONSTANT(minlog, -88.3762626647949f, -708.3964185322641) XSIMD_DEFINE_CONSTANT(minlog2, -127.0f, -1023.) 
XSIMD_DEFINE_CONSTANT(minlog10, -37.89999771118164f, -308.2547155599167) XSIMD_DEFINE_CONSTANT(minusinfinity, (-infinity()), (-infinity())) XSIMD_DEFINE_CONSTANT(minuszero, -0.0f, -0.0) XSIMD_DEFINE_CONSTANT_HEX(nan, 0xffffffff, 0xffffffffffffffff) XSIMD_DEFINE_CONSTANT_HEX(oneosqrteps, 0x453504f3, 0x4190000000000000) XSIMD_DEFINE_CONSTANT_HEX(oneotwoeps, 0x4a800000, 0x4320000000000000) XSIMD_DEFINE_CONSTANT_HEX(pi, 0x40490fdb, 0x400921fb54442d18) XSIMD_DEFINE_CONSTANT_HEX(pio_2lo, 0xb33bbd2e, 0x3c91a62633145c07) XSIMD_DEFINE_CONSTANT_HEX(pio_4lo, 0xb2bbbd2e, 0x3c81a62633145c07) XSIMD_DEFINE_CONSTANT_HEX(pio2, 0x3fc90fdb, 0x3ff921fb54442d18) XSIMD_DEFINE_CONSTANT_HEX(pio2_1, 0x3fc90f80, 0x3ff921fb54400000) XSIMD_DEFINE_CONSTANT_HEX(pio2_1t, 0x37354443, 0x3dd0b4611a626331) XSIMD_DEFINE_CONSTANT_HEX(pio2_2, 0x37354400, 0x3dd0b4611a600000) XSIMD_DEFINE_CONSTANT_HEX(pio2_2t, 0x2e85a308, 0x3ba3198a2e037073) XSIMD_DEFINE_CONSTANT_HEX(pio2_3, 0x2e85a300, 0x3ba3198a2e000000) XSIMD_DEFINE_CONSTANT_HEX(pio2_3t, 0x248d3132, 0x397b839a252049c1) XSIMD_DEFINE_CONSTANT_HEX(pio4, 0x3f490fdb, 0x3fe921fb54442d18) XSIMD_DEFINE_CONSTANT_HEX(signmask, 0x80000000, 0x8000000000000000) XSIMD_DEFINE_CONSTANT(smallestposval, 1.1754944e-38f, 2.225073858507201e-308) XSIMD_DEFINE_CONSTANT_HEX(sqrt_2pi, 0x40206c99, 0x40040d931ff62704) XSIMD_DEFINE_CONSTANT_HEX(sqrteps, 0x39b504f3, 0x3e50000000000000) XSIMD_DEFINE_CONSTANT_HEX(tanpio8, 0x3ed413cd, 0x3fda827999fcef31) XSIMD_DEFINE_CONSTANT_HEX(tan3pio8, 0x401a827a, 0x4003504f333f9de6) XSIMD_DEFINE_CONSTANT_HEX(twentypi, 0x427b53d1, 0x404f6a7a2955385e) XSIMD_DEFINE_CONSTANT_HEX(twoopi, 0x3f22f983, 0x3fe45f306dc9c883) XSIMD_DEFINE_CONSTANT(twotonmb, 8388608.0f, 4503599627370496.0) XSIMD_DEFINE_CONSTANT_HEX(twotonmbo3, 0x3ba14518, 0x3ed428a2f98d7286) #undef XSIMD_DEFINE_CONSTANT #undef XSIMD_DEFINE_CONSTANT_HEX template constexpr T allbits() noexcept; template constexpr as_integer_t mask1frexp() noexcept; template constexpr as_integer_t mask2frexp() noexcept; template constexpr as_integer_t maxexponent() noexcept; template constexpr as_integer_t maxexponentm1() noexcept; template constexpr int32_t nmb() noexcept; template constexpr T zero() noexcept; template constexpr T minvalue() noexcept; template constexpr T maxvalue() noexcept; /************************** * allbits implementation * **************************/ namespace detail { template ::value> struct allbits_impl { static constexpr T get_value() noexcept { return T(~0); } }; template struct allbits_impl { static constexpr T get_value() noexcept { return nan(); } }; } template constexpr T allbits() noexcept { return T(detail::allbits_impl::get_value()); } /***************************** * mask1frexp implementation * *****************************/ template constexpr as_integer_t mask1frexp() noexcept { return as_integer_t(mask1frexp()); } template <> constexpr int32_t mask1frexp() noexcept { return 0x7f800000; } template <> constexpr int64_t mask1frexp() noexcept { return 0x7ff0000000000000; } /***************************** * mask2frexp implementation * *****************************/ template constexpr as_integer_t mask2frexp() noexcept { return as_integer_t(mask2frexp()); } template <> constexpr int32_t mask2frexp() noexcept { return 0x3f000000; } template <> constexpr int64_t mask2frexp() noexcept { return 0x3fe0000000000000; } /****************************** * maxexponent implementation * ******************************/ template constexpr as_integer_t maxexponent() noexcept { return as_integer_t(maxexponent()); } 
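// --------------------------------------------------------------------------
// Illustrative usage sketch, not part of the original xsimd sources.  The
// constants above are function templates parameterised on the return type:
// requesting a scalar type yields the float/double specialisation, while
// requesting a batch type broadcasts that scalar into every lane.  Assumed
// context: "xsimd/xsimd.hpp" is included and the target provides a width-4
// float batch (SSE-class); adjust the width otherwise.
// --------------------------------------------------------------------------
inline void xsimd_constants_demo()
{
    // Scalar access: the float/double specialisations generated above.
    const float  max_arg = xsimd::maxlog<float>();    // ~88.376f, largest finite exp() argument
    const double l2hi    = xsimd::log_2hi<double>();  // high part of log(2), bit pattern 0x3fe62e42fee00000

    // Batch access: the generic overload broadcasts the scalar constant.
    using b_type = xsimd::batch<float, 4>;
    b_type pinf = xsimd::infinity<b_type>();          // +inf in every lane
    b_type qnan = xsimd::nan<b_type>();               // NaN pattern 0xffffffff in every lane

    // Bit-level helpers consumed by the frexp/ldexp machinery.
    static_assert(xsimd::mask1frexp<float>() == 0x7f800000,
                  "mask of the IEEE-754 single-precision exponent field");
    static_assert(xsimd::mask2frexp<float>() == 0x3f000000,
                  "bit pattern of 0.5f, used to rebuild the reduced mantissa");

    (void)max_arg; (void)l2hi; (void)pinf; (void)qnan;
}
// (the maxexponent / minvalue / maxvalue specialisations continue below)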
template <> constexpr int32_t maxexponent() noexcept { return 127; } template <> constexpr int64_t maxexponent() noexcept { return 1023; } /****************************** * maxexponent implementation * ******************************/ template constexpr as_integer_t maxexponentm1() noexcept { return as_integer_t(maxexponentm1()); } template <> constexpr int32_t maxexponentm1() noexcept { return 126; } template <> constexpr int64_t maxexponentm1() noexcept { return 1022; } /********************** * nmb implementation * **********************/ template constexpr int32_t nmb() noexcept { return nmb(); } template <> constexpr int32_t nmb() noexcept { return 23; } template <> constexpr int32_t nmb() noexcept { return 52; } /*********************** * zero implementation * ***********************/ template constexpr T zero() noexcept { return T(typename T::value_type(0)); } /*************************** * minvalue implementation * ***************************/ namespace detail { template struct minvalue_impl { static constexpr T get_value() noexcept { return std::numeric_limits::min(); } }; template struct minvalue_common { static constexpr T get_value() noexcept { return std::numeric_limits::min(); } }; template <> struct minvalue_impl : minvalue_common {}; template <> struct minvalue_impl : minvalue_common {}; template <> struct minvalue_impl : minvalue_common {}; template <> struct minvalue_impl : minvalue_common {}; template <> struct minvalue_impl : minvalue_common {}; template <> struct minvalue_impl : minvalue_common {}; template <> struct minvalue_impl : minvalue_common {}; template <> struct minvalue_impl : minvalue_common {}; template <> struct minvalue_impl { static constexpr float get_value() noexcept { return detail::caster32_t(uint32_t(0xff7fffff)).f; } }; template <> struct minvalue_impl { static constexpr double get_value() noexcept { return detail::caster64_t(uint64_t(0xffefffffffffffff)).f; } }; } template constexpr T minvalue() noexcept { return T(detail::minvalue_impl::get_value()); } /*************************** * maxvalue implementation * ***************************/ template constexpr T maxvalue() noexcept { return T(std::numeric_limits::max()); } } #endif xsimd-7.6.0/include/xsimd/math/xsimd_power.hpp000066400000000000000000000205621410101234500214150ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_POWER_HPP #define XSIMD_POWER_HPP #include "xsimd_basic_math.hpp" #include "xsimd_exponential.hpp" #include "xsimd_fp_manipulation.hpp" #include "xsimd_fp_sign.hpp" #include "xsimd_horner.hpp" #include "xsimd_logarithm.hpp" #include "xsimd_numerical_constant.hpp" #include "xsimd/types/xsimd_common_math.hpp" namespace xsimd { /** * Computes the value of the batch \c x raised to the power * \c y. * @param x batch of floating point values. * @param y batch of floating point values. * @return \c x raised to the power \c y. */ template batch_type_t pow(const simd_base& x, const simd_base& y); // integer specialization template inline typename std::enable_if::value, batch_type_t>::type pow(const simd_base& t0, const T1& t1); /** * Computes the cubic root of the batch \c x. 
* @param x batch of floating point values. * @return the cubic root of \c x. */ template batch_type_t cbrt(const simd_base& x); /** * Computes the square root of the sum of the squares of the batches * \c x, and \c y. * @param x batch of floating point values. * @param y batch of floating point values. * @return the square root of the sum of the squares of \c x and \c y. */ template batch_type_t hypot(const simd_base& x, const simd_base& y); /********************** * pow implementation * **********************/ /* origin: boost/simd/arch/common/simd/function/pow.hpp*/ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ namespace detail { template struct pow_kernel { static inline B compute(const B& x, const B& y) { using b_type = B; auto negx = x < b_type(0.); b_type z = exp(y * log(abs(x))); z = select(is_odd(y) && negx, -z, z); auto invalid = negx && !(is_flint(y) || xsimd::isinf(y)); return select(invalid, nan(), z); } }; } template inline batch_type_t pow(const simd_base& x, const simd_base& y) { return detail::pow_kernel>::compute(x(), y()); } template inline typename std::enable_if::value, batch_type_t>::type pow(const simd_base& t0, const T1& t1) { return detail::ipow(t0(), t1); } /*********************** * cbrt implementation * ***********************/ namespace detail { /* origin: boost/simd/arch/common/simd/function/cbrt.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template struct cbrt_kernel; template struct cbrt_kernel { static inline B compute(const B& a) { B z = abs(a); #ifndef XSIMD_NO_DENORMALS auto denormal = z < smallestposval(); z = select(denormal, z * twotonmb(), z); B f = select(denormal, twotonmbo3(), B(1.)); #endif const B CBRT2 = B(detail::caster32_t(0x3fa14518).f); const B CBRT4 = B(detail::caster32_t(0x3fcb2ff5).f); const B CBRT2I = B(detail::caster32_t(0x3f4b2ff5).f); const B CBRT4I = B(detail::caster32_t(0x3f214518).f); using i_type = as_integer_t; i_type e; B x = frexp(z, e); x = horner(x); auto flag = e >= i_type(0); i_type e1 = abs(e); i_type rem = e1; e1 /= i_type(3); rem -= e1 * i_type(3); e = e1 * sign(e); const B cbrt2 = select(bool_cast(flag), CBRT2, CBRT2I); const B cbrt4 = select(bool_cast(flag), CBRT4, CBRT4I); B fact = select(bool_cast(rem == i_type(1)), cbrt2, B(1.)); fact = select(bool_cast(rem == i_type(2)), cbrt4, fact); x = ldexp(x * fact, e); x -= (x - z / (x * x)) * B(1.f / 3.f); #ifndef XSIMD_NO_DENORMALS x = (x | bitofsign(a)) * f; #else x = x | bitofsign(a); #endif #ifndef XSIMD_NO_INFINITIES return select(a == B(0.) 
|| xsimd::isinf(a), a, x); #else return select(a == B(0.), a, x); #endif } }; template struct cbrt_kernel { static inline B compute(const B& a) { B z = abs(a); #ifndef XSIMD_NO_DENORMALS auto denormal = z < smallestposval(); z = select(denormal, z * twotonmb(), z); B f = select(denormal, twotonmbo3(), B(1.)); #endif const B CBRT2 = B(detail::caster64_t(int64_t(0x3ff428a2f98d728b)).f); const B CBRT4 = B(detail::caster64_t(int64_t(0x3ff965fea53d6e3d)).f); const B CBRT2I = B(detail::caster64_t(int64_t(0x3fe965fea53d6e3d)).f); const B CBRT4I = B(detail::caster64_t(int64_t(0x3fe428a2f98d728b)).f); using i_type = as_integer_t; i_type e; B x = frexp(z, e); x = horner(x); auto flag = e >= zero(); i_type e1 = abs(e); i_type rem = e1; e1 /= i_type(3); rem -= e1 * i_type(3); e = e1 * sign(e); const B cbrt2 = select(bool_cast(flag), CBRT2, CBRT2I); const B cbrt4 = select(bool_cast(flag), CBRT4, CBRT4I); B fact = select(bool_cast(rem == i_type(1)), cbrt2, B(1.)); fact = select(bool_cast(rem == i_type(2)), cbrt4, fact); x = ldexp(x * fact, e); x -= (x - z / (x * x)) * B(1. / 3.); x -= (x - z / (x * x)) * B(1. / 3.); #ifndef XSIMD_NO_DENORMALS x = (x | bitofsign(a)) * f; #else x = x | bitofsign(a); #endif #ifndef XSIMD_NO_INFINITIES return select(a == B(0.) || xsimd::isinf(a), a, x); #else return select(a == B(0.), a, x); #endif } }; } template inline batch_type_t cbrt(const simd_base& x) { return detail::cbrt_kernel>::compute(x()); } /************************ * hypot implementation * ************************/ template inline batch_type_t hypot(const simd_base& x, const simd_base& y) { return sqrt(fma(x(), x(), y() * y())); } } #endif xsimd-7.6.0/include/xsimd/math/xsimd_rem_pio2.hpp000066400000000000000000000565121410101234500220010ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #include #include #include namespace xsimd { namespace detail { /* origin: boost/simd/arch/common/scalar/function/rem_pio2.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ #if defined(_MSC_VER) #define ONCE0 \ __pragma(warning(push)) \ __pragma(warning(disable : 4127)) while (0) \ __pragma(warning(pop)) /**/ #else #define ONCE0 while (0) #endif /* * ==================================================== * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. * * Developed at SunPro, a Sun Microsystems, Inc. business. * Permission to use, copy, modify, and distribute this * software is freely granted, provided that this notice * is preserved. 
* ==================================================== */ #if defined(__GNUC__) && defined(__BYTE_ORDER__) #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ #define XSIMD_LITTLE_ENDIAN #endif #elif defined(_WIN32) // We can safely assume that Windows is always little endian #define XSIMD_LITTLE_ENDIAN #elif defined(i386) || defined(i486) || \ defined(intel) || defined(x86) || defined(i86pc) || \ defined(__alpha) || defined(__osf__) #define XSIMD_LITTLE_ENDIAN #endif #ifdef XSIMD_LITTLE_ENDIAN #define LOW_WORD_IDX 0 #define HIGH_WORD_IDX sizeof(std::uint32_t) #else #define LOW_WORD_IDX sizeof(std::uint32_t) #define HIGH_WORD_IDX 0 #endif #define GET_HIGH_WORD(i, d) \ do \ { \ double f = (d); \ std::memcpy(&(i), reinterpret_cast(&f) + \ HIGH_WORD_IDX, \ sizeof(std::uint32_t)); \ } \ ONCE0 \ /**/ #define GET_LOW_WORD(i, d) \ do \ { \ double f = (d); \ std::memcpy(&(i), reinterpret_cast(&f) + \ LOW_WORD_IDX, \ sizeof(std::uint32_t)); \ } \ ONCE0 \ /**/ #define SET_HIGH_WORD(d, v) \ do \ { \ double f = (d); \ std::uint32_t value = (v); \ std::memcpy(reinterpret_cast(&f) + \ HIGH_WORD_IDX, \ &value, sizeof(std::uint32_t)); \ (d) = f; \ } \ ONCE0 \ /**/ #define SET_LOW_WORD(d, v) \ do \ { \ double f = (d); \ std::uint32_t value = (v); \ std::memcpy(reinterpret_cast(&f) + \ LOW_WORD_IDX, \ &value, sizeof(std::uint32_t)); \ (d) = f; \ } \ ONCE0 \ /**/ /* * __kernel_rem_pio2(x,y,e0,nx,prec,ipio2) * double x[],y[]; int e0,nx,prec; int ipio2[]; * * __kernel_rem_pio2 return the last three digits of N with * y = x - N*pi/2 * so that |y| < pi/2. * * The method is to compute the integer (mod 8) and fraction parts of * (2/pi)*x without doing the full multiplication. In general we * skip the part of the product that are known to be a huge integer ( * more accurately, = 0 mod 8 ). Thus the number of operations are * independent of the exponent of the input. * * (2/pi) is represented by an array of 24-bit integers in ipio2[]. * * Input parameters: * x[] The input value (must be positive) is broken into nx * pieces of 24-bit integers in double precision format. * x[i] will be the i-th 24 bit of x. The scaled exponent * of x[0] is given in input parameter e0 (i.e., x[0]*2^e0 * match x's up to 24 bits. * * Example of breaking a double positive z into x[0]+x[1]+x[2]: * e0 = ilogb(z)-23 * z = scalbn(z,-e0) * for i = 0,1,2 * x[i] = floor(z) * z = (z-x[i])*2**24 * * * y[] ouput result in an array of double precision numbers. * The dimension of y[] is: * 24-bit precision 1 * 53-bit precision 2 * 64-bit precision 2 * 113-bit precision 3 * The actual value is the sum of them. Thus for 113-bit * precison, one may have to do something like: * * long double t,w,r_head, r_tail; * t = (long double)y[2] + (long double)y[1]; * w = (long double)y[0]; * r_head = t+w; * r_tail = w - (r_head - t); * * e0 The exponent of x[0] * * nx dimension of x[] * * prec an integer indicating the precision: * 0 24 bits (single) * 1 53 bits (double) * 2 64 bits (extended) * 3 113 bits (quad) * * ipio2[] * integer array, contains the (24*i)-th to (24*i+23)-th * bit of 2/pi after binary point. The corresponding * floating value is * * ipio2[i] * 2^(-24(i+1)). * * External function: * double scalbn(), floor(); * * * Here is the description of some local variables: * * jk jk+1 is the initial number of terms of ipio2[] needed * in the computation. The recommended value is 2,3,4, * 6 for single, double, extended,and quad. * * jz local integer variable indicating the number of * terms of ipio2[] used. 
* * jx nx - 1 * * jv index for pointing to the suitable ipio2[] for the * computation. In general, we want * ( 2^e0*x[0] * ipio2[jv-1]*2^(-24jv) )/8 * is an integer. Thus * e0-3-24*jv >= 0 or (e0-3)/24 >= jv * Hence jv = max(0,(e0-3)/24). * * jp jp+1 is the number of terms in PIo2[] needed, jp = jk. * * q[] double array with integral value, representing the * 24-bits chunk of the product of x and 2/pi. * * q0 the corresponding exponent of q[0]. Note that the * exponent for q[i] would be q0-24*i. * * PIo2[] double precision array, obtained by cutting pi/2 * into 24 bits chunks. * * f[] ipio2[] in floating point * * iq[] integer array by breaking up q[] in 24-bits chunk. * * fq[] final product of x*(2/pi) in fq[0],..,fq[jk] * * ih integer. If >0 it indicates q[] is >= 0.5, hence * it also indicates the *sign* of the result. * */ inline int32_t __kernel_rem_pio2(double* x, double* y, int32_t e0, int32_t nx, int32_t prec, const int32_t* ipio2) { static const int32_t init_jk[] = {2, 3, 4, 6}; /* initial value for jk */ static const double PIo2[] = { 1.57079625129699707031e+00, /* 0x3FF921FB, 0x40000000 */ 7.54978941586159635335e-08, /* 0x3E74442D, 0x00000000 */ 5.39030252995776476554e-15, /* 0x3CF84698, 0x80000000 */ 3.28200341580791294123e-22, /* 0x3B78CC51, 0x60000000 */ 1.27065575308067607349e-29, /* 0x39F01B83, 0x80000000 */ 1.22933308981111328932e-36, /* 0x387A2520, 0x40000000 */ 2.73370053816464559624e-44, /* 0x36E38222, 0x80000000 */ 2.16741683877804819444e-51, /* 0x3569F31D, 0x00000000 */ }; static const double zero = 0.0, one = 1.0, two24 = 1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */ twon24 = 5.96046447753906250000e-08; /* 0x3E700000, 0x00000000 */ int32_t jz, jx, jv, jp, jk, carry, n, iq[20], i, j, k, m, q0, ih; double z, fw, f[20], fq[20], q[20]; /* initialize jk*/ jk = init_jk[prec]; jp = jk; /* determine jx,jv,q0, note that 3>q0 */ jx = nx - 1; jv = (e0 - 3) / 24; if (jv < 0) jv = 0; q0 = e0 - 24 * (jv + 1); /* set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk] */ j = jv - jx; m = jx + jk; for (i = 0; i <= m; i++, j++) f[i] = (j < 0) ? zero : (double)ipio2[j]; /* compute q[0],q[1],...q[jk] */ for (i = 0; i <= jk; i++) { for (j = 0, fw = 0.0; j <= jx; j++) fw += x[j] * f[jx + i - j]; q[i] = fw; } jz = jk; recompute: /* distill q[] into iq[] reversingly */ for (i = 0, j = jz, z = q[jz]; j > 0; i++, j--) { fw = (double)((int32_t)(twon24 * z)); iq[i] = (int)(z - two24 * fw); z = q[j - 1] + fw; } /* compute n */ z = std::scalbn(z, q0); /* actual value of z */ z -= 8.0 * std::floor(z * 0.125); /* trim off integer >= 8 */ n = (int32_t)z; z -= (double)n; ih = 0; if (q0 > 0) { /* need iq[jz-1] to determine n */ i = (iq[jz - 1] >> (24 - q0)); n += i; iq[jz - 1] -= i << (24 - q0); ih = iq[jz - 1] >> (23 - q0); } else if (q0 == 0) ih = iq[jz - 1] >> 23; else if (z >= 0.5) ih = 2; if (ih > 0) { /* q > 0.5 */ n += 1; carry = 0; for (i = 0; i < jz; i++) { /* compute 1-q */ j = iq[i]; if (carry == 0) { if (j != 0) { carry = 1; iq[i] = 0x1000000 - j; } } else iq[i] = 0xffffff - j; } if (q0 > 0) { /* rare case: chance is 1 in 12 */ switch (q0) { case 1: iq[jz - 1] &= 0x7fffff; break; case 2: iq[jz - 1] &= 0x3fffff; break; } } if (ih == 2) { z = one - z; if (carry != 0) z -= std::scalbn(one, q0); } } /* check if recomputation is needed */ if (z == zero) { j = 0; for (i = jz - 1; i >= jk; i--) j |= iq[i]; if (j == 0) { /* need recomputation */ for (k = 1; iq[jk - k] == 0; k++) ; /* k = no. 
of terms needed */ for (i = jz + 1; i <= jz + k; i++) { /* add q[jz+1] to q[jz+k] */ f[jx + i] = (double)ipio2[jv + i]; for (j = 0, fw = 0.0; j <= jx; j++) fw += x[j] * f[jx + i - j]; q[i] = fw; } jz += k; goto recompute; } } /* chop off zero terms */ if (z == 0.0) { jz -= 1; q0 -= 24; while (iq[jz] == 0) { jz--; q0 -= 24; } } else { /* break z into 24-bit if necessary */ z = std::scalbn(z, -q0); if (z >= two24) { fw = (double)((int32_t)(twon24 * z)); iq[jz] = (int32_t)(z - two24 * fw); jz += 1; q0 += 24; iq[jz] = (int32_t)fw; } else iq[jz] = (int32_t)z; } /* convert integer "bit" chunk to floating-point value */ fw = scalbn(one, q0); for (i = jz; i >= 0; i--) { q[i] = fw * (double)iq[i]; fw *= twon24; } /* compute PIo2[0,...,jp]*q[jz,...,0] */ for (i = jz; i >= 0; i--) { for (fw = 0.0, k = 0; k <= jp && k <= jz - i; k++) fw += PIo2[k] * q[i + k]; fq[jz - i] = fw; } /* compress fq[] into y[] */ switch (prec) { case 0: fw = 0.0; for (i = jz; i >= 0; i--) fw += fq[i]; y[0] = (ih == 0) ? fw : -fw; break; case 1: case 2: fw = 0.0; for (i = jz; i >= 0; i--) fw += fq[i]; y[0] = (ih == 0) ? fw : -fw; fw = fq[0] - fw; for (i = 1; i <= jz; i++) fw += fq[i]; y[1] = (ih == 0) ? fw : -fw; break; case 3: /* painful */ for (i = jz; i > 0; i--) { fw = fq[i - 1] + fq[i]; fq[i] += fq[i - 1] - fw; fq[i - 1] = fw; } for (i = jz; i > 1; i--) { fw = fq[i - 1] + fq[i]; fq[i] += fq[i - 1] - fw; fq[i - 1] = fw; } for (fw = 0.0, i = jz; i >= 2; i--) fw += fq[i]; if (ih == 0) { y[0] = fq[0]; y[1] = fq[1]; y[2] = fw; } else { y[0] = -fq[0]; y[1] = -fq[1]; y[2] = -fw; } } return n & 7; } inline std::int32_t __ieee754_rem_pio2(double x, double* y) { static const std::int32_t two_over_pi[] = { 0xA2F983, 0x6E4E44, 0x1529FC, 0x2757D1, 0xF534DD, 0xC0DB62, 0x95993C, 0x439041, 0xFE5163, 0xABDEBB, 0xC561B7, 0x246E3A, 0x424DD2, 0xE00649, 0x2EEA09, 0xD1921C, 0xFE1DEB, 0x1CB129, 0xA73EE8, 0x8235F5, 0x2EBB44, 0x84E99C, 0x7026B4, 0x5F7E41, 0x3991D6, 0x398353, 0x39F49C, 0x845F8B, 0xBDF928, 0x3B1FF8, 0x97FFDE, 0x05980F, 0xEF2F11, 0x8B5A0A, 0x6D1F6D, 0x367ECF, 0x27CB09, 0xB74F46, 0x3F669E, 0x5FEA2D, 0x7527BA, 0xC7EBE5, 0xF17B3D, 0x0739F7, 0x8A5292, 0xEA6BFB, 0x5FB11F, 0x8D5D08, 0x560330, 0x46FC7B, 0x6BABF0, 0xCFBC20, 0x9AF436, 0x1DA9E3, 0x91615E, 0xE61B08, 0x659985, 0x5F14A0, 0x68408D, 0xFFD880, 0x4D7327, 0x310606, 0x1556CA, 0x73A8C9, 0x60E27B, 0xC08C6B, }; static const std::int32_t npio2_hw[] = { 0x3FF921FB, 0x400921FB, 0x4012D97C, 0x401921FB, 0x401F6A7A, 0x4022D97C, 0x4025FDBB, 0x402921FB, 0x402C463A, 0x402F6A7A, 0x4031475C, 0x4032D97C, 0x40346B9C, 0x4035FDBB, 0x40378FDB, 0x403921FB, 0x403AB41B, 0x403C463A, 0x403DD85A, 0x403F6A7A, 0x40407E4C, 0x4041475C, 0x4042106C, 0x4042D97C, 0x4043A28C, 0x40446B9C, 0x404534AC, 0x4045FDBB, 0x4046C6CB, 0x40478FDB, 0x404858EB, 0x404921FB, }; /* * invpio2: 53 bits of 2/pi * pio2_1: first 33 bit of pi/2 * pio2_1t: pi/2 - pio2_1 * pio2_2: second 33 bit of pi/2 * pio2_2t: pi/2 - (pio2_1+pio2_2) * pio2_3: third 33 bit of pi/2 * pio2_3t: pi/2 - (pio2_1+pio2_2+pio2_3) */ static const double zero = 0.00000000000000000000e+00, /* 0x00000000, 0x00000000 */ half = 5.00000000000000000000e-01, /* 0x3FE00000, 0x00000000 */ two24 = 1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */ invpio2 = 6.36619772367581382433e-01, /* 0x3FE45F30, 0x6DC9C883 */ pio2_1 = 1.57079632673412561417e+00, /* 0x3FF921FB, 0x54400000 */ pio2_1t = 6.07710050650619224932e-11, /* 0x3DD0B461, 0x1A626331 */ pio2_2 = 6.07710050630396597660e-11, /* 0x3DD0B461, 0x1A600000 */ pio2_2t = 2.02226624879595063154e-21, /* 0x3BA3198A, 0x2E037073 
*/ pio2_3 = 2.02226624871116645580e-21, /* 0x3BA3198A, 0x2E000000 */ pio2_3t = 8.47842766036889956997e-32; /* 0x397B839A, 0x252049C1 */ double z = 0., w, t, r, fn; double tx[3]; std::int32_t e0, i, j, nx, n, ix, hx; std::uint32_t low; GET_HIGH_WORD(hx, x); /* high word of x */ ix = hx & 0x7fffffff; if (ix <= 0x3fe921fb) /* |x| ~<= pi/4 , no need for reduction */ { y[0] = x; y[1] = 0; return 0; } if (ix < 0x4002d97c) { /* |x| < 3pi/4, special case with n=+-1 */ if (hx > 0) { z = x - pio2_1; if (ix != 0x3ff921fb) { /* 33+53 bit pi is good enough */ y[0] = z - pio2_1t; y[1] = (z - y[0]) - pio2_1t; } else { /* near pi/2, use 33+33+53 bit pi */ z -= pio2_2; y[0] = z - pio2_2t; y[1] = (z - y[0]) - pio2_2t; } return 1; } else { /* negative x */ z = x + pio2_1; if (ix != 0x3ff921fb) { /* 33+53 bit pi is good enough */ y[0] = z + pio2_1t; y[1] = (z - y[0]) + pio2_1t; } else { /* near pi/2, use 33+33+53 bit pi */ z += pio2_2; y[0] = z + pio2_2t; y[1] = (z - y[0]) + pio2_2t; } return -1; } } if (ix <= 0x413921fb) { /* |x| ~<= 2^19*(pi/2), medium_ size */ t = std::fabs(x); n = (std::int32_t)(t * invpio2 + half); fn = (double)n; r = t - fn * pio2_1; w = fn * pio2_1t; /* 1st round good to 85 bit */ if ((n < 32) && (n > 0) && (ix != npio2_hw[n - 1])) { y[0] = r - w; /* quick check no cancellation */ } else { std::uint32_t high; j = ix >> 20; y[0] = r - w; GET_HIGH_WORD(high, y[0]); i = j - ((high >> 20) & 0x7ff); if (i > 16) { /* 2nd iteration needed, good to 118 */ t = r; w = fn * pio2_2; r = t - w; w = fn * pio2_2t - ((t - r) - w); y[0] = r - w; GET_HIGH_WORD(high, y[0]); i = j - ((high >> 20) & 0x7ff); if (i > 49) { /* 3rd iteration need, 151 bits acc */ t = r; /* will cover all possible cases */ w = fn * pio2_3; r = t - w; w = fn * pio2_3t - ((t - r) - w); y[0] = r - w; } } } y[1] = (r - y[0]) - w; if (hx < 0) { y[0] = -y[0]; y[1] = -y[1]; return -n; } else return n; } /* * all other (large) arguments */ if (ix >= 0x7ff00000) { /* x is inf or NaN */ y[0] = y[1] = x - x; return 0; } /* set z = scalbn(|x|,ilogb(x)-23) */ GET_LOW_WORD(low, x); SET_LOW_WORD(z, low); e0 = (ix >> 20) - 1046; /* e0 = ilogb(z)-23; */ SET_HIGH_WORD(z, ix - ((std::int32_t)(e0 << 20))); for (i = 0; i < 2; i++) { tx[i] = (double)((std::int32_t)(z)); z = (z - tx[i]) * two24; } tx[2] = z; nx = 3; while (tx[nx - 1] == zero) nx--; /* skip zero term */ n = __kernel_rem_pio2(tx, y, e0, nx, 2, two_over_pi); if (hx < 0) { y[0] = -y[0]; y[1] = -y[1]; return -n; } return n; } } #undef XSIMD_LITTLE_ENDIAN #undef SET_LOW_WORD #undef SET_HIGH_WORD #undef GET_LOW_WORD #undef GET_HIGH_WORD #undef HIGH_WORD_IDX #undef LOW_WORD_IDX #undef ONCE0 } xsimd-7.6.0/include/xsimd/math/xsimd_rounding.hpp000066400000000000000000000414031410101234500221030ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_ROUNDING_HPP #define XSIMD_ROUNDING_HPP #include #include "xsimd_fp_sign.hpp" #include "xsimd_numerical_constant.hpp" namespace xsimd { /** * Computes the batch of smallest integer values not less than * scalars in \c x. * @param x batch of floating point values. * @return the batch of smallest integer values not less than \c x. 
*/ template batch_type_t ceil(const simd_base& x); /** * Computes the batch of largest integer values not greater than * scalars in \c x. * @param x batch of floating point values. * @return the batch of largest integer values not greater than \c x. */ template batch_type_t floor(const simd_base& x); /** * Computes the batch of nearest integer values not greater in magnitude * than scalars in \c x. * @param x batch of floating point values. * @return the batch of nearest integer values not greater in magnitude than \c x. */ template batch_type_t trunc(const simd_base& x); /** * Computes the batch of nearest integer values to scalars in \c x (in * floating point format), rounding halfway cases away from zero, regardless * of the current rounding mode. * @param x batch of flaoting point values. * @return the batch of nearest integer values. */ template batch_type_t round(const simd_base& x); // Contrary to their std counterpart, these functions // are assume that the rounding mode is FE_TONEAREST /** * Rounds the scalars in \c x to integer values (in floating point format), using * the current rounding mode. * @param x batch of flaoting point values. * @return the batch of nearest integer values. */ template batch_type_t nearbyint(const simd_base& x); /** * Rounds the scalars in \c x to integer values (in floating point format), using * the current rounding mode. * @param x batch of flaoting point values. * @return the batch of rounded values. */ template batch_type_t rint(const simd_base& x); namespace impl { template struct rounding_kernel; template struct rounding_kernel_int { static inline B ceil(const B& x) { return x; } static inline B floor(const B& x) { return x; } static inline B trunc(const B& x) { return x; } static inline B nearbyint(const B& x) { return x; } }; #define DEFINE_ROUNDING_KERNEL_INT(T, N) \ template <> \ struct rounding_kernel> \ : rounding_kernel_int> \ { \ } /********************** * SSE implementation * **********************/ #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION DEFINE_ROUNDING_KERNEL_INT(uint8_t, 16); DEFINE_ROUNDING_KERNEL_INT(int8_t, 16); DEFINE_ROUNDING_KERNEL_INT(uint16_t, 8); DEFINE_ROUNDING_KERNEL_INT(int16_t, 8); DEFINE_ROUNDING_KERNEL_INT(uint32_t, 4); DEFINE_ROUNDING_KERNEL_INT(int32_t, 4); DEFINE_ROUNDING_KERNEL_INT(uint64_t, 2); DEFINE_ROUNDING_KERNEL_INT(int64_t, 2); template <> struct rounding_kernel> { using batch_type = batch; static inline batch_type ceil(const batch_type& x) { return _mm_ceil_ps(x); } static inline batch_type floor(const batch_type& x) { return _mm_floor_ps(x); } static inline batch_type trunc(const batch_type& x) { return _mm_round_ps(x, _MM_FROUND_TO_ZERO); } static inline batch_type nearbyint(const batch_type& x) { return _mm_round_ps(x, _MM_FROUND_TO_NEAREST_INT); } }; template <> struct rounding_kernel> { using batch_type = batch; static inline batch_type ceil(const batch_type& x) { return _mm_ceil_pd(x); } static inline batch_type floor(const batch_type& x) { return _mm_floor_pd(x); } static inline batch_type trunc(const batch_type& x) { return _mm_round_pd(x, _MM_FROUND_TO_ZERO); } static inline batch_type nearbyint(const batch_type& x) { return _mm_round_pd(x, _MM_FROUND_TO_NEAREST_INT); } }; #elif (XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION) || (XSIMD_ARM_INSTR_SET == XSIMD_ARM7_NEON_VERSION) DEFINE_ROUNDING_KERNEL_INT(uint8_t, 16); DEFINE_ROUNDING_KERNEL_INT(int8_t, 16); DEFINE_ROUNDING_KERNEL_INT(uint16_t, 8); DEFINE_ROUNDING_KERNEL_INT(int16_t, 8); DEFINE_ROUNDING_KERNEL_INT(uint32_t, 4); 
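// --------------------------------------------------------------------------
// Illustrative usage sketch, not part of the original xsimd sources.  It
// exercises the rounding entry points documented above.  Assumed context:
// "xsimd/xsimd.hpp" is included and the target provides a width-4 float
// batch (SSE-class).  Note that nearbyint/rint assume FE_TONEAREST, while
// round() always rounds halfway cases away from zero.
// --------------------------------------------------------------------------
inline void xsimd_rounding_demo()
{
    xsimd::batch<float, 4> x(-1.5f, -0.5f, 0.5f, 2.5f);

    auto c = xsimd::ceil(x);       // -1, -0,  1,  3
    auto f = xsimd::floor(x);      // -2, -1,  0,  2
    auto t = xsimd::trunc(x);      // -1, -0,  0,  2
    auto r = xsimd::round(x);      // -2, -1,  1,  3   (ties away from zero)
    auto n = xsimd::nearbyint(x);  // -2, -0,  0,  2   (ties to even under FE_TONEAREST)
    auto i = xsimd::rint(x);       // same as nearbyint in this implementation

    (void)c; (void)f; (void)t; (void)r; (void)n; (void)i;
}
// (the remaining SSE2/NEON fallback, AVX, AVX512 and NEON kernels continue below)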
DEFINE_ROUNDING_KERNEL_INT(int32_t, 4); DEFINE_ROUNDING_KERNEL_INT(uint64_t, 2); DEFINE_ROUNDING_KERNEL_INT(int64_t, 2); template struct rounding_kernel_base { static inline B ceil(const B& x) { B tx = trunc(x); return select(tx < x, tx + B(1), tx); } static inline B floor(const B& x) { B tx = trunc(x); return select(tx > x, tx - B(1), tx); } static inline B nearbyint(const B& x) { B s = bitofsign(x); B v = x ^ s; B t2n = twotonmb(); B d0 = v + t2n; return s ^ select(v < t2n, d0 - t2n, v); } }; template <> struct rounding_kernel> : rounding_kernel_base> { using batch_type = batch; static inline batch_type trunc(const batch_type& x) { return select(abs(x) < maxflint(), to_float(to_int(x)), x); } }; #if (XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION) template <> struct rounding_kernel> : rounding_kernel_base> { using batch_type = batch; static inline batch_type trunc(const batch_type& x) { return batch(std::trunc(x[0]), std::trunc(x[1])); } }; #endif #endif /********************** * AVX implementation * **********************/ #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION DEFINE_ROUNDING_KERNEL_INT(uint8_t, 32); DEFINE_ROUNDING_KERNEL_INT(int8_t, 32); DEFINE_ROUNDING_KERNEL_INT(uint16_t, 16); DEFINE_ROUNDING_KERNEL_INT(int16_t, 16); DEFINE_ROUNDING_KERNEL_INT(uint32_t, 8); DEFINE_ROUNDING_KERNEL_INT(int32_t, 8); DEFINE_ROUNDING_KERNEL_INT(uint64_t, 4); DEFINE_ROUNDING_KERNEL_INT(int64_t, 4); template <> struct rounding_kernel> { using batch_type = batch; static inline batch_type ceil(const batch_type& x) { return _mm256_round_ps(x, _MM_FROUND_CEIL); } static inline batch_type floor(const batch_type& x) { return _mm256_round_ps(x, _MM_FROUND_FLOOR); } static inline batch_type trunc(const batch_type& x) { return _mm256_round_ps(x, _MM_FROUND_TO_ZERO); } static inline batch_type nearbyint(const batch_type& x) { return _mm256_round_ps(x, _MM_FROUND_TO_NEAREST_INT); } }; template <> struct rounding_kernel> { using batch_type = batch; static inline batch_type ceil(const batch_type& x) { return _mm256_round_pd(x, _MM_FROUND_CEIL); } static inline batch_type floor(const batch_type& x) { return _mm256_round_pd(x, _MM_FROUND_FLOOR); } static inline batch_type trunc(const batch_type& x) { return _mm256_round_pd(x, _MM_FROUND_TO_ZERO); } static inline batch_type nearbyint(const batch_type& x) { return _mm256_round_pd(x, _MM_FROUND_TO_NEAREST_INT); } }; #endif #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX512_VERSION DEFINE_ROUNDING_KERNEL_INT(uint8_t, 64); DEFINE_ROUNDING_KERNEL_INT(int8_t, 64); DEFINE_ROUNDING_KERNEL_INT(uint16_t, 32); DEFINE_ROUNDING_KERNEL_INT(int16_t, 32); DEFINE_ROUNDING_KERNEL_INT(uint32_t, 16); DEFINE_ROUNDING_KERNEL_INT(int32_t, 16); DEFINE_ROUNDING_KERNEL_INT(uint64_t, 8); DEFINE_ROUNDING_KERNEL_INT(int64_t, 8); template <> struct rounding_kernel> { using batch_type = batch; static inline batch_type ceil(const batch_type& x) { auto res = _mm512_roundscale_ps(x, _MM_FROUND_TO_POS_INF); return res; } static inline batch_type floor(const batch_type& x) { auto res = _mm512_roundscale_ps(x, _MM_FROUND_TO_NEG_INF); return res; } static inline batch_type trunc(const batch_type& x) { auto res = _mm512_roundscale_round_ps(x, _MM_FROUND_TO_ZERO, _MM_FROUND_CUR_DIRECTION); return res; } static inline batch_type nearbyint(const batch_type& x) { auto res = _mm512_roundscale_round_ps(x, _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_CUR_DIRECTION); return res; } }; template <> struct rounding_kernel> { using batch_type = batch; static inline batch_type ceil(const batch_type& x) { auto res = 
_mm512_roundscale_pd(x, _MM_FROUND_TO_POS_INF); return res; } static inline batch_type floor(const batch_type& x) { auto res = _mm512_roundscale_pd(x, _MM_FROUND_TO_NEG_INF); return res; } static inline batch_type trunc(const batch_type& x) { auto res = _mm512_roundscale_round_pd(x, _MM_FROUND_TO_ZERO, _MM_FROUND_CUR_DIRECTION); return res; } static inline batch_type nearbyint(const batch_type& x) { auto res = _mm512_roundscale_round_pd(x, _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_CUR_DIRECTION); return res; } }; #endif #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_32_NEON_VERSION DEFINE_ROUNDING_KERNEL_INT(uint8_t, 16); DEFINE_ROUNDING_KERNEL_INT(int8_t, 16); DEFINE_ROUNDING_KERNEL_INT(uint16_t, 8); DEFINE_ROUNDING_KERNEL_INT(int16_t, 8); DEFINE_ROUNDING_KERNEL_INT(uint32_t, 4); DEFINE_ROUNDING_KERNEL_INT(int32_t, 4); DEFINE_ROUNDING_KERNEL_INT(uint64_t, 2); DEFINE_ROUNDING_KERNEL_INT(int64_t, 2); template <> struct rounding_kernel> { using batch_type = batch; static inline batch_type ceil(const batch_type& x) { return vrndpq_f32(x); } static inline batch_type floor(const batch_type& x) { return vrndmq_f32(x); } static inline batch_type trunc(const batch_type& x) { return vrndq_f32(x); } static inline batch_type nearbyint(const batch_type& x) { return vrndxq_f32(x); } }; #endif #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION template <> struct rounding_kernel> { using batch_type = batch; static inline batch_type ceil(const batch_type& x) { return vrndpq_f64(x); } static inline batch_type floor(const batch_type& x) { return vrndmq_f64(x); } static inline batch_type trunc(const batch_type& x) { return vrndq_f64(x); } static inline batch_type nearbyint(const batch_type& x) { return vrndxq_f64(x); } }; #endif /*************************** * Fallback implementation * ***************************/ #if defined(XSIMD_ENABLE_FALLBACK) template struct rounding_kernel> { using batch_type = batch; static inline batch_type ceil(const batch_type& x) { XSIMD_FALLBACK_BATCH_UNARY_FUNC(std::ceil, x) } static inline batch_type floor(const batch_type& x) { XSIMD_FALLBACK_BATCH_UNARY_FUNC(std::floor, x) } static inline batch_type trunc(const batch_type& x) { XSIMD_FALLBACK_BATCH_UNARY_FUNC(std::trunc, x) } static inline batch_type nearbyint(const batch_type& x) { XSIMD_FALLBACK_BATCH_UNARY_FUNC(std::nearbyint, x) } }; #endif /************************** * Generic implementation * **************************/ template ::value> struct round_impl; template struct round_impl, false> { using batch_type = batch; static inline batch_type round(const batch_type& x) { batch_type v = abs(x); batch_type c = ceil(v); batch_type cp = select(c - batch_type(0.5) > v, c - batch_type(1), c); return select(v > maxflint(), x, copysign(cp, x)); } }; template struct round_impl, true> { using batch_type = batch; static inline batch_type round(const batch_type& rhs) { return rhs; } }; template inline batch rint(const batch& x) { return nearbyint(x); } } template inline batch_type_t ceil(const simd_base& x) { return impl::rounding_kernel::ceil(x()); } template inline batch_type_t floor(const simd_base& x) { return impl::rounding_kernel::floor(x()); } template inline batch_type_t trunc(const simd_base& x) { return impl::rounding_kernel::trunc(x()); } template inline batch_type_t round(const simd_base& x) { return impl::round_impl::round(x()); } // Contrary to their std counterpart, these functions // are assume that the rounding mode is FE_TONEAREST template inline batch_type_t nearbyint(const simd_base& x) { return 
impl::rounding_kernel::nearbyint(x()); } template inline batch_type_t rint(const simd_base& x) { return impl::rint(x()); } } #endif xsimd-7.6.0/include/xsimd/math/xsimd_trigo_reduction.hpp000066400000000000000000000237541410101234500234670ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_TRIGO_REDUCTION_HPP #define XSIMD_TRIGO_REDUCTION_HPP #include #include #include "xsimd_horner.hpp" #include "xsimd_rem_pio2.hpp" #include "xsimd_rounding.hpp" namespace xsimd { template batch_type_t quadrant(const simd_base& x); namespace detail { template struct trigo_evaluation; /* origin: boost/simd/arch/common/detail/simd/f_trig_evaluation.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template struct trigo_evaluation { static inline B cos_eval(const B& z) { B y = horner(z); return B(1.) + fma(z, B(-0.5), y * z * z); } static inline B sin_eval(const B& z, const B& x) { B y = horner(z); return fma(y * z, x, x); } static inline B base_tancot_eval(const B& z) { B zz = z * z; B y = horner(zz); return fma(y, zz * z, z); } template static inline B tan_eval(const B& z, const BB& test) { B y = base_tancot_eval(z); return select(test, y, -B(1.) / y); } template static inline B cot_eval(const B& z, const BB& test) { B y = base_tancot_eval(z); return select(test, B(1.) / y, -y); } }; /* origin: boost/simd/arch/common/detail/simd/d_trig_evaluation.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template struct trigo_evaluation { static inline B cos_eval(const B& z) { B y = horner(z); return B(1.) - y * z; } static inline B sin_eval(const B& z, const B& x) { B y = horner(z); return fma(y * z, x, x); } static inline B base_tancot_eval(const B& z) { B zz = z * z; B num = horner(zz); B den = horner1(zz); return fma(z, (zz * (num / den)), z); } template static inline B tan_eval(const B& z, const BB& test) { B y = base_tancot_eval(z); return select(test, y, -B(1.) / y); } template static inline B cot_eval(const B& z, const BB& test) { B y = base_tancot_eval(z); return select(test, B(1.) / y, -y); } }; /* origin: boost/simd/arch/common/detail/simd/trig_reduction.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. 
* (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ struct trigo_radian_tag { }; struct trigo_pi_tag { }; template struct trigo_reducer { static inline B reduce(const B& x, B& xr) { if (all(x <= pio4())) { xr = x; return B(0.); } else if (all(x <= pio2())) { auto test = x > pio4(); xr = x - pio2_1(); xr -= pio2_2(); xr -= pio2_3(); xr = select(test, xr, x); return select(test, B(1.), B(0.)); } else if (all(x <= twentypi())) { B xi = nearbyint(x * twoopi()); xr = fnma(xi, pio2_1(), x); xr -= xi * pio2_2(); xr -= xi * pio2_3(); return quadrant(xi); } else if (all(x <= mediumpi())) { B fn = nearbyint(x * twoopi()); B r = x - fn * pio2_1(); B w = fn * pio2_1t(); B t = r; w = fn * pio2_2(); r = t - w; w = fn * pio2_2t() - ((t - r) - w); t = r; w = fn * pio2_3(); r = t - w; w = fn * pio2_3t() - ((t - r) - w); xr = r - w; return quadrant(fn); } else { static constexpr std::size_t size = B::size; using value_type = typename B::value_type; alignas(B) std::array tmp; alignas(B) std::array txr; for (std::size_t i = 0; i < size; ++i) { double arg = x[i]; if (arg == std::numeric_limits::infinity()) { tmp[i] = 0.; txr[i] = std::numeric_limits::quiet_NaN(); } else { double y[2]; std::int32_t n = detail::__ieee754_rem_pio2(arg, y); tmp[i] = value_type(n & 3); txr[i] = value_type(y[0]); } } xr.load_aligned(&txr[0]); B res; res.load_aligned(&tmp[0]); return res; } } }; template struct trigo_reducer { static inline B reduce(const B& x, B& xr) { B xi = nearbyint(x * B(2.)); B x2 = x - xi * B(0.5); xr = x2 * pi(); return quadrant(xi); } }; template struct quadrant_impl { static inline B compute(const B& x) { return x & B(3); } }; template struct quadrant_impl { static inline B compute(const B& x) { return to_float(quadrant(to_int(x))); } }; template struct quadrant_impl { static inline B compute(const B& x) { B a = x * B(0.25); return (a - floor(a)) * B(4.); } }; } template inline batch_type_t quadrant(const simd_base& x) { using b_type = batch_type_t; return detail::quadrant_impl::compute(x()); } } #endif xsimd-7.6.0/include/xsimd/math/xsimd_trigonometric.hpp000066400000000000000000000166611410101234500231530ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_TRIGONOMETRIC_HPP #define XSIMD_TRIGONOMETRIC_HPP #include "xsimd_fp_sign.hpp" #include "xsimd_invtrigo.hpp" #include "xsimd_trigo_reduction.hpp" namespace xsimd { /** * Computes the sine of the batch \c x. * @param x batch of floating point values. * @return the sine of \c x. */ template batch_type_t sin(const simd_base& x); /** * Computes the cosine of the batch \c x. * @param x batch of floating point values. * @return the cosine of \c x. */ template batch_type_t cos(const simd_base& x); /** * Computes the sine and the cosine of the batch \c x. This method is faster * than calling sine and cosine independently. * @param x batch of floating point values. * @param si the sine of x. * @param co the cosine of x. */ template void sincos(const simd_base& x, batch_type_t& si, batch_type_t& co); /** * Computes the tangent of the batch \c x. * @param x batch of floating point values. 
* @return the tangent of \c x. */ template batch_type_t tan(const simd_base& x); /** * Computes the arc sine of the batch \c x. * @param x batch of floating point values. * @return the arc sine of \c x. */ template batch_type_t asin(const simd_base& x); /** * Computes the arc cosine of the batch \c x. * @param x batch of floating point values. * @return the arc cosine of \c x. */ template batch_type_t acos(const simd_base& x); /** * Computes the arc tangent of the batch \c x. * @param x batch of floating point values. * @return the arc tangent of \c x. */ template batch_type_t atan(const simd_base& x); /** * Computes the arc tangent of the batch \c x/y, using the signs of the * arguments to determine the correct quadrant. * @param x batch of floating point values. * @param y batch of floating point values. * @return the arc tangent of \c x/y. */ template batch_type_t atan2(const simd_base& y, const simd_base& x); namespace detail { /* origin: boost/simd/arch/common/detail/simd/trig_base.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template struct trigo_kernel { template static inline B sin(const B& a, Tag = Tag()) { const B x = abs(a); B xr = nan(); const B n = trigo_reducer::reduce(x, xr); auto tmp = select(n >= B(2.), B(1.), B(0.)); auto swap_bit = fma(B(-2.), tmp, n); auto sign_bit = bitofsign(a) ^ select(tmp != B(0.), signmask(), B(0.)); const B z = xr * xr; const B se = trigo_evaluation::sin_eval(z, xr); const B ce = trigo_evaluation::cos_eval(z); const B z1 = select(swap_bit == B(0.), se, ce); return z1 ^ sign_bit; } static inline B cos(const B& a) { const B x = abs(a); B xr = nan(); const B n = trigo_reducer::reduce(x, xr); auto tmp = select(n >= B(2.), B(1.), B(0.)); auto swap_bit = fma(B(-2.), tmp, n); auto sign_bit = select((swap_bit ^ tmp) != B(0.), signmask(), B(0.)); const B z = xr * xr; const B se = trigo_evaluation::sin_eval(z, xr); const B ce = trigo_evaluation::cos_eval(z); const B z1 = select(swap_bit != B(0.), se, ce); return z1 ^ sign_bit; } static inline B tan(const B& a) { const B x = abs(a); B xr = nan(); const B n = trigo_reducer::reduce(x, xr); auto tmp = select(n >= B(2.), B(1.), B(0.)); auto swap_bit = fma(B(-2.), tmp, n); auto test = (swap_bit == B(0.)); const B y = trigo_evaluation::tan_eval(xr, test); return y ^ bitofsign(a); } static inline void sincos(const B& a, B& si, B& co) { const B x = abs(a); B xr = nan(); const B n = trigo_reducer::reduce(x, xr); auto tmp = select(n >= B(2.), B(1.), B(0.)); auto swap_bit = fma(B(-2.), tmp, n); const B z = xr * xr; const B se = trigo_evaluation::sin_eval(z, xr); const B ce = trigo_evaluation::cos_eval(z); auto sin_sign_bit = bitofsign(a) ^ select(tmp != B(0.), signmask(), B(0.)); const B sin_z1 = select(swap_bit == B(0.), se, ce); si = sin_z1 ^ sin_sign_bit; auto cos_sign_bit = select((swap_bit ^ tmp) != B(0.), signmask(), B(0.)); const B cos_z1 = select(swap_bit != B(0.), se, ce); co = cos_z1 ^ cos_sign_bit; } }; } template inline batch_type_t sin(const simd_base& x) { return detail::trigo_kernel>::sin(x()); } template inline batch_type_t cos(const simd_base& x) { return detail::trigo_kernel>::cos(x()); } template inline void sincos(const simd_base& x, batch_type_t& si, batch_type_t& co) { detail::trigo_kernel>::sincos(x(), si, co); } template inline batch_type_t tan(const simd_base& x) { return 
detail::trigo_kernel>::tan(x()); } template inline batch_type_t asin(const simd_base& x) { return detail::invtrigo_kernel>::asin(x()); } template inline batch_type_t acos(const simd_base& x) { return detail::invtrigo_kernel>::acos(x()); } template inline batch_type_t atan(const simd_base& x) { return detail::invtrigo_kernel>::atan(x()); } template inline batch_type_t atan2(const simd_base& y, const simd_base& x) { return detail::invtrigo_kernel>::atan2(y(), x()); } } #endif xsimd-7.6.0/include/xsimd/memory/000077500000000000000000000000001410101234500167165ustar00rootroot00000000000000xsimd-7.6.0/include/xsimd/memory/xsimd_aligned_allocator.hpp000066400000000000000000000245431410101234500243060ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_ALIGNED_ALLOCATOR_HPP #define XSIMD_ALIGNED_ALLOCATOR_HPP #include #include #include #include #include #include "../config/xsimd_align.hpp" #if defined(XSIMD_ALLOCA) #if defined(__GNUC__) #include #elif defined(_MSC_VER) #include #endif #endif namespace xsimd { /** * @class aligned_allocator * @brief Allocator for aligned memory * * The aligned_allocator class template is an allocator that * performs memory allocation aligned by the specified value. * * @tparam T type of objects to allocate. * @tparam Align alignment in bytes. */ template class aligned_allocator { public: using value_type = T; using pointer = T*; using const_pointer = const T*; using reference = T&; using const_reference = const T&; using size_type = size_t; using difference_type = ptrdiff_t; static constexpr size_t alignment = Align; template struct rebind { using other = aligned_allocator; }; aligned_allocator() noexcept; aligned_allocator(const aligned_allocator& rhs) noexcept; template aligned_allocator(const aligned_allocator& rhs) noexcept; ~aligned_allocator(); pointer address(reference) noexcept; const_pointer address(const_reference) const noexcept; pointer allocate(size_type n, const void* hint = 0); void deallocate(pointer p, size_type n); size_type max_size() const noexcept; size_type size_max() const noexcept; template void construct(U* p, Args&&... args); template void destroy(U* p); }; template bool operator==(const aligned_allocator& lhs, const aligned_allocator& rhs) noexcept; template bool operator!=(const aligned_allocator& lhs, const aligned_allocator& rhs) noexcept; void* aligned_malloc(size_t size, size_t alignment); void aligned_free(void* ptr); template size_t get_alignment_offset(const T* p, size_t size, size_t block_size); /************************************ * aligned_allocator implementation * ************************************/ /** * Default constructor. */ template inline aligned_allocator::aligned_allocator() noexcept { } /** * Copy constructor. */ template inline aligned_allocator::aligned_allocator(const aligned_allocator&) noexcept { } /** * Extended copy constructor. */ template template inline aligned_allocator::aligned_allocator(const aligned_allocator&) noexcept { } /** * Destructor. */ template inline aligned_allocator::~aligned_allocator() { } /** * Returns the actual address of \c r even in presence of overloaded \c operator&. 
* @param r the object to acquire address of. * @return the actual address of \c r. */ template inline auto aligned_allocator::address(reference r) noexcept -> pointer { return &r; } /** * Returns the actual address of \c r even in presence of overloaded \c operator&. * @param r the object to acquire address of. * @return the actual address of \c r. */ template inline auto aligned_allocator::address(const_reference r) const noexcept -> const_pointer { return &r; } /** * Allocates n * sizeof(T) bytes of uninitialized memory, aligned by \c A. * The alignment may require some extra memory allocation. * @param n the number of objects to allocate storage for. * @param hint unused parameter provided for standard compliance. * @return a pointer to the first byte of a memory block suitably aligned and sufficient to * hold an array of \c n objects of type \c T. */ template inline auto aligned_allocator::allocate(size_type n, const void*) -> pointer { pointer res = reinterpret_cast(aligned_malloc(sizeof(T) * n, A)); if (res == nullptr) throw std::bad_alloc(); return res; } /** * Deallocates the storage referenced by the pointer p, which must be a pointer obtained by * an earlier call to allocate(). The argument \c n must be equal to the first argument of the call * to allocate() that originally produced \c p; otherwise, the behavior is undefined. * @param p pointer obtained from allocate(). * @param n number of objects earlier passed to allocate(). */ template inline void aligned_allocator::deallocate(pointer p, size_type) { aligned_free(p); } /** * Returns the maximum theoretically possible value of \c n, for which the * call allocate(n, 0) could succeed. * @return the maximum supported allocated size. */ template inline auto aligned_allocator::max_size() const noexcept -> size_type { return size_type(-1) / sizeof(T); } /** * This method is deprecated, use max_size() instead */ template inline auto aligned_allocator::size_max() const noexcept -> size_type { return size_type(-1) / sizeof(T); } /** * Constructs an object of type \c T in allocated uninitialized memory * pointed to by \c p, using placement-new. * @param p pointer to allocated uninitialized memory. * @param args the constructor arguments to use. */ template template inline void aligned_allocator::construct(U* p, Args&&... args) { new ((void*)p) U(std::forward(args)...); } /** * Calls the destructor of the object pointed to by \c p. * @param p pointer to the object that is going to be destroyed. */ template template inline void aligned_allocator::destroy(U* p) { p->~U(); } /** * @defgroup allocator_comparison Comparison operators */ /** * @ingroup allocator_comparison * Compares two aligned memory allocator for equality. Since allocators * are stateless, return \c true iff A1 == A2. * @param lhs aligned_allocator to compare. * @param rhs aligned_allocator to compare. * @return true if the allocators have the same alignment. */ template inline bool operator==(const aligned_allocator& lhs, const aligned_allocator& rhs) noexcept { return lhs.alignment == rhs.alignment; } /** * @ingroup allocator_comparison * Compares two aligned memory allocator for inequality. Since allocators * are stateless, return \c true iff A1 != A2. * @param lhs aligned_allocator to compare. * @param rhs aligned_allocator to compare. * @return true if the allocators have different alignments. 
*/ template inline bool operator!=(const aligned_allocator& lhs, const aligned_allocator& rhs) noexcept { return !(lhs == rhs); } /**************************************** * aligned malloc / free implementation * ****************************************/ namespace detail { inline void* xaligned_malloc(size_t size, size_t alignment) { assert(((alignment & (alignment - 1)) == 0) && "alignment must be a power of two"); assert((alignment >= sizeof(void*)) && "alignment must be at least the size of a pointer"); void* res = nullptr; #ifdef _WIN32 res = _aligned_malloc(size, alignment); #else if(posix_memalign(&res, alignment, size) != 0) { res = nullptr; } #endif return res; } inline void xaligned_free(void* ptr) { #ifdef _WIN32 _aligned_free(ptr); #else free(ptr); #endif } } inline void* aligned_malloc(size_t size, size_t alignment) { return detail::xaligned_malloc(size, alignment); } inline void aligned_free(void* ptr) { detail::xaligned_free(ptr); } template inline size_t get_alignment_offset(const T* p, size_t size, size_t block_size) { // size_t block_size = simd_traits::size; if (block_size == 1) { // The simd_block consists of exactly one scalar so that all // elements of the array // are "well" aligned. return 0; } else if (size_t(p) & (sizeof(T) - 1)) { // The array is not aligned to the size of a single element, so that // no element // of the array is well aligned return size; } else { size_t block_mask = block_size - 1; return std::min( (block_size - ((size_t(p) / sizeof(T)) & block_mask)) & block_mask, size); } } } #endif xsimd-7.6.0/include/xsimd/memory/xsimd_aligned_stack_buffer.hpp000066400000000000000000000102351410101234500247550ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_ALIGNED_STACK_BUFFER_HPP #define XSIMD_ALIGNED_STACK_BUFFER_HPP #include #include "xsimd_aligned_allocator.hpp" namespace xsimd { template class aligned_stack_buffer { public: using allocator = aligned_allocator; using value_type = typename allocator::value_type; using pointer = typename allocator::pointer; using const_pointer = typename allocator::const_pointer; using reference = typename allocator::reference; using const_reference = typename allocator::const_reference; using size_type = typename allocator::size_type; static constexpr alignment = allocator::alignment; explicit aligned_stack_buffer(size_type n); ~aligned_stack_buffer(); aligned_stack_buffer(const aligned_stack_buffer&) = delete; aligned_stack_buffer& operator=(const aligned_stack_buffer&) = delete; aligned_stack_buffer(aligned_stack_buffer&&) = delete; aligned_stack_buffer& operator=(aligned_stack_buffer&&) = delete; size_type size() const noexcept; reference operator[](size_type); const_reference operator[](size_type) const; operator pointer() noexcept; private: pointer m_ptr; size_type m_size; bool m_heap_allocation; }; /*************************************** * aligned_stack_buffer implementation * ***************************************/ namespace detail { inline void* aligned_alloc_stack(size_t size, size_t alignment) { return reinterpret_cast( reinterpret_cast(XSIMD_ALLOCA(size + alignment)) & ~(size_t(alignment - 1))) + alignment; } } template inline aligned_stack_buffer::aligned_stack_buffer(size_type n) : m_size(n) { #ifdef XSIMD_ALLOCA if (sizeof(T) * n <= XSIMD_STACK_ALLOCATION_LIMIT) { m_ptr = reinterpret_cast( (reinterpret_cast(XSIMD_ALLOCA(n + A)) & ~(size_t(A - 1))) + A); m_heap_allocation = false; } else { m_ptr = reinterpret_cast(aligned_malloc(n, A)); m_heap_allocation = true; } #else m_ptr = reinterpret_cast(aligned_malloc(n, A)); m_heap_allocation = true; #endif } template inline aligned_stack_buffer::~aligned_stack_buffer() { if (!std::is_trivially_destructible::value && m_ptr != 0) { for (auto p = m_ptr; p < m_ptr + m_size; ++p) { p->~T(); } } if (m_heap_allocation) { aligned_free(m_ptr); } } template inline auto aligned_stack_buffer::size() const noexcept -> size_type { return m_size; } template inline auto aligned_stack_buffer::operator[](size_type i) -> reference { return m_ptr[i]; } template inline auto aligned_stack_buffer::operator[](size_type i) const -> const_reference { return m_ptr[i]; } template inline aligned_stack_buffer::operator pointer() noexcept { return m_ptr; } } #endif xsimd-7.6.0/include/xsimd/memory/xsimd_alignment.hpp000066400000000000000000000043241410101234500226140ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_ALIGNMENT_HPP #define XSIMD_ALIGNMENT_HPP #include "../config/xsimd_align.hpp" #include "xsimd_aligned_allocator.hpp" namespace xsimd { /** * @struct aligned_mode * @brief tag for load and store of aligned memory. */ struct aligned_mode { }; /** * @struct unaligned_mode * @brief tag for load and store of unaligned memory. 
*/ struct unaligned_mode { }; /*********************** * Allocator alignment * ***********************/ template struct allocator_alignment { using type = unaligned_mode; }; #if defined(XSIMD_DEFAULT_ALIGNMENT) template struct allocator_alignment> { using type = aligned_mode; }; #endif template using allocator_alignment_t = typename allocator_alignment::type; /*********************** * container alignment * ***********************/ namespace detail { template struct void_t { using type = void; }; } template struct container_alignment { using type = unaligned_mode; }; template struct container_alignment::type> { using type = allocator_alignment_t; }; template using container_alignment_t = typename container_alignment::type; } #endif xsimd-7.6.0/include/xsimd/memory/xsimd_load_store.hpp000066400000000000000000000613711410101234500227760ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_LOAD_STORE_HPP #define XSIMD_LOAD_STORE_HPP #include "../config/xsimd_config.hpp" #include "../types/xsimd_traits.hpp" namespace xsimd { /****************************** * Data transfer instructions * ******************************/ /** * @defgroup data_transfer Data Transfer Instructions */ /** * @ingroup data_transfer * Returns a batch with all values initialized to \c value. * @param value the scalar used to initialize the batch. * @return the batch wrapping the highest available instruction set. */ template simd_return_type set_simd(const T1& value); /** * @ingroup data_transfer * Loads the memory array pointed to by \c src into a batch and returns it. * \c src is required to be aligned. * @param src the pointer to the memory array to load. * @return the batch wrapping the highest available instruction set. */ template simd_return_type load_aligned(const T1* src); /** * @ingroup data_transfer * Loads the memory array pointed to by \c src into the batch \c dst. * \c src is required to be aligned. * @param src the pointer to the memory array to load. * @param dst the destination batch. */ template void load_aligned(const T1* src, simd_type& dst); /** * @ingroup data_transfer * Loads the memory arrays pointed to by \c real_src and \c imag_src * into a batch of complex numbers and returns it. \c real_src and * \c imag_src are required to be aligned. * @param real_src the pointer to the memory array containing the real part. * @param imag_src the pointer to the memory array containing the imaginary part. * @return the batch of complex wrapping the highest available instruction set. */ template simd_return_type load_aligned(const T1* real_src, const T1* imag_src); /** * @ingroup data_transfer * Loads the memory arrays pointed to by \c real_src and \c imag_src * into the batch \c dst. \c real_src and \c imag_src are required to be aligned. * @param real_src the pointer to the memory array containing the real part. * @param imag_src the pointer to the memory array containing the imaginary part. * @param dst the destination batch. 
*/ template void load_aligned(const T1* real_src, const T1* imag_src, simd_type& dst); /** * @ingroup data_transfer * Loads the memory array pointed to by \c src into a batch and returns it. * \c src is not required to be aligned. * @param src the pointer to the memory array to load. * @return the batch wrapping the highest available instruction set. */ template simd_return_type load_unaligned(const T1* src); /** * @ingroup data_transfer * Loads the memory array pointed to by \c src into the batch \c dst. * \c src is not required to be aligned. * @param src the pointer to the memory array to load. * @param dst the destination batch. */ template void load_unaligned(const T1* src, simd_type& dst); /** * @ingroup data_transfer * Loads the memory arrays pointed to by \c real_src and \c imag_src * into a batch of complex numbers and returns it. \c real_src and * \c imag_src are not required to be aligned. * @param real_src the pointer to the memory array containing the real part. * @param imag_src the pointer to the memory array containing the imaginary part. * @return the batch of complex wrapping the highest available instruction set. */ template simd_return_type load_unaligned(const T1* real_src, const T1* imag_src); /** * @ingroup data_transfer * Loads the memory arrays pointed to by \c real_src and \c imag_src * into the batch \c dst. \c real_src and \c imag_src are not required to be aligned. * @param real_src the pointer to the memory array containing the real part. * @param imag_src the pointer to the memory array containing the imaginary part. * @param dst the destination batch. */ template void load_unaligned(const T1* real_src, const T1* imag_src, simd_type& dst); /** * @ingroup data_transfer * Stores the batch \c src into the memory array pointed to by \c dst. * \c dst is required to be aligned. * @param dst the pointer to the memory array. * @param src the batch to store. */ template void store_aligned(T1* dst, const simd_type& src); /** * @ingroup data_transfer * Stores the boolean batch \c src into the memory array pointed to by \c dst. * \c dst is required to be aligned. * @param dst the pointer to the memory array. * @param src the boolean batch to store. */ template void store_aligned(T1* dst, const simd_bool_type& src); /** * @ingroup data_transfer * Stores the batch \c src into the memory array pointed to by \c dst. * \c dst is not required to be aligned. * @param dst the pointer to the memory array. * @param src the batch to store. */ template void store_unaligned(T1* dst, const simd_type& src); /** * @ingroup data_transfer * Stores the boolean batch \c src into the memory array pointed to by \c dst. * \c dst is not required to be aligned. * @param dst the pointer to the memory array. * @param src the boolean batch to store. */ template void store_unaligned(T1* dst, const simd_bool_type& src); /** * @ingroup data_transfer * Stores the batch of complex numbers \c src into the memory arrays pointed * to by \c real_dst and \c imag_dst. \c real_dst and \c imag_dst are required * to be aligned. * @param real_dst the pointer to the memory array of the real part. * @param imag_dst the pointer to the memory array of the imaginary part. * @param src the batch to store. */ template void store_aligned(T1* real_dst, T1* imag_dst, const simd_type& src); /** * @ingroup data_transfer * Stores the batch of complex numbers \c src into the memory arrays pointed * to by \c real_dst and \c imag_dst. \c real_dst and \c imag_dst are not required * to be aligned. 
* @param real_dst the pointer to the memory array of the real part. * @param imag_dst the pointer to the memory array of the imaginary part. * @param src the batch to store. */ template void store_unaligned(T1* real_dst, T1* imag_dst, const simd_type& src); // Load / store generic functions /** * @defgroup generic_load_store Generic load and store */ /** * @ingroup generic_load_store * Loads the memory array pointed to by \c src into a batch and returns it. * \c src is required to be aligned. * @param src the pointer to the memory array to load. * @return the batch wrapping the highest available instruction set. */ template simd_return_type load_simd(const T1* src, aligned_mode); /** * @ingroup generic_load_store * Loads the memory array pointed to by \c src into the batch \c dst. * \c src is required to be aligned. * @param src the pointer to the memory array to load. * @param dst the destination batch. */ template void load_simd(const T1* src, simd_type& dst, aligned_mode); /** * @ingroup generic_load_store * Loads the memory arrays pointed to by \c real_src and \c imag_src * into a batch of complex numbers and returns it. \c real_src and * \c imag_src are required to be aligned. * @param real_src the pointer to the memory array containing the real part. * @param imag_src the pointer to the memory array containing the imaginary part. * @return the batch of complex wrapping the highest available instruction set. */ template simd_return_type load_simd(const T1* real_src, const T1* imag_src, aligned_mode); /** * @ingroup generic_load_store * Loads the memory arrays pointed to by \c real_src and \c imag_src * into the batch \c dst. \c real_src and \c imag_src are required to be aligned. * @param real_src the pointer to the memory array containing the real part. * @param imag_src the pointer to the memory array containing the imaginary part. * @param dst the destination batch. */ template void load_simd(const T1* real_src, const T1* imag_src, simd_type& dst, aligned_mode); /** * @ingroup generic_load_store * Loads the memory array pointed to by \c src into a batch and returns it. * \c src is not required to be aligned. * @param src the pointer to the memory array to load. * @return the batch wrapping the highest available instruction set. */ template simd_return_type load_simd(const T1* src, unaligned_mode); /** * @ingroup generic_load_store * Loads the memory array pointed to by \c src into the batch \c dst. * \c src is not required to be aligned. * @param src the pointer to the memory array to load. * @param dst the destination batch. */ template void load_simd(const T1* src, simd_type& dst, unaligned_mode); /** * @ingroup generic_load_store * Loads the memory arrays pointed to by \c real_src and \c imag_src * into a batch of complex numbers and returns it. \c real_src and * \c imag_src are not required to be aligned. * @param real_src the pointer to the memory array containing the real part. * @param imag_src the pointer to the memory array containing the imaginary part. * @return the batch of complex wrapping the highest available instruction set. */ template simd_return_type load_simd(const T1* real_src, const T1* imag_src, unaligned_mode); /** * @ingroup generic_load_store * Loads the memory arrays pointed to by \c real_src and \c imag_src * into the batch \c dst. \c real_src and \c imag_src are not required to be aligned. * @param real_src the pointer to the memory array containing the real part. * @param imag_src the pointer to the memory array containing the imaginary part. 
* @param dst the destination batch. */ template void load_simd(const T1* real_src, const T1* imag_src, simd_type& dst, unaligned_mode); /** * @ingroup generic_load_store * Stores the batch \c src into the memory array pointed to by \c dst. * \c dst is required to be aligned. * @param dst the pointer to the memory array. * @param src the batch to store. */ template void store_simd(T1* dst, const simd_type& src, aligned_mode); /** * @ingroup generic_load_store * Stores the boolean batch \c src into the memory array pointed to by \c dst. * \c dst is required to be aligned. * @param dst the pointer to the memory array. * @param src the boolean batch to store. */ template void store_simd(T1* dst, const simd_bool_type& src, aligned_mode); /** * @ingroup generic_load_store * Stores the batch \c src into the memory array pointed to by \c dst. * \c dst is not required to be aligned. * @param dst the pointer to the memory array. * @param src the batch to store. */ template void store_simd(T1* dst, const simd_type& src, unaligned_mode); /** * @ingroup generic_load_store * Stores the boolean batch \c src into the memory array pointed to by \c dst. * \c dst is not required to be aligned. * @param dst the pointer to the memory array. * @param src the boolean batch to store. */ template void store_simd(T1* dst, const simd_bool_type& src, unaligned_mode); /** * @ingroup generic_load_store * Stores the batch of complex numbers \c src into the memory arrays pointed * to by \c real_dst and \c imag_dst. \c real_dst and \c imag_dst are required * to be aligned. * @param real_dst the pointer to the memory array of the real part. * @param imag_dst the pointer to the memory array of the imaginary part. * @param src the batch to store. */ template void store_simd(T1* real_dst, T1* imag_dst, const simd_type& src, aligned_mode); /** * @ingroup generic_load_store * Stores the batch of complex numbers \c src into the memory arrays pointed * to by \c real_dst and \c imag_dst. \c real_dst and \c imag_dst are not required * to be aligned. * @param real_dst the pointer to the memory array of the real part. * @param imag_dst the pointer to the memory array of the imaginary part. * @param src the batch to store. */ template void store_simd(T1* real_dst, T1* imag_dst, const simd_type& src, unaligned_mode); // Prefetch template void prefetch(const T* address); /*************************** * detail implementation ***************************/ namespace detail { // Common implementation of SIMD functions for types supported // by vectorization. 
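// simd_function_invoker below forwards every free function to the matching
// batch member (res.load_aligned(src), src.store_unaligned(dst), ...), and
// simd_complex_invoker does the same for the split real/imaginary overloads;
// the scalar specialization further down degrades to plain reads, writes and
// assignments for value types that have no batch counterpart.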
template struct simd_function_invoker { inline static V set_simd(const T& value) { using batch_value_type = typename V::value_type; using value_type = typename std::conditional::value, bool, batch_value_type>::type; return V(value_type(value)); } inline static V load_aligned(const T* src) { V res; return res.load_aligned(src); } inline static void load_aligned(const T* src, V& dst) { dst.load_aligned(src); } inline static V load_unaligned(const T* src) { V res; return res.load_unaligned(src); } inline static void load_unaligned(const T* src, V& dst) { dst.load_unaligned(src); } inline static void store_aligned(T* dst, const V& src) { src.store_aligned(dst); } inline static void store_unaligned(T* dst, const V& src) { src.store_unaligned(dst); } }; template struct simd_complex_invoker { inline static V load_aligned(const T* real_src, const T* imag_src) { V res; return res.load_aligned(real_src, imag_src); } inline static void load_aligned(const T* real_src, const T* imag_src, V& dst) { dst.load_aligned(real_src, imag_src); } inline static V load_unaligned(const T* real_src, const T* imag_src) { V res; return res.load_unaligned(real_src, imag_src); } inline static void load_unaligned(const T* real_src, const T* imag_src, V& dst) { dst.load_unaligned(real_src, imag_src); } inline static void store_aligned(T* real_dst, T* imag_dst, const V& src) { src.store_aligned(real_dst, imag_dst); } inline static void store_unaligned(T* real_dst, T* imag_dst, const V& src) { src.store_unaligned(real_dst, imag_dst); } }; // Default implementation of SIMD functions for types not supported // by vectorization. template struct simd_function_invoker { inline static T set_simd(const T& value) { return value; } inline static T load_aligned(const T* src) { return *src; } inline static void load_aligned(const T* src, T& dst) { dst = *src; } inline static T load_unaligned(const T* src) { return *src; } inline static void load_unaligned(const T* src, T& dst) { dst = *src; } inline static void store_aligned(T* dst, const T& src) { *dst = src; } inline static void store_unaligned(T* dst, const T& src) { *dst = src; } }; } /*********************************************** * Data transfer instructions implementation ***********************************************/ template inline simd_return_type set_simd(const T1& value) { return detail::simd_function_invoker>::set_simd(value); } template inline simd_return_type load_aligned(const T1* src) { return detail::simd_function_invoker>::load_aligned(src); } template inline void load_aligned(const T1* src, simd_type& dst) { detail::simd_function_invoker>::load_aligned(src, dst); } template inline simd_return_type load_aligned(const T1* real_src, const T1* imag_src) { return detail::simd_complex_invoker>::load_aligned(real_src, imag_src); } template inline void load_aligned(const T1* real_src, const T1* imag_src, simd_type& dst) { detail::simd_complex_invoker>::load_aligned(real_src, imag_src, dst); } template inline simd_return_type load_unaligned(const T1* src) { return detail::simd_function_invoker>::load_unaligned(src); } template inline void load_unaligned(const T1* src, simd_type& dst) { detail::simd_function_invoker>::load_unaligned(src, dst); } template inline simd_return_type load_unaligned(const T1* real_src, const T1* imag_src) { return detail::simd_complex_invoker>::load_unaligned(real_src, imag_src); } template inline void load_unaligned(const T1* real_src, const T1* imag_src, simd_type& dst) { detail::simd_complex_invoker>::load_unaligned(real_src, imag_src, dst); 
} template inline void store_aligned(T1* dst, const simd_type& src) { detail::simd_function_invoker>::store_aligned(dst, src); } template inline void store_aligned(T1* dst, const simd_bool_type& src) { detail::simd_function_invoker>::store_aligned(dst, src); } template inline void store_unaligned(T1* dst, const simd_type& src) { detail::simd_function_invoker>::store_unaligned(dst, src); } template inline void store_unaligned(T1* dst, const simd_bool_type& src) { detail::simd_function_invoker>::store_unaligned(dst, src); } template inline void store_aligned(T1* real_dst, T1* imag_dst, const simd_type& src) { detail::simd_complex_invoker>::store_aligned(real_dst, imag_dst, src); } template inline void store_unaligned(T1* real_dst, T1* imag_dst, const simd_type& src) { detail::simd_complex_invoker>::store_unaligned(real_dst, imag_dst, src); } /*************************************************** * Load / store generic functions implementation ***************************************************/ template inline simd_return_type load_simd(const T1* src, aligned_mode) { return load_aligned(src); } template inline void load_simd(const T1* src, simd_type& dst, aligned_mode) { load_aligned(src, dst); } template inline simd_return_type load_simd(const T1* real_src, const T1* imag_src, aligned_mode) { return load_aligned(real_src, imag_src); } template inline void load_simd(const T1* real_src, const T1* imag_src, simd_type& dst, aligned_mode) { load_aligned(real_src, imag_src, dst); } template inline simd_return_type load_simd(const T1* src, unaligned_mode) { return load_unaligned(src); } template inline void load_simd(const T1* src, simd_type& dst, unaligned_mode) { load_unaligned(src, dst); } template inline simd_return_type load_simd(const T1* real_src, const T1* imag_src, unaligned_mode) { return load_unaligned(real_src, imag_src); } template inline void load_simd(const T1* real_src, const T1* imag_src, simd_type& dst, unaligned_mode) { load_unaligned(real_src, imag_src, dst); } template inline void store_simd(T1* dst, const simd_type& src, aligned_mode) { store_aligned(dst, src); } template inline void store_simd(T1* dst, const simd_bool_type& src, aligned_mode) { store_aligned(dst, src); } template inline void store_simd(T1* dst, const simd_type& src, unaligned_mode) { store_unaligned(dst, src); } template inline void store_simd(T1* dst, const simd_bool_type& src, unaligned_mode) { store_unaligned(dst, src); } template inline void store_simd(T1* real_dst, T1* imag_dst, const simd_type& src, aligned_mode) { store_aligned(real_dst, imag_dst, src); } template inline void store_simd(T1* real_dst, T1* imag_dst, const simd_type& src, unaligned_mode) { store_unaligned(real_dst, imag_dst, src); } /***************************** * Prefetch implementation *****************************/ template inline void prefetch(const T* /*address*/) { } #if XSIMD_X86_INSTR_SET > XSIMD_VERSION_NUMBER_NOT_AVAILABLE template <> inline void prefetch(const int32_t* address) { _mm_prefetch(reinterpret_cast(address), _MM_HINT_T0); } template <> inline void prefetch(const int64_t* address) { _mm_prefetch(reinterpret_cast(address), _MM_HINT_T0); } template <> inline void prefetch(const float* address) { _mm_prefetch(reinterpret_cast(address), _MM_HINT_T0); } template <> inline void prefetch(const double* address) { _mm_prefetch(reinterpret_cast(address), _MM_HINT_T0); } #endif } #endif 
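// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the original xsimd sources: a minimal
// "axpy" kernel showing how the load/store entry points documented above are
// typically combined. The function name, the remainder-loop structure and the
// use of the unaligned variants are assumptions made for this example; only
// simd_traits, load_unaligned and store_unaligned come from the library.
// ---------------------------------------------------------------------------

#include <cstddef>
#include <xsimd/xsimd.hpp>

// Computes out[i] = a * x[i] + y[i]. No pointer is assumed to be aligned,
// so the vectorized part uses the unaligned load/store overloads.
inline void axpy(float a, const float* x, const float* y,
                 float* out, std::size_t n)
{
    using traits = xsimd::simd_traits<float>;
    using batch_type = traits::type;           // batch<float, N>, or float on scalar builds
    constexpr std::size_t bs = traits::size;

    batch_type va(a);                           // broadcast the scalar coefficient
    std::size_t vec_end = n - n % bs;
    for (std::size_t i = 0; i < vec_end; i += bs)
    {
        batch_type bx, by;
        xsimd::load_unaligned(&x[i], bx);       // no alignment requirement on x
        xsimd::load_unaligned(&y[i], by);
        xsimd::store_unaligned(&out[i], va * bx + by);
    }
    for (std::size_t i = vec_end; i < n; ++i)   // scalar tail
    {
        out[i] = a * x[i] + y[i];
    }
}

// The aligned overloads instead require memory obtained through
// aligned_allocator / aligned_malloc (documented above) or an explicit offset
// check with get_alignment_offset, which is what xsimd::transform and
// xsimd::reduce in stl/algorithms.hpp below do.
// ---------------------------------------------------------------------------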
xsimd-7.6.0/include/xsimd/stl/000077500000000000000000000000001410101234500162105ustar00rootroot00000000000000xsimd-7.6.0/include/xsimd/stl/algorithms.hpp000066400000000000000000000175011410101234500210760ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_ALGORITHMS_HPP #define XSIMD_ALGORITHMS_HPP #include "../memory/xsimd_load_store.hpp" namespace xsimd { template void transform(I1 first, I2 last, O1 out_first, UF&& f) { using value_type = typename std::decay::type; using traits = simd_traits; using batch_type = typename traits::type; std::size_t size = static_cast(std::distance(first, last)); std::size_t simd_size = traits::size; const auto* ptr_begin = &(*first); auto* ptr_out = &(*out_first); std::size_t align_begin = xsimd::get_alignment_offset(ptr_begin, size, simd_size); std::size_t out_align = xsimd::get_alignment_offset(ptr_out, size, simd_size); std::size_t align_end = align_begin + ((size - align_begin) & ~(simd_size - 1)); if (align_begin == out_align) { for (std::size_t i = 0; i < align_begin; ++i) { out_first[i] = f(first[i]); } batch_type batch; for (std::size_t i = align_begin; i < align_end; i += simd_size) { xsimd::load_aligned(&first[i], batch); xsimd::store_aligned(&out_first[i], f(batch)); } for (std::size_t i = align_end; i < size; ++i) { out_first[i] = f(first[i]); } } else { for (std::size_t i = 0; i < align_begin; ++i) { out_first[i] = f(first[i]); } batch_type batch; for (std::size_t i = align_begin; i < align_end; i += simd_size) { xsimd::load_aligned(&first[i], batch); xsimd::store_unaligned(&out_first[i], f(batch)); } for (std::size_t i = align_end; i < size; ++i) { out_first[i] = f(first[i]); } } } template void transform(I1 first_1, I2 last_1, I3 first_2, O1 out_first, UF&& f) { using value_type = typename std::decay::type; using traits = simd_traits; using batch_type = typename traits::type; std::size_t size = static_cast(std::distance(first_1, last_1)); std::size_t simd_size = traits::size; const auto* ptr_begin_1 = &(*first_1); const auto* ptr_begin_2 = &(*first_2); auto* ptr_out = &(*out_first); std::size_t align_begin_1 = xsimd::get_alignment_offset(ptr_begin_1, size, simd_size); std::size_t align_begin_2 = xsimd::get_alignment_offset(ptr_begin_2, size, simd_size); std::size_t out_align = xsimd::get_alignment_offset(ptr_out, size, simd_size); std::size_t align_end = align_begin_1 + ((size - align_begin_1) & ~(simd_size - 1)); #define XSIMD_LOOP_MACRO(A1, A2, A3) \ for (std::size_t i = 0; i < align_begin_1; ++i) \ { \ out_first[i] = f(first_1[i], first_2[i]); \ } \ \ batch_type batch_1, batch_2; \ for (std::size_t i = align_begin_1; i < align_end; i += simd_size) \ { \ xsimd::A1(&first_1[i], batch_1); \ xsimd::A2(&first_2[i], batch_2); \ xsimd::A3(&out_first[i], f(batch_1, batch_2)); \ } \ \ for (std::size_t i = align_end; i < size; ++i) \ { \ out_first[i] = f(first_1[i], first_2[i]); \ } \ if (align_begin_1 == out_align && align_begin_1 == align_begin_2) { XSIMD_LOOP_MACRO(load_aligned, load_aligned, store_aligned); } else if (align_begin_1 == out_align && align_begin_1 != align_begin_2) { XSIMD_LOOP_MACRO(load_aligned, load_unaligned, store_aligned); } 
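// Remaining cases: the output does not share the first input's alignment, so
// stores go through store_unaligned; the second input is also loaded
// unaligned when its offset differs from the first input's.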
else if (align_begin_1 != out_align && align_begin_1 == align_begin_2) { XSIMD_LOOP_MACRO(load_aligned, load_aligned, store_unaligned); } else if (align_begin_1 != out_align && align_begin_1 != align_begin_2) { XSIMD_LOOP_MACRO(load_aligned, load_unaligned, store_unaligned); } #undef XSIMD_LOOP_MACRO } // TODO: Remove this once we drop C++11 support namespace detail { struct plus { template auto operator()(X&& x, Y&& y) -> decltype(x + y) { return x + y; } }; } template Init reduce(Iterator1 first, Iterator2 last, Init init, BinaryFunction&& binfun = detail::plus{}) { using value_type = typename std::decay::type; using traits = simd_traits; using batch_type = typename traits::type; std::size_t size = static_cast(std::distance(first, last)); constexpr std::size_t simd_size = traits::size; if(size < simd_size) { while(first != last) { init = binfun(init, *first++); } return init; } const auto* const ptr_begin = &(*first); std::size_t align_begin = xsimd::get_alignment_offset(ptr_begin, size, simd_size); std::size_t align_end = align_begin + ((size - align_begin) & ~(simd_size - 1)); // reduce initial unaligned part for (std::size_t i = 0; i < align_begin; ++i) { init = binfun(init, first[i]); } // reduce aligned part batch_type batch_init, batch; auto ptr = ptr_begin + align_begin; xsimd::load_aligned(ptr, batch_init); ptr += simd_size; for (auto const end = ptr_begin + align_end; ptr < end; ptr += simd_size) { xsimd::load_aligned(ptr, batch); batch_init = binfun(batch_init, batch); } // reduce across batch alignas(batch_type) std::array arr; xsimd::store_aligned(arr.data(), batch_init); for (auto x : arr) init = binfun(init, x); // reduce final unaligned part for (std::size_t i = align_end; i < size; ++i) { init = binfun(init, first[i]); } return init; } } #endif xsimd-7.6.0/include/xsimd/stl/iterator.hpp000066400000000000000000000130471410101234500205570ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_ITERATOR_HPP #define XSIMD_ITERATOR_HPP namespace xsimd { template class batch_proxy; template struct simd_batch_traits> : simd_batch_traits { }; template struct simd_batch_inner_types> { using batch_reference = X; using const_batch_reference = X; }; /** * Aligned proxy that iterators can dereference to */ template class batch_proxy : public simd_base> { public: using self_type = batch_proxy; using base_type = simd_type; using batch_reference = typename base_type::batch_reference; using const_batch_reference = typename base_type::const_batch_reference; using batch_type = B; using value_type = typename B::value_type; using pointer = value_type*; batch_proxy(pointer ptr); batch_reference get(); const_batch_reference get() const; self_type& set(const batch_type& rhs); self_type& set(const self_type& rhs); operator batch_type() const; batch_proxy& operator=(const batch_type& rhs); private: value_type* m_ptr; }; template struct is_proxy : std::false_type { }; template struct is_proxy> : std::true_type { }; template std::ostream& operator<<(std::ostream& os, const batch_proxy& bp) { return os << bp.get(); } template class aligned_iterator { public: using self_type = aligned_iterator; using batch_type = B; using value_type = typename B::value_type; static constexpr std::size_t batch_size = B::size; using proxy_type = batch_proxy; using pointer = value_type*; using reference = proxy_type; aligned_iterator(pointer memory); reference operator*(); void operator++(int); aligned_iterator& operator++(); bool equal(const aligned_iterator& rhs) const; private: pointer m_cur_pointer; }; /****************************** * batch proxy implementation * *****************************/ template inline batch_proxy::batch_proxy(pointer ptr) : m_ptr(ptr) { } template inline auto batch_proxy::get() -> batch_reference { return batch_type(*this); } template inline auto batch_proxy::get() const -> const_batch_reference { return batch_type(*this); } template inline auto batch_proxy::set(const batch_type& rhs) -> self_type& { xsimd::store_aligned(m_ptr, rhs); return *this; } template inline auto batch_proxy::set(const self_type& rhs) -> self_type& { xsimd::store_aligned(m_ptr, rhs.get()); return *this; } template inline batch_proxy::operator batch_type() const { batch_type m_reg; m_reg.load_aligned(m_ptr); return m_reg; } template inline auto batch_proxy::operator=(const batch_type& rhs) -> batch_proxy& { xsimd::store_aligned(m_ptr, rhs); return *this; } /*********************************** * aligned iterator implementation * ***********************************/ template inline aligned_iterator::aligned_iterator(pointer memory) : m_cur_pointer(memory) { } template inline auto aligned_iterator::operator*() -> reference { return m_cur_pointer; } template inline aligned_iterator& aligned_iterator::operator++() { m_cur_pointer += batch_size; return *this; } template inline void aligned_iterator::operator++(int) { m_cur_pointer += batch_size; } template inline bool aligned_iterator::equal(const aligned_iterator& rhs) const { return m_cur_pointer == rhs.m_cur_pointer; } template inline bool operator==(const aligned_iterator& lhs, const aligned_iterator& rhs) { return lhs.equal(rhs); } template inline bool operator!=(const aligned_iterator& lhs, const aligned_iterator& rhs) { return !lhs.equal(rhs); } #if defined(_WIN32) && defined(__clang__) // See comment at the end of simd_base.hpp template inline B fma(const batch_proxy& a, const 
batch_proxy& b, const batch_proxy& c) { using base_type = simd_base>; const base_type& sba = a; const base_type& sbb = b; const base_type& sbc = c; return fma(sba, sbb, sbc); } #endif } #endif xsimd-7.6.0/include/xsimd/types/000077500000000000000000000000001410101234500165525ustar00rootroot00000000000000xsimd-7.6.0/include/xsimd/types/xsimd_avx512_bool.hpp000066400000000000000000000302521410101234500225320ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX512_BOOL_HPP #define XSIMD_AVX512_BOOL_HPP #include "xsimd_avx512_int_base.hpp" #include "xsimd_utils.hpp" namespace xsimd { /******************* * bool_mask_proxy * *******************/ template class bool_mask_proxy { public: bool_mask_proxy(MASK& ref, std::size_t idx); bool_mask_proxy(const bool_mask_proxy&) = default; bool_mask_proxy& operator=(const bool_mask_proxy&) = default; bool_mask_proxy(bool_mask_proxy&&) = default; bool_mask_proxy& operator=(bool_mask_proxy&&) = default; operator bool() const; bool_mask_proxy& operator=(bool); private: MASK& m_ref; std::size_t m_idx; }; /********************* * batch_bool_avx512 * *********************/ template class batch_bool_avx512 : public simd_batch_bool { public: batch_bool_avx512(); explicit batch_bool_avx512(bool b); template > batch_bool_avx512(Args... args); batch_bool_avx512(const bool (&init)[sizeof(MASK) * 8]); batch_bool_avx512(const MASK& rhs); batch_bool_avx512& operator=(const __m512& rhs); bool_mask_proxy operator[](std::size_t index); bool operator[](std::size_t index) const; operator MASK() const; private: B& load_array(const std::array& src); template B& load_values(Args... args); MASK m_value; friend class simd_batch_bool; }; /****************************** * avx512_fallback_batch_bool * ******************************/ template class avx512_fallback_batch_bool : public simd_batch_bool> { public: avx512_fallback_batch_bool(); explicit avx512_fallback_batch_bool(bool b); template > avx512_fallback_batch_bool(Args... args); avx512_fallback_batch_bool(const __m512i& rhs); avx512_fallback_batch_bool& operator=(const __m512i& rhs); operator __m512i() const; bool_proxy operator[](std::size_t index); bool operator[](std::size_t index) const; private: template batch_bool& load_values(Args... b); union { __m512i m_value; T m_array[N]; }; friend class simd_batch_bool>; }; /********************************** * bool_mask_proxy implementation * **********************************/ template inline bool_mask_proxy::bool_mask_proxy(MASK& ref, std::size_t idx) : m_ref(ref), m_idx(idx) { } template inline bool_mask_proxy::operator bool() const { return ((m_ref >> m_idx) & MASK(1)) != 0; } template inline bool_mask_proxy& bool_mask_proxy::operator=(bool rhs) { MASK tmp = static_cast(rhs); m_ref ^= (-tmp ^ m_ref) & (MASK(1) << m_idx); return *this; } /************************************ * batch_bool_avx512 implementation * ************************************/ template inline batch_bool_avx512::batch_bool_avx512() { } template template inline batch_bool_avx512::batch_bool_avx512(Args... 
args) : batch_bool_avx512({{static_cast(args)...}}) { } template inline batch_bool_avx512::batch_bool_avx512(bool b) : m_value(b ? -1 : 0) { } namespace detail { template constexpr T get_init_value_impl(const bool (&/*init*/)[sizeof(T) * 8]) { return T(0); } template constexpr T get_init_value_impl(const bool (&init)[sizeof(T) * 8]) { return (T(init[IX]) << IX) | get_init_value_impl(init); } template constexpr T get_init_value(const bool (&init)[sizeof(T) * 8], detail::index_sequence) { return get_init_value_impl(init); } } template inline batch_bool_avx512::batch_bool_avx512(const bool (&init)[sizeof(MASK) * 8]) : m_value(detail::get_init_value(init, detail::make_index_sequence{})) { } template inline batch_bool_avx512::batch_bool_avx512(const MASK& rhs) : m_value(rhs) { } template inline batch_bool_avx512::operator MASK() const { return m_value; } template inline bool_mask_proxy batch_bool_avx512::operator[](std::size_t idx) { std::size_t s = simd_batch_traits::size - 1; return bool_mask_proxy(m_value, idx & s); } template inline bool batch_bool_avx512::operator[](std::size_t idx) const { std::size_t s = simd_batch_traits::size - 1; return (m_value & (MASK(1) << (idx & s))) != 0; } template inline T& batch_bool_avx512::load_array(const std::array& src) { MASK tmp(false); for(std::size_t i = 0; i < sizeof(MASK) * 8; ++i) { tmp |= MASK(src[i]) << i; } m_value = tmp; return (*this)(); } template template inline T& batch_bool_avx512::load_values(Args... b) { return load_array({{b...}}); } namespace detail { template struct batch_bool_kernel_avx512 { using batch_type = batch_bool; using mt = typename mask_type::type; static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) { return mt(lhs) & mt(rhs); } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { return mt(lhs) | mt(rhs); } static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { return mt(lhs) ^ mt(rhs); } static batch_type bitwise_not(const batch_type& rhs) { return ~mt(rhs); } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { return mt(lhs) ^ mt(rhs); } static batch_type equal(const batch_type& lhs, const batch_type& rhs) { return (~mt(lhs)) ^ mt(rhs); } static batch_type not_equal(const batch_type& lhs, const batch_type& rhs) { return mt(lhs) ^ mt(rhs); } static bool all(const batch_type& rhs) { return mt(rhs) == mt(-1); } static bool any(const batch_type& rhs) { return mt(rhs) != mt(0); } }; } /********************************************* * avx512_fallback_batch_bool implementation * *********************************************/ template inline avx512_fallback_batch_bool::avx512_fallback_batch_bool() { } template inline avx512_fallback_batch_bool::avx512_fallback_batch_bool(bool b) : m_value(_mm512_set1_epi64(-(int64_t)b)) { } template template inline avx512_fallback_batch_bool::avx512_fallback_batch_bool(Args... 
args) : m_value(avx512_detail::int_init(std::integral_constant{}, static_cast(-static_cast(args))...)) { } template inline avx512_fallback_batch_bool::avx512_fallback_batch_bool(const __m512i& rhs) : m_value(rhs) { } template inline avx512_fallback_batch_bool::operator __m512i() const { return m_value; } template inline avx512_fallback_batch_bool& avx512_fallback_batch_bool::operator=(const __m512i& rhs) { m_value = rhs; return *this; } template inline bool_proxy avx512_fallback_batch_bool::operator[](std::size_t idx) { return bool_proxy(m_array[idx & (N - 1)]); } template inline bool avx512_fallback_batch_bool::operator[](std::size_t idx) const { return static_cast(m_array[idx & (N - 1)]); } template template inline batch_bool& avx512_fallback_batch_bool::load_values(Args... b) { m_value = avx512_detail::int_init(std::integral_constant{}, static_cast(-static_cast(b))...); return (*this)(); } namespace detail { template struct avx512_fallback_batch_bool_kernel { using batch_type = batch_bool; static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) { return _mm512_and_si512(lhs, rhs); } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { return _mm512_or_si512(lhs, rhs); } static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { return _mm512_xor_si512(lhs, rhs); } static batch_type bitwise_not(const batch_type& rhs) { return _mm512_xor_si512(rhs, _mm512_set1_epi64(-1)); // xor with all one } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { return _mm512_andnot_si512(lhs, rhs); } static batch_type equal(const batch_type& lhs, const batch_type& rhs) { return ~(lhs ^ rhs); } static batch_type not_equal(const batch_type& lhs, const batch_type& rhs) { return lhs ^ rhs; } static bool all(const batch_type& rhs) { XSIMD_SPLIT_AVX512(rhs); bool res_hi = _mm256_testc_si256(rhs_high, batch_bool(true)) != 0; bool res_lo = _mm256_testc_si256(rhs_low, batch_bool(true)) != 0; return res_hi && res_lo; } static bool any(const batch_type& rhs) { XSIMD_SPLIT_AVX512(rhs); bool res_hi = !_mm256_testz_si256(rhs_high, rhs_high); bool res_lo = !_mm256_testz_si256(rhs_low, rhs_low); return res_hi || res_lo; } }; } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_avx512_complex.hpp000066400000000000000000000410111410101234500232410ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_AVX512_COMPLEX_HPP #define XSIMD_AVX512_COMPLEX_HPP #include #ifdef XSIMD_ENABLE_XTL_COMPLEX #include "xtl/xcomplex.hpp" #endif #include "xsimd_avx512_float.hpp" #include "xsimd_avx512_double.hpp" #include "xsimd_complex_base.hpp" namespace xsimd { /*************************************** * batch_bool, 16> * ***************************************/ template <> struct simd_batch_traits, 16>> : complex_batch_bool_traits, float, 16, 64> { }; template<> class batch_bool, 16> : public simd_complex_batch_bool, 16>> { public: using self_type = batch_bool, 16>; using base_type = simd_complex_batch_bool; using real_batch = batch_bool; batch_bool() = default; using base_type::base_type; // VS2015 has a bug with inheriting constructors involving SFINAE batch_bool(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7, bool b8, bool b9, bool b10, bool b11, bool b12, bool b13, bool b14, bool b15) : base_type(real_batch(b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15)) { } }; /********************************** * batch, 16> * **********************************/ template <> struct simd_batch_traits, 16>> : complex_batch_traits, float, 16, 64> { }; template <> class batch, 16> : public simd_complex_batch, 16>> { public: using self_type = batch, 16>; using base_type = simd_complex_batch; using value_type = std::complex; using real_batch = batch; batch() = default; using base_type::base_type; // VS2015 has a bug with inheriting constructors involving SFINAE batch(value_type c0, value_type c1, value_type c2, value_type c3, value_type c4, value_type c5, value_type c6, value_type c7, value_type c8, value_type c9, value_type c10, value_type c11, value_type c12, value_type c13, value_type c14, value_type c15) : base_type(real_batch(c0.real(), c1.real(), c2.real(), c3.real(), c4.real(), c5.real(), c6.real(), c7.real(), c8.real(), c9.real(), c10.real(), c11.real(), c12.real(), c13.real(), c14.real(), c15.real()), real_batch(c0.imag(), c1.imag(), c2.imag(), c3.imag(), c4.imag(), c5.imag(), c6.imag(), c7.imag(), c8.imag(), c9.imag(), c10.imag(), c11.imag(), c12.imag(), c13.imag(), c14.imag(), c15.imag())) { } private: batch& load_complex(const real_batch& hi, const real_batch& lo); real_batch get_complex_high() const; real_batch get_complex_low() const; friend class simd_complex_batch, 16>>; }; /*************************************** * batch_bool, 8> * ***************************************/ template <> struct simd_batch_traits, 8>> : complex_batch_bool_traits, double, 8, 64> { }; template<> class batch_bool, 8> : public simd_complex_batch_bool, 8>> { public: using self_type = batch_bool, 8>; using base_type = simd_complex_batch_bool; using real_batch = batch_bool; batch_bool() = default; using base_type::base_type; // VS2015 has a bug with inheriting constructors involving SFINAE batch_bool(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7) : base_type(real_batch(b0, b1, b2, b3, b4, b5, b6, b7)) { } }; /********************************** * batch, 8> * **********************************/ template <> struct simd_batch_traits, 8>> : complex_batch_traits, double, 8, 64> { }; template <> class batch, 8> : public simd_complex_batch, 8>> { public: using self_type = batch, 8>; using base_type = simd_complex_batch; using value_type = std::complex; using real_batch = batch; batch() = default; using base_type::base_type; // VS2015 has a bug with inheriting constructors 
involving SFINAE batch(value_type c0, value_type c1, value_type c2, value_type c3, value_type c4, value_type c5, value_type c6, value_type c7) : base_type(real_batch(c0.real(), c1.real(), c2.real(), c3.real(), c4.real(), c5.real(), c6.real(), c7.real()), real_batch(c0.imag(), c1.imag(), c2.imag(), c3.imag(), c4.imag(), c5.imag(), c6.imag(), c7.imag())) { } private: batch& load_complex(const real_batch& hi, const real_batch& lo); real_batch get_complex_high() const; real_batch get_complex_low() const; friend class simd_complex_batch, 8>>; }; /******************************************** * batch, N> implementation * ********************************************/ inline batch, 16>& batch, 16>::load_complex(const real_batch& hi, const real_batch& lo) { __m512i real_idx = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30); __m512i imag_idx = _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31); this->m_real = _mm512_permutex2var_ps(hi, real_idx, lo); this->m_imag = _mm512_permutex2var_ps(hi, imag_idx, lo); return *this; } inline auto batch, 16>::get_complex_high() const -> real_batch { __m512i idx = _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); return _mm512_permutex2var_ps(this->m_real, idx, this->m_imag); } inline auto batch, 16>::get_complex_low() const -> real_batch { __m512i idx = _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); return _mm512_permutex2var_ps(this->m_real, idx, this->m_imag); } inline batch, 8>& batch, 8>::load_complex(const real_batch& hi, const real_batch& lo) { __m512i real_idx = _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14); __m512i imag_idx = _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15); this->m_real = _mm512_permutex2var_pd(hi, real_idx, lo); this->m_imag = _mm512_permutex2var_pd(hi, imag_idx, lo); return *this; } inline auto batch, 8>::get_complex_high() const -> real_batch { __m512i idx = _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11); return _mm512_permutex2var_pd(this->m_real, idx, this->m_imag); } inline auto batch, 8>::get_complex_low() const -> real_batch { __m512i idx = _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15); return _mm512_permutex2var_pd(this->m_real, idx, this->m_imag); } #ifdef XSIMD_ENABLE_XTL_COMPLEX /***************************************************** * batch_bool, 16> * *****************************************************/ template struct simd_batch_traits, 16>> : complex_batch_bool_traits, float, 16, 64> { }; template class batch_bool, 16> : public simd_complex_batch_bool, 16>> { public: using self_type = batch_bool, 16>; using base_type = simd_complex_batch_bool; using real_batch = batch_bool; batch_bool() = default; using base_type::base_type; // VS2015 has a bug with inheriting constructors involving SFINAE batch_bool(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7, bool b8, bool b9, bool b10, bool b11, bool b12, bool b13, bool b14, bool b15) : base_type(real_batch(b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15)) { } }; /*********************************************** * batch, 16> * ***********************************************/ template struct simd_batch_traits, 16>> : complex_batch_traits, float, 16, 64> { }; template class batch, 16> : public simd_complex_batch, 16>> { public: using self_type = batch, 16>; using base_type = simd_complex_batch; using value_type = xtl::xcomplex; using real_batch = batch; batch() = default; using base_type::base_type; // VS2015 has a bug with inheriting 
constructors involving SFINAE batch(value_type c0, value_type c1, value_type c2, value_type c3, value_type c4, value_type c5, value_type c6, value_type c7, value_type c8, value_type c9, value_type c10, value_type c11, value_type c12, value_type c13, value_type c14, value_type c15) : base_type(real_batch(c0.real(), c1.real(), c2.real(), c3.real(), c4.real(), c5.real(), c6.real(), c7.real(), c8.real(), c9.real(), c10.real(), c11.real(), c12.real(), c13.real(), c14.real(), c15.real()), real_batch(c0.imag(), c1.imag(), c2.imag(), c3.imag(), c4.imag(), c5.imag(), c6.imag(), c7.imag(), c8.imag(), c9.imag(), c10.imag(), c11.imag(), c12.imag(), c13.imag(), c14.imag(), c15.imag())) { } private: batch& load_complex(const real_batch& hi, const real_batch& lo); real_batch get_complex_high() const; real_batch get_complex_low() const; friend class simd_complex_batch, 16>>; }; /****************************************************** * batch_bool, 8> * ******************************************************/ template struct simd_batch_traits, 8>> : complex_batch_bool_traits, double, 8, 64> { }; template class batch_bool, 8> : public simd_complex_batch_bool, 8>> { public: using self_type = batch_bool, 8>; using base_type = simd_complex_batch_bool; using real_batch = batch_bool; batch_bool() = default; using base_type::base_type; // VS2015 has a bug with inheriting constructors involving SFINAE batch_bool(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7) : base_type(real_batch(b0, b1, b2, b3, b4, b5, b6, b7)) { } }; /************************************************* * batch, 8> * *************************************************/ template struct simd_batch_traits, 8>> : complex_batch_traits, double, 8, 64> { }; template class batch, 8> : public simd_complex_batch, 8>> { public: using self_type = batch, 8>; using base_type = simd_complex_batch; using value_type = xtl::xcomplex; using real_batch = batch; batch() = default; using base_type::base_type; // VS2015 has a bug with inheriting constructors involving SFINAE batch(value_type c0, value_type c1, value_type c2, value_type c3, value_type c4, value_type c5, value_type c6, value_type c7) : base_type(real_batch(c0.real(), c1.real(), c2.real(), c3.real(), c4.real(), c5.real(), c6.real(), c7.real()), real_batch(c0.imag(), c1.imag(), c2.imag(), c3.imag(), c4.imag(), c5.imag(), c6.imag(), c7.imag())) { } private: batch& load_complex(const real_batch& hi, const real_batch& lo); real_batch get_complex_high() const; real_batch get_complex_low() const; friend class simd_complex_batch, 8>>; }; /******************************************** * batch, N> implementation * ********************************************/ template inline batch, 16>& batch, 16>::load_complex(const real_batch& hi, const real_batch& lo) { __m512i real_idx = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30); __m512i imag_idx = _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31); this->m_real = _mm512_permutex2var_ps(hi, real_idx, lo); this->m_imag = _mm512_permutex2var_ps(hi, imag_idx, lo); return *this; } template inline auto batch, 16>::get_complex_high() const -> real_batch { __m512i idx = _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); return _mm512_permutex2var_ps(this->m_real, idx, this->m_imag); } template inline auto batch, 16>::get_complex_low() const -> real_batch { __m512i idx = _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); return 
_mm512_permutex2var_ps(this->m_real, idx, this->m_imag); } template inline batch, 8>& batch, 8>::load_complex(const real_batch& hi, const real_batch& lo) { __m512i real_idx = _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14); __m512i imag_idx = _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15); this->m_real = _mm512_permutex2var_pd(hi, real_idx, lo); this->m_imag = _mm512_permutex2var_pd(hi, imag_idx, lo); return *this; } template inline auto batch, 8>::get_complex_high() const -> real_batch { __m512i idx = _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11); return _mm512_permutex2var_pd(this->m_real, idx, this->m_imag); } template inline auto batch, 8>::get_complex_low() const -> real_batch { __m512i idx = _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15); return _mm512_permutex2var_pd(this->m_real, idx, this->m_imag); } #endif } #endif xsimd-7.6.0/include/xsimd/types/xsimd_avx512_conversion.hpp000066400000000000000000000312471410101234500237710ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX512_CONVERSION_HPP #define XSIMD_AVX512_CONVERSION_HPP #include "xsimd_avx512_double.hpp" #include "xsimd_avx512_float.hpp" #include "xsimd_avx512_int8.hpp" #include "xsimd_avx512_int16.hpp" #include "xsimd_avx512_int32.hpp" #include "xsimd_avx512_int64.hpp" namespace xsimd { /************************ * conversion functions * ************************/ batch to_int(const batch& x); batch to_int(const batch& x); batch to_float(const batch& x); batch to_float(const batch& x); batch u8_to_u16(const batch& x); batch u16_to_u8(const batch& x); batch u8_to_u32(const batch& x); batch u32_to_u8(const batch& x); batch u8_to_u64(const batch& x); batch u64_to_u8(const batch& x); /************************** * boolean cast functions * **************************/ batch_bool bool_cast(const batch_bool& x); batch_bool bool_cast(const batch_bool& x); batch_bool bool_cast(const batch_bool& x); batch_bool bool_cast(const batch_bool& x); /******************************* * bitwise_cast implementation * *******************************/ XSIMD_DEFINE_BITWISE_CAST_ALL(8) /*************************************** * conversion functions implementation * ***************************************/ inline batch to_int(const batch& x) { return _mm512_cvttps_epi32(x); } inline batch to_int(const batch& x) { #if defined(XSIMD_AVX512DQ_AVAILABLE) return _mm512_cvttpd_epi64(x); #else return batch(static_cast(x[0]), static_cast(x[1]), static_cast(x[2]), static_cast(x[3]), static_cast(x[4]), static_cast(x[5]), static_cast(x[6]), static_cast(x[7])); #endif } inline batch to_float(const batch& x) { return _mm512_cvtepi32_ps(x); } inline batch to_float(const batch& x) { #if defined(XSIMD_AVX512DQ_AVAILABLE) return _mm512_cvtepi64_pd(x); #else return batch(static_cast(x[0]), static_cast(x[1]), static_cast(x[2]), static_cast(x[3]), static_cast(x[4]), static_cast(x[5]), static_cast(x[6]), static_cast(x[7])); #endif } /***************************************** * batch cast functions implementation * *****************************************/ XSIMD_BATCH_CAST_IMPLICIT(int8_t, uint8_t, 64) XSIMD_BATCH_CAST_IMPLICIT(uint8_t, int8_t, 64) 
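// ---------------------------------------------------------------------------
// Illustrative usage sketch (an added annotation, not part of the upstream
// xsimd 7.6.0 headers). It exercises the conversion entry points declared
// above on an AVX-512 target, where batch<float, 16> and batch<int32_t, 16>
// are the natural widths. The function name avx512_conversion_example and the
// literal values are hypothetical; only to_int / to_float come from this file.
inline void avx512_conversion_example()
{
    batch<float, 16> f(1.5f);             // all sixteen lanes hold 1.5f
    batch<int32_t, 16> i = to_int(f);     // _mm512_cvttps_epi32 truncates: every lane becomes 1
    batch<float, 16> g = to_float(i);     // _mm512_cvtepi32_ps: every lane back to 1.0f
    (void)g;                              // keep the sketch self-contained and warning-free
}
// ---------------------------------------------------------------------------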
XSIMD_BATCH_CAST_IMPLICIT(int16_t, uint16_t, 32) XSIMD_BATCH_CAST_INTRINSIC(int16_t, int32_t, 16, _mm512_cvtepi16_epi32) XSIMD_BATCH_CAST_INTRINSIC(int16_t, uint32_t, 16, _mm512_cvtepi16_epi32) XSIMD_BATCH_CAST_INTRINSIC(int16_t, int64_t, 8, _mm512_cvtepi16_epi64) XSIMD_BATCH_CAST_INTRINSIC(int16_t, uint64_t, 8, _mm512_cvtepi16_epi64) XSIMD_BATCH_CAST_INTRINSIC2(int16_t, float, 16, _mm512_cvtepi16_epi32, _mm512_cvtepi32_ps) XSIMD_BATCH_CAST_IMPLICIT(uint16_t, int16_t, 32) XSIMD_BATCH_CAST_INTRINSIC(uint16_t, int32_t, 16, _mm512_cvtepu16_epi32) XSIMD_BATCH_CAST_INTRINSIC(uint16_t, uint32_t, 16, _mm512_cvtepu16_epi32) XSIMD_BATCH_CAST_INTRINSIC(uint16_t, int64_t, 8, _mm512_cvtepu16_epi64) XSIMD_BATCH_CAST_INTRINSIC(uint16_t, uint64_t, 8, _mm512_cvtepu16_epi64) XSIMD_BATCH_CAST_INTRINSIC2(uint16_t, float, 16, _mm512_cvtepu16_epi32, _mm512_cvtepi32_ps) XSIMD_BATCH_CAST_INTRINSIC(int32_t, int8_t, 16, _mm512_cvtepi32_epi8) XSIMD_BATCH_CAST_INTRINSIC(int32_t, uint8_t, 16, _mm512_cvtepi32_epi8) XSIMD_BATCH_CAST_INTRINSIC(int32_t, int16_t, 16, _mm512_cvtepi32_epi16) XSIMD_BATCH_CAST_INTRINSIC(int32_t, uint16_t, 16, _mm512_cvtepi32_epi16) XSIMD_BATCH_CAST_IMPLICIT(int32_t, uint32_t, 16) XSIMD_BATCH_CAST_INTRINSIC(int32_t, int64_t, 8, _mm512_cvtepi32_epi64) XSIMD_BATCH_CAST_INTRINSIC(int32_t, uint64_t, 8, _mm512_cvtepi32_epi64) XSIMD_BATCH_CAST_INTRINSIC(int32_t, float, 16, _mm512_cvtepi32_ps) XSIMD_BATCH_CAST_INTRINSIC(int32_t, double, 8, _mm512_cvtepi32_pd) XSIMD_BATCH_CAST_INTRINSIC(uint32_t, int8_t, 16, _mm512_cvtepi32_epi8) XSIMD_BATCH_CAST_INTRINSIC(uint32_t, uint8_t, 16, _mm512_cvtepi32_epi8) XSIMD_BATCH_CAST_INTRINSIC(uint32_t, int16_t, 16, _mm512_cvtepi32_epi16) XSIMD_BATCH_CAST_INTRINSIC(uint32_t, uint16_t, 16, _mm512_cvtepi32_epi16) XSIMD_BATCH_CAST_IMPLICIT(uint32_t, int32_t, 16) XSIMD_BATCH_CAST_INTRINSIC(uint32_t, int64_t, 8, _mm512_cvtepu32_epi64) XSIMD_BATCH_CAST_INTRINSIC(uint32_t, uint64_t, 8, _mm512_cvtepu32_epi64) XSIMD_BATCH_CAST_INTRINSIC(uint32_t, float, 16, _mm512_cvtepu32_ps) XSIMD_BATCH_CAST_INTRINSIC(uint32_t, double, 8, _mm512_cvtepu32_pd) XSIMD_BATCH_CAST_INTRINSIC(int64_t, int16_t, 8, _mm512_cvtepi64_epi16) XSIMD_BATCH_CAST_INTRINSIC(int64_t, uint16_t, 8, _mm512_cvtepi64_epi16) XSIMD_BATCH_CAST_INTRINSIC(int64_t, int32_t, 8, _mm512_cvtepi64_epi32) XSIMD_BATCH_CAST_INTRINSIC(int64_t, uint32_t, 8, _mm512_cvtepi64_epi32) XSIMD_BATCH_CAST_IMPLICIT(int64_t, uint64_t, 8) XSIMD_BATCH_CAST_INTRINSIC(uint64_t, int16_t, 8, _mm512_cvtepi64_epi16) XSIMD_BATCH_CAST_INTRINSIC(uint64_t, uint16_t, 8, _mm512_cvtepi64_epi16) XSIMD_BATCH_CAST_INTRINSIC(uint64_t, int32_t, 8, _mm512_cvtepi64_epi32) XSIMD_BATCH_CAST_INTRINSIC(uint64_t, uint32_t, 8, _mm512_cvtepi64_epi32) XSIMD_BATCH_CAST_IMPLICIT(uint64_t, int64_t, 8) XSIMD_BATCH_CAST_INTRINSIC2(float, int8_t, 16, _mm512_cvttps_epi32, _mm512_cvtepi32_epi8) XSIMD_BATCH_CAST_INTRINSIC2(float, uint8_t, 16, _mm512_cvttps_epi32, _mm512_cvtepi32_epi8) XSIMD_BATCH_CAST_INTRINSIC2(float, int16_t, 16, _mm512_cvttps_epi32, _mm512_cvtepi32_epi16) XSIMD_BATCH_CAST_INTRINSIC2(float, uint16_t, 16, _mm512_cvttps_epi32, _mm512_cvtepi32_epi16) XSIMD_BATCH_CAST_INTRINSIC(float, int32_t, 16, _mm512_cvttps_epi32) XSIMD_BATCH_CAST_INTRINSIC(float, uint32_t, 16, _mm512_cvttps_epu32) XSIMD_BATCH_CAST_INTRINSIC(float, double, 8, _mm512_cvtps_pd) XSIMD_BATCH_CAST_INTRINSIC(double, int32_t, 8, _mm512_cvttpd_epi32) XSIMD_BATCH_CAST_INTRINSIC(double, uint32_t, 8, _mm512_cvttpd_epu32) XSIMD_BATCH_CAST_INTRINSIC(double, float, 8, _mm512_cvtpd_ps) #if 
defined(XSIMD_AVX512BW_AVAILABLE) XSIMD_BATCH_CAST_INTRINSIC(int8_t, int16_t, 32, _mm512_cvtepi8_epi16) XSIMD_BATCH_CAST_INTRINSIC(int8_t, uint16_t, 32, _mm512_cvtepi8_epi16) XSIMD_BATCH_CAST_INTRINSIC(int8_t, int32_t, 16, _mm512_cvtepi8_epi32) XSIMD_BATCH_CAST_INTRINSIC(int8_t, uint32_t, 16, _mm512_cvtepi8_epi32) XSIMD_BATCH_CAST_INTRINSIC2(int8_t, float, 16, _mm512_cvtepi8_epi32, _mm512_cvtepi32_ps) XSIMD_BATCH_CAST_INTRINSIC(uint8_t, int16_t, 32, _mm512_cvtepu8_epi16) XSIMD_BATCH_CAST_INTRINSIC(uint8_t, uint16_t, 32, _mm512_cvtepu8_epi16) XSIMD_BATCH_CAST_INTRINSIC(uint8_t, int32_t, 16, _mm512_cvtepu8_epi32) XSIMD_BATCH_CAST_INTRINSIC(uint8_t, uint32_t, 16, _mm512_cvtepu8_epi32) XSIMD_BATCH_CAST_INTRINSIC2(uint8_t, float, 16, _mm512_cvtepu8_epi32, _mm512_cvtepi32_ps) XSIMD_BATCH_CAST_INTRINSIC(int16_t, int8_t, 32, _mm512_cvtepi16_epi8) XSIMD_BATCH_CAST_INTRINSIC(int16_t, uint8_t, 32, _mm512_cvtepi16_epi8) XSIMD_BATCH_CAST_INTRINSIC(uint16_t, int8_t, 32, _mm512_cvtepi16_epi8) XSIMD_BATCH_CAST_INTRINSIC(uint16_t, uint8_t, 32, _mm512_cvtepi16_epi8) #endif #if defined(XSIMD_AVX512DQ_AVAILABLE) XSIMD_BATCH_CAST_INTRINSIC2(int16_t, double, 8, _mm512_cvtepi16_epi64, _mm512_cvtepi64_pd) XSIMD_BATCH_CAST_INTRINSIC2(uint16_t, double, 8, _mm512_cvtepu16_epi64, _mm512_cvtepi64_pd) XSIMD_BATCH_CAST_INTRINSIC(int64_t, float, 8, _mm512_cvtepi64_ps) XSIMD_BATCH_CAST_INTRINSIC(int64_t, double, 8, _mm512_cvtepi64_pd) XSIMD_BATCH_CAST_INTRINSIC(uint64_t, float, 8, _mm512_cvtepu64_ps) XSIMD_BATCH_CAST_INTRINSIC(uint64_t, double, 8, _mm512_cvtepu64_pd) XSIMD_BATCH_CAST_INTRINSIC(float, int64_t, 8, _mm512_cvttps_epi64) XSIMD_BATCH_CAST_INTRINSIC(float, uint64_t, 8, _mm512_cvttps_epu64) XSIMD_BATCH_CAST_INTRINSIC2(double, int16_t, 8, _mm512_cvttpd_epi64, _mm512_cvtepi64_epi16) XSIMD_BATCH_CAST_INTRINSIC2(double, uint16_t, 8, _mm512_cvttpd_epi64, _mm512_cvtepi64_epi16) XSIMD_BATCH_CAST_INTRINSIC(double, int64_t, 8, _mm512_cvttpd_epi64) XSIMD_BATCH_CAST_INTRINSIC(double, uint64_t, 8, _mm512_cvttpd_epu64) #endif inline batch u8_to_u16(const batch& x) { return static_cast>(x); } inline batch u16_to_u8(const batch& x) { return static_cast>(x); } inline batch u8_to_u32(const batch& x) { return static_cast>(x); } inline batch u32_to_u8(const batch& x) { return static_cast>(x); } inline batch u8_to_u64(const batch& x) { return static_cast>(x); } inline batch u64_to_u8(const batch& x) { return static_cast>(x); } /************************** * boolean cast functions * **************************/ inline batch_bool bool_cast(const batch_bool& x) { return __mmask16(x); } inline batch_bool bool_cast(const batch_bool& x) { return __mmask8(x); } inline batch_bool bool_cast(const batch_bool& x) { return __mmask16(x); } inline batch_bool bool_cast(const batch_bool& x) { return __mmask8(x); } /***************************************** * bitwise cast functions implementation * *****************************************/ XSIMD_BITWISE_CAST_INTRINSIC(float, 16, double, 8, _mm512_castps_pd) XSIMD_BITWISE_CAST_INTRINSIC(float, 16, int32_t, 16, _mm512_castps_si512) XSIMD_BITWISE_CAST_INTRINSIC(float, 16, int64_t, 8, _mm512_castps_si512) XSIMD_BITWISE_CAST_INTRINSIC(double, 8, float, 16, _mm512_castpd_ps) XSIMD_BITWISE_CAST_INTRINSIC(double, 8, int32_t, 16, _mm512_castpd_si512) XSIMD_BITWISE_CAST_INTRINSIC(double, 8, int64_t, 8, _mm512_castpd_si512) XSIMD_BITWISE_CAST_INTRINSIC(int32_t, 16, float, 16, _mm512_castsi512_ps) XSIMD_BITWISE_CAST_INTRINSIC(int32_t, 16, double, 8, _mm512_castsi512_pd) XSIMD_BITWISE_CAST_INTRINSIC(int64_t, 8, 
float, 16, _mm512_castsi512_ps) XSIMD_BITWISE_CAST_INTRINSIC(int64_t, 8, double, 8, _mm512_castsi512_pd) } #endif xsimd-7.6.0/include/xsimd/types/xsimd_avx512_double.hpp000066400000000000000000000451521410101234500230560ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX512_DOUBLE_HPP #define XSIMD_AVX512_DOUBLE_HPP #include "xsimd_avx512_bool.hpp" #include "xsimd_base.hpp" namespace xsimd { /************************* * batch_bool * *************************/ template <> struct simd_batch_traits> { using value_type = double; static constexpr std::size_t size = 8; using batch_type = batch; static constexpr std::size_t align = 0; }; template <> class batch_bool : public batch_bool_avx512<__mmask8, batch_bool> { public: using base_class = batch_bool_avx512<__mmask8, batch_bool>; using base_class::base_class; }; namespace detail { template <> struct batch_bool_kernel : batch_bool_kernel_avx512 { }; } /******************** * batch * ********************/ template <> struct simd_batch_traits> { using value_type = double; static constexpr std::size_t size = 8; using batch_bool_type = batch_bool; static constexpr std::size_t align = 64; using storage_type = __m512d; }; template <> class batch : public simd_batch> { public: using self_type = batch; using base_type = simd_batch; batch(); explicit batch(double d); batch(double d0, double d1, double d2, double d3, double d4, double d5, double d6, double d7); explicit batch(const double* src); batch(const double* src, aligned_mode); batch(const double* src, unaligned_mode); batch(const __m512d& rhs); batch& operator=(const __m512d& rhs); batch(const batch_bool& rhs); batch& operator=(const batch_bool& rhs); operator __m512d() const; XSIMD_DECLARE_LOAD_STORE_ALL(double, 8) XSIMD_DECLARE_LOAD_STORE_LONG(double, 8) using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; }; /*********************************** * batch implementation * ***********************************/ inline batch::batch() { } inline batch::batch(double d) : base_type(_mm512_set1_pd(d)) { } inline batch::batch(double d0, double d1, double d2, double d3, double d4, double d5, double d6, double d7) : base_type(_mm512_setr_pd(d0, d1, d2, d3, d4, d5, d6, d7)) { } inline batch::batch(const double* src) : base_type(_mm512_loadu_pd(src)) { } inline batch::batch(const double* src, aligned_mode) : base_type(_mm512_load_pd(src)) { } inline batch::batch(const double* src, unaligned_mode) : base_type(_mm512_loadu_pd(src)) { } inline batch::batch(const __m512d& rhs) : base_type(rhs) { } inline batch& batch::operator=(const __m512d& rhs) { this->m_value = rhs; return *this; } inline batch::operator __m512d() const { return this->m_value; } XSIMD_DEFINE_LOAD_STORE(double, 8, bool, 64) inline batch& batch::load_aligned(const int8_t* src) { __m128i tmp = _mm_loadl_epi64((const __m128i*)src); __m512i tmp2 = _mm512_cvtepi8_epi64(tmp); this->m_value = _mm512_cvtepi64_pd(tmp2); return *this; } inline batch& batch::load_unaligned(const int8_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const uint8_t* src) { 
__m128i tmp = _mm_loadl_epi64((const __m128i*)src); __m512i tmp2 = _mm512_cvtepu8_epi64(tmp); this->m_value = _mm512_cvtepi64_pd(tmp2); return *this; } inline batch& batch::load_unaligned(const uint8_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const int16_t* src) { __m128i tmp = _mm_load_si128((const __m128i*)src); __m512i tmp2 = _mm512_cvtepi16_epi64(tmp); this->m_value = _mm512_cvtepi64_pd(tmp2); return *this; } inline batch& batch::load_unaligned(const int16_t* src) { __m128i tmp = _mm_loadu_si128((const __m128i*)src); __m512i tmp2 = _mm512_cvtepi16_epi64(tmp); this->m_value = _mm512_cvtepi64_pd(tmp2); return *this; } inline batch& batch::load_aligned(const uint16_t* src) { __m128i tmp = _mm_load_si128((const __m128i*)src); __m512i tmp2 = _mm512_cvtepu16_epi64(tmp); this->m_value = _mm512_cvtepi64_pd(tmp2); return *this; } inline batch& batch::load_unaligned(const uint16_t* src) { __m128i tmp = _mm_loadu_si128((const __m128i*)src); __m512i tmp2 = _mm512_cvtepu16_epi64(tmp); this->m_value = _mm512_cvtepi64_pd(tmp2); return *this; } inline batch& batch::load_aligned(const int32_t* src) { this->m_value = _mm512_cvtepi32_pd(_mm256_load_si256((__m256i const*)src)); return *this; } inline batch& batch::load_unaligned(const int32_t* src) { this->m_value = _mm512_cvtepi32_pd(_mm256_loadu_si256((__m256i const*)src)); return *this; } inline batch& batch::load_aligned(const uint32_t* src) { this->m_value = _mm512_cvtepu32_pd(_mm256_load_si256((__m256i const*)src)); return *this; } inline batch& batch::load_unaligned(const uint32_t* src) { this->m_value = _mm512_cvtepu32_pd(_mm256_loadu_si256((__m256i const*)src)); return *this; } XSIMD_DEFINE_LOAD_STORE(double, 8, int64_t, 64) XSIMD_DEFINE_LOAD_STORE(double, 8, uint64_t, 64) XSIMD_DEFINE_LOAD_STORE_LONG(double, 8, 64) inline batch& batch::load_aligned(const float* src) { this->m_value = _mm512_cvtps_pd(_mm256_load_ps(src)); return *this; } inline batch& batch::load_unaligned(const float* src) { this->m_value = _mm512_cvtps_pd(_mm256_loadu_ps(src)); return *this; } inline batch& batch::load_aligned(const double* src) { this->m_value = _mm512_load_pd(src); return *this; } inline batch& batch::load_unaligned(const double* src) { this->m_value = _mm512_loadu_pd(src); return *this; } inline void batch::store_aligned(int8_t* dst) const { __m512i tmp = _mm512_cvtpd_epi64(this->m_value); __m128i tmp2 = _mm512_cvtepi64_epi8(tmp); _mm_storel_epi64((__m128i*)dst, tmp2); } inline void batch::store_unaligned(int8_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(uint8_t* dst) const { __m512i tmp = _mm512_cvtpd_epi64(this->m_value); __m128i tmp2 = _mm512_cvtusepi64_epi8(tmp); _mm_storel_epi64((__m128i*)dst, tmp2); } inline void batch::store_unaligned(uint8_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(int16_t* dst) const { __m512i tmp = _mm512_cvtpd_epi64(this->m_value); __m128i tmp2 = _mm512_cvtepi64_epi16(tmp); _mm_store_si128((__m128i*)dst, tmp2); } inline void batch::store_unaligned(int16_t* dst) const { __m512i tmp = _mm512_cvtpd_epi64(this->m_value); __m128i tmp2 = _mm512_cvtepi64_epi16(tmp); _mm_storeu_si128((__m128i*)dst, tmp2); } inline void batch::store_aligned(uint16_t* dst) const { __m512i tmp = _mm512_cvtpd_epi64(this->m_value); __m128i tmp2 = _mm512_cvtusepi64_epi16(tmp); _mm_store_si128((__m128i*)dst, tmp2); } inline void batch::store_unaligned(uint16_t* dst) const { __m512i tmp = _mm512_cvtpd_epi64(this->m_value); __m128i tmp2 = _mm512_cvtusepi64_epi16(tmp); 
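// The eight doubles were converted to 64-bit integers (_mm512_cvtpd_epi64, current rounding
// mode) and narrowed to 16-bit lanes with unsigned saturation (_mm512_cvtusepi64_epi16);
// the 128-bit result is written out below with an unaligned store.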
_mm_storeu_si128((__m128i*)dst, tmp2); } inline void batch::store_aligned(int32_t* dst) const { _mm256_store_si256((__m256i*)dst, _mm512_cvtpd_epi32(this->m_value)); } inline void batch::store_unaligned(int32_t* dst) const { _mm256_storeu_si256((__m256i*)dst, _mm512_cvtpd_epi32(this->m_value)); } inline void batch::store_aligned(uint32_t* dst) const { _mm256_store_si256((__m256i*)dst, _mm512_cvtpd_epu32(this->m_value)); } inline void batch::store_unaligned(uint32_t* dst) const { _mm256_storeu_si256((__m256i*)dst, _mm512_cvtpd_epu32(this->m_value)); } inline void batch::store_aligned(float* dst) const { _mm256_store_ps(dst, _mm512_cvtpd_ps(this->m_value)); } inline void batch::store_unaligned(float* dst) const { _mm256_storeu_ps(dst, _mm512_cvtpd_ps(this->m_value)); } inline void batch::store_aligned(double* dst) const { _mm512_store_pd(dst, this->m_value); } inline void batch::store_unaligned(double* dst) const { _mm512_storeu_pd(dst, this->m_value); } namespace detail { template <> struct batch_kernel { using batch_type = batch; using value_type = double; using batch_bool_type = batch_bool; static batch_type neg(const batch_type& rhs) { return _mm512_xor_pd(rhs, _mm512_castsi512_pd(_mm512_set1_epi64(0x8000000000000000))); } static batch_type add(const batch_type& lhs, const batch_type& rhs) { return _mm512_add_pd(lhs, rhs); } static batch_type sub(const batch_type& lhs, const batch_type& rhs) { return _mm512_sub_pd(lhs, rhs); } static batch_type sadd(const batch_type& lhs, const batch_type& rhs) { return add(lhs, rhs); //do something for inf ? } static batch_type ssub(const batch_type& lhs, const batch_type& rhs) { return sub(lhs, rhs); //do something for inf ? } static batch_type mul(const batch_type& lhs, const batch_type& rhs) { return _mm512_mul_pd(lhs, rhs); } static batch_type div(const batch_type& lhs, const batch_type& rhs) { return _mm512_div_pd(lhs, rhs); } static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs) { return _mm512_cmp_pd_mask(lhs, rhs, _CMP_EQ_OQ); } static batch_bool_type neq(const batch_type& lhs, const batch_type& rhs) { return _mm512_cmp_pd_mask(lhs, rhs, _CMP_NEQ_OQ); } static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { return _mm512_cmp_pd_mask(lhs, rhs, _CMP_LT_OQ); } static batch_bool_type lte(const batch_type& lhs, const batch_type& rhs) { return _mm512_cmp_pd_mask(lhs, rhs, _CMP_LE_OQ); } static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) { return _mm512_and_pd(lhs, rhs); } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { return _mm512_or_pd(lhs, rhs); } static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { return _mm512_xor_pd(lhs, rhs); } static batch_type bitwise_not(const batch_type& rhs) { return _mm512_xor_pd(rhs, _mm512_castsi512_pd(_mm512_set1_epi32(-1))); } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { return _mm512_andnot_pd(lhs, rhs); } static batch_type min(const batch_type& lhs, const batch_type& rhs) { return _mm512_min_pd(lhs, rhs); } static batch_type max(const batch_type& lhs, const batch_type& rhs) { return _mm512_max_pd(lhs, rhs); } static batch_type fmin(const batch_type& lhs, const batch_type& rhs) { return min(lhs, rhs); } static batch_type fmax(const batch_type& lhs, const batch_type& rhs) { return max(lhs, rhs); } static batch_type abs(const batch_type& rhs) { __m512d rhs_asd = (__m512d)rhs; __m512i rhs_asi = *reinterpret_cast<__m512i*>(&rhs_asd); __m512i res_asi = 
_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF), rhs_asi); return *reinterpret_cast<__m512d*>(&res_asi); } static batch_type fabs(const batch_type& rhs) { return abs(rhs); } static batch_type sqrt(const batch_type& rhs) { return _mm512_sqrt_pd(rhs); } static batch_type fma(const batch_type& x, const batch_type& y, const batch_type& z) { return _mm512_fmadd_pd(x, y, z); } static batch_type fms(const batch_type& x, const batch_type& y, const batch_type& z) { return _mm512_fmsub_pd(x, y, z); } static batch_type fnma(const batch_type& x, const batch_type& y, const batch_type& z) { return _mm512_fnmadd_pd(x, y, z); } static batch_type fnms(const batch_type& x, const batch_type& y, const batch_type& z) { return _mm512_fnmsub_pd(x, y, z); } static value_type hadd(const batch_type& rhs) { __m256d tmp1 = _mm512_extractf64x4_pd(rhs, 1); __m256d tmp2 = _mm512_extractf64x4_pd(rhs, 0); __m256d res1 = _mm256_add_pd(tmp1, tmp2); return xsimd::hadd(batch(res1)); } static batch_type haddp(const batch_type* row) { #define step1(I, a, b) \ batch res ## I; \ { \ auto tmp1 = _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \ auto tmp2 = _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \ res ## I = _mm512_add_pd(tmp1, tmp2); \ } \ step1(1, row[0], row[2]); step1(2, row[4], row[6]); step1(3, row[1], row[3]); step1(4, row[5], row[7]); #undef step1 batch tmp5 = _mm512_shuffle_f64x2(res1, res2, _MM_SHUFFLE(2, 0, 2, 0)); batch tmp6 = _mm512_shuffle_f64x2(res1, res2, _MM_SHUFFLE(3, 1, 3, 1)); batch resx1 = _mm512_add_pd(tmp5, tmp6); batch tmp7 = _mm512_shuffle_f64x2(res3, res4, _MM_SHUFFLE(2, 0, 2, 0)); batch tmp8 = _mm512_shuffle_f64x2(res3, res4, _MM_SHUFFLE(3, 1, 3, 1)); batch resx2 = _mm512_add_pd(tmp7, tmp8); batch tmpx = _mm512_shuffle_pd(resx1, resx2, 0b00000000); batch tmpy = _mm512_shuffle_pd(resx1, resx2, 0b11111111); return tmpx + tmpy; } static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b) { return _mm512_mask_blend_pd(cond, b, a); } static batch_bool_type isnan(const batch_type& x) { return _mm512_cmp_pd_mask(x, x, _CMP_UNORD_Q); } static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs) { return _mm512_unpacklo_pd(lhs, rhs); } static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs) { return _mm512_unpackhi_pd(lhs, rhs); } static batch_type extract_pair(const batch_type& lhs, const batch_type& rhs, const int n) { batch_type b_concatenate; for (int i = 0 ; i < (8 - n); ++i) { b_concatenate[i] = lhs[i + n]; if(i < n) { b_concatenate[8 - 1 - i] = rhs[n - 1 - i]; } } return b_concatenate; } }; } inline batch::batch(const batch_bool& rhs) : base_type(detail::batch_kernel::select(rhs, batch(double(1)), batch(double(0)))) { } inline batch& batch::operator=(const batch_bool& rhs) { this->m_value = detail::batch_kernel::select(rhs, batch(double(1)), batch(double(0))); return *this; } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_avx512_float.hpp000066400000000000000000000563561410101234500227210ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_AVX512_FLOAT_HPP #define XSIMD_AVX512_FLOAT_HPP #include #include "xsimd_avx512_bool.hpp" #include "xsimd_base.hpp" namespace xsimd { /************************* * batch_bool * *************************/ template <> struct simd_batch_traits> { using value_type = float; static constexpr std::size_t size = 16; using batch_type = batch; static constexpr std::size_t align = 0; }; template <> class batch_bool : public batch_bool_avx512<__mmask16, batch_bool> { public: using base_class = batch_bool_avx512<__mmask16, batch_bool>; using base_class::base_class; }; namespace detail { template <> struct batch_bool_kernel : batch_bool_kernel_avx512 { }; } /********************* * batch * *********************/ template <> struct simd_batch_traits> { using value_type = float; using batch_bool_type = batch_bool; static constexpr std::size_t size = 16; static constexpr std::size_t align = 64; using storage_type = __m512; }; template <> class batch : public simd_batch> { public: using self_type = batch; using base_type = simd_batch; batch(); explicit batch(float i); batch(float i0, float i1, float i2, float i3, float i4, float i5, float i6, float i7, float i8, float i9, float i10, float i11, float i12, float i13, float i14, float i15); explicit batch(const float* src); batch(const float* src, aligned_mode); batch(const float* src, unaligned_mode); batch(const __m512& rhs); batch& operator=(const __m512& rhs); batch(const batch_bool& rhs); batch& operator=(const batch_bool& rhs); operator __m512() const; XSIMD_DECLARE_LOAD_STORE_ALL(float, 16) XSIMD_DECLARE_LOAD_STORE_LONG(float, 16) using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; }; /************************************ * batch implementation * ************************************/ inline batch::batch() { } inline batch::batch(float i) : base_type(_mm512_set1_ps(i)) { } inline batch::batch(float i0, float i1, float i2, float i3, float i4, float i5, float i6, float i7, float i8, float i9, float i10, float i11, float i12, float i13, float i14, float i15) : base_type(_mm512_setr_ps(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15)) { } inline batch::batch(const float* src) : base_type(_mm512_loadu_ps(src)) { } inline batch::batch(const float* src, aligned_mode) : base_type(_mm512_load_ps(src)) { } inline batch::batch(const float* src, unaligned_mode) : base_type(_mm512_loadu_ps(src)) { } inline batch::batch(const __m512& rhs) : base_type(rhs) { } inline batch& batch::operator=(const __m512& rhs) { this->m_value = rhs; return *this; } inline batch::operator __m512() const { return this->m_value; } XSIMD_DEFINE_LOAD_STORE(float, 16, bool, 64) inline batch& batch::load_aligned(const int8_t* src) { __m128i tmp = _mm_load_si128((const __m128i*)src); __m512i tmp2 = _mm512_cvtepi8_epi32(tmp); this->m_value = _mm512_cvtepi32_ps(tmp2); return *this; } inline batch& batch::load_unaligned(const int8_t* src) { __m128i tmp = _mm_loadu_si128((const __m128i*)src); __m512i tmp2 = _mm512_cvtepi8_epi32(tmp); this->m_value = _mm512_cvtepi32_ps(tmp2); return *this; } inline batch& batch::load_aligned(const uint8_t* src) { __m128i tmp = _mm_load_si128((const __m128i*)src); __m512i tmp2 = _mm512_cvtepu8_epi32(tmp); this->m_value = _mm512_cvtepi32_ps(tmp2); return *this; } inline batch& batch::load_unaligned(const uint8_t* src) { __m128i tmp = _mm_loadu_si128((const __m128i*)src); __m512i 
tmp2 = _mm512_cvtepu8_epi32(tmp); this->m_value = _mm512_cvtepi32_ps(tmp2); return *this; } inline batch& batch::load_aligned(const int16_t* src) { __m256i tmp = _mm256_load_si256((const __m256i*)src); __m512i tmp2 = _mm512_cvtepi16_epi32(tmp); this->m_value = _mm512_cvtepi32_ps(tmp2); return *this; } inline batch& batch::load_unaligned(const int16_t* src) { __m256i tmp = _mm256_loadu_si256((const __m256i*)src); __m512i tmp2 = _mm512_cvtepi16_epi32(tmp); this->m_value = _mm512_cvtepi32_ps(tmp2); return *this; } inline batch& batch::load_aligned(const uint16_t* src) { __m256i tmp = _mm256_load_si256((const __m256i*)src); __m512i tmp2 = _mm512_cvtepu16_epi32(tmp); this->m_value = _mm512_cvtepi32_ps(tmp2); return *this; } inline batch& batch::load_unaligned(const uint16_t* src) { __m256i tmp = _mm256_loadu_si256((const __m256i*)src); __m512i tmp2 = _mm512_cvtepu16_epi32(tmp); this->m_value = _mm512_cvtepi32_ps(tmp2); return *this; } inline batch& batch::load_aligned(const int32_t* src) { // TODO select correct rounding direction? this->m_value = _mm512_cvt_roundepi32_ps(_mm512_load_si512(src), _MM_FROUND_CUR_DIRECTION); return *this; } inline batch& batch::load_unaligned(const int32_t* src) { this->m_value = _mm512_cvt_roundepi32_ps(_mm512_loadu_si512(src), _MM_FROUND_CUR_DIRECTION); return *this; } inline batch& batch::load_aligned(const uint32_t* src) { // TODO select correct rounding direction? this->m_value = _mm512_cvt_roundepu32_ps(_mm512_load_si512(src), _MM_FROUND_CUR_DIRECTION); return *this; } inline batch& batch::load_unaligned(const uint32_t* src) { this->m_value = _mm512_cvt_roundepu32_ps(_mm512_loadu_si512(src), _MM_FROUND_CUR_DIRECTION); return *this; } XSIMD_DEFINE_LOAD_STORE(float, 16, int64_t, 64) XSIMD_DEFINE_LOAD_STORE(float, 16, uint64_t, 64) XSIMD_DEFINE_LOAD_STORE_LONG(float, 16, 64) inline batch& batch::load_aligned(const float* src) { this->m_value = _mm512_load_ps(src); return *this; } inline batch& batch::load_unaligned(const float* src) { this->m_value = _mm512_loadu_ps(src); return *this; } inline batch& batch::load_aligned(const double* src) { __m256 tmp1 = _mm512_cvtpd_ps(_mm512_load_pd(src)); __m256 tmp2 = _mm512_cvtpd_ps(_mm512_load_pd(src + 8)); this->m_value = _mm512_castps256_ps512(tmp1); this->m_value = _mm512_insertf32x8(this->m_value, tmp2, 1); return *this; } inline batch& batch::load_unaligned(const double* src) { __m256 tmp1 = _mm512_cvtpd_ps(_mm512_loadu_pd(src)); __m256 tmp2 = _mm512_cvtpd_ps(_mm512_loadu_pd(src + 8)); this->m_value = _mm512_castps256_ps512(tmp1); this->m_value = _mm512_insertf32x8(this->m_value, tmp2, 1); return *this; } inline void batch::store_aligned(int8_t* dst) const { __m512i tmp = _mm512_cvtps_epi32(this->m_value); __m128i tmp2 = _mm512_cvtepi32_epi8(tmp); _mm_store_si128((__m128i*)dst, tmp2); } inline void batch::store_unaligned(int8_t* dst) const { __m512i tmp = _mm512_cvtps_epi32(this->m_value); __m128i tmp2 = _mm512_cvtepi32_epi8(tmp); _mm_storeu_si128((__m128i*)dst, tmp2); } inline void batch::store_aligned(uint8_t* dst) const { __m512i tmp = _mm512_cvtps_epu32(this->m_value); __m128i tmp2 = _mm512_cvtusepi32_epi8(tmp); _mm_store_si128((__m128i*)dst, tmp2); } inline void batch::store_unaligned(uint8_t* dst) const { __m512i tmp = _mm512_cvtps_epu32(this->m_value); __m128i tmp2 = _mm512_cvtusepi32_epi8(tmp); _mm_storeu_si128((__m128i*)dst, tmp2); } inline void batch::store_aligned(int16_t* dst) const { __m512i tmp = _mm512_cvtps_epi32(this->m_value); __m256i tmp2 = _mm512_cvtepi32_epi16(tmp); 
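// The sixteen floats were rounded to 32-bit integers (_mm512_cvtps_epi32, current rounding
// mode) and each lane truncated to its low 16 bits (_mm512_cvtepi32_epi16); the 256-bit
// result is stored to aligned memory below.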
_mm256_store_si256((__m256i*)dst, tmp2); } inline void batch::store_unaligned(int16_t* dst) const { __m512i tmp = _mm512_cvtps_epi32(this->m_value); __m256i tmp2 = _mm512_cvtepi32_epi16(tmp); _mm256_storeu_si256((__m256i*)dst, tmp2); } inline void batch::store_aligned(uint16_t* dst) const { __m512i tmp = _mm512_cvtps_epu32(this->m_value); __m256i tmp2 = _mm512_cvtusepi32_epi16(tmp); _mm256_store_si256((__m256i*)dst, tmp2); } inline void batch::store_unaligned(uint16_t* dst) const { __m512i tmp = _mm512_cvtps_epu32(this->m_value); __m256i tmp2 = _mm512_cvtusepi32_epi16(tmp); _mm256_storeu_si256((__m256i*)dst, tmp2); } inline void batch::store_aligned(int32_t* dst) const { _mm512_store_si512((__m512i *)dst, _mm512_cvtps_epi32(this->m_value)); } inline void batch::store_unaligned(int32_t* dst) const { _mm512_storeu_si512((__m512i *)dst, _mm512_cvtps_epi32(this->m_value)); } inline void batch::store_aligned(uint32_t* dst) const { _mm512_store_si512((__m512i *)dst, _mm512_cvtps_epu32(this->m_value)); } inline void batch::store_unaligned(uint32_t* dst) const { _mm512_storeu_si512((__m512i *)dst, _mm512_cvtps_epu32(this->m_value)); } inline void batch::store_aligned(float* dst) const { _mm512_store_ps(dst, this->m_value); } inline void batch::store_unaligned(float* dst) const { _mm512_storeu_ps(dst, this->m_value); } inline void batch::store_aligned(double* dst) const { _mm512_store_pd(dst, _mm512_cvtps_pd(_mm512_extractf32x8_ps(this->m_value, 0))); _mm512_store_pd(dst + 8, _mm512_cvtps_pd(_mm512_extractf32x8_ps(this->m_value, 1))); } inline void batch::store_unaligned(double* dst) const { _mm512_storeu_pd(dst, _mm512_cvtps_pd(_mm512_extractf32x8_ps(this->m_value, 0))); _mm512_storeu_pd(dst + 8, _mm512_cvtps_pd(_mm512_extractf32x8_ps(this->m_value, 1))); } namespace detail { template <> struct batch_kernel { using batch_type = batch; using value_type = float; using batch_bool_type = batch_bool; static batch_type neg(const batch_type& rhs) { return _mm512_sub_ps(_mm512_setzero_ps(), rhs); } static batch_type add(const batch_type& lhs, const batch_type& rhs) { return _mm512_add_ps(lhs, rhs); } static batch_type sub(const batch_type& lhs, const batch_type& rhs) { return _mm512_sub_ps(lhs, rhs); } static batch_type sadd(const batch_type& lhs, const batch_type& rhs) { return add(lhs, rhs); //do something for inf ? } static batch_type ssub(const batch_type& lhs, const batch_type& rhs) { return sub(lhs, rhs); //do something for inf ? 
} static batch_type mul(const batch_type& lhs, const batch_type& rhs) { return _mm512_mul_ps(lhs, rhs); } static batch_type div(const batch_type& lhs, const batch_type& rhs) { return _mm512_div_ps(lhs, rhs); } static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs) { return _mm512_cmp_ps_mask(lhs, rhs, _CMP_EQ_OQ); } static batch_bool_type neq(const batch_type& lhs, const batch_type& rhs) { return _mm512_cmp_ps_mask(lhs, rhs, _CMP_NEQ_OQ); } static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { return _mm512_cmp_ps_mask(lhs, rhs, _CMP_LT_OQ); } static batch_bool_type lte(const batch_type& lhs, const batch_type& rhs) { return _mm512_cmp_ps_mask(lhs, rhs, _CMP_LE_OQ); } static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) { return _mm512_and_ps(lhs, rhs); } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { return _mm512_or_ps(lhs, rhs); } static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { return _mm512_xor_ps(lhs, rhs); } static batch_type bitwise_not(const batch_type& rhs) { return _mm512_xor_ps(rhs, _mm512_castsi512_ps(_mm512_set1_epi32(-1))); } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { return _mm512_andnot_ps(lhs, rhs); } static batch_type min(const batch_type& lhs, const batch_type& rhs) { return _mm512_min_ps(lhs, rhs); } static batch_type max(const batch_type& lhs, const batch_type& rhs) { return _mm512_max_ps(lhs, rhs); } static batch_type fmin(const batch_type& lhs, const batch_type& rhs) { return min(lhs, rhs); } static batch_type fmax(const batch_type& lhs, const batch_type& rhs) { return max(lhs, rhs); } static batch_type abs(const batch_type& rhs) { __m512 rhs_asf = (__m512)rhs; __m512i rhs_asi = *reinterpret_cast<__m512i*>(&rhs_asf); __m512i res_asi = _mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF), rhs_asi); return *reinterpret_cast<__m512*>(&res_asi); } static batch_type fabs(const batch_type& rhs) { return abs(rhs); } static batch_type sqrt(const batch_type& rhs) { return _mm512_sqrt_ps(rhs); } static batch_type fma(const batch_type& x, const batch_type& y, const batch_type& z) { return _mm512_fmadd_ps(x, y, z); } static batch_type fms(const batch_type& x, const batch_type& y, const batch_type& z) { return _mm512_fmsub_ps(x, y, z); } static batch_type fnma(const batch_type& x, const batch_type& y, const batch_type& z) { return _mm512_fnmadd_ps(x, y, z); } static batch_type fnms(const batch_type& x, const batch_type& y, const batch_type& z) { return _mm512_fnmsub_ps(x, y, z); } static value_type hadd(const batch_type& rhs) { __m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1); __m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0); __m256 res1 = _mm256_add_ps(tmp1, tmp2); return xsimd::hadd(batch(res1)); } static batch_type haddp(const batch_type* row) { // The following folds over the vector once: // tmp1 = [a0..8, b0..8] // tmp2 = [a8..f, b8..f] #define XSIMD_AVX512_HADDP_STEP1(I, a, b) \ batch res ## I; \ { \ auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \ auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \ res ## I = _mm512_add_ps(tmp1, tmp2); \ } \ XSIMD_AVX512_HADDP_STEP1(0, row[0], row[2]); XSIMD_AVX512_HADDP_STEP1(1, row[4], row[6]); XSIMD_AVX512_HADDP_STEP1(2, row[1], row[3]); XSIMD_AVX512_HADDP_STEP1(3, row[5], row[7]); XSIMD_AVX512_HADDP_STEP1(4, row[8], row[10]); XSIMD_AVX512_HADDP_STEP1(5, row[12], row[14]); XSIMD_AVX512_HADDP_STEP1(6, row[9], row[11]); XSIMD_AVX512_HADDP_STEP1(7, row[13], row[15]); 
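// At this point res0..res7 each hold the first fold of one pair of input rows; the
// XSIMD_AVX512_HADDP_STEP2 block below folds them again and finishes with _mm256_hadd_ps
// to produce the sixteen horizontal sums.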
#undef XSIMD_AVX512_HADDP_STEP1 // The following flds the code and shuffles so that hadd_ps produces the correct result // tmp1 = [a0..4, a8..12, b0..4, b8..12] (same for tmp3) // tmp2 = [a5..8, a12..16, b5..8, b12..16] (same for tmp4) // tmp5 = [r1[0], r1[2], r2[0], r2[2], r1[4], r1[6] ... #define XSIMD_AVX512_HADDP_STEP2(I, a, b, c, d) \ batch halfx ## I; \ { \ batch tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \ batch tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \ \ batch resx1 = _mm512_add_ps(tmp1, tmp2); \ \ batch tmp3 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \ batch tmp4 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \ \ batch resx2 = _mm512_add_ps(tmp3, tmp4); \ \ batch tmp5 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(2, 0, 2, 0)); \ batch tmp6 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(3, 1, 3, 1)); \ \ batch resx3 = _mm512_add_ps(tmp5, tmp6); \ \ halfx ## I = _mm256_hadd_ps(_mm512_extractf32x8_ps(resx3, 0), \ _mm512_extractf32x8_ps(resx3, 1)); \ } \ XSIMD_AVX512_HADDP_STEP2(0, res0, res1, res2, res3); XSIMD_AVX512_HADDP_STEP2(1, res4, res5, res6, res7); #undef XSIMD_AVX512_HADDP_STEP2 auto concat = _mm512_castps256_ps512(halfx0); concat = _mm512_insertf32x8(concat, halfx1, 1); return concat; } static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b) { #if !defined(_MSC_VER) return _mm512_mask_blend_ps(cond, b, a); #else __m512i mcondi = _mm512_maskz_broadcastd_epi32 ((__mmask16)cond, _mm_set1_epi32(~0)); __m512 mcond = *reinterpret_cast<__m512*>(&mcondi); XSIMD_SPLITPS_AVX512(mcond); XSIMD_SPLITPS_AVX512(a); XSIMD_SPLITPS_AVX512(b); auto res_lo = _mm256_blendv_ps(b_low, a_low, mcond_low); auto res_hi = _mm256_blendv_ps(b_high, a_high, mcond_high); XSIMD_RETURN_MERGEDPS_AVX(res_lo, res_hi); #endif } static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs) { return _mm512_unpacklo_ps(lhs, rhs); } static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs) { return _mm512_unpackhi_ps(lhs, rhs); } static batch_type extract_pair(const batch_type& lhs, const batch_type& rhs, const int n) { batch_type b_concatenate; for (int i = 0 ; i < (16 - n); ++i) { b_concatenate[i] = lhs[i + n]; if(i < n) { b_concatenate[16 - 1 - i] = rhs[n - 1 - i]; } } return b_concatenate; } static batch_bool_type isnan(const batch_type& x) { return _mm512_cmp_ps_mask(x, x, _CMP_UNORD_Q); } }; } inline batch::batch(const batch_bool& rhs) : base_type(detail::batch_kernel::select(rhs, batch(float(1)), batch(float(0)))) { } inline batch& batch::operator=(const batch_bool& rhs) { this->m_value = detail::batch_kernel::select(rhs, batch(float(1)), batch(float(0))); return *this; } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_avx512_int16.hpp000066400000000000000000000524511410101234500225450ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_AVX512_INT16_HPP #define XSIMD_AVX512_INT16_HPP #include "xsimd_avx512_bool.hpp" #include "xsimd_avx512_int_base.hpp" namespace xsimd { #define XSIMD_APPLY_AVX2_FUNCTION_INT16(func, avx_lhs, avx_rhs) \ XSIMD_APPLY_AVX2_FUNCTION(16, func, avx_lhs, avx_rhs) /*************************** * batch_bool * ***************************/ template <> struct simd_batch_traits> { using value_type = int16_t; static constexpr std::size_t size = 32; using batch_type = batch; static constexpr std::size_t align = 64; }; template <> struct simd_batch_traits> { using value_type = uint16_t; static constexpr std::size_t size = 32; using batch_type = batch; static constexpr std::size_t align = 64; }; #if defined(XSIMD_AVX512BW_AVAILABLE) template <> class batch_bool : public batch_bool_avx512<__mmask32, batch_bool> { public: using base_class = batch_bool_avx512<__mmask32, batch_bool>; using base_class::base_class; }; template <> class batch_bool : public batch_bool_avx512<__mmask32, batch_bool> { public: using base_class = batch_bool_avx512<__mmask32, batch_bool>; using base_class::base_class; }; namespace detail { template <> struct batch_bool_kernel : batch_bool_kernel_avx512 { }; template <> struct batch_bool_kernel : batch_bool_kernel_avx512 { }; } #else template <> class batch_bool : public avx512_fallback_batch_bool { public: using base_class = avx512_fallback_batch_bool; using base_class::base_class; }; template <> class batch_bool : public avx512_fallback_batch_bool { public: using base_class = avx512_fallback_batch_bool; using base_class::base_class; }; namespace detail { template <> struct batch_bool_kernel : avx512_fallback_batch_bool_kernel { }; template <> struct batch_bool_kernel : avx512_fallback_batch_bool_kernel { }; } #endif /********************** * batch * **********************/ template <> struct simd_batch_traits> { using value_type = int16_t; static constexpr std::size_t size = 32; using batch_bool_type = batch_bool; static constexpr std::size_t align = 64; using storage_type = __m512i; }; template <> struct simd_batch_traits> { using value_type = uint16_t; static constexpr std::size_t size = 32; using batch_bool_type = batch_bool; static constexpr std::size_t align = 64; using storage_type = __m512i; }; template <> class batch : public avx512_int_batch { public: using base_class = avx512_int_batch; using base_class::base_class; using base_class::load_aligned; using base_class::load_unaligned; using base_class::store_aligned; using base_class::store_unaligned; batch() = default; explicit batch(const char* src) : batch(reinterpret_cast(src)) { } batch(const char* src, aligned_mode) : batch(reinterpret_cast(src), aligned_mode{}) { } batch(const char* src, unaligned_mode) : batch(reinterpret_cast(src), unaligned_mode{}) { } XSIMD_DECLARE_LOAD_STORE_INT16(int16_t, 32) XSIMD_DECLARE_LOAD_STORE_LONG(int16_t, 32) }; template <> class batch : public avx512_int_batch { public: using base_class = avx512_int_batch; using base_class::base_class; using base_class::load_aligned; using base_class::load_unaligned; using base_class::store_aligned; using base_class::store_unaligned; XSIMD_DECLARE_LOAD_STORE_INT16(uint16_t, 32) XSIMD_DECLARE_LOAD_STORE_LONG(uint16_t, 32) }; batch operator<<(const batch& lhs, int32_t rhs); batch operator>>(const batch& lhs, int32_t rhs); batch operator<<(const batch& lhs, const batch& rhs); batch operator>>(const batch& lhs, const batch& rhs); batch operator<<(const batch& lhs, 
int32_t rhs); batch operator>>(const batch& lhs, int32_t rhs); batch operator<<(const batch& lhs, const batch& rhs); batch operator>>(const batch& lhs, const batch& rhs); /************************************* * batch implementation * *************************************/ namespace detail { template struct avx512_int16_batch_kernel : avx512_int_kernel_base> { using batch_type = batch; using value_type = T; using batch_bool_type = batch_bool; static batch_type neg(const batch_type& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_sub_epi16(_mm512_setzero_si512(), rhs); #else XSIMD_SPLIT_AVX512(rhs); __m256i res_low = _mm256_sub_epi16(_mm256_setzero_si256(), rhs_low); __m256i res_high = _mm256_sub_epi16(_mm256_setzero_si256(), rhs_high); XSIMD_RETURN_MERGED_AVX(res_low, res_high); #endif } static batch_type add(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_add_epi16(lhs, rhs); #else XSIMD_APPLY_AVX2_FUNCTION_INT16(add, lhs, rhs); #endif } static batch_type sub(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_sub_epi16(lhs, rhs); #else XSIMD_APPLY_AVX2_FUNCTION_INT16(sub, lhs, rhs); #endif } static batch_type sadd(const batch_type &lhs, const batch_type &rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_adds_epi16(lhs, rhs); #else XSIMD_APPLY_AVX2_FUNCTION_INT16(sadd, lhs, rhs); #endif } static batch_type ssub(const batch_type &lhs, const batch_type &rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_subs_epi16(lhs, rhs); #else XSIMD_APPLY_AVX2_FUNCTION_INT16(ssub, lhs, rhs); #endif } static batch_type mul(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_mullo_epi16(lhs, rhs); #else XSIMD_APPLY_AVX2_FUNCTION_INT16(mul, lhs, rhs); #endif } static batch_type div(const batch_type& lhs, const batch_type& rhs) { XSIMD_APPLY_AVX2_FUNCTION_INT16(div, lhs, rhs); } static batch_type mod(const batch_type& lhs, const batch_type& rhs) { XSIMD_MACRO_UNROLL_BINARY(%); } static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) { return _mm512_and_si512(lhs, rhs); } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { return _mm512_or_si512(lhs, rhs); } static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { return _mm512_xor_si512(lhs, rhs); } static batch_type bitwise_not(const batch_type& rhs) { return _mm512_xor_si512(rhs, _mm512_set1_epi16(-1)); } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { return _mm512_andnot_si512(lhs, rhs); } static batch_type fma(const batch_type& x, const batch_type& y, const batch_type& z) { return x * y + z; } static batch_type fms(const batch_type& x, const batch_type& y, const batch_type& z) { return x * y - z; } static batch_type fnma(const batch_type& x, const batch_type& y, const batch_type& z) { return -x * y + z; } static batch_type fnms(const batch_type& x, const batch_type& y, const batch_type& z) { return -x * y - z; } static value_type hadd(const batch_type& rhs) { XSIMD_SPLIT_AVX512(rhs); auto tmp = batch(rhs_low) + batch(rhs_high); return xsimd::hadd(batch(tmp)); } static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b) { #if defined(XSIMD_AVX512BW_AVAILABLE) && !defined(_MSC_VER) auto res = _mm512_mask_blend_epi16((__mmask32)cond, (__m512i)b, (__m512i)a); return batch_type(res); #else __m512i mcond = 
_mm512_maskz_broadcastw_epi16((__mmask32)cond, _mm_set1_epi32(~0)); XSIMD_SPLIT_AVX512(mcond); XSIMD_SPLIT_AVX512(a); XSIMD_SPLIT_AVX512(b); auto res_lo = _mm256_blendv_epi8(b_low, a_low, mcond_low); auto res_hi = _mm256_blendv_epi8(b_high, a_high, mcond_high); XSIMD_RETURN_MERGED_AVX(res_lo, res_hi); #endif } static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs) { return _mm512_unpacklo_epi16(lhs, rhs); } static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs) { return _mm512_unpackhi_epi16(lhs, rhs); } static batch_type extract_pair(const batch_type& v_lhs, const batch_type& v_rhs, const int num) { #if defined(XSIMD_AVX512BW_AVAILABLE) const batch_type lhs = v_rhs; const batch_type rhs = v_lhs; const int n = 2 * num; switch(n) { case 0: return rhs; XSIMD_REPEAT_64_v2(_mm512_alignr_epi8); default: break; } return batch_type(T(0)); #else batch_type b_concatenate; const int n = num; for (int i = 0 ; i < (32 - n); ++i) { b_concatenate[i] = v_lhs[i + n]; if(i < n) { b_concatenate[32 - 1 - i] = v_rhs[n - 1 - i]; } } return b_concatenate; #endif } }; template <> struct batch_kernel : public avx512_int16_batch_kernel { static batch_type abs(const batch_type& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_abs_epi16(rhs); #else XSIMD_SPLIT_AVX512(rhs); __m256i res_low = _mm256_abs_epi16(rhs_low); __m256i res_high = _mm256_abs_epi16(rhs_high); XSIMD_RETURN_MERGED_AVX(res_low, res_high); #endif } static batch_type min(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_min_epi16(lhs, rhs); #else XSIMD_APPLY_AVX2_FUNCTION_INT16(min, lhs, rhs); #endif } static batch_type max(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_max_epi16(lhs, rhs); #else XSIMD_APPLY_AVX2_FUNCTION_INT16(max, lhs, rhs); #endif } static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_cmpeq_epi16_mask(lhs, rhs); #else XSIMD_APPLY_AVX2_FUNCTION_INT16(eq, lhs, rhs); #endif } static batch_bool_type neq(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_cmpneq_epi16_mask(lhs, rhs); #else XSIMD_APPLY_AVX2_FUNCTION_INT16(neq, lhs, rhs); #endif } static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_cmplt_epi16_mask(lhs, rhs); #else XSIMD_APPLY_AVX2_FUNCTION_INT16(lt, lhs, rhs); #endif } static batch_bool_type lte(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_cmple_epi16_mask(lhs, rhs); #else XSIMD_APPLY_AVX2_FUNCTION_INT16(lte, lhs, rhs); #endif } }; template <> struct batch_kernel : public avx512_int16_batch_kernel { static batch_type abs(const batch_type& rhs) { return rhs; } static batch_type min(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_min_epu16(lhs, rhs); #else XSIMD_APPLY_AVX2_FUNCTION_INT16(min, lhs, rhs); #endif } static batch_type max(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_max_epu16(lhs, rhs); #else XSIMD_APPLY_AVX2_FUNCTION_INT16(max, lhs, rhs); #endif } static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_cmpeq_epu16_mask(lhs, rhs); #else XSIMD_APPLY_AVX2_FUNCTION_INT16(eq, lhs, rhs); #endif } static batch_bool_type neq(const batch_type& lhs, const 
batch_type& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_cmpneq_epu16_mask(lhs, rhs); #else XSIMD_APPLY_AVX2_FUNCTION_INT16(neq, lhs, rhs); #endif } static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_cmplt_epu16_mask(lhs, rhs); #else XSIMD_APPLY_AVX2_FUNCTION_INT16(lt, lhs, rhs); #endif } static batch_bool_type lte(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_cmple_epu16_mask(lhs, rhs); #else XSIMD_APPLY_AVX2_FUNCTION_INT16(lte, lhs, rhs); #endif } static batch_type sadd(const batch_type &lhs, const batch_type &rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_adds_epu16(lhs, rhs); #else XSIMD_APPLY_AVX2_FUNCTION_UINT16(sadd, lhs, rhs); #endif } static batch_type ssub(const batch_type &lhs, const batch_type &rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_subs_epu16(lhs, rhs); #else XSIMD_APPLY_AVX2_FUNCTION_UINT16(ssub, lhs, rhs); #endif } }; } inline batch operator<<(const batch& lhs, int32_t rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) return _mm512_sllv_epi16(lhs, _mm512_set1_epi16(rhs)); #else return _mm512_slli_epi16(lhs, rhs); #endif #else #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) __m512i tmp = _mm512_sllv_epi32(lhs, _mm512_set1_epi32(rhs)); #else __m512i tmp = _mm512_slli_epi32(lhs, rhs); #endif return _mm512_and_si512(_mm512_set1_epi16(0xFFFF << rhs), tmp); #endif } inline batch operator>>(const batch& lhs, int32_t rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) return _mm512_srav_epi16(lhs, _mm512_set1_epi16(rhs)); #else return _mm512_srai_epi16(lhs, rhs); #endif #else return avx512_detail::shift_impl([](int16_t val, int32_t s) { return val >> s; }, lhs, rhs); #endif } inline batch operator<<(const batch& lhs, const batch& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_sllv_epi16(lhs, rhs); #else return avx512_detail::shift_impl([](int16_t val, int16_t s) { return val << s; }, lhs, rhs); #endif } inline batch operator>>(const batch& lhs, const batch& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_srav_epi16(lhs, rhs); #else return avx512_detail::shift_impl([](int16_t val, int16_t s) { return val >> s; }, lhs, rhs); #endif } XSIMD_DEFINE_LOAD_STORE_INT16(int16_t, 32, 64) XSIMD_DEFINE_LOAD_STORE_LONG(int16_t, 32, 64) inline batch operator<<(const batch& lhs, int32_t rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) return _mm512_sllv_epi16(lhs, _mm512_set1_epi16(rhs)); #else return _mm512_slli_epi16(lhs, rhs); #endif #else #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) __m512i tmp = _mm512_sllv_epi32(lhs, _mm512_set1_epi32(rhs)); #else __m512i tmp = _mm512_slli_epi32(lhs, rhs); #endif return _mm512_and_si512(_mm512_set1_epi16(0xFFFF << rhs), tmp); #endif } inline batch operator>>(const batch& lhs, int32_t rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) return _mm512_srlv_epi16(lhs, _mm512_set1_epi16(rhs)); #else return _mm512_srli_epi16(lhs, rhs); #endif #else #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) __m512i tmp = _mm512_srlv_epi32(lhs, _mm512_set1_epi32(rhs)); #else __m512i tmp = _mm512_srli_epi32(lhs, rhs); #endif return _mm512_and_si512(_mm512_set1_epi16(0xFFFF >> rhs), tmp); #endif } inline batch operator<<(const batch& lhs, const batch& rhs) { #if 
defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_sllv_epi16(lhs, rhs); #else return avx512_detail::shift_impl([](uint16_t val, int16_t s) { return val << s; }, lhs, rhs); #endif } inline batch operator>>(const batch& lhs, const batch& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_srlv_epi16(lhs, rhs); #else return avx512_detail::shift_impl([](uint16_t val, int16_t s) { return val >> s; }, lhs, rhs); #endif } XSIMD_DEFINE_LOAD_STORE_INT16(uint16_t, 32, 64) XSIMD_DEFINE_LOAD_STORE_LONG(uint16_t, 32, 64) #undef XSIMD_APPLY_AVX2_FUNCTION_INT16 } #endif xsimd-7.6.0/include/xsimd/types/xsimd_avx512_int32.hpp000066400000000000000000000370221410101234500225400ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX512_INT32_HPP #define XSIMD_AVX512_INT32_HPP #include "xsimd_avx512_bool.hpp" #include "xsimd_avx512_int_base.hpp" namespace xsimd { /*************************** * batch_bool * ***************************/ template <> struct simd_batch_traits> { using value_type = int32_t; static constexpr std::size_t size = 16; using batch_type = batch; static constexpr std::size_t align = 0; }; template <> struct simd_batch_traits> { using value_type = uint32_t; static constexpr std::size_t size = 16; using batch_type = batch; static constexpr std::size_t align = 0; }; template <> class batch_bool : public batch_bool_avx512<__mmask16, batch_bool> { public: using base_class = batch_bool_avx512<__mmask16, batch_bool>; using base_class::base_class; }; template <> class batch_bool : public batch_bool_avx512<__mmask16, batch_bool> { public: using base_class = batch_bool_avx512<__mmask16, batch_bool>; using base_class::base_class; }; namespace detail { template <> struct batch_bool_kernel : batch_bool_kernel_avx512 { }; template <> struct batch_bool_kernel : batch_bool_kernel_avx512 { }; } /********************** * batch * **********************/ template <> struct simd_batch_traits> { using value_type = int32_t; static constexpr std::size_t size = 16; using batch_bool_type = batch_bool; static constexpr std::size_t align = 64; using storage_type = __m512i; }; template <> struct simd_batch_traits> { using value_type = uint32_t; static constexpr std::size_t size = 16; using batch_bool_type = batch_bool; static constexpr std::size_t align = 64; using storage_type = __m512i; }; template <> class batch : public avx512_int_batch { public: using base_type = avx512_int_batch; using base_type::base_type; using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; XSIMD_DECLARE_LOAD_STORE_INT32(int32_t, 16) XSIMD_DECLARE_LOAD_STORE_LONG(int32_t, 16) }; template <> class batch : public avx512_int_batch { public: using base_type = avx512_int_batch; using base_type::base_type; using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; XSIMD_DECLARE_LOAD_STORE_INT32(uint32_t, 16) XSIMD_DECLARE_LOAD_STORE_LONG(uint32_t, 16) }; batch operator<<(const batch& lhs, int32_t rhs); batch operator>>(const batch& lhs, int32_t rhs); batch operator<<(const batch& lhs, const batch& 
rhs); batch operator>>(const batch& lhs, const batch& rhs); batch operator<<(const batch& lhs, int32_t rhs); batch operator>>(const batch& lhs, int32_t rhs); batch operator<<(const batch& lhs, const batch& rhs); batch operator>>(const batch& lhs, const batch& rhs); /************************************* * batch implementation * *************************************/ XSIMD_DEFINE_LOAD_STORE_INT32(int32_t, 16, 64) XSIMD_DEFINE_LOAD_STORE_LONG(int32_t, 16, 64) /************************************* * batch implementation * *************************************/ XSIMD_DEFINE_LOAD_STORE_INT32(uint32_t, 16, 64) XSIMD_DEFINE_LOAD_STORE_LONG(uint32_t, 16, 64) namespace detail { template struct avx512_int32_batch_kernel : avx512_int_kernel_base> { using batch_type = batch; using value_type = T; using batch_bool_type = batch_bool; static batch_type neg(const batch_type& rhs) { return _mm512_sub_epi32(_mm512_setzero_si512(), rhs); } static batch_type add(const batch_type& lhs, const batch_type& rhs) { return _mm512_add_epi32(lhs, rhs); } static batch_type sub(const batch_type& lhs, const batch_type& rhs) { return _mm512_sub_epi32(lhs, rhs); } static batch_type sadd(const batch_type& lhs, const batch_type& rhs) { batch_bool_type mask = _mm512_movepi32_mask(rhs); batch_type lhs_pos_branch = min(std::numeric_limits::max() - rhs, lhs); batch_type lhs_neg_branch = max(std::numeric_limits::min() - rhs, lhs); return rhs + select(mask, lhs_neg_branch, lhs_pos_branch); } static batch_type ssub(const batch_type& lhs, const batch_type& rhs) { return sadd(lhs, neg(rhs)); } static batch_type mul(const batch_type& lhs, const batch_type& rhs) { return _mm512_mullo_epi32(lhs, rhs); } static batch_type mod(const batch_type& lhs, const batch_type& rhs) { XSIMD_MACRO_UNROLL_BINARY(%); } static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) { return _mm512_and_si512(lhs, rhs); } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { return _mm512_or_si512(lhs, rhs); } static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { return _mm512_xor_si512(lhs, rhs); } static batch_type bitwise_not(const batch_type& rhs) { return _mm512_xor_si512(rhs, _mm512_set1_epi32(-1)); } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { return _mm512_andnot_si512(lhs, rhs); } static batch_type fma(const batch_type& x, const batch_type& y, const batch_type& z) { return x * y + z; } static batch_type fms(const batch_type& x, const batch_type& y, const batch_type& z) { return x * y - z; } static batch_type fnma(const batch_type& x, const batch_type& y, const batch_type& z) { return -x * y + z; } static batch_type fnms(const batch_type& x, const batch_type& y, const batch_type& z) { return -x * y - z; } static value_type hadd(const batch_type& rhs) { // TODO Why not _mm512_reduce_add_...? 
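            // (Illustrative note, not from the original source: _mm512_reduce_add_epi32(rhs)
            // would compute this horizontal sum in a single call.  It is a "sequence"
            // intrinsic that the compiler expands to a shuffle/add tree, and its availability
            // across compilers -- notably older MSVC -- has been uneven, which is presumably
            // why the explicit extract-the-two-256-bit-halves-and-add approach below is used.)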
__m256i tmp1 = _mm512_extracti32x8_epi32(rhs, 0); __m256i tmp2 = _mm512_extracti32x8_epi32(rhs, 1); __m256i res1 = _mm256_add_epi32(tmp1, tmp2); return xsimd::hadd(batch(res1)); } static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b) { return _mm512_mask_blend_epi32(cond, b, a); } static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs) { return _mm512_unpacklo_epi32(lhs, rhs); } static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs) { return _mm512_unpackhi_epi32(lhs, rhs); } static batch_type extract_pair(const batch_type& v_lhs, const batch_type& v_rhs, const int n) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX512_VERSION const batch_type lhs = v_rhs; const batch_type rhs = v_lhs; switch(n) { case 0: return rhs; XSIMD_REPEAT_16_v2(_mm512_alignr_epi32); default: break; } return batch_type(T(0)); #else batch_type b_concatenate; for (int i = 0 ; i < (16 - n); ++i) { b_concatenate[i] = v_lhs[i + n]; if(i < n) { b_concatenate[16 - 1 - i] = v_rhs[n - 1 - i]; } } return b_concatenate; #endif } }; template <> struct batch_kernel : public avx512_int32_batch_kernel { using batch_type = batch; using value_type = int32_t; using batch_bool_type = batch_bool; static batch_type div(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_FAST_INTEGER_DIVISION) return _mm512_cvttps_epi32(_mm512_div_ps(_mm512_cvtepi32_ps(lhs), _mm512_cvtepi32_ps(rhs))); #else XSIMD_MACRO_UNROLL_BINARY(/); #endif } static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs) { return _mm512_cmpeq_epi32_mask(lhs, rhs); } static batch_bool_type neq(const batch_type& lhs, const batch_type& rhs) { return _mm512_cmpneq_epi32_mask(lhs, rhs); } static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { return _mm512_cmplt_epi32_mask(lhs, rhs); } static batch_bool_type lte(const batch_type& lhs, const batch_type& rhs) { return _mm512_cmple_epi32_mask(lhs, rhs); } static batch_type min(const batch_type& lhs, const batch_type& rhs) { return _mm512_min_epi32(lhs, rhs); } static batch_type max(const batch_type& lhs, const batch_type& rhs) { return _mm512_max_epi32(lhs, rhs); } static batch_type abs(const batch_type& rhs) { return _mm512_abs_epi32(rhs); } }; template <> struct batch_kernel : public avx512_int32_batch_kernel { using batch_type = batch; using value_type = uint32_t; using batch_bool_type = batch_bool; static batch_type div(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_FAST_INTEGER_DIVISION) return _mm512_cvttps_epu32(_mm512_div_ps(_mm512_cvtepu32_ps(lhs), _mm512_cvtepu32_ps(rhs))); #else XSIMD_MACRO_UNROLL_BINARY(/); #endif } static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs) { return _mm512_cmpeq_epu32_mask(lhs, rhs); } static batch_bool_type neq(const batch_type& lhs, const batch_type& rhs) { return _mm512_cmpneq_epu32_mask(lhs, rhs); } static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { return _mm512_cmplt_epu32_mask(lhs, rhs); } static batch_bool_type lte(const batch_type& lhs, const batch_type& rhs) { return _mm512_cmple_epu32_mask(lhs, rhs); } static batch_type min(const batch_type& lhs, const batch_type& rhs) { return _mm512_min_epu32(lhs, rhs); } static batch_type max(const batch_type& lhs, const batch_type& rhs) { return _mm512_max_epu32(lhs, rhs); } static batch_type abs(const batch_type& rhs) { return rhs; } static batch_type sadd(const batch_type& lhs, const batch_type& rhs) { const auto diffmax = batch_type(std::numeric_limits::max()) - lhs; 
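                // (Illustrative note, not from the original source: this saturating unsigned
                // add clamps the addend rather than the sum -- diffmax is the largest value
                // that can still be added to lhs without wrapping, so lhs + min(diffmax, rhs)
                // equals min(lhs + rhs, max()) and never overflows.  Scalar model:
                //     uint32_t sadd(uint32_t a, uint32_t b)
                //     { uint32_t room = UINT32_MAX - a; return a + (b < room ? b : room); })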
const auto mindiff = min(diffmax, rhs); return lhs + mindiff; } static batch_type ssub(const batch_type& lhs, const batch_type& rhs) { const auto diff = min(lhs, rhs); return lhs - diff; } }; } inline batch operator<<(const batch& lhs, int32_t rhs) { #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) return _mm512_sllv_epi32(lhs, _mm512_set1_epi32(rhs)); #else return _mm512_slli_epi32(lhs, rhs); #endif } inline batch operator>>(const batch& lhs, int32_t rhs) { #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) return _mm512_srav_epi32(lhs, _mm512_set1_epi32(rhs)); #else return _mm512_srai_epi32(lhs, rhs); #endif } inline batch operator<<(const batch& lhs, const batch& rhs) { return _mm512_sllv_epi32(lhs, rhs); } inline batch operator>>(const batch& lhs, const batch& rhs) { return _mm512_srav_epi32(lhs, rhs); } inline batch operator<<(const batch& lhs, int32_t rhs) { #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) return _mm512_sllv_epi32(lhs, _mm512_set1_epi32(rhs)); #else return _mm512_slli_epi32(lhs, rhs); #endif } inline batch operator>>(const batch& lhs, int32_t rhs) { #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) return _mm512_srlv_epi32(lhs, _mm512_set1_epi32(rhs)); #else return _mm512_srli_epi32(lhs, rhs); #endif } inline batch operator<<(const batch& lhs, const batch& rhs) { return _mm512_sllv_epi32(lhs, rhs); } inline batch operator>>(const batch& lhs, const batch& rhs) { return _mm512_srlv_epi32(lhs, rhs); } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_avx512_int64.hpp000066400000000000000000000436121410101234500225470ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
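// --- Illustrative sketch (not part of xsimd): the XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY
// branches in the shift operators above exist because some compilers only accept a
// compile-time immediate count for _mm512_slli_epi32 / _mm512_srli_epi32.  With a runtime
// count, the same effect comes from broadcasting the count and using the per-lane
// variable-shift intrinsic:
#include <immintrin.h>

inline __m512i shl_epi32_runtime_count(__m512i v, int count)
{
    return _mm512_sllv_epi32(v, _mm512_set1_epi32(count));   // every lane shifted by `count`
}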
* ****************************************************************************/ #ifndef XSIMD_AVX512_INT64_HPP #define XSIMD_AVX512_INT64_HPP #include "xsimd_avx512_bool.hpp" #include "xsimd_avx512_int_base.hpp" namespace xsimd { /************************** * batch_bool * **************************/ template <> struct simd_batch_traits> { using value_type = int64_t; static constexpr std::size_t size = 8; using batch_type = batch; static constexpr std::size_t align = 0; }; template <> struct simd_batch_traits> { using value_type = uint64_t; static constexpr std::size_t size = 8; using batch_type = batch; static constexpr std::size_t align = 0; }; template <> class batch_bool : public batch_bool_avx512<__mmask8, batch_bool> { public: using base_class = batch_bool_avx512<__mmask8, batch_bool>; using base_class::base_class; }; template <> class batch_bool : public batch_bool_avx512<__mmask8, batch_bool> { public: using base_class = batch_bool_avx512<__mmask8, batch_bool>; using base_class::base_class; }; namespace detail { template <> struct batch_bool_kernel : batch_bool_kernel_avx512 { }; template <> struct batch_bool_kernel : batch_bool_kernel_avx512 { }; } /********************* * batch * *********************/ template <> struct simd_batch_traits> { using value_type = int64_t; static constexpr std::size_t size = 8; using batch_bool_type = batch_bool; static constexpr std::size_t align = 64; using storage_type = __m512i; }; template <> struct simd_batch_traits> { using value_type = uint64_t; static constexpr std::size_t size = 8; using batch_bool_type = batch_bool; static constexpr std::size_t align = 64; using storage_type = __m512i; }; template <> class batch : public avx512_int_batch { public: using base_type = avx512_int_batch; using base_type::base_type; using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; XSIMD_DECLARE_LOAD_STORE_INT64(int64_t, 8) XSIMD_DECLARE_LOAD_STORE_LONG(int64_t, 8) }; template <> class batch : public avx512_int_batch { public: using base_type = avx512_int_batch; using base_type::base_type; using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; XSIMD_DECLARE_LOAD_STORE_INT64(uint64_t, 8) XSIMD_DECLARE_LOAD_STORE_LONG(uint64_t, 8) }; batch operator<<(const batch& lhs, int32_t rhs); batch operator>>(const batch& lhs, int32_t rhs); batch operator<<(const batch& lhs, const batch& rhs); batch operator>>(const batch& lhs, const batch& rhs); batch operator<<(const batch& lhs, int32_t rhs); batch operator>>(const batch& lhs, int32_t rhs); batch operator<<(const batch& lhs, const batch& rhs); batch operator>>(const batch& lhs, const batch& rhs); /************************************ * batch implementation * ************************************/ inline batch& batch::load_aligned(const double* src) { this->m_value = _mm512_cvttpd_epi64(_mm512_load_pd(src)); return *this; } inline batch& batch::load_unaligned(const double* src) { this->m_value = _mm512_cvttpd_epi64(_mm512_loadu_pd(src)); return *this; } inline void batch::store_aligned(double* dst) const { _mm512_store_pd(dst, _mm512_cvtepi64_pd(this->m_value)); } inline void batch::store_unaligned(double* dst) const { _mm512_storeu_pd(dst, _mm512_cvtepi64_pd(this->m_value)); } XSIMD_DEFINE_LOAD_STORE(int64_t, 8, bool, 64) XSIMD_DEFINE_LOAD_STORE(int64_t, 8, int8_t, 64) XSIMD_DEFINE_LOAD_STORE(int64_t, 8, uint8_t, 64) XSIMD_DEFINE_LOAD_STORE(int64_t, 8, int16_t, 64) 
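    // --- Illustrative note (not from the original source): the double <-> int64_t
    // loads/stores above use _mm512_cvttpd_epi64 / _mm512_cvtepi64_pd, which are
    // AVX512DQ intrinsics; the extra 't' in cvtt means the conversion truncates
    // toward zero.  One lane of that conversion, modelled in plain C++
    // (hypothetical helper, for exposition only):
    inline long long truncate_double_lane(double x)
    {
        return static_cast<long long>(x);   // C++ float-to-int conversion also truncates toward zero
    }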
XSIMD_DEFINE_LOAD_STORE(int64_t, 8, uint16_t, 64) XSIMD_DEFINE_LOAD_STORE(int64_t, 8, int32_t, 64) XSIMD_DEFINE_LOAD_STORE(int64_t, 8, uint32_t, 64) XSIMD_DEFINE_LOAD_STORE(int64_t, 8, float, 64) XSIMD_DEFINE_LOAD_STORE_LONG(int64_t, 8, 64) /************************************* * batch implementation * *************************************/ inline batch& batch::load_aligned(const double* src) { this->m_value = _mm512_cvttpd_epu64(_mm512_load_pd(src)); return *this; } inline batch& batch::load_unaligned(const double* src) { this->m_value = _mm512_cvttpd_epu64(_mm512_loadu_pd(src)); return *this; } inline void batch::store_aligned(double* dst) const { _mm512_store_pd(dst, _mm512_cvtepu64_pd(this->m_value)); } inline void batch::store_unaligned(double* dst) const { _mm512_storeu_pd(dst, _mm512_cvtepu64_pd(this->m_value)); } XSIMD_DEFINE_LOAD_STORE(uint64_t, 8, bool, 64) XSIMD_DEFINE_LOAD_STORE(uint64_t, 8, int8_t, 64) XSIMD_DEFINE_LOAD_STORE(uint64_t, 8, uint8_t, 64) XSIMD_DEFINE_LOAD_STORE(uint64_t, 8, int16_t, 64) XSIMD_DEFINE_LOAD_STORE(uint64_t, 8, uint16_t, 64) XSIMD_DEFINE_LOAD_STORE(uint64_t, 8, int32_t, 64) XSIMD_DEFINE_LOAD_STORE(uint64_t, 8, uint32_t, 64) XSIMD_DEFINE_LOAD_STORE(uint64_t, 8, float, 64) XSIMD_DEFINE_LOAD_STORE_LONG(uint64_t, 8, 64) namespace detail { template struct avx512_int64_batch_kernel : avx512_int_kernel_base> { using batch_type = batch; using value_type = T; using batch_bool_type = batch_bool; static batch_type neg(const batch_type& rhs) { return _mm512_sub_epi64(_mm512_setzero_si512(), rhs); } static batch_type add(const batch_type& lhs, const batch_type& rhs) { return _mm512_add_epi64(lhs, rhs); } static batch_type sub(const batch_type& lhs, const batch_type& rhs) { return _mm512_sub_epi64(lhs, rhs); } static batch_type sadd(const batch_type& lhs, const batch_type& rhs) { batch_bool_type mask = _mm512_movepi64_mask(rhs); batch_type lhs_pos_branch = min(std::numeric_limits::max() - rhs, lhs); batch_type lhs_neg_branch = max(std::numeric_limits::min() - rhs, lhs); return rhs + select(mask, lhs_neg_branch, lhs_pos_branch); } static batch_type ssub(const batch_type& lhs, const batch_type& rhs) { return sadd(lhs, neg(rhs)); } static batch_type mul(const batch_type& lhs, const batch_type& rhs) { return _mm512_mullo_epi64(lhs, rhs); } static batch_type mod(const batch_type& lhs, const batch_type& rhs) { XSIMD_MACRO_UNROLL_BINARY(%); } static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) { return _mm512_and_si512(lhs, rhs); } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { return _mm512_or_si512(lhs, rhs); } static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { return _mm512_xor_si512(lhs, rhs); } static batch_type bitwise_not(const batch_type& rhs) { return _mm512_xor_si512(rhs, _mm512_set1_epi64(-1)); } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { return _mm512_andnot_si512(lhs, rhs); } static batch_type fma(const batch_type& x, const batch_type& y, const batch_type& z) { return x * y + z; } static batch_type fms(const batch_type& x, const batch_type& y, const batch_type& z) { return x * y - z; } static batch_type fnma(const batch_type& x, const batch_type& y, const batch_type& z) { return -x * y + z; } static batch_type fnms(const batch_type& x, const batch_type& y, const batch_type& z) { return -x * y - z; } static value_type hadd(const batch_type& rhs) { __m256i tmp1 = _mm512_extracti32x8_epi32(rhs, 0); __m256i tmp2 = _mm512_extracti32x8_epi32(rhs, 
1); __m256i res1 = _mm256_add_epi64(tmp1, tmp2); return xsimd::hadd(batch(res1)); } static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b) { #if !defined(_MSC_VER) return _mm512_mask_blend_epi64(cond, b, a); #else __m512i mcond = _mm512_maskz_broadcastq_epi64((__mmask8)cond, _mm_set1_epi32(~0)); XSIMD_SPLIT_AVX512(mcond); XSIMD_SPLIT_AVX512(a); XSIMD_SPLIT_AVX512(b); auto res_lo = _mm256_blendv_epi8(b_low, a_low, mcond_low); auto res_hi = _mm256_blendv_epi8(b_high, a_high, mcond_high); XSIMD_RETURN_MERGED_AVX(res_lo, res_hi); #endif } static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs) { return _mm512_unpacklo_epi64(lhs, rhs); } static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs) { return _mm512_unpackhi_epi64(lhs, rhs); } static batch_type extract_pair(const batch_type& v_lhs, const batch_type& v_rhs, const int n) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX512_VERSION const batch_type lhs = v_rhs; const batch_type rhs = v_lhs; switch(n) { case 0: return rhs; XSIMD_REPEAT_8_v2(_mm512_alignr_epi64); default: break; } return batch_type(T(0)); #else batch_type b_concatenate; for (int i = 0 ; i < (8 - n); ++i) { b_concatenate[i] = v_lhs[i + n]; if(i < n) { b_concatenate[8 - 1 - i] = v_rhs[n - 1 - i]; } } return b_concatenate; #endif } }; template <> struct batch_kernel : public avx512_int64_batch_kernel { using batch_type = batch; using value_type = int64_t; using batch_bool_type = batch_bool; static batch_type div(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_FAST_INTEGER_DIVISION) return _mm512_cvttpd_epi64(_mm512_div_pd(_mm512_cvtepi64_pd(lhs), _mm512_cvtepi64_pd(rhs))); #else XSIMD_MACRO_UNROLL_BINARY(/); #endif } static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs) { return _mm512_cmpeq_epi64_mask(lhs, rhs); } static batch_bool_type neq(const batch_type& lhs, const batch_type& rhs) { return _mm512_cmpneq_epi64_mask(lhs, rhs); } static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { return _mm512_cmplt_epi64_mask(lhs, rhs); } static batch_bool_type lte(const batch_type& lhs, const batch_type& rhs) { return _mm512_cmple_epi64_mask(lhs, rhs); } static batch_type min(const batch_type& lhs, const batch_type& rhs) { return _mm512_min_epi64(lhs, rhs); } static batch_type max(const batch_type& lhs, const batch_type& rhs) { return _mm512_max_epi64(lhs, rhs); } static batch_type abs(const batch_type& rhs) { return _mm512_abs_epi64(rhs); } }; template <> struct batch_kernel : public avx512_int64_batch_kernel { using batch_type = batch; using value_type = uint64_t; using batch_bool_type = batch_bool; static batch_type div(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_FAST_INTEGER_DIVISION) return _mm512_cvttpd_epi64(_mm512_div_pd(_mm512_cvtepu64_pd(lhs), _mm512_cvtepu64_pd(rhs))); #else XSIMD_MACRO_UNROLL_BINARY(/); #endif } static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs) { return _mm512_cmpeq_epu64_mask(lhs, rhs); } static batch_bool_type neq(const batch_type& lhs, const batch_type& rhs) { return _mm512_cmpneq_epu64_mask(lhs, rhs); } static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { return _mm512_cmplt_epu64_mask(lhs, rhs); } static batch_bool_type lte(const batch_type& lhs, const batch_type& rhs) { return _mm512_cmple_epu64_mask(lhs, rhs); } static batch_type min(const batch_type& lhs, const batch_type& rhs) { return _mm512_min_epu64(lhs, rhs); } static batch_type max(const batch_type& lhs, const 
batch_type& rhs) { return _mm512_max_epu64(lhs, rhs); } static batch_type abs(const batch_type& rhs) { return rhs; } static batch_type sadd(const batch_type& lhs, const batch_type& rhs) { const auto diffmax = batch_type(std::numeric_limits::max()) - lhs; const auto mindiff = min(diffmax, rhs); return lhs + mindiff; } static batch_type ssub(const batch_type& lhs, const batch_type& rhs) { const auto diff = min(lhs, rhs); return lhs - diff; } }; } inline batch operator<<(const batch& lhs, int32_t rhs) { #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) return _mm512_sllv_epi64(lhs, _mm512_set1_epi64(rhs)); #else return _mm512_slli_epi64(lhs, rhs); #endif } inline batch operator>>(const batch& lhs, int32_t rhs) { #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) return _mm512_srav_epi64(lhs, _mm512_set1_epi64(rhs)); #else return _mm512_srai_epi64(lhs, rhs); #endif } inline batch operator<<(const batch& lhs, const batch& rhs) { return _mm512_sllv_epi64(lhs, rhs); } inline batch operator>>(const batch& lhs, const batch& rhs) { return _mm512_srav_epi64(lhs, rhs); } inline batch operator<<(const batch& lhs, int32_t rhs) { #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) return _mm512_sllv_epi64(lhs, _mm512_set1_epi64(rhs)); #else return _mm512_slli_epi64(lhs, rhs); #endif } inline batch operator>>(const batch& lhs, int32_t rhs) { #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) return _mm512_srlv_epi64(lhs, _mm512_set1_epi64(rhs)); #else return _mm512_srli_epi64(lhs, rhs); #endif } inline batch operator<<(const batch& lhs, const batch& rhs) { return _mm512_sllv_epi64(lhs, rhs); } inline batch operator>>(const batch& lhs, const batch& rhs) { return _mm512_srlv_epi64(lhs, rhs); } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_avx512_int8.hpp000066400000000000000000000512761410101234500224720ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
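// --- Illustrative sketch (not part of xsimd): scalar model of the branchless signed
// saturating add used by the int32/int64 kernels above, which clamp lhs against the
// headroom left by rhs and then add (the SIMD code replaces the branch below with a
// sign mask and a select):
#include <climits>

inline long long sadd_i64_scalar(long long lhs, long long rhs)
{
    if (rhs >= 0)
    {
        const long long headroom = LLONG_MAX - rhs;        // largest lhs that still fits
        return (lhs < headroom ? lhs : headroom) + rhs;
    }
    const long long footroom = LLONG_MIN - rhs;            // smallest lhs that still fits
    return (lhs > footroom ? lhs : footroom) + rhs;
}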
* ****************************************************************************/ #ifndef XSIMD_AVX512_INT8_HPP #define XSIMD_AVX512_INT8_HPP #include "xsimd_avx512_bool.hpp" #include "xsimd_avx512_int_base.hpp" namespace xsimd { #define XSIMD_APPLY_AVX2_FUNCTION_INT8(func, avx_lhs, avx_rhs) \ XSIMD_APPLY_AVX2_FUNCTION(32, func, avx_lhs, avx_rhs) /**************************** * batch_bool * ****************************/ template <> struct simd_batch_traits> { using value_type = int8_t; static constexpr std::size_t size = 64; using batch_type = batch; static constexpr std::size_t align = 64; }; template <> struct simd_batch_traits> { using value_type = uint8_t; static constexpr std::size_t size = 64; using batch_type = batch; static constexpr std::size_t align = 64; }; #if defined(XSIMD_AVX512BW_AVAILABLE) template <> class batch_bool : public batch_bool_avx512<__mmask64, batch_bool> { public: using base_class = batch_bool_avx512<__mmask64, batch_bool>; using base_class::base_class; }; template <> class batch_bool : public batch_bool_avx512<__mmask64, batch_bool> { public: using base_class = batch_bool_avx512<__mmask64, batch_bool>; using base_class::base_class; }; namespace detail { template <> struct batch_bool_kernel : batch_bool_kernel_avx512 { }; template <> struct batch_bool_kernel : batch_bool_kernel_avx512 { }; } #else template <> class batch_bool : public avx512_fallback_batch_bool { public: using base_class = avx512_fallback_batch_bool; using base_class::base_class; }; template <> class batch_bool : public avx512_fallback_batch_bool { public: using base_class = avx512_fallback_batch_bool; using base_class::base_class; }; namespace detail { template <> struct batch_bool_kernel : avx512_fallback_batch_bool_kernel { }; template <> struct batch_bool_kernel : avx512_fallback_batch_bool_kernel { }; } #endif /********************* * batch * *********************/ template <> struct simd_batch_traits> { using value_type = int8_t; static constexpr std::size_t size = 64; using batch_bool_type = batch_bool; static constexpr std::size_t align = 64; using storage_type = __m512i; }; template <> struct simd_batch_traits> { using value_type = uint8_t; static constexpr std::size_t size = 64; using batch_bool_type = batch_bool; static constexpr std::size_t align = 64; using storage_type = __m512i; }; template <> class batch : public avx512_int_batch { public: using base_class = avx512_int_batch; using base_class::base_class; using base_class::load_aligned; using base_class::load_unaligned; using base_class::store_aligned; using base_class::store_unaligned; batch() = default; explicit batch(const char* src) : batch(reinterpret_cast(src)) { } batch(const char* src, aligned_mode) : batch(reinterpret_cast(src), aligned_mode{}) { } batch(const char* src, unaligned_mode) : batch(reinterpret_cast(src), unaligned_mode{}) { } XSIMD_DECLARE_LOAD_STORE_INT8(int8_t, 64) XSIMD_DECLARE_LOAD_STORE_LONG(int8_t, 64) }; template <> class batch : public avx512_int_batch { public: using base_class = avx512_int_batch; using base_class::base_class; using base_class::load_aligned; using base_class::load_unaligned; using base_class::store_aligned; using base_class::store_unaligned; XSIMD_DECLARE_LOAD_STORE_INT8(uint8_t, 64) XSIMD_DECLARE_LOAD_STORE_LONG(uint8_t, 64) }; batch operator<<(const batch& lhs, int32_t rhs); batch operator>>(const batch& lhs, int32_t rhs); batch operator<<(const batch& lhs, const batch& rhs); batch operator>>(const batch& lhs, const batch& rhs); batch operator<<(const batch& lhs, int32_t rhs); 
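    // --- Illustrative sketch (not part of xsimd): with AVX512BW available, a
    // batch_bool over the 64 int8_t/uint8_t lanes above is just a 64-bit mask
    // (__mmask64), one bit per lane.  Expanding such a mask into per-lane booleans
    // only takes plain bit tests (hypothetical helper, for exposition only):
    inline void mask64_to_bools(unsigned long long mask, bool (&out)[64])
    {
        for (int i = 0; i < 64; ++i)
        {
            out[i] = ((mask >> i) & 1ull) != 0;   // bit i <-> lane i
        }
    }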
batch operator>>(const batch& lhs, int32_t rhs); batch operator<<(const batch& lhs, const batch& rhs); batch operator>>(const batch& lhs, const batch& rhs); /************************************ * batch implementation * ************************************/ namespace detail { template struct avx512_int8_batch_kernel : avx512_int_kernel_base> { using batch_type = batch; using value_type = T; using batch_bool_type = batch_bool; static batch_type neg(const batch_type& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_sub_epi8(_mm512_setzero_si512(), rhs); #else XSIMD_SPLIT_AVX512(rhs); __m256i res_low = _mm256_sub_epi8(_mm256_setzero_si256(), rhs_low); __m256i res_high = _mm256_sub_epi8(_mm256_setzero_si256(), rhs_high); XSIMD_RETURN_MERGED_AVX(res_low, res_high); #endif } static batch_type add(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_add_epi8(lhs, rhs); #else XSIMD_APPLY_AVX2_FUNCTION_INT8(add, lhs, rhs); #endif } static batch_type sub(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_sub_epi8(lhs, rhs); #else XSIMD_APPLY_AVX2_FUNCTION_INT8(sub, lhs, rhs); #endif } static batch_type sadd(const batch_type &lhs, const batch_type &rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_adds_epi8(lhs, rhs); #else XSIMD_APPLY_AVX2_FUNCTION_INT8(sadd, lhs, rhs); #endif } static batch_type ssub(const batch_type &lhs, const batch_type &rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_subs_epi8(lhs, rhs); #else XSIMD_APPLY_AVX2_FUNCTION_INT8(ssub, lhs, rhs); #endif } static batch_type mul(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) batch_type upper = _mm512_and_si512(_mm512_mullo_epi16(lhs, rhs), _mm512_srli_epi16(_mm512_set1_epi16(-1), 8)); batch_type lower = _mm512_slli_epi16(_mm512_mullo_epi16(_mm512_srli_epi16(lhs, 8), _mm512_srli_epi16(rhs, 8)), 8); return _mm512_or_si512(upper, lower); #else XSIMD_APPLY_AVX2_FUNCTION_INT8(mul, lhs, rhs); #endif } static batch_type div(const batch_type& lhs, const batch_type& rhs) { XSIMD_APPLY_AVX2_FUNCTION_INT8(div, lhs, rhs); } static batch_type mod(const batch_type& lhs, const batch_type& rhs) { XSIMD_MACRO_UNROLL_BINARY(%); } static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) { return _mm512_and_si512(lhs, rhs); } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { return _mm512_or_si512(lhs, rhs); } static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { return _mm512_xor_si512(lhs, rhs); } static batch_type bitwise_not(const batch_type& rhs) { return _mm512_xor_si512(rhs, _mm512_set1_epi8(-1)); } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { return _mm512_andnot_si512(lhs, rhs); } static batch_type fma(const batch_type& x, const batch_type& y, const batch_type& z) { return x * y + z; } static batch_type fms(const batch_type& x, const batch_type& y, const batch_type& z) { return x * y - z; } static batch_type fnma(const batch_type& x, const batch_type& y, const batch_type& z) { return -x * y + z; } static batch_type fnms(const batch_type& x, const batch_type& y, const batch_type& z) { return -x * y - z; } static value_type hadd(const batch_type& rhs) { XSIMD_SPLIT_AVX512(rhs); auto tmp = batch(rhs_low) + batch(rhs_high); return xsimd::hadd(batch(tmp)); } static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b) { #if 
defined(XSIMD_AVX512BW_AVAILABLE) // Some compilers are not happy with passing directly a and b to the intrinsics // See https://github.com/xtensor-stack/xsimd/issues/315 __m512i ma = a; __m512i mb = b; return _mm512_mask_blend_epi8(cond, mb, ma); #else XSIMD_SPLIT_AVX512(cond); XSIMD_SPLIT_AVX512(a); XSIMD_SPLIT_AVX512(b); auto res_lo = _mm256_blendv_epi8(b_low, a_low, cond_low); auto res_hi = _mm256_blendv_epi8(b_high, a_high, cond_high); XSIMD_RETURN_MERGED_AVX(res_lo, res_hi); #endif } static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs) { return _mm512_unpacklo_epi8(lhs, rhs); } static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs) { return _mm512_unpackhi_epi8(lhs, rhs); } static batch_type extract_pair(const batch_type& v_lhs, const batch_type& v_rhs, const int n) { #if defined(XSIMD_AVX512BW_AVAILABLE) const batch_type lhs = v_rhs; const batch_type rhs = v_lhs; switch(n) { case 0: return rhs; XSIMD_REPEAT_64_v2(_mm512_alignr_epi8); default: break; } return batch_type(T(0)); #else batch_type b_concatenate; for (int i = 0 ; i < (64 - n); ++i) { b_concatenate[i] = v_lhs[i + n]; if(i < n) { b_concatenate[64 - 1 - i] = v_rhs[n - 1 - i]; } } return b_concatenate; #endif } }; template <> struct batch_kernel : public avx512_int8_batch_kernel { static batch_type abs(const batch_type& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_abs_epi8(rhs); #else XSIMD_SPLIT_AVX512(rhs); __m256i res_low = _mm256_abs_epi8(rhs_low); __m256i res_high = _mm256_abs_epi8(rhs_high); XSIMD_RETURN_MERGED_AVX(res_low, res_high); #endif } static batch_type min(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_min_epi8(lhs, rhs); #else XSIMD_APPLY_AVX2_FUNCTION_INT8(min, lhs, rhs); #endif } static batch_type max(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_max_epi8(lhs, rhs); #else XSIMD_APPLY_AVX2_FUNCTION_INT8(max, lhs, rhs); #endif } static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_cmpeq_epi8_mask(lhs, rhs); #else XSIMD_APPLY_AVX2_FUNCTION_INT8(eq, lhs, rhs); #endif } static batch_bool_type neq(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_cmpneq_epi8_mask(lhs, rhs); #else XSIMD_APPLY_AVX2_FUNCTION_INT8(neq, lhs, rhs); #endif } static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_cmplt_epi8_mask(lhs, rhs); #else XSIMD_APPLY_AVX2_FUNCTION_INT8(lt, lhs, rhs); #endif } static batch_bool_type lte(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_cmple_epi8_mask(lhs, rhs); #else XSIMD_APPLY_AVX2_FUNCTION_INT8(lte, lhs, rhs); #endif } }; template <> struct batch_kernel : public avx512_int8_batch_kernel { static batch_type abs(const batch_type& rhs) { return rhs; } static batch_type min(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_min_epu8(lhs, rhs); #else XSIMD_APPLY_AVX2_FUNCTION_INT8(min, lhs, rhs); #endif } static batch_type max(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_max_epu8(lhs, rhs); #else XSIMD_APPLY_AVX2_FUNCTION_INT8(max, lhs, rhs); #endif } static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_cmpeq_epu8_mask(lhs, rhs); #else 
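            // (Illustrative note, not from the original source: on targets without AVX512BW,
            // XSIMD_APPLY_AVX2_FUNCTION_INT8 splits the 512-bit operands into two 256-bit
            // halves with XSIMD_SPLIT_AVX512, runs the AVX2 batch kernel on each half, and
            // re-merges the results with XSIMD_RETURN_MERGED_AVX -- the macros are defined
            // in xsimd_avx512_int_base.hpp further below.)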
XSIMD_APPLY_AVX2_FUNCTION_INT8(eq, lhs, rhs); #endif } static batch_bool_type neq(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_cmpneq_epu8_mask(lhs, rhs); #else XSIMD_APPLY_AVX2_FUNCTION_INT8(neq, lhs, rhs); #endif } static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_cmplt_epu8_mask(lhs, rhs); #else XSIMD_APPLY_AVX2_FUNCTION_INT8(lt, lhs, rhs); #endif } static batch_bool_type lte(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_cmple_epu8_mask(lhs, rhs); #else XSIMD_APPLY_AVX2_FUNCTION_INT8(lte, lhs, rhs); #endif } static batch_type sadd(const batch_type &lhs, const batch_type &rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_adds_epu8(lhs, rhs); #else XSIMD_APPLY_AVX2_FUNCTION_UINT8(sadd, lhs, rhs); #endif } static batch_type ssub(const batch_type &lhs, const batch_type &rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) return _mm512_subs_epu8(lhs, rhs); #else XSIMD_APPLY_AVX2_FUNCTION_UINT8(ssub, lhs, rhs); #endif } }; } inline batch operator<<(const batch& lhs, int32_t rhs) { #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) __m512i tmp = _mm512_sllv_epi32(lhs, _mm512_set1_epi32(rhs)); #else __m512i tmp = _mm512_slli_epi32(lhs, rhs); #endif return _mm512_and_si512(_mm512_set1_epi8(0xFF << rhs), tmp); } inline batch operator>>(const batch& lhs, int32_t rhs) { #if defined(XSIMD_AVX512BW_AVAILABLE) __m512i sign_mask = _mm512_set1_epi16((0xFF00 >> rhs) & 0x00FF); __m512i zeros = _mm512_setzero_si512(); __mmask64 cmp_is_negative_mask = _mm512_cmpgt_epi8_mask(zeros, lhs); __m512i cmp_sign_mask = _mm512_mask_blend_epi8(cmp_is_negative_mask, zeros, sign_mask); #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) __m512i res = _mm512_srav_epi16(lhs, _mm512_set1_epi16(rhs)); #else __m512i res = _mm512_srai_epi16(lhs, rhs); #endif return _mm512_or_si512(cmp_sign_mask, _mm512_andnot_si512(sign_mask, res)); #else return avx512_detail::shift_impl([](int8_t val, int32_t s) { return val >> s; }, lhs, rhs); #endif } inline batch operator<<(const batch& lhs, const batch& rhs) { return avx512_detail::shift_impl([](int8_t val, int8_t s) { return val << s; }, lhs, rhs); } inline batch operator>>(const batch& lhs, const batch& rhs) { return avx512_detail::shift_impl([](int8_t val, int8_t s) { return val >> s; }, lhs, rhs); } XSIMD_DEFINE_LOAD_STORE_INT8(int8_t, 64, 64) XSIMD_DEFINE_LOAD_STORE_LONG(int8_t, 64, 64) inline batch operator<<(const batch& lhs, int32_t rhs) { #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) __m512i tmp = _mm512_sllv_epi32(lhs, _mm512_set1_epi32(rhs)); #else __m512i tmp = _mm512_slli_epi32(lhs, rhs); #endif return _mm512_and_si512(_mm512_set1_epi8(0xFF << rhs), tmp); } inline batch operator>>(const batch& lhs, int32_t rhs) { #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) __m512i tmp = _mm512_srlv_epi32(lhs, _mm512_set1_epi32(rhs)); #else __m512i tmp = _mm512_srli_epi32(lhs, rhs); #endif return _mm512_and_si512(_mm512_set1_epi8(0xFF >> rhs), tmp); } inline batch operator<<(const batch& lhs, const batch& rhs) { return avx512_detail::shift_impl([](uint8_t val, int8_t s) { return val << s; }, lhs, rhs); } inline batch operator>>(const batch& lhs, const batch& rhs) { return avx512_detail::shift_impl([](uint8_t val, int8_t s) { return val >> s; }, lhs, rhs); } XSIMD_DEFINE_LOAD_STORE_INT8(uint8_t, 64, 64) XSIMD_DEFINE_LOAD_STORE_LONG(uint8_t, 64, 64) #undef XSIMD_APPLY_AVX2_FUNCTION_INT8 } #endif 
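// --- Illustrative sketch (not part of xsimd): AVX512F has no native 8-bit shifts, so
// the int8/uint8 operators above shift whole 32-bit lanes and then mask away the bits
// that bled in from the neighbouring byte (the low `count` bits of every byte after a
// left shift).  The same trick on four bytes packed into a uint32_t, in plain C++:
#include <cstdint>

inline std::uint32_t shl_packed_bytes(std::uint32_t four_bytes, int count) // 0 <= count < 8
{
    const std::uint32_t shifted   = four_bytes << count;          // bytes bleed upward
    const std::uint32_t byte_mask = (0xFFu << count) & 0xFFu;     // bits a byte still owns
    const std::uint32_t lane_mask = byte_mask * 0x01010101u;      // replicate to all four bytes
    return shifted & lane_mask;                                   // drop the contaminated bits
}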
xsimd-7.6.0/include/xsimd/types/xsimd_avx512_int_base.hpp000066400000000000000000000371161410101234500233710ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX_INT512_BASE_HPP #define XSIMD_AVX_INT512_BASE_HPP #include "xsimd_base.hpp" #include "xsimd_utils.hpp" namespace xsimd { #define XSIMD_SPLIT_AVX512(avx_name) \ __m256i avx_name##_low = _mm512_castsi512_si256((__m512i)avx_name); \ __m256i avx_name##_high = _mm512_extracti64x4_epi64((__m512i)avx_name, 1) \ #define XSIMD_SPLITPS_AVX512(avx_name) \ __m256 avx_name##_low = _mm512_castps512_ps256((__m512)avx_name); \ __m256 avx_name##_high = _mm512_extractf32x8_ps((__m512)avx_name, 1) \ #define XSIMD_SPLITPD_AVX512(avx_name) \ __m256d avx_name##_low = _mm512_castpd512_pd256((__m512d)avx_name); \ __m256d avx_name##_high = _mm512_extractf64x4_pd((__m512d)avx_name, 1) \ #define XSIMD_RETURN_MERGED_AVX(res_low, res_high) \ __m512i result = _mm512_castsi256_si512(res_low); \ return _mm512_inserti64x4(result, res_high, 1) \ #define XSIMD_RETURN_MERGEDPS_AVX(res_low, res_high) \ __m512 result = _mm512_castps256_ps512(res_low); \ return _mm512_insertf32x8(result, res_high, 1) \ #define XSIMD_RETURN_MERGEDPD_AVX(res_low, res_high) \ __m512d result = _mm512_castpd256_pd512(res_low); \ return _mm512_insertf64x4(result, res_high, 1) \ #define XSIMD_APPLY_AVX2_FUNCTION(N, func, avx_lhs, avx_rhs) \ XSIMD_SPLIT_AVX512(avx_lhs); \ XSIMD_SPLIT_AVX512(avx_rhs); \ __m256i res_low = detail::batch_kernel :: func (avx_lhs##_low, avx_rhs##_low); \ __m256i res_high = detail::batch_kernel :: func (avx_lhs##_high, avx_rhs##_high); \ XSIMD_RETURN_MERGED_AVX(res_low, res_high); namespace detail { template struct mask_type; template <> struct mask_type<8> { using type = __mmask8; }; template <> struct mask_type<16> { using type = __mmask16; }; template <> struct mask_type<32> { using type = __mmask32; }; template <> struct mask_type<64> { using type = __mmask64; }; template using mask_type_t = typename mask_type::type; } template class avx512_int_batch : public simd_batch> { public: using base_type = simd_batch>; using mask_type = detail::mask_type_t; avx512_int_batch(); explicit avx512_int_batch(T i); template > avx512_int_batch(Args... 
exactly_N_scalars); explicit avx512_int_batch(const T* src); avx512_int_batch(const T* src, aligned_mode); avx512_int_batch(const T* src, unaligned_mode); avx512_int_batch(const __m512i& rhs); avx512_int_batch& operator=(const __m512i& rhs); avx512_int_batch(const batch_bool& rhs); avx512_int_batch& operator=(const batch_bool& rhs); operator __m512i() const; batch& load_aligned(const T* src); batch& load_unaligned(const T* src); batch& load_aligned(const flipped_sign_type_t* src); batch& load_unaligned(const flipped_sign_type_t* src); void store_aligned(T* dst) const; void store_unaligned(T* dst) const; void store_aligned(flipped_sign_type_t* dst) const; void store_unaligned(flipped_sign_type_t* dst) const; using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; }; /*********************************** * avx512_int_batch implementation * ***********************************/ namespace avx512_detail { inline __m512i int_init(std::integral_constant, int8_t t0, int8_t t1, int8_t t2, int8_t t3, int8_t t4, int8_t t5, int8_t t6, int8_t t7, int8_t t8, int8_t t9, int8_t t10, int8_t t11, int8_t t12, int8_t t13, int8_t t14, int8_t t15, int8_t t16, int8_t t17, int8_t t18, int8_t t19, int8_t t20, int8_t t21, int8_t t22, int8_t t23, int8_t t24, int8_t t25, int8_t t26, int8_t t27, int8_t t28, int8_t t29, int8_t t30, int8_t t31, int8_t t32, int8_t t33, int8_t t34, int8_t t35, int8_t t36, int8_t t37, int8_t t38, int8_t t39, int8_t t40, int8_t t41, int8_t t42, int8_t t43, int8_t t44, int8_t t45, int8_t t46, int8_t t47, int8_t t48, int8_t t49, int8_t t50, int8_t t51, int8_t t52, int8_t t53, int8_t t54, int8_t t55, int8_t t56, int8_t t57, int8_t t58, int8_t t59, int8_t t60, int8_t t61, int8_t t62, int8_t t63) { #if defined(__clang__) || __GNUC__ return __extension__ (__m512i)(__v64qi) { t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29, t30, t31, t32, t33, t34, t35, t36, t37, t38, t39, t40, t41, t42, t43, t44, t45, t46, t47, t48, t49, t50, t51, t52, t53, t54, t55, t56, t57, t58, t59, t60, t61, t62, t63 }; #else return _mm512_set_epi8( t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29, t30, t31, t32, t33, t34, t35, t36, t37, t38, t39, t40, t41, t42, t43, t44, t45, t46, t47, t48, t49, t50, t51, t52, t53, t54, t55, t56, t57, t58, t59, t60, t61, t62, t63); #endif } inline __m512i int_init(std::integral_constant, int16_t t0, int16_t t1, int16_t t2, int16_t t3, int16_t t4, int16_t t5, int16_t t6, int16_t t7, int16_t t8, int16_t t9, int16_t t10, int16_t t11, int16_t t12, int16_t t13, int16_t t14, int16_t t15, int16_t t16, int16_t t17, int16_t t18, int16_t t19, int16_t t20, int16_t t21, int16_t t22, int16_t t23, int16_t t24, int16_t t25, int16_t t26, int16_t t27, int16_t t28, int16_t t29, int16_t t30, int16_t t31) { #if defined(__clang__) || __GNUC__ return __extension__ (__m512i)(__v32hi) { t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29, t30, t31 }; #else return _mm512_set_epi16( t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29, t30, t31); #endif } inline __m512i int_init(std::integral_constant, int32_t t0, int32_t t1, int32_t t2, int32_t t3, int32_t t4, int32_t t5, int32_t t6, int32_t t7, 
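        // (Illustrative note, not from the original source: int_init and int_set are
        // selected by tag dispatch -- the avx512_int_batch constructors pass an
        // element-width tag, std::integral_constant<std::size_t, sizeof(T)>{} in the
        // real headers (the template arguments were stripped by the text extraction
        // here), so that overload resolution picks the epi8/epi16/epi32/epi64 flavour
        // matching the element width, e.g.
        //     int_set(std::integral_constant<std::size_t, 4>{}, v)  ->  _mm512_set1_epi32(v).)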
int32_t t8, int32_t t9, int32_t t10, int32_t t11, int32_t t12, int32_t t13, int32_t t14, int32_t t15) { // _mm512_setr_epi32 is a macro, preventing parameter pack expansion ... return _mm512_setr_epi32(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15); } inline __m512i int_init(std::integral_constant, int64_t t0, int64_t t1, int64_t t2, int64_t t3, int64_t t4, int64_t t5, int64_t t6, int64_t t7) { // _mm512_setr_epi64 is a macro, preventing parameter pack expansion ... return _mm512_setr_epi64(t0, t1, t2, t3, t4, t5, t6, t7); } template inline __m512i int_set(std::integral_constant, T v) { return _mm512_set1_epi8(v); } template inline __m512i int_set(std::integral_constant, T v) { return _mm512_set1_epi16(v); } template inline __m512i int_set(std::integral_constant, T v) { return _mm512_set1_epi32(v); } template inline __m512i int_set(std::integral_constant, T v) { return _mm512_set1_epi64(v); } } template inline avx512_int_batch::avx512_int_batch() { } template inline avx512_int_batch::avx512_int_batch(T i) : base_type(avx512_detail::int_set(std::integral_constant{}, i)) { } template template inline avx512_int_batch::avx512_int_batch(Args... args) : base_type(avx512_detail::int_init(std::integral_constant{}, args...)) { } template inline avx512_int_batch::avx512_int_batch(const T* src) : base_type(_mm512_loadu_si512((__m512i const*) src)) { } template inline avx512_int_batch::avx512_int_batch(const T* src, aligned_mode) : base_type(_mm512_load_si512((__m512i const*) src)) { } template inline avx512_int_batch::avx512_int_batch(const T* src, unaligned_mode) : base_type(_mm512_loadu_si512((__m512i const*) src)) { } template inline avx512_int_batch::avx512_int_batch(const __m512i& rhs) : base_type(rhs) { } template inline avx512_int_batch& avx512_int_batch::operator=(const __m512i& rhs) { this->m_value = rhs; return *this; } template inline avx512_int_batch::avx512_int_batch(const batch_bool& rhs) : base_type(detail::batch_kernel::select(rhs, batch(T(1)), batch(T(0)))) { } template avx512_int_batch& avx512_int_batch::operator=(const batch_bool& rhs) { this->m_value = detail::batch_kernel::select(rhs, batch(T(1)), batch(T(0))); return *this; } template inline avx512_int_batch::operator __m512i() const { return this->m_value; } template inline batch& avx512_int_batch::load_aligned(const T* src) { this->m_value = _mm512_load_si512((__m512i const*) src); return (*this)(); } template inline batch& avx512_int_batch::load_unaligned(const T* src) { this->m_value = _mm512_loadu_si512((__m512i const*) src); return (*this)(); } template inline batch& avx512_int_batch::load_aligned(const flipped_sign_type_t* src) { this->m_value = _mm512_load_si512((__m512i const*) src); return (*this)(); } template inline batch& avx512_int_batch::load_unaligned(const flipped_sign_type_t* src) { this->m_value = _mm512_loadu_si512((__m512i const*) src); return (*this)(); } template inline void avx512_int_batch::store_aligned(T* dst) const { _mm512_store_si512(dst, this->m_value); } template inline void avx512_int_batch::store_unaligned(T* dst) const { _mm512_storeu_si512(dst, this->m_value); } template inline void avx512_int_batch::store_aligned(flipped_sign_type_t* dst) const { _mm512_store_si512(dst, this->m_value); } template inline void avx512_int_batch::store_unaligned(flipped_sign_type_t* dst) const { _mm512_storeu_si512(dst, this->m_value); } namespace detail { template struct avx512_int_kernel_base { using batch_type = B; static batch_type fmin(const batch_type& lhs, const batch_type& rhs) { 
return min(lhs, rhs); } static batch_type fmax(const batch_type& lhs, const batch_type& rhs) { return max(lhs, rhs); } static batch_type fabs(const batch_type& rhs) { return abs(rhs); } }; } namespace avx512_detail { template inline batch shift_impl(F&& f, const batch& lhs, int32_t rhs) { alignas(64) T tmp_lhs[N], tmp_res[N]; lhs.store_aligned(&tmp_lhs[0]); unroller([&](std::size_t i) { tmp_res[i] = f(tmp_lhs[i], rhs); }); return batch(tmp_res, aligned_mode()); } template inline batch shift_impl(F&& f, const batch& lhs, const batch& rhs) { alignas(64) T tmp_lhs[N], tmp_res[N]; alignas(64) S tmp_rhs[N]; lhs.store_aligned(&tmp_lhs[0]); rhs.store_aligned(&tmp_rhs[0]); unroller([&](std::size_t i) { tmp_res[i] = f(tmp_lhs[i], tmp_rhs[i]); }); return batch(tmp_res, aligned_mode()); } } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_avx_complex.hpp000066400000000000000000000452561410101234500230300ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX_COMPLEX_HPP #define XSIMD_AVX_COMPLEX_HPP #include #include #include #ifdef XSIMD_ENABLE_XTL_COMPLEX #include "xtl/xcomplex.hpp" #endif #include "xsimd_avx_float.hpp" #include "xsimd_avx_double.hpp" #include "xsimd_complex_base.hpp" namespace xsimd { /************************************** * batch_bool, 8> * **************************************/ template <> struct simd_batch_traits, 8>> : complex_batch_bool_traits, float, 8, 32> { }; template<> class batch_bool, 8> : public simd_complex_batch_bool, 8>> { public: using self_type = batch_bool, 8>; using base_type = simd_complex_batch_bool; using real_batch = batch_bool; batch_bool() = default; using base_type::base_type; // VS2015 has a bug with inheriting constructors involving SFINAE batch_bool(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7) : base_type(real_batch(b0, b1, b2, b3, b4, b5, b6, b7)) { } }; /********************************* * batch, 8> * *********************************/ template <> struct simd_batch_traits, 8>> : complex_batch_traits, float, 8, 32> { }; template <> class batch, 8> : public simd_complex_batch, 8>> { public: using self_type = batch, 8>; using base_type = simd_complex_batch; using value_type = std::complex; using real_batch = batch; batch() = default; using base_type::base_type; // VS2015 has a bug with inheriting constructors involving SFINAE batch(value_type c0, value_type c1, value_type c2, value_type c3, value_type c4, value_type c5, value_type c6, value_type c7) : base_type(real_batch(c0.real(), c1.real(), c2.real(), c3.real(), c4.real(), c5.real(), c6.real(), c7.real()), real_batch(c0.imag(), c1.imag(), c2.imag(), c3.imag(), c4.imag(), c5.imag(), c6.imag(), c7.imag())) { } private: batch& load_complex(const real_batch& hi, const real_batch& lo); real_batch get_complex_high() const; real_batch get_complex_low() const; friend class simd_complex_batch, 8>>; }; /*************************************** * batch_bool, 4> * ***************************************/ template <> struct simd_batch_traits, 4>> : complex_batch_bool_traits, double, 4, 32> { }; template<> class batch_bool, 4> : public simd_complex_batch_bool, 4>> { public: using 
self_type = batch_bool, 4>; using base_type = simd_complex_batch_bool; using real_batch = batch_bool; batch_bool() = default; using base_type::base_type; // VS2015 has a bug with inheriting constructors involving SFINAE batch_bool(bool b0, bool b1, bool b2, bool b3) : base_type(real_batch(b0, b1, b2, b3)) { } }; /********************************** * batch, 4> * **********************************/ template <> struct simd_batch_traits, 4>> : complex_batch_traits, double, 4, 32> { }; template <> class batch, 4> : public simd_complex_batch, 4>> { public: using self_type = batch, 4>; using base_type = simd_complex_batch; using value_type = std::complex; using real_batch = batch; batch() = default; using base_type::base_type; // VS2015 has a bug with inheriting constructors involving SFINAE batch(value_type c0, value_type c1, value_type c2, value_type c3) : base_type(real_batch(c0.real(), c1.real(), c2.real(), c3.real()), real_batch(c0.imag(), c1.imag(), c2.imag(), c3.imag())) { } private: batch& load_complex(const real_batch& hi, const real_batch& lo); real_batch get_complex_high() const; real_batch get_complex_low() const; friend class simd_complex_batch, 4>>; }; /********************************************** * common functions to avoid code duplication * **********************************************/ namespace detail { template inline std::pair load_complex_f(const B& hi, const B& lo) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION B real = _mm256_castpd_ps( _mm256_permute4x64_pd( _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0))), _MM_SHUFFLE(3, 1, 2, 0))); B imag = _mm256_castpd_ps( _mm256_permute4x64_pd( _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1))), _MM_SHUFFLE(3, 1, 2, 0))); #else __m128 tmp0 = _mm256_extractf128_ps(hi, 0); __m128 tmp1 = _mm256_extractf128_ps(hi, 1); __m128 tmp_real = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0)); __m128 tmp_imag = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1)); B real, imag; real = _mm256_insertf128_ps(real, tmp_real, 0); imag = _mm256_insertf128_ps(imag, tmp_imag, 0); tmp0 = _mm256_extractf128_ps(lo, 0); tmp1 = _mm256_extractf128_ps(lo, 1); tmp_real = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0)); tmp_imag = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1)); real = _mm256_insertf128_ps(real, tmp_real, 1); imag = _mm256_insertf128_ps(imag, tmp_imag, 1); #endif return std::make_pair(real, imag); } // On clang, _mm256_extractf128_ps is built upon build_shufflevector // which require index parameter to be a constant template inline B get_half_complex_f(const B& real, const B& imag) { __m128 tmp0 = _mm256_extractf128_ps(real, index); __m128 tmp1 = _mm256_extractf128_ps(imag, index); __m128 tmp2 = _mm_unpackhi_ps(tmp0, tmp1); tmp0 = _mm_unpacklo_ps(tmp0, tmp1); __m256 res = real; res = _mm256_insertf128_ps(res, tmp0, 0); res = _mm256_insertf128_ps(res, tmp2, 1); return res; } template inline B get_complex_high_f(const B& real, const B& imag) { return get_half_complex_f<0>(real, imag); } template inline B get_complex_low_f(const B& real, const B& imag) { return get_half_complex_f<1>(real, imag); } template inline std::pair load_complex_d(const B& hi, const B& lo) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION B real = _mm256_permute4x64_pd(_mm256_unpacklo_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0)); B imag = _mm256_permute4x64_pd(_mm256_unpackhi_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0)); #else __m128d tmp0 = _mm256_extractf128_pd(hi, 0); __m128d tmp1 = _mm256_extractf128_pd(hi, 1); B real, imag; 
__m256d re_tmp0 = _mm256_insertf128_pd(real, _mm_unpacklo_pd(tmp0, tmp1), 0); __m256d im_tmp0 = _mm256_insertf128_pd(imag, _mm_unpackhi_pd(tmp0, tmp1), 0); tmp0 = _mm256_extractf128_pd(lo, 0); tmp1 = _mm256_extractf128_pd(lo, 1); __m256d re_tmp1 = _mm256_insertf128_pd(real, _mm_unpacklo_pd(tmp0, tmp1), 1); __m256d im_tmp1 = _mm256_insertf128_pd(imag, _mm_unpackhi_pd(tmp0, tmp1), 1); real = _mm256_blend_pd(re_tmp0, re_tmp1, 12); imag = _mm256_blend_pd(im_tmp0, im_tmp1, 12); #endif return std::make_pair(real, imag); } // On clang, _mm256_extractf128_pd is built upon build_shufflevector // which require index parameter to be a constant template inline B get_half_complex_d(const B& real, const B& imag) { __m128d tmp0 = _mm256_extractf128_pd(real, index); __m128d tmp1 = _mm256_extractf128_pd(imag, index); __m128d tmp2 = _mm_unpackhi_pd(tmp0, tmp1); tmp0 = _mm_unpacklo_pd(tmp0, tmp1); __m256d res = real; res = _mm256_insertf128_pd(res, tmp0, 0); res = _mm256_insertf128_pd(res, tmp2, 1); return res; } template inline B get_complex_high_d(const B& real, const B& imag) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION __m256d tmp0 = _mm256_permute4x64_pd(real, _MM_SHUFFLE(3, 1, 1, 0)); __m256d tmp1 = _mm256_permute4x64_pd(imag, _MM_SHUFFLE(1, 2, 0, 0)); return _mm256_blend_pd(tmp0, tmp1, 10); #else return get_half_complex_d<0>(real, imag); #endif } template inline B get_complex_low_d(const B& real, const B& imag) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION __m256d tmp0 = _mm256_permute4x64_pd(real, _MM_SHUFFLE(3, 3, 1, 2)); __m256d tmp1 = _mm256_permute4x64_pd(imag, _MM_SHUFFLE(3, 2, 2, 0)); return _mm256_blend_pd(tmp0, tmp1, 10); #else return get_half_complex_d<1>(real, imag); #endif } } /******************************************** * batch, N> implementation * ********************************************/ inline batch, 8>& batch, 8>::load_complex(const real_batch& hi, const real_batch& lo) { std::tie(this->m_real, this->m_imag) = detail::load_complex_f(hi, lo); return *this; } inline auto batch, 8>::get_complex_high() const -> real_batch { return detail::get_complex_high_f(this->m_real, this->m_imag); } inline auto batch, 8>::get_complex_low() const -> real_batch { return detail::get_complex_low_f(this->m_real, this->m_imag); } inline batch, 4>& batch, 4>::load_complex(const real_batch& hi, const real_batch& lo) { std::tie(m_real, m_imag) = detail::load_complex_d(hi, lo); return *this; } inline auto batch, 4>::get_complex_high() const -> real_batch { return detail::get_complex_high_d(this->m_real, this->m_imag); } inline auto batch, 4>::get_complex_low() const -> real_batch { return detail::get_complex_low_d(this->m_real, this->m_imag); } #ifdef XSIMD_ENABLE_XTL_COMPLEX /**************************************************** * batch_bool, 8> * ****************************************************/ template struct simd_batch_traits, 8>> : complex_batch_bool_traits, float, 8, 32> { }; template class batch_bool, 8> : public simd_complex_batch_bool, 8>> { public: using self_type = batch_bool, 8>; using base_type = simd_complex_batch_bool; using real_batch = batch_bool; batch_bool() = default; using base_type::base_type; // VS2015 has a bug with inheriting constructors involving SFINAE batch_bool(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7) : base_type(real_batch(b0, b1, b2, b3, b4, b5, b6, b7)) { } }; /*********************************************** * batch, 8> * ***********************************************/ template struct simd_batch_traits, 8>> : 
complex_batch_traits, float, 8, 32> { }; template class batch, 8> : public simd_complex_batch, 8>> { public: using self_type = batch, 8>; using base_type = simd_complex_batch; using value_type = xtl::xcomplex; using real_batch = batch; batch() = default; using base_type::base_type; // VS2015 has a bug with inheriting constructors involving SFINAE batch(value_type c0, value_type c1, value_type c2, value_type c3, value_type c4, value_type c5, value_type c6, value_type c7) : base_type(real_batch(c0.real(), c1.real(), c2.real(), c3.real(), c4.real(), c5.real(), c6.real(), c7.real()), real_batch(c0.imag(), c1.imag(), c2.imag(), c3.imag(), c4.imag(), c5.imag(), c6.imag(), c7.imag())) { } private: batch& load_complex(const real_batch& hi, const real_batch& lo); real_batch get_complex_high() const; real_batch get_complex_low() const; friend class simd_complex_batch, 8>>; }; /****************************************************** * batch_bool, 4> * ******************************************************/ template struct simd_batch_traits, 4>> : complex_batch_bool_traits, double, 4, 32> { }; template class batch_bool, 4> : public simd_complex_batch_bool, 4>> { public: using self_type = batch_bool, 4>; using base_type = simd_complex_batch_bool; using real_batch = batch_bool; batch_bool() = default; using base_type::base_type; // VS2015 has a bug with inheriting constructors involving SFINAE batch_bool(bool b0, bool b1, bool b2, bool b3) : base_type(real_batch(b0, b1, b2, b3)) { } }; /************************************************* * batch, 4> * *************************************************/ template struct simd_batch_traits, 4>> : complex_batch_traits, double, 4, 32> { }; template class batch, 4> : public simd_complex_batch, 4>> { public: using self_type = batch, 4>; using base_type = simd_complex_batch; using value_type = xtl::xcomplex; using real_batch = batch; batch() = default; using base_type::base_type; // VS2015 has a bug with inheriting constructors involving SFINAE batch(value_type c0, value_type c1, value_type c2, value_type c3) : base_type(real_batch(c0.real(), c1.real(), c2.real(), c3.real()), real_batch(c0.imag(), c1.imag(), c2.imag(), c3.imag())) { } private: batch& load_complex(const real_batch& hi, const real_batch& lo); real_batch get_complex_high() const; real_batch get_complex_low() const; friend class simd_complex_batch, 4>>; }; /******************************************** * batch, N> implementation * ********************************************/ template inline batch, 8>& batch, 8>::load_complex(const real_batch& hi, const real_batch& lo) { std::tie(this->m_real, this->m_imag) = detail::load_complex_f(hi, lo); return *this; } template inline auto batch, 8>::get_complex_high() const -> real_batch { return detail::get_complex_high_f(this->m_real, this->m_imag); } template inline auto batch, 8>::get_complex_low() const -> real_batch { return detail::get_complex_low_f(this->m_real, this->m_imag); } template inline batch, 4>& batch, 4>::load_complex(const real_batch& hi, const real_batch& lo) { std::tie(this->m_real, this->m_imag) = detail::load_complex_d(hi, lo); return *this; } template inline auto batch, 4>::get_complex_high() const -> real_batch { return detail::get_complex_high_d(this->m_real, this->m_imag); } template inline auto batch, 4>::get_complex_low() const -> real_batch { return detail::get_complex_low_d(this->m_real, this->m_imag); } #endif } #endif 
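// A minimal usage sketch of the complex batch interface implemented above:
// the load_complex_* / get_complex_{high,low}_* helpers let a
// batch<std::complex<float>, 8> read an interleaved (re, im, re, im, ...)
// array into separate real and imaginary registers and write it back.
// This sketch assumes AVX is enabled at compile time (e.g. -mavx) so these
// 256-bit specializations are selected, and it uses the aligned load/store
// members provided by simd_complex_batch; the helper name
// complex_batch_demo is illustrative only.
#include <complex>
#include <cstddef>
#include <iostream>

#include "xsimd/xsimd.hpp"

int complex_batch_demo()
{
    using cbatch = xsimd::batch<std::complex<float>, 8>;

    // Interleaved storage, 32-byte aligned to match the AVX load/store paths.
    alignas(32) std::complex<float> a[8], b[8], r[8];
    for (std::size_t i = 0; i < 8; ++i)
    {
        a[i] = std::complex<float>(float(i), 1.f);
        b[i] = std::complex<float>(2.f, -float(i));
    }

    cbatch va, vb;
    va.load_aligned(a);           // deinterleaves into real/imag lanes
    vb.load_aligned(b);
    cbatch vr = va * vb;          // element-wise complex multiply
    vr.store_aligned(r);          // re-interleaves on the way out

    for (std::size_t i = 0; i < 8; ++i)
    {
        std::cout << r[i] << ' ';
    }
    std::cout << '\n';
    return 0;
}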
xsimd-7.6.0/include/xsimd/types/xsimd_avx_conversion.hpp000066400000000000000000000251261410101234500235400ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX_CONVERSION_HPP #define XSIMD_AVX_CONVERSION_HPP #include "xsimd_avx_double.hpp" #include "xsimd_avx_float.hpp" #include "xsimd_avx_int8.hpp" #include "xsimd_avx_int16.hpp" #include "xsimd_avx_int32.hpp" #include "xsimd_avx_int64.hpp" namespace xsimd { /************************ * conversion functions * ************************/ batch to_int(const batch& x); batch to_int(const batch& x); batch to_float(const batch& x); batch to_float(const batch& x); batch u8_to_u16(const batch& x); batch u16_to_u8(const batch& x); batch u8_to_u32(const batch& x); batch u32_to_u8(const batch& x); batch u8_to_u64(const batch& x); batch u64_to_u8(const batch& x); /************************** * boolean cast functions * **************************/ batch_bool bool_cast(const batch_bool& x); batch_bool bool_cast(const batch_bool& x); batch_bool bool_cast(const batch_bool& x); batch_bool bool_cast(const batch_bool& x); /*************************************** * conversion functions implementation * ***************************************/ inline batch to_int(const batch& x) { return _mm256_cvttps_epi32(x); } inline batch to_int(const batch& x) { #if defined(XSIMD_AVX512VL_AVAILABLE) & defined(XSIMD_AVX512DQ_AVAILABLE) return _mm256_cvttpd_epi64(x); #else return batch(static_cast(x[0]), static_cast(x[1]), static_cast(x[2]), static_cast(x[3])); #endif } inline batch to_float(const batch& x) { return _mm256_cvtepi32_ps(x); } inline batch to_float(const batch& x) { #if defined(XSIMD_AVX512VL_AVAILABLE) & defined(XSIMD_AVX512DQ_AVAILABLE) return _mm256_cvtepi64_pd(x); #else return batch(static_cast(x[0]), static_cast(x[1]), static_cast(x[2]), static_cast(x[3])); #endif } inline batch u8_to_u16(const batch& x) { return static_cast>(x); } inline batch u16_to_u8(const batch& x) { return static_cast>(x); } inline batch u8_to_u32(const batch& x) { return static_cast>(x); } inline batch u32_to_u8(const batch& x) { return static_cast>(x); } inline batch u8_to_u64(const batch& x) { return static_cast>(x); } inline batch u64_to_u8(const batch& x) { return static_cast>(x); } /***************************************** * batch cast functions implementation * *****************************************/ XSIMD_BATCH_CAST_IMPLICIT(int8_t, uint8_t, 32) XSIMD_BATCH_CAST_IMPLICIT(uint8_t, int8_t, 32) XSIMD_BATCH_CAST_IMPLICIT(int16_t, uint16_t, 16) XSIMD_BATCH_CAST_IMPLICIT(uint16_t, int16_t, 16) XSIMD_BATCH_CAST_IMPLICIT(int32_t, uint32_t, 8) XSIMD_BATCH_CAST_INTRINSIC(int32_t, float, 8, _mm256_cvtepi32_ps) XSIMD_BATCH_CAST_INTRINSIC(int32_t, double, 4, _mm256_cvtepi32_pd) XSIMD_BATCH_CAST_IMPLICIT(uint32_t, int32_t, 8) XSIMD_BATCH_CAST_IMPLICIT(int64_t, uint64_t, 4) XSIMD_BATCH_CAST_IMPLICIT(uint64_t, int64_t, 4) XSIMD_BATCH_CAST_INTRINSIC(float, int32_t, 8, _mm256_cvttps_epi32) XSIMD_BATCH_CAST_INTRINSIC(float, double, 4, _mm256_cvtps_pd) XSIMD_BATCH_CAST_INTRINSIC(double, int32_t, 4, _mm256_cvttpd_epi32) XSIMD_BATCH_CAST_INTRINSIC(double, float, 4, _mm256_cvtpd_ps) #if 
XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION XSIMD_BATCH_CAST_INTRINSIC(int8_t, int16_t, 16, _mm256_cvtepi8_epi16) XSIMD_BATCH_CAST_INTRINSIC(int8_t, uint16_t, 16, _mm256_cvtepi8_epi16) XSIMD_BATCH_CAST_INTRINSIC(uint8_t, int16_t, 16, _mm256_cvtepu8_epi16) XSIMD_BATCH_CAST_INTRINSIC(uint8_t, uint16_t, 16, _mm256_cvtepu8_epi16) XSIMD_BATCH_CAST_INTRINSIC(int16_t, int32_t, 8, _mm256_cvtepi16_epi32) XSIMD_BATCH_CAST_INTRINSIC(int16_t, uint32_t, 8, _mm256_cvtepi16_epi32) XSIMD_BATCH_CAST_INTRINSIC2(int16_t, float, 8, _mm256_cvtepi16_epi32, _mm256_cvtepi32_ps) XSIMD_BATCH_CAST_INTRINSIC(uint16_t, int32_t, 8, _mm256_cvtepu16_epi32) XSIMD_BATCH_CAST_INTRINSIC(uint16_t, uint32_t, 8, _mm256_cvtepu16_epi32) XSIMD_BATCH_CAST_INTRINSIC2(uint16_t, float, 8, _mm256_cvtepu16_epi32, _mm256_cvtepi32_ps) XSIMD_BATCH_CAST_INTRINSIC(int32_t, int64_t, 4, _mm256_cvtepi32_epi64) XSIMD_BATCH_CAST_INTRINSIC(int32_t, uint64_t, 4, _mm256_cvtepi32_epi64) XSIMD_BATCH_CAST_INTRINSIC(uint32_t, int64_t, 4, _mm256_cvtepu32_epi64) XSIMD_BATCH_CAST_INTRINSIC(uint32_t, uint64_t, 4, _mm256_cvtepu32_epi64) #endif #if defined(XSIMD_AVX512VL_AVAILABLE) #if defined(XSIMD_AVX512BW_AVAILABLE) XSIMD_BATCH_CAST_INTRINSIC(int16_t, int8_t, 16, _mm256_cvtepi16_epi8) XSIMD_BATCH_CAST_INTRINSIC(int16_t, uint8_t, 16, _mm256_cvtepi16_epi8) XSIMD_BATCH_CAST_INTRINSIC(uint16_t, int8_t, 16, _mm256_cvtepi16_epi8) XSIMD_BATCH_CAST_INTRINSIC(uint16_t, uint8_t, 16, _mm256_cvtepi16_epi8) #endif XSIMD_BATCH_CAST_INTRINSIC(int32_t, int16_t, 8, _mm256_cvtepi32_epi16) XSIMD_BATCH_CAST_INTRINSIC(int32_t, uint16_t, 8, _mm256_cvtepi32_epi16) XSIMD_BATCH_CAST_INTRINSIC(uint32_t, int16_t, 8, _mm256_cvtepi32_epi16) XSIMD_BATCH_CAST_INTRINSIC(uint32_t, uint16_t, 8, _mm256_cvtepi32_epi16) #if defined(_MSC_VER) namespace detail { static inline __m256 xsimd_mm256_cvtepu32_ps(__m256i a) { return _mm512_castps512_ps256(_mm512_cvtepu32_ps(_mm512_castsi256_si512(a))); } } XSIMD_BATCH_CAST_INTRINSIC(uint32_t, float, 8, detail::xsimd_mm256_cvtepu32_ps) #else XSIMD_BATCH_CAST_INTRINSIC(uint32_t, float, 8, _mm256_cvtepu32_ps) #endif XSIMD_BATCH_CAST_INTRINSIC(uint32_t, double, 4, _mm256_cvtepu32_pd) XSIMD_BATCH_CAST_INTRINSIC(int64_t, int32_t, 4, _mm256_cvtepi64_epi32) XSIMD_BATCH_CAST_INTRINSIC(int64_t, uint32_t, 4, _mm256_cvtepi64_epi32) XSIMD_BATCH_CAST_INTRINSIC(uint64_t, int32_t, 4, _mm256_cvtepi64_epi32) XSIMD_BATCH_CAST_INTRINSIC(uint64_t, uint32_t, 4, _mm256_cvtepi64_epi32) XSIMD_BATCH_CAST_INTRINSIC2(float, int16_t, 8, _mm256_cvttps_epi32, _mm256_cvtepi32_epi16) XSIMD_BATCH_CAST_INTRINSIC2(float, uint16_t, 8, _mm256_cvttps_epi32, _mm256_cvtepi32_epi16) XSIMD_BATCH_CAST_INTRINSIC(float, uint32_t, 8, _mm256_cvttps_epu32) XSIMD_BATCH_CAST_INTRINSIC(double, uint32_t, 4, _mm256_cvttpd_epu32) #if defined(XSIMD_AVX512DQ_AVAILABLE) XSIMD_BATCH_CAST_INTRINSIC(int64_t, float, 4, _mm256_cvtepi64_ps) XSIMD_BATCH_CAST_INTRINSIC(int64_t, double, 4, _mm256_cvtepi64_pd) XSIMD_BATCH_CAST_INTRINSIC(uint64_t, float, 4, _mm256_cvtepu64_ps) XSIMD_BATCH_CAST_INTRINSIC(uint64_t, double, 4, _mm256_cvtepu64_pd) XSIMD_BATCH_CAST_INTRINSIC(float, int64_t, 4, _mm256_cvttps_epi64) XSIMD_BATCH_CAST_INTRINSIC(float, uint64_t, 4, _mm256_cvttps_epu64) XSIMD_BATCH_CAST_INTRINSIC(double, int64_t, 4, _mm256_cvttpd_epi64) XSIMD_BATCH_CAST_INTRINSIC(double, uint64_t, 4, _mm256_cvttpd_epu64) #endif #endif /************************** * boolean cast functions * **************************/ inline batch_bool bool_cast(const batch_bool& x) { return _mm256_castps_si256(x); } inline batch_bool 
bool_cast(const batch_bool& x) { return _mm256_castpd_si256(x); } inline batch_bool bool_cast(const batch_bool& x) { return _mm256_castsi256_ps(x); } inline batch_bool bool_cast(const batch_bool& x) { return _mm256_castsi256_pd(x); } /***************************************** * bitwise cast functions implementation * *****************************************/ XSIMD_BITWISE_CAST_INTRINSIC(float, 8, double, 4, _mm256_castps_pd) XSIMD_BITWISE_CAST_INTRINSIC(float, 8, int32_t, 8, _mm256_castps_si256) XSIMD_BITWISE_CAST_INTRINSIC(float, 8, int64_t, 4, _mm256_castps_si256) XSIMD_BITWISE_CAST_INTRINSIC(double, 4, float, 8, _mm256_castpd_ps) XSIMD_BITWISE_CAST_INTRINSIC(double, 4, int32_t, 8, _mm256_castpd_si256) XSIMD_BITWISE_CAST_INTRINSIC(double, 4, int64_t, 4, _mm256_castpd_si256) XSIMD_BITWISE_CAST_INTRINSIC(int32_t, 8, float, 8, _mm256_castsi256_ps) XSIMD_BITWISE_CAST_INTRINSIC(int32_t, 8, double, 4, _mm256_castsi256_pd) XSIMD_BITWISE_CAST_INTRINSIC(int64_t, 4, float, 8, _mm256_castsi256_ps) XSIMD_BITWISE_CAST_INTRINSIC(int64_t, 4, double, 4, _mm256_castsi256_pd) } #endif xsimd-7.6.0/include/xsimd/types/xsimd_avx_double.hpp000066400000000000000000000536161410101234500226320ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX_DOUBLE_HPP #define XSIMD_AVX_DOUBLE_HPP #include "xsimd_base.hpp" #include namespace xsimd { /************************* * batch_bool * *************************/ template <> struct simd_batch_traits> { using value_type = double; static constexpr std::size_t size = 4; using batch_type = batch; static constexpr std::size_t align = 32; }; template <> class batch_bool : public simd_batch_bool> { public: batch_bool(); explicit batch_bool(bool b); batch_bool(bool b0, bool b1, bool b2, bool b3); batch_bool(const __m256d& rhs); batch_bool& operator=(const __m256d& rhs); operator __m256d() const; bool_proxy operator[](std::size_t index); bool operator[](std::size_t index) const; __m256d get_value() const; private: batch_bool& load_values(bool b0, bool b1, bool b2, bool b3); union { __m256d m_value; double m_array[4]; }; friend class simd_batch_bool>; }; /******************** * batch * ********************/ template <> struct simd_batch_traits> { using value_type = double; static constexpr std::size_t size = 4; using batch_bool_type = batch_bool; static constexpr std::size_t align = 32; using storage_type = __m256d; }; template <> class batch : public simd_batch> { public: using self_type = batch; using base_type = simd_batch; using batch_bool_type = typename base_type::batch_bool_type; batch(); explicit batch(double d); batch(double d0, double d1, double d2, double d3); explicit batch(const double* src); batch(const double* src, aligned_mode); batch(const double* src, unaligned_mode); batch(const __m256d& rhs); batch& operator=(const __m256d& rhs); batch(const batch_bool_type& rhs); batch& operator=(const batch_bool_type& rhs); operator __m256d() const; XSIMD_DECLARE_LOAD_STORE_ALL(double, 4) XSIMD_DECLARE_LOAD_STORE_LONG(double, 4) using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; }; 
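// A minimal usage sketch of the batch<double, 4> interface declared just
// above (pointer constructors with aligned_mode, broadcast constructor,
// element-wise arithmetic and the hadd reduction inherited through
// simd_batch), assuming AVX is enabled at compile time; the function name
// avx_double_demo is illustrative only.
#include <iostream>

#include "xsimd/xsimd.hpp"

int avx_double_demo()
{
    alignas(32) double in[4] = {1.0, 2.0, 3.0, 4.0};
    alignas(32) double out[4];

    xsimd::batch<double, 4> x(in, xsimd::aligned_mode());
    xsimd::batch<double, 4> y(0.5);        // broadcast constructor

    xsimd::batch<double, 4> z = x * y + y; // element-wise arithmetic
    z.store_aligned(out);

    std::cout << xsimd::hadd(z) << '\n';   // horizontal sum of the four lanes
    return 0;
}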
/**************************************** * batch_bool implementation * ****************************************/ inline batch_bool::batch_bool() { } inline batch_bool::batch_bool(bool b) { m_value = _mm256_castsi256_pd(_mm256_set1_epi32(-(int)b)); } inline batch_bool::batch_bool(bool b0, bool b1, bool b2, bool b3) { m_value = _mm256_castsi256_pd( _mm256_setr_epi32(-(int)b0, -(int)b0, -(int)b1, -(int)b1, -(int)b2, -(int)b2, -(int)b3, -(int)b3)); } inline batch_bool::batch_bool(const __m256d& rhs) { m_value = rhs; } inline batch_bool& batch_bool::operator=(const __m256d& rhs) { m_value = rhs; return *this; } inline batch_bool::operator __m256d() const { return m_value; } inline bool_proxy batch_bool::operator[](std::size_t index) { return bool_proxy(m_array[index & 3]); } inline bool batch_bool::operator[](std::size_t index) const { return static_cast(m_array[index & 3]); } inline __m256d batch_bool::get_value() const { return m_value; } inline batch_bool& batch_bool::load_values(bool b0, bool b1, bool b2, bool b3) { m_value = _mm256_castsi256_pd( _mm256_setr_epi32(-(int)b0, -(int)b0, -(int)b1, -(int)b1, -(int)b2, -(int)b2, -(int)b3, -(int)b3)); return *this; } namespace detail { template <> struct batch_bool_kernel { using batch_type = batch_bool; static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) { return _mm256_and_pd(lhs, rhs); } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { return _mm256_or_pd(lhs, rhs); } static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { return _mm256_xor_pd(lhs, rhs); } static batch_type bitwise_not(const batch_type& rhs) { return _mm256_xor_pd(rhs, _mm256_castsi256_pd(_mm256_set1_epi32(-1))); } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { return _mm256_andnot_pd(lhs, rhs); } static batch_type equal(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_castsi256_pd(_mm256_cmpeq_epi64(_mm256_castpd_si256(lhs), _mm256_castpd_si256(rhs))); #else __m128i lhs_low = _mm256_castsi256_si128(_mm256_castpd_si256(lhs)); __m128i lhs_high = _mm256_extractf128_si256(_mm256_castpd_si256(lhs), 1); __m128i rhs_low = _mm256_castsi256_si128(_mm256_castpd_si256(rhs)); __m128i rhs_high = _mm256_extractf128_si256(_mm256_castpd_si256(rhs), 1); __m128i res_low = _mm_cmpeq_epi64(lhs_low, rhs_low); __m128i res_high = _mm_cmpeq_epi64(lhs_high, rhs_high); __m256i result = _mm256_castsi128_si256(res_low); return _mm256_castsi256_pd(_mm256_insertf128_si256(result, res_high, 1)); #endif } static batch_type not_equal(const batch_type& lhs, const batch_type& rhs) { return _mm256_xor_pd(lhs, rhs); } static bool all(const batch_type& rhs) { return _mm256_testc_pd(rhs, batch_bool(true)) != 0; } static bool any(const batch_type& rhs) { return !_mm256_testz_pd(rhs, rhs); } }; } /*********************************** * batch implementation * ***********************************/ inline batch::batch() { } inline batch::batch(double d) : base_type(_mm256_set1_pd(d)) { } inline batch::batch(double d0, double d1, double d2, double d3) : base_type(_mm256_setr_pd(d0, d1, d2, d3)) { } inline batch::batch(const double* src) : base_type(_mm256_loadu_pd(src)) { } inline batch::batch(const double* src, aligned_mode) : base_type(_mm256_load_pd(src)) { } inline batch::batch(const double* src, unaligned_mode) : base_type(_mm256_loadu_pd(src)) { } inline batch::batch(const __m256d& rhs) : base_type(rhs) { } inline batch& batch::operator=(const 
__m256d& rhs) { this->m_value = rhs; return *this; } inline batch::batch(const batch_bool_type& rhs) : base_type(_mm256_and_pd(rhs, batch(1.))) { } inline batch& batch::operator=(const batch_bool_type& rhs) { this->m_value = _mm256_and_pd(rhs, batch(1.)); return *this; } inline batch::operator __m256d() const { return this->m_value; } XSIMD_DEFINE_LOAD_STORE(double, 4, bool, 32) inline batch& batch::load_aligned(const int8_t* src) { __m128i tmp = _mm_loadl_epi64((const __m128i*)src); __m128i tmp2 = _mm_cvtepi8_epi32(tmp); this->m_value = _mm256_cvtepi32_pd(tmp2); return *this; } inline batch& batch::load_unaligned(const int8_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const uint8_t* src) { __m128i tmp = _mm_loadl_epi64((const __m128i*)src); __m128i tmp2 = _mm_cvtepu8_epi32(tmp); this->m_value = _mm256_cvtepi32_pd(tmp2); return *this; } inline batch& batch::load_unaligned(const uint8_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const int16_t* src) { __m128i tmp = _mm_loadl_epi64((const __m128i*)src); __m128i tmp2 = _mm_cvtepi16_epi32(tmp); this->m_value = _mm256_cvtepi32_pd(tmp2); return *this; } inline batch& batch::load_unaligned(const int16_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const uint16_t* src) { __m128i tmp = _mm_loadl_epi64((const __m128i*)src); __m128i tmp2 = _mm_cvtepu16_epi32(tmp); this->m_value = _mm256_cvtepi32_pd(tmp2); return *this; } inline batch& batch::load_unaligned(const uint16_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const int32_t* src) { this->m_value = _mm256_cvtepi32_pd(_mm_load_si128((__m128i const*)src)); return *this; } inline batch& batch::load_unaligned(const int32_t* src) { this->m_value = _mm256_cvtepi32_pd(_mm_loadu_si128((__m128i const*)src)); return *this; } XSIMD_DEFINE_LOAD_STORE(double, 4, uint32_t, 32) XSIMD_DEFINE_LOAD_STORE(double, 4, int64_t, 32) XSIMD_DEFINE_LOAD_STORE(double, 4, uint64_t, 32) XSIMD_DEFINE_LOAD_STORE_LONG(double, 4, 32) inline batch& batch::load_aligned(const float* src) { this->m_value = _mm256_cvtps_pd(_mm_load_ps(src)); return *this; } inline batch& batch::load_unaligned(const float* src) { this->m_value = _mm256_cvtps_pd(_mm_loadu_ps(src)); return *this; } inline batch& batch::load_aligned(const double* src) { this->m_value = _mm256_load_pd(src); return *this; } inline batch& batch::load_unaligned(const double* src) { this->m_value = _mm256_loadu_pd(src); return *this; } inline void batch::store_aligned(int8_t* dst) const { __m128i tmp = _mm256_cvtpd_epi32(this->m_value); __m128i tmp1 = _mm_packs_epi32(tmp, _mm_set1_epi32(0)); __m128i tmp2 = _mm_packs_epi16(tmp1, _mm_set1_epi16(0)); _mm_storel_epi64((__m128i*)dst, tmp2); } inline void batch::store_unaligned(int8_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(uint8_t* dst) const { __m128i tmp = _mm256_cvtpd_epi32(this->m_value); __m128i tmp1 = _mm_packs_epi32(tmp, _mm_set1_epi32(0)); __m128i tmp2 = _mm_packus_epi16(tmp1, _mm_set1_epi16(0)); _mm_storel_epi64((__m128i*)dst, tmp2); } inline void batch::store_unaligned(uint8_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(int16_t* dst) const { __m128i tmp = _mm256_cvtpd_epi32(this->m_value); __m128i tmp1 = _mm_packs_epi32(tmp, _mm_set1_epi32(0)); _mm_storel_epi64((__m128i*)dst, tmp1); } inline void batch::store_unaligned(int16_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(uint16_t* dst) const { __m128i tmp = _mm256_cvtpd_epi32(this->m_value); 
__m128i tmp1 = _mm_packs_epi32(tmp, _mm_set1_epi32(0)); _mm_storel_epi64((__m128i*)dst, tmp1); } inline void batch::store_unaligned(uint16_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(int32_t* dst) const { _mm_store_si128((__m128i*)dst, _mm256_cvtpd_epi32(this->m_value)); } inline void batch::store_unaligned(int32_t* dst) const { _mm_storeu_si128((__m128i*)dst, _mm256_cvtpd_epi32(this->m_value)); } inline void batch::store_aligned(float* dst) const { _mm_store_ps(dst, _mm256_cvtpd_ps(this->m_value)); } inline void batch::store_unaligned(float* dst) const { _mm_storeu_ps(dst, _mm256_cvtpd_ps(this->m_value)); } inline void batch::store_aligned(double* dst) const { _mm256_store_pd(dst, this->m_value); } inline void batch::store_unaligned(double* dst) const { _mm256_storeu_pd(dst, this->m_value); } namespace detail { template <> struct batch_kernel { using batch_type = batch; using value_type = double; using batch_bool_type = batch_bool; static batch_type neg(const batch_type& rhs) { return _mm256_xor_pd(rhs, _mm256_castsi256_pd(_mm256_set1_epi64x(0x8000000000000000))); } static batch_type add(const batch_type& lhs, const batch_type& rhs) { return _mm256_add_pd(lhs, rhs); } static batch_type sub(const batch_type& lhs, const batch_type& rhs) { return _mm256_sub_pd(lhs, rhs); } static batch_type sadd(const batch_type& lhs, const batch_type& rhs) { return add(lhs, rhs); //FIXME something special for inf ? } static batch_type ssub(const batch_type& lhs, const batch_type& rhs) { return sub(lhs,rhs); //FIXME something special for inf ? } static batch_type mul(const batch_type& lhs, const batch_type& rhs) { return _mm256_mul_pd(lhs, rhs); } static batch_type div(const batch_type& lhs, const batch_type& rhs) { return _mm256_div_pd(lhs, rhs); } static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs) { return _mm256_cmp_pd(lhs, rhs, _CMP_EQ_OQ); } static batch_bool_type neq(const batch_type& lhs, const batch_type& rhs) { return _mm256_cmp_pd(lhs, rhs, _CMP_NEQ_OQ); } static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { return _mm256_cmp_pd(lhs, rhs, _CMP_LT_OQ); } static batch_bool_type lte(const batch_type& lhs, const batch_type& rhs) { return _mm256_cmp_pd(lhs, rhs, _CMP_LE_OQ); } static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) { return _mm256_and_pd(lhs, rhs); } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { return _mm256_or_pd(lhs, rhs); } static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { return _mm256_xor_pd(lhs, rhs); } static batch_type bitwise_not(const batch_type& rhs) { return _mm256_xor_pd(rhs, _mm256_castsi256_pd(_mm256_set1_epi32(-1))); } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { return _mm256_andnot_pd(lhs, rhs); } static batch_type min(const batch_type& lhs, const batch_type& rhs) { return _mm256_min_pd(lhs, rhs); } static batch_type max(const batch_type& lhs, const batch_type& rhs) { return _mm256_max_pd(lhs, rhs); } static batch_type fmin(const batch_type& lhs, const batch_type& rhs) { return min(lhs, rhs); } static batch_type fmax(const batch_type& lhs, const batch_type& rhs) { return max(lhs, rhs); } static batch_type abs(const batch_type& rhs) { __m256d sign_mask = _mm256_set1_pd(-0.); // -0. 
= 1 << 63 return _mm256_andnot_pd(sign_mask, rhs); } static batch_type fabs(const batch_type& rhs) { return abs(rhs); } static batch_type sqrt(const batch_type& rhs) { return _mm256_sqrt_pd(rhs); } static batch_type fma(const batch_type& x, const batch_type& y, const batch_type& z) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_FMA3_VERSION return _mm256_fmadd_pd(x, y, z); #elif XSIMD_X86_INSTR_SET >= XSIMD_X86_AMD_FMA4_VERSION return _mm256_macc_pd(x, y, z); #else return x * y + z; #endif } static batch_type fms(const batch_type& x, const batch_type& y, const batch_type& z) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_FMA3_VERSION return _mm256_fmsub_pd(x, y, z); #elif XSIMD_X86_INSTR_SET >= XSIMD_X86_AMD_FMA4_VERSION return _mm256_msub_pd(x, y, z); #else return x * y - z; #endif } static batch_type fnma(const batch_type& x, const batch_type& y, const batch_type& z) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_FMA3_VERSION return _mm256_fnmadd_pd(x, y, z); #elif XSIMD_X86_INSTR_SET >= XSIMD_X86_AMD_FMA4_VERSION return _mm256_nmacc_pd(x, y, z); #else return -x * y + z; #endif } static batch_type fnms(const batch_type& x, const batch_type& y, const batch_type& z) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_FMA3_VERSION return _mm256_fnmsub_pd(x, y, z); #elif XSIMD_X86_INSTR_SET >= XSIMD_X86_AMD_FMA4_VERSION return _mm256_nmsub_pd(x, y, z); #else return -x * y - z; #endif } static value_type hadd(const batch_type& rhs) { // rhs = (x0, x1, x2, x3) // tmp = (x2, x3, x0, x1) __m256d tmp = _mm256_permute2f128_pd(rhs, rhs, 1); // tmp = (x2+x0, x3+x1, -, -) tmp = _mm256_add_pd(rhs, tmp); // tmp = (x2+x0+x3+x1, -, -, -) tmp = _mm256_hadd_pd(tmp, tmp); return _mm_cvtsd_f64(_mm256_extractf128_pd(tmp, 0)); } static batch_type haddp(const batch_type* row) { // row = (a,b,c,d) // tmp0 = (a0+a1, b0+b1, a2+a3, b2+b3) __m256d tmp0 = _mm256_hadd_pd(row[0], row[1]); // tmp1 = (c0+c1, d0+d1, c2+c3, d2+d3) __m256d tmp1 = _mm256_hadd_pd(row[2], row[3]); // tmp2 = (a0+a1, b0+b1, c2+c3, d2+d3) __m256d tmp2 = _mm256_blend_pd(tmp0, tmp1, 0b1100); // tmp1 = (a2+a3, b2+b3, c2+c3, d2+d3) tmp1 = _mm256_permute2f128_pd(tmp0, tmp1, 0x21); return _mm256_add_pd(tmp1, tmp2); } static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b) { return _mm256_blendv_pd(b, a, cond); } template static batch_type select(const batch_bool_constant&, const batch_type& a, const batch_type& b) { constexpr int mask = batch_bool_constant::mask(); return _mm256_blend_pd(b, a, mask); } static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs) { return _mm256_unpacklo_pd(lhs, rhs); } static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs) { return _mm256_unpackhi_pd(lhs, rhs); } static batch_type extract_pair(const batch_type& lhs, const batch_type& rhs, const int n) { batch_type b_concatenate; for (int i = 0 ; i < (4 - n); ++i) { b_concatenate[i] = lhs[i + n]; if(i < n) { b_concatenate[4 - 1 - i] = rhs[n - 1 - i]; } } return b_concatenate; } static batch_bool_type isnan(const batch_type& x) { return _mm256_cmp_pd(x, x, _CMP_UNORD_Q); } }; } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_avx_float.hpp000066400000000000000000000613351410101234500224620ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. 
* * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX_FLOAT_HPP #define XSIMD_AVX_FLOAT_HPP #include "xsimd_base.hpp" #include "xsimd_int_conversion.hpp" #include namespace xsimd { /************************ * batch_bool * ************************/ template <> struct simd_batch_traits> { using value_type = float; static constexpr std::size_t size = 8; using batch_type = batch; static constexpr std::size_t align = 32; }; template <> class batch_bool : public simd_batch_bool> { public: batch_bool(); explicit batch_bool(bool b); batch_bool(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7); batch_bool(const __m256& rhs); batch_bool& operator=(const __m256& rhs); operator __m256() const; bool_proxy operator[](std::size_t index); bool operator[](std::size_t index) const; __m256 get_value() const; private: batch_bool& load_values(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7); union { __m256 m_value; float m_array[8]; }; friend class simd_batch_bool>; }; /******************* * batch * *******************/ template <> struct simd_batch_traits> { using value_type = float; static constexpr std::size_t size = 8; using batch_bool_type = batch_bool; static constexpr std::size_t align = 32; using storage_type = __m256; }; template <> class batch : public simd_batch> { public: using self_type = batch; using base_type = simd_batch; using batch_bool_type = typename base_type::batch_bool_type; batch(); explicit batch(float f); batch(float f0, float f1, float f2, float f3, float f4, float f5, float f6, float f7); explicit batch(const float* src); batch(const float* src, aligned_mode); batch(const float* src, unaligned_mode); batch(const __m256& rhs); batch& operator=(const __m256& rhs); batch(const batch_bool_type& rhs); batch& operator=(const batch_bool_type& rhs); operator __m256() const; XSIMD_DECLARE_LOAD_STORE_ALL(float, 8) XSIMD_DECLARE_LOAD_STORE_LONG(float, 8) using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; }; /*************************************** * batch_bool implementation * ***************************************/ inline batch_bool::batch_bool() { } inline batch_bool::batch_bool(bool b) { m_value = _mm256_castsi256_ps(_mm256_set1_epi32(-(int)b)); } inline batch_bool::batch_bool(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7) { m_value = _mm256_castsi256_ps( _mm256_setr_epi32(-(int)b0, -(int)b1, -(int)b2, -(int)b3, -(int)b4, -(int)b5, -(int)b6, -(int)b7)); } inline batch_bool::batch_bool(const __m256& rhs) { m_value = rhs; } inline batch_bool& batch_bool::operator=(const __m256& rhs) { m_value = rhs; return *this; } inline batch::batch(const batch_bool_type& rhs) : base_type(_mm256_and_ps(rhs, batch(1.f))) { } inline batch& batch::operator=(const batch_bool_type& rhs) { this->m_value = _mm256_and_ps(rhs, batch(1.f)); return *this; } inline batch_bool::operator __m256() const { return m_value; } inline bool_proxy batch_bool::operator[](std::size_t index) { return bool_proxy(m_array[index & 7]); } inline bool batch_bool::operator[](std::size_t index) const { return static_cast(m_array[index & 7]); } inline __m256 batch_bool::get_value() const { return m_value; } inline batch_bool& batch_bool::load_values(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7) { m_value = _mm256_castsi256_ps( 
_mm256_setr_epi32(-(int)b0, -(int)b1, -(int)b2, -(int)b3, -(int)b4, -(int)b5, -(int)b6, -(int)b7)); return *this; } namespace detail { template <> struct batch_bool_kernel { using batch_type = batch_bool; static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) { return _mm256_and_ps(lhs, rhs); } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { return _mm256_or_ps(lhs, rhs); } static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { return _mm256_xor_ps(lhs, rhs); } static batch_type bitwise_not(const batch_type& rhs) { return _mm256_xor_ps(rhs, _mm256_castsi256_ps(_mm256_set1_epi32(-1))); } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { return _mm256_andnot_ps(lhs, rhs); } static batch_type equal(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_castsi256_ps(_mm256_cmpeq_epi32(_mm256_castps_si256(lhs), _mm256_castps_si256(rhs))); #else __m128i lhs_low = _mm256_castsi256_si128(_mm256_castps_si256(lhs)); __m128i lhs_high = _mm256_extractf128_si256(_mm256_castps_si256(lhs), 1); __m128i rhs_low = _mm256_castsi256_si128(_mm256_castps_si256(rhs)); __m128i rhs_high = _mm256_extractf128_si256(_mm256_castps_si256(rhs), 1); __m128i res_low = _mm_cmpeq_epi32(lhs_low, rhs_low); __m128i res_high = _mm_cmpeq_epi32(lhs_high, rhs_high); __m256i result = _mm256_castsi128_si256(res_low); return _mm256_castsi256_ps(_mm256_insertf128_si256(result, res_high, 1)); #endif } static batch_type not_equal(const batch_type& lhs, const batch_type& rhs) { return _mm256_xor_ps(lhs, rhs); } static bool all(const batch_type& rhs) { return _mm256_testc_ps(rhs, batch_bool(true)) != 0; } static bool any(const batch_type& rhs) { return !_mm256_testz_ps(rhs, rhs); } }; } /********************************** * batch implementation * **********************************/ inline batch::batch() { } inline batch::batch(float f) : base_type(_mm256_set1_ps(f)) { } inline batch::batch(float f0, float f1, float f2, float f3, float f4, float f5, float f6, float f7) : base_type(_mm256_setr_ps(f0, f1, f2, f3, f4, f5, f6, f7)) { } inline batch::batch(const float* src) : base_type(_mm256_loadu_ps(src)) { } inline batch::batch(const float* src, aligned_mode) : base_type(_mm256_load_ps(src)) { } inline batch::batch(const float* src, unaligned_mode) : base_type(_mm256_loadu_ps(src)) { } inline batch::batch(const __m256& rhs) : base_type(rhs) { } inline batch& batch::operator=(const __m256& rhs) { this->m_value = rhs; return *this; } inline batch::operator __m256() const { return this->m_value; } XSIMD_DEFINE_LOAD_STORE(float, 8, bool, 32) inline batch& batch::load_aligned(const int8_t* src) { __m128i tmp = _mm_loadl_epi64((const __m128i*)src); __m256i tmp2 = detail::xsimd_cvtepi8_epi32(tmp); this->m_value = _mm256_cvtepi32_ps(tmp2); return *this; } inline batch& batch::load_unaligned(const int8_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const uint8_t* src) { __m128i tmp = _mm_loadl_epi64((const __m128i*)src); __m256i tmp2 = detail::xsimd_cvtepu8_epi32(tmp); this->m_value = _mm256_cvtepi32_ps(tmp2); return *this; } inline batch& batch::load_unaligned(const uint8_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const int16_t* src) { __m128i tmp = _mm_load_si128((const __m128i*)src); __m256i tmp2 = detail::xsimd_cvtepi16_epi32(tmp); this->m_value = _mm256_cvtepi32_ps(tmp2); return *this; } inline batch& batch::load_unaligned(const 
int16_t* src) { __m128i tmp = _mm_loadu_si128((const __m128i*)src); __m256i tmp2 = detail::xsimd_cvtepi16_epi32(tmp); this->m_value = _mm256_cvtepi32_ps(tmp2); return *this; } inline batch& batch::load_aligned(const uint16_t* src) { __m128i tmp = _mm_load_si128((const __m128i*)src); __m256i tmp2 = detail::xsimd_cvtepu16_epi32(tmp); this->m_value = _mm256_cvtepi32_ps(tmp2); return *this; } inline batch& batch::load_unaligned(const uint16_t* src) { __m128i tmp = _mm_loadu_si128((const __m128i*)src); __m256i tmp2 = detail::xsimd_cvtepu16_epi32(tmp); this->m_value = _mm256_cvtepi32_ps(tmp2); return *this; } inline batch& batch::load_aligned(const int32_t* src) { this->m_value = _mm256_cvtepi32_ps(_mm256_load_si256((__m256i const*)src)); return *this; } inline batch& batch::load_unaligned(const int32_t* src) { this->m_value = _mm256_cvtepi32_ps(_mm256_loadu_si256((__m256i const*)src)); return *this; } XSIMD_DEFINE_LOAD_STORE(float, 8, uint32_t, 32) XSIMD_DEFINE_LOAD_STORE(float, 8, int64_t, 32) XSIMD_DEFINE_LOAD_STORE(float, 8, uint64_t, 32) XSIMD_DEFINE_LOAD_STORE_LONG(float, 8, 32) inline batch& batch::load_aligned(const float* src) { this->m_value = _mm256_load_ps(src); return *this; } inline batch& batch::load_unaligned(const float* src) { this->m_value = _mm256_loadu_ps(src); return *this; } inline batch& batch::load_aligned(const double* src) { __m128 tmp1 = _mm256_cvtpd_ps(_mm256_load_pd(src)); __m128 tmp2 = _mm256_cvtpd_ps(_mm256_load_pd(src + 4)); this->m_value = _mm256_castps128_ps256(tmp1); this->m_value = _mm256_insertf128_ps(this->m_value, tmp2, 1); return *this; } inline batch& batch::load_unaligned(const double* src) { __m128 tmp1 = _mm256_cvtpd_ps(_mm256_loadu_pd(src)); __m128 tmp2 = _mm256_cvtpd_ps(_mm256_loadu_pd(src + 4)); this->m_value = _mm256_castps128_ps256(tmp1); this->m_value = _mm256_insertf128_ps(this->m_value, tmp2, 1); return *this; } inline void batch::store_aligned(int8_t* dst) const { __m256i tmp = _mm256_cvtps_epi32(this->m_value); __m128i tmp2 = detail::xsimd_cvtepi32_epi8(tmp); _mm_storel_epi64((__m128i*)dst, tmp2); } inline void batch::store_unaligned(int8_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(uint8_t* dst) const { __m256i tmp = _mm256_cvtps_epi32(this->m_value); __m128i tmp2 = detail::xsimd_cvtepi32_epu8(tmp); _mm_storel_epi64((__m128i*)dst, tmp2); } inline void batch::store_unaligned(uint8_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(int16_t* dst) const { __m256i tmp = _mm256_cvtps_epi32(this->m_value); __m128i tmp2 = detail::xsimd_cvtepi32_epi16(tmp); _mm_store_si128((__m128i*)dst, tmp2); } inline void batch::store_unaligned(int16_t* dst) const { __m256i tmp = _mm256_cvtps_epi32(this->m_value); __m128i tmp2 = detail::xsimd_cvtepi32_epi16(tmp); _mm_storeu_si128((__m128i*)dst, tmp2); } inline void batch::store_aligned(uint16_t* dst) const { __m256i tmp = _mm256_cvtps_epi32(this->m_value); __m128i tmp2 = detail::xsimd_cvtepi32_epu16(tmp); _mm_store_si128((__m128i*)dst, tmp2); } inline void batch::store_unaligned(uint16_t* dst) const { __m256i tmp = _mm256_cvtps_epi32(this->m_value); __m128i tmp2 = detail::xsimd_cvtepi32_epu16(tmp); _mm_storeu_si128((__m128i*)dst, tmp2); } inline void batch::store_aligned(int32_t* dst) const { _mm256_store_si256((__m256i*)dst, _mm256_cvtps_epi32(this->m_value)); } inline void batch::store_unaligned(int32_t* dst) const { _mm256_storeu_si256((__m256i*)dst, _mm256_cvtps_epi32(this->m_value)); } inline void batch::store_aligned(float* dst) const { _mm256_store_ps(dst, 
this->m_value); } inline void batch::store_unaligned(float* dst) const { _mm256_storeu_ps(dst, this->m_value); } inline void batch::store_aligned(double* dst) const { alignas(32) float tmp[8]; _mm256_store_ps(tmp, this->m_value); dst[0] = static_cast(tmp[0]); dst[1] = static_cast(tmp[1]); dst[2] = static_cast(tmp[2]); dst[3] = static_cast(tmp[3]); dst[4] = static_cast(tmp[4]); dst[5] = static_cast(tmp[5]); dst[6] = static_cast(tmp[6]); dst[7] = static_cast(tmp[7]); } inline void batch::store_unaligned(double* dst) const { store_aligned(dst); } namespace detail { template <> struct batch_kernel { using batch_type = batch; using value_type = float; using batch_bool_type = batch_bool; static batch_type neg(const batch_type& rhs) { return _mm256_xor_ps(rhs, _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))); } static batch_type add(const batch_type& lhs, const batch_type& rhs) { return _mm256_add_ps(lhs, rhs); } static batch_type sub(const batch_type& lhs, const batch_type& rhs) { return _mm256_sub_ps(lhs, rhs); } static batch_type sadd(const batch_type& lhs, const batch_type& rhs) { return add(lhs, rhs); //FIXME something special for inf ? } static batch_type ssub(const batch_type& lhs, const batch_type& rhs) { return sub(lhs,rhs); //FIXME something special for inf ? } static batch_type mul(const batch_type& lhs, const batch_type& rhs) { return _mm256_mul_ps(lhs, rhs); } static batch_type div(const batch_type& lhs, const batch_type& rhs) { return _mm256_div_ps(lhs, rhs); } static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs) { return _mm256_cmp_ps(lhs, rhs, _CMP_EQ_OQ); } static batch_bool_type neq(const batch_type& lhs, const batch_type& rhs) { return _mm256_cmp_ps(lhs, rhs, _CMP_NEQ_OQ); } static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { return _mm256_cmp_ps(lhs, rhs, _CMP_LT_OQ); } static batch_bool_type lte(const batch_type& lhs, const batch_type& rhs) { return _mm256_cmp_ps(lhs, rhs, _CMP_LE_OQ); } static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) { return _mm256_and_ps(lhs, rhs); } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { return _mm256_or_ps(lhs, rhs); } static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { return _mm256_xor_ps(lhs, rhs); } static batch_type bitwise_not(const batch_type& rhs) { return _mm256_xor_ps(rhs, _mm256_castsi256_ps(_mm256_set1_epi32(-1))); } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { return _mm256_andnot_ps(lhs, rhs); } static batch_type min(const batch_type& lhs, const batch_type& rhs) { return _mm256_min_ps(lhs, rhs); } static batch_type max(const batch_type& lhs, const batch_type& rhs) { return _mm256_max_ps(lhs, rhs); } static batch_type fmin(const batch_type& lhs, const batch_type& rhs) { return min(lhs, rhs); } static batch_type fmax(const batch_type& lhs, const batch_type& rhs) { return max(lhs, rhs); } static batch_type abs(const batch_type& rhs) { __m256 sign_mask = _mm256_set1_ps(-0.f); // -0.f = 1 << 31 return _mm256_andnot_ps(sign_mask, rhs); } static batch_type fabs(const batch_type& rhs) { return abs(rhs); } static batch_type sqrt(const batch_type& rhs) { return _mm256_sqrt_ps(rhs); } static batch_type fma(const batch_type& x, const batch_type& y, const batch_type& z) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_FMA3_VERSION return _mm256_fmadd_ps(x, y, z); #elif XSIMD_X86_INSTR_SET >= XSIMD_X86_AMD_FMA4_VERSION return _mm256_macc_ps(x, y, z); #else return x * y + z; #endif } static 
batch_type fms(const batch_type& x, const batch_type& y, const batch_type& z) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_FMA3_VERSION return _mm256_fmsub_ps(x, y, z); #elif XSIMD_X86_INSTR_SET >= XSIMD_X86_AMD_FMA4_VERSION return _mm256_msub_ps(x, y, z); #else return x * y - z; #endif } static batch_type fnma(const batch_type& x, const batch_type& y, const batch_type& z) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_FMA3_VERSION return _mm256_fnmadd_ps(x, y, z); #elif XSIMD_X86_INSTR_SET >= XSIMD_X86_AMD_FMA4_VERSION return _mm256_nmacc_ps(x, y, z); #else return -x * y + z; #endif } static batch_type fnms(const batch_type& x, const batch_type& y, const batch_type& z) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_FMA3_VERSION return _mm256_fnmsub_ps(x, y, z); #elif XSIMD_X86_INSTR_SET >= XSIMD_X86_AMD_FMA4_VERSION return _mm256_nmsub_ps(x, y, z); #else return -x * y - z; #endif } static value_type hadd(const batch_type& rhs) { // Warning about _mm256_hadd_ps: // _mm256_hadd_ps(a,b) gives // (a0+a1,a2+a3,b0+b1,b2+b3,a4+a5,a6+a7,b4+b5,b6+b7). Hence we can't // rely on a naive use of this method // rhs = (x0, x1, x2, x3, x4, x5, x6, x7) // tmp = (x4, x5, x6, x7, x0, x1, x2, x3) __m256 tmp = _mm256_permute2f128_ps(rhs, rhs, 1); // tmp = (x4+x0, x5+x1, x6+x2, x7+x3, x0+x4, x1+x5, x2+x6, x3+x7) tmp = _mm256_add_ps(rhs, tmp); // tmp = (x4+x0+x5+x1, x6+x2+x7+x3, -, -, -, -, -, -) tmp = _mm256_hadd_ps(tmp, tmp); // tmp = (x4+x0+x5+x1+x6+x2+x7+x3, -, -, -, -, -, -, -) tmp = _mm256_hadd_ps(tmp, tmp); return _mm_cvtss_f32(_mm256_extractf128_ps(tmp, 0)); } static batch_type haddp(const batch_type* row) { // row = (a,b,c,d,e,f,g,h) // tmp0 = (a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7) __m256 tmp0 = _mm256_hadd_ps(row[0], row[1]); // tmp1 = (c0+c1, c2+c3, d1+d2, d2+d3, c4+c5, c6+c7, d4+d5, d6+d7) __m256 tmp1 = _mm256_hadd_ps(row[2], row[3]); // tmp1 = (a0+a1+a2+a3, b0+b1+b2+b3, c0+c1+c2+c3, d0+d1+d2+d3, // a4+a5+a6+a7, b4+b5+b6+b7, c4+c5+c6+c7, d4+d5+d6+d7) tmp1 = _mm256_hadd_ps(tmp0, tmp1); // tmp0 = (e0+e1, e2+e3, f0+f1, f2+f3, e4+e5, e6+e7, f4+f5, f6+f7) tmp0 = _mm256_hadd_ps(row[4], row[5]); // tmp2 = (g0+g1, g2+g3, h0+h1, h2+h3, g4+g5, g6+g7, h4+h5, h6+h7) __m256 tmp2 = _mm256_hadd_ps(row[6], row[7]); // tmp2 = (e0+e1+e2+e3, f0+f1+f2+f3, g0+g1+g2+g3, h0+h1+h2+h3, // e4+e5+e6+e7, f4+f5+f6+f7, g4+g5+g6+g7, h4+h5+h6+h7) tmp2 = _mm256_hadd_ps(tmp0, tmp2); // tmp0 = (a0+a1+a2+a3, b0+b1+b2+b3, c0+c1+c2+c3, d0+d1+d2+d3, // e4+e5+e6+e7, f4+f5+f6+f7, g4+g5+g6+g7, h4+h5+h6+h7) tmp0 = _mm256_blend_ps(tmp1, tmp2, 0b11110000); // tmp1 = (a4+a5+a6+a7, b4+b5+b6+b7, c4+c5+c6+c7, d4+d5+d6+d7, // e0+e1+e2+e3, f0+f1+f2+f3, g0+g1+g2+g3, h0+h1+h2+h3) tmp1 = _mm256_permute2f128_ps(tmp1, tmp2, 0x21); return _mm256_add_ps(tmp0, tmp1); } static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b) { return _mm256_blendv_ps(b, a, cond); } template static batch_type select(const batch_bool_constant&, const batch_type& a, const batch_type& b) { constexpr int mask = batch_bool_constant::mask(); return _mm256_blend_ps(b, a, mask); } static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs) { return _mm256_unpacklo_ps(lhs, rhs); } static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs) { return _mm256_unpackhi_ps(lhs, rhs); } static batch_type extract_pair(const batch_type& lhs, const batch_type& rhs, const int n) { batch_type b_concatenate; for (int i = 0 ; i < (8 - n); ++i) { b_concatenate[i] = lhs[i + n]; if(i < n) { b_concatenate[8 - 1 - i] = rhs[n - 1 - i]; } } return 
b_concatenate; } static batch_bool_type isnan(const batch_type& x) { return _mm256_cmp_ps(x, x, _CMP_UNORD_Q); } }; } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_avx_int16.hpp000066400000000000000000000415171410101234500223160ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX_INT16_HPP #define XSIMD_AVX_INT16_HPP #include #include "xsimd_base.hpp" #include "xsimd_avx_int_base.hpp" namespace xsimd { /*************************** * batch_bool * ***************************/ template <> struct simd_batch_traits> { using value_type = int16_t; static constexpr std::size_t size = 16; using batch_type = batch; static constexpr std::size_t align = 32; }; template <> struct simd_batch_traits> { using value_type = uint16_t; static constexpr std::size_t size = 16; using batch_type = batch; static constexpr std::size_t align = 32; }; template <> class batch_bool : public avx_int_batch_bool { public: using avx_int_batch_bool::avx_int_batch_bool; }; template <> class batch_bool : public avx_int_batch_bool { public: using avx_int_batch_bool::avx_int_batch_bool; }; namespace detail { template <> struct batch_bool_kernel : public avx_int_batch_bool_kernel { }; template <> struct batch_bool_kernel : public avx_int_batch_bool_kernel { }; } /********************** * batch * **********************/ template <> struct simd_batch_traits> { using value_type = int16_t; static constexpr std::size_t size = 16; using batch_bool_type = batch_bool; static constexpr std::size_t align = 32; using storage_type = __m256i; }; template <> struct simd_batch_traits> { using value_type = uint16_t; static constexpr std::size_t size = 16; using batch_bool_type = batch_bool; static constexpr std::size_t align = 32; using storage_type = __m256i; }; template <> class batch : public avx_int_batch { public: using base_class = avx_int_batch; using base_class::base_class; using base_class::load_aligned; using base_class::load_unaligned; using base_class::store_aligned; using base_class::store_unaligned; XSIMD_DECLARE_LOAD_STORE_INT16(int16_t, 16) XSIMD_DECLARE_LOAD_STORE_LONG(int16_t, 16) }; template <> class batch : public avx_int_batch { public: using base_class = avx_int_batch; using base_class::base_class; using base_class::load_aligned; using base_class::load_unaligned; using base_class::store_aligned; using base_class::store_unaligned; XSIMD_DECLARE_LOAD_STORE_INT16(uint16_t, 16) XSIMD_DECLARE_LOAD_STORE_LONG(uint16_t, 16) }; batch operator<<(const batch& lhs, int32_t rhs); batch operator>>(const batch& lhs, int32_t rhs); batch operator<<(const batch& lhs, const batch& rhs); batch operator>>(const batch& lhs, const batch& rhs); batch operator<<(const batch& lhs, int32_t rhs); batch operator>>(const batch& lhs, int32_t rhs); batch operator<<(const batch& lhs, const batch& rhs); batch operator>>(const batch& lhs, const batch& rhs); /************************************* * batch implementation * *************************************/ namespace detail { template struct int16_batch_kernel : avx_int_kernel_base> { using batch_type = batch; using value_type = T; using batch_bool_type = batch_bool; constexpr static bool is_signed = 
std::is_signed::value; static batch_type neg(const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_sub_epi16(_mm256_setzero_si256(), rhs); #else XSIMD_SPLIT_AVX(rhs); __m128i res_low = _mm_sub_epi16(_mm_setzero_si128(), rhs_low); __m128i res_high = _mm_sub_epi16(_mm_setzero_si128(), rhs_high); XSIMD_RETURN_MERGED_SSE(res_low, res_high); #endif } static batch_type add(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_add_epi16(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_add_epi16, lhs, rhs); #endif } static batch_type sub(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_sub_epi16(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_sub_epi16, lhs, rhs); #endif } static batch_type sadd(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_adds_epi16(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_adds_epi16, lhs, rhs); #endif } static batch_type ssub(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_subs_epi16(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_subs_epi16, lhs, rhs); #endif } static batch_type mul(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_mullo_epi16(lhs, rhs); #else // Note implement with conversion to epi16 XSIMD_MACRO_UNROLL_BINARY(*); #endif } static batch_type div(const batch_type& lhs, const batch_type& rhs) { // TODO check if instruction workaround exists // #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION #if 0 #else XSIMD_MACRO_UNROLL_BINARY(/); #endif } static batch_type mod(const batch_type& lhs, const batch_type& rhs) { XSIMD_MACRO_UNROLL_BINARY(%); } static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_cmpeq_epi16(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_cmpeq_epi16, lhs, rhs); #endif } static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_cmpgt_epi16(rhs, lhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_cmpgt_epi16, rhs, lhs); #endif } static batch_type min(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_min_epi16(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_min_epi16, lhs, rhs); #endif } static batch_type max(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_max_epi16(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_max_epi16, lhs, rhs); #endif } static batch_type abs(const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_abs_epi16(rhs); #else XSIMD_SPLIT_AVX(rhs); __m128i res_low = _mm_abs_epi16(rhs_low); __m128i res_high = _mm_abs_epi16(rhs_high); XSIMD_RETURN_MERGED_SSE(res_low, res_high); #endif } // TODO use conversion to int16_t static value_type hadd(const batch_type& lhs) { alignas(32) value_type tmp_lhs[16]; lhs.store_aligned(&tmp_lhs[0]); value_type res = 0; unroller<16>([&](std::size_t i) { res += tmp_lhs[i]; }); return res; } static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_blendv_epi8(b, a, cond); #else XSIMD_SPLIT_AVX(cond); XSIMD_SPLIT_AVX(a); XSIMD_SPLIT_AVX(b); __m128i res_low = 
_mm_blendv_epi8(b_low, a_low, cond_low); __m128i res_high = _mm_blendv_epi8(b_high, a_high, cond_high); XSIMD_RETURN_MERGED_SSE(res_low, res_high); #endif } static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs) { return _mm256_unpacklo_epi16(lhs, rhs); } static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs) { return _mm256_unpackhi_epi16(lhs, rhs); } static batch_type extract_pair(const batch_type& v_lhs, const batch_type& v_rhs, const int num) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION const batch_type lhs = v_rhs; const batch_type rhs = v_lhs; const int n = 2 * num; switch(n) { case 0: return rhs; XSIMD_REPEAT_32_v2(_mm256_alignr_epi8); default: break; } return batch_type(T(0)); #else batch_type b_concatenate; const int n = num; for (int i = 0 ; i < (16 - n); ++i) { b_concatenate[i] = v_lhs[i + n]; if(i < n) { b_concatenate[16 - 1 - i] = v_rhs[n - 1 - i]; } } return b_concatenate; #endif } }; template <> struct batch_kernel : int16_batch_kernel { }; template <> struct batch_kernel : int16_batch_kernel { static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION auto xor_lhs = _mm256_xor_si256(lhs, _mm256_set1_epi16(std::numeric_limits::lowest())); auto xor_rhs = _mm256_xor_si256(rhs, _mm256_set1_epi16(std::numeric_limits::lowest())); return _mm256_cmpgt_epi16(xor_rhs, xor_lhs); #else // Note we could also use _mm256_xor_ps here but it might be slower // as it would go to the floating point device XSIMD_SPLIT_AVX(lhs); XSIMD_SPLIT_AVX(rhs); auto xer = _mm_set1_epi16(std::numeric_limits::lowest()); lhs_low = _mm_xor_si128(lhs_low, xer); lhs_high = _mm_xor_si128(lhs_high, xer); rhs_low = _mm_xor_si128(rhs_low, xer); rhs_high = _mm_xor_si128(rhs_high, xer); __m128i res_low = _mm_cmpgt_epi16(rhs_low, lhs_low); __m128i res_high = _mm_cmpgt_epi16(rhs_high, lhs_high); XSIMD_RETURN_MERGED_SSE(res_low, res_high); #endif } static batch_type min(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_min_epu16(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_min_epu16, lhs, rhs); #endif } static batch_type max(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_max_epu16(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_max_epu16, lhs, rhs); #endif } static batch_type abs(const batch_type& rhs) { return rhs; } static batch_type sadd(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_adds_epu16(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_adds_epu16, lhs, rhs); #endif } static batch_type ssub(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_subs_epu16(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_subs_epu16, lhs, rhs); #endif } }; } XSIMD_DEFINE_LOAD_STORE_INT16(int16_t, 16, 32) XSIMD_DEFINE_LOAD_STORE_LONG(int16_t, 16, 32) inline batch operator<<(const batch& lhs, int32_t rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_slli_epi16(lhs, rhs); #else XSIMD_SPLIT_AVX(lhs); __m128i res_low = _mm_slli_epi16(lhs_low, rhs); __m128i res_high = _mm_slli_epi16(lhs_high, rhs); XSIMD_RETURN_MERGED_SSE(res_low, res_high); #endif } inline batch operator>>(const batch& lhs, int32_t rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_srai_epi16(lhs, rhs); #else XSIMD_SPLIT_AVX(lhs); __m128i res_low = _mm_srai_epi16(lhs_low, rhs); __m128i 
res_high = _mm_srai_epi16(lhs_high, rhs); XSIMD_RETURN_MERGED_SSE(res_low, res_high); #endif } inline batch operator<<(const batch& lhs, const batch& rhs) { #if defined(XSIMD_AVX512VL_AVAILABLE) && defined(XSIMD_AVX512BW_AVAILABLE) return _mm256_sllv_epi16(lhs, rhs); #else return avx_detail::shift_impl([](int16_t lhs, int16_t s) { return lhs << s; }, lhs, rhs); #endif } inline batch operator>>(const batch& lhs, const batch& rhs) { #if defined(XSIMD_AVX512VL_AVAILABLE) && defined(XSIMD_AVX512BW_AVAILABLE) return _mm256_srav_epi16(lhs, rhs); #else return avx_detail::shift_impl([](int16_t lhs, int16_t s) { return lhs >> s; }, lhs, rhs); #endif } XSIMD_DEFINE_LOAD_STORE_INT16(uint16_t, 16, 32) XSIMD_DEFINE_LOAD_STORE_LONG(uint16_t, 16, 32) inline batch operator<<(const batch& lhs, int32_t rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_slli_epi16(lhs, rhs); #else XSIMD_SPLIT_AVX(lhs); __m128i res_low = _mm_slli_epi16(lhs_low, rhs); __m128i res_high = _mm_slli_epi16(lhs_high, rhs); XSIMD_RETURN_MERGED_SSE(res_low, res_high); #endif } inline batch operator>>(const batch& lhs, int32_t rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_srli_epi16(lhs, rhs); #else XSIMD_SPLIT_AVX(lhs); __m128i res_low = _mm_srli_epi16(lhs_low, rhs); __m128i res_high = _mm_srli_epi16(lhs_high, rhs); XSIMD_RETURN_MERGED_SSE(res_low, res_high); #endif } inline batch operator<<(const batch& lhs, const batch& rhs) { #if defined(XSIMD_AVX512VL_AVAILABLE) && defined(XSIMD_AVX512BW_AVAILABLE) return _mm256_sllv_epi16(lhs, rhs); #else return avx_detail::shift_impl([](uint16_t lhs, int16_t s) { return lhs << s; }, lhs, rhs); #endif } inline batch operator>>(const batch& lhs, const batch& rhs) { #if defined(XSIMD_AVX512VL_AVAILABLE) && defined(XSIMD_AVX512BW_AVAILABLE) return _mm256_srlv_epi16(lhs, rhs); #else return avx_detail::shift_impl([](uint16_t lhs, int16_t s) { return lhs >> s; }, lhs, rhs); #endif } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_avx_int32.hpp000066400000000000000000000621451410101234500223140ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_AVX_INT32_HPP #define XSIMD_AVX_INT32_HPP #include #include "xsimd_base.hpp" #include "xsimd_avx_int_base.hpp" #include "xsimd_int_conversion.hpp" #include "xsimd_sse_int32.hpp" namespace xsimd { /************************** * batch_bool * **************************/ template <> struct simd_batch_traits> { using value_type = int32_t; static constexpr std::size_t size = 8; using batch_type = batch; static constexpr std::size_t align = 32; }; template <> struct simd_batch_traits> { using value_type = uint32_t; static constexpr std::size_t size = 8; using batch_type = batch; static constexpr std::size_t align = 32; }; template <> class batch_bool : public avx_int_batch_bool { public: using avx_int_batch_bool::avx_int_batch_bool; }; template <> class batch_bool : public avx_int_batch_bool { public: using avx_int_batch_bool::avx_int_batch_bool; }; namespace detail { template <> struct batch_bool_kernel : public avx_int_batch_bool_kernel { }; template <> struct batch_bool_kernel : public avx_int_batch_bool_kernel { }; } /********************* * batch * *********************/ template <> struct simd_batch_traits> { using value_type = int32_t; static constexpr std::size_t size = 8; using batch_bool_type = batch_bool; static constexpr std::size_t align = 32; using storage_type = __m256i; }; template <> struct simd_batch_traits> { using value_type = uint32_t; static constexpr std::size_t size = 8; using batch_bool_type = batch_bool; static constexpr std::size_t align = 32; using storage_type = __m256i; }; template <> class batch : public avx_int_batch { public: using base_type = avx_int_batch; using base_type::base_type; using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; XSIMD_DECLARE_LOAD_STORE_INT32(int32_t, 8) XSIMD_DECLARE_LOAD_STORE_LONG(int32_t, 8) }; template <> class batch : public avx_int_batch { public: using base_type = avx_int_batch; using base_type::base_type; using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; XSIMD_DECLARE_LOAD_STORE_INT32(uint32_t, 8) XSIMD_DECLARE_LOAD_STORE_LONG(uint32_t, 8) }; batch operator<<(const batch& lhs, int32_t rhs); batch operator>>(const batch& lhs, int32_t rhs); batch operator<<(const batch& lhs, const batch& rhs); batch operator>>(const batch& lhs, const batch& rhs); batch operator<<(const batch& lhs, int32_t rhs); batch operator>>(const batch& lhs, int32_t rhs); batch operator<<(const batch& lhs, const batch& rhs); batch operator>>(const batch& lhs, const batch& rhs); /************************************ * batch implementation * ************************************/ inline batch& batch::load_aligned(const float* src) { this->m_value = _mm256_cvtps_epi32(_mm256_load_ps(src)); return *this; } inline batch& batch::load_unaligned(const float* src) { this->m_value = _mm256_cvtps_epi32(_mm256_loadu_ps(src)); return *this; } inline void batch::store_aligned(float* dst) const { _mm256_store_ps(dst, _mm256_cvtepi32_ps(this->m_value)); } inline void batch::store_unaligned(float* dst) const { _mm256_storeu_ps(dst, _mm256_cvtepi32_ps(this->m_value)); } XSIMD_DEFINE_LOAD_STORE(int32_t, 8, bool, 32) XSIMD_DEFINE_LOAD_STORE(int32_t, 8, int8_t, 32) XSIMD_DEFINE_LOAD_STORE(int32_t, 8, uint8_t, 32) XSIMD_DEFINE_LOAD_STORE(int32_t, 8, int16_t, 32) XSIMD_DEFINE_LOAD_STORE(int32_t, 8, uint16_t, 32) 
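// Note on the converting loads/stores above: the hand-written float overloads
// and the XSIMD_DEFINE_LOAD_STORE invocations let a batch<int32_t, 8> be
// filled from, and written back to, buffers of other element types. A minimal
// usage sketch (assuming an aligned int16_t source named `src`):
//
//     alignas(32) int16_t src[8] = {1, 2, 3, 4, 5, 6, 7, 8};
//     xsimd::batch<int32_t, 8> b;
//     b.load_aligned(src);    // each 16-bit element widened to a 32-bit lane
//     alignas(32) float dst[8];
//     b.store_aligned(dst);   // lanes converted to float on the way out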
XSIMD_DEFINE_LOAD_STORE(int32_t, 8, int64_t, 32) XSIMD_DEFINE_LOAD_STORE(int32_t, 8, uint64_t, 32) XSIMD_DEFINE_LOAD_STORE(int32_t, 8, double, 32) XSIMD_DEFINE_LOAD_STORE_LONG(int32_t, 8, 32) /************************************* * batch implementation * *************************************/ XSIMD_DEFINE_LOAD_STORE_INT32(uint32_t, 8, 32) XSIMD_DEFINE_LOAD_STORE_LONG(uint32_t, 8, 32) #undef AVX_DEFINE_LOAD_STORE_INT32 namespace detail { template <> struct batch_kernel : avx_int_kernel_base> { using batch_type = batch; using value_type = int32_t; using batch_bool_type = batch_bool; static batch_type neg(const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_sub_epi32(_mm256_setzero_si256(), rhs); #else XSIMD_SPLIT_AVX(rhs); __m128i res_low = _mm_sub_epi32(_mm_setzero_si128(), rhs_low); __m128i res_high = _mm_sub_epi32(_mm_setzero_si128(), rhs_high); XSIMD_RETURN_MERGED_SSE(res_low, res_high); #endif } static batch_type add(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_add_epi32(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_add_epi32, lhs, rhs); #endif } static batch_type sub(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_sub_epi32(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_sub_epi32, lhs, rhs); #endif } static batch_type sadd(const batch_type& lhs, const batch_type& rhs) { batch_type mask = rhs >> (8 * sizeof(value_type) - 1); batch_type lhs_pos_branch = min(std::numeric_limits::max() - rhs, lhs); batch_type lhs_neg_branch = max(std::numeric_limits::min() - rhs, lhs); return rhs + select((typename batch_type::storage_type)mask, lhs_neg_branch, lhs_pos_branch); } static batch_type ssub(const batch_type& lhs, const batch_type& rhs) { return sadd(lhs, neg(rhs)); } static batch_type mul(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_mullo_epi32(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_mullo_epi32, lhs, rhs); #endif } static batch_type div(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_FAST_INTEGER_DIVISION) return _mm256_cvttps_epi32(_mm256_div_ps(_mm256_cvtepi32_ps(lhs), _mm256_cvtepi32_ps(rhs))); #else alignas(64) int32_t tmp_lhs[8], tmp_rhs[8], tmp_res[8]; lhs.store_aligned(tmp_lhs); rhs.store_aligned(tmp_rhs); unroller<8>([&](std::size_t i) { tmp_res[i] = tmp_lhs[i] / tmp_rhs[i]; }); return batch_type(tmp_res, aligned_mode()); #endif } static batch_type mod(const batch_type& lhs, const batch_type& rhs) { alignas(64) int32_t tmp_lhs[8], tmp_rhs[8], tmp_res[8]; lhs.store_aligned(tmp_lhs); rhs.store_aligned(tmp_rhs); unroller<8>([&](std::size_t i) { tmp_res[i] = tmp_lhs[i] % tmp_rhs[i]; }); return batch_type(tmp_res, aligned_mode()); } static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_cmpeq_epi32(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_cmpeq_epi32, lhs, rhs); #endif } static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_cmpgt_epi32(rhs, lhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_cmpgt_epi32, rhs, lhs); #endif } static batch_type min(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_min_epi32(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_min_epi32, lhs, rhs); #endif } static batch_type max(const batch_type& 
lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_max_epi32(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_max_epi32, lhs, rhs); #endif } static batch_type abs(const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_abs_epi32(rhs); #else XSIMD_SPLIT_AVX(rhs); __m128i res_low = _mm_abs_epi32(rhs_low); __m128i res_high = _mm_abs_epi32(rhs_high); XSIMD_RETURN_MERGED_SSE(res_low, res_high); #endif } static value_type hadd(const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION __m256i tmp1 = _mm256_hadd_epi32(rhs, rhs); __m256i tmp2 = _mm256_hadd_epi32(tmp1, tmp1); __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1); __m128i tmp4 = _mm_add_epi32(_mm256_castsi256_si128(tmp2), tmp3); return _mm_cvtsi128_si32(tmp4); #else XSIMD_SPLIT_AVX(rhs); __m128i tmp1 = _mm_add_epi32(rhs_low, rhs_high); __m128i tmp2 = _mm_hadd_epi32(tmp1, tmp1); __m128i tmp3 = _mm_hadd_epi32(tmp2, tmp2); return _mm_cvtsi128_si32(tmp3); #endif } static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_blendv_epi8(b, a, cond); #else XSIMD_SPLIT_AVX(cond); XSIMD_SPLIT_AVX(a); XSIMD_SPLIT_AVX(b); __m128i res_low = _mm_blendv_epi8(b_low, a_low, cond_low); __m128i res_high = _mm_blendv_epi8(b_high, a_high, cond_high); XSIMD_RETURN_MERGED_SSE(res_low, res_high); #endif } static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs) { return _mm256_unpacklo_epi32(lhs, rhs); } static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs) { return _mm256_unpackhi_epi32(lhs, rhs); } static batch_type extract_pair(const batch_type& v_lhs, const batch_type& v_rhs, const int num) { #if defined(XSIMD_AVX512VL_AVAILABLE) const batch_type lhs = v_rhs; const batch_type rhs = v_lhs; const int n = num; switch(n) { case 0: return rhs; XSIMD_REPEAT_8_v2(_mm256_alignr_epi32); default: break; } return batch_type(int32_t(0)); #else #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION const batch_type lhs = v_rhs; const batch_type rhs = v_lhs; const int n = 4 * num; switch(n) { case 0: return rhs; XSIMD_REPEAT_32_v2(_mm256_alignr_epi8); default: break; } return batch_type(int32_t(0)); #else batch_type b_concatenate; const int n = num; for (int i = 0 ; i < (8 - n); ++i) { b_concatenate[i] = v_lhs[i + n]; if(i < n) { b_concatenate[8 - 1 - i] = v_rhs[n - 1 - i]; } } return b_concatenate; #endif #endif } }; template <> struct batch_kernel : avx_int_kernel_base> { using batch_type = batch; using value_type = uint32_t; using batch_bool_type = batch_bool; static batch_type neg(const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_sub_epi32(_mm256_setzero_si256(), rhs); #else XSIMD_SPLIT_AVX(rhs); __m128i res_low = _mm_sub_epi32(_mm_setzero_si128(), rhs_low); __m128i res_high = _mm_sub_epi32(_mm_setzero_si128(), rhs_high); XSIMD_RETURN_MERGED_SSE(res_low, res_high); #endif } static batch_type add(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_add_epi32(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_add_epi32, lhs, rhs); #endif } static batch_type sub(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_sub_epi32(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_sub_epi32, lhs, rhs); #endif } static batch_type sadd(const batch_type& lhs, const batch_type& rhs) { const auto diffmax = 
std::numeric_limits::max() - lhs; const auto mindiff = min(diffmax, rhs); return lhs + mindiff; } static batch_type ssub(const batch_type& lhs, const batch_type& rhs) { const auto diff = min(lhs, rhs); return lhs - diff; } static batch_type mul(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_mullo_epi32(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_mullo_epi32, lhs, rhs); #endif } static batch_type div(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_FAST_INTEGER_DIVISION) return _mm256_cvttps_epi32(_mm256_div_ps(_mm256_cvtepi32_ps(lhs), _mm256_cvtepi32_ps(rhs))); #else alignas(64) uint32_t tmp_lhs[8], tmp_rhs[8], tmp_res[8]; lhs.store_aligned(tmp_lhs); rhs.store_aligned(tmp_rhs); unroller<8>([&](std::size_t i) { tmp_res[i] = tmp_lhs[i] / tmp_rhs[i]; }); return batch_type(tmp_res, aligned_mode()); #endif } static batch_type mod(const batch_type& lhs, const batch_type& rhs) { alignas(64) uint32_t tmp_lhs[8], tmp_rhs[8], tmp_res[8]; lhs.store_aligned(tmp_lhs); rhs.store_aligned(tmp_rhs); unroller<8>([&](std::size_t i) { tmp_res[i] = tmp_lhs[i] % tmp_rhs[i]; }); return batch_type(tmp_res, aligned_mode()); } static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_cmpeq_epi32(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_cmpeq_epi32, lhs, rhs); #endif } static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION auto xor_lhs = _mm256_xor_si256(lhs, _mm256_set1_epi32(std::numeric_limits::lowest())); auto xor_rhs = _mm256_xor_si256(rhs, _mm256_set1_epi32(std::numeric_limits::lowest())); return _mm256_cmpgt_epi32(xor_rhs, xor_lhs); #else // Note we could also use _mm256_xor_ps here but it might be slower // as it would go to the floating point device XSIMD_SPLIT_AVX(lhs); XSIMD_SPLIT_AVX(rhs); auto xer = _mm_set1_epi32(std::numeric_limits::lowest()); lhs_low = _mm_xor_si128(lhs_low, xer); lhs_high = _mm_xor_si128(lhs_high, xer); rhs_low = _mm_xor_si128(rhs_low, xer); rhs_high = _mm_xor_si128(rhs_high, xer); __m128i res_low = _mm_cmpgt_epi32(rhs_low, lhs_low); __m128i res_high = _mm_cmpgt_epi32(rhs_high, lhs_high); XSIMD_RETURN_MERGED_SSE(res_low, res_high); #endif } static batch_type min(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_min_epu32(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_min_epu32, lhs, rhs); #endif } static batch_type max(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_max_epu32(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_max_epu32, lhs, rhs); #endif } static batch_type abs(const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_sign_epi32(rhs, rhs); #else XSIMD_SPLIT_AVX(rhs); __m128i res_low = _mm_sign_epi32(rhs_low, rhs_low); __m128i res_high = _mm_sign_epi32(rhs_high, rhs_high); XSIMD_RETURN_MERGED_SSE(res_low, res_high); #endif } static value_type hadd(const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION __m256i tmp1 = _mm256_hadd_epi32(rhs, rhs); __m256i tmp2 = _mm256_hadd_epi32(tmp1, tmp1); __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1); __m128i tmp4 = _mm_add_epi32(_mm256_castsi256_si128(tmp2), tmp3); return _mm_cvtsi128_si32(tmp4); #else XSIMD_SPLIT_AVX(rhs); __m128i tmp1 = _mm_add_epi32(rhs_low, rhs_high); __m128i tmp2 = _mm_hadd_epi32(tmp1, tmp1); __m128i tmp3 
= _mm_hadd_epi32(tmp2, tmp2); return _mm_cvtsi128_si32(tmp3); #endif } static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_blendv_epi8(b, a, cond); #else XSIMD_SPLIT_AVX(cond); XSIMD_SPLIT_AVX(a); XSIMD_SPLIT_AVX(b); __m128i res_low = _mm_blendv_epi8(b_low, a_low, cond_low); __m128i res_high = _mm_blendv_epi8(b_high, a_high, cond_high); XSIMD_RETURN_MERGED_SSE(res_low, res_high); #endif } static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs) { return _mm256_unpacklo_epi32(lhs, rhs); } static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs) { return _mm256_unpackhi_epi32(lhs, rhs); } static batch_type extract_pair(const batch_type& v_lhs, const batch_type& v_rhs, const int num) { #if defined(XSIMD_AVX512VL_AVAILABLE) const batch_type lhs = v_rhs; const batch_type rhs = v_lhs; const int n = num; switch(n) { case 0: return rhs; XSIMD_REPEAT_8_v2(_mm256_alignr_epi32); default: break; } return batch_type(uint32_t(0)); #else #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION const batch_type lhs = v_rhs; const batch_type rhs = v_lhs; const int n = 4 * num; switch(n) { case 0: return rhs; XSIMD_REPEAT_32_v2(_mm256_alignr_epi8); default: break; } return batch_type(uint32_t(0)); #else batch_type b_concatenate; const int n = num; for (int i = 0 ; i < (8 - n); ++i) { b_concatenate[i] = v_lhs[i + n]; if(i < n) { b_concatenate[8 - 1 - i] = v_rhs[n - 1 - i]; } } return b_concatenate; #endif #endif } }; } inline batch operator<<(const batch& lhs, int32_t rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_slli_epi32(lhs, rhs); #else XSIMD_SPLIT_AVX(lhs); __m128i res_low = _mm_slli_epi32(lhs_low, rhs); __m128i res_high = _mm_slli_epi32(lhs_high, rhs); XSIMD_RETURN_MERGED_SSE(res_low, res_high); #endif } inline batch operator>>(const batch& lhs, int32_t rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_srai_epi32(lhs, rhs); #else XSIMD_SPLIT_AVX(lhs); __m128i res_low = _mm_srai_epi32(lhs_low, rhs); __m128i res_high = _mm_srai_epi32(lhs_high, rhs); XSIMD_RETURN_MERGED_SSE(res_low, res_high); #endif } inline batch operator<<(const batch& lhs, const batch& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_sllv_epi32(lhs, rhs); #else return avx_detail::shift_impl([](int32_t lhs, int32_t s) { return lhs << s; }, lhs, rhs); #endif } inline batch operator>>(const batch& lhs, const batch& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_srav_epi32(lhs, rhs); #else return avx_detail::shift_impl([](int32_t lhs, int32_t s) { return lhs >> s; }, lhs, rhs); #endif } inline batch operator<<(const batch& lhs, int32_t rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_slli_epi32(lhs, rhs); #else XSIMD_SPLIT_AVX(lhs); __m128i res_low = _mm_slli_epi32(lhs_low, rhs); __m128i res_high = _mm_slli_epi32(lhs_high, rhs); XSIMD_RETURN_MERGED_SSE(res_low, res_high); #endif } inline batch operator>>(const batch& lhs, int32_t rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_srli_epi32(lhs, rhs); #else XSIMD_SPLIT_AVX(lhs); __m128i res_low = _mm_srli_epi32(lhs_low, rhs); __m128i res_high = _mm_srli_epi32(lhs_high, rhs); XSIMD_RETURN_MERGED_SSE(res_low, res_high); #endif } inline batch operator<<(const batch& lhs, const batch& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_sllv_epi32(lhs, rhs); #else return avx_detail::shift_impl([](uint32_t lhs, 
int32_t s) { return lhs << s; }, lhs, rhs); #endif } inline batch operator>>(const batch& lhs, const batch& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_srlv_epi32(lhs, rhs); #else return avx_detail::shift_impl([](uint32_t lhs, int32_t s) { return lhs >> s; }, lhs, rhs); #endif } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_avx_int64.hpp000066400000000000000000000623131410101234500223160ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX_INT64_HPP #define XSIMD_AVX_INT64_HPP #include #include "xsimd_base.hpp" #include "xsimd_avx_int_base.hpp" #include "xsimd_sse_int64.hpp" namespace xsimd { /************************** * batch_bool * **************************/ template <> struct simd_batch_traits> { using value_type = int64_t; static constexpr std::size_t size = 4; using batch_type = batch; static constexpr std::size_t align = 32; }; template <> struct simd_batch_traits> { using value_type = uint64_t; static constexpr std::size_t size = 4; using batch_type = batch; static constexpr std::size_t align = 32; }; template <> class batch_bool : public avx_int_batch_bool { public: using avx_int_batch_bool::avx_int_batch_bool; }; template <> class batch_bool : public avx_int_batch_bool { public: using avx_int_batch_bool::avx_int_batch_bool; }; namespace detail { template <> struct batch_bool_kernel : public avx_int_batch_bool_kernel { }; template <> struct batch_bool_kernel : public avx_int_batch_bool_kernel { }; } /********************* * batch * *********************/ template <> struct simd_batch_traits> { using value_type = int64_t; static constexpr std::size_t size = 4; using batch_bool_type = batch_bool; static constexpr std::size_t align = 32; using storage_type = __m256i; }; template <> struct simd_batch_traits> { using value_type = uint64_t; static constexpr std::size_t size = 4; using batch_bool_type = batch_bool; static constexpr std::size_t align = 32; using storage_type = __m256i; }; template <> class batch : public avx_int_batch { public: using base_type = avx_int_batch; using base_type::base_type; using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; XSIMD_DECLARE_LOAD_STORE_INT64(int64_t, 4) XSIMD_DECLARE_LOAD_STORE_LONG(int64_t, 4) }; template <> class batch : public avx_int_batch { public: using base_type = avx_int_batch; using base_type::base_type; using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; XSIMD_DECLARE_LOAD_STORE_INT64(uint64_t, 4) XSIMD_DECLARE_LOAD_STORE_LONG(uint64_t, 4) }; batch operator<<(const batch& lhs, int32_t rhs); batch operator>>(const batch& lhs, int32_t rhs); batch operator<<(const batch& lhs, const batch& rhs); batch operator>>(const batch& lhs, const batch& rhs); batch operator<<(const batch& lhs, int32_t rhs); batch operator>>(const batch& lhs, int32_t rhs); batch operator<<(const batch& lhs, const batch& rhs); batch operator>>(const batch& lhs, const batch& rhs); /************************************ * batch implementation * ************************************/ 
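// Note: several operations in this file have no AVX/AVX2 counterpart for
// 64-bit lanes (multiplication, division, modulo, some variable shifts) and
// fall back to per-lane scalar code, either through XSIMD_MACRO_UNROLL_BINARY
// or through the avx_detail::shift_impl helper. The underlying pattern is the
// same everywhere: spill the batch to an aligned buffer, apply a lane-wise
// callable, and reload. A minimal sketch of that pattern (the name
// `scalar_fallback` is hypothetical, not an xsimd API):
//
//     template <class F>
//     inline batch<int64_t, 4> scalar_fallback(F&& f,
//                                              const batch<int64_t, 4>& lhs,
//                                              const batch<int64_t, 4>& rhs)
//     {
//         alignas(32) int64_t l[4], r[4], out[4];
//         lhs.store_aligned(l);
//         rhs.store_aligned(r);
//         for (std::size_t i = 0; i < 4; ++i)
//         {
//             out[i] = f(l[i], r[i]);  // e.g. f = [](int64_t a, int64_t b) { return a % b; }
//         }
//         return batch<int64_t, 4>(out, aligned_mode());
//     }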
XSIMD_DEFINE_LOAD_STORE_INT64(int64_t, 4, 32) XSIMD_DEFINE_LOAD_STORE_LONG(int64_t, 4, 32) /************************************* * batch implementation * *************************************/ XSIMD_DEFINE_LOAD_STORE_INT64(uint64_t, 4, 32) XSIMD_DEFINE_LOAD_STORE_LONG(uint64_t, 4, 32) #undef AVX_DEFINE_LOAD_STORE_INT64 namespace detail { template <> struct batch_kernel : avx_int_kernel_base> { using batch_type = batch; using value_type = uint64_t; using batch_bool_type = batch_bool; static batch_type neg(const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_sub_epi64(_mm256_setzero_si256(), rhs); #else XSIMD_SPLIT_AVX(rhs); __m128i res_low = _mm_sub_epi64(_mm_setzero_si128(), rhs_low); __m128i res_high = _mm_sub_epi64(_mm_setzero_si128(), rhs_high); XSIMD_RETURN_MERGED_SSE(res_low, res_high); #endif } static batch_type add(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_add_epi64(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_add_epi64, lhs, rhs); #endif } static batch_type sub(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_sub_epi64(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_sub_epi64, lhs, rhs); #endif } static batch_type sadd(const batch_type& lhs, const batch_type& rhs) { const auto diffmax = batch_type(std::numeric_limits::max()) - lhs; const auto mindiff = min(diffmax, rhs); return lhs + mindiff; } static batch_type ssub(const batch_type& lhs, const batch_type& rhs) { const auto diff = min(lhs, rhs); return lhs - diff; } static batch_type mul(const batch_type& lhs, const batch_type& rhs) { XSIMD_MACRO_UNROLL_BINARY(*); } static batch_type div(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_FAST_INTEGER_DIVISION) __m256d dlhs = _mm256_setr_pd(static_cast(lhs[0]), static_cast(lhs[1]), static_cast(lhs[2]), static_cast(lhs[3])); __m256d drhs = _mm256_setr_pd(static_cast(rhs[0]), static_cast(rhs[1]), static_cast(rhs[2]), static_cast(rhs[3])); #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_cvtepi32_epi64(_mm256_cvttpd_epi32(_mm256_div_pd(dlhs, drhs))); #else using batch_int = batch; __m128i tmp = _mm256_cvttpd_epi32(_mm256_div_pd(dlhs, drhs)); __m128i res_low = _mm_unpacklo_epi32(tmp, batch_int(tmp) < batch_int(0)); __m128i res_high = _mm_unpackhi_epi32(tmp, batch_int(tmp) < batch_int(0)); __m256i result = _mm256_castsi128_si256(res_low); return _mm256_insertf128_si256(result, res_high, 1); #endif #else XSIMD_MACRO_UNROLL_BINARY(/) #endif } static batch_type mod(const batch_type& lhs, const batch_type& rhs) { XSIMD_MACRO_UNROLL_BINARY(%); } static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_cmpeq_epi64(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_cmpeq_epi64, lhs, rhs); #endif } static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION auto xor_lhs = _mm256_xor_si256(lhs, batch(std::numeric_limits::lowest())); auto xor_rhs = _mm256_xor_si256(rhs, batch(std::numeric_limits::lowest())); return _mm256_cmpgt_epi64(xor_rhs, xor_lhs); #else XSIMD_SPLIT_AVX(lhs); XSIMD_SPLIT_AVX(rhs); __m128i xer = batch(std::numeric_limits::lowest()); lhs_low = _mm_xor_si128(lhs_low, xer); lhs_high = _mm_xor_si128(lhs_high, xer); rhs_low = _mm_xor_si128(rhs_low, xer); rhs_high = _mm_xor_si128(rhs_high, xer); __m128i res_low = _mm_cmpgt_epi64(rhs_low, lhs_low); 
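// Note: the XOR with the smallest signed 64-bit value above biases both
// operands by 2^63, which maps unsigned order onto signed order, so the
// signed-only _mm_cmpgt_epi64 / _mm256_cmpgt_epi64 intrinsics can implement an
// unsigned less-than. For example, 0 < 2^63 holds for uint64_t; after the
// bias, 0 becomes INT64_MIN and 2^63 becomes 0, and the signed comparison
// gives the same answer.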
__m128i res_high = _mm_cmpgt_epi64(rhs_high, lhs_high); XSIMD_RETURN_MERGED_SSE(res_low, res_high); #endif } static batch_type min(const batch_type& lhs, const batch_type& rhs) { return select(lhs < rhs, lhs, rhs); } static batch_type max(const batch_type& lhs, const batch_type& rhs) { return select(lhs > rhs, lhs, rhs); } static batch_type abs(const batch_type& rhs) { #if defined(XSIMD_AVX512VL_AVAILABLE) return _mm256_abs_epi64(rhs); #elif XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION __m256i sign = _mm256_cmpgt_epi64(_mm256_setzero_si256(), rhs); __m256i inv = _mm256_xor_si256(rhs, sign); return _mm256_sub_epi64(inv, sign); #else XSIMD_SPLIT_AVX(rhs); __m128i sign_low = _mm_cmpgt_epi64(_mm_setzero_si128(), rhs_low); __m128i sign_high = _mm_cmpgt_epi64(_mm_setzero_si128(), rhs_high); __m128i inv_low = _mm_xor_si128(rhs_low, sign_low); __m128i inv_high = _mm_xor_si128(rhs_high, sign_high); __m128i res_low = _mm_sub_epi64(inv_low, sign_low); __m128i res_high = _mm_sub_epi64(inv_high, sign_high); XSIMD_RETURN_MERGED_SSE(res_low, res_high); #endif } static value_type hadd(const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION __m256i tmp1 = _mm256_shuffle_epi32(rhs, 0x0E); __m256i tmp2 = _mm256_add_epi64(rhs, tmp1); __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1); __m128i res = _mm_add_epi64(_mm256_castsi256_si128(tmp2), tmp3); #else XSIMD_SPLIT_AVX(rhs); __m128i tmp1 = _mm_shuffle_epi32(rhs_low, 0x0E); __m128i tmp2 = _mm_add_epi64(tmp1, rhs_low); __m128i tmp3 = _mm_shuffle_epi32(rhs_high, 0x0E); __m128i tmp4 = _mm_add_epi64(tmp3, rhs_high); __m128i res = _mm_add_epi64(tmp2, tmp4); #endif #if defined(__x86_64__) return _mm_cvtsi128_si64(res); #else union { int64_t i; __m128i m; } u; _mm_storel_epi64(&u.m, res); return u.i; #endif } static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_blendv_epi8(b, a, cond); #else XSIMD_SPLIT_AVX(cond); XSIMD_SPLIT_AVX(a); XSIMD_SPLIT_AVX(b); __m128i res_low = _mm_blendv_epi8(b_low, a_low, cond_low); __m128i res_high = _mm_blendv_epi8(b_high, a_high, cond_high); XSIMD_RETURN_MERGED_SSE(res_low, res_high); #endif } static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs) { return _mm256_unpacklo_epi64(lhs, rhs); } static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs) { return _mm256_unpackhi_epi64(lhs, rhs); } static batch_type extract_pair(const batch_type& v_lhs, const batch_type& v_rhs, const int num) { #if defined(XSIMD_AVX512VL_AVAILABLE) const batch_type lhs = v_rhs; const batch_type rhs = v_lhs; const int n = num; switch(n) { case 0: return rhs; XSIMD_REPEAT_4(_mm256_alignr_epi64); default: break; } return batch_type(uint64_t(0)); #else #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION const batch_type lhs = v_rhs; const batch_type rhs = v_lhs; const int n = 8 * num; switch(n) { case 0: return rhs; XSIMD_REPEAT_32_v2(_mm256_alignr_epi8); default: break; } return batch_type(uint64_t(0)); #else batch_type b_concatenate; const int n = num; for (int i = 0 ; i < (4 - n); ++i) { b_concatenate[i] = v_lhs[i + n]; if(i < n) { b_concatenate[4 - 1 - i] = v_rhs[n - 1 - i]; } } return b_concatenate; #endif #endif } }; template <> struct batch_kernel : avx_int_kernel_base> { using batch_type = batch; using value_type = int64_t; using batch_bool_type = batch_bool; static batch_type neg(const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_sub_epi64(_mm256_setzero_si256(), 
rhs); #else XSIMD_SPLIT_AVX(rhs); __m128i res_low = _mm_sub_epi64(_mm_setzero_si128(), rhs_low); __m128i res_high = _mm_sub_epi64(_mm_setzero_si128(), rhs_high); XSIMD_RETURN_MERGED_SSE(res_low, res_high); #endif } static batch_type add(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_add_epi64(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_add_epi64, lhs, rhs); #endif } static batch_type sub(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_sub_epi64(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_sub_epi64, lhs, rhs); #endif } static batch_type sadd(const batch_type& lhs, const batch_type& rhs) { batch_type mask = rhs >> (8 * sizeof(value_type) - 1); batch_type lhs_pos_branch = min(std::numeric_limits::max() - rhs, lhs); batch_type lhs_neg_branch = max(std::numeric_limits::min() - rhs, lhs); return rhs + select((typename batch_type::storage_type)mask, lhs_neg_branch, lhs_pos_branch); } static batch_type ssub(const batch_type& lhs, const batch_type& rhs) { return sadd(lhs, neg(rhs)); } static batch_type mul(const batch_type& lhs, const batch_type& rhs) { XSIMD_MACRO_UNROLL_BINARY(*); } static batch_type div(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_FAST_INTEGER_DIVISION) __m256d dlhs = _mm256_setr_pd(static_cast(lhs[0]), static_cast(lhs[1]), static_cast(lhs[2]), static_cast(lhs[3])); __m256d drhs = _mm256_setr_pd(static_cast(rhs[0]), static_cast(rhs[1]), static_cast(rhs[2]), static_cast(rhs[3])); #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_cvtepi32_epi64(_mm256_cvttpd_epi32(_mm256_div_pd(dlhs, drhs))); #else using batch_int = batch; __m128i tmp = _mm256_cvttpd_epi32(_mm256_div_pd(dlhs, drhs)); __m128i res_low = _mm_unpacklo_epi32(tmp, batch_int(tmp) < batch_int(0)); __m128i res_high = _mm_unpackhi_epi32(tmp, batch_int(tmp) < batch_int(0)); __m256i result = _mm256_castsi128_si256(res_low); return _mm256_insertf128_si256(result, res_high, 1); #endif #else XSIMD_MACRO_UNROLL_BINARY(/) #endif } static batch_type mod(const batch_type& lhs, const batch_type& rhs) { XSIMD_MACRO_UNROLL_BINARY(%); } static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_cmpeq_epi64(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_cmpeq_epi64, lhs, rhs); #endif } static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_cmpgt_epi64(rhs, lhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_cmpgt_epi64, rhs, lhs); #endif } static batch_type min(const batch_type& lhs, const batch_type& rhs) { return select(lhs < rhs, lhs, rhs); } static batch_type max(const batch_type& lhs, const batch_type& rhs) { return select(lhs > rhs, lhs, rhs); } static batch_type abs(const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION __m256i sign = _mm256_cmpgt_epi64(_mm256_setzero_si256(), rhs); __m256i inv = _mm256_xor_si256(rhs, sign); return _mm256_sub_epi64(inv, sign); #else XSIMD_SPLIT_AVX(rhs); __m128i sign_low = _mm_cmpgt_epi64(_mm_setzero_si128(), rhs_low); __m128i sign_high = _mm_cmpgt_epi64(_mm_setzero_si128(), rhs_high); __m128i inv_low = _mm_xor_si128(rhs_low, sign_low); __m128i inv_high = _mm_xor_si128(rhs_high, sign_high); __m128i res_low = _mm_sub_epi64(inv_low, sign_low); __m128i res_high = _mm_sub_epi64(inv_high, sign_high); XSIMD_RETURN_MERGED_SSE(res_low, res_high); #endif } static value_type 
hadd(const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION __m256i tmp1 = _mm256_shuffle_epi32(rhs, 0x0E); __m256i tmp2 = _mm256_add_epi64(rhs, tmp1); __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1); __m128i res = _mm_add_epi64(_mm256_castsi256_si128(tmp2), tmp3); #else XSIMD_SPLIT_AVX(rhs); __m128i tmp1 = _mm_shuffle_epi32(rhs_low, 0x0E); __m128i tmp2 = _mm_add_epi64(tmp1, rhs_low); __m128i tmp3 = _mm_shuffle_epi32(rhs_high, 0x0E); __m128i tmp4 = _mm_add_epi64(tmp3, rhs_high); __m128i res = _mm_add_epi64(tmp2, tmp4); #endif #if defined(__x86_64__) return _mm_cvtsi128_si64(res); #else union { int64_t i; __m128i m; } u; _mm_storel_epi64(&u.m, res); return u.i; #endif } static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_blendv_epi8(b, a, cond); #else XSIMD_SPLIT_AVX(cond); XSIMD_SPLIT_AVX(a); XSIMD_SPLIT_AVX(b); __m128i res_low = _mm_blendv_epi8(b_low, a_low, cond_low); __m128i res_high = _mm_blendv_epi8(b_high, a_high, cond_high); XSIMD_RETURN_MERGED_SSE(res_low, res_high); #endif } static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs) { return _mm256_unpacklo_epi64(lhs, rhs); } static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs) { return _mm256_unpackhi_epi64(lhs, rhs); } static batch_type extract_pair(const batch_type& v_lhs, const batch_type& v_rhs, const int num) { #if defined(XSIMD_AVX512VL_AVAILABLE) const batch_type lhs = v_rhs; const batch_type rhs = v_lhs; const int n = num; switch(n) { case 0: return rhs; XSIMD_REPEAT_4(_mm256_alignr_epi64); default: break; } return batch_type(int64_t(0)); #else #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION const batch_type lhs = v_rhs; const batch_type rhs = v_lhs; const int n = 8 * num; switch(n) { case 0: return rhs; XSIMD_REPEAT_32_v2(_mm256_alignr_epi8); default: break; } return batch_type(int64_t(0)); #else batch_type b_concatenate; const int n = num; for (int i = 0 ; i < (4 - n); ++i) { b_concatenate[i] = v_lhs[i + n]; if(i < n) { b_concatenate[4 - 1 - i] = v_rhs[n - 1 - i]; } } return b_concatenate; #endif #endif } }; } inline batch operator<<(const batch& lhs, int32_t rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_slli_epi64(lhs, rhs); #else XSIMD_SPLIT_AVX(lhs); __m128i res_low = _mm_slli_epi64(lhs_low, rhs); __m128i res_high = _mm_slli_epi64(lhs_high, rhs); XSIMD_RETURN_MERGED_SSE(res_low, res_high); #endif } inline batch operator>>(const batch& lhs, int32_t rhs) { #if defined(XSIMD_AVX512VL_AVAILABLE) return _mm256_srai_epi64(lhs, rhs); #else return avx_detail::shift_impl([](int64_t val, int32_t s) { return val >> s; }, lhs, rhs); #endif } inline batch operator<<(const batch& lhs, const batch& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_sllv_epi64(lhs, rhs); #else return avx_detail::shift_impl([](int64_t val, int64_t s) { return val << s; }, lhs, rhs); #endif } inline batch operator>>(const batch& lhs, const batch& rhs) { #if defined(XSIMD_AVX512VL_AVAILABLE) return _mm256_srav_epi64(lhs, rhs); #else return avx_detail::shift_impl([](int64_t val, int64_t s) { return val >> s; }, lhs, rhs); #endif } inline batch operator<<(const batch& lhs, int32_t rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_slli_epi64(lhs, rhs); #else XSIMD_SPLIT_AVX(lhs); __m128i res_low = _mm_slli_epi64(lhs_low, rhs); __m128i res_high = _mm_slli_epi64(lhs_high, rhs); XSIMD_RETURN_MERGED_SSE(res_low, res_high); #endif } 
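// Note: the contrast between the signed and unsigned right shifts here is
// arithmetic vs logical. For batch<int64_t, 4>, _mm256_srai_epi64 only exists
// with AVX-512VL, so the #else branch above goes through the scalar
// avx_detail::shift_impl helper; batch<uint64_t, 4> uses the logical
// _mm256_srli_epi64 (or its SSE split) below. A branch-free AVX2 emulation of
// the arithmetic shift is also possible; a sketch of the classic bias trick,
// not what this header does, for a uniform count 0 <= s <= 63:
//
//     inline __m256i srai_epi64_emul(__m256i x, int s)
//     {
//         // shift logically, then sign-extend: ((x >> s) ^ m) - m, m = 1 << (63 - s)
//         __m256i m = _mm256_set1_epi64x((int64_t)(uint64_t(1) << (63 - s)));
//         __m256i t = _mm256_srli_epi64(x, s);
//         return _mm256_sub_epi64(_mm256_xor_si256(t, m), m);
//     }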
inline batch operator>>(const batch& lhs, int32_t rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_srli_epi64(lhs, rhs); #else XSIMD_SPLIT_AVX(lhs); __m128i res_low = _mm_srli_epi64(lhs_low, rhs); __m128i res_high = _mm_srli_epi64(lhs_high, rhs); XSIMD_RETURN_MERGED_SSE(res_low, res_high); #endif } inline batch operator<<(const batch& lhs, const batch& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_sllv_epi64(lhs, rhs); #else return avx_detail::shift_impl([](uint64_t val, int64_t s) { return val << s; }, lhs, rhs); #endif } inline batch operator>>(const batch& lhs, const batch& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_srlv_epi64(lhs, rhs); #else return avx_detail::shift_impl([](uint64_t val, int64_t s) { return val >> s; }, lhs, rhs); #endif } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_avx_int8.hpp000066400000000000000000000467721410101234500222470ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX_INT8_HPP #define XSIMD_AVX_INT8_HPP #include #include "xsimd_base.hpp" #include "xsimd_avx_int_base.hpp" namespace xsimd { /************************** * batch_bool * **************************/ template <> struct simd_batch_traits> { using value_type = int8_t; static constexpr std::size_t size = 32; using batch_type = batch; static constexpr std::size_t align = 32; }; template <> struct simd_batch_traits> { using value_type = uint8_t; static constexpr std::size_t size = 32; using batch_type = batch; static constexpr std::size_t align = 32; }; template <> class batch_bool : public avx_int_batch_bool { public: using avx_int_batch_bool::avx_int_batch_bool; }; template <> class batch_bool : public avx_int_batch_bool { public: using avx_int_batch_bool::avx_int_batch_bool; }; namespace detail { template <> struct batch_bool_kernel : public avx_int_batch_bool_kernel { }; template <> struct batch_bool_kernel : public avx_int_batch_bool_kernel { }; } /********************* * batch * *********************/ template <> struct simd_batch_traits> { using value_type = int8_t; static constexpr std::size_t size = 32; using batch_bool_type = batch_bool; static constexpr std::size_t align = 32; using storage_type = __m256i; }; template <> struct simd_batch_traits> { using value_type = uint8_t; static constexpr std::size_t size = 32; using batch_bool_type = batch_bool; static constexpr std::size_t align = 32; using storage_type = __m256i; }; template <> class batch : public avx_int_batch { public: using base_class = avx_int_batch; using base_class::base_class; using base_class::load_aligned; using base_class::load_unaligned; using base_class::store_aligned; using base_class::store_unaligned; batch() = default; explicit batch(const char* src) : batch(reinterpret_cast(src)) { } batch(const char* src, aligned_mode) : batch(reinterpret_cast(src), aligned_mode{}) { } batch(const char* src, unaligned_mode) : batch(reinterpret_cast(src), unaligned_mode{}) { } XSIMD_DECLARE_LOAD_STORE_INT8(int8_t, 32) XSIMD_DECLARE_LOAD_STORE_LONG(int8_t, 32) }; template <> class batch : public avx_int_batch { public: using base_class = avx_int_batch; using 
base_class::base_class; using base_class::load_aligned; using base_class::load_unaligned; using base_class::store_aligned; using base_class::store_unaligned; XSIMD_DECLARE_LOAD_STORE_INT8(uint8_t, 32) XSIMD_DECLARE_LOAD_STORE_LONG(uint8_t, 32) }; batch operator<<(const batch& lhs, int32_t rhs); batch operator>>(const batch& lhs, int32_t rhs); batch operator<<(const batch& lhs, const batch& rhs); batch operator>>(const batch& lhs, const batch& rhs); batch operator<<(const batch& lhs, int32_t rhs); batch operator>>(const batch& lhs, int32_t rhs); batch operator<<(const batch& lhs, const batch& rhs); batch operator>>(const batch& lhs, const batch& rhs); /************************************ * batch implementation * ************************************/ namespace detail { template struct int8_batch_kernel : avx_int_kernel_base> { using batch_type = batch; using value_type = T; using batch_bool_type = batch_bool; constexpr static bool is_signed = std::is_signed::value; static batch_type neg(const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_sub_epi8(_mm256_setzero_si256(), rhs); #else XSIMD_SPLIT_AVX(rhs); __m128i res_low = _mm_sub_epi8(_mm_setzero_si128(), rhs_low); __m128i res_high = _mm_sub_epi8(_mm_setzero_si128(), rhs_high); XSIMD_RETURN_MERGED_SSE(res_low, res_high); #endif } static batch_type add(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_add_epi8(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_add_epi8, lhs, rhs); #endif } static batch_type sub(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_sub_epi8(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_sub_epi8, lhs, rhs); #endif } static batch_type sadd(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_adds_epi8(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_adds_epi8, lhs, rhs); #endif } static batch_type ssub(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_subs_epi8(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_subs_epi8, lhs, rhs); #endif } static batch_type mul(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION batch_type upper = _mm256_and_si256(_mm256_mullo_epi16(lhs, rhs), _mm256_srli_epi16(_mm256_set1_epi16(-1), 8)); batch_type lower = _mm256_slli_epi16(_mm256_mullo_epi16(_mm256_srli_si256(lhs, 1), _mm256_srli_si256(rhs, 1)), 8); return _mm256_or_si256(upper, lower); #else // Note implement with conversion to epi16 XSIMD_MACRO_UNROLL_BINARY(*); #endif } static batch_type div(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION auto to_float = [](__m256i val) { // sign matters for conversion to epi32! if (std::is_signed::value) { return _mm256_cvtepi32_ps( _mm256_cvtepi8_epi32( _mm256_extractf128_si256(val, 0) ) ); } else { return _mm256_cvtepi32_ps( _mm256_cvtepu8_epi32( _mm256_extractf128_si256(val, 0) ) ); } }; auto to_int8 = [](__m256 x, __m256 y) { auto v0 = _mm256_cvttps_epi32(x); auto v1 = _mm256_cvttps_epi32(y); // here the sign doesn't matter ... just an interpretation detail auto a = _mm256_unpacklo_epi8(v0, v1); // 08.. .... 19.. .... 4C.. .... 5D.. .... auto b = _mm256_unpackhi_epi8(v0, v1); // 2A.. .... 3B.. .... 6E.. .... 7F.. .... auto c = _mm256_unpacklo_epi8(a, b); // 028A .... .... .... 46CE ... 
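// Note: this unpack sequence narrows two vectors of 32-bit quotients (v0, v1)
// back down to sixteen packed 8-bit results. The hex digits in the
// neighboring comments track original lane indices (0-F); each
// _mm256_unpacklo/hi_epi8 step interleaves bytes so that, after the final
// unpack and the cross-lane recombination below, the low byte of every
// quotient ends up contiguous and in order in the returned __m128i.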
auto d = _mm256_unpackhi_epi8(a, b); // 139B .... .... .... 57DF ... auto e = _mm256_unpacklo_epi8(c, d); // 0123 89AB .... .... 4567 CDEF ... return _mm_unpacklo_epi32(_mm256_extractf128_si256(e, 0), _mm256_extractf128_si256(e, 1)); // 0123 4567 89AB CDEF }; auto insert = [](__m256i a, __m128i b) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_inserti128_si256(a, b, 1); #else return _mm256_insertf128_si256(a, b, 1); #endif }; batch res_1 = _mm256_div_ps(to_float(lhs), to_float(rhs)); batch res_2 = _mm256_div_ps(to_float(_mm256_permute4x64_epi64(lhs, 0x01)), to_float(_mm256_permute4x64_epi64(rhs, 0x01))); batch res_3 = _mm256_div_ps(to_float(_mm256_permute4x64_epi64(lhs, 0x02)), to_float(_mm256_permute4x64_epi64(rhs, 0x02))); batch res_4 = _mm256_div_ps(to_float(_mm256_permute4x64_epi64(lhs, 0x03)), to_float(_mm256_permute4x64_epi64(rhs, 0x03))); return batch_type( insert(_mm256_castsi128_si256(to_int8(res_1, res_2)), to_int8(res_3, res_4)) ); #else XSIMD_MACRO_UNROLL_BINARY(/); #endif } static batch_type mod(const batch_type& lhs, const batch_type& rhs) { XSIMD_MACRO_UNROLL_BINARY(%); } static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_cmpeq_epi8(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_cmpeq_epi8, lhs, rhs); #endif } // TODO use conversion to int16_t static value_type hadd(const batch_type& lhs) { alignas(32) value_type tmp_lhs[32]; lhs.store_aligned(&tmp_lhs[0]); value_type res = 0; unroller<32>([&](std::size_t i) { res += tmp_lhs[i]; }); return res; } static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_blendv_epi8(b, a, cond); #else XSIMD_SPLIT_AVX(cond); XSIMD_SPLIT_AVX(a); XSIMD_SPLIT_AVX(b); __m128i res_low = _mm_blendv_epi8(b_low, a_low, cond_low); __m128i res_high = _mm_blendv_epi8(b_high, a_high, cond_high); XSIMD_RETURN_MERGED_SSE(res_low, res_high); #endif } static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs) { return _mm256_unpacklo_epi8(lhs, rhs); } static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs) { return _mm256_unpackhi_epi8(lhs, rhs); } static batch_type extract_pair(const batch_type& v_lhs, const batch_type& v_rhs, const int n) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION const batch_type lhs = v_rhs; const batch_type rhs = v_lhs; switch(n) { case 0: return rhs; XSIMD_REPEAT_32_v2(_mm256_alignr_epi8); default: break; } return batch_type(T(0)); #else batch_type b_concatenate; for (int i = 0 ; i < (32 - n); ++i) { b_concatenate[i] = v_lhs[i + n]; if(i < n) { b_concatenate[32 - 1 - i] = v_rhs[n - 1 - i]; } } return b_concatenate; #endif } }; template <> struct batch_kernel : int8_batch_kernel { static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_cmpgt_epi8(rhs, lhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_cmpgt_epi8, rhs, lhs); #endif } static batch_type min(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_min_epi8(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_min_epi8, lhs, rhs); #endif } static batch_type max(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_max_epi8(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_max_epi8, lhs, rhs); #endif } static batch_type abs(const batch_type& rhs) { #if 
XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_abs_epi8(rhs); #else XSIMD_SPLIT_AVX(rhs); __m128i res_low = _mm_abs_epi8(rhs_low); __m128i res_high = _mm_abs_epi8(rhs_high); XSIMD_RETURN_MERGED_SSE(res_low, res_high); #endif } }; template <> struct batch_kernel : int8_batch_kernel { static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION auto xor_lhs = _mm256_xor_si256(lhs, _mm256_set1_epi8(std::numeric_limits::lowest())); auto xor_rhs = _mm256_xor_si256(rhs, _mm256_set1_epi8(std::numeric_limits::lowest())); return _mm256_cmpgt_epi8(xor_rhs, xor_lhs); #else XSIMD_SPLIT_AVX(lhs); XSIMD_SPLIT_AVX(rhs); auto xer = _mm_set1_epi8(std::numeric_limits::lowest()); lhs_low = _mm_xor_si128(lhs_low, xer); lhs_high = _mm_xor_si128(lhs_high, xer); rhs_low = _mm_xor_si128(rhs_low, xer); rhs_high = _mm_xor_si128(rhs_high, xer); __m128i res_low = _mm_cmpgt_epi8(rhs_low, lhs_low); __m128i res_high = _mm_cmpgt_epi8(rhs_high, lhs_high); XSIMD_RETURN_MERGED_SSE(res_low, res_high); #endif } static batch_type min(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_min_epu8(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_min_epu8, lhs, rhs); #endif } static batch_type max(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_max_epu8(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_max_epu8, lhs, rhs); #endif } static batch_type abs(const batch_type& rhs) { return rhs; } static batch_type sadd(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_adds_epu8(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_adds_epu8, lhs, rhs); #endif } static batch_type ssub(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_subs_epu8(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_subs_epu8, lhs, rhs); #endif } }; } inline batch operator<<(const batch& lhs, int32_t rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_and_si256(_mm256_set1_epi8(0xFF << rhs), _mm256_slli_epi32(lhs, rhs)); #else return avx_detail::shift_impl([](int8_t val, int32_t s) { return val << s; }, lhs, rhs); #endif } inline batch operator>>(const batch& lhs, int32_t rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION __m256i sign_mask = _mm256_set1_epi16((0xFF00 >> rhs) & 0x00FF); __m256i cmp_is_negative = _mm256_cmpgt_epi8(_mm256_setzero_si256(), lhs); __m256i res = _mm256_srai_epi16(lhs, rhs); return _mm256_or_si256(_mm256_and_si256(sign_mask, cmp_is_negative), _mm256_andnot_si256(sign_mask, res)); #else return avx_detail::shift_impl([](int8_t val, int32_t s) { return val >> s; }, lhs, rhs); #endif } inline batch operator<<(const batch& lhs, const batch& rhs) { return avx_detail::shift_impl([](int8_t val, int8_t s) { return val << s; }, lhs, rhs); } inline batch operator>>(const batch& lhs, const batch& rhs) { return avx_detail::shift_impl([](int8_t val, int8_t s) { return val >> s; }, lhs, rhs); } XSIMD_DEFINE_LOAD_STORE_INT8(int8_t, 32, 32) XSIMD_DEFINE_LOAD_STORE_LONG(int8_t, 32, 32) inline batch operator<<(const batch& lhs, int32_t rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_and_si256(_mm256_set1_epi8(0xFF << rhs), _mm256_slli_epi32(lhs, rhs)); #else return avx_detail::shift_impl([](int8_t val, int32_t s) { return val << s; }, lhs, rhs); #endif } inline batch operator>>(const batch& lhs, int32_t rhs) 
{ #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_and_si256(_mm256_set1_epi8(0xFF >> rhs), _mm256_srli_epi32(lhs, rhs)); #else return avx_detail::shift_impl([](uint8_t val, int32_t s) { return val >> s; }, lhs, rhs); #endif } inline batch operator<<(const batch& lhs, const batch& rhs) { return avx_detail::shift_impl([](uint8_t val, int8_t s) { return val << s; }, lhs, rhs); } inline batch operator>>(const batch& lhs, const batch& rhs) { return avx_detail::shift_impl([](uint8_t val, int8_t s) { return val >> s; }, lhs, rhs); } XSIMD_DEFINE_LOAD_STORE_INT8(uint8_t, 32, 32) XSIMD_DEFINE_LOAD_STORE_LONG(uint8_t, 32, 32) } #endif xsimd-7.6.0/include/xsimd/types/xsimd_avx_int_base.hpp000066400000000000000000000461031410101234500231350ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX_INT_BASE_HPP #define XSIMD_AVX_INT_BASE_HPP #include "xsimd_base.hpp" namespace xsimd { #if XSIMD_X86_INSTR_SET < XSIMD_X86_AVX512_VERSION #define XSIMD_SPLIT_AVX(avx_name) \ __m128i avx_name##_low = _mm256_castsi256_si128(avx_name); \ __m128i avx_name##_high = _mm256_extractf128_si256(avx_name, 1) #define XSIMD_RETURN_MERGED_SSE(res_low, res_high) \ __m256i result = _mm256_castsi128_si256(res_low); \ return _mm256_insertf128_si256(result, res_high, 1) #define XSIMD_APPLY_SSE_FUNCTION(func, avx_lhs, avx_rhs) \ XSIMD_SPLIT_AVX(avx_lhs); \ XSIMD_SPLIT_AVX(avx_rhs); \ __m128i res_low = func(avx_lhs##_low, avx_rhs##_low); \ __m128i res_high = func(avx_lhs##_high, avx_rhs##_high); \ XSIMD_RETURN_MERGED_SSE(res_low, res_high); #endif template class avx_int_batch_bool : public simd_batch_bool> { public: avx_int_batch_bool(); explicit avx_int_batch_bool(bool b); template > avx_int_batch_bool(Args... args); avx_int_batch_bool(const __m256i& rhs); avx_int_batch_bool& operator=(const __m256i& rhs); operator __m256i() const; bool_proxy operator[](std::size_t index); bool operator[](std::size_t index) const; __m256i get_value() const; private: template batch_bool& load_values(Args... args); union { __m256i m_value; T m_array[N]; }; friend class simd_batch_bool>; }; template class avx_int_batch : public simd_batch> { public: using base_type = simd_batch>; using batch_bool_type = typename base_type::batch_bool_type; avx_int_batch(); explicit avx_int_batch(T i); // Constructor from N scalar parameters template > avx_int_batch(Args... 
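    /*
     * On targets that have AVX but not AVX2, the 256-bit integer operations in
     * this file are emulated through the XSIMD_SPLIT_AVX, XSIMD_APPLY_SSE_FUNCTION
     * and XSIMD_RETURN_MERGED_SSE macros defined above: split the register into
     * two 128-bit halves, run the SSE intrinsic on each half, and merge the
     * results. A hand-expanded sketch of that pattern (illustrative only; the
     * helper name add_epi8_avx1 is not part of the library):
     *
     * \code{.cpp}
     * #include <immintrin.h>
     *
     * inline __m256i add_epi8_avx1(__m256i lhs, __m256i rhs)
     * {
     *     __m128i lhs_low  = _mm256_castsi256_si128(lhs);       // XSIMD_SPLIT_AVX(lhs)
     *     __m128i lhs_high = _mm256_extractf128_si256(lhs, 1);
     *     __m128i rhs_low  = _mm256_castsi256_si128(rhs);       // XSIMD_SPLIT_AVX(rhs)
     *     __m128i rhs_high = _mm256_extractf128_si256(rhs, 1);
     *     __m128i res_low  = _mm_add_epi8(lhs_low, rhs_low);    // SSE op on each half
     *     __m128i res_high = _mm_add_epi8(lhs_high, rhs_high);
     *     __m256i result   = _mm256_castsi128_si256(res_low);   // XSIMD_RETURN_MERGED_SSE
     *     return _mm256_insertf128_si256(result, res_high, 1);
     * }
     * \endcode
     */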
exactly_N_scalars); explicit avx_int_batch(const T* src); avx_int_batch(const T* src, aligned_mode); avx_int_batch(const T* src, unaligned_mode); avx_int_batch(const __m256i& rhs); avx_int_batch(const batch_bool_type& rhs); batch& operator=(const __m256i& rhs); batch& operator=(const batch_bool_type& rhs); operator __m256i() const; batch& load_aligned(const T* src); batch& load_unaligned(const T* src); batch& load_aligned(const flipped_sign_type_t* src); batch& load_unaligned(const flipped_sign_type_t* src); void store_aligned(T* dst) const; void store_unaligned(T* dst) const; void store_aligned(flipped_sign_type_t* dst) const; void store_unaligned(flipped_sign_type_t* dst) const; using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; }; namespace avx_detail { template inline __m256i int_init(std::integral_constant, Args... args) { return _mm256_setr_epi8(args...); } template inline __m256i int_init(std::integral_constant, Args... args) { return _mm256_setr_epi16(args...); } template inline __m256i int_init(std::integral_constant, Args... args) { return _mm256_setr_epi32(args...); } template inline __m256i int_init(std::integral_constant, Args... args) { return _mm256_setr_epi64x(args...); } template inline __m256i int_set(std::integral_constant, T v) { return _mm256_set1_epi8(v); } template inline __m256i int_set(std::integral_constant, T v) { return _mm256_set1_epi16(v); } template inline __m256i int_set(std::integral_constant, T v) { return _mm256_set1_epi32(v); } template inline __m256i int_set(std::integral_constant, T v) { return _mm256_set1_epi64x(v); } } /***************************************** * batch_bool implementation * *****************************************/ template inline avx_int_batch_bool::avx_int_batch_bool() { } template inline avx_int_batch_bool::avx_int_batch_bool(bool b) : m_value(_mm256_set1_epi32(-(int32_t)b)) { } template template inline avx_int_batch_bool::avx_int_batch_bool(Args... args) : m_value(avx_detail::int_init(std::integral_constant{}, static_cast(args ? typename std::make_signed::type{-1} : 0)...)) { } template inline avx_int_batch_bool::avx_int_batch_bool(const __m256i& rhs) : m_value(rhs) { } template inline avx_int_batch_bool& avx_int_batch_bool::operator=(const __m256i& rhs) { m_value = rhs; return *this; } template inline avx_int_batch_bool::operator __m256i() const { return m_value; } template inline bool_proxy avx_int_batch_bool::operator[](std::size_t index) { return bool_proxy(m_array[index & (N - 1)]); } template inline bool avx_int_batch_bool::operator[](std::size_t index) const { return static_cast(m_array[index & (N - 1)]); } template inline __m256i avx_int_batch_bool::get_value() const { return m_value; } template template inline batch_bool& avx_int_batch_bool::load_values(Args... args) { m_value = avx_detail::int_init(std::integral_constant{}, static_cast(args ? 
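    /*
     * avx_detail::int_init and avx_detail::int_set above pick the right
     * _mm256_set*_epiN intrinsic by tag-dispatching on sizeof(T) through
     * std::integral_constant. A reduced sketch of the same technique
     * (illustrative only; set1_impl and set1_dispatch are not library names):
     *
     * \code{.cpp}
     * #include <cstddef>
     * #include <cstdint>
     * #include <type_traits>
     * #include <immintrin.h>
     *
     * inline __m256i set1_impl(std::integral_constant<std::size_t, 1>, std::int8_t v)
     * {
     *     return _mm256_set1_epi8(v);
     * }
     *
     * inline __m256i set1_impl(std::integral_constant<std::size_t, 4>, std::int32_t v)
     * {
     *     return _mm256_set1_epi32(v);
     * }
     *
     * template <class T>
     * inline __m256i set1_dispatch(T v)
     * {
     *     // the tag type selects the overload matching sizeof(T) at compile time
     *     return set1_impl(std::integral_constant<std::size_t, sizeof(T)>{}, v);
     * }
     * \endcode
     */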
typename std::make_signed::type{-1} : 0)...); return (*this)(); } namespace detail { template struct avx_int_batch_bool_kernel { using batch_type = batch_bool; static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_and_si256(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_and_si128, lhs, rhs); #endif } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_or_si256(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_or_si128, lhs, rhs); #endif } static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_xor_si256(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_xor_si128, lhs, rhs); #endif } static batch_type bitwise_not(const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_xor_si256(rhs, _mm256_set1_epi32(-1)); // xor with all one #else XSIMD_SPLIT_AVX(rhs); __m128i res_low = _mm_xor_si128(rhs_low, _mm_set1_epi32(-1)); __m128i res_high = _mm_xor_si128(rhs_high, _mm_set1_epi32(-1)); XSIMD_RETURN_MERGED_SSE(res_low, res_high); #endif } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_andnot_si256(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_andnot_si128, lhs, rhs); #endif } static batch_type equal(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION switch(sizeof(T)) { case 1: return _mm256_cmpeq_epi8(lhs, rhs); case 2: return _mm256_cmpeq_epi16(lhs, rhs); case 4: return _mm256_cmpeq_epi32(lhs, rhs); case 8: return _mm256_cmpeq_epi64(lhs, rhs); } #else switch(sizeof(T)) { case 1: { XSIMD_APPLY_SSE_FUNCTION(_mm_cmpeq_epi8, lhs, rhs); } case 2: { XSIMD_APPLY_SSE_FUNCTION(_mm_cmpeq_epi16, lhs, rhs); } case 4: { XSIMD_APPLY_SSE_FUNCTION(_mm_cmpeq_epi32, lhs, rhs); } case 8: { XSIMD_APPLY_SSE_FUNCTION(_mm_cmpeq_epi64, lhs, rhs); } } #endif } static batch_type not_equal(const batch_type& lhs, const batch_type& rhs) { return ~(lhs == rhs); } static bool all(const batch_type& rhs) { return _mm256_testc_si256(rhs, batch_type(true)) != 0; } static bool any(const batch_type& rhs) { return !_mm256_testz_si256(rhs, rhs); } }; } /************************************** * avx_int_batch implementation * **************************************/ template inline avx_int_batch::avx_int_batch() { } template inline avx_int_batch::avx_int_batch(T i) : base_type(avx_detail::int_set(std::integral_constant{}, i)) { } template template inline avx_int_batch::avx_int_batch(Args... 
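    /*
     * The all()/any() reductions above rely on two AVX test instructions:
     * _mm256_testz_si256(m, m) is non-zero exactly when every bit of m is zero,
     * and _mm256_testc_si256(m, all_true) is non-zero exactly when m contains
     * every bit of the all-true mask. A small sketch of those semantics
     * (illustrative only; assumes an AVX-enabled build):
     *
     * \code{.cpp}
     * #include <immintrin.h>
     * #include <cassert>
     *
     * int main()
     * {
     *     __m256i none = _mm256_setzero_si256();
     *     __m256i full = _mm256_set1_epi32(-1);
     *     assert(_mm256_testz_si256(none, none) != 0); // any(none) == false
     *     assert(_mm256_testz_si256(full, full) == 0); // any(full) == true
     *     assert(_mm256_testc_si256(full, full) != 0); // all(full) == true
     *     assert(_mm256_testc_si256(none, full) == 0); // all(none) == false
     * }
     * \endcode
     */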
args) : base_type(avx_detail::int_init(std::integral_constant{}, args...)) { } template inline avx_int_batch::avx_int_batch(const T* src) : base_type(_mm256_loadu_si256((__m256i const*)src)) { } template inline avx_int_batch::avx_int_batch(const T* src, aligned_mode) : base_type(_mm256_load_si256((__m256i const*)src)) { } template inline avx_int_batch::avx_int_batch(const T* src, unaligned_mode) : base_type(_mm256_loadu_si256((__m256i const*)src)) { } template inline avx_int_batch::avx_int_batch(const __m256i& rhs) : base_type(rhs) { } namespace detail { inline __m256i bitwise_and_impl(__m256i lhs, __m256i rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_and_si256(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_and_si128, lhs, rhs); #endif } } template inline avx_int_batch::avx_int_batch(const batch_bool_type& rhs) : base_type(detail::bitwise_and_impl(rhs, batch(1))) { } template inline batch& avx_int_batch::operator=(const __m256i& rhs) { this->m_value = rhs; return (*this)(); } template inline batch& avx_int_batch::operator=(const batch_bool_type& rhs) { this->m_value = detail::bitwise_and_impl(rhs, batch(1)); return (*this)(); } template inline avx_int_batch::operator __m256i() const { return this->m_value; } template inline batch& avx_int_batch::load_aligned(const T* src) { this->m_value = _mm256_load_si256((__m256i const*) src); return (*this)(); } template inline batch& avx_int_batch::load_unaligned(const T* src) { this->m_value = _mm256_loadu_si256((__m256i const*) src); return (*this)(); } template inline batch& avx_int_batch::load_aligned(const flipped_sign_type_t* src) { this->m_value = _mm256_load_si256((__m256i const*) src); return (*this)(); } template inline batch& avx_int_batch::load_unaligned(const flipped_sign_type_t* src) { this->m_value = _mm256_loadu_si256((__m256i const*) src); return (*this)(); } template inline void avx_int_batch::store_aligned(T* dst) const { _mm256_store_si256((__m256i*) dst, this->m_value); } template inline void avx_int_batch::store_unaligned(T* dst) const { _mm256_storeu_si256((__m256i*) dst, this->m_value); } template inline void avx_int_batch::store_aligned(flipped_sign_type_t* dst) const { _mm256_store_si256((__m256i*) dst, this->m_value); } template inline void avx_int_batch::store_unaligned(flipped_sign_type_t* dst) const { _mm256_storeu_si256((__m256i*) dst, this->m_value); } namespace detail { template struct avx_int_kernel_base { using batch_type = B; using batch_bool_type = typename simd_batch_traits::batch_bool_type; // static constexpr std::size_t size = simd_batch_traits::size; // static constexpr std::size_t align = simd_batch_traits::align; static batch_bool_type neq(const batch_type& lhs, const batch_type& rhs) { return ~(lhs == rhs); } static batch_bool_type lte(const batch_type& lhs, const batch_type& rhs) { return ~(rhs < lhs); } static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) { return detail::bitwise_and_impl(lhs, rhs); } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_or_si256(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_or_si128, lhs, rhs); #endif } static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_xor_si256(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_xor_si128, lhs, rhs); #endif } static batch_type bitwise_not(const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return 
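    /*
     * Constructing an integer batch from a batch_bool (see the constructor
     * taking batch_bool_type above) turns the per-lane all-ones/all-zeros mask
     * into plain 1/0 values by AND-ing it with batch(1). A usage sketch
     * (illustrative; assumes an AVX build where batch<int32_t, 8> is available):
     *
     * \code{.cpp}
     * #include "xsimd/xsimd.hpp"
     * #include <cassert>
     * #include <cstdint>
     *
     * int main()
     * {
     *     xsimd::batch<int32_t, 8> a(1, 2, 3, 4, 5, 6, 7, 8);
     *     xsimd::batch_bool<int32_t, 8> mask = a > xsimd::batch<int32_t, 8>(4);
     *     xsimd::batch<int32_t, 8> as_int(mask); // 0 where false, 1 where true
     *     assert(as_int[0] == 0 && as_int[7] == 1);
     * }
     * \endcode
     */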
_mm256_xor_si256(rhs, _mm256_set1_epi8(-1)); #else XSIMD_SPLIT_AVX(rhs); __m128i res_low = _mm_xor_si128(rhs_low, _mm_set1_epi8(-1)); __m128i res_high = _mm_xor_si128(rhs_high, _mm_set1_epi8(-1)); XSIMD_RETURN_MERGED_SSE(res_low, res_high); #endif } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm256_andnot_si256(lhs, rhs); #else XSIMD_APPLY_SSE_FUNCTION(_mm_andnot_si128, lhs, rhs); #endif } static batch_type fmin(const batch_type& lhs, const batch_type& rhs) { return min(lhs, rhs); } static batch_type fmax(const batch_type& lhs, const batch_type& rhs) { return max(lhs, rhs); } static batch_type fabs(const batch_type& rhs) { return abs(rhs); } static batch_type fma(const batch_type& x, const batch_type& y, const batch_type& z) { return x * y + z; } static batch_type fms(const batch_type& x, const batch_type& y, const batch_type& z) { return x * y - z; } static batch_type fnma(const batch_type& x, const batch_type& y, const batch_type& z) { return -x * y + z; } static batch_type fnms(const batch_type& x, const batch_type& y, const batch_type& z) { return -x * y - z; } }; } namespace avx_detail { template inline batch shift_impl(F&& f, const batch& lhs, int32_t rhs) { alignas(32) T tmp_lhs[N], tmp_res[N]; lhs.store_aligned(&tmp_lhs[0]); unroller([&](std::size_t i) { tmp_res[i] = f(tmp_lhs[i], rhs); }); return batch(tmp_res, aligned_mode()); } template inline batch shift_impl(F&& f, const batch& lhs, const batch& rhs) { alignas(32) T tmp_lhs[N], tmp_res[N]; alignas(32) S tmp_rhs[N]; lhs.store_aligned(&tmp_lhs[0]); rhs.store_aligned(&tmp_rhs[0]); unroller([&](std::size_t i) { tmp_res[i] = f(tmp_lhs[i], tmp_rhs[i]); }); return batch(tmp_res, aligned_mode()); } } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_base.hpp000066400000000000000000002406411410101234500214100ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_BASE_HPP #define XSIMD_BASE_HPP #include #include #include #include #include #include #ifdef XSIMD_ENABLE_XTL_COMPLEX #include "xtl/xcomplex.hpp" #endif #include "../memory/xsimd_alignment.hpp" #include "xsimd_utils.hpp" #include "xsimd_base_bool.hpp" #include "xsimd_base_constant.hpp" namespace xsimd { template class simd_base; template class batch; namespace detail { template struct batch_kernel; } template struct simd_batch_inner_types { using batch_reference = X&; using const_batch_reference = const X&; }; template using batch_type_t = typename T::batch_type; namespace detail { template struct get_real_batch_type { using batch_type = batch_type_t; }; template struct get_real_batch_type, N>> { using batch_type = typename batch, N>::real_batch; }; #ifdef XSIMD_ENABLE_XTL_COMPLEX template struct get_real_batch_type, N>> { using batch_type = typename batch, N>::real_batch; }; #endif } template using real_batch_type_t = typename detail::get_real_batch_type::batch_type; namespace detail { template struct is_simd_type : std::is_base_of, X> { }; } template using enable_if_simd_t = typename std::enable_if::value, batch_type_t>::type; /************* * simd_base * *************/ /** * @class simd_base * @brief Base class for batches and batch proxies. * * The simd_base class is the base class for all classes * representing a batch or a batch proxy. It provides very few * methods, so concrete batches usually inherit from intermediate * classes. * * @tparam X The most derived type */ template class simd_base { public: using derived_class = X; using batch_reference = typename simd_batch_inner_types::batch_reference; using const_batch_reference = typename simd_batch_inner_types::const_batch_reference; batch_reference operator()(); const_batch_reference operator()() const; X& derived_cast(); const X& derived_cast() const; }; /************** * simd_batch * **************/ /** * @class simd_batch * @brief Base class for batch of integer or floating point values. * * The simd_batch class is the base class for all classes representing * a batch of integer or floating point values. Each type of batch (i.e. * a class inheriting from simd_batch) has its dedicated type of boolean * batch (i.e. a class inheriting from simd_batch_bool) for logical operations. 
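     *
     * A minimal usage sketch (illustrative; assumes an SSE-enabled build where
     * batch<float, 4> is available):
     * \code{.cpp}
     * #include "xsimd/xsimd.hpp"
     *
     * int main()
     * {
     *     float a[4] = {1.f, 2.f, 3.f, 4.f};
     *     float b[4] = {4.f, 3.f, 2.f, 1.f};
     *     float res[4];
     *     xsimd::batch<float, 4> ba, bb;
     *     ba.load_unaligned(a);
     *     bb.load_unaligned(b);
     *     (ba + bb * 2.f).store_unaligned(res);  // res = {9, 8, 7, 6}
     * }
     * \endcode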
* * @tparam X The derived type * @sa simd_batch_bool */ template class simd_batch : public simd_base { public: using base_type = simd_base; using batch_reference = typename base_type::batch_reference; using const_batch_reference = typename base_type::const_batch_reference; using batch_type = X; using value_type = typename simd_batch_traits::value_type; static constexpr std::size_t size = simd_batch_traits::size; using storage_type = typename simd_batch_traits::storage_type; using batch_bool_type = typename simd_batch_traits::batch_bool_type; using iterator = value_type*; using const_iterator = const value_type*; using reverse_iterator = std::reverse_iterator; using const_reverse_iterator = std::reverse_iterator; static X broadcast(value_type v); template static X from_unaligned(T* src); template static X from_aligned(T* src); X& operator+=(const X& rhs); X& operator+=(const value_type& rhs); X& operator-=(const X& rhs); X& operator-=(const value_type& rhs); X& operator*=(const X& rhs); X& operator*=(const value_type& rhs); X& operator/=(const X& rhs); X& operator/=(const value_type& rhs); X& operator&=(const X& rhs); X& operator|=(const X& rhs); X& operator^=(const X& rhs); X& operator++(); X& operator++(int); X& operator--(); X& operator--(int); X& load_aligned(const char* src); X& load_unaligned(const char* src); void store_aligned(char* dst) const; void store_unaligned(char* dst) const; batch_reference get(); const_batch_reference get() const; value_type& operator[](std::size_t index); const value_type& operator[](std::size_t index) const; iterator begin(); iterator end(); const_iterator begin() const; const_iterator end() const; const_iterator cbegin() const; const_iterator cend() const; reverse_iterator rbegin(); reverse_iterator rend(); const_reverse_iterator rbegin() const; const_reverse_iterator rend() const; const_reverse_iterator crbegin() const; const_reverse_iterator crend() const; protected: simd_batch() = default; ~simd_batch() = default; simd_batch(const simd_batch&) = default; simd_batch& operator=(const simd_batch&) = default; simd_batch(simd_batch&&) = default; simd_batch& operator=(simd_batch&&) = default; constexpr simd_batch(storage_type value); using char_itype = typename std::conditional::value, int8_t, uint8_t>::type; union { storage_type m_value; value_type m_array[size]; }; }; template typename simd_batch_traits::batch_bool_type operator!(const simd_base& rhs); template batch_type_t min(const simd_base& lhs, const simd_base& rhs); template batch_type_t max(const simd_base& lhs, const simd_base& rhs); template batch_type_t fmin(const simd_base& lhs, const simd_base& rhs); template batch_type_t fmax(const simd_base& lhs, const simd_base& rhs); template real_batch_type_t abs(const simd_base& rhs); template batch_type_t fabs(const simd_base& rhs); template batch_type_t sqrt(const simd_base& rhs); template batch_type_t fma(const simd_base& x, const simd_base& y, const simd_base& z); template batch_type_t fms(const simd_base& x, const simd_base& y, const simd_base& z); template batch_type_t fnma(const simd_base& x, const simd_base& y, const simd_base& z); template batch_type_t fnms(const simd_base& x, const simd_base& y, const simd_base& z); template typename simd_batch_traits::value_type hadd(const simd_base& rhs); template enable_if_simd_t haddp(const X* row); template batch_type_t select(const typename simd_batch_traits::batch_bool_type& cond, const simd_base& a, const simd_base& b); template batch_type_t zip_lo(const simd_base& lhs, const simd_base& rhs); template 
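    /*
     * hadd, declared above, reduces a batch to a single scalar by summing all
     * of its lanes (haddp does the same for an array of batches, producing one
     * result lane per input batch). A small sketch (illustrative; assumes an
     * SSE-enabled build where batch<float, 4> is available):
     *
     * \code{.cpp}
     * #include "xsimd/xsimd.hpp"
     * #include <cassert>
     *
     * int main()
     * {
     *     xsimd::batch<float, 4> b(1.f, 2.f, 3.f, 4.f);
     *     assert(xsimd::hadd(b) == 10.f);
     * }
     * \endcode
     */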
batch_type_t zip_hi(const simd_base& lhs, const simd_base& rhs); template batch_type_t extract_pair(const simd_base& lhs, const simd_base& rhs, const int n); template typename simd_batch_traits::batch_bool_type isnan(const simd_base& x); template std::ostream& operator<<(std::ostream& out, const simd_batch& rhs); /*************************** * generic batch operators * ***************************/ template batch operator&&(const batch& lhs, const batch& rhs); template batch operator||(const batch& lhs, const batch& rhs); template batch operator<<(const batch& lhs, const batch& rhs); template batch operator>>(const batch& lhs, const batch& rhs); /************************** * batch cast functions * **************************/ // Provides a static_cast from batch to batch template struct batch_cast_impl { template static inline batch run_impl(const batch& x, detail::index_sequence) { return batch(static_cast(x[I])...); } public: static inline batch run(const batch& x) { return run_impl(x, detail::make_index_sequence{}); } }; template struct batch_cast_impl { static inline batch run(const batch& x) { return x; } }; // Shorthand for defining an intrinsic-based batch_cast implementation #define XSIMD_BATCH_CAST_INTRINSIC(T_IN, T_OUT, N, INTRINSIC) \ template <> \ struct batch_cast_impl \ { \ static inline batch run(const batch& x) \ { \ return INTRINSIC(x); \ } \ }; // Shorthand for defining an intrinsic-based batch_cast implementation that requires 2 intrinsics #define XSIMD_BATCH_CAST_INTRINSIC2(T_IN, T_OUT, N, INTRINSIC1, INTRINSIC2) \ template <> \ struct batch_cast_impl \ { \ static inline batch run(const batch& x) \ { \ return INTRINSIC2(INTRINSIC1(x)); \ } \ }; // Shorthand for defining an implicit batch_cast implementation #define XSIMD_BATCH_CAST_IMPLICIT(T_IN, T_OUT, N) \ template <> \ struct batch_cast_impl \ { \ static inline batch run(const batch& x) \ { \ return batch(x); \ } \ }; /************************** * bitwise cast functions * **************************/ // Provides a reinterpret_cast from batch to batch template struct bitwise_cast_impl; // Shorthand for defining an intrinsic-based bitwise_cast implementation #define XSIMD_BITWISE_CAST_INTRINSIC(T_IN, N_IN, T_OUT, N_OUT, INTRINSIC) \ template <> \ struct bitwise_cast_impl, batch> \ { \ static inline batch run(const batch& x) \ { \ return INTRINSIC(x); \ } \ }; // Backwards-compatible interface to bitwise_cast_impl template ::size> B bitwise_cast(const batch& x); template ::size> B bitwise_cast(const batch& x); template ::size> B bitwise_cast(const batch& x); template ::size> B bitwise_cast(const batch& x); template batch bitwise_cast(const batch_bool& src); /**************** * helper macro * ****************/ #define XSIMD_DECLARE_LOAD_STORE(TYPE, N, CVT_TYPE) \ batch& load_aligned(const CVT_TYPE*); \ batch& load_unaligned(const CVT_TYPE*); \ void store_aligned(CVT_TYPE* dst) const; \ void store_unaligned(CVT_TYPE* dst) const; #define XSIMD_DEFINE_LOAD_STORE(TYPE, N, CVT_TYPE, ALIGNMENT) \ inline batch& batch::load_aligned(const CVT_TYPE* src) \ { \ alignas(ALIGNMENT) TYPE tmp[N]; \ unroller([&](std::size_t i) { \ tmp[i] = static_cast(src[i]); \ }); \ return load_aligned(tmp); \ } \ inline batch& batch::load_unaligned(const CVT_TYPE* src) \ { \ return load_aligned(src); \ } \ inline void batch::store_aligned(CVT_TYPE* dst) const \ { \ alignas(ALIGNMENT) TYPE tmp[N]; \ store_aligned(tmp); \ unroller([&](std::size_t i) { \ dst[i] = static_cast(tmp[i]); \ }); \ } \ inline void batch::store_unaligned(CVT_TYPE* dst) const \ { \ 
return store_aligned(dst); \ } #ifdef XSIMD_32_BIT_ABI #define XSIMD_DECLARE_LOAD_STORE_LONG(TYPE, N) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, long) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, unsigned long) \ namespace detail { template struct get_int_type; template <> struct get_int_type { using type = int32_t; }; template <> struct get_int_type { using type = uint32_t; }; template using get_int_type_t = typename get_int_type::type; } #define XSIMD_DEFINE_LOAD_STORE_LONG_IMPL(TYPE, N, CVT_TYPE, ALIGNMENT) \ inline batch& batch::load_aligned(const CVT_TYPE* src) \ { \ using int_type = detail::get_int_type_t; \ return this->load_aligned(reinterpret_cast(src)); \ } \ inline batch& batch::load_unaligned(const CVT_TYPE* src) \ { \ using int_type = detail::get_int_type_t; \ return this->load_unaligned(reinterpret_cast(src)); \ } \ inline void batch::store_aligned(CVT_TYPE* dst) const \ { \ using int_type = detail::get_int_type_t; \ this->store_aligned(reinterpret_cast(dst)); \ } \ inline void batch::store_unaligned(CVT_TYPE* dst) const \ { \ using int_type = detail::get_int_type_t; \ this->store_unaligned(reinterpret_cast(dst)); \ } \ #define XSIMD_DEFINE_LOAD_STORE_LONG(TYPE, N, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE_LONG_IMPL(TYPE, N, long, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE_LONG_IMPL(TYPE, N, unsigned long, ALIGNMENT) \ #else #define XSIMD_DECLARE_LOAD_STORE_LONG(TYPE, N) #define XSIMD_DEFINE_LOAD_STORE_LONG(TYPE, N, ALIGNMENT) #endif // XSIMD_32_BIT_ABI #define XSIMD_DECLARE_LOAD_STORE_INT8(TYPE, N) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, bool) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, int16_t) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, uint16_t) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, int32_t) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, uint32_t) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, int64_t) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, uint64_t) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, float) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, double) #define XSIMD_DEFINE_LOAD_STORE_INT8(TYPE, N, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE(TYPE, N, bool, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE(TYPE, N, int16_t, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE(TYPE, N, uint16_t, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE(TYPE, N, int32_t, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE(TYPE, N, uint32_t, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE(TYPE, N, int64_t, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE(TYPE, N, uint64_t, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE(TYPE, N, float, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE(TYPE, N, double, ALIGNMENT) #define XSIMD_DECLARE_LOAD_STORE_INT16(TYPE, N) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, bool) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, int8_t) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, uint8_t) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, int32_t) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, uint32_t) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, int64_t) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, uint64_t) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, float) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, double) #define XSIMD_DEFINE_LOAD_STORE_INT16(TYPE, N, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE(TYPE, N, bool, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE(TYPE, N, int8_t, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE(TYPE, N, uint8_t, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE(TYPE, N, int32_t, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE(TYPE, N, uint32_t, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE(TYPE, N, int64_t, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE(TYPE, N, uint64_t, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE(TYPE, N, float, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE(TYPE, N, double, ALIGNMENT) #define XSIMD_DECLARE_LOAD_STORE_INT32(TYPE, N) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, bool) \ 
XSIMD_DECLARE_LOAD_STORE(TYPE, N, int8_t) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, uint8_t) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, int16_t) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, uint16_t) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, int64_t) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, uint64_t) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, float) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, double) #define XSIMD_DEFINE_LOAD_STORE_INT32(TYPE, N, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE(TYPE, N, bool, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE(TYPE, N, int8_t, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE(TYPE, N, uint8_t, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE(TYPE, N, int16_t, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE(TYPE, N, uint16_t, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE(TYPE, N, int64_t, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE(TYPE, N, uint64_t, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE(TYPE, N, float, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE(TYPE, N, double, ALIGNMENT) #define XSIMD_DECLARE_LOAD_STORE_INT64(TYPE, N) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, bool) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, int8_t) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, uint8_t) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, int16_t) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, uint16_t) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, int32_t) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, uint32_t) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, float) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, double) #define XSIMD_DEFINE_LOAD_STORE_INT64(TYPE, N, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE(TYPE, N, bool, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE(TYPE, N, int8_t, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE(TYPE, N, uint8_t, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE(TYPE, N, int16_t, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE(TYPE, N, uint16_t, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE(TYPE, N, int32_t, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE(TYPE, N, uint32_t, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE(TYPE, N, float, ALIGNMENT) \ XSIMD_DEFINE_LOAD_STORE(TYPE, N, double, ALIGNMENT) #define XSIMD_DECLARE_LOAD_STORE_ALL(TYPE, N) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, bool) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, int8_t) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, uint8_t) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, int16_t) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, uint16_t) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, int32_t) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, uint32_t) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, int64_t) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, uint64_t) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, float) \ XSIMD_DECLARE_LOAD_STORE(TYPE, N, double) #define XSIMD_DEFINE_BITWISE_CAST(TYPE, N) \ inline batch bitwise_cast(const batch_bool& src) \ { \ TYPE z(0); \ return select(src, batch(TYPE(~z)), batch(z)); \ } #define XSIMD_DEFINE_BITWISE_CAST_FLOAT(TYPE, N) \ inline batch bitwise_cast(const batch_bool& src) \ { \ TYPE z0(0), z1(0); \ using int_type = as_unsigned_integer_t; \ int_type value(~int_type(0)); \ std::memcpy(&z1, &value, sizeof(int_type)); \ return select(src, batch(z1), batch(z0)); \ } #define XSIMD_DEFINE_BITWISE_CAST_ALL(NMIN) \ XSIMD_DEFINE_BITWISE_CAST_FLOAT(double, NMIN) \ XSIMD_DEFINE_BITWISE_CAST_FLOAT(float, NMIN * 2) \ XSIMD_DEFINE_BITWISE_CAST(int64_t, NMIN) \ XSIMD_DEFINE_BITWISE_CAST(uint64_t, NMIN) \ XSIMD_DEFINE_BITWISE_CAST(int32_t, NMIN * 2) \ XSIMD_DEFINE_BITWISE_CAST(uint32_t, NMIN * 2) \ XSIMD_DEFINE_BITWISE_CAST(int16_t, NMIN * 4) \ XSIMD_DEFINE_BITWISE_CAST(uint16_t, NMIN * 4) \ XSIMD_DEFINE_BITWISE_CAST(int8_t, NMIN * 8) \ XSIMD_DEFINE_BITWISE_CAST(uint8_t, NMIN * 8) /**************************** * simd_base implementation * ****************************/ /** * @name Static downcast functions */ //@{ /** * 
Returns a reference to the batch type used for computation. */ template inline auto simd_base::operator()() -> batch_reference { return derived_cast().get(); } /** * Returns a constant reference to the batch type used for computation. */ template inline auto simd_base::operator()() const -> const_batch_reference { return derived_cast().get(); } /** * Returns a reference to the actual derived type of simd_base. */ template inline X& simd_base::derived_cast() { return *static_cast(this); } /** * Returns a constant reference to the actual derived type of simd_base. */ template inline const X& simd_base::derived_cast() const { return *static_cast(this); } //@} /***************************** * simd_batch implementation * *****************************/ template constexpr inline simd_batch::simd_batch(storage_type value) : m_value(value) { } /** * @name Static builders */ //@{ /** * Creates a batch from the single value \c v. * @param v the value used to initialize the batch * @return a new batch instance */ template inline X simd_batch::broadcast(value_type v) { return X(v); } /** * Creates a batch from the buffer \c src. The * memory does not need to be aligned. * @param src the memory buffer to read * @return a new batch instance */ template template inline X simd_batch::from_unaligned(T* src) { X res; res.load_unaligned(src); return res; } /** * Creates a batch from the buffer \c src. The * memory needs to be aligned. * @param src the memory buffer to read * @return a new batch instance */ template template inline X simd_batch::from_aligned(T* src) { X res; res.load_aligned(src); return res; } //@} /** * @name Arithmetic computed assignment */ //@{ /** * Adds the batch \c rhs to \c this. * @param rhs the batch to add. * @return a reference to \c this. */ template inline X& simd_batch::operator+=(const X& rhs) { (*this)() = (*this)() + rhs; return (*this)(); } /** * Adds the scalar \c rhs to each value contained in \c this. * @param rhs the scalar to add. * @return a reference to \c this. */ template inline X& simd_batch::operator+=(const value_type& rhs) { (*this)() = (*this)() + X(rhs); return (*this)(); } /** * Substracts the batch \c rhs to \c this. * @param rhs the batch to substract. * @return a reference to \c this. */ template inline X& simd_batch::operator-=(const X& rhs) { (*this)() = (*this)() - rhs; return (*this)(); } /** * Substracts the scalar \c rhs to each value contained in \c this. * @param rhs the scalar to substract. * @return a reference to \c this. */ template inline X& simd_batch::operator-=(const value_type& rhs) { (*this)() = (*this)() - X(rhs); return (*this)(); } /** * Multiplies \c this with the batch \c rhs. * @param rhs the batch involved in the multiplication. * @return a reference to \c this. */ template inline X& simd_batch::operator*=(const X& rhs) { (*this)() = (*this)() * rhs; return (*this)(); } /** * Multiplies each scalar contained in \c this with the scalar \c rhs. * @param rhs the scalar involved in the multiplication. * @return a reference to \c this. */ template inline X& simd_batch::operator*=(const value_type& rhs) { (*this)() = (*this)() * X(rhs); return (*this)(); } /** * Divides \c this by the batch \c rhs. * @param rhs the batch involved in the division. * @return a reference to \c this. */ template inline X& simd_batch::operator/=(const X& rhs) { (*this)() = (*this)() / rhs; return (*this)(); } /** * Divides each scalar contained in \c this by the scalar \c rhs. * @param rhs the scalar involved in the division. 
* @return a reference to \c this. */ template inline X& simd_batch::operator/=(const value_type& rhs) { (*this)() = (*this)() / X(rhs); return (*this)(); } //@} /** * @name Bitwise computed assignment */ /** * Assigns the bitwise and of \c rhs and \c this. * @param rhs the batch involved in the operation. * @return a reference to \c this. */ template inline X& simd_batch::operator&=(const X& rhs) { (*this)() = (*this)() & rhs; return (*this)(); } /** * Assigns the bitwise or of \c rhs and \c this. * @param rhs the batch involved in the operation. * @return a reference to \c this. */ template inline X& simd_batch::operator|=(const X& rhs) { (*this)() = (*this)() | rhs; return (*this)(); } /** * Assigns the bitwise xor of \c rhs and \c this. * @param rhs the batch involved in the operation. * @return a reference to \c this. */ template inline X& simd_batch::operator^=(const X& rhs) { (*this)() = (*this)() ^ rhs; return (*this)(); } //@} /** * @name Increment and decrement operators */ //@{ /** * Pre-increment operator. * @return a reference to \c this. */ template inline X& simd_batch::operator++() { (*this)() += value_type(1); return (*this)(); } /** * Post-increment operator. * @return a reference to \c this. */ template inline X& simd_batch::operator++(int) { X tmp = (*this)(); (*this)() += value_type(1); return tmp; } /** * Pre-decrement operator. * @return a reference to \c this. */ template inline X& simd_batch::operator--() { (*this)() -= value_type(1); return (*this)(); } /** * Post-decrement operator. * @return a reference to \c this. */ template inline X& simd_batch::operator--(int) { X tmp = (*this)(); (*this)() -= value_type(1); return tmp; } //@} template inline X& simd_batch::load_aligned(const char* src) { return (*this)().load_aligned(reinterpret_cast(src)); } template inline X& simd_batch::load_unaligned(const char* src) { return (*this)().load_unaligned(reinterpret_cast(src)); } template void simd_batch::store_aligned(char* dst) const { return (*this)().store_aligned(reinterpret_cast(dst)); } template void simd_batch::store_unaligned(char* dst) const { return (*this)().store_unaligned(reinterpret_cast(dst)); } template inline auto simd_batch::get() -> batch_reference { return this->derived_cast(); } template inline auto simd_batch::get() const -> const_batch_reference { return this->derived_cast(); } template inline auto simd_batch::operator[](std::size_t index) -> value_type& { return m_array[index & (size - 1)]; } template inline auto simd_batch::operator[](std::size_t index) const -> const value_type& { return m_array[index & (size - 1)]; } template inline auto simd_batch::begin() -> iterator { return m_array; } template inline auto simd_batch::end() -> iterator { return m_array + size; } template inline auto simd_batch::begin() const -> const_iterator { return cbegin(); } template inline auto simd_batch::end() const -> const_iterator { return cend(); } template inline auto simd_batch::cbegin() const -> const_iterator { return m_array; } template inline auto simd_batch::cend() const -> const_iterator { return m_array + size; } template inline auto simd_batch::rbegin() -> reverse_iterator { return reverse_iterator(end()); } template inline auto simd_batch::rend() -> reverse_iterator { return reverse_iterator(begin()); } template inline auto simd_batch::rbegin() const -> const_reverse_iterator { return crbegin(); } template inline auto simd_batch::rend() const -> const_reverse_iterator { return crend(); } template inline auto simd_batch::crbegin() const -> 
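    /*
     * Batches expose scalar access through operator[] and the begin()/end()
     * iterator pair defined above, which is convenient for tail handling and
     * debugging. A short sketch (illustrative; assumes an SSE-enabled build
     * where batch<int32_t, 4> is available):
     *
     * \code{.cpp}
     * #include "xsimd/xsimd.hpp"
     * #include <cstdint>
     * #include <numeric>
     *
     * int main()
     * {
     *     xsimd::batch<int32_t, 4> b(1, 2, 3, 4);
     *     int32_t first = b[0];                                   // per-lane access
     *     int32_t sum = std::accumulate(b.begin(), b.end(), 0);   // iterate the lanes
     *     return (first == 1 && sum == 10) ? 0 : 1;
     * }
     * \endcode
     */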
const_reverse_iterator { return const_reverse_iterator(end()); } template inline auto simd_batch::crend() const -> const_reverse_iterator { return const_reverse_iterator(begin()); } #define XSIMD_UNARY_OP(OP, FUNC) \ template \ inline batch_type_t operator OP(const simd_base& rhs) \ { \ using value_type = typename simd_batch_traits::value_type; \ using kernel = detail::batch_kernel::size>; \ return kernel::FUNC(rhs()); \ } #define XSIMD_BINARY_OP(OP, FUNC) \ template \ inline batch_type_t operator OP(const simd_base& lhs, const simd_base& rhs) \ { \ using value_type = typename simd_batch_traits::value_type; \ using kernel = detail::batch_kernel::size>; \ return kernel::FUNC(lhs(), rhs()); \ } \ \ template \ inline batch_type_t operator OP(const typename simd_batch_traits::value_type& lhs, \ const simd_base& rhs) \ { \ return batch_type_t(lhs) OP rhs(); \ } \ \ template \ inline batch_type_t operator OP(const simd_base& lhs, \ const typename simd_batch_traits::value_type& rhs) \ { \ return lhs() OP batch_type_t(rhs); \ } #define XSIMD_BINARY_BOOL_OP(OP, FUNC) \ template \ inline typename simd_batch_traits::batch_bool_type operator OP(const simd_base& lhs, \ const simd_base& rhs) \ { \ using value_type = typename simd_batch_traits::value_type; \ using kernel = detail::batch_kernel::size>; \ return kernel::FUNC(lhs(), rhs()); \ } \ \ template \ inline typename simd_batch_traits::batch_bool_type operator OP( \ const typename simd_batch_traits::value_type& lhs, const simd_base& rhs) \ { \ return batch_type_t(lhs) OP rhs(); \ } \ \ template \ inline typename simd_batch_traits::batch_bool_type operator OP( \ const simd_base& lhs, const typename simd_batch_traits::value_type& rhs) \ { \ return lhs() OP batch_type_t(rhs); \ } #define XSIMD_BINARY_BOOL_OP_DERIVED(OP, BASE_OP) \ template \ inline typename simd_batch_traits::batch_bool_type operator OP(const simd_base& lhs, \ const simd_base& rhs) \ { \ return rhs() BASE_OP lhs(); \ } \ \ template \ inline typename simd_batch_traits::batch_bool_type operator OP( \ const typename simd_batch_traits::value_type& lhs, const simd_base& rhs) \ { \ return rhs() BASE_OP batch_type_t(lhs); \ } \ \ template \ inline typename simd_batch_traits::batch_bool_type operator OP( \ const simd_base& lhs, const typename simd_batch_traits::value_type& rhs) \ { \ return batch_type_t(rhs) BASE_OP lhs(); \ } /** * @defgroup simd_batch_arithmetic Arithmetic operators */ /** * @ingroup simd_batch_arithmetic * * Computes the opposite of the batch \c rhs. * @tparam X the actual type of batch. * @param rhs batch involved in the operation. * @return the opposite of \c rhs. */ template inline batch_type_t operator-(const simd_base& rhs); XSIMD_UNARY_OP(-, neg) /** * @ingroup simd_batch_arithmetic * * No-op on \c rhs. * @tparam X the actual type of batch. * @param rhs batch involved in the operation. * @return \c rhs. */ template inline X operator+(const simd_batch& rhs) { return rhs(); } /** * @ingroup simd_batch_arithmetic * * Computes the sum of the batches \c lhs and \c rhs. * @tparam X the actual type of batch. * @param lhs batch involved in the addition. * @param rhs batch involved in the addition. * @return the result of the addition. */ template batch_type_t operator+(const simd_base& lhs, const simd_base& rhs); /** * @ingroup simd_batch_arithmetic * * Computes the sum of the batch \c lhs and the scalar \c rhs. Equivalent to the * sum of two batches where all the values of the second one are initialized to * \c rhs. * @tparam X the actual type of batch. 
* @param lhs batch involved in the addition. * @param rhs scalar involved in the addition. * @return the result of the addition. */ template batch_type_t operator+(const simd_base& lhs, const typename simd_batch_traits::value_type& rhs); /** * @ingroup simd_batch_arithmetic * * Computes the sum of the scalar \c lhs and the batch \c rhs. Equivalent to the * sum of two batches where all the values of the first one are initialized to * \c rhs. * @tparam X the actual type of batch. * @param lhs scalar involved in the addition. * @param rhs batch involved in the addition. * @return the result of the addition. */ template batch_type_t operator+(const typename simd_batch_traits::value_type& lhs, const simd_base& rhs); XSIMD_BINARY_OP(+, add) /** * @ingroup simd_batch_arithmetic * * Computes the saturate sum of the batch \c lhs and the batch \c rhs. * \c lhs. * @tparam X the actual type of batch. * @param lhs batch involved in the saturated addition. * @param rhs batch involved in the saturated addition. * @return the result of the saturated addition. */ template inline batch_type_t sadd(const simd_base& lhs, const simd_base& rhs) { using value_type = typename simd_batch_traits::value_type; using kernel = detail::batch_kernel::size>; return kernel::sadd(lhs(), rhs()); } /** * @ingroup simd_batch_arithmetic * * Computes the saturate sum of the scalar \c lhs and the batch \c rhs. Equivalent to the * saturated sum of two batches where all the values of the first one are initialized to * \c lhs. * @tparam X the actual type of batch. * @param lhs scalar involved in the saturated addition. * @param rhs batch involved in the saturated addition. * @return the result of the saturated addition. */ template inline batch_type_t sadd(const typename simd_batch_traits::value_type& lhs, const simd_base& rhs) { return sadd(batch_type_t(lhs),rhs()); } /** * @ingroup simd_batch_arithmetic * * Computes the saturate sum of the batch \c lhs and the scalar \c rhs. Equivalent to the * saturated sum of two batches where all the values of the second one are initialized to * \c rhs. * @tparam X the actual type of batch. * @param lhs batch involved in the saturated addition. * @param rhs scalar involved in the saturated addition. * @return the result of the saturated addition. */ template inline batch_type_t sadd(const simd_base& lhs, const typename simd_batch_traits::value_type& rhs) { return sadd(lhs(),batch_type_t(rhs)); } /** * @ingroup simd_batch_arithmetic * * Computes the difference of the batches \c lhs and \c rhs. * @tparam X the actual type of batch. * @param lhs batch involved in the difference. * @param rhs batch involved in the difference. * @return the result of the difference. */ template batch_type_t operator-(const simd_base& lhs, const simd_base& rhs); /** * @ingroup simd_batch_arithmetic * * Computes the difference of the batch \c lhs and the scalar \c rhs. Equivalent to the * difference of two batches where all the values of the second one are initialized to * \c rhs. * @tparam X the actual type of batch. * @param lhs batch involved in the difference. * @param rhs scalar involved in the difference. * @return the result of the difference. */ template batch_type_t operator-(const simd_base& lhs, const typename simd_batch_traits::value_type& rhs); /** * @ingroup simd_batch_arithmetic * * Computes the difference of the scalar \c lhs and the batch \c rhs. Equivalent to the * difference of two batches where all the values of the first one are initialized to * \c rhs. * @tparam X the actual type of batch. 
* @param lhs scalar involved in the difference. * @param rhs batch involved in the difference. * @return the result of the difference. */ template batch_type_t operator-(const typename simd_batch_traits::value_type& lhs, const simd_base& rhs); XSIMD_BINARY_OP(-, sub) /** * @ingroup simd_batch_arithmetic * * Computes the saturate difference of the batch \c lhs and the batch \c rhs. * \c lhs. * @tparam X the actual type of batch. * @param lhs batch involved in the saturated difference. * @param rhs batch involved in the saturated difference. * @return the result of the saturated difference. */ template inline batch_type_t ssub(const simd_base& lhs, const simd_base& rhs) { using value_type = typename simd_batch_traits::value_type; using kernel = detail::batch_kernel::size>; return kernel::ssub(lhs(), rhs()); } /** * @ingroup simd_batch_arithmetic * * Computes the saturate difference of the scalar \c lhs and the batch \c rhs. Equivalent to the * saturated sum of two batches where all the values of the first one are initialized to * \c lhs. * @tparam X the actual type of batch. * @param lhs scalar involved in the saturated difference. * @param rhs batch involved in the saturated difference. * @return the result of the saturated difference. */ template inline batch_type_t ssub(const typename simd_batch_traits::value_type& lhs, const simd_base& rhs) { return ssub(batch_type_t(lhs),rhs()); } /** * @ingroup simd_batch_arithmetic * * Computes the saturate difference of the batch \c lhs and the scalar \c rhs. Equivalent to the * saturated difference of two batches where all the values of the second one are initialized to * \c rhs. * @tparam X the actual type of batch. * @param lhs batch involved in the saturated difference. * @param rhs scalar involved in the saturated difference. * @return the result of the saturated difference. */ template inline batch_type_t ssub(const simd_base& lhs, const typename simd_batch_traits::value_type& rhs) { return ssub(lhs(),batch_type_t(rhs)); } /** * @ingroup simd_batch_arithmetic * * Computes the product of the batches \c lhs and \c rhs. * @tparam X the actual type of batch. * @param lhs batch involved in the product. * @param rhs batch involved in the product. * @return the result of the product. */ template batch_type_t operator*(const simd_base& lhs, const simd_base& rhs); /** * @ingroup simd_batch_arithmetic * * Computes the product of the batch \c lhs and the scalar \c rhs. Equivalent to the * product of two batches where all the values of the second one are initialized to * \c rhs. * @tparam X the actual type of batch. * @param lhs batch involved in the product. * @param rhs scalar involved in the product. * @return the result of the product. */ template batch_type_t operator*(const simd_base& lhs, const typename simd_batch_traits::value_type& rhs); /** * @ingroup simd_batch_arithmetic * * Computes the product of the scalar \c lhs and the batch \c rhs. Equivalent to the * difference of two batches where all the values of the first one are initialized to * \c rhs. * @tparam X the actual type of batch. * @param lhs scalar involved in the product. * @param rhs batch involved in the product. * @return the result of the product. */ template batch_type_t operator*(const typename simd_batch_traits::value_type& lhs, const simd_base& rhs); XSIMD_BINARY_OP(*, mul) /** * @ingroup simd_batch_arithmetic * * Computes the division of the batch \c lhs by the batch \c rhs. * @tparam X the actual type of batch. * @param lhs batch involved in the division. 
* @param rhs batch involved in the division. * @return the result of the division. */ template batch_type_t operator/(const simd_base& lhs, const simd_base& rhs); /** * @ingroup simd_batch_arithmetic * * Computes the division of the batch \c lhs by the scalar \c rhs. Equivalent to the * division of two batches where all the values of the second one are initialized to * \c rhs. * @tparam X the actual type of batch. * @param lhs batch involved in the division. * @param rhs scalar involved in the division. * @return the result of the division. */ template batch_type_t operator/(const simd_base& lhs, const typename simd_batch_traits::value_type& rhs); /** * @ingroup simd_batch_arithmetic * * Computes the division of the scalar \c lhs and the batch \c rhs. Equivalent to the * difference of two batches where all the values of the first one are initialized to * \c rhs. * @tparam X the actual type of batch. * @param lhs scalar involved in the division. * @param rhs batch involved in the division. * @return the result of the division. */ template batch_type_t operator/(const typename simd_batch_traits::value_type& lhs, const simd_base& rhs); XSIMD_BINARY_OP(/, div) /** * @ingroup simd_batch_arithmetic * * Computes the integer modulo of the batch \c lhs by the batch \c rhs. * @param lhs batch involved in the modulo. * @param rhs batch involved in the modulo. * @return the result of the modulo. */ template batch_type_t operator%(const simd_base& lhs, const simd_base& rhs); /** * @ingroup simd_batch_arithmetic * * Computes the integer modulo of the batch \c lhs by the scalar \c rhs. Equivalent to the * modulo of two batches where all the values of the second one are initialized to * \c rhs. * @tparam X the actual type of batch. * @param lhs batch involved in the modulo. * @param rhs scalar involved in the modulo. * @return the result of the modulo. */ template batch_type_t operator%(const simd_base& lhs, const typename simd_batch_traits::value_type& rhs); /** * @ingroup simd_batch_arithmetic * * Computes the integer modulo of the scalar \c lhs and the batch \c rhs. Equivalent to the * difference of two batches where all the values of the first one are initialized to * \c rhs. * @tparam X the actual type of batch. * @param lhs scalar involved in the modulo. * @param rhs batch involved in the modulo. * @return the result of the modulo. */ template batch_type_t operator%(const typename simd_batch_traits::value_type& lhs, const simd_base& rhs); XSIMD_BINARY_OP(%, mod) /** * @defgroup simd_batch_comparison Comparison operators */ /** * @ingroup simd_batch_comparison * * Element-wise equality comparison of batches \c lhs and \c rhs. * @param lhs batch involved in the comparison. * @param rhs batch involved in the comparison. * @return a boolean batch. */ template typename simd_batch_traits::batch_bool_type operator==(const simd_base& lhs, const simd_base& rhs); XSIMD_BINARY_BOOL_OP(==, eq) /** * @ingroup simd_batch_comparison * * Element-wise inequality comparison of batches \c lhs and \c rhs. * @param lhs batch involved in the comparison. * @param rhs batch involved in the comparison. * @return a boolean batch. */ template typename simd_batch_traits::batch_bool_type operator!=(const simd_base& lhs, const simd_base& rhs); XSIMD_BINARY_BOOL_OP(!=, neq) /** * @ingroup simd_batch_comparison * * Element-wise lesser than comparison of batches \c lhs and \c rhs. * @param lhs batch involved in the comparison. * @param rhs batch involved in the comparison. * @return a boolean batch. 
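     *
     * The boolean batch is typically fed to select() or reduced with any()/all()
     * (a sketch under the assumption that batch<float, 4> is available):
     * \code{.cpp}
     * xsimd::batch<float, 4> a(1.f, 5.f, 2.f, 8.f);
     * xsimd::batch<float, 4> b(4.f);
     * auto mask = a < b;                         // {true, false, true, false}
     * bool some = xsimd::any(mask);              // true
     * auto clamped = xsimd::select(mask, a, b);  // {1, 4, 2, 4}
     * \endcode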
*/ template typename simd_batch_traits::batch_bool_type operator<(const simd_base& lhs, const simd_base& rhs); XSIMD_BINARY_BOOL_OP(<, lt) /** * @ingroup simd_batch_comparison * * Element-wise lesser or equal to comparison of batches \c lhs and \c rhs. * @param lhs batch involved in the comparison. * @param rhs batch involved in the comparison. * @return a boolean batch. */ template typename simd_batch_traits::batch_bool_type operator<=(const simd_base& lhs, const simd_base& rhs); XSIMD_BINARY_BOOL_OP(<=, lte) /** * @ingroup simd_batch_comparison * * Element-wise greater than comparison of batches \c lhs and \c rhs. * @tparam X the actual type of batch. * @param lhs batch involved in the comparison. * @param rhs batch involved in the comparison. * @return a boolean batch. */ template typename simd_batch_traits::batch_bool_type operator>(const simd_base& lhs, const simd_base& rhs); XSIMD_BINARY_BOOL_OP_DERIVED(>, <) /** * @ingroup simd_batch_comparison * * Element-wise greater or equal comparison of batches \c lhs and \c rhs. * @tparam X the actual type of batch. * @param lhs batch involved in the comparison. * @param rhs batch involved in the comparison. * @return a boolean batch. */ template typename simd_batch_traits::batch_bool_type operator>=(const simd_base& lhs, const simd_base& rhs); XSIMD_BINARY_BOOL_OP_DERIVED(>=, <=) /** * @defgroup simd_batch_bitwise Bitwise operators */ /** * @ingroup simd_batch_bitwise * * Computes the bitwise and of the batches \c lhs and \c rhs. * @param lhs batch involved in the operation. * @param rhs batch involved in the operation. * @return the result of the bitwise and. */ template inline batch_type_t operator&(const simd_base& lhs, const simd_base& rhs); XSIMD_BINARY_OP(&, bitwise_and) /** * @ingroup simd_batch_bitwise * * Computes the bitwise or of the batches \c lhs and \c rhs. * @param lhs batch involved in the operation. * @param rhs batch involved in the operation. * @return the result of the bitwise or. */ template inline batch_type_t operator|(const simd_base& lhs, const simd_base& rhs); XSIMD_BINARY_OP(|, bitwise_or) /** * @ingroup simd_batch_bitwise * * Computes the bitwise xor of the batches \c lhs and \c rhs. * @param lhs batch involved in the operation. * @param rhs batch involved in the operation. * @return the result of the bitwise xor. */ template inline batch_type_t operator^(const simd_base& lhs, const simd_base& rhs); XSIMD_BINARY_OP(^, bitwise_xor) /** * @ingroup simd_batch_bitwise * * Computes the bitwise not of the batches \c lhs and \c rhs. * @param rhs batch involved in the operation. * @return the result of the bitwise not. */ template batch_type_t operator~(const simd_base& rhs); XSIMD_UNARY_OP(~, bitwise_not) /** * @ingroup simd_batch_bitwise * * Computes the bitwise andnot of the batches \c lhs and \c rhs. * @param lhs batch involved in the operation. * @param rhs batch involved in the operation. * @return the result of the bitwise andnot. */ template inline batch_type_t bitwise_andnot(const simd_batch& lhs, const simd_batch& rhs) { using value_type = typename simd_batch_traits::value_type; \ using kernel = detail::batch_kernel::size>; \ return kernel::bitwise_andnot(lhs(), rhs()); } /** * Element-wise not of \c rhs. * @tparam X the actual type of batch. * @param rhs batch involved in the logical not operation. * @return boolean batch. 
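     *
     * In other words, each lane is true exactly where the corresponding value
     * compares equal to zero (sketch, assuming batch<int32_t, 4> is available):
     * \code{.cpp}
     * xsimd::batch<int32_t, 4> b(0, 1, 0, 7);
     * auto z1 = !b;                                 // {true, false, true, false}
     * auto z2 = b == xsimd::batch<int32_t, 4>(0);   // same lanes
     * \endcode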
*/ template inline typename simd_batch_traits::batch_bool_type operator!(const simd_base& rhs) { using b_type = typename X::batch_type; using value_type = typename simd_batch_traits::value_type; return rhs() == b_type(value_type(0)); } /** * Returns the smaller values of the batches \c lhs and \c rhs. * @param lhs a batch of integer or floating point values. * @param rhs a batch of integer or floating point values. * @return a batch of the smaller values. */ template inline batch_type_t min(const simd_base& lhs, const simd_base& rhs) { using value_type = typename simd_batch_traits::value_type; using kernel = detail::batch_kernel::size>; return kernel::min(lhs(), rhs()); } /** * Returns the larger values of the batches \c lhs and \c rhs. * @param lhs a batch of integer or floating point values. * @param rhs a batch of integer or floating point values. * @return a batch of the larger values. */ template inline batch_type_t max(const simd_base& lhs, const simd_base& rhs) { using value_type = typename simd_batch_traits::value_type; using kernel = detail::batch_kernel::size>; return kernel::max(lhs(), rhs()); } /** * Returns the smaller values of the batches \c lhs and \c rhs. * @param lhs a batch of floating point values. * @param rhs a batch of floating point values. * @return a batch of the smaller values. */ template inline batch_type_t fmin(const simd_batch& lhs, const simd_batch& rhs) { using value_type = typename simd_batch_traits::value_type; using kernel = detail::batch_kernel::size>; return kernel::fmin(lhs(), rhs()); } /** * Returns the larger values of the batches \c lhs and \c rhs. * @param lhs a batch of floating point values. * @param rhs a batch of floating point values. * @return a batch of the larger values. */ template inline batch_type_t fmax(const simd_batch& lhs, const simd_batch& rhs) { using value_type = typename simd_batch_traits::value_type; using kernel = detail::batch_kernel::size>; return kernel::fmax(lhs(), rhs()); } /** * Computes the absolute values of each scalar in the batch \c rhs. * @param rhs batch of integer or floating point values. * @return the asbolute values of \c rhs. */ template inline real_batch_type_t abs(const simd_base& rhs) { using value_type = typename simd_batch_traits::value_type; using kernel = detail::batch_kernel::size>; return kernel::abs(rhs()); } /** * Computes the absolute values of each scalar in the batch \c rhs. * @param rhs batch floating point values. * @return the asbolute values of \c rhs. */ template inline batch_type_t fabs(const simd_base& rhs) { using value_type = typename simd_batch_traits::value_type; using kernel = detail::batch_kernel::size>; return kernel::fabs(rhs()); } /** * Computes the square root of the batch \c rhs. * @param rhs batch of floating point values. * @return the square root of \c rhs. */ template inline batch_type_t sqrt(const simd_base& rhs) { using value_type = typename simd_batch_traits::value_type; using kernel = detail::batch_kernel::size>; return kernel::sqrt(rhs()); } /** * Computes (x*y) + z in a single instruction when possible. * @param x a batch of integer or floating point values. * @param y a batch of integer or floating point values. * @param z a batch of integer or floating point values. * @return the result of the fused multiply-add operation. 
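     *
     * A one-line sketch (assuming batch<float, 4> is available):
     * \code{.cpp}
     * xsimd::batch<float, 4> x(1.f), y(2.f), z(3.f);
     * auto r = xsimd::fma(x, y, z);   // every lane holds 1 * 2 + 3 = 5
     * \endcode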
*/ template inline batch_type_t fma(const simd_base& x, const simd_base& y, const simd_base& z) { using value_type = typename simd_batch_traits::value_type; using kernel = detail::batch_kernel::size>; return kernel::fma(x(), y(), z()); } /** * Computes (x*y) - z in a single instruction when possible. * @param x a batch of integer or floating point values. * @param y a batch of integer or floating point values. * @param z a batch of integer or floating point values. * @return the result of the fused multiply-sub operation. */ template inline batch_type_t fms(const simd_base& x, const simd_base& y, const simd_base& z) { using value_type = typename simd_batch_traits::value_type; using kernel = detail::batch_kernel::size>; return kernel::fms(x(), y(), z()); } /** * Computes -(x*y) + z in a single instruction when possible. * @param x a batch of integer or floating point values. * @param y a batch of integer or floating point values. * @param z a batch of integer or floating point values. * @return the result of the fused negated multiply-add operation. */ template inline batch_type_t fnma(const simd_base& x, const simd_base& y, const simd_base& z) { using value_type = typename simd_batch_traits::value_type; using kernel = detail::batch_kernel::size>; return kernel::fnma(x(), y(), z()); } /** * Computes -(x*y) - z in a single instruction when possible. * @param x a batch of integer or floating point values. * @param y a batch of integer or floating point values. * @param z a batch of integer or floating point values. * @return the result of the fused negated multiply-sub operation. */ template inline batch_type_t fnms(const simd_base& x, const simd_base& y, const simd_base& z) { using value_type = typename simd_batch_traits::value_type; using kernel = detail::batch_kernel::size>; return kernel::fnms(x(), y(), z()); } /** * @defgroup simd_batch_reducers Reducers */ /** * @ingroup simd_batch_reducers * * Adds all the scalars of the batch \c rhs. * @param rhs batch involved in the reduction * @return the result of the reduction. */ template inline typename simd_batch_traits::value_type hadd(const simd_base& rhs) { using value_type = typename simd_batch_traits::value_type; using kernel = detail::batch_kernel::size>; return kernel::hadd(rhs()); } /** * @ingroup simd_batch_reducers * * Parallel horizontal addition: adds the scalars of each batch * in the array pointed by \c row and store them in a returned * batch. * @param row an array of \c N batches * @return the result of the reduction. */ template enable_if_simd_t haddp(const X* row) { using value_type = typename simd_batch_traits::value_type; using kernel = detail::batch_kernel::size>; return kernel::haddp(row); } /** * @defgroup simd_batch_miscellaneous Miscellaneous */ /** * @ingroup simd_batch_miscellaneous * * Ternary operator for batches: selects values from the batches \c a or \c b * depending on the boolean values in \c cond. Equivalent to * \code{.cpp} * for(std::size_t i = 0; i < N; ++i) * res[i] = cond[i] ? a[i] : b[i]; * \endcode * @param cond batch condition. * @param a batch values for truthy condition. * @param b batch value for falsy condition. * @return the result of the selection. 
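 *
 * A usage sketch complementing the loop above (batch width is an
 * illustrative assumption):
 * \code{.cpp}
 * xsimd::batch<float, 4> a(1.f, 2.f, 3.f, 4.f);
 * xsimd::batch<float, 4> b(2.5f);
 * auto r = xsimd::select(a > b, a, b);   // (2.5, 2.5, 3, 4): lane-wise max
 * \endcode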
*/ template inline batch_type_t select(const typename simd_batch_traits::batch_bool_type& cond, const simd_base& a, const simd_base& b) { using value_type = typename simd_batch_traits::value_type; using kernel = detail::batch_kernel::size>; return kernel::select(cond(), a(), b()); } /** * @ingroup simd_batch_miscellaneous * * Ternary operator for batches: selects values from the batches \c a or \c b * depending on the boolean values in the constant batch \c cond. Equivalent to * \code{.cpp} * for(std::size_t i = 0; i < N; ++i) * res[i] = cond[i] ? a[i] : b[i]; * \endcode * @param cond constant batch condition. * @param a batch values for truthy condition. * @param b batch value for falsy condition. * @return the result of the selection. */ template inline batch_type_t select(const batch_bool_constant::value_type, Masks...>& cond, const simd_base& a, const simd_base& b) { using value_type = typename simd_batch_traits::value_type; using kernel = detail::batch_kernel::size>; return kernel::select(cond, a(), b()); } /** * Unpack and interleave data from the LOW half of batches \c lhs and \c rhs. * Store the results in the Return value. * @param lhs a batch of integer or floating point or double precision values. * @param rhs a batch of integer or floating point or double precision values. * @return a batch of the low part of shuffled values. */ template inline batch_type_t zip_lo(const simd_base& lhs, const simd_base& rhs) { using value_type = typename simd_batch_traits::value_type; using kernel = detail::batch_kernel::size>; return kernel::zip_lo(lhs(), rhs()); } /** * Unpack and interleave data from the HIGH half of batches \c lhs and \c rhs. * Store the results in the Return value. * @param lhs a batch of integer or floating point or double precision values. * @param rhs a batch of integer or floating point or double precision values. * @return a batch of the high part of shuffled values. */ template inline batch_type_t zip_hi(const simd_base& lhs, const simd_base& rhs) { using value_type = typename simd_batch_traits::value_type; using kernel = detail::batch_kernel::size>; return kernel::zip_hi(lhs(), rhs()); } /** * Extract vector from pair of vectors * extracts the lowest vector elements from the second source \c rhs * and the highest vector elements from the first source \c lhs * The index: 'n' specifies the lowest vector element to extract from the first source register. * Concatenates the results into th Return value. * @param lhs a batch of integer or floating point or double precision values. * @param rhs a batch of integer or floating point or double precision values. * @return. */ template inline batch_type_t extract_pair(const simd_base& lhs, const simd_base& rhs, const int index) { using value_type = typename simd_batch_traits::value_type; using kernel = detail::batch_kernel::size>; return kernel::extract_pair(lhs(), rhs(), index); } /** * Determines if the scalars in the given batch \c x are NaN values. * @param x batch of floating point values. * @return a batch of booleans. */ template inline typename simd_batch_traits::batch_bool_type isnan(const simd_base& x) { using value_type = typename simd_batch_traits::value_type; using kernel = detail::batch_kernel::size>; return kernel::isnan(x()); } /** * Insert the batch \c rhs into the stream \c out. * @tparam X the actual type of batch. * @param out the output stream. * @param rhs the batch to output. * @return the output stream. 
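 *
 * For instance (illustrative batch width, output goes through the usual
 * iostream machinery):
 * \code{.cpp}
 * xsimd::batch<int32_t, 4> v(1, 2, 3, 4);
 * std::cout << v;                  // prints "(1, 2, 3, 4)"
 * \endcode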
*/ template inline std::ostream& operator<<(std::ostream& out, const simd_batch& rhs) { out << '('; std::size_t s = simd_batch::size; for (std::size_t i = 0; i < s - 1; ++i) { out << rhs()[i] << ", "; } out << rhs()[s - 1] << ')'; return out; } /****************************************** * generic batch operators implementation * ******************************************/ #define GENERIC_OPERATOR_IMPLEMENTATION(OP) \ using traits = simd_batch_traits>; \ constexpr std::size_t align = traits::align; \ alignas(align) T tmp_lhs[N]; \ alignas(align) T tmp_rhs[N]; \ alignas(align) T tmp_res[N]; \ lhs.store_aligned(tmp_lhs); \ rhs.store_aligned(tmp_rhs); \ for (std::size_t i = 0; i < traits::size; ++i) \ { \ tmp_res[i] = tmp_lhs[i] OP tmp_rhs[i]; \ } \ return batch(tmp_res, aligned_mode()) template inline batch operator&&(const batch& lhs, const batch& rhs) { GENERIC_OPERATOR_IMPLEMENTATION(&&); } template inline batch operator||(const batch& lhs, const batch& rhs) { GENERIC_OPERATOR_IMPLEMENTATION(||); } template inline batch operator<<(const batch& lhs, const batch& rhs) { GENERIC_OPERATOR_IMPLEMENTATION(<<); } template inline batch operator>>(const batch& lhs, const batch& rhs) { GENERIC_OPERATOR_IMPLEMENTATION(>>); } /***************************************** * batch cast functions implementation * *****************************************/ template inline batch batch_cast(const batch& x) { return batch_cast_impl::run(x); } /***************************************** * bitwise cast functions implementation * *****************************************/ template inline B bitwise_cast(const batch& x) { return bitwise_cast_impl, B>::run(x); } template inline B bitwise_cast(const batch& x) { return bitwise_cast_impl, B>::run(x); } template inline B bitwise_cast(const batch& x) { return bitwise_cast_impl, B>::run(x); } template inline B bitwise_cast(const batch& x) { return bitwise_cast_impl, B>::run(x); } template inline batch bitwise_cast(const batch_bool& src) { return batch(src.get_value()); } /*********************************** * Workaround for Clang on Windows * ***********************************/ #if defined(_WIN32) && defined(__clang__) /** * On Windows, the return type of fma is the promote type of its * arguments if they are integral or floating point types, float * otherwise. The implementation does not rely on SFINAE to * remove it from the overload resolution set when the argument * types are neither integral types nor floating point type. * * The fma overload defined xsimd accepts simd_base> * arguments, not batch. Thus a call to this latter is not * more specialized than a call to the STL overload, which is * considered. Since there is no mean to convert batch * to float for instance, this results in a compilation error. */ template inline batch fma(const batch& a, const batch& b, const batch& c) { using base_type = simd_base>; const base_type& sba = a; const base_type& sbb = b; const base_type& sbc = c; return fma(sba, sbb, sbc); } #endif } #endif xsimd-7.6.0/include/xsimd/types/xsimd_base_bool.hpp000066400000000000000000000450141410101234500224200ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_BASE_BOOL_HPP #define XSIMD_BASE_BOOL_HPP #include namespace xsimd { template class batch_bool; template struct simd_batch_traits; namespace detail { template struct batch_bool_kernel; } /************** * bool_proxy * **************/ template class bool_proxy { public: bool_proxy(T& ref); bool_proxy(const bool_proxy&) = default; bool_proxy& operator=(const bool_proxy&) = default; bool_proxy(bool_proxy&&) = default; bool_proxy& operator=(bool_proxy&&) = default; bool_proxy& operator=(bool rhs); operator bool() const; private: T& m_ref; }; /******************* * simd_batch_bool * *******************/ /** * @class simd_batch_bool * @brief Base class for batch of boolean values. * * The simd_batch_bool class is the base class for all classes representing * a batch of boolean values. Batch of boolean values is meant for operations * that may involve batches of integer or floating point values. Thus, * the boolean values are stored as integer or floating point values, and each * type of batch has its dedicated type of boolean batch. * * @tparam X The derived type * @sa simd_batch */ template class simd_batch_bool { public: using value_type = typename simd_batch_traits::value_type; static constexpr std::size_t size = simd_batch_traits::size; X& operator&=(const X& rhs); X& operator|=(const X& rhs); X& operator^=(const X& rhs); X& operator()(); const X& operator()() const; X& load_aligned(const bool* src); X& load_unaligned(const bool* src); void store_aligned(bool* dst) const; void store_unaligned(bool* dst) const; template X& load_aligned(const P& src); template X& load_unaligned(const P& src); template void store_aligned(P& dst) const; template void store_unaligned(P& dst) const; protected: simd_batch_bool() = default; ~simd_batch_bool() = default; simd_batch_bool(const simd_batch_bool&) = default; simd_batch_bool& operator=(const simd_batch_bool&) = default; simd_batch_bool(simd_batch_bool&&) = default; simd_batch_bool& operator=(simd_batch_bool&&) = default; private: template X& load_impl(detail::index_sequence, const P& src); template void store_impl(P& dst) const; }; template X operator&(const simd_batch_bool& lhs, const simd_batch_bool&rhs); template X operator|(const simd_batch_bool& lhs, const simd_batch_bool&rhs); template X operator^(const simd_batch_bool& lhs, const simd_batch_bool&rhs); template X operator~(const simd_batch_bool& rhs); template X bitwise_andnot(const simd_batch_bool& lhs, const simd_batch_bool&rhs); template X operator==(const simd_batch_bool& lhs, const simd_batch_bool&rhs); template X operator!=(const simd_batch_bool& lhs, const simd_batch_bool&rhs); template bool all(const simd_batch_bool& rhs); template bool any(const simd_batch_bool& rhs); template X operator&&(const simd_batch_bool& lhs, const simd_batch_bool& rhs); template X operator&&(const simd_batch_bool& lhs, bool rhs); template X operator&&(bool lhs, const simd_batch_bool& rhs); template X operator||(const simd_batch_bool& lhs, const simd_batch_bool& rhs); template X operator||(const simd_batch_bool& lhs, bool rhs); template X operator||(bool lhs, const simd_batch_bool& rhs); template X operator!(const simd_batch_bool& rhs); template std::ostream& operator<<(std::ostream& out, const simd_batch_bool& rhs); /***************************** * bool_proxy implementation * *****************************/ template inline bool_proxy::bool_proxy(T& ref) : m_ref(ref) { } template inline bool_proxy& bool_proxy::operator=(bool 
rhs) { m_ref = static_cast(rhs); return *this; } template inline bool_proxy::operator bool() const { return static_cast(m_ref); } /********************************** * simd_batch_bool implementation * **********************************/ /** * @name Bitwise computed assignement */ //@{ /** * Assigns the bitwise and of \c rhs and \c this. * @param rhs the batch involved in the operation. * @return a reference to \c this. */ template inline X& simd_batch_bool::operator&=(const X& rhs) { (*this)() = (*this)() & rhs; return (*this)(); } /** * Assigns the bitwise or of \c rhs and \c this. * @param rhs the batch involved in the operation. * @return a reference to \c this. */ template inline X& simd_batch_bool::operator|=(const X& rhs) { (*this)() = (*this)() | rhs; return (*this)(); } /** * Assigns the bitwise xor of \c rhs and \c this. * @param rhs the batch involved in the operation. * @return a reference to \c this. */ template inline X& simd_batch_bool::operator^=(const X& rhs) { (*this)() = (*this)() ^ rhs; return (*this)(); } //@} /** * @name Static downcast functions */ //@{ /** * Returns a reference to the actual derived type of the simd_batch_bool. */ template inline X& simd_batch_bool::operator()() { return *static_cast(this); } /** * Returns a constant reference to the actual derived type of the simd_batch_bool. */ template const X& simd_batch_bool::operator()() const { return *static_cast(this); } //@} template inline X& simd_batch_bool::load_aligned(const bool* src) { return load_impl(detail::make_index_sequence(), src); } template inline X& simd_batch_bool::load_unaligned(const bool* src) { return load_aligned(src); } template inline void simd_batch_bool::store_aligned(bool* dst) const { store_impl(dst); } template inline void simd_batch_bool::store_unaligned(bool* dst) const { store_impl(dst); } template template inline X& simd_batch_bool::load_aligned(const P& src) { return load_impl(detail::make_index_sequence(), src); } template template inline X& simd_batch_bool::load_unaligned(const P& src) { return load_aligned(src); } template template inline void simd_batch_bool::store_aligned(P& dst) const { store_impl(dst); } template template inline void simd_batch_bool::store_unaligned(P& dst) const { store_impl(dst); } template template inline X& simd_batch_bool::load_impl(detail::index_sequence, const P& src) { return (*this)().load_values(src[I]...); } template template inline void simd_batch_bool::store_impl(P& dst) const { for(std::size_t i = 0; i < size; ++i) { dst[i] = (*this)()[i]; } } /** * @defgroup simd_batch_bool_bitwise Bitwise functions */ /** * @ingroup simd_batch_bool_bitwise * * Computes the bitwise and of batches \c lhs and \c rhs. * @tparam X the actual type of boolean batch. * @param lhs batch involved in the operation. * @param rhs batch involved in the operation. * @return the result of the bitwise and. */ template inline X operator&(const simd_batch_bool& lhs, const simd_batch_bool&rhs) { using value_type = typename simd_batch_traits::value_type; using kernel = detail::batch_bool_kernel::size>; return kernel::bitwise_and(lhs(), rhs()); } /** * @ingroup simd_batch_bool_bitwise * * Computes the bitwise or of batches \c lhs and \c rhs. * @tparam X the actual type of boolean batch. * @param lhs batch involved in the operation. * @param rhs batch involved in the operation. * @return the result of the bitwise or. 
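 *
 * Boolean batches are typically produced by comparisons and then combined
 * bitwise; a sketch (illustrative batch width):
 * \code{.cpp}
 * xsimd::batch<float, 4> a(1.f, -2.f, 3.f, -4.f);
 * xsimd::batch<float, 4> lo(-3.f), hi(0.f);
 * auto mask = (a > hi) | (a < lo);   // (T, F, T, T)
 * \endcode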
*/ template inline X operator|(const simd_batch_bool& lhs, const simd_batch_bool&rhs) { using value_type = typename simd_batch_traits::value_type; using kernel = detail::batch_bool_kernel::size>; return kernel::bitwise_or(lhs(), rhs()); } /** * @ingroup simd_batch_bool_bitwise * * Computes the bitwise xor of batches \c lhs and \c rhs. * @tparam X the actual type of boolean batch. * @param lhs batch involved in the operation. * @param rhs batch involved in the operation. * @return the result of the bitwise xor. */ template inline X operator^(const simd_batch_bool& lhs, const simd_batch_bool&rhs) { using value_type = typename simd_batch_traits::value_type; using kernel = detail::batch_bool_kernel::size>; return kernel::bitwise_xor(lhs(), rhs()); } /** * @ingroup simd_batch_bool_bitwise * * Computes the bitwise not of batch \c rhs. * @tparam X the actual type of boolean batch. * @param rhs batch involved in the operation. * @return the result of the bitwise not. */ template inline X operator~(const simd_batch_bool& rhs) { using value_type = typename simd_batch_traits::value_type; using kernel = detail::batch_bool_kernel::size>; return kernel::bitwise_not(rhs()); } /** * @ingroup simd_batch_bool_bitwise * * Computes the bitwise and not of batches \c lhs and \c rhs. * @tparam X the actual type of boolean batch. * @param lhs batch involved in the operation. * @param rhs batch involved in the operation. * @return the result of the bitwise and not. */ template X bitwise_andnot(const simd_batch_bool& lhs, const simd_batch_bool& rhs) { using value_type = typename simd_batch_traits::value_type; using kernel = detail::batch_bool_kernel::size>; return kernel::bitwise_andnot(lhs(), rhs()); } /** * @defgroup simd_batch_bool_comparison Comparison operators */ /** * @ingroup simd_batch_bool_comparison * * Element-wise equality of batches \c lhs and \c rhs. * @param lhs batch involved in the comparison. * @param rhs batch involved in the comparison. * @return the result of the equality comparison. */ template inline X operator==(const simd_batch_bool& lhs, const simd_batch_bool&rhs) { using value_type = typename simd_batch_traits::value_type; using kernel = detail::batch_bool_kernel::size>; return kernel::equal(lhs(), rhs()); } /** * @ingroup simd_batch_bool_comparison * * Element-wise inequality of batches \c lhs and \c rhs. * @param lhs batch involved in the comparison. * @param rhs batch involved in the comparison. * @return the result of the inequality comparison. */ template inline X operator!=(const simd_batch_bool& lhs, const simd_batch_bool&rhs) { using value_type = typename simd_batch_traits::value_type; using kernel = detail::batch_bool_kernel::size>; return kernel::not_equal(lhs(), rhs()); } /** * @defgroup simd_batch_bool_reducers Reducers */ /** * @ingroup simd_batch_bool_reducers * * Returns true if all the boolean values in the batch are true, * false otherwise. * @param rhs the batch to reduce. * @return a boolean scalar. */ template inline bool all(const simd_batch_bool& rhs) { using value_type = typename simd_batch_traits::value_type; using kernel = detail::batch_bool_kernel::size>; return kernel::all(rhs()); } /** * @ingroup simd_batch_bool_reducers * * Return true if any of the boolean values in the batch is true, * false otherwise. * @param rhs the batch to reduce. * @return a boolean scalar. 
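 *
 * Reducer sketch (illustrative batch width):
 * \code{.cpp}
 * xsimd::batch<double, 2> x(1.0, -1.0);
 * xsimd::batch<double, 2> zero(0.0);
 * bool some_negative = xsimd::any(x < zero);   // true
 * bool all_negative  = xsimd::all(x < zero);   // false
 * \endcode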
*/ template inline bool any(const simd_batch_bool& rhs) { using value_type = typename simd_batch_traits::value_type; using kernel = detail::batch_bool_kernel::size>; return kernel::any(rhs()); } /** * @defgroup simd_batch_bool_logical Logical functions */ /** * @ingroup simd_batch_bool_logical * * Computes the logical and of batches \c lhs and \c rhs. * @tparam X the actual type of boolean batch. * @param lhs batch involved in the operation. * @param rhs batch involved in the operation. * @return the result of the logical and. */ template inline X operator&&(const simd_batch_bool& lhs, const simd_batch_bool& rhs) { return lhs() & rhs(); } /** * @ingroup simd_batch_bool_logical * * Computes the logical and of the batch \c lhs and the scalar \c rhs. * Equivalent to the logical and of two boolean batches, where all the * values of the second one are initialized to \c rhs. * @tparam X the actual type of boolean batch. * @param lhs batch involved in the operation. * @param rhs boolean involved in the operation. * @return the result of the logical and. */ template inline X operator&&(const simd_batch_bool& lhs, bool rhs) { return lhs() & X(rhs); } /** * @ingroup simd_batch_bool_logical * * Computes the logical and of the scalar \c lhs and the batch \c rhs. * Equivalent to the logical and of two boolean batches, where all the * values of the first one are initialized to \c lhs. * @tparam X the actual type of boolean batch. * @param lhs boolean involved in the operation. * @param rhs batch involved in the operation. * @return the result of the logical and. */ template inline X operator&&(bool lhs, const simd_batch_bool& rhs) { return X(lhs) & rhs(); } /** * @ingroup simd_batch_bool_logical * * Computes the logical or of batches \c lhs and \c rhs. * @tparam X the actual type of boolean batch. * @param lhs batch involved in the operation. * @param rhs batch involved in the operation. * @return the result of the logical or. */ template inline X operator||(const simd_batch_bool& lhs, const simd_batch_bool& rhs) { return lhs() | rhs(); } /** * @ingroup simd_batch_bool_logical * * Computes the logical or of the batch \c lhs and the scalar \c rhs. * Equivalent to the logical or of two boolean batches, where all the * values of the second one are initialized to \c rhs. * @tparam X the actual type of boolean batch. * @param lhs batch involved in the operation. * @param rhs boolean involved in the operation. * @return the result of the logical or. */ template inline X operator||(const simd_batch_bool& lhs, bool rhs) { return lhs() | X(rhs); } /** * @ingroup simd_batch_bool_logical * * Computes the logical or of the scalar \c lhs and the batch \c rhs. * Equivalent to the logical or of two boolean batches, where all the * values of the first one are initialized to \c lhs. * @tparam X the actual type of boolean batch. * @param lhs boolean involved in the operation. * @param rhs batch involved in the operation. * @return the result of the logical or. */ template inline X operator||(bool lhs, const simd_batch_bool& rhs) { return X(lhs) | rhs(); } /* * @ingroup simd_batch_bool_logical * * Computes the logical not of \c rhs. * @tparam X the actual type of boolean batch. * @param rhs batch involved in the operation. * @return the result og the logical not. */ template inline X operator!(const simd_batch_bool& rhs) { return rhs() == X(false); } /** * Insert the batch \c rhs into the stream \c out. * @tparam X the actual type of batch. * @param out the output stream. * @param rhs the batch to output. 
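 *
 * A short sketch combining the logical operators above with the stream
 * output below (illustrative batch width):
 * \code{.cpp}
 * xsimd::batch_bool<float, 4> m(true, false, true, false);
 * bool keep = xsimd::any(m && true);   // logical and with a scalar, then reduce
 * std::cout << m;                      // prints "(T, F, T, F)"
 * \endcode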
* @return the output stream. */ template inline std::ostream& operator<<(std::ostream& out, const simd_batch_bool& rhs) { out << '('; std::size_t s = simd_batch_bool::size; for (std::size_t i = 0; i < s - 1; ++i) { out << (rhs()[i] ? 'T' : 'F') << ", "; } out << (rhs()[s - 1] ? 'T' : 'F') << ')'; return out; } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_base_constant.hpp000066400000000000000000000072501410101234500233160ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Serge Guelton * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_BASE_CONSTANT_HPP #define XSIMD_BASE_CONSTANT_HPP namespace xsimd { template class simd_base; template struct batch_bool_constant { static constexpr std::size_t size = sizeof...(Values); using value_type = bool; using batch_type = typename simd_batch_traits>::batch_bool_type; batch_type operator()() const { return *this; } operator batch_type() const { return {Values...}; } bool operator[](size_t i) const { return std::array{{Values...}}[i]; } static constexpr int mask() { return mask_helper(0, static_cast(Values)...); } private: static constexpr int mask_helper(int acc) { return acc; } template static constexpr int mask_helper(int acc, int mask, Tys... masks) { return mask_helper(acc | mask, (masks << 1)...); } }; template struct batch_constant { static constexpr std::size_t size = sizeof...(Values); using value_type = T; using batch_type = batch; batch_type operator()() const { return *this; } operator batch_type() const { return {Values...}; } constexpr T operator[](size_t i) const { return std::array{Values...}[i]; } }; namespace detail { template constexpr auto make_batch_constant(detail::index_sequence) -> batch_constant { return {}; } template constexpr auto make_batch_bool_constant(detail::index_sequence) -> batch_bool_constant { return {}; } template constexpr auto make_batch_constant(detail::index_sequence) -> batch_constant { return {}; } template constexpr auto make_batch_bool_constant(detail::index_sequence) -> batch_bool_constant { return {}; } } // namespace detail template constexpr auto make_batch_constant() -> decltype( detail::make_batch_constant(detail::make_index_sequence())) { return detail::make_batch_constant(detail::make_index_sequence()); } template constexpr auto make_batch_bool_constant() -> decltype(detail::make_batch_bool_constant( detail::make_index_sequence())) { return detail::make_batch_bool_constant( detail::make_index_sequence()); } } // namespace xsimd #endif xsimd-7.6.0/include/xsimd/types/xsimd_common_math.hpp000066400000000000000000000063231410101234500227740ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_COMMON_MATH_HPP #define XSIMD_COMMON_MATH_HPP #include #include namespace xsimd { /********************************************* * Some utility math operations shared * * across scalar versio and fallback * * versions * *********************************************/ namespace detail { template inline T0 ipow(const T0& t0, const T1& t1) { static_assert(std::is_integral::value, "second argument must be an integer"); T0 a = t0; T1 b = t1; bool const recip = b < 0; T0 r{static_cast(1)}; while (1) { if (b & 1) { r *= a; } b /= 2; if (b == 0) { break; } a *= a; } return recip ? 1 / r : r; } template::value>::type> T sadd(const T& lhs, const T& rhs) { if (std::numeric_limits::is_signed) { if ((lhs > 0) && (rhs > std::numeric_limits::max() - lhs)) { return std::numeric_limits::max(); } else if ((lhs < 0) && (rhs < std::numeric_limits::lowest() - lhs)) { return std::numeric_limits::lowest(); } else { return lhs + rhs; } } else { if (rhs > std::numeric_limits::max() - lhs) { return std::numeric_limits::max(); } else { return lhs + rhs; } } } template::value>::type> T ssub(const T& lhs, const T& rhs) { if (std::numeric_limits::is_signed) { return sadd(lhs, (T)-rhs); } else { if (lhs < rhs) { return std::numeric_limits::lowest(); } else { return lhs - rhs; } } } } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_complex_base.hpp000066400000000000000000001441031410101234500231330ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_COMPLEX_BASE_HPP #define XSIMD_COMPLEX_BASE_HPP #include #include #include #include #include #ifdef XSIMD_ENABLE_XTL_COMPLEX #include "xtl/xcomplex.hpp" #endif #include "xsimd_base.hpp" #include "xsimd_utils.hpp" namespace xsimd { /***************************** * complex_batch_bool_traits * *****************************/ template struct complex_batch_bool_traits { using value_type = C; static constexpr std::size_t size = N; using batch_type = batch; static constexpr std::size_t align = Align; using real_batch = batch_bool; }; /*************************** * simd_complex_batch_bool * ***************************/ /** * @class simd_complex_batch_bool * @brief Base class for complex batch of boolean values. * * The simd_complex_batch_bool class is the base class for all classes representing * a complex batch of boolean values. Complex batch of boolean values is meant for operations * that may involve batches of complex numbers. Thus, the boolean values are stored as floating * point values, and each type of batch of complex has its dedicated type of boolean batch. 
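 *
 * For instance, comparing two complex batches yields such a boolean batch,
 * which can then be reduced (the 4-wide batch is an illustrative assumption):
 * \code{.cpp}
 * using cbatch = xsimd::batch<std::complex<float>, 4>;
 * cbatch a(std::complex<float>(1.f, 2.f));   // broadcast 1 + 2i
 * bool same = xsimd::all(a == a);            // every lane compares equal
 * \endcode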
* * @tparam X The derived type * @sa simd_complex_batch */ template class simd_complex_batch_bool : public simd_batch_bool { public: using value_type = typename simd_batch_traits::value_type; static constexpr std::size_t size = simd_batch_traits::size; using real_batch = typename simd_batch_traits::real_batch; simd_complex_batch_bool() = default; simd_complex_batch_bool(bool b); simd_complex_batch_bool(const real_batch& b); const real_batch& value() const; bool operator[](std::size_t index) const; private: real_batch m_value; }; /************************ * complex_batch_traits * ************************/ template struct complex_batch_traits { using value_type = C; static constexpr std::size_t size = N; using batch_bool_type = batch_bool; static constexpr std::size_t align = Align; using real_batch = batch; }; /********************** * simd_complex_batch * **********************/ template struct is_ieee_compliant; template struct is_ieee_compliant> : std::integral_constant>::is_iec559> { }; #ifdef XSIMD_ENABLE_XTL_COMPLEX template struct is_ieee_compliant> : std::false_type { }; #endif /** * @class simd_complex_batch * @brief Base class for batch complex numbers. * * The simd_complex_batch class is the base class for all classes representing * a batch of complex numbers. Each type of batch (i.e. a class inheriting from * simd_complex_batch) has its dedicated type of boolean batch (i.e. a class * inheriting from simd_complex_batch_bool) for logical operations. * * Internally, a batch of complex numbers holds two batches of real numbers, one * for the real part and one for the imaginary part. * * @tparam X The derived type * @sa simd_complex_batch_bool */ template class simd_complex_batch : public simd_base { public: using base_type = simd_base; using batch_reference = typename base_type::batch_reference; using const_batch_reference = typename base_type::const_batch_reference; using batch_type = X; using value_type = typename simd_batch_traits::value_type; static constexpr std::size_t size = simd_batch_traits::size; using real_batch = typename simd_batch_traits::real_batch; using real_value_type = typename value_type::value_type; simd_complex_batch() = default; explicit simd_complex_batch(const value_type& v); explicit simd_complex_batch(const real_value_type& v); explicit simd_complex_batch(const real_batch& re); explicit simd_complex_batch(const real_value_type* v) : simd_complex_batch(real_batch(v)) {} simd_complex_batch(const real_batch& re, const real_batch& im); simd_complex_batch(const real_value_type* re, const real_value_type* im) : simd_complex_batch(real_batch(re), real_batch(im)) {} real_batch& real(); real_batch& imag(); const real_batch& real() const; const real_batch& imag() const; X& operator+=(const X& rhs); X& operator+=(const value_type& rhs); X& operator+=(const real_batch& rhs); X& operator+=(const real_value_type& rhs); X& operator-=(const X& rhs); X& operator-=(const value_type& rhs); X& operator-=(const real_batch& rhs); X& operator-=(const real_value_type& rhs); X& operator*=(const X& rhs); X& operator*=(const value_type& rhs); X& operator*=(const real_batch& rhs); X& operator*=(const real_value_type& rhs); X& operator/=(const X& rhs); X& operator/=(const value_type& rhs); X& operator/=(const real_batch& rhs); X& operator/=(const real_value_type& rhs); template X& load_aligned(const T* real_src, const T* imag_src); template X& load_unaligned(const T* real_src, const T* imag_src); template void store_aligned(T* real_dst, T* imag_dst) const; template void 
store_unaligned(T* real_dst, T* imag_dst) const; template typename std::enable_if::value, X&>::type load_aligned(const T* src); template typename std::enable_if::value, X&>::type load_unaligned(const T* src); template typename std::enable_if::value, X&>::type load_aligned(const T* src); template typename std::enable_if::value, X&>::type load_unaligned(const T* src); template inline typename std::enable_if::value, void>::type store_aligned(T* dst) const; template inline typename std::enable_if::value, void>::type store_unaligned(T* dst) const; template inline typename std::enable_if::value, void>::type store_aligned(T* dst) const; template inline typename std::enable_if::value, void>::type store_unaligned(T* dst) const; value_type operator[](std::size_t index) const; batch_reference get(); const_batch_reference get() const; protected: real_batch m_real; real_batch m_imag; }; template X operator+(const simd_complex_batch& rhs); template X operator-(const simd_complex_batch& rhs); template X operator+(const simd_complex_batch& lhs, const simd_complex_batch& rhs); template X operator+(const simd_complex_batch& lhs, const typename simd_batch_traits::value_type& rhs); template X operator+(const typename simd_batch_traits::value_type& lhs, const simd_complex_batch& rhs); template X operator+(const simd_complex_batch& lhs, const typename simd_batch_traits::real_batch& rhs); template X operator+(const typename simd_batch_traits::real_batch& lhs, const simd_complex_batch& rhs); template X operator+(const simd_complex_batch& lhs, const typename simd_batch_traits::real_value_type& rhs); template X operator+(const typename simd_batch_traits::real_value_type& lhs, const simd_complex_batch& rhs); template X operator-(const simd_complex_batch& lhs, const simd_complex_batch& rhs); template X operator-(const simd_complex_batch& lhs, const typename simd_batch_traits::value_type& rhs); template X operator-(const typename simd_batch_traits::value_type& lhs, const simd_complex_batch& rhs); template X operator-(const simd_complex_batch& lhs, const typename simd_batch_traits::real_batch& rhs); template X operator-(const typename simd_batch_traits::real_batch& lhs, const simd_complex_batch& rhs); template X operator-(const simd_complex_batch& lhs, const typename simd_batch_traits::real_value_type& rhs); template X operator-(const typename simd_batch_traits::real_value_type& lhs, const simd_complex_batch& rhs); template X operator*(const simd_complex_batch& lhs, const simd_complex_batch& rhs); template X operator*(const simd_complex_batch& lhs, const typename simd_batch_traits::value_type& rhs); template X operator*(const typename simd_batch_traits::value_type& lhs, const simd_complex_batch& rhs); template X operator*(const simd_complex_batch& lhs, const typename simd_batch_traits::real_batch& rhs); template X operator*(const typename simd_batch_traits::real_batch& lhs, const simd_complex_batch& rhs); template X operator*(const simd_complex_batch& lhs, const typename simd_batch_traits::real_value_type& rhs); template X operator*(const typename simd_batch_traits::real_value_type& lhs, const simd_complex_batch& rhs); template X operator/(const simd_complex_batch& lhs, const simd_complex_batch& rhs); template X operator/(const simd_complex_batch& lhs, const typename simd_batch_traits::value_type& rhs); template X operator/(const typename simd_batch_traits::value_type& lhs, const simd_complex_batch& rhs); template X operator/(const simd_complex_batch& lhs, const typename simd_batch_traits::real_batch& rhs); template X 
operator/(const typename simd_batch_traits::real_batch& lhs, const simd_complex_batch& rhs); template X operator/(const simd_complex_batch& lhs, const typename simd_batch_traits::real_value_type& rhs); template X operator/(const typename simd_batch_traits::real_value_type& lhs, const simd_complex_batch& rhs); template typename simd_batch_traits::value_type hadd(const simd_complex_batch& rhs); template X select(const typename simd_batch_traits::batch_bool_type& cond, const simd_complex_batch& a, const simd_complex_batch& b); template typename simd_batch_traits::batch_bool_type operator==(const simd_complex_batch& lhs, const simd_complex_batch& rhs); template typename simd_batch_traits::batch_bool_type operator!=(const simd_complex_batch& lhs, const simd_complex_batch& rhs); template std::ostream& operator<<(std::ostream& out, const simd_complex_batch& rhs); /******************************************* * xsimd_complex_batch_bool implementation * *******************************************/ /** * Initializes all the values of the batch to \c b */ template inline simd_complex_batch_bool::simd_complex_batch_bool(bool b) : m_value(b) { } /** * Initializes the values of the batch with those of the real batch \c b. * A real batch contains scalars whose type is the \c value_type of * the complex number type. */ template inline simd_complex_batch_bool::simd_complex_batch_bool(const real_batch& b) : m_value(b) { } template inline auto simd_complex_batch_bool::value() const -> const real_batch& { return m_value; } template inline bool simd_complex_batch_bool::operator[](std::size_t index) const { return m_value[index]; } namespace detail { template struct batch_bool_complex_kernel { using batch_type = batch_bool; static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) { return lhs.value() & rhs.value(); } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { return lhs.value() | rhs.value(); } static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { return lhs.value() ^ rhs.value(); } static batch_type bitwise_not(const batch_type& rhs) { return ~(rhs.value()); } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { return xsimd::bitwise_andnot(lhs.value(), rhs.value()); } static batch_type equal(const batch_type& lhs, const batch_type& rhs) { return lhs.value() == rhs.value(); } static batch_type not_equal(const batch_type& lhs, const batch_type& rhs) { return lhs.value() != rhs.value(); } static bool all(const batch_type& rhs) { return xsimd::all(rhs.value()); } static bool any(const batch_type& rhs) { return xsimd::any(rhs.value()); } }; template struct batch_bool_kernel, N> : batch_bool_complex_kernel, N> { }; #ifdef XSIMD_ENABLE_XTL_COMPLEX template struct batch_bool_kernel, N> : batch_bool_complex_kernel, N> { }; #endif } /************************************** * xsimd_complex_batch implementation * **************************************/ /** * Initializes all the values of the batch to the complex value \c v. */ template inline simd_complex_batch::simd_complex_batch(const value_type& v) : m_real(v.real()), m_imag(v.imag()) { } /** * Initializes all the values of the batch to the real value \c v. */ template inline simd_complex_batch::simd_complex_batch(const real_value_type& v) : m_real(v), m_imag(real_value_type(0)) { } /** * Initializes the values of the batch whith those of the real batch \c re. * Imaginary parts are set to 0. 
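 *
 * Construction sketch (illustrative batch width):
 * \code{.cpp}
 * xsimd::batch<float, 4> re(1.f), im(2.f);
 * xsimd::batch<std::complex<float>, 4> c(re, im);   // every lane is 1 + 2i
 * xsimd::batch<std::complex<float>, 4> r(re);       // imaginary parts are 0
 * \endcode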
*/ template inline simd_complex_batch::simd_complex_batch(const real_batch& re) : m_real(re), m_imag(real_value_type(0)) { } /** * Initializes the batch with two real batch, one for the real part and one for the inamginary * part. */ template inline simd_complex_batch::simd_complex_batch(const real_batch& re, const real_batch& im) : m_real(re), m_imag(im) { } /** * Returns a batch for the real part. */ template inline auto simd_complex_batch::real() -> real_batch& { return m_real; } /** * Returns a batch for the imaginary part. */ template inline auto simd_complex_batch::imag() -> real_batch& { return m_imag; } /** * Returns a const batch for the real part */ template inline auto simd_complex_batch::real() const -> const real_batch& { return m_real; } /** * Returns a const batch for the imaginary part. */ template inline auto simd_complex_batch::imag() const -> const real_batch& { return m_imag; } /** * @name Arithmetic computed assignment */ //@{ /** * Adds the batch \c rhs to \c this. * @param rhs the batch to add. * @return a reference to \c this. */ template inline X& simd_complex_batch::operator+=(const X& rhs) { m_real += rhs.real(); m_imag += rhs.imag(); return (*this)(); } /** * Adds the scalar \c rhs to each value contained in \c this. * @param rhs the scalar to add. * @return a reference to \c this. */ template inline X& simd_complex_batch::operator+=(const value_type& rhs) { return (*this)() += X(rhs); } /** * Adds the real batch \c rhs to \c this. * @param rhs the real batch to add. * @return a reference to \c this. */ template inline X& simd_complex_batch::operator+=(const real_batch& rhs) { m_real += rhs; return (*this)(); } /** * Adds the real scalar \c rhs to each value contained in \c this. * @param rhs the real scalar to add. * @return a reference to \c this. */ template inline X& simd_complex_batch::operator+=(const real_value_type& rhs) { return (*this)() += real_batch(rhs); } /** * Substracts the batch \c rhs to \c this. * @param rhs the batch to substract. * @return a reference to \c this. */ template inline X& simd_complex_batch::operator-=(const X& rhs) { m_real -= rhs.real(); m_imag -= rhs.imag(); return (*this)(); } /** * Substracts the scalar \c rhs to each value contained in \c this. * @param rhs the scalar to substract. * @return a reference to \c this. */ template inline X& simd_complex_batch::operator-=(const value_type& rhs) { return (*this)() -= X(rhs); } /** * Substracts the real batch \c rhs to \c this. * @param rhs the batch to substract. * @return a reference to \c this. */ template inline X& simd_complex_batch::operator-=(const real_batch& rhs) { m_real -= rhs; return (*this)(); } /** * Substracts the real scalar \c rhs to each value contained in \c this. * @param rhs the real scalar to substract. * @return a reference to \c this. */ template inline X& simd_complex_batch::operator-=(const real_value_type& rhs) { return (*this)() -= real_batch(rhs); } namespace detail { template struct complex_batch_multiplier { using real_batch = typename simd_batch_traits::real_batch; inline static X mul(const X& lhs, const X& rhs) { real_batch a = lhs.real(); real_batch b = lhs.imag(); real_batch c = rhs.real(); real_batch d = rhs.imag(); return X(a*c - b*d, a*d + b*c); } inline static X div(const X& lhs, const X& rhs) { real_batch a = lhs.real(); real_batch b = lhs.imag(); real_batch c = rhs.real(); real_batch d = rhs.imag(); real_batch e = c*c + d*d; return X((c*a + d*b) / e, (c*b - d*a) / e); } }; } /** * Multiplies \c this with the batch \c rhs. 
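 *
 * The product applies the usual rule (a + ib)(c + id) = (ac - bd) + i(ad + bc)
 * lane-wise, e.g. (illustrative batch width):
 * \code{.cpp}
 * xsimd::batch<std::complex<float>, 4> x(std::complex<float>(1.f, 2.f));
 * xsimd::batch<std::complex<float>, 4> y(std::complex<float>(3.f, 4.f));
 * x *= y;   // every lane now holds -5 + 10i
 * \endcode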
* @param rhs the batch involved in the multiplication. * @return a reference to \c this. */ template inline X& simd_complex_batch::operator*=(const X& rhs) { using kernel = detail::complex_batch_multiplier::value>; (*this)() = kernel::mul((*this)(), rhs); return (*this)(); } /** * Multiplies each scalar contained in \c this with the scalar \c rhs. * @param rhs the scalar involved in the multiplication. * @return a reference to \c this. */ template inline X& simd_complex_batch::operator*=(const value_type& rhs) { return (*this)() *= X(rhs); } /** * Multiplies \c this with the real batch \c rhs. * @param rhs the real batch involved in the multiplication. * @return a reference to \c this. */ template inline X& simd_complex_batch::operator*=(const real_batch& rhs) { m_real *= rhs; m_imag *= rhs; return (*this)(); } /** * Multiplies each scalar contained in \c this with the real scalar \c rhs. * @param rhs the real scalar involved in the multiplication. * @return a reference to \c this. */ template inline X& simd_complex_batch::operator*=(const real_value_type& rhs) { return (*this)() *= real_batch(rhs); } /** * Divides \c this by the batch \c rhs. * @param rhs the batch involved in the division. * @return a reference to \c this. */ template inline X& simd_complex_batch::operator/=(const X& rhs) { using kernel = detail::complex_batch_multiplier::value>; (*this)() = kernel::div((*this)(), rhs); return (*this)(); } /** * Divides each scalar contained in \c this by the scalar \c rhs. * @param rhs the scalar involved in the division. * @return a reference to \c this. */ template inline X& simd_complex_batch::operator/=(const value_type& rhs) { return (*this)() /= X(rhs); } /** * Divides \c this by the real batch \c rhs. * @param rhs the real batch involved in the division. * @return a reference to \c this. */ template inline X& simd_complex_batch::operator/=(const real_batch& rhs) { m_real /= rhs; m_imag /= rhs; return (*this)(); } /** * Divides each scalar contained in \c this by the real scalar \c rhs. * @param rhs the real scalar involved in the division. * @return a reference to \c this. */ template inline X& simd_complex_batch::operator/=(const real_value_type& rhs) { return (*this)() /= real_batch(rhs); } //@} /** * @name Load and store methods */ //@{ /** * Loads the N contiguous values pointed by \c real_src into the batch holding * the real values, and N contiguous values pointed by \c imag_src into the * batch holding the imaginary values. * \c real_src and \c imag_src must be aligned. */ template template inline X& simd_complex_batch::load_aligned(const T* real_src, const T* imag_src) { m_real.load_aligned(real_src); m_imag.load_aligned(imag_src); return (*this)(); } /** * Loads the N contiguous values pointed by \c real_src into the batch holding * the real values, and N contiguous values pointed by \c imag_src into the * batch holding the imaginary values. * \c real_src and \c imag_src are not required to be aligned. */ template template inline X& simd_complex_batch::load_unaligned(const T* real_src, const T* imag_src) { m_real.load_unaligned(real_src); m_imag.load_unaligned(imag_src); return (*this)(); } /** * Stores the N values of the batch holding the real values into a contiguous array * pointed by \c real_dst., and the N values of the batch holding the imaginary values * into a contiguous array pointer by \c imag_dst. * \c real_dst and \c imag_dst must be aligned. 
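 *
 * Split (planar) storage sketch; the 4-wide batch and the use of
 * XSIMD_DEFAULT_ALIGNMENT for the buffers are illustrative assumptions:
 * \code{.cpp}
 * alignas(XSIMD_DEFAULT_ALIGNMENT) float re[4];
 * alignas(XSIMD_DEFAULT_ALIGNMENT) float im[4];
 * xsimd::batch<std::complex<float>, 4> c(std::complex<float>(1.f, 2.f));
 * c.store_aligned(re, im);   // re = {1, 1, 1, 1}, im = {2, 2, 2, 2}
 * \endcode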
*/ template template inline void simd_complex_batch::store_aligned(T* real_dst, T* imag_dst) const { m_real.store_aligned(real_dst); m_imag.store_aligned(imag_dst); } /** * Stores the N values of the batch holding the real values into a contiguous array * pointed by \c real_dst., and the N values of the batch holding the imaginary values * into a contiguous array pointer by \c imag_dst. * \c real_dst and \c imag_dst are not required to be aligned. */ template template inline void simd_complex_batch::store_unaligned(T* real_dst, T* imag_dst) const { m_real.store_unaligned(real_dst); m_imag.store_unaligned(imag_dst); } /** * Loads the N contiguous values pointed by \c src into the batch. * \c src must be aligned. */ template template inline typename std::enable_if::value, X&>::type simd_complex_batch::load_aligned(const T* src) { using tmp_value_type = typename T::value_type; const tmp_value_type* rbuf = reinterpret_cast(src); real_batch hi, lo; hi.load_aligned(rbuf); lo.load_aligned(rbuf + size); return (*this)().load_complex(hi, lo); } /** * Loads the N contiguous values pointed by \c src into the batch. * \c src is not required to be aligned. */ template template inline typename std::enable_if::value, X&>::type simd_complex_batch::load_unaligned(const T* src) { using tmp_value_type = typename T::value_type; const tmp_value_type* rbuf = reinterpret_cast(src); real_batch hi, lo; hi.load_unaligned(rbuf); lo.load_unaligned(rbuf + size); return (*this)().load_complex(hi, lo); } ///@cond DOXYGEN_INCLUDE_SFINAE template template inline typename std::enable_if::value, X&>::type simd_complex_batch::load_aligned(const T* src) { m_real.load_aligned(src); m_imag = real_batch(real_value_type(0)); return (*this)(); } template template inline typename std::enable_if::value, X&>::type simd_complex_batch::load_unaligned(const T* src) { m_real.load_unaligned(src); m_imag = real_batch(real_value_type(0)); return (*this)(); } /// @endcond /** * Stores the N values of the batch into a contiguous array of complex * pointed by \c dst. \c dst must be aligned. */ template template inline typename std::enable_if::value, void>::type simd_complex_batch::store_aligned(T* dst) const { real_batch hi = (*this)().get_complex_high(); real_batch lo = (*this)().get_complex_low(); using tmp_value_type = typename T::value_type; tmp_value_type* rbuf = reinterpret_cast(dst); hi.store_aligned(rbuf); lo.store_aligned(rbuf + size); } /** * Stores the N values of the batch into a contiguous array of reals * pointed by \c dst. \c dst must be aligned. */ template template inline typename std::enable_if::value, void>::type simd_complex_batch::store_aligned(T* dst) const { m_real.store_aligned(dst); assert(all(m_imag == 0) && "no imaginary part"); } /** * Stores the N values of the batch into a contiguous array of complex * pointed by \c dst. \c dst is not required to be aligned. */ template template inline typename std::enable_if::value, void>::type simd_complex_batch::store_unaligned(T* dst) const { real_batch hi = (*this)().get_complex_high(); real_batch lo = (*this)().get_complex_low(); using tmp_value_type = typename T::value_type; tmp_value_type* rbuf = reinterpret_cast(dst); hi.store_unaligned(rbuf); lo.store_unaligned(rbuf + size); } /** * Stores the N values of the batch into a contiguous array of reals * pointed by \c dst. \c dst is not required to be aligned. 
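 *
 * The complex overloads, in contrast, deinterleave and interleave
 * automatically; a sketch (illustrative batch width):
 * \code{.cpp}
 * std::complex<float> buf[4] = {{1.f, 2.f}, {3.f, 4.f}, {5.f, 6.f}, {7.f, 8.f}};
 * xsimd::batch<std::complex<float>, 4> c;
 * c.load_unaligned(buf);   // c.real() = (1, 3, 5, 7), c.imag() = (2, 4, 6, 8)
 * \endcode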
*/ template template inline typename std::enable_if::value, void>::type simd_complex_batch::store_unaligned(T* dst) const { m_real.store_aligned(dst); assert(all(m_imag == 0) && "no imaginary part"); } //@} template inline auto simd_complex_batch::operator[](std::size_t index) const -> value_type { return value_type(m_real[index], m_imag[index]); } template inline auto simd_complex_batch::get() -> batch_reference { return this->derived_cast(); } template inline auto simd_complex_batch::get() const -> const_batch_reference { return this->derived_cast(); } /******************************** * simd_complex_batch operators * ********************************/ /** * @defgroup simd_complex_batch_arithmetic Arithmetic operators */ /** * @ingroup simd_complex_batch_arithmetic * * No-op on \c rhs. * @tparam X the actual type of batch. * @param rhs batch involved in the operation. * @return the opposite of \c rhs. */ template inline X operator+(const simd_complex_batch& rhs) { return rhs(); } /** * @ingroup simd_complex_batch_arithmetic * * Computes the opposite of the batch \c rhs. * @tparam X the actual type of batch. * @param rhs batch involved in the operation. * @return the opposite of \c rhs. */ template inline X operator-(const simd_complex_batch& rhs) { return X(-rhs().real(), -rhs().imag()); } /** * @ingroup simd_complex_batch_arithmetic * * Computes the sum of the batches \c lhs and \c rhs. * @tparam X the actual type of batch. * @param lhs batch involved in the addition. * @param rhs batch involved in the addition. * @return the result of the addition. */ template inline X operator+(const simd_complex_batch& lhs, const simd_complex_batch& rhs) { X tmp(lhs()); return tmp += rhs(); } /** * @ingroup simd_complex_batch_arithmetic * * Computes the sum of the batch \c lhs and the scalar \c rhs. Equivalent to the * sum of two batches where all the values of the second one are initialized to * \c rhs. * @tparam X the actual type of batch. * @param lhs batch involved in the addition. * @param rhs scalar involved in the addition. * @return the result of the addition. */ template inline X operator+(const simd_complex_batch& lhs, const typename simd_batch_traits::value_type& rhs) { X tmp(lhs()); return tmp += X(rhs); } /** * @ingroup simd_complex_batch_arithmetic * * Computes the sum of the scalar \c lhs and the batch \c rhs. Equivalent to the * sum of two batches where all the values of the first one are initialized to * \c rhs. * @tparam X the actual type of batch. * @param lhs scalar involved in the addition. * @param rhs batch involved in the addition. * @return the result of the addition. */ template inline X operator+(const typename simd_batch_traits::value_type& lhs, const simd_complex_batch& rhs) { X tmp(lhs); return tmp += rhs(); } /** * @ingroup simd_complex_batch_arithmetic * * Computes the sum of the batches \c lhs and \c rhs. * @tparam X the actual type of batch. * @param lhs batch involved in the addition. * @param rhs real batch involved in the addition. * @return the result of the addition. */ template inline X operator+(const simd_complex_batch& lhs, const typename simd_batch_traits::real_batch& rhs) { X tmp(lhs()); return tmp += X(rhs); } /** * @ingroup simd_complex_batch_arithmetic * * Computes the sum of the batches \c lhs and \c rhs. * @tparam X the actual type of batch. * @param lhs real batch involved in the addition. * @param rhs batch involved in the addition. * @return the result of the addition. 
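 *
 * Mixed real/complex arithmetic sketch (illustrative batch width):
 * \code{.cpp}
 * xsimd::batch<float, 4> r(1.f);
 * xsimd::batch<std::complex<float>, 4> c(std::complex<float>(0.f, 1.f));
 * auto s = r + c;   // every lane holds 1 + 1i
 * \endcode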
*/ template inline X operator+(const typename simd_batch_traits::real_batch& lhs, const simd_complex_batch& rhs) { X tmp(lhs); return tmp += rhs(); } /** * @ingroup simd_complex_batch_arithmetic * * Computes the sum of the batch \c lhs and the real scalar \c rhs. Equivalent to the * sum of two batches where all the values of the second one are initialized to * \c rhs. * @tparam X the actual type of batch. * @param lhs batch involved in the addition. * @param rhs real scalar involved in the addition. * @return the result of the addition. */ template inline X operator+(const simd_complex_batch& lhs, const typename simd_batch_traits::real_value_type& rhs) { X tmp(lhs()); return tmp += X(rhs); } /** * @ingroup simd_complex_batch_arithmetic * * Computes the sum of the real scalar \c lhs and the batch \c rhs. Equivalent to the * sum of two batches where all the values of the first one are initialized to * \c rhs. * @tparam X the actual type of batch. * @param lhs real scalar involved in the addition. * @param rhs batch involved in the addition. * @return the result of the addition. */ template inline X operator+(const typename simd_batch_traits::real_value_type& lhs, const simd_complex_batch& rhs) { X tmp(lhs); return tmp += rhs(); } /** * @ingroup simd_complex_batch_arithmetic * * Computes the difference of the batches \c lhs and \c rhs. * @tparam X the actual type of batch. * @param lhs batch involved in the difference. * @param rhs batch involved in the difference. * @return the result of the difference. */ template inline X operator-(const simd_complex_batch& lhs, const simd_complex_batch& rhs) { X tmp(lhs()); return tmp -= rhs(); } /** * @ingroup simd_complex_batch_arithmetic * * Computes the difference of the batch \c lhs and the scalar \c rhs. Equivalent to the * difference of two batches where all the values of the second one are initialized to * \c rhs. * @tparam X the actual type of batch. * @param lhs batch involved in the difference. * @param rhs scalar involved in the difference. * @return the result of the difference. */ template inline X operator-(const simd_complex_batch& lhs, const typename simd_batch_traits::value_type& rhs) { X tmp(lhs()); return tmp -= X(rhs); } /** * @ingroup simd_complex_batch_arithmetic * * Computes the difference of the scalar \c lhs and the batch \c rhs. Equivalent to the * difference of two batches where all the values of the first one are initialized to * \c rhs. * @tparam X the actual type of batch. * @param lhs scalar involved in the difference. * @param rhs batch involved in the difference. * @return the result of the difference. */ template inline X operator-(const typename simd_batch_traits::value_type& lhs, const simd_complex_batch& rhs) { X tmp(lhs); return tmp -= rhs(); } /** * @ingroup simd_complex_batch_arithmetic * * Computes the difference of the batches \c lhs and \c rhs. * @tparam X the actual type of batch. * @param lhs batch involved in the difference. * @param rhs real batch involved in the difference. * @return the result of the difference. */ template inline X operator-(const simd_complex_batch& lhs, const typename simd_batch_traits::real_batch& rhs) { X tmp(lhs()); return tmp -= X(rhs); } /** * @ingroup simd_complex_batch_arithmetic * * Computes the difference of the batches \c lhs and \c rhs. * @tparam X the actual type of batch. * @param lhs real batch involved in the difference. * @param rhs batch involved in the difference. * @return the result of the difference. 
*/ template inline X operator-(const typename simd_batch_traits::real_batch& lhs, const simd_complex_batch& rhs) { X tmp(lhs); return tmp -= rhs(); } /** * @ingroup simd_complex_batch_arithmetic * * Computes the difference of the batch \c lhs and the real scalar \c rhs. Equivalent to the * difference of two batches where all the values of the second one are initialized to * \c rhs. * @tparam X the actual type of batch. * @param lhs batch involved in the difference. * @param rhs real scalar involved in the difference. * @return the result of the difference. */ template inline X operator-(const simd_complex_batch& lhs, const typename simd_batch_traits::real_value_type& rhs) { X tmp(lhs()); return tmp -= X(rhs); } /** * @ingroup simd_complex_batch_arithmetic * * Computes the difference of the real scalar \c lhs and the batch \c rhs. Equivalent to the * difference of two batches where all the values of the first one are initialized to * \c rhs. * @tparam X the actual type of batch. * @param lhs real scalar involved in the difference. * @param rhs batch involved in the difference. * @return the result of the difference. */ template inline X operator-(const typename simd_batch_traits::real_value_type& lhs, const simd_complex_batch& rhs) { X tmp(lhs); return tmp -= rhs(); } /** * @ingroup simd_complex_batch_arithmetic * * Computes the product of the batches \c lhs and \c rhs. * @tparam X the actual type of batch. * @param lhs batch involved in the product. * @param rhs batch involved in the product. * @return the result of the product. */ template inline X operator*(const simd_complex_batch& lhs, const simd_complex_batch& rhs) { X tmp(lhs()); return tmp *= rhs(); } /** * @ingroup simd_complex_batch_arithmetic * * Computes the product of the batch \c lhs and the scalar \c rhs. Equivalent to the * product of two batches where all the values of the second one are initialized to * \c rhs. * @tparam X the actual type of batch. * @param lhs batch involved in the product. * @param rhs scalar involved in the product. * @return the result of the product. */ template inline X operator*(const simd_complex_batch& lhs, const typename simd_batch_traits::value_type& rhs) { X tmp(lhs()); return tmp *= X(rhs); } /** * @ingroup simd_complex_batch_arithmetic * * Computes the product of the scalar \c lhs and the batch \c rhs. Equivalent to the * difference of two batches where all the values of the first one are initialized to * \c rhs. * @tparam X the actual type of batch. * @param lhs scalar involved in the product. * @param rhs batch involved in the product. * @return the result of the product. */ template inline X operator*(const typename simd_batch_traits::value_type& lhs, const simd_complex_batch& rhs) { X tmp(lhs); return tmp *= rhs(); } /** * @ingroup simd_complex_batch_arithmetic * * Computes the product of the batches \c lhs and \c rhs. * @tparam X the actual type of batch. * @param lhs batch involved in the product. * @param rhs real batch involved in the product. * @return the result of the product. */ template inline X operator*(const simd_complex_batch& lhs, const typename simd_batch_traits::real_batch& rhs) { X tmp(lhs()); return tmp *= X(rhs); } /** * @ingroup simd_complex_batch_arithmetic * * Computes the product of the batches \c lhs and \c rhs. * @tparam X the actual type of batch. * @param lhs real batch involved in the product. * @param rhs batch involved in the product. * @return the result of the product. 
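 *
 * Example (illustrative sketch; assumes a 4-lane single-precision batch is
 * available on the target): applying a real-valued window to complex samples.
 * \code{.cpp}
 * xsimd::batch<std::complex<float>, 4> samples(std::complex<float>(1.f, -1.f));
 * xsimd::batch<float, 4> window(0.f, 0.5f, 1.f, 0.5f);
 * auto tapered = window * samples;  // scales real and imaginary parts lane-wise
 * \endcode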
*/ template inline X operator*(const typename simd_batch_traits::real_batch& lhs, const simd_complex_batch& rhs) { X tmp(lhs); return tmp *= rhs(); } /** * @ingroup simd_complex_batch_arithmetic * * Computes the product of the batch \c lhs and the real scalar \c rhs. Equivalent to the * product of two batches where all the values of the second one are initialized to * \c rhs. * @tparam X the actual type of batch. * @param lhs batch involved in the product. * @param rhs real scalar involved in the product. * @return the result of the product. */ template inline X operator*(const simd_complex_batch& lhs, const typename simd_batch_traits::real_value_type& rhs) { X tmp(lhs()); return tmp *= X(rhs); } /** * @ingroup simd_complex_batch_arithmetic * * Computes the product of the real scalar \c lhs and the batch \c rhs. Equivalent to the * difference of two batches where all the values of the first one are initialized to * \c rhs. * @tparam X the actual type of batch. * @param lhs real scalar involved in the product. * @param rhs batch involved in the product. * @return the result of the product. */ template inline X operator*(const typename simd_batch_traits::real_value_type& lhs, const simd_complex_batch& rhs) { X tmp(lhs); return tmp *= rhs(); } /** * @ingroup simd_complex_batch_arithmetic * * Computes the division of the batch \c lhs by the batch \c rhs. * @tparam X the actual type of batch. * @param lhs batch involved in the division. * @param rhs batch involved in the division. * @return the result of the division. */ template inline X operator/(const simd_complex_batch& lhs, const simd_complex_batch& rhs) { X tmp(lhs()); return tmp /= rhs(); } /** * @ingroup simd_complex_batch_arithmetic * * Computes the division of the batch \c lhs by the scalar \c rhs. Equivalent to the * division of two batches where all the values of the second one are initialized to * \c rhs. * @tparam X the actual type of batch. * @param lhs batch involved in the division. * @param rhs scalar involved in the division. * @return the result of the division. */ template inline X operator/(const simd_complex_batch& lhs, const typename simd_batch_traits::value_type& rhs) { X tmp(lhs()); return tmp /= X(rhs); } /** * @ingroup simd_complex_batch_arithmetic * * Computes the division of the scalar \c lhs and the batch \c rhs. Equivalent to the * difference of two batches where all the values of the first one are initialized to * \c rhs. * @tparam X the actual type of batch. * @param lhs scalar involved in the division. * @param rhs batch involved in the division. * @return the result of the division. */ template inline X operator/(const typename simd_batch_traits::value_type& lhs, const simd_complex_batch& rhs) { X tmp(lhs); return tmp /= rhs(); } /** * @ingroup simd_complex_batch_arithmetic * * Computes the division of the batch \c lhs by the batch \c rhs. * @tparam X the actual type of batch. * @param lhs batch involved in the division. * @param rhs real batch involved in the division. * @return the result of the division. */ template inline X operator/(const simd_complex_batch& lhs, const typename simd_batch_traits::real_batch& rhs) { X tmp(lhs()); return tmp /= X(rhs); } /** * @ingroup simd_complex_batch_arithmetic * * Computes the division of the batch \c lhs by the batch \c rhs. * @tparam X the actual type of batch. * @param lhs real batch involved in the division. * @param rhs batch involved in the division. * @return the result of the division. 
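 *
 * Example (illustrative sketch; assumes a 4-lane single-precision batch is
 * available on the target): lane-wise reciprocal of a complex batch.
 * \code{.cpp}
 * xsimd::batch<std::complex<float>, 4> z(std::complex<float>(3.f, 4.f));
 * xsimd::batch<float, 4> one(1.f);
 * auto reciprocal = one / z;  // each lane holds 1 / (3 + 4i) = 0.12 - 0.16i
 * \endcode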
*/ template inline X operator/(const typename simd_batch_traits::real_batch& lhs, const simd_complex_batch& rhs) { X tmp(lhs); return tmp /= rhs(); } /** * @ingroup simd_complex_batch_arithmetic * * Computes the division of the batch \c lhs by the real scalar \c rhs. Equivalent to the * division of two batches where all the values of the second one are initialized to * \c rhs. * @tparam X the actual type of batch. * @param lhs batch involved in the division. * @param rhs real scalar involved in the division. * @return the result of the division. */ template inline X operator/(const simd_complex_batch& lhs, const typename simd_batch_traits::real_value_type& rhs) { X tmp(lhs()); return tmp /= X(rhs); } /** * @ingroup simd_complex_batch_arithmetic * * Computes the division of the real scalar \c lhs and the batch \c rhs. Equivalent to the * difference of two batches where all the values of the first one are initialized to * \c rhs. * @tparam X the actual type of batch. * @param lhs real scalar involved in the division. * @param rhs batch involved in the division. * @return the result of the division. */ template inline X operator/(const typename simd_batch_traits::real_value_type& lhs, const simd_complex_batch& rhs) { X tmp(lhs); return tmp /= rhs(); } /** * @defgroup simd_complex_batch_reducers Reducers */ /** * @ingroup simd_complex_batch_reducers * * Adds all the scalars of the batch \c rhs. * @param rhs batch involved in the reduction * @return the result of the reduction. */ template inline typename simd_batch_traits::value_type hadd(const simd_complex_batch& rhs) { using value_type = typename simd_batch_traits::value_type; return value_type(hadd(rhs.real()), hadd(rhs.imag())); } /** * @defgroup simd_complex_batch_miscellaneous Miscellaneous */ /** * @ingroup simd_complex_batch_miscellaneous * * Ternary operator for batches: selects values from the batches \c a or \c b * depending on the boolean values in \c cond. Equivalent to * \code{.cpp} * for(std::size_t i = 0; i < N; ++i) * res[i] = cond[i] ? a[i] : b[i]; * \endcode * @param cond batch condition. * @param a batch values for truthy condition. * @param b batch value for falsy condition. * @return the result of the selection. */ template inline X select(const typename simd_batch_traits::batch_bool_type& cond, const simd_complex_batch& a, const simd_complex_batch& b) { return X(select(cond.value(), a.real(), b.real()), select(cond.value(), a.imag(), b.imag())); } /** * @defgroup simd_complex_batch_comparison Comparison operators */ /** * @ingroup simd_complex_batch_comparison * * Element-wise equality comparison of batches \c lhs and \c rhs. * @param lhs batch involved in the comparison. * @param rhs batch involved in the comparison. * @return a boolean batch. */ template inline typename simd_batch_traits::batch_bool_type operator==(const simd_complex_batch& lhs, const simd_complex_batch& rhs) { return (lhs.real() == rhs.real()) && (lhs.imag() == rhs.imag()); } /** * @ingroup simd_complex_batch_comparison * * Element-wise inequality comparison of batches \c lhs and \c rhs. * @param lhs batch involved in the comparison. * @param rhs batch involved in the comparison. * @return a boolean batch. */ template inline typename simd_batch_traits::batch_bool_type operator!=(const simd_complex_batch& lhs, const simd_complex_batch& rhs) { return !(lhs == rhs); } /** * Insert the batch \c rhs into the stream \c out. * @tparam X the actual type of batch. * @param out the output stream. * @param rhs the batch to output. * @return the output stream. 
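 *
 * Example (illustrative sketch, assuming <iostream> and <complex> are included
 * and a 4-lane single-precision batch is available): each lane is printed as a
 * (real,imag) pair.
 * \code{.cpp}
 * xsimd::batch<std::complex<float>, 4> z(std::complex<float>(1.f, 2.f));
 * std::cout << z << std::endl;  // prints ((1,2), (1,2), (1,2), (1,2))
 * \endcode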
*/ template inline std::ostream& operator<<(std::ostream& out, const simd_complex_batch& rhs) { out << '('; std::size_t s = simd_complex_batch::size; for (std::size_t i = 0; i < s - 1; ++i) { out << "(" << rhs()[i].real() << "," << rhs()[i].imag() << "), "; } out << "(" << rhs()[s - 1].real() << "," << rhs()[s - 1].imag() << "))"; return out; } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_fallback.hpp000066400000000000000000001232741410101234500222370ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_FALLBACK_HPP #define XSIMD_FALLBACK_HPP #include #include #include #include #include "xsimd_scalar.hpp" #include "xsimd_base.hpp" #include "xsimd_complex_base.hpp" #include "xsimd_utils.hpp" #ifdef XSIMD_ENABLE_XTL_COMPLEX #include "xtl/xcomplex.hpp" #endif namespace xsimd { /*********************************************************** * Generic fallback implementation of batch and batch_bool * * * * Basically, generate a scalar loop and cross fingers: * * maybe the compiler will autovectorize, maybe not. * ***********************************************************/ /******************** * batch_bool * ********************/ template struct simd_batch_traits> { using value_type = T; static constexpr std::size_t size = N; using batch_type = batch; static constexpr std::size_t align = XSIMD_DEFAULT_ALIGNMENT; }; template class batch_bool : public simd_batch_bool> { public: batch_bool(); explicit batch_bool(bool b); // Constructor from N boolean parameters template < typename... Args, typename Enable = detail::is_array_initializer_t > batch_bool(Args... exactly_N_bools); batch_bool(const std::array& rhs); batch_bool& operator=(const std::array& rhs); operator std::array() const; const bool& operator[](std::size_t index) const; bool& operator[](std::size_t index); const std::array& get_value() const; private: template batch_bool& load_values(Args... args); std::array m_value; friend class simd_batch_bool>; }; /*************** * batch * ***************/ template struct simd_batch_traits> { using value_type = T; static constexpr std::size_t size = N; using batch_bool_type = batch_bool; static constexpr std::size_t align = XSIMD_DEFAULT_ALIGNMENT; using storage_type = std::array; }; template class batch : public simd_batch> { public: using self_type = batch; using base_type = simd_batch; using storage_type = typename base_type::storage_type; using batch_bool_type = typename base_type::batch_bool_type; batch(); explicit batch(T f); // Constructor from N scalar parameters template < typename... Args, typename Enable = typename detail::is_array_initializer::type > batch(Args... 
exactly_N_scalars); // Constructor from value_type of batch_bool batch(const std::array& src); explicit batch(const T* src); batch(const T* src, aligned_mode); batch(const T* src, unaligned_mode); batch(const std::array& rhs); batch(const batch_bool_type& rhs); batch& operator=(const std::array& rhs); batch& operator=(const std::array& rhs); batch& operator=(const batch_bool_type&); operator std::array() const; XSIMD_DECLARE_LOAD_STORE_ALL(T, N) using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; T& operator[](std::size_t index); const T& operator[](std::size_t index) const; private: template batch& load_unaligned_impl(const U* src); template void store_unaligned_impl(U* src) const; }; template batch operator<<(const batch& lhs, int32_t rhs); template batch operator>>(const batch& lhs, int32_t rhs); /********************************** * batch_bool, N> * **********************************/ template struct simd_batch_traits, N>> : complex_batch_bool_traits, T, N, XSIMD_DEFAULT_ALIGNMENT> { }; template class batch_bool, N> : public simd_complex_batch_bool, N>> { public: using self_type = batch_bool, N>; using base_type = simd_complex_batch_bool; using real_batch = batch_bool; batch_bool() = default; using base_type::base_type; // Constructor from N boolean parameters template < typename... Args, typename Enable = detail::is_array_initializer_t > batch_bool(Args... exactly_N_bools) : base_type(real_batch{ exactly_N_bools... }) { } }; /***************************** * batch, N> * *****************************/ template struct simd_batch_traits, N>> : complex_batch_traits, T, N, XSIMD_DEFAULT_ALIGNMENT> { }; template class batch, N> : public simd_complex_batch, N>> { public: using self_type = batch, N>; using base_type = simd_complex_batch; using value_type = std::complex; using real_batch = batch; using real_value_type = T; batch() = default; using base_type::base_type; // Constructor from N scalar parameters template < typename... Args, typename Enable = typename detail::is_array_initializer::type > batch(Args... exactly_N_scalars) : base_type(real_batch{ exactly_N_scalars.real()... }, real_batch{ exactly_N_scalars.imag()... }) { } using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; template typename std::enable_if::value, self_type&>::type load_aligned(const U* src); template typename std::enable_if::value, self_type&>::type load_unaligned(const U* src); template typename std::enable_if::value, self_type&>::type load_aligned(const U* src); template typename std::enable_if::value, self_type&>::type load_unaligned(const U* src); template void store_aligned(U* dst) const; template void store_unaligned(U* dst) const; }; #ifdef XSIMD_ENABLE_XTL_COMPLEX /******************************************** * batch_bool, N> * ********************************************/ template struct simd_batch_traits, N>> : complex_batch_bool_traits, T, N, XSIMD_DEFAULT_ALIGNMENT> { }; template class batch_bool, N> : public simd_complex_batch_bool, N>> { public: using self_type = batch_bool, N>; using base_type = simd_complex_batch_bool; using real_batch = batch_bool; batch_bool() = default; using base_type::base_type; // VS2015 has a bug with inheriting constructors involving SFINAE // Constructor from N boolean parameters template < typename... Args, typename Enable = detail::is_array_initializer_t > batch_bool(Args... 
exactly_N_bools) : base_type(real_batch{ exactly_N_bools... }) { } }; /*************************************** * batch, N> * ***************************************/ template struct simd_batch_traits, N>> : complex_batch_traits, T, N, XSIMD_DEFAULT_ALIGNMENT> { }; template class batch, N> : public simd_complex_batch, N>> { public: using self_type = batch, N>; using base_type = simd_complex_batch; using value_type = xtl::xcomplex; using real_batch = batch; using real_value_type = T; batch() = default; using base_type::base_type; // Constructor from N scalar parameters template < typename... Args, typename Enable = typename detail::is_array_initializer::type > batch(Args... exactly_N_scalars) : base_type(real_batch{ exactly_N_scalars.real()... }, real_batch{ exactly_N_scalars.imag()... }) { } using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; template typename std::enable_if::value, self_type&>::type load_aligned(const U* src); template typename std::enable_if::value, self_type&>::type load_unaligned(const U* src); template typename std::enable_if::value, self_type&>::type load_aligned(const U* src); template typename std::enable_if::value, self_type&>::type load_unaligned(const U* src); template void store_aligned(U* dst) const; template void store_unaligned(U* dst) const; }; #endif /************************ * conversion functions * ************************/ template batch to_int(const batch& x); template batch to_int(const batch& x); template batch to_float(const batch& x); template batch to_float(const batch& x); /************************** * boolean cast functions * **************************/ template batch_bool bool_cast(const batch_bool& x); template batch_bool bool_cast(const batch_bool& x); template batch_bool bool_cast(const batch_bool& x); template batch_bool bool_cast(const batch_bool& x); /************************** * Boilerplate generators * **************************/ // These macros all asume that T and N are in scope and have the meaning used in // the definitions of batch and batch_bool. 
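//
// For instance, a binary operator implemented with
// XSIMD_FALLBACK_BINARY_OP(batch<T, N>, +, lhs, rhs) expands to a plain scalar
// loop of the following shape (illustrative expansion, assuming T and N are the
// enclosing template parameters and lhs/rhs are batch<T, N> arguments):
//
//     batch<T, N> result;
//     for (std::size_t i = 0; i < N; ++i)
//     {
//         result[i] = (lhs[i] + rhs[i]);
//     }
//     return result;
//
// The compiler is then free to auto-vectorize such loops when it can.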
#define XSIMD_FALLBACK_MAPPING_LOOP(RESULT_TYPE, EXPRESSION) \ RESULT_TYPE result; \ for(std::size_t i = 0; i < N; ++i) { \ result[i] = (EXPRESSION); \ } \ return result; #define XSIMD_FALLBACK_UNARY_OP(RESULT_TYPE, OPERATOR, X) \ XSIMD_FALLBACK_MAPPING_LOOP(RESULT_TYPE, (OPERATOR X[i])) #define XSIMD_FALLBACK_BINARY_OP(RESULT_TYPE, OPERATOR, X, Y) \ XSIMD_FALLBACK_MAPPING_LOOP(RESULT_TYPE, (X[i] OPERATOR Y[i])) #define XSIMD_FALLBACK_BATCH_BITWISE_UNARY_OP(OPERATOR, X) \ XSIMD_FALLBACK_MAPPING_LOOP( \ batch, \ detail::from_unsigned_integer( \ OPERATOR detail::to_unsigned_integer(X[i]) \ ) \ ) #define XSIMD_FALLBACK_BATCH_BITWISE_BINARY_OP(OPERATOR, X, Y) \ XSIMD_FALLBACK_MAPPING_LOOP( \ batch, \ detail::from_unsigned_integer( \ detail::to_unsigned_integer(X[i]) \ OPERATOR \ detail::to_unsigned_integer(Y[i]) \ ) \ ) #define XSIMD_FALLBACK_BATCH_UNARY_FUNC(FUNCTION, X) \ XSIMD_FALLBACK_MAPPING_LOOP(batch, FUNCTION(X[i])) #define XSIMD_FALLBACK_BATCH_BINARY_FUNC(FUNCTION, X, Y) \ XSIMD_FALLBACK_MAPPING_LOOP(batch, FUNCTION(X[i], Y[i])) #define XSIMD_FALLBACK_BATCH_TERNARY_FUNC(FUNCTION, X, Y, Z) \ XSIMD_FALLBACK_MAPPING_LOOP(batch, FUNCTION(X[i], Y[i], Z[i])) // NOTE: Static casting a vector is static casting every element #define XSIMD_FALLBACK_BATCH_STATIC_CAST(T_OUT, X) \ batch result; \ for(std::size_t i = 0; i < N; ++i) { \ result[i] = static_cast(X[i]); \ } \ return result; // NOTE: Casting between batch_bools of the same size is actually trivial! #define XSIMD_FALLBACK_BOOL_CAST(T_OUT, X) \ return batch_bool(static_cast>(X)); /*********************************** * batch_bool implementation * ***********************************/ template inline batch_bool::batch_bool() { } template inline batch_bool::batch_bool(bool b) : m_value(detail::array_from_scalar(b)) { } template template inline batch_bool::batch_bool(Args... exactly_N_bools) : m_value{ exactly_N_bools... } { } template inline batch_bool::batch_bool(const std::array& rhs) : m_value(rhs) { } template inline batch_bool& batch_bool::operator=(const std::array& rhs) { m_value = rhs; return *this; } template inline batch_bool::operator std::array() const { return m_value; } template inline const bool& batch_bool::operator[](std::size_t index) const { return m_value[index]; } template inline bool& batch_bool::operator[](std::size_t index) { return m_value[index]; } template inline const std::array& batch_bool::get_value() const { return m_value; } template template inline batch_bool& batch_bool::load_values(Args... 
args) { m_value = std::array({args...}); return *this; } namespace detail { template struct batch_bool_kernel { using batch_type = batch_bool; static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) { XSIMD_FALLBACK_BINARY_OP(batch_bool, &, lhs, rhs) } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { XSIMD_FALLBACK_BINARY_OP(batch_bool, | , lhs, rhs) } static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { XSIMD_FALLBACK_BINARY_OP(batch_bool, ^, lhs, rhs) } static batch_type bitwise_not(const batch_type& rhs) { XSIMD_FALLBACK_UNARY_OP(batch_bool, !, rhs) } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { XSIMD_FALLBACK_MAPPING_LOOP(batch_bool, (!(lhs[i] & rhs[i]))) } static batch_type equal(const batch_type& lhs, const batch_type& rhs) { XSIMD_FALLBACK_BINARY_OP(batch_bool, == , lhs, rhs) } static batch_type not_equal(const batch_type& lhs, const batch_type& rhs) { XSIMD_FALLBACK_BINARY_OP(batch_bool, != , lhs, rhs) } static bool all(const batch_type& rhs) { for (std::size_t i = 0; i < N; ++i) { if (!rhs[i]) return false; } return true; } static bool any(const batch_type& rhs) { for (std::size_t i = 0; i < N; ++i) { if (rhs[i]) return true; } return false; } }; } /********************************** * batch implementation * **********************************/ template inline batch::batch() { } template inline batch::batch(T f) : base_type(detail::array_from_scalar(f)) { } template template inline batch::batch(Args... exactly_N_scalars) : base_type(storage_type{ static_cast(exactly_N_scalars)... }) { } namespace detail { template struct all_bits { template static T get(T) { return ~T(0); } }; template <> struct all_bits { template static T get(T) { T res(0); using int_type = as_unsigned_integer_t; int_type value(~int_type(0)); std::memcpy(&res, &value, sizeof(int_type)); return res; } }; } template inline batch::batch(const std::array& src) { using all_bits = detail::all_bits::value>; for(std::size_t i = 0; i < N; ++i) { this->m_value[i] = src[i] ? all_bits::get(T(0)) : T(0); } } template inline batch::batch(const T* src) : batch(src, unaligned_mode()) { } template inline batch::batch(const T* src, aligned_mode) : batch(src, unaligned_mode()) { } template inline batch::batch(const T* src, unaligned_mode) : base_type(detail::array_from_pointer(src)) { } template inline batch::batch(const std::array& rhs) : base_type(rhs) { } template inline batch::batch(const batch_bool_type& rhs) { std::transform(rhs.get_value().cbegin(), rhs.get_value().cend(), this->m_value.begin(), [](bool b) -> T { return b ? T(1) : T(0); }); } template inline batch& batch::operator=(const std::array& rhs) { this->m_value = rhs; return *this; } template inline batch& batch::operator=(const std::array& rhs) { using all_bits = detail::all_bits::value>; std::transform(rhs.cbegin(), rhs.cend(), this->m_value.begin(), [](bool b) -> T { return b ? all_bits::get(T(0)) : T(0); }); return *this; } template inline batch& batch::operator=(const batch_bool_type& rhs) { std::transform(rhs.get_value().cbegin(), rhs.get_value().cend(), this->m_value.begin(), [](bool b) -> T { return b ? 
T(1) : T(0); }); return *this; } template inline batch::operator std::array() const { return this->m_value; } #define FALLBACK_DEFINE_LOAD_STORE(TYPE) \ template \ inline batch& batch::load_aligned(const TYPE* src) \ { \ return this->load_unaligned_impl(src); \ } \ template \ inline batch& batch::load_unaligned(const TYPE* src) \ { \ return this->load_unaligned_impl(src); \ } \ template \ inline void batch::store_aligned(TYPE* dst) const \ { \ this->store_unaligned_impl(dst); \ } \ template \ inline void batch::store_unaligned(TYPE* dst) const \ { \ this->store_unaligned_impl(dst); \ } FALLBACK_DEFINE_LOAD_STORE(bool) FALLBACK_DEFINE_LOAD_STORE(int8_t) FALLBACK_DEFINE_LOAD_STORE(uint8_t) FALLBACK_DEFINE_LOAD_STORE(int16_t) FALLBACK_DEFINE_LOAD_STORE(uint16_t) FALLBACK_DEFINE_LOAD_STORE(int32_t) FALLBACK_DEFINE_LOAD_STORE(uint32_t) FALLBACK_DEFINE_LOAD_STORE(int64_t) FALLBACK_DEFINE_LOAD_STORE(uint64_t) FALLBACK_DEFINE_LOAD_STORE(float) FALLBACK_DEFINE_LOAD_STORE(double) #undef FALLBACK_DEFINE_LOAD_STORE template template inline batch& batch::load_unaligned_impl(const U* src) { for(std::size_t i = 0; i < N; ++i) { this->m_value[i] = static_cast(src[i]); } return *this; } template template inline void batch::store_unaligned_impl(U* dst) const { for(std::size_t i = 0; i < N; ++i) { dst[i] = static_cast(this->m_value[i]); } } template inline T& batch::operator[](std::size_t index) { return this->m_value[index % base_type::size]; } template inline const T& batch::operator[](std::size_t index) const { return this->m_value[index % base_type::size]; } namespace detail { template struct batch_kernel { using batch_type = batch; using value_type = T; using batch_bool_type = batch_bool; static batch_type neg(const batch_type& rhs) { XSIMD_FALLBACK_UNARY_OP(batch, -, rhs) } static batch_type add(const batch_type& lhs, const batch_type& rhs) { XSIMD_FALLBACK_BINARY_OP(batch, +, lhs, rhs) } static batch_type sub(const batch_type& lhs, const batch_type& rhs) { XSIMD_FALLBACK_BINARY_OP(batch, -, lhs, rhs) } static batch_type sadd(const batch_type& lhs, const batch_type& rhs) { XSIMD_FALLBACK_BATCH_BINARY_FUNC(xsimd::sadd, lhs, rhs) } static batch_type ssub(const batch_type& lhs, const batch_type& rhs) { XSIMD_FALLBACK_BATCH_BINARY_FUNC(xsimd::ssub, lhs, rhs) } static batch_type mul(const batch_type& lhs, const batch_type& rhs) { XSIMD_FALLBACK_BINARY_OP(batch, *, lhs, rhs) } static batch_type div(const batch_type& lhs, const batch_type& rhs) { XSIMD_FALLBACK_BINARY_OP(batch, /, lhs, rhs) } static batch_type mod(const batch_type& lhs, const batch_type& rhs) { XSIMD_FALLBACK_BINARY_OP(batch, % , lhs, rhs) } static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs) { XSIMD_FALLBACK_BINARY_OP(batch_bool, == , lhs, rhs) } static batch_bool_type neq(const batch_type& lhs, const batch_type& rhs) { XSIMD_FALLBACK_BINARY_OP(batch_bool, != , lhs, rhs) } static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { XSIMD_FALLBACK_BINARY_OP(batch_bool, < , lhs, rhs) } static batch_bool_type lte(const batch_type& lhs, const batch_type& rhs) { XSIMD_FALLBACK_BINARY_OP(batch_bool, <=, lhs, rhs) } static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) { XSIMD_FALLBACK_BATCH_BITWISE_BINARY_OP(&, lhs, rhs) } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { XSIMD_FALLBACK_BATCH_BITWISE_BINARY_OP(|, lhs, rhs) } static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { XSIMD_FALLBACK_BATCH_BITWISE_BINARY_OP(^ , lhs, rhs) } static 
batch_type bitwise_not(const batch_type& rhs) { XSIMD_FALLBACK_BATCH_BITWISE_UNARY_OP(~, rhs) } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { XSIMD_FALLBACK_MAPPING_LOOP( batch, detail::from_unsigned_integer( ~( detail::to_unsigned_integer(lhs[i]) & detail::to_unsigned_integer(rhs[i]) ) ) ) } static batch_type min(const batch_type& lhs, const batch_type& rhs) { XSIMD_FALLBACK_BATCH_BINARY_FUNC(std::min, lhs, rhs) } static batch_type max(const batch_type& lhs, const batch_type& rhs) { XSIMD_FALLBACK_BATCH_BINARY_FUNC(std::max, lhs, rhs) } static batch_type fmin(const batch_type& lhs, const batch_type& rhs) { XSIMD_FALLBACK_BATCH_BINARY_FUNC(std::fmin, lhs, rhs) } static batch_type fmax(const batch_type& lhs, const batch_type& rhs) { XSIMD_FALLBACK_BATCH_BINARY_FUNC(std::fmax, lhs, rhs) } static batch_type abs(const batch_type& rhs) { XSIMD_FALLBACK_BATCH_UNARY_FUNC(std::abs, rhs) } static batch_type fabs(const batch_type& rhs) { XSIMD_FALLBACK_BATCH_UNARY_FUNC(std::fabs, rhs) } static batch_type sqrt(const batch_type& rhs) { XSIMD_FALLBACK_BATCH_UNARY_FUNC(std::sqrt, rhs) } static batch_type fma(const batch_type& x, const batch_type& y, const batch_type& z) { XSIMD_FALLBACK_BATCH_TERNARY_FUNC(std::fma, x, y, z) } static batch_type fms(const batch_type& x, const batch_type& y, const batch_type& z) { return fma(x, y, -z); } static batch_type fnma(const batch_type& x, const batch_type& y, const batch_type& z) { return fma(-x, y, z); } static batch_type fnms(const batch_type& x, const batch_type& y, const batch_type& z) { return fma(-x, y, -z); } static value_type hadd(const batch_type& rhs) { value_type result = 0; for (std::size_t i = 0; i < N; ++i) { result += rhs[i]; } return result; } static batch_type haddp(const batch_type* row) { XSIMD_FALLBACK_MAPPING_LOOP(batch, hadd(row[i])) } static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b) { XSIMD_FALLBACK_MAPPING_LOOP(batch, (cond[i] ? a[i] : b[i])) } template static batch_type select(const batch_bool_constant& cond, const batch_type& a, const batch_type& b) { XSIMD_FALLBACK_MAPPING_LOOP(batch, (cond[i] ? 
a[i] : b[i])) } static batch_bool_type isnan(const batch_type& x) { XSIMD_FALLBACK_MAPPING_LOOP(batch_bool, std::isnan(x[i])) } static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs) { batch_type b_lo; for (std::size_t i = 0, j = 0; i < N/2; ++i, j = j + 2) { b_lo[j] = lhs[i]; b_lo[j + 1] = rhs[i]; } return b_lo; } static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs) { batch_type b_hi; for (std::size_t i = 0, j = 0; i < N/2; ++i, j = j + 2) { b_hi[j] = lhs[i + N/2]; b_hi[j + 1] = rhs[i+ N/2]; } return b_hi; } /* 0 <= n <= N/2 */ static batch_type extract_pair(const batch_type& lhs, const batch_type& rhs, const int n) { batch_type b_concatenate; for (int i = 0 ; i < static_cast(N - n); ++i) { b_concatenate[i] = lhs[i + n]; if(i < n) { b_concatenate[N - 1 - i] = rhs[n - 1 - i]; } } return b_concatenate; } }; } template inline batch operator<<(const batch& lhs, int32_t rhs) { XSIMD_FALLBACK_MAPPING_LOOP(batch, (lhs[i] << rhs)) } template inline batch operator>>(const batch& lhs, int32_t rhs) { XSIMD_FALLBACK_MAPPING_LOOP(batch, (lhs[i] >> rhs)) } /*********************************************** * utility functions to avoid code duplication * ***********************************************/ namespace detail { template inline std::pair load_complex_impl(const U* src) { using value_type = typename U::value_type; using dst_value_type = typename B::value_type; const value_type* buf = reinterpret_cast(src); B real, imag; for (std::size_t i = 0; i < N; ++i) { real[i] = static_cast(buf[2 * i]); imag[i] = static_cast(buf[2 * i + 1]); } return std::make_pair(real, imag); } template inline void store_complex_impl(const B& real, const B& imag, U* dst) { using value_type = typename U::value_type; value_type* buf = reinterpret_cast(dst); for (std::size_t i = 0; i < N; ++i) { buf[2 * i] = static_cast(real[i]); buf[2 * i + 1] = static_cast(imag[i]); } } } /******************************************** * batch> implementation * ********************************************/ template template inline auto batch, N>::load_aligned(const U* src) -> typename std::enable_if::value, self_type&>::type { std::tie(this->m_real, this->m_imag) = detail::load_complex_impl(src); return *this; } template template inline auto batch, N>::load_unaligned(const U* src) -> typename std::enable_if::value, self_type&>::type { return load_aligned(src); } template template inline auto batch, N>::load_aligned(const U* src) -> typename std::enable_if::value, self_type&>::type { for (std::size_t i = 0; i < N; ++i) { this->m_real[i] = static_cast(src[i]); this->m_imag[i] = real_value_type(0); } return *this; } template template inline auto batch, N>::load_unaligned(const U* src) -> typename std::enable_if::value, self_type&>::type { return load_aligned(src); } template template inline void batch, N>::store_aligned(U* dst) const { detail::store_complex_impl(this->m_real, this->m_imag, dst); } template template inline void batch, N>::store_unaligned(U* dst) const { store_aligned(dst); } /****************************************************** * batch, N> implementation * ******************************************************/ #ifdef XSIMD_ENABLE_XTL_COMPLEX template template inline auto batch, N>::load_aligned(const U* src) -> typename std::enable_if::value, self_type&>::type { std::tie(this->m_real, this->m_imag) = detail::load_complex_impl(src); return *this; } template template inline auto batch, N>::load_unaligned(const U* src) -> typename std::enable_if::value, self_type&>::type { return 
load_aligned(src); } template template inline auto batch, N>::load_aligned(const U* src) -> typename std::enable_if::value, self_type&>::type { for (std::size_t i = 0; i < N; ++i) { this->m_real[i] = static_cast(src[i]); this->m_imag[i] = real_value_type(0); } return *this; } template template inline auto batch, N>::load_unaligned(const U* src) -> typename std::enable_if::value, self_type&>::type { return load_aligned(src); } template template inline void batch, N>::store_aligned(U* dst) const { detail::store_complex_impl(this->m_real, this->m_imag, dst); } template template inline void batch, N>::store_unaligned(U* dst) const { store_unaligned(dst); } #endif /*************************************** * conversion functions implementation * ***************************************/ template inline batch to_int(const batch& x) { XSIMD_FALLBACK_BATCH_STATIC_CAST(int32_t, x) } template inline batch to_int(const batch& x) { XSIMD_FALLBACK_BATCH_STATIC_CAST(int64_t, x) } template inline batch to_float(const batch& x) { XSIMD_FALLBACK_BATCH_STATIC_CAST(float, x) } template inline batch to_float(const batch& x) { XSIMD_FALLBACK_BATCH_STATIC_CAST(double, x) } /************************** * boolean cast functions * **************************/ template inline batch_bool bool_cast(const batch_bool& x) { XSIMD_FALLBACK_BOOL_CAST(int32_t, x) } template inline batch_bool bool_cast(const batch_bool& x) { XSIMD_FALLBACK_BOOL_CAST(int64_t, x) } template inline batch_bool bool_cast(const batch_bool& x) { XSIMD_FALLBACK_BOOL_CAST(float, x) } template inline batch_bool bool_cast(const batch_bool& x) { XSIMD_FALLBACK_BOOL_CAST(double, x) } /***************************************** * bitwise cast functions implementation * *****************************************/ template batch u8_to_u16(const batch& x); template batch u16_to_u8(const batch& x); template batch u8_to_u32(const batch& x); template batch u32_to_u8(const batch& x); template batch u8_to_u64(const batch& x); template batch u64_to_u8(const batch& x); template struct bitwise_cast_impl, batch> { private: static_assert(sizeof(T_in)*N_in % sizeof(T_out) == 0, "The input and output batches must have the same size"); static constexpr size_t N_out = sizeof(T_in)*N_in/sizeof(T_out); union Converter { std::array in; std::array out; }; public: static batch run(const batch& x) { Converter caster; caster.in = static_cast>(x); return batch(caster.out); } }; /*********************************************** * static_cast conversion by bitwise_cast_impl * ***********************************************/ template inline batch u8_to_u16(const batch& x) { return bitwise_cast_impl, batch>::run(x); } template inline batch u16_to_u8(const batch& x) { return bitwise_cast_impl, batch>::run(x); } template inline batch u8_to_u32(const batch& x) { return bitwise_cast_impl, batch>::run(x); } template inline batch u32_to_u8(const batch& x) { return bitwise_cast_impl, batch>::run(x); } template inline batch u8_to_u64(const batch& x) { return bitwise_cast_impl, batch>::run(x); } template inline batch u64_to_u8(const batch& x) { return bitwise_cast_impl, batch>::run(x); } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_int_conversion.hpp000066400000000000000000000147651410101234500235430ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. 
* * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_INT_CONVERSION_HPP #define XSIMD_INT_CONVERSION_HPP #include "xsimd_base.hpp" namespace xsimd { namespace detail { /************************************ * conversion of 8 int8 <-> 8 int32 * ************************************/ // a contains 8 int8 in its low half __m256i xsimd_cvtepi8_epi32(__m128i a); __m256i xsimd_cvtepu8_epi32(__m128i a); // Returns an vector containing 8 int8 in its low half __m128i xsimd_cvtepi32_epi8(__m256i a); __m128i xsimd_cvtepi32_epu8(__m256i a); // a contains 16 int8 __m256i xsimd_cvtepi16_epi32(__m128i a); __m256i xsimd_cvtepu16_epi32(__m128i a); // Returns an vector containing 8 int16 __m128i xsimd_cvtepi32_epi16(__m256i a); __m128i xsimd_cvtepi32_epu16(__m256i a); /****************** * Implementation * ******************/ inline __m256i xsimd_cvtepi8_epi32(__m128i a) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION __m256i res = _mm256_cvtepi8_epi32(a); #else __m128i mask = _mm_cmplt_epi8(a, _mm_set1_epi8(0)); __m128i tmp1 = _mm_unpacklo_epi8(a, mask); mask = _mm_cmplt_epi16(tmp1, _mm_set1_epi16(0)); __m128i tmp2 = _mm_unpacklo_epi16(tmp1, mask); __m128i tmp3 = _mm_unpackhi_epi16(tmp1, mask); __m256i res = _mm256_castsi128_si256(tmp2); res = _mm256_insertf128_si256(res, tmp3, 1); #endif return res; } inline __m256i xsimd_cvtepu8_epi32(__m128i a) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION __m256i res = _mm256_cvtepu8_epi32(a); #else __m128i tmp1 = _mm_unpacklo_epi8(a, _mm_set1_epi8(0)); __m128i tmp2 = _mm_unpacklo_epi16(tmp1, _mm_set1_epi16(0)); __m128i tmp3 = _mm_unpackhi_epi16(tmp1, _mm_set1_epi16(0)); __m256i res = _mm256_castsi128_si256(tmp2); res = _mm256_insertf128_si256(res, tmp3, 1); #endif return res; } inline __m128i xsimd_cvtepi32_epi8(__m256i a) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION __m256i tmp2 = _mm256_packs_epi32(a, a); __m256i tmp3 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(3, 1, 2, 0)); __m256i tmp4 = _mm256_packs_epi16(tmp3, _mm256_set1_epi16(0)); __m128i res = _mm256_castsi256_si128(tmp4); #else __m128i tmp_hi = _mm256_extractf128_si256(a, 1); __m128i tmp_lo = _mm256_castsi256_si128(a); tmp_hi = _mm_packs_epi32(tmp_hi, tmp_hi); tmp_lo = _mm_packs_epi32(tmp_lo, tmp_lo); __m128i res = _mm_unpacklo_epi64(tmp_lo, tmp_hi); res = _mm_packs_epi16(res, _mm_set1_epi16(0)); #endif return res; } inline __m128i xsimd_cvtepi32_epu8(__m256i a) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION __m256i tmp2 = _mm256_packs_epi32(a, a); __m256i tmp3 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(3, 1, 2, 0)); __m256i tmp4 = _mm256_packus_epi16(tmp3, _mm256_set1_epi16(0)); __m128i res = _mm256_castsi256_si128(tmp4); #else __m128i tmp_hi = _mm256_extractf128_si256(a, 1); __m128i tmp_lo = _mm256_castsi256_si128(a); tmp_hi = _mm_packs_epi32(tmp_hi, tmp_hi); tmp_lo = _mm_packs_epi32(tmp_lo, tmp_lo); __m128i res = _mm_unpacklo_epi64(tmp_lo, tmp_hi); res = _mm_packus_epi16(res, _mm_set1_epi16(0)); #endif return res; } inline __m256i xsimd_cvtepi16_epi32(__m128i a) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION __m256i res = _mm256_cvtepi16_epi32(a); #else __m128i mask = _mm_cmplt_epi16(a, _mm_set1_epi16(0)); __m128i tmp1 = _mm_unpacklo_epi16(a, mask); __m128i tmp2 = _mm_unpackhi_epi16(a, mask); __m256i res = _mm256_castsi128_si256(tmp1); res = _mm256_insertf128_si256(res, tmp2, 1); #endif return res; } inline __m256i xsimd_cvtepu16_epi32(__m128i a) { #if 
XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION __m256i res = _mm256_cvtepu16_epi32(a); #else __m128i tmp1 = _mm_unpacklo_epi16(a, _mm_set1_epi16(0)); __m128i tmp2 = _mm_unpackhi_epi16(a, _mm_set1_epi16(0)); __m256i res = _mm256_castsi128_si256(tmp1); res = _mm256_insertf128_si256(res, tmp2, 1); #endif return res; } inline __m128i xsimd_cvtepi32_epi16(__m256i a) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION __m256i tmp1 = _mm256_packs_epi32(a, a); __m256i tmp2 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(3, 1, 2, 0)); __m128i res = _mm256_castsi256_si128(tmp2); #else __m128i tmp_hi = _mm256_extractf128_si256(a, 1); __m128i tmp_lo = _mm256_castsi256_si128(a); tmp_hi = _mm_packs_epi32(tmp_hi, tmp_hi); tmp_lo = _mm_packs_epi32(tmp_lo, tmp_lo); __m128i res = _mm_unpacklo_epi64(tmp_lo, tmp_hi); #endif return res; } inline __m128i xsimd_cvtepi32_epu16(__m256i a) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION __m256i tmp1 = _mm256_packus_epi32(a, a); __m256i tmp2 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(3, 1, 2, 0)); __m128i res = _mm256_castsi256_si128(tmp2); #else __m128i tmp_hi = _mm256_extractf128_si256(a, 1); __m128i tmp_lo = _mm256_castsi256_si128(a); tmp_hi = _mm_packus_epi32(tmp_hi, tmp_hi); tmp_lo = _mm_packus_epi32(tmp_lo, tmp_lo); __m128i res = _mm_unpacklo_epi64(tmp_lo, tmp_hi); #endif return res; } } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_neon_bool.hpp000066400000000000000000000570351410101234500224530ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_NEON_BOOL_HPP #define XSIMD_NEON_BOOL_HPP #include "xsimd_base.hpp" namespace xsimd { /******************** * batch_bool * ********************/ template struct simd_batch_traits> { using value_type = T; static constexpr std::size_t size = 8; using batch_type = batch; static constexpr std::size_t align = XSIMD_DEFAULT_ALIGNMENT; }; template class batch_bool : public simd_batch_bool> { public: using simd_type = uint16x8_t; batch_bool(); explicit batch_bool(bool b); batch_bool(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7); batch_bool(const simd_type& rhs); template batch_bool(const batch_bool& rhs); batch_bool& operator=(const simd_type& rhs); operator simd_type() const; bool_proxy operator[](std::size_t index); bool operator[](std::size_t index) const; private: batch_bool& load_values(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7); union { simd_type m_value; uint16_t m_array[8]; }; friend class simd_batch_bool>; }; /******************** * batch_bool * ********************/ template struct simd_batch_traits> { using value_type = T; static constexpr std::size_t size = 4; using batch_type = batch; static constexpr std::size_t align = XSIMD_DEFAULT_ALIGNMENT; }; template class batch_bool : public simd_batch_bool> { public: using simd_type = uint32x4_t; batch_bool(); explicit batch_bool(bool b); batch_bool(bool b0, bool b1, bool b2, bool b3); batch_bool(const simd_type& rhs); template batch_bool(const batch_bool& rhs); batch_bool& operator=(const simd_type& rhs); operator simd_type() const; bool_proxy operator[](std::size_t index); bool operator[](std::size_t index) const; private: batch_bool& load_values(bool b0, bool b1, bool b2, bool b3); union { simd_type m_value; uint32_t m_array[4]; }; friend class simd_batch_bool>; }; /******************** * batch_bool * ********************/ template struct simd_batch_traits> { using value_type = T; static constexpr std::size_t size = 2; using batch_type = batch; static constexpr std::size_t align = XSIMD_DEFAULT_ALIGNMENT; }; template class batch_bool : public simd_batch_bool> { public: using simd_type = uint64x2_t; batch_bool(); explicit batch_bool(bool b); batch_bool(bool b0, bool b1); batch_bool(const simd_type& rhs); template batch_bool(const batch_bool& rhs); batch_bool& operator=(const simd_type& rhs); operator simd_type() const; bool_proxy operator[](std::size_t index); bool operator[](std::size_t index) const; private: batch_bool& load_values(bool b0, bool b1); union { simd_type m_value; uint64_t m_array[2]; }; friend class simd_batch_bool>; }; /********************* * batch_bool * *********************/ template struct simd_batch_traits> { using value_type = T; static constexpr std::size_t size = 16; using batch_type = batch; static constexpr std::size_t align = XSIMD_DEFAULT_ALIGNMENT; }; template class batch_bool : public simd_batch_bool> { public: using simd_type = uint8x16_t; batch_bool(); explicit batch_bool(bool b); batch_bool(bool d1, bool d2, bool d3, bool d4, bool d5, bool d6, bool d7, bool d8, bool d9, bool d10, bool d11, bool d12, bool d13, bool d14, bool d15, bool d16); batch_bool(const simd_type& rhs); template batch_bool(const batch_bool& rhs); batch_bool& operator=(const simd_type& rhs); operator simd_type() const; bool_proxy operator[](std::size_t index); bool operator[](std::size_t index) const; private: batch_bool& load_values(bool d1, bool d2, bool d3, bool d4, bool d5, bool d6, bool 
d7, bool d8, bool d9, bool d10, bool d11, bool d12, bool d13, bool d14, bool d15, bool d16); union { simd_type m_value; uint8_t m_array[16]; }; friend class simd_batch_bool>; }; /*********************************** * batch_bool implementation * ***********************************/ template inline batch_bool::batch_bool() { } template inline batch_bool::batch_bool(bool b) : m_value(vdupq_n_u16( static_cast(-(int)b)) ) { } template inline batch_bool::batch_bool(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7) : m_value{ static_cast(-int(b0)), static_cast(-int(b1)), static_cast(-int(b2)), static_cast(-int(b3)), static_cast(-int(b4)), static_cast(-int(b5)), static_cast(-int(b6)), static_cast(-int(b7))} { } template inline batch_bool::batch_bool(const simd_type& rhs) : m_value(rhs) { } template template inline batch_bool::batch_bool(const batch_bool& rhs) : m_value(static_cast(rhs)) { } template inline batch_bool& batch_bool::operator=(const simd_type& rhs) { m_value = rhs; return *this; } template inline batch_bool::operator uint16x8_t() const { return m_value; } template inline bool_proxy batch_bool::operator[](std::size_t index) { return bool_proxy(m_array[index & 7]); } template inline bool batch_bool::operator[](std::size_t index) const { return static_cast(m_array[index & 7]); } template inline batch_bool& batch_bool::load_values(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7) { m_value[0] = static_cast(-int(b0)); m_value[1] = static_cast(-int(b1)); m_value[2] = static_cast(-int(b2)); m_value[3] = static_cast(-int(b3)); m_value[4] = static_cast(-int(b4)); m_value[5] = static_cast(-int(b5)); m_value[6] = static_cast(-int(b6)); m_value[7] = static_cast(-int(b7)); return *this; } namespace detail { template struct batch_bool_kernel { using batch_type = batch_bool; static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) { return vandq_u16(lhs, rhs); } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { return vorrq_u16(lhs, rhs); } static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { return veorq_u16(lhs, rhs); } static batch_type bitwise_not(const batch_type& rhs) { return vmvnq_u16(rhs); } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { return vbicq_u16(lhs, rhs); } static batch_type equal(const batch_type& lhs, const batch_type& rhs) { return vceqq_u16(lhs, rhs); } static batch_type not_equal(const batch_type& lhs, const batch_type& rhs) { return veorq_u16(lhs, rhs); } static bool all(const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vminvq_u16(rhs) != 0; #else uint16x4_t tmp = vand_u16(vget_low_u16(rhs), vget_high_u16(rhs)); tmp = vpmin_u16(tmp, tmp); tmp = vpmin_u16(tmp, tmp); return vget_lane_u16(tmp, 0) != 0; #endif } static bool any(const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vmaxvq_u16(rhs) != 0; #else uint16x4_t tmp = vorr_u16(vget_low_u16(rhs), vget_high_u16(rhs)); tmp = vpmax_u16(tmp, tmp); tmp = vpmax_u16(tmp, tmp); return vget_lane_u16(tmp, 0); #endif } }; } /*********************************** * batch_bool implementation * ***********************************/ template inline batch_bool::batch_bool() { } template inline batch_bool::batch_bool(bool b) : m_value(vdupq_n_u32( static_cast(-(int)b)) ) { } template inline batch_bool::batch_bool(bool b0, bool b1, bool b2, bool b3) : m_value{ static_cast(-int(b0)), static_cast(-int(b1)), static_cast(-int(b2)), 
static_cast(-int(b3))} { } template inline batch_bool::batch_bool(const simd_type& rhs) : m_value(rhs) { } template template inline batch_bool::batch_bool(const batch_bool& rhs) : m_value(static_cast(rhs)) { } template inline batch_bool& batch_bool::operator=(const simd_type& rhs) { m_value = rhs; return *this; } template inline batch_bool::operator uint32x4_t() const { return m_value; } template inline bool_proxy batch_bool::operator[](std::size_t index) { return bool_proxy(m_array[index & 3]); } template inline bool batch_bool::operator[](std::size_t index) const { return static_cast(m_array[index & 3]); } template inline batch_bool& batch_bool::load_values(bool b0, bool b1, bool b2, bool b3) { m_value[0] = static_cast(-int(b0)); m_value[1] = static_cast(-int(b1)); m_value[2] = static_cast(-int(b2)); m_value[3] = static_cast(-int(b3)); return *this; } namespace detail { template struct batch_bool_kernel { using batch_type = batch_bool; static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) { return vandq_u32(lhs, rhs); } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { return vorrq_u32(lhs, rhs); } static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { return veorq_u32(lhs, rhs); } static batch_type bitwise_not(const batch_type& rhs) { return vmvnq_u32(rhs); } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { return vbicq_u32(lhs, rhs); } static batch_type equal(const batch_type& lhs, const batch_type& rhs) { return vceqq_u32(lhs, rhs); } static batch_type not_equal(const batch_type& lhs, const batch_type& rhs) { return veorq_u32(lhs, rhs); } static bool all(const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vminvq_u32(rhs) != 0; #else uint32x2_t tmp = vand_u32(vget_low_u32(rhs), vget_high_u32(rhs)); return vget_lane_u32(vpmin_u32(tmp, tmp), 0) != 0; #endif } static bool any(const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vmaxvq_u32(rhs) != 0; #else uint32x2_t tmp = vorr_u32(vget_low_u32(rhs), vget_high_u32(rhs)); return vget_lane_u32(vpmax_u32(tmp, tmp), 0); #endif } }; } /************************************ * batch_bool implementation * ************************************/ template inline batch_bool::batch_bool() { } template inline batch_bool::batch_bool(bool b) : m_value(vdupq_n_u8( static_cast(-(int8_t)b)) ) { } template inline batch_bool::batch_bool(bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7, bool b8, bool b9, bool b10, bool b11, bool b12, bool b13, bool b14, bool b15, bool b16) : m_value{ static_cast(-int(b1)), static_cast(-int(b2)), static_cast(-int(b3)), static_cast(-int(b4)), static_cast(-int(b5)), static_cast(-int(b6)), static_cast(-int(b7)), static_cast(-int(b8)), static_cast(-int(b9)), static_cast(-int(b10)), static_cast(-int(b11)), static_cast(-int(b12)), static_cast(-int(b13)), static_cast(-int(b14)), static_cast(-int(b15)), static_cast(-int(b16))} { } template inline batch_bool::batch_bool(const simd_type& rhs) : m_value(rhs) { } template template inline batch_bool::batch_bool(const batch_bool& rhs) : m_value(static_cast(rhs)) { } template inline batch_bool& batch_bool::operator=(const simd_type& rhs) { m_value = rhs; return *this; } template inline batch_bool::operator uint8x16_t() const { return m_value; } template inline bool_proxy batch_bool::operator[](std::size_t index) { return bool_proxy(m_array[index & 15]); } template inline bool batch_bool::operator[](std::size_t 
index) const { return static_cast(m_array[index & 15]); } template inline batch_bool& batch_bool::load_values(bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7, bool b8, bool b9, bool b10, bool b11, bool b12, bool b13, bool b14, bool b15, bool b16) { m_value[0] = static_cast(-int(b1)); m_value[1] = static_cast(-int(b2)); m_value[2] = static_cast(-int(b3)); m_value[3] = static_cast(-int(b4)); m_value[4] = static_cast(-int(b5)); m_value[5] = static_cast(-int(b6)); m_value[6] = static_cast(-int(b7)); m_value[7] = static_cast(-int(b8)); m_value[8] = static_cast(-int(b9)); m_value[9] = static_cast(-int(b10)); m_value[10] = static_cast(-int(b11)); m_value[11] = static_cast(-int(b12)); m_value[12] = static_cast(-int(b13)); m_value[13] = static_cast(-int(b14)); m_value[14] = static_cast(-int(b15)); m_value[15] = static_cast(-int(b16)); return *this; } namespace detail { template struct batch_bool_kernel { using batch_type = batch_bool; static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) { return vandq_u8(lhs, rhs); } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { return vorrq_u8(lhs, rhs); } static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { return veorq_u8(lhs, rhs); } static batch_type bitwise_not(const batch_type& rhs) { return vmvnq_u8(rhs); } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { return vbicq_u8(lhs, rhs); } static batch_type equal(const batch_type& lhs, const batch_type& rhs) { return vceqq_u8(lhs, rhs); } static batch_type not_equal(const batch_type& lhs, const batch_type& rhs) { return veorq_u8(lhs, rhs); } static bool all(const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vminvq_u8(rhs) != 0; #else uint8x8_t tmp = vand_u8(vget_low_u8(rhs), vget_high_u8(rhs)); tmp = vpmin_u8(tmp, tmp); tmp = vpmin_u8(tmp, tmp); tmp = vpmin_u8(tmp, tmp); return vget_lane_u8(tmp, 0) != 0; #endif } static bool any(const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vmaxvq_u8(rhs) != 0; #else uint8x8_t tmp = vorr_u8(vget_low_u8(rhs), vget_high_u8(rhs)); tmp = vpmax_u8(tmp, tmp); tmp = vpmax_u8(tmp, tmp); tmp = vpmax_u8(tmp, tmp); return vget_lane_u8(tmp, 0); #endif } }; } /*********************************** * batch_bool implementation * ***********************************/ template inline batch_bool::batch_bool() { } template inline batch_bool::batch_bool(bool b) : m_value(vdupq_n_u64(static_cast(-(int)b))) { } template inline batch_bool::batch_bool(bool b0, bool b1) : m_value{ static_cast(-int(b0)), static_cast(-int(b1))} { } template inline batch_bool::batch_bool(const simd_type& rhs) : m_value(rhs) { } template template inline batch_bool::batch_bool(const batch_bool& rhs) : m_value(static_cast(rhs)) { } template inline batch_bool& batch_bool::operator=(const simd_type& rhs) { m_value = rhs; return *this; } template inline batch_bool::operator uint64x2_t() const { return m_value; } template inline bool_proxy batch_bool::operator[](std::size_t index) { return bool_proxy(m_array[index & 1]); } template inline bool batch_bool::operator[](std::size_t index) const { return static_cast(m_array[index & 1]); } template inline batch_bool& batch_bool::load_values(bool b0, bool b1) { m_value[0] = static_cast(-int(b0)); m_value[1] = static_cast(-int(b1)); return *this; } namespace detail { template struct batch_bool_kernel { using batch_type = batch_bool; static batch_type bitwise_and(const batch_type& lhs, const 
batch_type& rhs) { return vandq_u64(lhs, rhs); } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { return vorrq_u64(lhs, rhs); } static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { return veorq_u64(lhs, rhs); } static batch_type bitwise_not(const batch_type& rhs) { return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(rhs))); } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { // According to Eigen return vbicq_u64(lhs, rhs); } static batch_type equal(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vceqq_u64(lhs, rhs); #else return vreinterpretq_u64_u32(vceqq_u32(vreinterpretq_u32_u64(lhs), vreinterpretq_u32_u64(rhs))); #endif } static batch_type not_equal(const batch_type& lhs, const batch_type& rhs) { return veorq_u64(lhs, rhs); } static bool all(const batch_type& rhs) { uint64x1_t tmp = vand_u64(vget_low_u64(rhs), vget_high_u64(rhs)); return vget_lane_u64(tmp, 0) != 0; } static bool any(const batch_type& rhs) { uint64x1_t tmp = vorr_u64(vget_low_u64(rhs), vget_high_u64(rhs)); return bool(vget_lane_u64(tmp, 0)); } }; } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_neon_complex.hpp000066400000000000000000000565671410101234500232000ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_NEON_COMPLEX_HPP #define XSIMD_NEON_COMPLEX_HPP #include #include #ifdef XSIMD_ENABLE_XTL_COMPLEX #include "xtl/xcomplex.hpp" #endif #include "xsimd_neon_float.hpp" #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION #include "xsimd_neon_double.hpp" #endif #include "xsimd_complex_base.hpp" namespace xsimd { /************************************** * batch_bool, 4> * **************************************/ template <> struct simd_batch_traits, 4>> : complex_batch_bool_traits, float, 4, 32> { }; template<> class batch_bool, 4> : public simd_complex_batch_bool, 4>> { public: using self_type = batch_bool, 4>; using base_type = simd_complex_batch_bool; using real_batch = batch_bool; batch_bool() = default; using base_type::base_type; // VS2015 has a bug with inheriting constructors involving SFINAE batch_bool(bool b0, bool b1, bool b2, bool b3) : base_type(real_batch(b0, b1, b2, b3)) { } }; namespace detail { template <> struct batch_bool_kernel, 4> : batch_bool_complex_kernel, 4> { }; } /********************************* * batch, 4> * *********************************/ template <> struct simd_batch_traits, 4>> : complex_batch_traits, float, 4, 32> { }; template <> class batch, 4> : public simd_complex_batch, 4>> { public: using self_type = batch, 4>; using base_type = simd_complex_batch; using value_type = std::complex; using real_batch = batch; batch() = default; using base_type::base_type; // VS2015 has a bug with inheriting constructors involving SFINAE batch(value_type c0, value_type c1, value_type c2, value_type c3) : base_type(real_batch(c0.real(), c1.real(), c2.real(), c3.real()), real_batch(c0.imag(), c1.imag(), c2.imag(), c3.imag())) { } using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using 
base_type::store_unaligned; self_type& load_aligned(const std::complex* src); self_type& load_unaligned(const std::complex* src); self_type& load_aligned(const std::complex* src); self_type& load_unaligned(const std::complex* src); void store_aligned(std::complex* dst) const; void store_unaligned(std::complex* dst) const; void store_aligned(std::complex* dst) const; void store_unaligned(std::complex* dst) const; }; /************************************************ * batch, 4> implementation * ************************************************/ inline auto batch, 4>::load_aligned(const std::complex* src) -> self_type& { const float* buf = reinterpret_cast(src); float32x4x2_t tmp = vld2q_f32(buf); this->m_real = tmp.val[0]; this->m_imag = tmp.val[1]; return *this; } inline auto batch, 4>::load_unaligned(const std::complex* src) -> self_type& { return load_aligned(src); } inline auto batch, 4>::load_aligned(const std::complex* src) -> self_type& { const double* buf = reinterpret_cast(src); #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION float64x2x2_t tmp0 = vld2q_f64(buf); float64x2x2_t tmp1 = vld2q_f64(buf + 4); float32x2x2_t tmp2 { vcvt_f32_f64(tmp0.val[0]), vcvt_f32_f64(tmp0.val[1]) }; float32x2x2_t tmp3 { vcvt_f32_f64(tmp1.val[0]), vcvt_f32_f64(tmp1.val[1]) }; this->m_real = vcombine_f32(tmp2.val[0], tmp3.val[0]); this->m_imag = vcombine_f32(tmp2.val[1], tmp3.val[1]); #else this->m_real = real_batch(static_cast(buf[0]), static_cast(buf[2]), static_cast(buf[4]), static_cast(buf[6])); this->m_imag = real_batch(static_cast(buf[1]), static_cast(buf[3]), static_cast(buf[5]), static_cast(buf[7])); #endif return *this; } inline auto batch, 4>::load_unaligned(const std::complex* src) -> self_type& { return load_aligned(src); } inline void batch, 4>::store_aligned(std::complex* dst) const { float32x4x2_t tmp; tmp.val[0] = this->m_real; tmp.val[1] = this->m_imag; float* buf = reinterpret_cast(dst); vst2q_f32(buf, tmp); } inline void batch, 4>::store_unaligned(std::complex* dst) const { store_aligned(dst); } inline void batch, 4>::store_aligned(std::complex* dst) const { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION double* buf = reinterpret_cast(dst); float64x2x2_t tmp0 { vcvt_f64_f32(vget_low_f32(this->m_real)), vcvt_f64_f32(vget_low_f32(this->m_imag)) }; float64x2x2_t tmp1 { vcvt_f64_f32(vget_high_f32(this->m_real)), vcvt_f64_f32(vget_high_f32(this->m_imag)) }; vst2q_f64(buf, tmp0); vst2q_f64(buf + 4, tmp1); #else for (std::size_t i = 0; i < 4; ++i) { dst[i] = std::complex(this->m_real[i], this->m_imag[i]); } #endif } inline void batch, 4>::store_unaligned(std::complex* dst) const { store_aligned(dst); } #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION /*************************************** * batch_bool, 2> * ***************************************/ template <> struct simd_batch_traits, 2>> : complex_batch_bool_traits, double, 2, 32> { }; template<> class batch_bool, 2> : public simd_complex_batch_bool, 2>> { public: using self_type = batch_bool, 2>; using base_type = simd_complex_batch_bool; using real_batch = batch_bool; batch_bool() = default; using base_type::base_type; // VS2015 has a bug with inheriting constructors involving SFINAE batch_bool(bool b0, bool b1) : base_type(real_batch(b0, b1)) { } }; namespace detail { template <> struct batch_bool_kernel, 2> : batch_bool_complex_kernel, 2> { }; } /********************************** * batch, 2> * **********************************/ template <> struct simd_batch_traits, 2>> : complex_batch_traits, double, 2, 32> { }; 
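// The complex batches above keep the real parts and the imaginary parts in two
// separate NEON registers, so loading from an interleaved array of complex
// values maps directly onto the de-interleaving load/store intrinsics:
// vld2q_f32 reads eight consecutive floats and splits them into
// val[0] = {re0, re1, re2, re3} and val[1] = {im0, im1, im2, im3}, while
// vst2q_f32 re-interleaves the two registers on the way out. A minimal
// standalone sketch of the same pattern (illustrative only, not part of the
// library; split_complex is a hypothetical helper):
//
//     #include <arm_neon.h>
//     #include <complex>
//
//     inline void split_complex(const std::complex<float>* src,
//                               float32x4_t& re, float32x4_t& im)
//     {
//         // std::complex<float> is layout-compatible with float[2]
//         const float* buf = reinterpret_cast<const float*>(src);
//         float32x4x2_t tmp = vld2q_f32(buf); // de-interleave 4 complex values
//         re = tmp.val[0];
//         im = tmp.val[1];
//     }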
template <> class batch, 2> : public simd_complex_batch, 2>> { public: using self_type = batch, 2>; using base_type = simd_complex_batch; using value_type = std::complex; using real_batch = batch; batch() = default; using base_type::base_type; // VS2015 has a bug with inheriting constructors involving SFINAE batch(value_type c0, value_type c1) : base_type(real_batch(c0.real(), c1.real()), real_batch(c0.imag(), c1.imag())) { } using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; self_type& load_aligned(const std::complex* src); self_type& load_unaligned(const std::complex* src); self_type& load_aligned(const std::complex* src); self_type& load_unaligned(const std::complex* src); void store_aligned(std::complex* dst) const; void store_unaligned(std::complex* dst) const; void store_aligned(std::complex* dst) const; void store_unaligned(std::complex* dst) const; }; /************************************************* * batch, 2> implementation * *************************************************/ inline auto batch, 2>::load_aligned(const std::complex* src) -> self_type& { const float* buf = reinterpret_cast(src); float32x2x2_t tmp = vld2_f32(buf); this->m_real = vcvt_f64_f32(tmp.val[0]); this->m_imag = vcvt_f64_f32(tmp.val[1]); return *this; } inline auto batch, 2>::load_unaligned(const std::complex* src) -> self_type& { return load_aligned(src); } inline auto batch, 2>::load_aligned(const std::complex* src) -> self_type& { const double* buf = reinterpret_cast(src); float64x2x2_t tmp = vld2q_f64(buf); this->m_real = tmp.val[0]; this->m_imag = tmp.val[1]; return *this; } inline auto batch, 2>::load_unaligned(const std::complex* src) -> self_type& { return load_aligned(src); } inline void batch, 2>::store_aligned(std::complex* dst) const { float* buf = reinterpret_cast(dst); float32x2x2_t tmp { vcvt_f32_f64(this->m_real), vcvt_f32_f64(this->m_imag) }; vst2_f32(buf, tmp); } inline void batch, 2>::store_unaligned(std::complex* dst) const { return store_aligned(dst); } inline void batch, 2>::store_aligned(std::complex* dst) const { float64x2x2_t tmp; tmp.val[0] = this->m_real; tmp.val[1] = this->m_imag; double* buf = reinterpret_cast(dst); vst2q_f64(buf, tmp); } inline void batch, 2>::store_unaligned(std::complex* dst) const { return store_aligned(dst); } #endif #ifdef XSIMD_ENABLE_XTL_COMPLEX /**************************************************** * batch_bool, 4> * ****************************************************/ template struct simd_batch_traits, 4>> : complex_batch_bool_traits, float, 4, 16> { }; template class batch_bool, 4> : public simd_complex_batch_bool, 4>> { public: batch_bool() = default; using simd_complex_batch_bool::simd_complex_batch_bool; using real_batch = batch_bool; // VS2015 has a bug with inheriting constructors involving SFINAE batch_bool(bool b0, bool b1, bool b2, bool b3) : simd_complex_batch_bool(real_batch(b0, b1, b2, b3)) { } }; namespace detail { template struct batch_bool_kernel, 4> : batch_bool_complex_kernel, 4> { }; } /*********************************************** * batch, 4> * ***********************************************/ template struct simd_batch_traits, 4>> : complex_batch_traits, float, 4, 16> { }; template class batch, 4> : public simd_complex_batch, 4>> { public: using base_type = batch, 4>; using self_type = simd_complex_batch; using value_type = xtl::xcomplex; using real_batch = batch; batch() = default; using base_type::base_type; // VS2015 has a bug with inheriting 
constructors involving SFINAE batch(value_type c0, value_type c1, value_type c2, value_type c3) : base_type(real_batch(c0.real(), c1.real(), c2.real(), c3.real()), real_batch(c0.imag(), c1.imag(), c2.imag(), c3.imag())) { } using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; self_type& load_aligned(const std::complex* src); self_type& load_unaligned(const std::complex* src); self_type& load_aligned(const std::complex* src); self_type& load_unaligned(const std::complex* src); void store_aligned(std::complex* dst) const; void store_unaligned(std::complex* dst) const; void store_aligned(std::complex* dst) const; void store_unaligned(std::complex* dst) const; }; /******************************************************************* * batch, 4> implementation * *******************************************************************/ template inline auto batch, 4>::load_aligned(const value_type* src) -> self_type& { const float* buf = reinterpret_cast(src); float32x4x2_t tmp = vld2q_f32(buf); this->m_real = tmp.val[0]; this->m_imag = tmp.val[1]; return *this; } template inline auto batch, 4>::load_unaligned(const value_type* src) -> self_type& { return load_aligned(src); } template inline auto batch, 4>::load_aligned(const xtl::complex* src) -> self_type& { const double* buf = reinterpret_cast(src); #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION float64x2x2_t tmp0 = vld2q_f64(buf); float64x2x2_t tmp1 = vld2q_f64(buf + 4); float32x2x2_t tmp2{ vcvt_f32_f64(tmp0.val[0]), vcvt_f32_f64(tmp0.val[1]) }; float32x2x2_t tmp3{ vcvt_f32_f64(tmp1.val[0]), vcvt_f32_f64(tmp1.val[1]) }; this->m_real = vcombine_f32(tmp2.val[0], tmp3.val[0]); this->m_imag = vcombine_f32(tmp2.val[1], tmp3.val[1]); #else this->m_real = real_batch(static_cast(buf[0]), static_cast(buf[2]), static_cast(buf[4]), static_cast(buf[6])); this->m_imag = real_batch(static_cast(buf[1]), static_cast(buf[3]), static_cast(buf[5]), static_cast(buf[7])); #endif return *this; } template inline auto batch, 4>::load_unaligned(const xtl::complex* src) -> self_type& { return load_unaligned(src); } template inline void batch, 4>::store_aligned(value_type* dst) const { float32x4x2_t tmp; tmp.val[0] = this->m_real; tmp.val[1] = this->m_imag; float* buf = reinterpret_cast(dst); vst2q_f32(buf, tmp); } template inline void batch, 4>::store_unaligned(value_type* dst) const { store_aligned(dst); } template inline void batch::store_aligned(xtl::xcomplex* dst) const { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION double* buf = reinterpret_cast(dst); float64x2x2_t tmp0{ vcvt_f64_f32(vget_low_f32(this->m_real)), vcvt_f64_f32(vget_low_f32(this->m_imag)) }; float64x2x2_t tmp1{ vcvt_f64_f32(vget_high_f32(this->m_real)), vcvt_f64_f32(vget_high_f32(this->m_imag)) }; vst2q_f64(buf, tmp0); vst2q_f64(buf + 4, tmp1); #else for (std::size_t i = 0; i < 4; ++i) { dst[i] = std::complex(this->m_real[i], this->m_imag[i]); } #endif } template inline void batch, 4>::store_unaligned(xtl::xcomplex* dst) const { store_aligned(dst); } #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION /****************************************************** * batch_bool, 2> * ******************************************************/ template struct simd_batch_traits, 2>> : complex_batch_bool_traits, double, 2, 16> { }; template class batch_bool, 2> : public simd_complex_batch_bool, 2>> { public: using self_type = batch_bool, 2>; using base_type = simd_complex_batch_bool; using real_batch = batch_bool; batch_bool() = 
default; using base_type::base_type; // VS2015 has a bug with inheriting constructors involving SFINAE batch_bool(bool b0, bool b1) : base_type(real_batch(b0, b1)) { } }; namespace detail { template struct batch_bool_kernel, 2> : batch_bool_complex_kernel, 2> { }; } /************************************************* * batch, 2> * *************************************************/ template struct simd_batch_traits, 2>> : complex_batch_traits, double, 2, 16> { }; template class batch, 2> : public simd_complex_batch, 2>> { public: using self_type = batch, 2>; using base_type = simd_complex_batch; using value_type = xtl::xcomplex; using real_batch = batch; batch() = default; using base_type::base_type; // VS2015 has a bug with inheriting constructors involving SFINAE batch(value_type c0, value_type c1) : base_type(real_batch(c0.real(), c1.real()), real_batch(c0.imag(), c1.imag())) { } using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; self_type& load_aligned(const std::complex* src); self_type& load_unaligned(const std::complex* src); self_type& load_aligned(const std::complex* src); self_type& load_unaligned(const std::complex* src); void store_aligned(std::complex* dst) const; void store_unaligned(std::complex* dst) const; void store_aligned(std::complex* dst) const; void store_unaligned(std::complex* dst) const; }; /******************************************************************* * batch, 2> implementation * *******************************************************************/ template inline auto batch, 2>::load_aligned(const xtl::xcomplex* src) -> self_type& { const float* buf = reinterpret_cast(src); float32x2x2_t tmp = vld2_f32(buf); this->m_real = vcvt_f64_f32(tmp.val[0]); this->m_imag = vcvt_f64_f32(tmp.val[1]); return *this; } template inline auto batch, 2>::load_unaligned(const xtl::xcomplex* src) -> self_type& { return load_aligned(src); } template inline auto batch, 2>::load_aligned(const value_type* src) -> self_type& { const double* buf = reinterpret_cast(src); float64x2x2_t tmp = vld2q_f64(buf); this->m_real = tmp.val[0]; this->m_imag = tmp.val[1]; return *this; } template inline auto batch, 2>::load_unaligned(const value_type* src) -> self_type& { return load_aligned(src); } template inline void batch, 2>::store_aligned(xtl::xcomplex* dst) const { float* buf = reinterpret_cast(dst); float32x2x2_t tmp{ vcvt_f32_f64(this->m_real), vcvt_f32_f64(this->m_imag) }; vst2_f32(buf, tmp); } template inline void batch, 2>::store_unaligned(xtl::xcomplex* dst) const { return store_aligned(dst); } template inline void batch, 2>::store_aligned(value_type* dst) const { float64x2x2_t tmp; tmp.val[0] = this->m_real; tmp.val[1] = this->m_imag; double* buf = reinterpret_cast(dst); vst2q_f64(buf, tmp); } template inline void batch, 2>::store_unaligned(value_type* dst) const { return store_aligned(dst); } #endif #endif } #endif xsimd-7.6.0/include/xsimd/types/xsimd_neon_conversion.hpp000066400000000000000000000174621410101234500237050ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_NEON_CONVERSION_HPP #define XSIMD_NEON_CONVERSION_HPP #include "xsimd_neon_bool.hpp" #include "xsimd_neon_float.hpp" #include "xsimd_neon_int8.hpp" #include "xsimd_neon_int16.hpp" #include "xsimd_neon_int32.hpp" #include "xsimd_neon_int64.hpp" #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION #include "xsimd_neon_double.hpp" #endif #include "xsimd_neon_uint32.hpp" #include "xsimd_neon_uint64.hpp" #include "xsimd_neon_uint16.hpp" #include "xsimd_neon_uint8.hpp" namespace xsimd { /************************ * conversion functions * ************************/ batch to_int(const batch& x); batch to_float(const batch& x); batch u8_to_u16(const batch& x); batch u16_to_u8(const batch& x); batch u8_to_u32(const batch& x); batch u32_to_u8(const batch& x); batch u8_to_u64(const batch& x); batch u64_to_u8(const batch& x); #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION batch to_int(const batch& x); batch to_float(const batch& x); #endif /************************** * boolean cast functions * **************************/ batch_bool bool_cast(const batch_bool& x); batch_bool bool_cast(const batch_bool& x); #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION batch_bool bool_cast(const batch_bool& x); batch_bool bool_cast(const batch_bool& x); #endif /******************************* * bitwise_cast implementation * *******************************/ #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION XSIMD_DEFINE_BITWISE_CAST_FLOAT(double, 2) #endif XSIMD_DEFINE_BITWISE_CAST_FLOAT(float, 4) XSIMD_DEFINE_BITWISE_CAST(int64_t, 2) XSIMD_DEFINE_BITWISE_CAST(uint64_t, 2) XSIMD_DEFINE_BITWISE_CAST(int32_t, 4) XSIMD_DEFINE_BITWISE_CAST(uint32_t, 4) XSIMD_DEFINE_BITWISE_CAST(int16_t, 8) XSIMD_DEFINE_BITWISE_CAST(uint16_t, 8) XSIMD_DEFINE_BITWISE_CAST(int8_t, 16) XSIMD_DEFINE_BITWISE_CAST(uint8_t, 16) /*************************************** * conversion functions implementation * ***************************************/ inline batch to_int(const batch& x) { return vcvtq_s32_f32(x); } inline batch to_float(const batch& x) { return vcvtq_f32_s32(x); } inline batch u8_to_u16(const batch& x) { return vreinterpretq_u16_u8(x); } inline batch u16_to_u8(const batch& x) { return vreinterpretq_u8_u16(x); } inline batch u8_to_u32(const batch& x) { return vreinterpretq_u32_u8(x); } inline batch u32_to_u8(const batch& x) { return vreinterpretq_u8_u32(x); } inline batch u8_to_u64(const batch& x) { return vreinterpretq_u64_u8(x); } inline batch u64_to_u8(const batch& x) { return vreinterpretq_u8_u64(x); } #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION inline batch to_int(const batch& x) { return vcvtq_s64_f64(x); } inline batch to_float(const batch& x) { return vcvtq_f64_s64(x); } #endif /***************************************** * batch cast functions implementation * *****************************************/ XSIMD_BATCH_CAST_INTRINSIC(int8_t, uint8_t, 16, vreinterpretq_u8_s8); XSIMD_BATCH_CAST_INTRINSIC(uint8_t, int8_t, 16, vreinterpretq_s8_u8); XSIMD_BATCH_CAST_INTRINSIC(int16_t, uint16_t, 8, vreinterpretq_u16_s16); XSIMD_BATCH_CAST_INTRINSIC(uint16_t, int16_t, 8, vreinterpretq_s16_u16); XSIMD_BATCH_CAST_INTRINSIC(int32_t, uint32_t, 4, vreinterpretq_u32_s32); XSIMD_BATCH_CAST_INTRINSIC(int32_t, float, 4, vcvtq_f32_s32); XSIMD_BATCH_CAST_INTRINSIC(uint32_t, int32_t, 4, vreinterpretq_s32_u32); XSIMD_BATCH_CAST_INTRINSIC(uint32_t, float, 4, vcvtq_f32_u32); XSIMD_BATCH_CAST_INTRINSIC(float, int32_t, 4, vcvtq_s32_f32); 
XSIMD_BATCH_CAST_INTRINSIC(float, uint32_t, 4, vcvtq_u32_f32); #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION XSIMD_BATCH_CAST_INTRINSIC(int64_t, uint64_t, 2, vreinterpretq_u64_s64); XSIMD_BATCH_CAST_INTRINSIC(int64_t, double, 2, vcvtq_f64_s64); XSIMD_BATCH_CAST_INTRINSIC(uint64_t, int64_t, 2, vreinterpretq_s64_u64); XSIMD_BATCH_CAST_INTRINSIC(uint64_t, double, 2, vcvtq_f64_u64); XSIMD_BATCH_CAST_INTRINSIC(double, int64_t, 2, vcvtq_s64_f64); XSIMD_BATCH_CAST_INTRINSIC(double, uint64_t, 2, vcvtq_u64_f64); #endif /************************** * boolean cast functions * **************************/ inline batch_bool bool_cast(const batch_bool& x) { return x; } inline batch_bool bool_cast(const batch_bool& x) { return x; } #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION inline batch_bool bool_cast(const batch_bool& x) { return x; } inline batch_bool bool_cast(const batch_bool& x) { return x; } #endif /***************************************** * bitwise cast functions implementation * *****************************************/ XSIMD_BITWISE_CAST_INTRINSIC(float, 4, int32_t, 4, vreinterpretq_s32_f32) XSIMD_BITWISE_CAST_INTRINSIC(float, 4, int64_t, 2, vreinterpretq_s64_f32) XSIMD_BITWISE_CAST_INTRINSIC(int32_t, 4, float, 4, vreinterpretq_f32_s32) XSIMD_BITWISE_CAST_INTRINSIC(int64_t, 2, float, 4, vreinterpretq_f32_s64) #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION XSIMD_BITWISE_CAST_INTRINSIC(double, 2, float, 4, vreinterpretq_f32_f64) XSIMD_BITWISE_CAST_INTRINSIC(double, 2, int32_t, 4, vreinterpretq_s32_f64) XSIMD_BITWISE_CAST_INTRINSIC(double, 2, int64_t, 2, vreinterpretq_s64_f64) XSIMD_BITWISE_CAST_INTRINSIC(int32_t, 4, double, 2, vreinterpretq_f64_s32) XSIMD_BITWISE_CAST_INTRINSIC(int64_t, 2, double, 2, vreinterpretq_f64_s64) XSIMD_BITWISE_CAST_INTRINSIC(float, 4, double, 2, vreinterpretq_f64_f32) #endif } #endif xsimd-7.6.0/include/xsimd/types/xsimd_neon_double.hpp000066400000000000000000000464551410101234500227760ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_NEON_DOUBLE_HPP #define XSIMD_NEON_DOUBLE_HPP #include "xsimd_base.hpp" namespace xsimd { template <> struct simd_batch_traits> { using value_type = double; static constexpr std::size_t size = 2; using batch_bool_type = batch_bool; static constexpr std::size_t align = XSIMD_DEFAULT_ALIGNMENT; using storage_type = float64x2_t; }; template <> class batch : public simd_batch> { public: using self_type = batch; using base_type = simd_batch; using storage_type = typename base_type::storage_type; using batch_bool_type = typename base_type::batch_bool_type; batch(); explicit batch(double d); batch(double d0, double d1); explicit batch(const double* src); batch(const double* src, aligned_mode); batch(const double* src, unaligned_mode); batch(const storage_type& rhs); batch& operator=(const storage_type& rhs); batch(const batch_bool_type& rhs); batch& operator=(const batch_bool_type& rhs); operator storage_type() const; XSIMD_DECLARE_LOAD_STORE_ALL(double, 2) XSIMD_DECLARE_LOAD_STORE_LONG(double, 2) using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; }; /** * Implementation of batch */ inline batch::batch() { } inline batch::batch(double d) : base_type(vdupq_n_f64(d)) { } inline batch::batch(double d1, double d2) : base_type(storage_type{ d1, d2 }) { } inline batch::batch(const double* d) : base_type(vld1q_f64(d)) { } inline batch::batch(const double* d, aligned_mode) : batch(d) { } inline batch::batch(const double* d, unaligned_mode) : batch(d) { } inline batch::batch(const storage_type& rhs) : base_type(rhs) { } inline batch& batch::operator=(const storage_type& rhs) { this->m_value = rhs; return *this; } inline batch::batch(const batch_bool_type& rhs) : base_type(vreinterpretq_f64_u64(vandq_u64(rhs, vreinterpretq_u64_f64(batch(1.))))) { } inline batch& batch::operator=(const batch_bool_type& rhs) { this->m_value = vreinterpretq_f64_u64(vandq_u64(rhs, vreinterpretq_u64_f64(batch(1.)))); return *this; } inline batch& batch::load_aligned(const int8_t* src) { int8x8_t tmp = vld1_s8((const int8_t*)src); int16x8_t tmp2 = vmovl_s8(tmp); int16x4_t tmp3 = vget_low_s16(tmp2); int32x4_t tmp4 = vmovl_s16(tmp3); float32x4_t tmp5 = vcvtq_f32_s32(tmp4); float32x2_t tmp6 = vget_low_f32(tmp5); this->m_value = vcvt_f64_f32(tmp6); return *this; } inline batch& batch::load_unaligned(const int8_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const uint8_t* src) { uint8x8_t tmp = vld1_u8((const uint8_t*)src); uint16x8_t tmp2 = vmovl_u8(tmp); uint16x4_t tmp3 = vget_low_u16(tmp2); uint32x4_t tmp4 = vmovl_u16(tmp3); float32x4_t tmp5 = vcvtq_f32_u32(tmp4); float32x2_t tmp6 = vget_low_f32(tmp5); this->m_value = vcvt_f64_f32(tmp6); return *this; } XSIMD_DEFINE_LOAD_STORE(double, 2, bool, XSIMD_DEFAULT_ALIGNMENT) inline batch& batch::load_unaligned(const uint8_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const int16_t* src) { int16x4_t tmp1 = vld1_s16(src); int32x4_t tmp2 = vmovl_s16(tmp1); float32x4_t tmp3 = vcvtq_f32_s32(tmp2); float32x2_t tmp4 = vget_low_f32(tmp3); this->m_value = vcvt_f64_f32(tmp4); return *this; } inline batch& batch::load_unaligned(const int16_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const uint16_t* src) { uint16x4_t tmp1 = vld1_u16(src); uint32x4_t tmp2 = vmovl_u16(tmp1); float32x4_t tmp3 = vcvtq_f32_u32(tmp2); float32x2_t tmp4 = vget_low_f32(tmp3); 
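// The batch holds only two doubles, so just the low two of the four converted
// float lanes are kept; vcvt_f64_f32 below then widens them from f32 to f64.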
this->m_value = vcvt_f64_f32(tmp4); return *this; } inline batch& batch::load_unaligned(const uint16_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const int32_t* d) { this->m_value = vcvt_f64_f32(vcvt_f32_s32(vld1_s32(d))); return *this; } inline batch& batch::load_unaligned(const int32_t* d) { return load_aligned(d); } inline batch& batch::load_aligned(const uint32_t* d) { this->m_value = vcvt_f64_f32(vcvt_f32_u32(vld1_u32(d))); return *this; } inline batch& batch::load_unaligned(const uint32_t* d) { return load_aligned(d); } inline batch& batch::load_aligned(const int64_t* d) { this->m_value = vcvtq_f64_s64(vld1q_s64(d)); return *this; } inline batch& batch::load_unaligned(const int64_t* d) { return load_aligned(d); } inline batch& batch::load_aligned(const uint64_t* d) { this->m_value = vcvtq_f64_u64(vld1q_u64(d)); return *this; } inline batch& batch::load_unaligned(const uint64_t* d) { return load_aligned(d); } XSIMD_DEFINE_LOAD_STORE_LONG(double, 2, 16) inline batch& batch::load_aligned(const float* d) { this->m_value = vcvt_f64_f32(vld1_f32(d)); return *this; } inline batch& batch::load_unaligned(const float* d) { return load_aligned(d); } inline batch& batch::load_aligned(const double* d) { this->m_value = vld1q_f64(d); return *this; } inline batch& batch::load_unaligned(const double* d) { return load_aligned(d); } inline void batch::store_aligned(int8_t* dst) const { float32x2_t tmp = vcvt_f32_f64(this->m_value); int32x2_t tmp2 = vcvtn_s32_f32(tmp); int32x4_t tmp3 = vcombine_s32(tmp2, vdup_n_s32(0)); int16x4_t tmp4 = vmovn_s32(tmp3); int16x8_t tmp5 = vcombine_s16(tmp4, vdup_n_s16(0)); int8x8_t tmp6 = vmovn_s16(tmp5); vst1_s8((int8_t*)dst, tmp6); } inline void batch::store_unaligned(int8_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(uint8_t* dst) const { float32x2_t tmp = vcvt_f32_f64(this->m_value); uint32x2_t tmp2 = vcvtn_u32_f32(tmp); uint32x4_t tmp3 = vcombine_u32(tmp2, vdup_n_u32(0)); uint16x4_t tmp4 = vmovn_u32(tmp3); uint16x8_t tmp5 = vcombine_u16(tmp4, vdup_n_u16(0)); uint8x8_t tmp6 = vmovn_u16(tmp5); vst1_u8((uint8_t*)dst, tmp6); } inline void batch::store_unaligned(uint8_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(int16_t* dst) const { float32x2_t tmp = vcvt_f32_f64(this->m_value); int32x2_t tmp2 = vcvtn_s32_f32(tmp); int32x4_t tmp3 = vcombine_s32(tmp2, vdup_n_s32(0)); int16x4_t tmp4 = vmovn_s32(tmp3); vst1_s16((int16_t*)dst, tmp4); } inline void batch::store_unaligned(int16_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(uint16_t* dst) const { float32x2_t tmp = vcvt_f32_f64(this->m_value); uint32x2_t tmp2 = vcvtn_u32_f32(tmp); uint32x4_t tmp3 = vcombine_u32(tmp2, vdup_n_u32(0)); uint16x4_t tmp4 = vmovn_u32(tmp3); vst1_u16((uint16_t*)dst, tmp4); } inline void batch::store_unaligned(uint16_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(int32_t* dst) const { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION vst1_s32(dst, vcvt_s32_f32(vcvt_f32_f64(m_value))); #else dst[0] = static_cast(this->m_value[0]); dst[1] = static_cast(this->m_value[1]); #endif } inline void batch::store_unaligned(int32_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(uint32_t* dst) const { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION vst1_u32(dst, vcvt_u32_f32(vcvt_f32_f64(m_value))); #else dst[0] = static_cast(this->m_value[0]); dst[1] = static_cast(this->m_value[1]); #endif } inline void batch::store_unaligned(uint32_t* dst) const { 
store_aligned(dst); } inline void batch::store_aligned(int64_t* dst) const { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION vst1q_s64(dst, vcvtq_s64_f64(m_value)); #else dst[0] = static_cast(this->m_value[0]); dst[1] = static_cast(this->m_value[1]); #endif } inline void batch::store_unaligned(int64_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(uint64_t* dst) const { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION vst1q_u64(dst, vcvtq_u64_f64(m_value)); #else dst[0] = static_cast(this->m_value[0]); dst[1] = static_cast(this->m_value[1]); #endif } inline void batch::store_unaligned(uint64_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(float* dst) const { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION vst1_f32(dst, vcvt_f32_f64(m_value)); #else dst[0] = static_cast(this->m_value[0]); dst[1] = static_cast(this->m_value[1]); #endif } inline void batch::store_unaligned(float* dst) const { store_aligned(dst); } inline void batch::store_aligned(double* dst) const { vst1q_f64(dst, m_value); } inline void batch::store_unaligned(double* dst) const { store_aligned(dst); } inline batch::operator float64x2_t() const { return this->m_value; } namespace detail { template <> struct batch_kernel { using batch_type = batch; using value_type = double; using batch_bool_type = batch_bool; static batch_type neg(const batch_type& rhs) { return vnegq_f64(rhs); } static batch_type add(const batch_type& lhs, const batch_type& rhs) { return vaddq_f64(lhs, rhs); } static batch_type sub(const batch_type& lhs, const batch_type& rhs) { return vsubq_f64(lhs, rhs); } static batch_type sadd(const batch_type& lhs, const batch_type& rhs) { return add(lhs, rhs); } static batch_type ssub(const batch_type& lhs, const batch_type& rhs) { return sub(lhs, rhs); } static batch_type mul(const batch_type& lhs, const batch_type& rhs) { return vmulq_f64(lhs, rhs); } static batch_type div(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vdivq_f64(lhs, rhs); #else // from stackoverflow & https://projectne10.github.io/Ne10/doc/NE10__divc_8neon_8c_source.html // get an initial estimate of 1/b. float64x2_t reciprocal = vrecpeq_f64(rhs); // use a couple Newton-Raphson steps to refine the estimate. Depending on your // application's accuracy requirements, you may be able to get away with only // one refinement (instead of the two used here). Be sure to test! 
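// vrecpsq_f64(d, x) evaluates (2 - d * x), so multiplying the result by the
// current estimate performs one Newton-Raphson step x_{n+1} = x_n * (2 - d * x_n)
// toward 1/d; each step roughly doubles the number of correct bits.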
reciprocal = vmulq_f64(vrecpsq_f64(rhs, reciprocal), reciprocal); reciprocal = vmulq_f64(vrecpsq_f64(rhs, reciprocal), reciprocal); // and finally, compute a / b = a * (1 / b) return vmulq_f64(lhs, reciprocal); #endif } static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs) { return vceqq_f64(lhs, rhs); } static batch_bool_type neq(const batch_type& lhs, const batch_type& rhs) { return !(lhs == rhs); } static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { return vcltq_f64(lhs, rhs); } static batch_bool_type lte(const batch_type& lhs, const batch_type& rhs) { return vcleq_f64(lhs, rhs); } static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) { return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(lhs), vreinterpretq_u64_f64(rhs))); } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(lhs), vreinterpretq_u64_f64(rhs))); } static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(lhs), vreinterpretq_u64_f64(rhs))); } static batch_type bitwise_not(const batch_type& rhs) { return vreinterpretq_f64_u32(vmvnq_u32(vreinterpretq_u32_f64(rhs))); } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(lhs), vreinterpretq_u64_f64(rhs))); } static batch_type min(const batch_type& lhs, const batch_type& rhs) { return vminq_f64(lhs, rhs); } static batch_type max(const batch_type& lhs, const batch_type& rhs) { return vmaxq_f64(lhs, rhs); } static batch_type fmin(const batch_type& lhs, const batch_type& rhs) { return min(lhs, rhs); } static batch_type fmax(const batch_type& lhs, const batch_type& rhs) { return max(lhs, rhs); } static batch_type abs(const batch_type& rhs) { return vabsq_f64(rhs); } static batch_type fabs(const batch_type& rhs) { return abs(rhs); } static batch_type sqrt(const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vsqrtq_f64(rhs); #else float64x2_t sqrt_reciprocal = vrsqrteq_f64(rhs); // one iter // sqrt_reciprocal = sqrt_reciprocal * vrsqrtsq_f64(lhs * sqrt_reciprocal, sqrt_reciprocal); return rhs * sqrt_reciprocal * vrsqrtsq_f64(rhs * sqrt_reciprocal, sqrt_reciprocal); #endif } static batch_type fma(const batch_type& x, const batch_type& y, const batch_type& z) { return vfmaq_f64(z, x, y); } static batch_type fms(const batch_type& x, const batch_type& y, const batch_type& z) { return vfmaq_f64(-z, x, y); } static batch_type fnma(const batch_type& x, const batch_type& y, const batch_type& z) { return fma(-x, y, z); } static batch_type fnms(const batch_type& x, const batch_type& y, const batch_type& z) { return fms(-x, y, z); } static value_type hadd(const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vaddvq_f64(rhs); #else float64x2_t tmp = vpaddq_f64(rhs, rhs); return vgetq_lane_f64(tmp, 0); #endif } static batch_type haddp(const simd_batch* row) { return vpaddq_f64(row[0](), row[1]()); } static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b) { return vbslq_f64(cond, a, b); } static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vzip1q_f64(lhs, rhs); #else return vcombine_f64(vget_low_f64(lhs), vget_low_f64(rhs)); #endif } static batch_type zip_hi(const batch_type& lhs, const 
batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vzip2q_f64(lhs, rhs); #else return vcombine_f64(vget_high_f64(lhs), vget_high_f64(rhs)); #endif } static batch_type extract_pair(const batch_type& lhs, const batch_type& rhs, const int n) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION switch(n) { case 0: return lhs; XSIMD_REPEAT_2(vextq_f64); default: break; } return batch_type(double(0)); #else return vcombine_f64(vget_high_f64(lhs), vget_low_f64(rhs)); #endif } static batch_bool_type isnan(const batch_type& x) { return !(x == x); } }; } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_neon_float.hpp000066400000000000000000000534251410101234500226240ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_NEON_FLOAT_HPP #define XSIMD_NEON_FLOAT_HPP #include "xsimd_base.hpp" #include "xsimd_neon_bool.hpp" namespace xsimd { /******************* * batch * *******************/ template <> struct simd_batch_traits> { using value_type = float; static constexpr std::size_t size = 4; using batch_bool_type = batch_bool; static constexpr std::size_t align = XSIMD_DEFAULT_ALIGNMENT; using storage_type = float32x4_t; }; template <> class batch : public simd_batch> { public: using self_type = batch; using base_type = simd_batch; using storage_type = typename base_type::storage_type; using batch_bool_type = typename base_type::batch_bool_type; batch(); explicit batch(float d); batch(float d0, float d1, float d2, float d3); explicit batch(const float* src); batch(const float* src, aligned_mode); batch(const float* src, unaligned_mode); batch(const storage_type& rhs); batch& operator=(const storage_type& rhs); batch(const batch_bool_type& rhs); batch& operator=(const batch_bool_type& rhs); operator storage_type() const; XSIMD_DECLARE_LOAD_STORE_ALL(float, 4) XSIMD_DECLARE_LOAD_STORE_LONG(float, 4) using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; }; /********************************** * batch implementation * **********************************/ inline batch::batch() { } inline batch::batch(float d) : base_type(vdupq_n_f32(d)) { } inline batch::batch(float d1, float d2, float d3, float d4) : base_type(storage_type{d1, d2, d3, d4}) { } inline batch::batch(const float* d) : base_type(vld1q_f32(d)) { } inline batch::batch(const float* d, aligned_mode) : batch(d) { } inline batch::batch(const float* d, unaligned_mode) : batch(d) { } inline batch::batch(const storage_type& rhs) : base_type(rhs) { } inline batch& batch::operator=(const storage_type& rhs) { this->m_value = rhs; return *this; } inline batch::batch(const batch_bool_type& rhs) : base_type(vreinterpretq_f32_u32(vandq_u32(rhs, vreinterpretq_u32_f32(batch(1.f))))) { } inline batch& batch::operator=(const batch_bool_type& rhs) { this->m_value = vreinterpretq_f32_u32(vandq_u32(rhs, vreinterpretq_u32_f32(batch(1.f)))); return *this; } inline batch& batch::load_aligned(const int8_t* src) { int8x8_t tmp = vld1_s8((const int8_t*)src); int16x8_t tmp2 = vmovl_s8(tmp); int16x4_t tmp3 = vget_low_s16(tmp2); int32x4_t tmp4 = vmovl_s16(tmp3); this->m_value 
= vcvtq_f32_s32(tmp4); return *this; } inline batch& batch::load_unaligned(const int8_t* src) { return load_aligned(src); } XSIMD_DEFINE_LOAD_STORE(float, 4, bool, XSIMD_DEFAULT_ALIGNMENT) inline batch& batch::load_aligned(const uint8_t* src) { uint8x8_t tmp = vld1_u8((const uint8_t*)src); uint16x8_t tmp2 = vmovl_u8(tmp); uint16x4_t tmp3 = vget_low_u16(tmp2); uint32x4_t tmp4 = vmovl_u16(tmp3); this->m_value = vcvtq_f32_u32(tmp4); return *this; } inline batch& batch::load_unaligned(const uint8_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const int16_t* src) { int16x4_t tmp1 = vld1_s16(src); int32x4_t tmp2 = vmovl_s16(tmp1); this->m_value = vcvtq_f32_s32(tmp2); return *this; } inline batch& batch::load_unaligned(const int16_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const uint16_t* src) { uint16x4_t tmp1 = vld1_u16(src); uint32x4_t tmp2 = vmovl_u16(tmp1); this->m_value = vcvtq_f32_u32(tmp2); return *this; } inline batch& batch::load_unaligned(const uint16_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const int32_t* d) { this->m_value = vcvtq_f32_s32(vld1q_s32(d)); return *this; } inline batch& batch::load_unaligned(const int32_t* d) { return load_aligned(d); } inline batch& batch::load_aligned(const uint32_t* d) { this->m_value = vcvtq_f32_u32(vld1q_u32(d)); return *this; } inline batch& batch::load_unaligned(const uint32_t* d) { return load_aligned(d); } inline batch& batch::load_aligned(const int64_t* d) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION float32x2_t tmp_l = vcvt_f32_f64(vcvtq_f64_s64(vld1q_s64(&d[0]))); float32x2_t tmp_h = vcvt_f32_f64(vcvtq_f64_s64(vld1q_s64(&d[2]))); this->m_value = vcombine_f32(tmp_l, tmp_h); #else this->m_value = float32x4_t{ static_cast(d[0]), static_cast(d[1]), static_cast(d[2]), static_cast(d[3]) }; #endif return *this; } inline batch& batch::load_unaligned(const int64_t* d) { return load_aligned(d); } inline batch& batch::load_aligned(const uint64_t* d) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION float32x2_t tmp_l = vcvt_f32_f64(vcvtq_f64_u64(vld1q_u64(&d[0]))); float32x2_t tmp_h = vcvt_f32_f64(vcvtq_f64_u64(vld1q_u64(&d[2]))); this->m_value = vcombine_f32(tmp_l, tmp_h); #else this->m_value = float32x4_t{ static_cast(d[0]), static_cast(d[1]), static_cast(d[2]), static_cast(d[3]) }; #endif return *this; } inline batch& batch::load_unaligned(const uint64_t* d) { return load_aligned(d); } XSIMD_DEFINE_LOAD_STORE_LONG(float, 4, 16) inline batch& batch::load_aligned(const float* d) { this->m_value = vld1q_f32(d); return *this; } inline batch& batch::load_unaligned(const float* d) { return load_aligned(d); } inline batch& batch::load_aligned(const double* d) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION float32x2_t tmp_l = vcvt_f32_f64(vld1q_f64(&d[0])); float32x2_t tmp_h = vcvt_f32_f64(vld1q_f64(&d[2])); this->m_value = vcombine_f32(tmp_l, tmp_h); return *this; #else this->m_value = float32x4_t{ static_cast(d[0]), static_cast(d[1]), static_cast(d[2]), static_cast(d[3]) }; #endif return *this; } inline batch& batch::load_unaligned(const double* d) { return load_aligned(d); } inline void batch::store_aligned(int8_t* dst) const { int32x4_t tmp = vcvtq_s32_f32(this->m_value); int16x4_t tmp2 = vmovn_s32(tmp); int16x8_t tmp3 = vcombine_s16(tmp2, vdup_n_s16(0)); int8x8_t tmp4 = vmovn_s16(tmp3); vst1_s8((int8_t*)dst, tmp4); } inline void batch::store_unaligned(int8_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(uint8_t* dst) const 
{ uint32x4_t tmp = vcvtq_u32_f32(this->m_value); uint16x4_t tmp2 = vmovn_u32(tmp); uint16x8_t tmp3 = vcombine_u16(tmp2, vdup_n_u16(0)); uint8x8_t tmp4 = vmovn_u16(tmp3); vst1_u8((uint8_t*)dst, tmp4); } inline void batch::store_unaligned(uint8_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(int16_t* dst) const { int32x4_t tmp = vcvtq_s32_f32(this->m_value); int16x4_t tmp2 = vmovn_s32(tmp); vst1_s16((int16_t*)dst, tmp2); } inline void batch::store_unaligned(int16_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(uint16_t* dst) const { uint32x4_t tmp = vcvtq_u32_f32(this->m_value); uint16x4_t tmp2 = vmovn_u32(tmp); vst1_u16((uint16_t*)dst, tmp2); } inline void batch::store_unaligned(uint16_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(int32_t* dst) const { vst1q_s32(dst, vcvtq_s32_f32(this->m_value)); } inline void batch::store_unaligned(int32_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(uint32_t* dst) const { vst1q_u32(dst, vcvtq_u32_f32(this->m_value)); } inline void batch::store_unaligned(uint32_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(int64_t* dst) const { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION int64x2_t tmp_l = vcvtq_s64_f64(vcvt_f64_f32(vget_low_f32(this->m_value))); int64x2_t tmp_h = vcvtq_s64_f64(vcvt_f64_f32(vget_high_f32(this->m_value))); vst1q_s64(&(dst[0]), tmp_l); vst1q_s64(&(dst[2]), tmp_h); #else dst[0] = static_cast(this->m_value[0]); dst[1] = static_cast(this->m_value[1]); dst[2] = static_cast(this->m_value[2]); dst[3] = static_cast(this->m_value[3]); #endif } inline void batch::store_unaligned(int64_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(uint64_t* dst) const { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION uint64x2_t tmp_l = vcvtq_u64_f64(vcvt_f64_f32(vget_low_f32(this->m_value))); uint64x2_t tmp_h = vcvtq_u64_f64(vcvt_f64_f32(vget_high_f32(this->m_value))); vst1q_u64(&(dst[0]), tmp_l); vst1q_u64(&(dst[2]), tmp_h); #else dst[0] = static_cast(this->m_value[0]); dst[1] = static_cast(this->m_value[1]); dst[2] = static_cast(this->m_value[2]); dst[3] = static_cast(this->m_value[3]); #endif } inline void batch::store_unaligned(uint64_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(float* dst) const { vst1q_f32(dst, this->m_value); } inline void batch::store_unaligned(float* dst) const { store_aligned(dst); } inline void batch::store_aligned(double* dst) const { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION float64x2_t tmp_l = vcvt_f64_f32(vget_low_f32(this->m_value)); float64x2_t tmp_h = vcvt_f64_f32(vget_high_f32(this->m_value)); vst1q_f64(&(dst[0]), tmp_l); vst1q_f64(&(dst[2]), tmp_h); #else dst[0] = static_cast(this->m_value[0]); dst[1] = static_cast(this->m_value[1]); dst[2] = static_cast(this->m_value[2]); dst[3] = static_cast(this->m_value[3]); #endif } inline void batch::store_unaligned(double* dst) const { store_aligned(dst); } inline batch::operator float32x4_t() const { return this->m_value; } namespace detail { template <> struct batch_kernel { using batch_type = batch; using value_type = float; using batch_bool_type = batch_bool; static batch_type neg(const batch_type& rhs) { return vnegq_f32(rhs); } static batch_type add(const batch_type& lhs, const batch_type& rhs) { return vaddq_f32(lhs, rhs); } static batch_type sub(const batch_type& lhs, const batch_type& rhs) { return vsubq_f32(lhs, rhs); } static batch_type sadd(const batch_type& lhs, const batch_type& 
rhs) { return add(lhs, rhs); } static batch_type ssub(const batch_type& lhs, const batch_type& rhs) { return sub(lhs, rhs); } static batch_type mul(const batch_type& lhs, const batch_type& rhs) { return vmulq_f32(lhs, rhs); } static batch_type div(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vdivq_f32(lhs, rhs); #else // from stackoverflow & https://projectne10.github.io/Ne10/doc/NE10__divc_8neon_8c_source.html // get an initial estimate of 1/b. float32x4_t reciprocal = vrecpeq_f32(rhs); // use a couple Newton-Raphson steps to refine the estimate. Depending on your // application's accuracy requirements, you may be able to get away with only // one refinement (instead of the two used here). Be sure to test! reciprocal = vmulq_f32(vrecpsq_f32(rhs, reciprocal), reciprocal); reciprocal = vmulq_f32(vrecpsq_f32(rhs, reciprocal), reciprocal); // and finally, compute a / b = a * (1 / b) return vmulq_f32(lhs, reciprocal); #endif } static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs) { return vceqq_f32(lhs, rhs); } static batch_bool_type neq(const batch_type& lhs, const batch_type& rhs) { return !(lhs == rhs); } static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { return vcltq_f32(lhs, rhs); } static batch_bool_type lte(const batch_type& lhs, const batch_type& rhs) { return vcleq_f32(lhs, rhs); } static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) { return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(lhs), vreinterpretq_u32_f32(rhs))); } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(lhs), vreinterpretq_u32_f32(rhs))); } static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(lhs), vreinterpretq_u32_f32(rhs))); } static batch_type bitwise_not(const batch_type& rhs) { return vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(rhs))); } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(lhs), vreinterpretq_u32_f32(rhs))); } static batch_type min(const batch_type& lhs, const batch_type& rhs) { return vminq_f32(lhs, rhs); } static batch_type max(const batch_type& lhs, const batch_type& rhs) { return vmaxq_f32(lhs, rhs); } static batch_type fmin(const batch_type& lhs, const batch_type& rhs) { return min(lhs, rhs); } static batch_type fmax(const batch_type& lhs, const batch_type& rhs) { return max(lhs, rhs); } static batch_type abs(const batch_type& rhs) { return vabsq_f32(rhs); } static batch_type fabs(const batch_type& rhs) { return abs(rhs); } static batch_type sqrt(const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vsqrtq_f32(rhs); #else batch sqrt_reciprocal = vrsqrteq_f32(rhs); // one iter sqrt_reciprocal = sqrt_reciprocal * batch(vrsqrtsq_f32(rhs * sqrt_reciprocal, sqrt_reciprocal)); batch sqrt_approx = rhs * sqrt_reciprocal * batch(vrsqrtsq_f32(rhs * sqrt_reciprocal, sqrt_reciprocal)); batch zero(0.f); return select(rhs == zero, zero, sqrt_approx); #endif } static batch_type fma(const batch_type& x, const batch_type& y, const batch_type& z) { #ifdef __ARM_FEATURE_FMA return vfmaq_f32(z, x, y); #else return x * y + z; #endif } static batch_type fms(const batch_type& x, const batch_type& y, const batch_type& z) { #ifdef __ARM_FEATURE_FMA return vfmaq_f32(-z, x, y); 
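// Without hardware FMA, the fallback below computes x * y - z as a separate
// multiply and subtract, rounding twice instead of once.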
#else return x * y - z; #endif } static batch_type fnma(const batch_type& x, const batch_type& y, const batch_type& z) { return fma(-x, y, z); } static batch_type fnms(const batch_type& x, const batch_type& y, const batch_type& z) { return fms(-x, y, z); } static value_type hadd(const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vaddvq_f32(rhs); #else float32x2_t tmp = vpadd_f32(vget_low_f32(rhs), vget_high_f32(rhs)); tmp = vpadd_f32(tmp, tmp); return vget_lane_f32(tmp, 0); #endif } static batch_type haddp(const simd_batch* row) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION float32x4_t tmp1 = vpaddq_f32(row[0](), row[1]()); float32x4_t tmp2 = vpaddq_f32(row[2](), row[3]()); return vpaddq_f32(tmp1, tmp2); #else // row = (a,b,c,d) float32x2_t tmp1, tmp2, tmp3; // tmp1 = (a0 + a2, a1 + a3) tmp1 = vpadd_f32(vget_low_f32(row[0]()), vget_high_f32(row[0]())); // tmp2 = (b0 + b2, b1 + b3) tmp2 = vpadd_f32(vget_low_f32(row[1]()), vget_high_f32(row[1]())); // tmp1 = (a0..3, b0..3) tmp1 = vpadd_f32(tmp1, tmp2); // tmp2 = (c0 + c2, c1 + c3) tmp2 = vpadd_f32(vget_low_f32(row[2]()), vget_high_f32(row[2]())); // tmp3 = (d0 + d2, d1 + d3) tmp3 = vpadd_f32(vget_low_f32(row[3]()), vget_high_f32(row[3]())); // tmp1 = (c0..3, d0..3) tmp2 = vpadd_f32(tmp2, tmp3); // return = (a0..3, b0..3, c0..3, d0..3) return vcombine_f32(tmp1, tmp2); #endif } static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b) { return vbslq_f32(cond, a, b); } static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vzip1q_f32(lhs, rhs); #else float32x2x2_t tmp = vzip_f32(vget_low_f32(lhs), vget_low_f32(rhs)); return vcombine_f32(tmp.val[0], tmp.val[1]); #endif } static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vzip2q_f32(lhs, rhs); #else float32x2x2_t tmp = vzip_f32(vget_high_f32(lhs), vget_high_f32(rhs)); return vcombine_f32(tmp.val[0], tmp.val[1]); #endif } static batch_type extract_pair(const batch_type& lhs, const batch_type& rhs, const int n) { switch(n) { case 0: return lhs; XSIMD_REPEAT_4(vextq_f32); default: break; } return batch_type(float(0)); } static batch_bool_type isnan(const batch_type& x) { return !(x == x); } }; } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_neon_int16.hpp000066400000000000000000000314661410101234500224610ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_NEON_INT16_HPP #define XSIMD_NEON_INT16_HPP #include #include "xsimd_base.hpp" #include "xsimd_neon_bool.hpp" #include "xsimd_neon_int_base.hpp" #include "xsimd_neon_utils.hpp" namespace xsimd { /********************* * batch * *********************/ template <> struct simd_batch_traits> { using value_type = int16_t; static constexpr std::size_t size = 8; using batch_bool_type = batch_bool; static constexpr std::size_t align = XSIMD_DEFAULT_ALIGNMENT; using storage_type = int16x8_t; }; template <> class batch : public simd_batch> { public: using base_type = simd_batch>; using storage_type = typename base_type::storage_type; using batch_bool_type = typename base_type::batch_bool_type; batch(); explicit batch(int16_t d); template > batch(Args... args); explicit batch(const int16_t* src); batch(const int16_t* src, aligned_mode); batch(const int16_t* src, unaligned_mode); explicit batch(const char* src); batch(const char* src, aligned_mode); batch(const char* src, unaligned_mode); batch(const storage_type& rhs); batch& operator=(const storage_type& rhs); batch(const batch_bool_type& rhs); batch& operator=(const batch_bool_type& rhs); operator storage_type() const; batch& load_aligned(const int16_t* src); batch& load_unaligned(const int16_t* src); batch& load_aligned(const uint16_t* src); batch& load_unaligned(const uint16_t* src); void store_aligned(int16_t* dst) const; void store_unaligned(int16_t* dst) const; void store_aligned(uint16_t* dst) const; void store_unaligned(uint16_t* dst) const; using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; XSIMD_DECLARE_LOAD_STORE_INT16(int16_t, 8) XSIMD_DECLARE_LOAD_STORE_LONG(int16_t, 8) }; batch operator<<(const batch& lhs, int32_t rhs); batch operator>>(const batch& lhs, int32_t rhs); batch operator<<(const batch& lhs, const batch& rhs); batch operator>>(const batch& lhs, const batch& rhs); /************************************ * batch implementation * ************************************/ inline batch::batch() { } inline batch::batch(int16_t d) : base_type(vdupq_n_s16(d)) { } template inline batch::batch(Args... 
args) : base_type(storage_type{static_cast(args)...}) { } inline batch::batch(const int16_t* d) : base_type(vld1q_s16(d)) { } inline batch::batch(const int16_t* d, aligned_mode) : batch(d) { } inline batch::batch(const int16_t* d, unaligned_mode) : batch(d) { } inline batch::batch(const char* d) : batch(reinterpret_cast(d)) { } inline batch::batch(const char* d, aligned_mode) : batch(reinterpret_cast(d)) { } inline batch::batch(const char* d, unaligned_mode) : batch(reinterpret_cast(d)) { } inline batch::batch(const storage_type& rhs) : base_type(rhs) { } inline batch& batch::operator=(const storage_type& rhs) { this->m_value = rhs; return *this; } namespace detail { inline int16x8_t init_from_bool(uint16x8_t a) { return vandq_s16(reinterpret_cast(a), vdupq_n_s16(1)); } } inline batch::batch(const batch_bool_type& rhs) : base_type(detail::init_from_bool(rhs)) { } inline batch& batch::operator=(const batch_bool_type& rhs) { this->m_value = detail::init_from_bool(rhs); return *this; } inline batch& batch::load_aligned(const int16_t* src) { this->m_value = vld1q_s16(src); return *this; } inline batch& batch::load_unaligned(const int16_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const uint16_t* src) { this->m_value = vreinterpretq_s16_u16(vld1q_u16(src)); return *this; } inline batch& batch::load_unaligned(const uint16_t* src) { return load_aligned(src); } inline void batch::store_aligned(int16_t* dst) const { vst1q_s16(dst, this->m_value); } inline void batch::store_unaligned(int16_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(uint16_t* dst) const { vst1q_u16(dst, vreinterpretq_u16_s16(this->m_value)); } inline void batch::store_unaligned(uint16_t* dst) const { store_aligned(dst); } XSIMD_DEFINE_LOAD_STORE_INT16(int16_t, 8, 8) XSIMD_DEFINE_LOAD_STORE_LONG(int16_t, 8, 8) inline batch::operator int16x8_t() const { return this->m_value; } namespace detail { template <> struct batch_kernel : neon_int_kernel_base> { using batch_type = batch; using value_type = int16_t; using batch_bool_type = batch_bool; static batch_type neg(const batch_type& rhs) { return vnegq_s16(rhs); } static batch_type add(const batch_type& lhs, const batch_type& rhs) { return vaddq_s16(lhs, rhs); } static batch_type sub(const batch_type& lhs, const batch_type& rhs) { return vsubq_s16(lhs, rhs); } static batch_type sadd(const batch_type& lhs, const batch_type& rhs) { return vqaddq_s16(lhs, rhs); } static batch_type ssub(const batch_type& lhs, const batch_type& rhs) { return vqsubq_s16(lhs, rhs); } static batch_type mul(const batch_type& lhs, const batch_type& rhs) { return vmulq_s16(lhs, rhs); } static batch_type div(const batch_type& lhs, const batch_type& rhs) { return neon_detail::unroll_op<8, int16x8_t, int16_t>([&lhs, &rhs] (std::size_t idx) { return lhs[idx] / rhs[idx]; }); } static batch_type mod(const batch_type& lhs, const batch_type& rhs) { return neon_detail::unroll_op<8, int16x8_t, int16_t>([&lhs, &rhs] (std::size_t idx) { return lhs[idx] % rhs[idx]; }); } static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs) { return vceqq_s16(lhs, rhs); } static batch_bool_type neq(const batch_type& lhs, const batch_type& rhs) { return !(lhs == rhs); } static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { return vcltq_s16(lhs, rhs); } static batch_bool_type lte(const batch_type& lhs, const batch_type& rhs) { return vcleq_s16(lhs, rhs); } static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) { return vandq_s16(lhs, 
rhs); } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { return vorrq_s16(lhs, rhs); } static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { return veorq_s16(lhs, rhs); } static batch_type bitwise_not(const batch_type& rhs) { return vmvnq_s16(rhs); } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { return vbicq_s16(lhs, rhs); } static batch_type min(const batch_type& lhs, const batch_type& rhs) { return vminq_s16(lhs, rhs); } static batch_type max(const batch_type& lhs, const batch_type& rhs) { return vmaxq_s16(lhs, rhs); } static batch_type abs(const batch_type& rhs) { return vabsq_s16(rhs); } static value_type hadd(const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vaddvq_s16(rhs); #else int16x4_t tmp = vpadd_s16(vget_low_s16(rhs), vget_high_s16(rhs)); value_type res = 0; for (std::size_t i = 0; i < 4; ++i) { res += tmp[i]; } return res; #endif } static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b) { return vbslq_s16(cond, a, b); } static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vzip1q_s16(lhs, rhs); #else int16x4x2_t tmp = vzip_s16(vget_low_s16(lhs), vget_low_s16(rhs)); return vcombine_s16(tmp.val[0], tmp.val[1]); #endif } static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vzip2q_s16(lhs, rhs); #else int16x4x2_t tmp = vzip_s16(vget_high_s16(lhs), vget_high_s16(rhs)); return vcombine_s16(tmp.val[0], tmp.val[1]); #endif } static batch_type extract_pair(const batch_type& lhs, const batch_type& rhs, const int n) { switch(n) { case 0: return lhs; XSIMD_REPEAT_8_v2(vextq_s16); default: break; } return batch_type(int16_t(0)); } }; } namespace detail { inline batch shift_left(const batch& lhs, int32_t n) { switch(n) { case 0: return lhs; XSIMD_REPEAT_16(vshlq_n_s16); default: break; } return batch(int16_t(0)); } inline batch shift_right(const batch& lhs, int32_t n) { switch(n) { case 0: return lhs; XSIMD_REPEAT_16(vshrq_n_s16); default: break; } return batch(int16_t(0)); } } inline batch operator<<(const batch& lhs, int32_t rhs) { return detail::shift_left(lhs, rhs); } inline batch operator>>(const batch& lhs, int32_t rhs) { return detail::shift_right(lhs, rhs); } inline batch operator<<(const batch& lhs, const batch& rhs) { return vshlq_s16(lhs, rhs); } inline batch operator>>(const batch& lhs, const batch& rhs) { return vshlq_s16(lhs, vnegq_s16(rhs)); } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_neon_int32.hpp000066400000000000000000000405501410101234500224510ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_NEON_INT32_HPP #define XSIMD_NEON_INT32_HPP #include #include "xsimd_base.hpp" #include "xsimd_neon_bool.hpp" #include "xsimd_neon_int_base.hpp" #include "xsimd_neon_utils.hpp" namespace xsimd { /********************* * batch * *********************/ template <> struct simd_batch_traits> { using value_type = int32_t; static constexpr std::size_t size = 4; using batch_bool_type = batch_bool; static constexpr std::size_t align = XSIMD_DEFAULT_ALIGNMENT; using storage_type = int32x4_t; }; template <> class batch : public simd_batch> { public: using self_type = batch; using base_type = simd_batch; using storage_type = typename base_type::storage_type; using batch_bool_type = typename base_type::batch_bool_type; batch(); explicit batch(int32_t src); template > batch(Args... args); explicit batch(const int32_t* src); batch(const int32_t* src, aligned_mode); batch(const int32_t* src, unaligned_mode); batch(const storage_type& rhs); batch& operator=(const storage_type& rhs); batch(const batch_bool_type& rhs); batch& operator=(const batch_bool_type& rhs); operator storage_type() const; XSIMD_DECLARE_LOAD_STORE_ALL(int32_t, 4) XSIMD_DECLARE_LOAD_STORE_LONG(int32_t, 4) using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; }; batch operator<<(const batch& lhs, int32_t rhs); batch operator>>(const batch& lhs, int32_t rhs); batch operator<<(const batch& lhs, const batch& rhs); batch operator>>(const batch& lhs, const batch& rhs); /************************************ * batch implementation * ************************************/ inline batch::batch() { } inline batch::batch(int32_t src) : base_type(vdupq_n_s32(src)) { } template inline batch::batch(Args... 
args) : base_type(storage_type{static_cast(args)...}) { } inline batch::batch(const int32_t* src) : base_type(vld1q_s32(src)) { } inline batch::batch(const int32_t* src, aligned_mode) : batch(src) { } inline batch::batch(const int32_t* src, unaligned_mode) : batch(src) { } inline batch::batch(const storage_type& rhs) : base_type(rhs) { } inline batch& batch::operator=(const storage_type& rhs) { this->m_value = rhs; return *this; } namespace detail { inline int32x4_t init_from_bool(uint32x4_t a) { return vandq_s32(reinterpret_cast(a), vdupq_n_s32(1)); } } inline batch::batch(const batch_bool_type& rhs) : base_type(detail::init_from_bool(rhs)) { } inline batch& batch::operator=(const batch_bool_type& rhs) { this->m_value = detail::init_from_bool(rhs); return *this; } XSIMD_DEFINE_LOAD_STORE(int32_t, 4, bool, XSIMD_DEFAULT_ALIGNMENT) XSIMD_DEFINE_LOAD_STORE(int32_t, 4, int8_t, XSIMD_DEFAULT_ALIGNMENT) XSIMD_DEFINE_LOAD_STORE(int32_t, 4, uint8_t, XSIMD_DEFAULT_ALIGNMENT) inline batch& batch::load_aligned(const int16_t* src) { int16x4_t tmp = vld1_s16((const int16_t*)src); this->m_value = vmovl_s16(tmp); return *this; } inline batch& batch::load_unaligned(const int16_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const uint16_t* src) { uint16x4_t tmp = vld1_u16((const uint16_t*)src); this->m_value = vreinterpretq_s32_u32(vmovl_u16(tmp)); return *this; } inline batch& batch::load_unaligned(const uint16_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const int32_t* src) { this->m_value = vld1q_s32(src); return *this; } inline batch& batch::load_unaligned(const int32_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const uint32_t* src) { this->m_value = vreinterpretq_s32_u32(vld1q_u32(src)); return *this; } inline batch& batch::load_unaligned(const uint32_t* src) { return load_aligned(src); } XSIMD_DEFINE_LOAD_STORE(int32_t, 4, int64_t, XSIMD_DEFAULT_ALIGNMENT) XSIMD_DEFINE_LOAD_STORE(int32_t, 4, uint64_t, XSIMD_DEFAULT_ALIGNMENT) XSIMD_DEFINE_LOAD_STORE_LONG(int32_t, 4, 64) inline batch& batch::load_aligned(const float* src) { this->m_value = vcvtq_s32_f32(vld1q_f32(src)); return *this; } inline batch& batch::load_unaligned(const float* src) { this->m_value = vcvtq_s32_f32(vld1q_f32(src)); return *this; } inline batch& batch::load_aligned(const double* src) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION float32x2_t tmp_l = vcvtx_f32_f64(float64x2_t{src[0], src[1]}); float32x2_t tmp_h = vcvtx_f32_f64(float64x2_t{src[2], src[3]}); this->m_value = vcvtq_s32_f32(vcombine_f32(tmp_l, tmp_h)); return *this; #else this->m_value = int32x4_t{ static_cast(src[0]), static_cast(src[1]), static_cast(src[2]), static_cast(src[3]) }; #endif return *this; } inline batch& batch::load_unaligned(const double* src) { return load_aligned(src); } inline void batch::store_aligned(int16_t* dst) const { int16x4_t tmp = vmovn_s32(this->m_value); vst1_s16((int16_t*)dst, tmp); } inline void batch::store_unaligned(int16_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(uint16_t* dst) const { uint16x4_t tmp = vmovn_u32(vreinterpretq_u32_s32(this->m_value)); vst1_u16((uint16_t*)dst, tmp); } inline void batch::store_unaligned(uint16_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(int32_t* dst) const { vst1q_s32(dst, this->m_value); } inline void batch::store_unaligned(int32_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(uint32_t* dst) const { vst1q_u32(dst, 
vreinterpretq_u32_s32(this->m_value)); } inline void batch::store_unaligned(uint32_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(float* dst) const { vst1q_f32(dst, vcvtq_f32_s32(this->m_value)); } inline void batch::store_unaligned(float* dst) const { store_aligned(dst); } inline void batch::store_aligned(double* dst) const { alignas(16) int32_t tmp[4]; vst1q_s32(tmp, this->m_value); dst[0] = static_cast(tmp[0]); dst[1] = static_cast(tmp[1]); dst[2] = static_cast(tmp[2]); dst[3] = static_cast(tmp[3]); } inline void batch::store_unaligned(double* dst) const { store_aligned(dst); } inline batch::operator int32x4_t() const { return this->m_value; } namespace detail { template <> struct batch_kernel : neon_int_kernel_base> { using batch_type = batch; using value_type = int32_t; using batch_bool_type = batch_bool; static batch_type neg(const batch_type& rhs) { return vnegq_s32(rhs); } static batch_type add(const batch_type& lhs, const batch_type& rhs) { return vaddq_s32(lhs, rhs); } static batch_type sub(const batch_type& lhs, const batch_type& rhs) { return vsubq_s32(lhs, rhs); } static batch_type sadd(const batch_type& lhs, const batch_type& rhs) { return vqaddq_s32(lhs, rhs); } static batch_type ssub(const batch_type& lhs, const batch_type& rhs) { return vqsubq_s32(lhs, rhs); } static batch_type mul(const batch_type& lhs, const batch_type& rhs) { return vmulq_s32(lhs, rhs); } static batch_type div(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_FAST_INTEGER_DIVISION) return vcvtq_s32_f32(vcvtq_f32_s32(lhs) / vcvtq_f32_s32(rhs)); #else return neon_detail::unroll_op<4, int32x4_t, int32_t>([&lhs, &rhs] (std::size_t idx) { return lhs[idx] / rhs[idx]; }); #endif } static batch_type mod(const batch_type& lhs, const batch_type& rhs) { return neon_detail::unroll_op<4, int32x4_t, int32_t>([&lhs, &rhs] (std::size_t idx) { return lhs[idx] % rhs[idx]; }); } static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs) { return vceqq_s32(lhs, rhs); } static batch_bool_type neq(const batch_type& lhs, const batch_type& rhs) { return !(lhs == rhs); } static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { return vcltq_s32(lhs, rhs); } static batch_bool_type lte(const batch_type& lhs, const batch_type& rhs) { return vcleq_s32(lhs, rhs); } static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) { return vandq_s32(lhs, rhs); } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { return vorrq_s32(lhs, rhs); } static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { return veorq_s32(lhs, rhs); } static batch_type bitwise_not(const batch_type& rhs) { return vmvnq_s32(rhs); } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { return vbicq_s32(lhs, rhs); } static batch_type min(const batch_type& lhs, const batch_type& rhs) { return vminq_s32(lhs, rhs); } static batch_type max(const batch_type& lhs, const batch_type& rhs) { return vmaxq_s32(lhs, rhs); } static batch_type abs(const batch_type& rhs) { return vabsq_s32(rhs); } static value_type hadd(const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vaddvq_s32(rhs); #else int32x2_t tmp = vpadd_s32(vget_low_s32(rhs), vget_high_s32(rhs)); tmp = vpadd_s32(tmp, tmp); return vget_lane_s32(tmp, 0); #endif } static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b) { return vbslq_s32(cond, a, b); } static batch_type zip_lo(const batch_type& 
lhs, const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vzip1q_s32(lhs, rhs); #else int32x2x2_t tmp = vzip_s32(vget_low_s32(lhs), vget_low_s32(rhs)); return vcombine_s32(tmp.val[0], tmp.val[1]); #endif } static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vzip2q_s32(lhs, rhs); #else int32x2x2_t tmp = vzip_s32(vget_high_s32(lhs), vget_high_s32(rhs)); return vcombine_s32(tmp.val[0], tmp.val[1]); #endif } static batch_type extract_pair(const batch_type& lhs, const batch_type& rhs, const int n) { switch(n) { case 0: return lhs; XSIMD_REPEAT_4(vextq_s32); default: break; } return batch_type(int32_t(0)); } }; } /*inline batch haddp(const batch* row) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION int32x4_t tmp1 = vpaddq_s32(row[0], row[1]); int32x4_t tmp2 = vpaddq_s32(row[2], row[3]); return vpaddq_s32(tmp1, tmp2); #else // row = (a,b,c,d) int32x2_t tmp1, tmp2, tmp3; // tmp1 = (a0 + a2, a1 + a3) tmp1 = vpadd_s32(vget_low_s32(row[0]), vget_high_s32(row[0])); // tmp2 = (b0 + b2, b1 + b3) tmp2 = vpadd_s32(vget_low_s32(row[1]), vget_high_s32(row[1])); // tmp1 = (a0..3, b0..3) tmp1 = vpadd_s32(tmp1, tmp2); // tmp2 = (c0 + c2, c1 + c3) tmp2 = vpadd_s32(vget_low_s32(row[2]), vget_high_s32(row[2])); // tmp3 = (d0 + d2, d1 + d3) tmp3 = vpadd_s32(vget_low_s32(row[3]), vget_high_s32(row[3])); // tmp1 = (c0..3, d0..3) tmp2 = vpadd_s32(tmp2, tmp3); // return = (a0..3, b0..3, c0..3, d0..3) return vcombine_s32(tmp1, tmp2); #endif }*/ namespace detail { inline batch shift_left(const batch& lhs, int32_t n) { switch(n) { case 0: return lhs; XSIMD_REPEAT_32(vshlq_n_s32); default: break; } return batch(int32_t(0)); } inline batch shift_right(const batch& lhs, int32_t n) { switch(n) { case 0: return lhs; XSIMD_REPEAT_32(vshrq_n_s32); default: break; } return batch(int32_t(0)); } } inline batch operator<<(const batch& lhs, int32_t rhs) { return detail::shift_left(lhs, rhs); } inline batch operator>>(const batch& lhs, int32_t rhs) { return detail::shift_right(lhs, rhs); } inline batch operator<<(const batch& lhs, const batch& rhs) { return vshlq_s32(lhs, rhs); } inline batch operator>>(const batch& lhs, const batch& rhs) { return vshlq_s32(lhs, vnegq_s32(rhs)); } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_neon_int64.hpp000066400000000000000000000403421410101234500224550ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_NEON_INT64_HPP #define XSIMD_NEON_INT64_HPP #include "xsimd_base.hpp" #include "xsimd_neon_int_base.hpp" namespace xsimd { /********************* * batch * *********************/ template <> struct simd_batch_traits> { using value_type = int64_t; static constexpr std::size_t size = 2; using batch_bool_type = batch_bool; static constexpr std::size_t align = XSIMD_DEFAULT_ALIGNMENT; using storage_type = int64x2_t; }; template <> class batch : public simd_batch> { public: using self_type = batch; using base_type = simd_batch; using storage_type = typename base_type::storage_type; using batch_bool_type = typename base_type::batch_bool_type; batch(); explicit batch(int64_t src); template > batch(Args... args); explicit batch(const int64_t* src); batch(const int64_t* src, aligned_mode); batch(const int64_t* src, unaligned_mode); batch(const storage_type& rhs); batch& operator=(const storage_type& rhs); batch(const batch_bool_type& rhs); batch& operator=(const batch_bool_type& rhs); operator storage_type() const; XSIMD_DECLARE_LOAD_STORE_ALL(int64_t, 2) XSIMD_DECLARE_LOAD_STORE_LONG(int64_t, 2) using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; }; batch operator<<(const batch& lhs, int32_t rhs); batch operator>>(const batch& lhs, int32_t rhs); batch operator<<(const batch& lhs, const batch& rhs); batch operator>>(const batch& lhs, const batch& rhs); /*********************************** * batch implementation * ************************************/ inline batch::batch() { } inline batch::batch(int64_t src) : base_type(vdupq_n_s64(src)) { } template inline batch::batch(Args... args) : base_type(storage_type{static_cast(args)...}) { } inline batch::batch(const int64_t* src) : base_type(vld1q_s64(src)) { } inline batch::batch(const int64_t* src, aligned_mode) : batch(src) { } inline batch::batch(const int64_t* src, unaligned_mode) : batch(src) { } inline batch::batch(const storage_type& rhs) : base_type(rhs) { } inline batch& batch::operator=(const storage_type& rhs) { this->m_value = rhs; return *this; } namespace detail { inline int64x2_t init_from_bool(uint64x2_t a) { return vandq_s64(reinterpret_cast(a), vdupq_n_s64(1)); } } inline batch::batch(const batch_bool_type& rhs) : base_type(detail::init_from_bool(rhs)) { } inline batch& batch::operator=(const batch_bool_type& rhs) { this->m_value = detail::init_from_bool(rhs); return *this; } XSIMD_DEFINE_LOAD_STORE(int64_t, 2, bool, XSIMD_DEFAULT_ALIGNMENT) XSIMD_DEFINE_LOAD_STORE(int64_t, 2, int8_t, XSIMD_DEFAULT_ALIGNMENT) XSIMD_DEFINE_LOAD_STORE(int64_t, 2, uint8_t, XSIMD_DEFAULT_ALIGNMENT) XSIMD_DEFINE_LOAD_STORE(int64_t, 2, int16_t, XSIMD_DEFAULT_ALIGNMENT) XSIMD_DEFINE_LOAD_STORE(int64_t, 2, uint16_t, XSIMD_DEFAULT_ALIGNMENT) inline batch& batch::load_aligned(const int32_t* src) { int32x2_t tmp = vld1_s32(src); this->m_value = vmovl_s32(tmp); return *this; } inline batch& batch::load_unaligned(const int32_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const uint32_t* src) { uint32x2_t tmp = vld1_u32(src); this->m_value = vreinterpretq_s64_u64(vmovl_u32(tmp)); return *this; } inline batch& batch::load_unaligned(const uint32_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const int64_t* src) { this->m_value = vld1q_s64(src); return *this; } inline batch& batch::load_unaligned(const int64_t* src) { return load_aligned(src); } 
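// Editor's illustrative sketch (not part of upstream xsimd): the int32_t
// overloads just above fill a batch<int64_t, 2> from a narrower array by
// loading two 32-bit lanes with vld1_s32 and sign-extending them with
// vmovl_s32. The guard macro XSIMD_NEON_INT64_EXAMPLE is hypothetical and
// only keeps this sketch out of normal builds; lane access through
// operator[] mirrors the scalar fallbacks used elsewhere in this file.
#if defined(XSIMD_NEON_INT64_EXAMPLE)
inline int64_t example_widening_load_sum()
{
    alignas(16) int32_t src[2] = {40, 2};
    batch<int64_t, 2> b;
    b.load_aligned(src);   // widening int32 -> int64 load defined above
    return b[0] + b[1];    // 42
}
#endif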
inline batch& batch::load_aligned(const uint64_t* src) { this->m_value = vreinterpretq_s64_u64(vld1q_u64(src)); return *this; } inline batch& batch::load_unaligned(const uint64_t* src) { return load_aligned(src); } XSIMD_DEFINE_LOAD_STORE_LONG(int64_t, 2, 16) inline batch& batch::load_aligned(const float* src) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION this->m_value = vcvtq_s64_f64(vcvt_f64_f32(vld1_f32(src))); #else this->m_value = int64x2_t{static_cast(src[0]), static_cast(src[1])}; #endif return *this; } inline batch& batch::load_unaligned(const float* src) { return load_aligned(src); } inline batch& batch::load_aligned(const double* src) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION this->m_value = vcvtq_s64_f64(vld1q_f64(src)); #else this->m_value = int64x2_t{static_cast(src[0]), static_cast(src[1])}; #endif return *this; } inline batch& batch::load_unaligned(const double* src) { return load_aligned(src); } inline void batch::store_aligned(int32_t* dst) const { int32x2_t tmp = vmovn_s64(this->m_value); vst1_s32((int32_t*)dst, tmp); } inline void batch::store_unaligned(int32_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(uint32_t* dst) const { uint32x2_t tmp = vmovn_u64(vreinterpretq_u64_s64(this->m_value)); vst1_u32((uint32_t*)dst, tmp); } inline void batch::store_unaligned(uint32_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(int64_t* dst) const { vst1q_s64(dst, this->m_value); } inline void batch::store_unaligned(int64_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(uint64_t* dst) const { vst1q_u64(dst, vreinterpretq_u64_s64(this->m_value)); } inline void batch::store_unaligned(uint64_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(float* dst) const { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION vst1_f32(dst, vcvt_f32_f64(vcvtq_f64_s64(this->m_value))); #else dst[0] = static_cast(this->m_value[0]); dst[1] = static_cast(this->m_value[1]); #endif } inline void batch::store_unaligned(float* dst) const { store_aligned(dst); } inline void batch::store_aligned(double* dst) const { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION vst1q_f64(dst, vcvtq_f64_s64(this->m_value)); #else dst[0] = static_cast(this->m_value[0]); dst[1] = static_cast(this->m_value[1]); #endif } inline void batch::store_unaligned(double* dst) const { store_aligned(dst); } inline batch::operator storage_type() const { return this->m_value; } namespace detail { template <> struct batch_kernel : neon_int_kernel_base> { using batch_type = batch; using value_type = int64_t; using batch_bool_type = batch_bool; static batch_type neg(const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vnegq_s64(rhs); #else return batch(-rhs[0], -rhs[1]); #endif } static batch_type add(const batch_type& lhs, const batch_type& rhs) { return vaddq_s64(lhs, rhs); } static batch_type sub(const batch_type& lhs, const batch_type& rhs) { return vsubq_s64(lhs, rhs); } static batch_type sadd(const batch_type& lhs, const batch_type& rhs) { return vqaddq_s64(lhs, rhs); } static batch_type ssub(const batch_type& lhs, const batch_type& rhs) { return vqsubq_s64(lhs, rhs); } static batch_type mul(const batch_type& lhs, const batch_type& rhs) { return { lhs[0] * rhs[0], lhs[1] * rhs[1] }; } static batch_type div(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION && defined(XSIMD_FAST_INTEGER_DIVISION) return vcvtq_s64_f64(vcvtq_f64_s64(lhs) / 
vcvtq_f64_s64(rhs)); #else return{ lhs[0] / rhs[0], lhs[1] / rhs[1] }; #endif } static batch_type mod(const batch_type& lhs, const batch_type& rhs) { return{ lhs[0] % rhs[0], lhs[1] % rhs[1] }; } static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vceqq_s64(lhs, rhs); #else return batch_bool(lhs[0] == rhs[0], lhs[1] == rhs[1]); #endif } static batch_bool_type neq(const batch_type& lhs, const batch_type& rhs) { return !(lhs == rhs); } static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vcltq_s64(lhs, rhs); #else return batch_bool(lhs[0] < rhs[0], lhs[1] < rhs[1]); #endif } static batch_bool_type lte(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vcleq_s64(lhs, rhs); #else return batch_bool(lhs[0] <= rhs[0], lhs[1] <= rhs[1]); #endif } static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) { return vandq_s64(lhs, rhs); } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { return vorrq_s64(lhs, rhs); } static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { return veorq_s64(lhs, rhs); } static batch_type bitwise_not(const batch_type& rhs) { return vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(rhs))); } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { return vbicq_s64(lhs, rhs); } static batch_type min(const batch_type& lhs, const batch_type& rhs) { return { lhs[0] < rhs[0] ? lhs[0] : rhs[0], lhs[1] < rhs[1] ? lhs[1] : rhs[1] }; } static batch_type max(const batch_type& lhs, const batch_type& rhs) { return { lhs[0] > rhs[0] ? lhs[0] : rhs[0], lhs[1] > rhs[1] ? 
lhs[1] : rhs[1] }; } static batch_type abs(const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vabsq_s64(rhs); #else return batch(std::abs(rhs[0]), std::abs(rhs[1])); #endif } static value_type hadd(const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vaddvq_s64(rhs); #else return rhs[0] + rhs[1]; #endif } static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b) { return vbslq_s64(cond, a, b); } static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vzip1q_s64(lhs, rhs); #else return vcombine_s64(vget_low_s64(lhs), vget_low_s64(rhs)); #endif } static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vzip2q_s64(lhs, rhs); #else return vcombine_s64(vget_high_s64(lhs), vget_high_s64(rhs)); #endif } static batch_type extract_pair(const batch_type& lhs, const batch_type& rhs, const int n) { switch(n) { case 0: return lhs; XSIMD_REPEAT_2(vextq_s64); default: break; } return batch_type(int64_t(0)); } }; } /*inline batch haddp(const batch* row) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vpaddq_s64(row[0], row[1]); #else return batch(row[0][0] + row[0][1], row[1][0] + row[1][1]); #endif }*/ namespace detail { inline batch shift_left(const batch& lhs, int32_t n) { switch(n) { case 0: return lhs; XSIMD_REPEAT_64(vshlq_n_s64); default: break; } return batch(int64_t(0)); } inline batch shift_right(const batch& lhs, int32_t n) { switch(n) { case 0: return lhs; XSIMD_REPEAT_64(vshrq_n_s64); default: break; } return batch(int64_t(0)); } } inline batch operator<<(const batch& lhs, int32_t rhs) { return detail::shift_left(lhs, rhs); } inline batch operator>>(const batch& lhs, int32_t rhs) { return detail::shift_right(lhs, rhs); } inline batch operator<<(const batch& lhs, const batch& rhs) { return vshlq_s64(lhs, rhs); } inline batch operator>>(const batch& lhs, const batch& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vshlq_s64(lhs, vnegq_s64(rhs)); #else return batch(lhs[0] >> rhs[0], lhs[1] >> rhs[1]); #endif } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_neon_int8.hpp000066400000000000000000000314141410101234500223730ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_NEON_INT8_HPP #define XSIMD_NEON_INT8_HPP #include #include "xsimd_base.hpp" #include "xsimd_neon_bool.hpp" #include "xsimd_neon_int_base.hpp" #include "xsimd_neon_utils.hpp" namespace xsimd { /********************* * batch * *********************/ template <> struct simd_batch_traits> { using value_type = int8_t; static constexpr std::size_t size = 16; using batch_bool_type = batch_bool; static constexpr std::size_t align = XSIMD_DEFAULT_ALIGNMENT; using storage_type = int8x16_t; }; template <> class batch : public simd_batch> { public: using base_type = simd_batch>; using storage_type = typename base_type::storage_type; using batch_bool_type = typename base_type::batch_bool_type; batch(); explicit batch(int8_t d); template > batch(Args... 
args); explicit batch(const int8_t* src); batch(const int8_t* src, aligned_mode); batch(const int8_t* src, unaligned_mode); explicit batch(const char* src); batch(const char* src, aligned_mode); batch(const char* src, unaligned_mode); batch(const storage_type& rhs); batch& operator=(const storage_type& rhs); batch(const batch_bool_type& rhs); batch& operator=(const batch_bool_type& rhs); operator storage_type() const; batch& load_aligned(const int8_t* src); batch& load_unaligned(const int8_t* src); batch& load_aligned(const uint8_t* src); batch& load_unaligned(const uint8_t* src); void store_aligned(int8_t* dst) const; void store_unaligned(int8_t* dst) const; void store_aligned(uint8_t* dst) const; void store_unaligned(uint8_t* dst) const; using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; XSIMD_DECLARE_LOAD_STORE_INT8(int8_t, 16) XSIMD_DECLARE_LOAD_STORE_LONG(int8_t, 16) }; batch operator<<(const batch& lhs, int32_t rhs); batch operator>>(const batch& lhs, int32_t rhs); batch operator<<(const batch& lhs, const batch& rhs); batch operator>>(const batch& lhs, const batch& rhs); /************************************ * batch implementation * ************************************/ inline batch::batch() { } inline batch::batch(int8_t d) : base_type(vdupq_n_s8(d)) { } template inline batch::batch(Args... args) : base_type(storage_type{static_cast(args)...}) { } inline batch::batch(const int8_t* d) : base_type(vld1q_s8(d)) { } inline batch::batch(const int8_t* d, aligned_mode) : batch(d) { } inline batch::batch(const int8_t* d, unaligned_mode) : batch(d) { } inline batch::batch(const char* d) : batch(reinterpret_cast(d)) { } inline batch::batch(const char* d, aligned_mode) : batch(reinterpret_cast(d)) { } inline batch::batch(const char* d, unaligned_mode) : batch(reinterpret_cast(d)) { } inline batch::batch(const storage_type& rhs) : base_type(rhs) { } inline batch& batch::operator=(const storage_type& rhs) { this->m_value = rhs; return *this; } namespace detail { inline int8x16_t init_from_bool(uint8x16_t a) { return vandq_s8(reinterpret_cast(a), vdupq_n_s8(1)); } } inline batch::batch(const batch_bool_type& rhs) : base_type(detail::init_from_bool(rhs)) { } inline batch& batch::operator=(const batch_bool_type& rhs) { this->m_value = detail::init_from_bool(rhs); return *this; } inline batch& batch::load_aligned(const int8_t* src) { this->m_value = vld1q_s8(src); return *this; } inline batch& batch::load_unaligned(const int8_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const uint8_t* src) { this->m_value = vreinterpretq_s8_u8(vld1q_u8(src)); return *this; } inline batch& batch::load_unaligned(const uint8_t* src) { return load_aligned(src); } inline void batch::store_aligned(int8_t* dst) const { vst1q_s8(dst, this->m_value); } inline void batch::store_unaligned(int8_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(uint8_t* dst) const { vst1q_u8(dst, vreinterpretq_u8_s8(this->m_value)); } inline void batch::store_unaligned(uint8_t* dst) const { store_aligned(dst); } XSIMD_DEFINE_LOAD_STORE_INT8(int8_t, 16, 16) XSIMD_DEFINE_LOAD_STORE_LONG(int8_t, 16, 16) inline batch::operator int8x16_t() const { return this->m_value; } namespace detail { template <> struct batch_kernel : neon_int_kernel_base> { using batch_type = batch; using value_type = int8_t; using batch_bool_type = batch_bool; static batch_type neg(const batch_type& rhs) { return vnegq_s8(rhs); } static batch_type 
add(const batch_type& lhs, const batch_type& rhs) { return vaddq_s8(lhs, rhs); } static batch_type sub(const batch_type& lhs, const batch_type& rhs) { return vsubq_s8(lhs, rhs); } static batch_type sadd(const batch_type& lhs, const batch_type& rhs) { return vqaddq_s8(lhs, rhs); } static batch_type ssub(const batch_type& lhs, const batch_type& rhs) { return vqsubq_s8(lhs, rhs); } static batch_type mul(const batch_type& lhs, const batch_type& rhs) { return vmulq_s8(lhs, rhs); } static batch_type div(const batch_type& lhs, const batch_type& rhs) { return neon_detail::unroll_op<16, int8x16_t, int8_t>([&lhs, &rhs] (std::size_t idx) { return lhs[idx] / rhs[idx]; }); } static batch_type mod(const batch_type& lhs, const batch_type& rhs) { return neon_detail::unroll_op<16, int8x16_t, int8_t>([&lhs, &rhs] (std::size_t idx) { return lhs[idx] % rhs[idx]; }); } static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs) { return vceqq_s8(lhs, rhs); } static batch_bool_type neq(const batch_type& lhs, const batch_type& rhs) { return !(lhs == rhs); } static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { return vcltq_s8(lhs, rhs); } static batch_bool_type lte(const batch_type& lhs, const batch_type& rhs) { return vcleq_s8(lhs, rhs); } static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) { return vandq_s8(lhs, rhs); } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { return vorrq_s8(lhs, rhs); } static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { return veorq_s8(lhs, rhs); } static batch_type bitwise_not(const batch_type& rhs) { return vmvnq_s8(rhs); } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { return vbicq_s8(lhs, rhs); } static batch_type min(const batch_type& lhs, const batch_type& rhs) { return vminq_s8(lhs, rhs); } static batch_type max(const batch_type& lhs, const batch_type& rhs) { return vmaxq_s8(lhs, rhs); } static batch_type abs(const batch_type& rhs) { return vabsq_s8(rhs); } // Not implemented yet static value_type hadd(const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vaddvq_s8(rhs); #else int8x8_t tmp = vpadd_s8(vget_low_s8(rhs), vget_high_s8(rhs)); value_type res = 0; for (std::size_t i = 0; i < 8; ++i) { res += tmp[i]; } return res; #endif } static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b) { return vbslq_s8(cond, a, b); } static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vzip1q_s8(lhs, rhs); #else int8x8x2_t tmp = vzip_s8(vget_low_s8(lhs), vget_low_s8(rhs)); return vcombine_s8(tmp.val[0], tmp.val[1]); #endif } static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vzip2q_s8(lhs, rhs); #else int8x8x2_t tmp = vzip_s8(vget_high_s8(lhs), vget_high_s8(rhs)); return vcombine_s8(tmp.val[0], tmp.val[1]); #endif } static batch_type extract_pair(const batch_type& lhs, const batch_type& rhs, const int n) { switch(n) { case 0: return lhs; XSIMD_REPEAT_16_v2(vextq_s8); default: break; } return batch_type(int8_t(0)); } }; } namespace detail { inline batch shift_left(const batch& lhs, int32_t n) { switch(n) { case 0: return lhs; XSIMD_REPEAT_8(vshlq_n_s8); default: break; } return batch(int8_t(0)); } inline batch shift_right(const batch& lhs, int32_t n) { switch(n) { case 0: return lhs; XSIMD_REPEAT_8(vshrq_n_s8); 
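// Editor's note: vshrq_n_s8 (like the other vshlq_n_*/vshrq_n_* intrinsics)
// only accepts a compile-time immediate shift count, so the run-time count n
// is dispatched through this switch. The XSIMD_REPEAT_8 macro presumably
// expands to one "case i: return vshrq_n_s8(lhs, i);" label per remaining
// 8-bit shift count (the same pattern is used with vshlq_n_s8 in shift_left
// above); counts not matched by any case reach the default below and a zero
// batch is returned.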
default: break; } return batch(int8_t(0)); } } inline batch operator<<(const batch& lhs, int32_t rhs) { return detail::shift_left(lhs, rhs); } inline batch operator>>(const batch& lhs, int32_t rhs) { return detail::shift_right(lhs, rhs); } inline batch operator<<(const batch& lhs, const batch& rhs) { return vshlq_s8(lhs, rhs); } inline batch operator>>(const batch& lhs, const batch& rhs) { return vshlq_s8(lhs, vnegq_s8(rhs)); } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_neon_int_base.hpp000066400000000000000000000037171410101234500233020ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_NEON_INT_BASE_HPP #define XSIMD_NEON_INT_BASE_HPP namespace xsimd { namespace detail { template struct neon_int_kernel_base { using batch_type = B; static batch_type fmin(const batch_type& lhs, const batch_type& rhs) { return min(lhs, rhs); } static batch_type fmax(const batch_type& lhs, const batch_type& rhs) { return max(lhs, rhs); } static batch_type fabs(const batch_type& rhs) { return abs(rhs); } static batch_type fma(const batch_type& x, const batch_type& y, const batch_type& z) { return x * y + z; } static batch_type fms(const batch_type& x, const batch_type& y, const batch_type& z) { return x * y - z; } static batch_type fnma(const batch_type& x, const batch_type& y, const batch_type& z) { return -x * y + z; } static batch_type fnms(const batch_type& x, const batch_type& y, const batch_type& z) { return -x * y - z; } }; } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_neon_uint16.hpp000066400000000000000000000304311410101234500226350ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_NEON_UINT16_HPP #define XSIMD_NEON_UINT16_HPP #include #include "xsimd_base.hpp" #include "xsimd_neon_bool.hpp" #include "xsimd_neon_int_base.hpp" #include "xsimd_neon_utils.hpp" namespace xsimd { /********************* * batch* *********************/ template <> struct simd_batch_traits> { using value_type = uint16_t; static constexpr std::size_t size = 8; using batch_bool_type = batch_bool; static constexpr std::size_t align = XSIMD_DEFAULT_ALIGNMENT; using storage_type = uint16x8_t; }; template <> class batch : public simd_batch> { public: using base_type = simd_batch>; using storage_type = typename base_type::storage_type; using batch_bool_type = typename base_type::batch_bool_type; batch(); explicit batch(uint16_t d); template > batch(Args... 
args); explicit batch(const uint16_t* src); batch(const uint16_t* src, aligned_mode); batch(const uint16_t* src, unaligned_mode); batch(const storage_type& rhs); batch& operator=(const storage_type& rhs); batch(const batch_bool_type& rhs); batch& operator=(const batch_bool_type& rhs); operator storage_type() const; batch& load_aligned(const int16_t* src); batch& load_unaligned(const int16_t* src); batch& load_aligned(const uint16_t* src); batch& load_unaligned(const uint16_t* src); void store_aligned(int16_t* dst) const; void store_unaligned(int16_t* dst) const; void store_aligned(uint16_t* dst) const; void store_unaligned(uint16_t* dst) const; using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; XSIMD_DECLARE_LOAD_STORE_INT16(uint16_t, 8) XSIMD_DECLARE_LOAD_STORE_LONG(uint16_t, 8) }; batch operator<<(const batch& lhs, int32_t rhs); batch operator>>(const batch& lhs, int32_t rhs); batch operator<<(const batch& lhs, const batch& rhs); batch operator>>(const batch& lhs, const batch& rhs); /************************************ * batch implementation * ************************************/ inline batch::batch() { } inline batch::batch(uint16_t d) : base_type(vdupq_n_u16(d)) { } template inline batch::batch(Args... args) : base_type(storage_type{static_cast(args)...}) { } inline batch::batch(const uint16_t* d) : base_type(vld1q_u16(d)) { } inline batch::batch(const uint16_t* d, aligned_mode) : batch(d) { } inline batch::batch(const uint16_t* d, unaligned_mode) : batch(d) { } inline batch::batch(const storage_type& rhs) : base_type(rhs) { } inline batch& batch::operator=(const storage_type& rhs) { this->m_value = rhs; return *this; } inline batch::batch(const batch_bool_type& rhs) : base_type(vandq_u16(rhs, batch(1))) { } inline batch& batch::operator=(const batch_bool_type& rhs) { this->m_value = vandq_u16(rhs, batch(1)); return *this; } inline batch& batch::load_aligned(const int16_t* src) { this->m_value = vreinterpretq_u16_s16(vld1q_s16(src)); return *this; } inline batch& batch::load_unaligned(const int16_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const uint16_t* src) { this->m_value = vld1q_u16(src); return *this; } inline batch& batch::load_unaligned(const uint16_t* src) { return load_aligned(src); } inline void batch::store_aligned(int16_t* dst) const { vst1q_s16(dst, vreinterpretq_s16_u16(this->m_value)); } inline void batch::store_unaligned(int16_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(uint16_t* dst) const { vst1q_u16(dst, this->m_value); } inline void batch::store_unaligned(uint16_t* dst) const { store_aligned(dst); } XSIMD_DEFINE_LOAD_STORE_INT16(uint16_t, 8, 8) XSIMD_DEFINE_LOAD_STORE_LONG(uint16_t, 8, 8) inline batch::operator uint16x8_t() const { return this->m_value; } namespace detail { template <> struct batch_kernel : neon_int_kernel_base> { using batch_type = batch; using value_type = uint16_t; using batch_bool_type = batch_bool; static batch_type neg(const batch_type& rhs) { return vreinterpretq_u16_s16(vnegq_s16(vreinterpretq_s16_u16(rhs))); } static batch_type add(const batch_type& lhs, const batch_type& rhs) { return vaddq_u16(lhs, rhs); } static batch_type sub(const batch_type& lhs, const batch_type& rhs) { return vsubq_u16(lhs, rhs); } static batch_type sadd(const batch_type &lhs, const batch_type &rhs) { return vqaddq_u16(lhs, rhs); } static batch_type ssub(const batch_type &lhs, const batch_type &rhs) { return vqsubq_u16(lhs, rhs); } 
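// Editor's note: sadd/ssub above map to the NEON saturating intrinsics
// vqaddq_u16/vqsubq_u16, which clamp to the unsigned 16-bit range instead of
// wrapping: e.g. sadd(65000, 1000) yields 65535 and ssub(10, 20) yields 0.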
static batch_type mul(const batch_type& lhs, const batch_type& rhs) { return vmulq_u16(lhs, rhs); } static batch_type div(const batch_type& lhs, const batch_type& rhs) { return neon_detail::unroll_op<8, uint16x8_t, uint16_t>([&lhs, &rhs] (std::size_t idx) { return lhs[idx] / rhs[idx]; }); } static batch_type mod(const batch_type& lhs, const batch_type& rhs) { return neon_detail::unroll_op<8, uint16x8_t, uint16_t>([&lhs, &rhs] (std::size_t idx) { return lhs[idx] % rhs[idx]; }); } static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs) { return vceqq_u16(lhs, rhs); } static batch_bool_type neq(const batch_type& lhs, const batch_type& rhs) { return !(lhs == rhs); } static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { return vcltq_u16(lhs, rhs); } static batch_bool_type lte(const batch_type& lhs, const batch_type& rhs) { return vcleq_u16(lhs, rhs); } static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) { return vandq_u16(lhs, rhs); } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { return vorrq_u16(lhs, rhs); } static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { return veorq_u16(lhs, rhs); } static batch_type bitwise_not(const batch_type& rhs) { return vmvnq_u16(rhs); } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { return vbicq_u16(lhs, rhs); } static batch_type min(const batch_type& lhs, const batch_type& rhs) { return vminq_u16(lhs, rhs); } static batch_type max(const batch_type& lhs, const batch_type& rhs) { return vmaxq_u16(lhs, rhs); } static batch_type abs(const batch_type& rhs) { return rhs; } // Not implemented yet static value_type hadd(const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vaddvq_u16(rhs); #else uint16x4_t tmp = vpadd_u16(vget_low_u16(rhs), vget_high_u16(rhs)); value_type res = 0; for (std::size_t i = 0; i < 4; ++i) { res += tmp[i]; } return res; #endif } static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b) { return vbslq_u16(cond, a, b); } static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vzip1q_u16(lhs, rhs); #else uint16x4x2_t tmp = vzip_u16(vget_low_u16(lhs), vget_low_u16(rhs)); return vcombine_u16(tmp.val[0], tmp.val[1]); #endif } static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vzip2q_u16(lhs, rhs); #else uint16x4x2_t tmp = vzip_u16(vget_high_u16(lhs), vget_high_u16(rhs)); return vcombine_u16(tmp.val[0], tmp.val[1]); #endif } static batch_type extract_pair(const batch_type& lhs, const batch_type& rhs, const int n) { switch(n) { case 0: return lhs; XSIMD_REPEAT_8_v2(vextq_u16); default: break; } return batch_type(uint16_t(0)); } }; } namespace detail { inline batch shift_left(const batch& lhs, int32_t n) { switch(n) { case 0: return lhs; XSIMD_REPEAT_16(vshlq_n_u16); default: break; } return batch(uint16_t(0)); } inline batch shift_right(const batch& lhs, int32_t n) { switch(n) { case 0: return lhs; XSIMD_REPEAT_16(vshrq_n_u16); default: break; } return batch(uint16_t(0)); } } inline batch operator<<(const batch& lhs, int32_t rhs) { return detail::shift_left(lhs, rhs); } inline batch operator>>(const batch& lhs, int32_t rhs) { return detail::shift_right(lhs, rhs); } inline batch operator<<(const batch& lhs, const batch& rhs) { return vshlq_u16(lhs, rhs); } inline batch 
operator>>(const batch& lhs, const batch& rhs) { return vshlq_u16(lhs, vnegq_s16(rhs)); } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_neon_uint32.hpp000066400000000000000000000363651410101234500226470ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_NEON_UINT32_HPP #define XSIMD_NEON_UINT32_HPP #include #include "xsimd_base.hpp" #include "xsimd_neon_bool.hpp" #include "xsimd_neon_int_base.hpp" #include "xsimd_neon_utils.hpp" namespace xsimd { /********************** * batch * **********************/ template <> struct simd_batch_traits> { using value_type = uint32_t; static constexpr std::size_t size = 4; using batch_bool_type = batch_bool; static constexpr std::size_t align = XSIMD_DEFAULT_ALIGNMENT; using storage_type = uint32x4_t; }; template <> class batch : public simd_batch> { public: using base_type = simd_batch>; using storage_type = typename base_type::storage_type; using batch_bool_type = typename base_type::batch_bool_type; batch(); explicit batch(uint32_t src); template > batch(Args... args); explicit batch(const uint32_t* src); batch(const uint32_t* src, aligned_mode); batch(const uint32_t* src, unaligned_mode); batch(const storage_type& rhs); batch& operator=(const storage_type& rhs); batch(const batch_bool_type& rhs); batch& operator=(const batch_bool_type& rhs); operator storage_type() const; XSIMD_DECLARE_LOAD_STORE_ALL(uint32_t, 4) XSIMD_DECLARE_LOAD_STORE_LONG(uint32_t, 4) using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; }; batch operator<<(const batch& lhs, int32_t rhs); batch operator>>(const batch& lhs, int32_t rhs); batch operator<<(const batch& lhs, const batch& rhs); batch operator>>(const batch& lhs, const batch& rhs); /************************************* * batch implementation * *************************************/ inline batch::batch() { } inline batch::batch(uint32_t src) : base_type(vdupq_n_u32(src)) { } template inline batch::batch(Args... 
args) : base_type(storage_type{static_cast(args)...}) { } inline batch::batch(const uint32_t* src) : base_type(vld1q_u32(src)) { } inline batch::batch(const uint32_t* src, aligned_mode) : batch(src) { } inline batch::batch(const uint32_t* src, unaligned_mode) : batch(src) { } inline batch::batch(const storage_type& rhs) : base_type(rhs) { } inline batch& batch::operator=(const storage_type& rhs) { this->m_value = rhs; return *this; } inline batch::batch(const batch_bool_type& rhs) : base_type(vandq_u32(rhs, batch(1))) { } inline batch& batch::operator=(const batch_bool_type& rhs) { this->m_value = vandq_u32(rhs, batch(1)); return *this; } XSIMD_DEFINE_LOAD_STORE(uint32_t, 4, bool, XSIMD_DEFAULT_ALIGNMENT) XSIMD_DEFINE_LOAD_STORE(uint32_t, 4, int8_t, XSIMD_DEFAULT_ALIGNMENT) XSIMD_DEFINE_LOAD_STORE(uint32_t, 4, uint8_t, XSIMD_DEFAULT_ALIGNMENT) inline batch& batch::load_aligned(const int16_t* src) { int16x4_t tmp = vld1_s16((const int16_t*)src); this->m_value = vreinterpretq_u32_s32(vmovl_s16(tmp)); return *this; } inline batch& batch::load_unaligned(const int16_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const uint16_t* src) { uint16x4_t tmp = vld1_u16((const uint16_t*)src); this->m_value = vmovl_u16(tmp); return *this; } inline batch& batch::load_unaligned(const uint16_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const int32_t* src) { this->m_value = vreinterpretq_u32_s32(vld1q_s32(src)); return *this; } inline batch& batch::load_unaligned(const int32_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const uint32_t* src) { this->m_value = vld1q_u32(src); return *this; } inline batch& batch::load_unaligned(const uint32_t* src) { return load_aligned(src); } XSIMD_DEFINE_LOAD_STORE(uint32_t, 4, int64_t, XSIMD_DEFAULT_ALIGNMENT) XSIMD_DEFINE_LOAD_STORE(uint32_t, 4, uint64_t, XSIMD_DEFAULT_ALIGNMENT) XSIMD_DEFINE_LOAD_STORE_LONG(uint32_t, 4, 64) inline batch& batch::load_aligned(const float* src) { this->m_value = vcvtq_u32_f32(vld1q_f32(src)); return *this; } inline batch& batch::load_unaligned(const float* src) { this->m_value = vcvtq_u32_f32(vld1q_f32(src)); return *this; } inline batch& batch::load_aligned(const double* src) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION float32x2_t tmp_l = vcvtx_f32_f64(float64x2_t{src[0], src[1]}); float32x2_t tmp_h = vcvtx_f32_f64(float64x2_t{src[2], src[3]}); this->m_value = vcvtq_u32_f32(vcombine_f32(tmp_l, tmp_h)); return *this; #else this->m_value = uint32x4_t{ static_cast(src[0]), static_cast(src[1]), static_cast(src[2]), static_cast(src[3]) }; #endif return *this; } inline batch& batch::load_unaligned(const double* src) { return load_aligned(src); } inline void batch::store_aligned(int16_t* dst) const { int16x4_t tmp = vmovn_s32(vreinterpretq_s32_u32(this->m_value)); vst1_s16((int16_t*)dst, tmp); } inline void batch::store_unaligned(int16_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(uint16_t* dst) const { uint16x4_t tmp = vmovn_u32(this->m_value); vst1_u16((uint16_t*)dst, tmp); } inline void batch::store_unaligned(uint16_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(int32_t* dst) const { vst1q_s32(dst, vreinterpretq_s32_u32(this->m_value)); } inline void batch::store_unaligned(int32_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(uint32_t* dst) const { vst1q_u32(dst, this->m_value); } inline void batch::store_unaligned(uint32_t* dst) const { store_aligned(dst); } inline void 
batch::store_aligned(float* dst) const { vst1q_f32(dst, vcvtq_f32_u32(this->m_value)); } inline void batch::store_unaligned(float* dst) const { store_aligned(dst); } inline void batch::store_aligned(double* dst) const { alignas(16) uint32_t tmp[4]; vst1q_u32(tmp, this->m_value); dst[0] = static_cast(tmp[0]); dst[1] = static_cast(tmp[1]); dst[2] = static_cast(tmp[2]); dst[3] = static_cast(tmp[3]); } inline void batch::store_unaligned(double* dst) const { store_aligned(dst); } inline batch::operator storage_type() const { return this->m_value; } namespace detail { template <> struct batch_kernel : neon_int_kernel_base> { using batch_type = batch; using value_type = uint32_t; using batch_bool_type = batch_bool; static batch_type neg(const batch_type& rhs) { return vreinterpretq_u32_s32(vnegq_s32(vreinterpretq_s32_u32(rhs))); } static batch_type add(const batch_type& lhs, const batch_type& rhs) { return vaddq_u32(lhs, rhs); } static batch_type sub(const batch_type& lhs, const batch_type& rhs) { return vsubq_u32(lhs, rhs); } static batch_type sadd(const batch_type& lhs, const batch_type& rhs) { return vqaddq_u32(lhs, rhs); } static batch_type ssub(const batch_type& lhs, const batch_type& rhs) { return vqsubq_u32(lhs, rhs); } static batch_type mul(const batch_type& lhs, const batch_type& rhs) { return vmulq_u32(lhs, rhs); } static batch_type div(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_FAST_INTEGER_DIVISION) return vcvtq_u32_f32(vcvtq_f32_u32(lhs) / vcvtq_f32_u32(rhs)); #else return neon_detail::unroll_op<4, uint32x4_t, uint32_t>([&lhs, &rhs] (std::size_t idx) { return lhs[idx] / rhs[idx]; }); #endif } static batch_type mod(const batch_type& lhs, const batch_type& rhs) { return neon_detail::unroll_op<4, uint32x4_t, uint32_t>([&lhs, &rhs] (std::size_t idx) { return lhs[idx] % rhs[idx]; }); } static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs) { return vceqq_u32(lhs, rhs); } static batch_bool_type neq(const batch_type& lhs, const batch_type& rhs) { return !(lhs == rhs); } static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { return vcltq_u32(lhs, rhs); } static batch_bool_type lte(const batch_type& lhs, const batch_type& rhs) { return vcleq_u32(lhs, rhs); } static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) { return vandq_u32(lhs, rhs); } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { return vorrq_u32(lhs, rhs); } static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { return veorq_u32(lhs, rhs); } static batch_type bitwise_not(const batch_type& rhs) { return vmvnq_u32(rhs); } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { return vbicq_u32(lhs, rhs); } static batch_type min(const batch_type& lhs, const batch_type& rhs) { return vminq_u32(lhs, rhs); } static batch_type max(const batch_type& lhs, const batch_type& rhs) { return vmaxq_u32(lhs, rhs); } static batch_type abs(const batch_type& rhs) { return rhs; } static value_type hadd(const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vaddvq_u32(rhs); #else uint32x2_t tmp = vpadd_u32(vget_low_u32(rhs), vget_high_u32(rhs)); tmp = vpadd_u32(tmp, tmp); return vget_lane_u32(tmp, 0); #endif } static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b) { return vbslq_u32(cond, a, b); } static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION 
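// Editor's note: vzip1q_u32/vzip2q_u32 are AArch64-only intrinsics, hence the
// instruction-set guard; the ARMv7 fallback below zips the 64-bit halves with
// vzip_u32 and recombines them with vcombine_u32.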
return vzip1q_u32(lhs, rhs); #else uint32x2x2_t tmp = vzip_u32(vget_low_u32(lhs), vget_low_u32(rhs)); return vcombine_u32(tmp.val[0], tmp.val[1]); #endif } static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vzip2q_u32(lhs, rhs); #else uint32x2x2_t tmp = vzip_u32(vget_high_u32(lhs), vget_high_u32(rhs)); return vcombine_u32(tmp.val[0], tmp.val[1]); #endif } static batch_type extract_pair(const batch_type& lhs, const batch_type& rhs, const int n) { switch(n) { case 0: return lhs; XSIMD_REPEAT_4(vextq_u32); default: break; } return batch_type(uint32_t(0)); } }; inline batch shift_left(const batch& lhs, int32_t n) { switch(n) { case 0: return lhs; XSIMD_REPEAT_32(vshlq_n_u32); default: break; } return batch(uint32_t(0)); } inline batch shift_right(const batch& lhs, int32_t n) { switch(n) { case 0: return lhs; XSIMD_REPEAT_32(vshrq_n_u32); default: break; } return batch(uint32_t(0)); } } inline batch operator<<(const batch& lhs, int32_t rhs) { return detail::shift_left(lhs, rhs); } inline batch operator>>(const batch& lhs, int32_t rhs) { return detail::shift_right(lhs, rhs); } inline batch operator<<(const batch& lhs, const batch& rhs) { return vshlq_u32(lhs, rhs); } inline batch operator>>(const batch& lhs, const batch& rhs) { return vshlq_u32(lhs, vnegq_s32(rhs)); } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_neon_uint64.hpp000066400000000000000000000430341410101234500226430ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_NEON_UINT64_HPP #define XSIMD_NEON_UINT64_HPP #include "xsimd_base.hpp" #include "xsimd_neon_int_base.hpp" namespace xsimd { /********************** * batch * **********************/ template <> struct simd_batch_traits> { using value_type = uint64_t; static constexpr std::size_t size = 2; using batch_bool_type = batch_bool; static constexpr std::size_t align = XSIMD_DEFAULT_ALIGNMENT; using storage_type = uint64x2_t; }; template <> class batch : public simd_batch> { public: using self_type = batch; using base_type = simd_batch; using storage_type = typename base_type::storage_type; using batch_bool_type = typename base_type::batch_bool_type; batch(); explicit batch(uint64_t src); template > batch(Args... 
args); explicit batch(const uint64_t* src); batch(const uint64_t* src, aligned_mode); batch(const uint64_t* src, unaligned_mode); batch(const storage_type& rhs); batch& operator=(const storage_type& rhs); batch(const batch_bool_type& rhs); batch& operator=(const batch_bool_type& rhs); operator storage_type() const; XSIMD_DECLARE_LOAD_STORE_ALL(uint64_t, 2) XSIMD_DECLARE_LOAD_STORE_LONG(uint64_t, 2) using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; }; batch operator<<(const batch& lhs, int32_t rhs); batch operator>>(const batch& lhs, int32_t rhs); batch operator<<(const batch& lhs, const batch& rhs); batch operator>>(const batch& lhs, const batch& rhs); /************************************ * batch implementation * *************************************/ inline batch::batch() { } inline batch::batch(uint64_t src) : base_type(vdupq_n_u64(src)) { } template inline batch::batch(Args... args) : base_type(storage_type{static_cast(args)...}) { } inline batch::batch(const uint64_t* src) : base_type(vld1q_u64(src)) { } inline batch::batch(const uint64_t* src, aligned_mode) : batch(src) { } inline batch::batch(const uint64_t* src, unaligned_mode) : batch(src) { } inline batch::batch(const storage_type& rhs) : base_type(rhs) { } inline batch& batch::operator=(const storage_type& rhs) { this->m_value = rhs; return *this; } inline batch::batch(const batch_bool_type& rhs) : base_type(vandq_u64(rhs, batch(1))) { } inline batch& batch::operator=(const batch_bool_type& rhs) { this->m_value = vandq_u64(rhs, batch(1)); return *this; } XSIMD_DEFINE_LOAD_STORE(uint64_t, 2, bool, XSIMD_DEFAULT_ALIGNMENT) XSIMD_DEFINE_LOAD_STORE(uint64_t, 2, int8_t, XSIMD_DEFAULT_ALIGNMENT) XSIMD_DEFINE_LOAD_STORE(uint64_t, 2, uint8_t, XSIMD_DEFAULT_ALIGNMENT) XSIMD_DEFINE_LOAD_STORE(uint64_t, 2, int16_t, XSIMD_DEFAULT_ALIGNMENT) XSIMD_DEFINE_LOAD_STORE(uint64_t, 2, uint16_t, XSIMD_DEFAULT_ALIGNMENT) inline batch& batch::load_aligned(const int32_t* src) { int32x2_t tmp = vld1_s32(src); this->m_value = vreinterpretq_u64_s64(vmovl_s32(tmp)); return *this; } inline batch& batch::load_unaligned(const int32_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const uint32_t* src) { uint32x2_t tmp = vld1_u32(src); this->m_value = vmovl_u32(tmp); return *this; } inline batch& batch::load_unaligned(const uint32_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const int64_t* src) { this->m_value = vreinterpretq_u64_s64(vld1q_s64(src)); return *this; } inline batch& batch::load_unaligned(const int64_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const uint64_t* src) { this->m_value = vld1q_u64(src); return *this; } inline batch& batch::load_unaligned(const uint64_t* src) { return load_aligned(src); } XSIMD_DEFINE_LOAD_STORE_LONG(uint64_t, 2, 16) inline batch& batch::load_aligned(const float* src) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION this->m_value = vcvtq_u64_f64(vcvt_f64_f32(vld1_f32(src))); #else this->m_value = uint64x2_t{ static_cast(src[0]), static_cast(src[1]) }; #endif return *this; } inline batch& batch::load_unaligned(const float* src) { return load_aligned(src); } inline batch& batch::load_aligned(const double* src) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION this->m_value = vcvtq_u64_f64(vld1q_f64(src)); #else this->m_value = uint64x2_t{ static_cast(src[0]), static_cast(src[1]) }; #endif return *this; } inline batch& batch::load_unaligned(const double* 
src) { return load_aligned(src); } inline void batch::store_aligned(int32_t* dst) const { int32x2_t tmp = vmovn_s64(vreinterpretq_s64_u64(this->m_value)); vst1_s32((int32_t*)dst, tmp); } inline void batch::store_unaligned(int32_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(uint32_t* dst) const { uint32x2_t tmp = vmovn_u64(this->m_value); vst1_u32((uint32_t*)dst, tmp); } inline void batch::store_unaligned(uint32_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(int64_t* dst) const { vst1q_s64(dst, vreinterpretq_s64_u64(this->m_value)); } inline void batch::store_unaligned(int64_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(uint64_t* dst) const { vst1q_u64(dst, this->m_value); } inline void batch::store_unaligned(uint64_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(float* dst) const { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION vst1_f32(dst, vcvt_f32_f64(vcvtq_f64_u64(this->m_value))); #else dst[0] = static_cast(this->m_value[0]); dst[1] = static_cast(this->m_value[1]); #endif } inline void batch::store_unaligned(float* dst) const { store_aligned(dst); } inline void batch::store_aligned(double* dst) const { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION vst1q_f64(dst, vcvtq_f64_u64(this->m_value)); #else dst[0] = static_cast(this->m_value[0]); dst[1] = static_cast(this->m_value[1]); #endif } inline void batch::store_unaligned(double* dst) const { store_aligned(dst); } inline batch::operator storage_type() const { return this->m_value; } namespace detail { template <> struct batch_kernel : neon_int_kernel_base> { using batch_type = batch; using value_type = uint64_t; using batch_bool_type = batch_bool; static batch_type neg(const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vreinterpretq_u64_s64(vnegq_s64(vreinterpretq_s64_u64(rhs))); #else return batch(-rhs[0], -rhs[1]); #endif } static batch_type add(const batch_type& lhs, const batch_type& rhs) { return vaddq_u64(lhs, rhs); } static batch_type sub(const batch_type& lhs, const batch_type& rhs) { return vsubq_u64(lhs, rhs); } static batch_type sadd(const batch_type& lhs, const batch_type& rhs) { return vqaddq_u64(lhs, rhs); } static batch_type ssub(const batch_type& lhs, const batch_type& rhs) { return vqsubq_u64(lhs, rhs); } static batch_type mul(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return { lhs[0] * rhs[0], lhs[1] * rhs[1] }; #else /* * Clang 7 and GCC 8 both generate highly inefficient code here for * ARMv7. They will repeatedly extract and reinsert lanes. * While bug reports have been opened, for now, this is an efficient * workaround. * * It is unknown if there is a benefit for aarch64 (I do not have * a device), but I presume it would be lower considering aarch64 * has a native uint64_t multiply. 
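 *
 * Why the decomposition below is valid: writing
 * lhs = 2^32 * lhs_hi + lhs_lo and rhs = 2^32 * rhs_hi + rhs_lo,
 * the full product expands to
 *
 *   lhs * rhs = 2^64 * (lhs_hi * rhs_hi)
 *             + 2^32 * (lhs_hi * rhs_lo + lhs_lo * rhs_hi)
 *             +         lhs_lo * rhs_lo
 *
 * The 2^64 term vanishes modulo 2^64, and any overflow in the sum of the
 * cross products is harmless because only its low 32 bits survive the
 * shift by 32, so the vmull/vmlal sequence below reproduces the exact
 * wrap-around semantics of a 64-bit multiply.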
* * Effective code: * * uint32x2_t lhs_lo = lhs & 0xFFFFFFFF; * uint32x2_t lhs_hi = lhs >> 32; * uint32x2_t rhs_lo = rhs & 0xFFFFFFFF; * uint32x2_t rhs_hi = rhs >> 32; * * uint64x2_t result = (uint64x2_t)lhs_hi * (uint64x2_t)rhs_lo; * result += (uint64x2_t)lhs_lo * (uint64x2_t)rhs_hi; * result <<= 32; * result += (uint64x2_t)lhs_lo * (uint64x2_t)rhs_lo; * return result; */ uint32x2_t lhs_lo = vmovn_u64 (lhs); uint32x2_t lhs_hi = vshrn_n_u64 (lhs, 32); uint32x2_t rhs_lo = vmovn_u64 (rhs); uint32x2_t rhs_hi = vshrn_n_u64 (rhs, 32); uint64x2_t result = vmull_u32 (lhs_hi, rhs_lo); result = vmlal_u32 (result, lhs_lo, rhs_hi); result = vshlq_n_u64 (result, 32); result = vmlal_u32 (result, lhs_lo, rhs_lo); return result; #endif } static batch_type div(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION && defined(XSIMD_FAST_INTEGER_DIVISION) return vcvtq_u64_f64(vcvtq_f64_u64(lhs) / vcvtq_f64_u64(rhs)); #else return{ lhs[0] / rhs[0], lhs[1] / rhs[1] }; #endif } static batch_type mod(const batch_type& lhs, const batch_type& rhs) { return{ lhs[0] % rhs[0], lhs[1] % rhs[1] }; } static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vceqq_u64(lhs, rhs); #else return batch_bool(lhs[0] == rhs[0], lhs[1] == rhs[1]); #endif } static batch_bool_type neq(const batch_type& lhs, const batch_type& rhs) { return !(lhs == rhs); } static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vcltq_u64(lhs, rhs); #else return batch_bool(lhs[0] < rhs[0], lhs[1] < rhs[1]); #endif } static batch_bool_type lte(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vcleq_u64(lhs, rhs); #else return batch_bool(lhs[0] <= rhs[0], lhs[1] <= rhs[1]); #endif } static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) { return vandq_u64(lhs, rhs); } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { return vorrq_u64(lhs, rhs); } static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { return veorq_u64(lhs, rhs); } static batch_type bitwise_not(const batch_type& rhs) { return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(rhs))); } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { return vbicq_u64(lhs, rhs); } static batch_type min(const batch_type& lhs, const batch_type& rhs) { return { lhs[0] < rhs[0] ? lhs[0] : rhs[0], lhs[1] < rhs[1] ? lhs[1] : rhs[1] }; } static batch_type max(const batch_type& lhs, const batch_type& rhs) { return { lhs[0] > rhs[0] ? lhs[0] : rhs[0], lhs[1] > rhs[1] ? 
lhs[1] : rhs[1] }; } static batch_type abs(const batch_type& rhs) { return rhs; } static value_type hadd(const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vaddvq_u64(rhs); #else return rhs[0] + rhs[1]; #endif } static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b) { return vbslq_u64(cond, a, b); } static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vzip1q_u64(lhs, rhs); #else return vcombine_u64(vget_low_u64(lhs), vget_low_u64(rhs)); #endif } static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vzip2q_u64(lhs, rhs); #else return vcombine_u64(vget_high_u64(lhs), vget_high_u64(rhs)); #endif } static batch_type extract_pair(const batch_type& lhs, const batch_type& rhs, const int n) { switch(n) { case 0: return lhs; XSIMD_REPEAT_2(vextq_u64); default: break; } return batch_type(uint64_t(0)); } }; inline batch shift_left(const batch& lhs, int32_t n) { switch(n) { case 0: return lhs; XSIMD_REPEAT_64(vshlq_n_u64); default: break; } return batch(uint64_t(0)); } inline batch shift_right(const batch& lhs, int32_t n) { switch(n) { case 0: return lhs; XSIMD_REPEAT_64(vshrq_n_u64); default: break; } return batch(uint64_t(0)); } } inline batch operator<<(const batch& lhs, int32_t rhs) { return detail::shift_left(lhs, rhs); } inline batch operator>>(const batch& lhs, int32_t rhs) { return detail::shift_right(lhs, rhs); } inline batch operator<<(const batch& lhs, const batch& rhs) { return vshlq_u64(lhs, rhs); } inline batch operator>>(const batch& lhs, const batch& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vshlq_u64(lhs, vnegq_s64(rhs)); #else return batch(lhs[0] >> rhs[0], lhs[1] >> rhs[1]); #endif } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_neon_uint8.hpp000066400000000000000000000303031410101234500225540ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_NEON_UINT8_HPP #define XSIMD_NEON_UINT8_HPP #include #include "xsimd_base.hpp" #include "xsimd_neon_bool.hpp" #include "xsimd_neon_int_base.hpp" #include "xsimd_neon_utils.hpp" namespace xsimd { /********************* * batch * *********************/ template <> struct simd_batch_traits> { using value_type = uint8_t; static constexpr std::size_t size = 16; using batch_bool_type = batch_bool; static constexpr std::size_t align = XSIMD_DEFAULT_ALIGNMENT; using storage_type = uint8x16_t; }; template <> class batch : public simd_batch> { public: using base_type = simd_batch>; using storage_type = typename base_type::storage_type; using batch_bool_type = typename base_type::batch_bool_type; batch(); explicit batch(uint8_t d); template > batch(Args... 
args); explicit batch(const uint8_t* src); batch(const uint8_t* src, aligned_mode); batch(const uint8_t* src, unaligned_mode); batch(const storage_type& rhs); batch& operator=(const storage_type& rhs); batch(const batch_bool_type& rhs); batch& operator=(const batch_bool_type& rhs); operator storage_type() const; batch& load_aligned(const int8_t* src); batch& load_unaligned(const int8_t* src); batch& load_aligned(const uint8_t* src); batch& load_unaligned(const uint8_t* src); void store_aligned(int8_t* dst) const; void store_unaligned(int8_t* dst) const; void store_aligned(uint8_t* dst) const; void store_unaligned(uint8_t* dst) const; using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; XSIMD_DECLARE_LOAD_STORE_INT8(uint8_t, 16) XSIMD_DECLARE_LOAD_STORE_LONG(uint8_t, 16) }; batch operator<<(const batch& lhs, int32_t rhs); batch operator>>(const batch& lhs, int32_t rhs); batch operator<<(const batch& lhs, const batch& rhs); batch operator>>(const batch& lhs, const batch& rhs); /************************************ * batch implementation * ************************************/ inline batch::batch() { } inline batch::batch(uint8_t d) : base_type(vdupq_n_u8(d)) { } template inline batch::batch(Args... args) : base_type(storage_type{static_cast(args)...}) { } inline batch::batch(const uint8_t* d) : base_type(vld1q_u8(d)) { } inline batch::batch(const uint8_t* d, aligned_mode) : batch(d) { } inline batch::batch(const uint8_t* d, unaligned_mode) : batch(d) { } inline batch::batch(const storage_type& rhs) : base_type(rhs) { } inline batch& batch::operator=(const storage_type& rhs) { this->m_value = rhs; return *this; } inline batch::batch(const batch_bool_type& rhs) : base_type(vandq_u8(rhs, batch(1))) { } inline batch& batch::operator=(const batch_bool_type& rhs) { this->m_value = vandq_u8(rhs, batch(1)); return *this; } inline batch& batch::load_aligned(const int8_t* src) { this->m_value = vreinterpretq_u8_s8(vld1q_s8(src)); return *this; } inline batch& batch::load_unaligned(const int8_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const uint8_t* src) { this->m_value = vld1q_u8(src); return *this; } inline batch& batch::load_unaligned(const uint8_t* src) { return load_aligned(src); } inline void batch::store_aligned(int8_t* dst) const { vst1q_s8(dst, vreinterpretq_s8_u8(this->m_value)); } inline void batch::store_unaligned(int8_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(uint8_t* dst) const { vst1q_u8(dst, this->m_value); } inline void batch::store_unaligned(uint8_t* dst) const { store_aligned(dst); } XSIMD_DEFINE_LOAD_STORE_INT8(uint8_t, 16, 16) XSIMD_DEFINE_LOAD_STORE_LONG(uint8_t, 16, 16) inline batch::operator uint8x16_t() const { return this->m_value; } namespace detail { template <> struct batch_kernel : neon_int_kernel_base> { using batch_type = batch; using value_type = uint8_t; using batch_bool_type = batch_bool; static batch_type neg(const batch_type& rhs) { return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(rhs))); } static batch_type add(const batch_type& lhs, const batch_type& rhs) { return vaddq_u8(lhs, rhs); } static batch_type sub(const batch_type& lhs, const batch_type& rhs) { return vsubq_u8(lhs, rhs); } static batch_type sadd(const batch_type &lhs, const batch_type &rhs) { return vqaddq_u8(lhs, rhs); } static batch_type ssub(const batch_type &lhs, const batch_type &rhs) { return vqsubq_u8(lhs, rhs); } static batch_type mul(const batch_type& lhs, 
const batch_type& rhs) { return vmulq_u8(lhs, rhs); } static batch_type div(const batch_type& lhs, const batch_type& rhs) { return neon_detail::unroll_op<16, uint8x16_t, uint8_t>([&lhs, &rhs] (std::size_t idx) { return lhs[idx] / rhs[idx]; }); } static batch_type mod(const batch_type& lhs, const batch_type& rhs) { return neon_detail::unroll_op<16, uint8x16_t, uint8_t>([&lhs, &rhs] (std::size_t idx) { return lhs[idx] % rhs[idx]; }); } static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs) { return vceqq_u8(lhs, rhs); } static batch_bool_type neq(const batch_type& lhs, const batch_type& rhs) { return !(lhs == rhs); } static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { return vcltq_u8(lhs, rhs); } static batch_bool_type lte(const batch_type& lhs, const batch_type& rhs) { return vcleq_u8(lhs, rhs); } static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) { return vandq_u8(lhs, rhs); } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { return vorrq_u8(lhs, rhs); } static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { return veorq_u8(lhs, rhs); } static batch_type bitwise_not(const batch_type& rhs) { return vmvnq_u8(rhs); } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { return vbicq_u8(lhs, rhs); } static batch_type min(const batch_type& lhs, const batch_type& rhs) { return vminq_u8(lhs, rhs); } static batch_type max(const batch_type& lhs, const batch_type& rhs) { return vmaxq_u8(lhs, rhs); } static batch_type abs(const batch_type& rhs) { return rhs; } // Not implemented yet static value_type hadd(const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vaddvq_u8(rhs); #else uint8x8_t tmp = vpadd_u8(vget_low_u8(rhs), vget_high_u8(rhs)); value_type res = 0; for (std::size_t i = 0; i < 8; ++i) { res += tmp[i]; } return res; #endif } static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b) { return vbslq_u8(cond, a, b); } static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vzip1q_u8(lhs, rhs); #else uint8x8x2_t tmp = vzip_u8(vget_low_u8(lhs), vget_low_u8(rhs)); return vcombine_u8(tmp.val[0], tmp.val[1]); #endif } static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION return vzip2q_u8(lhs, rhs); #else uint8x8x2_t tmp = vzip_u8(vget_high_u8(lhs), vget_high_u8(rhs)); return vcombine_u8(tmp.val[0], tmp.val[1]); #endif } static batch_type extract_pair(const batch_type& lhs, const batch_type& rhs, const int n) { switch(n) { case 0: return lhs; XSIMD_REPEAT_16_v2(vextq_u8); default: break; } return batch_type(uint8_t(0)); } }; } namespace detail { inline batch shift_left(const batch& lhs, int32_t n) { switch(n) { case 0: return lhs; XSIMD_REPEAT_8(vshlq_n_u8); default: break; } return batch(uint8_t(0)); } inline batch shift_right(const batch& lhs, int32_t n) { switch(n) { case 0: return lhs; XSIMD_REPEAT_8(vshrq_n_u8); default: break; } return batch(uint8_t(0)); } } inline batch operator<<(const batch& lhs, int32_t rhs) { return detail::shift_left(lhs, rhs); } inline batch operator>>(const batch& lhs, int32_t rhs) { return detail::shift_right(lhs, rhs); } inline batch operator<<(const batch& lhs, const batch& rhs) { return vshlq_u8(lhs, rhs); } inline batch operator>>(const batch& lhs, const batch& rhs) { return vshlq_u8(lhs, vnegq_s8(rhs)); } } 
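// Illustrative sketch, not part of xsimd: the shift_left/shift_right helpers
// above cannot pass a runtime count straight to vshlq_n_u8/vshrq_n_u8 because
// those intrinsics require an immediate, so the XSIMD_REPEAT_* macros expand
// to one `case` per admissible count. A hand-written equivalent of that
// dispatch (with the hypothetical name example_shift_left_u8) looks like this:
#if defined(__ARM_NEON)
#include <arm_neon.h>

inline uint8x16_t example_shift_left_u8(uint8x16_t lhs, int n)
{
    switch (n)
    {
        case 0: return lhs;
        case 1: return vshlq_n_u8(lhs, 1);
        case 2: return vshlq_n_u8(lhs, 2);
        case 3: return vshlq_n_u8(lhs, 3);
        case 4: return vshlq_n_u8(lhs, 4);
        case 5: return vshlq_n_u8(lhs, 5);
        case 6: return vshlq_n_u8(lhs, 6);
        case 7: return vshlq_n_u8(lhs, 7);
        default: return vdupq_n_u8(0); // out-of-range counts yield a zero register
    }
}
#endif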
#endif xsimd-7.6.0/include/xsimd/types/xsimd_neon_utils.hpp000066400000000000000000000022741410101234500226530ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_NEON_UTILS_HPP #define XSIMD_NEON_UTILS_HPP namespace xsimd { namespace neon_detail { template inline R unroll_op_impl(F&& f, detail::index_sequence) { return R{static_cast(f(I))...}; } template inline R unroll_op(F&& f) { return unroll_op_impl(f, detail::make_index_sequence{}); } } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_scalar.hpp000066400000000000000000000371041410101234500217410ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_SCALAR_HPP #define XSIMD_SCALAR_HPP #include #include #include "xsimd_common_math.hpp" namespace xsimd { /********************************************* * scalar fallback for xsimd math operations * *********************************************/ using std::abs; using std::acos; using std::acosh; using std::asin; using std::asinh; using std::atan2; using std::atan; using std::atanh; using std::cbrt; using std::ceil; using std::copysign; using std::cos; using std::cosh; using std::erf; using std::erfc; using std::exp2; using std::exp; using std::expm1; using std::fabs; using std::fdim; using std::fmax; using std::fmin; using std::floor; using std::fmod; using std::hypot; using std::lgamma; using std::ldexp; using std::log10; using std::log1p; using std::log2; using std::log; using std::modf; using std::nearbyint; using std::nextafter; using std::proj; using std::remainder; using std::rint; using std::rint; using std::round; using std::round; using std::sin; using std::sinh; using std::sqrt; using std::tan; using std::tanh; using std::tgamma; using std::trunc; #ifndef _WIN32 using std::isfinite; using std::isinf; using std::isnan; #else // Windows defines catch all templates template typename std::enable_if::value, bool>::type isfinite(T var) { return std::isfinite(var); } template typename std::enable_if::value, bool>::type isfinite(T var) { return isfinite(double(var)); } template typename std::enable_if::value, bool>::type isinf(T var) { return std::isinf(var); } template typename std::enable_if::value, bool>::type isinf(T var) { return isinf(double(var)); } template typename std::enable_if::value, bool>::type isnan(T var) { return std::isnan(var); } template typename std::enable_if::value, bool>::type isnan(T var) { return isnan(double(var)); } #endif #ifdef XSIMD_ENABLE_NUMPY_COMPLEX template bool isnan(std::complex var) { return std::isnan(std::real(var)) || std::isnan(std::imag(var)); } template bool isinf(std::complex var) { return std::isinf(std::real(var)) || std::isinf(std::imag(var)); } #endif #ifdef XSIMD_ENABLE_XTL_COMPLEX using xtl::abs; using xtl::norm; using xtl::proj; using xtl::exp; 
using xtl::log; using xtl::log10; using xtl::pow; using xtl::sqrt; using xtl::sin; using xtl::cos; using xtl::tan; using xtl::asin; using xtl::acos; using xtl::atan; using xtl::sinh; using xtl::cosh; using xtl::tanh; using xtl::asinh; using xtl::acosh; using xtl::atanh; #endif template ::value>::type> inline bool is_flint(const T& x) { return std::isnan(x - x) ? std::numeric_limits::quiet_NaN() : x - std::trunc(x); } template ::value>::type> inline bool is_odd(const T& x) { return is_even(x - 1.); } template ::value>::type> inline bool is_even(const T& x) { return is_flint(x * T(0.5)); } template ::value>::type> inline T exp10(const T& x) { // FIXME: faster alternatives exist return std::pow(T(10), x); } namespace detail { template inline C expm1_complex_scalar_impl(const C& val) { using T = typename C::value_type; T isin = sin(val.imag()); T rem1 = expm1(val.real()); T re = rem1 + T(1.); T si = sin(val.imag() * T(0.5)); return std::complex(rem1 - T(2.) * re *si * si, re * isin); } } template inline std::complex expm1(const std::complex& val) { return detail::expm1_complex_scalar_impl(val); } #ifdef XSIMD_ENABLE_XTL_COMPLEX template inline xtl::xcomplex expm1(const xtl::xcomplex& val) { return detail::expm1_complex_scalar_impl(val); } #endif namespace detail { } template inline typename std::enable_if::value, T0>::type pow(const T0& t0, const T1& t1) { return detail::ipow(t0, t1); } template inline auto pow(const T0& t0, const T1& t1) -> typename std::enable_if::value && std::is_floating_point::value, decltype(std::pow(t0, t1))>::type { return std::pow(t0, t1); } template inline typename std::enable_if::value, std::complex>::type pow(const std::complex& t0, const T1& t1) { return detail::ipow(t0, t1); } template inline typename std::enable_if::value, std::complex>::type pow(const std::complex& t0, const T1& t1) { return std::pow(t0, t1); } template inline auto pow(const T0& t0, const std::complex& t1) -> typename std::enable_if::value, decltype(std::pow(t0, t1))>::type { return std::pow(t0, t1); } template inline auto bitofsign(T const& x) -> decltype(std::signbit(x)) { return std::signbit(x); } template inline auto signbit(T const& v) -> decltype(bitofsign(v)) { return bitofsign(v); } inline double sign(bool const &v) { return v; } template ::value>::type> inline T sign(const T& v) { return v < T(0) ? T(-1.) : v == T(0) ? T(0.) : T(1.); } namespace detail { template inline C sign_complex_scalar_impl(const C& v) { using value_type = typename C::value_type; if (v.real()) { return C(sign(v.real()), value_type(0)); } else { return C(sign(v.imag()), value_type(0)); } } } template inline std::complex sign(const std::complex& v) { return detail::sign_complex_scalar_impl(v); } #ifdef XSIMD_ENABLE_XTL_COMPLEX template inline xtl::xcomplex sign(const xtl::xcomplex& v) { return detail::sign_complex_scalar_impl(v); } #endif template std::complex log2(const std::complex& val) { return log(val) / log(T(2)); } #ifdef XSIMD_ENABLE_XTL_COMPLEX template inline xtl::xcomplex log2(const xtl::xcomplex& val) { return log(val) / log(T(2)); } #endif namespace detail { template inline C log1p_complex_scalar_impl(const C& val) { using T = typename C::value_type; C u = C(1.) + val; return u == C(1.) ? val : (u.real() <= T(0.) ? 
log(u) : log(u) * val / (u - C(1.))); } } template inline std::complex log1p(const std::complex& val) { return detail::log1p_complex_scalar_impl(val); } #ifdef XSIMD_ENABLE_XTL_COMPLEX template inline xtl::xcomplex log1p(const xtl::xcomplex& val) { return detail::log1p_complex_scalar_impl(val); } #endif template inline auto min(T0 const &self, T1 const &other) -> typename std::enable_if::value && std::is_scalar::value, typename std::decay other ? other : self)>::type>::type { return self > other ? other : self; } // numpy defines minimum operator on complex using lexical comparison template inline std::complex::type> min(std::complex const &self, std::complex const &other) { return (self.real() < other.real()) ? (self) : (self.real() == other.real() ? (self.imag() < other.imag() ? self : other) : other); } template inline auto max(T0 const &self, T1 const &other) -> typename std::enable_if::value && std::is_scalar::value, typename std::decay other ? other : self)>::type>::type { return self < other ? other : self; } // numpy defines maximum operator on complex using lexical comparison template inline std::complex::type> max(std::complex const &self, std::complex const &other) { return (self.real() > other.real()) ? (self) : (self.real() == other.real() ? (self.imag() > other.imag() ? self : other) : other); } template inline typename std::enable_if::value, T>::type fma(const T& a, const T& b, const T& c) { return std::fma(a, b, c); } namespace detail { template inline C fma_complex_scalar_impl(const C& a, const C& b, const C& c) { return {fms(a.real(), b.real(), fms(a.imag(), b.imag(), c.real())), fma(a.real(), b.imag(), fma(a.imag(), b.real(), c.imag()))}; } } template inline std::complex fma(const std::complex& a, const std::complex& b, const std::complex& c) { return detail::fma_complex_scalar_impl(a, b, c); } #ifdef XSIMD_ENABLE_XTL_COMPLEX template inline xtl::xcomplex fma(const xtl::xcomplex& a, const xtl::xcomplex& b, const xtl::xcomplex& c) { return detail::fma_complex_scalar_impl(a, b, c); } #endif namespace detail { #define XSIMD_HASSINCOS_TRAIT(func) \ template \ struct has##func \ { \ template static auto get(T* ptr) -> decltype(func(std::declval(), std::declval(), std::declval()), std::true_type{});\ static std::false_type get(...); \ static constexpr bool value = decltype(get((S*)nullptr))::value; \ } #define XSIMD_HASSINCOS(func, T) has##func::value XSIMD_HASSINCOS_TRAIT(sincos); XSIMD_HASSINCOS_TRAIT(sincosf); XSIMD_HASSINCOS_TRAIT(__sincos); XSIMD_HASSINCOS_TRAIT(__sincosf); struct generic_sincosf { template typename std::enable_if::type operator()(float val, T &s, T &c) { sincosf(val, &s, &c); } template typename std::enable_if::type operator()(float val, T &s, T &c) { __sincosf(val, &s, &c); } template typename std::enable_if::type operator()(float val, T &s, T &c) { s = std::sin(val); c = std::cos(val); } }; struct generic_sincos { template typename std::enable_if::type operator()(double val, T &s, T &c) { sincos(val, &s, &c); } template typename std::enable_if::type operator()(double val, T &s, T &c) { __sincos(val, &s, &c); } template typename std::enable_if::type operator()(double val, T &s, T &c) { s = std::sin(val); c = std::cos(val); } }; #undef XSIMD_HASSINCOS_TRAIT #undef XSIMD_HASSINCOS } inline void sincos(float val, float&s, float& c) { detail::generic_sincosf{}(val, s, c); } inline void sincos(double val, double&s, double& c) { detail::generic_sincos{}(val, s, c); } template inline void sincos(const std::complex& val, std::complex& s, std::complex& c) { s = 
std::sin(val); c = std::cos(val); } #ifdef XSIMD_ENABLE_XTL_COMPLEX template inline void sincos(const xtl::xcomplex& val, xtl::xcomplex& s, xtl::xcomplex& c) { s = sin(val); c = cos(val); } #endif template inline T frexp(T const& val, int& exp) { return std::frexp(val, &exp); } template inline decltype(abs(std::declval())) norm(const T& val) { auto tmp = abs(val); return tmp * tmp; } template::value>::type> T sadd(const T& lhs, const T& rhs) { if (std::numeric_limits::is_signed) { if ((lhs > 0) && (rhs > std::numeric_limits::max() - lhs)) { return std::numeric_limits::max(); } else if ((lhs < 0) && (rhs < std::numeric_limits::lowest() - lhs)) { return std::numeric_limits::lowest(); } else { return lhs + rhs; } } else { if (rhs > std::numeric_limits::max() - lhs) { return std::numeric_limits::max(); } else { return lhs + rhs; } } } template::value>::type> T ssub(const T& lhs, const T& rhs) { if (std::numeric_limits::is_signed) { return sadd(lhs, (T)-rhs); } else { if (lhs < rhs) { return std::numeric_limits::lowest(); } else { return lhs - rhs; } } } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_sse_complex.hpp000066400000000000000000000317651410101234500230240ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_SSE_COMPLEX_HPP #define XSIMD_SSE_COMPLEX_HPP #include #ifdef XSIMD_ENABLE_XTL_COMPLEX #include "xtl/xcomplex.hpp" #endif #include "xsimd_sse_float.hpp" #include "xsimd_sse_double.hpp" #include "xsimd_complex_base.hpp" namespace xsimd { /************************************** * batch_bool, 4> * **************************************/ template <> struct simd_batch_traits, 4>> : complex_batch_bool_traits, float, 4, 16> { }; template<> class batch_bool, 4> : public simd_complex_batch_bool, 4>> { public: using self_type = batch_bool, 4>; using base_type = simd_complex_batch_bool; using real_batch = batch_bool; batch_bool() = default; using base_type::base_type; // VS2015 has a bug with inheriting constructors involving SFINAE batch_bool(bool b0, bool b1, bool b2, bool b3) : base_type(real_batch(b0, b1, b2, b3)) { } }; /********************************* * batch, 4> * *********************************/ template <> struct simd_batch_traits, 4>> : complex_batch_traits, float, 4, 16> { }; template <> class batch, 4> : public simd_complex_batch, 4>> { public: using self_type = batch, 4>; using base_type = simd_complex_batch; using value_type = std::complex; using real_batch = batch; batch() = default; using base_type::base_type; // VS2015 has a bug with inheriting constructors involving SFINAE batch(value_type c0, value_type c1, value_type c2, value_type c3) : base_type(real_batch(c0.real(), c1.real(), c2.real(), c3.real()), real_batch(c0.imag(), c1.imag(), c2.imag(), c3.imag())) { } private: batch& load_complex(const real_batch& hi, const real_batch& lo); real_batch get_complex_high() const; real_batch get_complex_low() const; friend class simd_complex_batch, 4>>; }; /*************************************** * batch_bool, 2> * ***************************************/ template <> struct simd_batch_traits, 2>> : complex_batch_bool_traits, double, 2, 16> { }; template<> class batch_bool, 2> : 
public simd_complex_batch_bool, 2>> { public: using self_type = batch_bool, 2>; using base_type = simd_complex_batch_bool; using real_batch = batch_bool; batch_bool() = default; using base_type::base_type; // VS2015 has a bug with inheriting constructors involving SFINAE batch_bool(bool b0, bool b1) : base_type(real_batch(b0, b1)) { } }; /********************************** * batch, 2> * **********************************/ template <> struct simd_batch_traits, 2>> : complex_batch_traits, double, 2, 16> { }; template <> class batch, 2> : public simd_complex_batch, 2>> { public: using self_type = batch, 2>; using base_type = simd_complex_batch; using value_type = std::complex; using real_batch = batch; batch() = default; using base_type::base_type; // VS2015 has a bug with inheriting constructors involving SFINAE batch(value_type c0, value_type c1) : base_type(real_batch(c0.real(), c1.real()), real_batch(c0.imag(), c1.imag())) { } private: batch& load_complex(const real_batch& hi, const real_batch& lo); real_batch get_complex_high() const; real_batch get_complex_low() const; friend class simd_complex_batch, 2>>; }; /******************************************** * batch, N> implementation * ********************************************/ inline batch, 4>& batch, 4>::load_complex(const real_batch& hi, const real_batch& lo) { this->m_real = _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0)); this->m_imag = _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1)); return *this; } inline auto batch, 4>::get_complex_high() const -> real_batch { return _mm_unpacklo_ps(this->m_real, this->m_imag); } inline auto batch, 4>::get_complex_low() const -> real_batch { return _mm_unpackhi_ps(this->m_real, this->m_imag); } inline batch, 2>& batch, 2>::load_complex(const real_batch& hi, const real_batch& lo) { this->m_real = _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(0, 0)); this->m_imag = _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(1, 1)); return *this; } inline auto batch, 2>::get_complex_high() const -> real_batch { return _mm_unpacklo_pd(this->m_real, this->m_imag); } inline auto batch, 2>::get_complex_low() const -> real_batch { return _mm_unpackhi_pd(this->m_real, this->m_imag); } #ifdef XSIMD_ENABLE_XTL_COMPLEX /**************************************************** * batch_bool, 4> * ****************************************************/ template struct simd_batch_traits, 4>> : complex_batch_bool_traits, float, 4, 16> { }; template class batch_bool, 4> : public simd_complex_batch_bool, 4>> { public: using self_type = batch_bool, 4>; using base_type = simd_complex_batch_bool; using real_batch = batch_bool; batch_bool() = default; using base_type::base_type; // VS2015 has a bug with inheriting constructors involving SFINAE batch_bool(bool b0, bool b1, bool b2, bool b3) : base_type(real_batch(b0, b1, b2, b3)) { } }; /*********************************************** * batch, 4> * ***********************************************/ template struct simd_batch_traits, 4>> : complex_batch_traits, float, 4, 16> { }; template class batch, 4> : public simd_complex_batch, 4>> { public: using self_type = batch, 4>; using base_type = simd_complex_batch; using value_type = xtl::xcomplex; using real_batch = batch; batch() = default; using base_type::base_type; // VS2015 has a bug with inheriting constructors involving SFINAE batch(value_type c0, value_type c1, value_type c2, value_type c3) : base_type(real_batch(c0.real(), c1.real(), c2.real(), c3.real()), real_batch(c0.imag(), c1.imag(), c2.imag(), c3.imag())) { } private: batch& load_complex(const 
real_batch& hi, const real_batch& lo); real_batch get_complex_high() const; real_batch get_complex_low() const; friend class simd_complex_batch, 4>>; }; /****************************************************** * batch_bool, 2> * ******************************************************/ template struct simd_batch_traits, 2>> : complex_batch_bool_traits, double, 2, 16> { }; template class batch_bool, 2> : public simd_complex_batch_bool, 2>> { public: using self_type = batch_bool, 2>; using base_type = simd_complex_batch_bool; using real_batch = batch_bool; batch_bool() = default; using base_type::base_type; // VS2015 has a bug with inheriting constructors involving SFINAE batch_bool(bool b0, bool b1) : base_type(real_batch(b0, b1)) { } }; /************************************************* * batch, 2> * *************************************************/ template struct simd_batch_traits, 2>> : complex_batch_traits, double, 2, 16> { }; template class batch, 2> : public simd_complex_batch, 2>> { public: using self_type = batch, 2>; using base_type = simd_complex_batch; using value_type = xtl::xcomplex; using real_batch = batch; batch() = default; using base_type::base_type; // VS2015 has a bug with inheriting constructors involving SFINAE batch(value_type c0, value_type c1) : base_type(real_batch(c0.real(), c1.real()), real_batch(c0.imag(), c1.imag())) { } private: batch& load_complex(const real_batch& hi, const real_batch& lo); real_batch get_complex_high() const; real_batch get_complex_low() const; friend class simd_complex_batch, 2>>; }; /*********************************************************** * batch, N> implementation * ***********************************************************/ template inline batch, 4>& batch, 4>::load_complex(const real_batch& hi, const real_batch& lo) { this->m_real = _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0)); this->m_imag = _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1)); return *this; } template inline auto batch, 4>::get_complex_high() const -> real_batch { return _mm_unpacklo_ps(this->m_real, this->m_imag); } template inline auto batch, 4>::get_complex_low() const -> real_batch { return _mm_unpackhi_ps(this->m_real, this->m_imag); } template inline batch, 2>& batch, 2>::load_complex(const real_batch& hi, const real_batch& lo) { this->m_real = _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(0, 0)); this->m_imag = _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(1, 1)); return *this; } template inline auto batch, 2>::get_complex_high() const -> real_batch { return _mm_unpacklo_pd(this->m_real, this->m_imag); } template inline auto batch, 2>::get_complex_low() const -> real_batch { return _mm_unpackhi_pd(this->m_real, this->m_imag); } #endif } #endif xsimd-7.6.0/include/xsimd/types/xsimd_sse_conversion.hpp000066400000000000000000000164141410101234500235340ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_SSE_CONVERSION_HPP #define XSIMD_SSE_CONVERSION_HPP #include "xsimd_sse_double.hpp" #include "xsimd_sse_float.hpp" #include "xsimd_sse_int8.hpp" #include "xsimd_sse_int16.hpp" #include "xsimd_sse_int32.hpp" #include "xsimd_sse_int64.hpp" namespace xsimd { /************************ * conversion functions * ************************/ batch to_int(const batch& x); batch to_int(const batch& x); batch to_float(const batch& x); batch to_float(const batch& x); batch u8_to_u16(const batch& x); batch u16_to_u8(const batch& x); batch u8_to_u32(const batch& x); batch u32_to_u8(const batch& x); batch u8_to_u64(const batch& x); batch u64_to_u8(const batch& x); /************************** * boolean cast functions * **************************/ batch_bool bool_cast(const batch_bool& x); batch_bool bool_cast(const batch_bool& x); batch_bool bool_cast(const batch_bool& x); batch_bool bool_cast(const batch_bool& x); /*************************************** * conversion functions implementation * ***************************************/ inline batch to_int(const batch& x) { return _mm_cvttps_epi32(x); } inline batch to_int(const batch& x) { #if defined(XSIMD_AVX512VL_AVAILABLE) & defined(XSIMD_AVX512DQ_AVAILABLE) return _mm_cvttpd_epi64(x); #else return batch(static_cast(x[0]), static_cast(x[1])); #endif } inline batch to_float(const batch& x) { return _mm_cvtepi32_ps(x); } inline batch to_float(const batch& x) { #if defined(XSIMD_AVX512VL_AVAILABLE) & defined(XSIMD_AVX512DQ_AVAILABLE) return _mm_cvtepi64_pd(x); #else return batch(static_cast(x[0]), static_cast(x[1])); #endif } inline batch u8_to_u16(const batch& x) { return static_cast>(x); } inline batch u16_to_u8(const batch& x) { return static_cast>(x); } inline batch u8_to_u32(const batch& x) { return static_cast>(x); } inline batch u32_to_u8(const batch& x) { return static_cast>(x); } inline batch u8_to_u64(const batch& x) { return static_cast>(x); } inline batch u64_to_u8(const batch& x) { return static_cast>(x); } /***************************************** * batch cast functions implementation * *****************************************/ XSIMD_BATCH_CAST_IMPLICIT(int8_t, uint8_t, 16) XSIMD_BATCH_CAST_IMPLICIT(uint8_t, int8_t, 16) XSIMD_BATCH_CAST_IMPLICIT(int16_t, uint16_t, 8) XSIMD_BATCH_CAST_IMPLICIT(uint16_t, int16_t, 8) XSIMD_BATCH_CAST_IMPLICIT(int32_t, uint32_t, 4) XSIMD_BATCH_CAST_INTRINSIC(int32_t, float, 4, _mm_cvtepi32_ps) XSIMD_BATCH_CAST_IMPLICIT(uint32_t, int32_t, 4) XSIMD_BATCH_CAST_IMPLICIT(int64_t, uint64_t, 2) XSIMD_BATCH_CAST_IMPLICIT(uint64_t, int64_t, 2) XSIMD_BATCH_CAST_INTRINSIC(float, int32_t, 4, _mm_cvttps_epi32) #if defined(XSIMD_AVX512VL_AVAILABLE) #if defined(_MSC_VER) namespace detail { static inline __m128 xsimd_mm_cvtepu32_ps(__m128i a) { return _mm512_castps512_ps128(_mm512_cvtepu32_ps(_mm512_castsi128_si512(a))); } } XSIMD_BATCH_CAST_INTRINSIC(uint32_t, float, 4, detail::xsimd_mm_cvtepu32_ps) #else XSIMD_BATCH_CAST_INTRINSIC(uint32_t, float, 4, _mm_cvtepu32_ps) #endif XSIMD_BATCH_CAST_INTRINSIC(float, uint32_t, 4, _mm_cvttps_epu32) #if defined(XSIMD_AVX512DQ_AVAILABLE) XSIMD_BATCH_CAST_INTRINSIC(int64_t, double, 2, _mm_cvtepi64_pd) XSIMD_BATCH_CAST_INTRINSIC(uint64_t, double, 2, _mm_cvtepu64_pd) XSIMD_BATCH_CAST_INTRINSIC(double, int64_t, 2, _mm_cvttpd_epi64) XSIMD_BATCH_CAST_INTRINSIC(double, uint64_t, 2, _mm_cvttpd_epu64) #endif #endif /************************** * boolean cast functions * **************************/ inline 
batch_bool bool_cast(const batch_bool& x) { return _mm_castps_si128(x); } inline batch_bool bool_cast(const batch_bool& x) { return _mm_castpd_si128(x); } inline batch_bool bool_cast(const batch_bool& x) { return _mm_castsi128_ps(x); } inline batch_bool bool_cast(const batch_bool& x) { return _mm_castsi128_pd(x); } /***************************************** * bitwise cast functions implementation * *****************************************/ XSIMD_BITWISE_CAST_INTRINSIC(float, 4, double, 2, _mm_castps_pd) XSIMD_BITWISE_CAST_INTRINSIC(float, 4, int32_t, 4, _mm_castps_si128) XSIMD_BITWISE_CAST_INTRINSIC(float, 4, int64_t, 2, _mm_castps_si128) XSIMD_BITWISE_CAST_INTRINSIC(double, 2, float, 4, _mm_castpd_ps) XSIMD_BITWISE_CAST_INTRINSIC(double, 2, int32_t, 4, _mm_castpd_si128) XSIMD_BITWISE_CAST_INTRINSIC(double, 2, int64_t, 2, _mm_castpd_si128) XSIMD_BITWISE_CAST_INTRINSIC(int32_t, 4, float, 4, _mm_castsi128_ps) XSIMD_BITWISE_CAST_INTRINSIC(int32_t, 4, double, 2, _mm_castsi128_pd) XSIMD_BITWISE_CAST_INTRINSIC(int64_t, 2, float, 4, _mm_castsi128_ps) XSIMD_BITWISE_CAST_INTRINSIC(int64_t, 2, double, 2, _mm_castsi128_pd) } #endif xsimd-7.6.0/include/xsimd/types/xsimd_sse_double.hpp000066400000000000000000000502231410101234500226150ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_SSE_DOUBLE_HPP #define XSIMD_SSE_DOUBLE_HPP #include "xsimd_base.hpp" #include namespace xsimd { /************************* * batch_bool * *************************/ template <> struct simd_batch_traits> { using value_type = double; static constexpr std::size_t size = 2; using batch_type = batch; static constexpr std::size_t align = 16; }; template <> class batch_bool : public simd_batch_bool> { public: batch_bool(); explicit batch_bool(bool b); batch_bool(bool b0, bool b1); batch_bool(const __m128d& rhs); batch_bool& operator=(const __m128d& rhs); operator __m128d() const; bool_proxy operator[](std::size_t index); bool operator[](std::size_t index) const; __m128d get_value() const; private: batch_bool& load_values(bool b0, bool b1); union { __m128d m_value; double m_array[2]; }; friend class simd_batch_bool>; }; /******************** * batch * ********************/ template <> struct simd_batch_traits> { using value_type = double; static constexpr std::size_t size = 2; using batch_bool_type = batch_bool; static constexpr std::size_t align = 16; using storage_type = __m128d; }; template <> class batch : public simd_batch> { public: using self_type = batch; using base_type = simd_batch; using batch_bool_type = typename base_type::batch_bool_type; batch(); explicit batch(double d); batch(double d0, double d1); explicit batch(const double* src); batch(const double* src, aligned_mode); batch(const double* src, unaligned_mode); batch(const __m128d& rhs); batch& operator=(const __m128d& rhs); batch(const batch_bool_type& rhs); batch& operator=(const batch_bool_type& rhs); operator __m128d() const; XSIMD_DECLARE_LOAD_STORE_ALL(double, 2) XSIMD_DECLARE_LOAD_STORE_LONG(double, 2) using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; }; 
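    // Illustrative sketch, not part of xsimd: batch_bool<double, 2> (implemented
    // just below) stores each boolean lane as an all-ones / all-zeros 64-bit
    // pattern. That representation is what lets select() fall back to plain
    // bitwise blending on pre-SSE4.1 targets, as the kernel further down does.
    // The hypothetical helpers below isolate the two tricks; they assume the
    // SSE2 intrinsics already pulled in by this header.
    inline __m128d example_mask_from_bools_pd(bool b0, bool b1)
    {
        // -(int)b is 0 or -1, i.e. all zeros or all ones in every 32-bit chunk
        return _mm_castsi128_pd(_mm_setr_epi32(-(int)b0, -(int)b0, -(int)b1, -(int)b1));
    }

    inline __m128d example_select_pd(__m128d cond, __m128d a, __m128d b)
    {
        // cond lanes must be all ones (true) or all zeros (false)
        return _mm_or_pd(_mm_and_pd(cond, a), _mm_andnot_pd(cond, b));
    }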
/**************************************** * batch_bool implementation * ****************************************/ inline batch_bool::batch_bool() { } inline batch_bool::batch_bool(bool b) { m_value = _mm_castsi128_pd(_mm_set1_epi32(-(int)b)); } inline batch_bool::batch_bool(bool b0, bool b1) { m_value = _mm_castsi128_pd(_mm_setr_epi32(-(int)b0, -(int)b0, -(int)b1, -(int)b1)); } inline batch_bool::batch_bool(const __m128d& rhs) { m_value = rhs; } inline batch_bool& batch_bool::operator=(const __m128d& rhs) { m_value = rhs; return *this; } inline batch::batch(const batch_bool_type& rhs) : base_type(_mm_and_pd(rhs, batch(1.))) { } inline batch& batch::operator=(const batch_bool_type& rhs) { this->m_value = _mm_and_pd(rhs, batch(1.)); return *this; } inline batch_bool::operator __m128d() const { return m_value; } inline bool_proxy batch_bool::operator[](std::size_t index) { return bool_proxy(m_array[index & 1]); } inline bool batch_bool::operator[](std::size_t index) const { return static_cast(m_array[index & 1]); } inline __m128d batch_bool::get_value() const { return m_value; } inline batch_bool& batch_bool::load_values(bool b0, bool b1) { m_value = _mm_castsi128_pd(_mm_setr_epi32(-(int)b0, -(int)b0, -(int)b1, -(int)b1)); return *this; } namespace detail { template <> struct batch_bool_kernel { using batch_type = batch_bool; static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) { return _mm_and_pd(lhs, rhs); } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { return _mm_or_pd(lhs, rhs); } static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { return _mm_xor_pd(lhs, rhs); } static batch_type bitwise_not(const batch_type& rhs) { return _mm_xor_pd(rhs, _mm_castsi128_pd(_mm_set1_epi32(-1))); } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { return _mm_andnot_pd(lhs, rhs); } static batch_type equal(const batch_type& lhs, const batch_type& rhs) { return _mm_castsi128_pd(_mm_cmpeq_epi32(_mm_castpd_si128(lhs), _mm_castpd_si128(rhs))); } static batch_type not_equal(const batch_type& lhs, const batch_type& rhs) { return _mm_cmpneq_pd(lhs, rhs); } static bool all(const batch_type& rhs) { return _mm_movemask_pd(rhs) == 3; } static bool any(const batch_type& rhs) { return _mm_movemask_pd(rhs) != 0; } }; } /*********************************** * batch implementation * ***********************************/ inline batch::batch() { } inline batch::batch(double d) : base_type(_mm_set1_pd(d)) { } inline batch::batch(double d0, double d1) : base_type(_mm_setr_pd(d0, d1)) { } inline batch::batch(const double* src) : base_type(_mm_loadu_pd(src)) { } inline batch::batch(const double* src, aligned_mode) : base_type(_mm_load_pd(src)) { } inline batch::batch(const double* src, unaligned_mode) : base_type(_mm_loadu_pd(src)) { } inline batch::batch(const __m128d& rhs) : base_type(rhs) { } inline batch& batch::operator=(const __m128d& rhs) { this->m_value = rhs; return *this; } inline batch::operator __m128d() const { return this->m_value; } inline batch& batch::load_aligned(const int8_t* src) { __m128i tmp = _mm_loadl_epi64((const __m128i*)src); #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION __m128i tmp1 = _mm_cvtepi8_epi32(tmp); #else __m128i mask = _mm_cmplt_epi8(tmp, _mm_set1_epi8(0)); __m128i tmp2 = _mm_unpacklo_epi8(tmp, mask); mask = _mm_cmplt_epi16(tmp2, _mm_set1_epi16(0)); __m128i tmp1 = _mm_unpacklo_epi16(tmp2, mask); #endif this->m_value = _mm_cvtepi32_pd(tmp1); return *this; } 
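    // Illustrative sketch, not part of xsimd: on targets without SSE4.1 the
    // int8_t overload above sign-extends by hand instead of using
    // _mm_cvtepi8_epi32. The hypothetical helper below isolates that trick:
    // comparing against zero yields 0x00/0xFF per element (its sign mask), and
    // interleaving each element with its sign mask widens it while keeping the
    // sign, first to 16 bits and then to 32 bits.
    inline __m128i example_sign_extend_epi8_to_epi32(__m128i bytes)
    {
        __m128i sign8 = _mm_cmplt_epi8(bytes, _mm_set1_epi8(0));    // 0xFF where negative
        __m128i words = _mm_unpacklo_epi8(bytes, sign8);            // low 8 bytes -> 8 x int16
        __m128i sign16 = _mm_cmplt_epi16(words, _mm_set1_epi16(0)); // 0xFFFF where negative
        return _mm_unpacklo_epi16(words, sign16);                   // low 4 words -> 4 x int32
    }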
XSIMD_DEFINE_LOAD_STORE(double, 2, bool, 16) inline batch& batch::load_unaligned(const int8_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const uint8_t* src) { __m128i tmp = _mm_loadl_epi64((const __m128i*)src); #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION __m128i tmp1 = _mm_cvtepu8_epi32(tmp); #else __m128i tmp2 = _mm_unpacklo_epi8(tmp, _mm_set1_epi8(0)); __m128i tmp1 = _mm_unpacklo_epi16(tmp2, _mm_set1_epi16(0)); #endif this->m_value = _mm_cvtepi32_pd(tmp1); return *this; } inline batch& batch::load_unaligned(const uint8_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const int16_t* src) { __m128i tmp = _mm_loadl_epi64((const __m128i*)src); #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION __m128i tmp1 = _mm_cvtepi16_epi32(tmp); #else __m128i mask = _mm_cmplt_epi16(tmp, _mm_set1_epi16(0)); __m128i tmp1 = _mm_unpacklo_epi16(tmp, mask); #endif this->m_value = _mm_cvtepi32_pd(tmp1); return *this; } inline batch& batch::load_unaligned(const int16_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const uint16_t* src) { __m128i tmp = _mm_loadl_epi64((const __m128i*)src); #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION __m128i tmp1 = _mm_cvtepu16_epi32(tmp); #else __m128i tmp1 = _mm_unpacklo_epi16(tmp, _mm_set1_epi16(0)); #endif this->m_value = _mm_cvtepi32_pd(tmp1); return *this; } inline batch& batch::load_unaligned(const uint16_t* src) { return load_aligned(src); } XSIMD_DEFINE_LOAD_STORE(double, 2, int32_t, 16) XSIMD_DEFINE_LOAD_STORE(double, 2, uint32_t, 16) XSIMD_DEFINE_LOAD_STORE(double, 2, int64_t, 16) XSIMD_DEFINE_LOAD_STORE(double, 2, uint64_t, 16) XSIMD_DEFINE_LOAD_STORE_LONG(double, 2, 16) XSIMD_DEFINE_LOAD_STORE(double, 2, float, 16) inline batch& batch::load_aligned(const double* src) { this->m_value = _mm_load_pd(src); return *this; } inline batch& batch::load_unaligned(const double* src) { this->m_value = _mm_loadu_pd(src); return *this; } inline void batch::store_aligned(int8_t* dst) const { __m128i tmp = _mm_cvtpd_epi32(this->m_value); __m128i tmp1 = _mm_packs_epi32(tmp, tmp); __m128i tmp2 = _mm_packs_epi16(tmp1, tmp1); _mm_storel_epi64((__m128i*)dst, tmp2); } inline void batch::store_unaligned(int8_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(uint8_t* dst) const { __m128i tmp = _mm_cvtpd_epi32(this->m_value); __m128i tmp1 = _mm_packs_epi32(tmp, tmp); __m128i tmp2 = _mm_packus_epi16(tmp1, tmp1); _mm_storel_epi64((__m128i*)dst, tmp2); } inline void batch::store_unaligned(uint8_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(int16_t* dst) const { __m128i tmp = _mm_cvtpd_epi32(this->m_value); __m128i tmp1 = _mm_packs_epi32(tmp, tmp); _mm_storel_epi64((__m128i*)dst, tmp1); } inline void batch::store_unaligned(int16_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(uint16_t* dst) const { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION __m128i tmp = _mm_cvtpd_epi32(this->m_value); __m128i tmp1 = _mm_packus_epi32(tmp, tmp); _mm_storel_epi64((__m128i*)dst, tmp1); #else alignas(16) double tmp[2]; _mm_store_pd(tmp, this->m_value); unroller<2>([&](std::size_t i){ dst[i] = static_cast(tmp[i]); }); #endif } inline void batch::store_unaligned(uint16_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(double* dst) const { _mm_store_pd(dst, this->m_value); } inline void batch::store_unaligned(double* dst) const { _mm_storeu_pd(dst, this->m_value); } namespace detail { template <> struct batch_kernel { using batch_type 
= batch; using value_type = double; using batch_bool_type = batch_bool; static batch_type neg(const batch_type& rhs) { return _mm_xor_pd(rhs, _mm_castsi128_pd(_mm_setr_epi32(0, 0x80000000, 0, 0x80000000))); } static batch_type add(const batch_type& lhs, const batch_type& rhs) { return _mm_add_pd(lhs, rhs); } static batch_type sub(const batch_type& lhs, const batch_type& rhs) { return _mm_sub_pd(lhs, rhs); } static batch_type sadd(const batch_type& lhs, const batch_type& rhs) { return add(lhs, rhs); //do something special for inf? } static batch_type ssub(const batch_type& lhs, const batch_type& rhs) { return sub(lhs, rhs); //do something special for inf? } static batch_type mul(const batch_type& lhs, const batch_type& rhs) { return _mm_mul_pd(lhs, rhs); } static batch_type div(const batch_type& lhs, const batch_type& rhs) { return _mm_div_pd(lhs, rhs); } static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs) { return _mm_cmpeq_pd(lhs, rhs); } static batch_bool_type neq(const batch_type& lhs, const batch_type& rhs) { return _mm_cmpneq_pd(lhs, rhs); } static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { return _mm_cmplt_pd(lhs, rhs); } static batch_bool_type lte(const batch_type& lhs, const batch_type& rhs) { return _mm_cmple_pd(lhs, rhs); } static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) { return _mm_and_pd(lhs, rhs); } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { return _mm_or_pd(lhs, rhs); } static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { return _mm_xor_pd(lhs, rhs); } static batch_type bitwise_not(const batch_type& rhs) { return _mm_xor_pd(rhs, _mm_castsi128_pd(_mm_set1_epi32(-1))); } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { return _mm_andnot_pd(lhs, rhs); } static batch_type min(const batch_type& lhs, const batch_type& rhs) { return _mm_min_pd(lhs, rhs); } static batch_type max(const batch_type& lhs, const batch_type& rhs) { return _mm_max_pd(lhs, rhs); } static batch_type fmin(const batch_type& lhs, const batch_type& rhs) { return min(lhs, rhs); } static batch_type fmax(const batch_type& lhs, const batch_type& rhs) { return max(lhs, rhs); } static batch_type abs(const batch_type& rhs) { __m128d sign_mask = _mm_set1_pd(-0.); // -0. 
= 1 << 63 return _mm_andnot_pd(sign_mask, rhs); } static batch_type fabs(const batch_type& rhs) { return abs(rhs); } static batch_type sqrt(const batch_type& rhs) { return _mm_sqrt_pd(rhs); } static batch_type fma(const batch_type& x, const batch_type& y, const batch_type& z) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_FMA3_VERSION return _mm_fmadd_pd(x, y, z); #elif XSIMD_X86_INSTR_SET >= XSIMD_X86_AMD_FMA4_VERSION return _mm_macc_pd(x, y, z); #else return x * y + z; #endif } static batch_type fms(const batch_type& x, const batch_type& y, const batch_type& z) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_FMA3_VERSION return _mm_fmsub_pd(x, y, z); #elif XSIMD_X86_INSTR_SET >= XSIMD_X86_AMD_FMA4_VERSION return _mm_msub_pd(x, y, z); #else return x * y - z; #endif } static batch_type fnma(const batch_type& x, const batch_type& y, const batch_type& z) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_FMA3_VERSION return _mm_fnmadd_pd(x, y, z); #elif XSIMD_X86_INSTR_SET >= XSIMD_X86_AMD_FMA4_VERSION return _mm_nmacc_pd(x, y, z); #else return -x * y + z; #endif } static batch_type fnms(const batch_type& x, const batch_type& y, const batch_type& z) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_FMA3_VERSION return _mm_fnmsub_pd(x, y, z); #elif XSIMD_X86_INSTR_SET >= XSIMD_X86_AMD_FMA4_VERSION return _mm_nmsub_pd(x, y, z); #else return -x * y - z; #endif } static value_type hadd(const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE3_VERSION __m128d tmp0 = _mm_hadd_pd(rhs, rhs); #else __m128d tmp0 = _mm_add_sd(rhs, _mm_unpackhi_pd(rhs, rhs)); #endif return _mm_cvtsd_f64(tmp0); } static batch_type haddp(const batch_type* row) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE3_VERSION return _mm_hadd_pd(row[0], row[1]); #else return _mm_add_pd(_mm_unpacklo_pd(row[0], row[1]), _mm_unpackhi_pd(row[0], row[1])); #endif } static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION return _mm_blendv_pd(b, a, cond); #else return _mm_or_pd(_mm_and_pd(cond, a), _mm_andnot_pd(cond, b)); #endif } template static batch_type select(const batch_bool_constant& cond, const batch_type& a, const batch_type& b) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION (void)cond; constexpr int mask = batch_bool_constant::mask(); return _mm_blend_pd(b, a, mask); #else return select(cond(), a, b); #endif } static batch_bool_type isnan(const batch_type& x) { return _mm_cmpunord_pd(x, x); } static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs) { return _mm_unpacklo_pd(lhs, rhs); } static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs) { return _mm_unpackhi_pd(lhs, rhs); } static batch_type extract_pair(const batch_type& lhs, const batch_type& rhs, const int n) { batch_type b_concatenate; /* Double: n = [0,1] */ if(n) { b_concatenate[0] = lhs[1]; b_concatenate[1] = rhs[0]; return b_concatenate; } return lhs; } }; } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_sse_float.hpp000066400000000000000000000551171410101234500224570ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_SSE_FLOAT_HPP #define XSIMD_SSE_FLOAT_HPP #include "xsimd_base.hpp" #include namespace xsimd { /************************ * batch_bool * ************************/ template <> struct simd_batch_traits> { using value_type = float; static constexpr std::size_t size = 4; using batch_type = batch; static constexpr std::size_t align = 16; }; template <> class batch_bool : public simd_batch_bool> { public: batch_bool(); explicit batch_bool(bool b); batch_bool(bool b0, bool b1, bool b2, bool b3); batch_bool(const __m128& rhs); batch_bool& operator=(const __m128& rhs); operator __m128() const; bool_proxy operator[](std::size_t index); bool operator[](std::size_t index) const; __m128 get_value() const; private: batch_bool& load_values(bool b0, bool b1, bool b2, bool b3); union { __m128 m_value; float m_array[4]; }; friend class simd_batch_bool>; }; /******************* * batch * *******************/ template <> struct simd_batch_traits> { using value_type = float; static constexpr std::size_t size = 4; using batch_bool_type = batch_bool; static constexpr std::size_t align = 16; using storage_type = __m128; }; template <> class batch : public simd_batch> { public: using self_type = batch; using base_type = simd_batch; using batch_bool_type = typename base_type::batch_bool_type; batch(); explicit batch(float f); batch(float f0, float f1, float f2, float f3); explicit batch(const float* src); batch(const float* src, aligned_mode); batch(const float* src, unaligned_mode); batch(const __m128& rhs); batch& operator=(const __m128& rhs); batch(const batch_bool_type& rhs); batch& operator=(const batch_bool_type& rhs); operator __m128() const; XSIMD_DECLARE_LOAD_STORE_ALL(float, 4) XSIMD_DECLARE_LOAD_STORE_LONG(float, 4) using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; }; /*************************************** * batch_bool implementation * ***************************************/ inline batch_bool::batch_bool() { } inline batch_bool::batch_bool(bool b) { m_value = _mm_castsi128_ps(_mm_set1_epi32(-(int)b)); } inline batch_bool::batch_bool(bool b0, bool b1, bool b2, bool b3) { m_value = _mm_castsi128_ps(_mm_setr_epi32(-(int)b0, -(int)b1, -(int)b2, -(int)b3)); } inline batch_bool::batch_bool(const __m128& rhs) { m_value = rhs; } inline batch_bool& batch_bool::operator=(const __m128& rhs) { m_value = rhs; return *this; } inline batch::batch(const batch_bool_type& rhs) : base_type(_mm_and_ps(rhs, batch(1.f))) { } inline batch& batch::operator=(const batch_bool_type& rhs) { this->m_value = _mm_and_ps(rhs, batch(1.f)); return *this; } inline batch_bool::operator __m128() const { return m_value; } inline bool_proxy batch_bool::operator[](std::size_t index) { return bool_proxy(m_array[index & 3]); } inline bool batch_bool::operator[](std::size_t index) const { return static_cast(m_array[index & 3]); } inline __m128 batch_bool::get_value() const { return m_value; } inline batch_bool& batch_bool::load_values(bool b0, bool b1, bool b2, bool b3) { m_value = _mm_castsi128_ps(_mm_setr_epi32(-(int)b0, -(int)b1, -(int)b2, -(int)b3)); return *this; } namespace detail { template <> struct batch_bool_kernel { using batch_type = batch_bool; static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) { return _mm_and_ps(lhs, rhs); } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { return _mm_or_ps(lhs, rhs); } 
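            // The all()/any() reductions further down rely on _mm_movemask_ps,
            // which packs the sign bit of each 32-bit lane into the low four
            // bits of an int: a fully-set mask compares equal to 0x0F and a
            // non-empty mask compares non-zero. Illustrative sketch only (a
            // hypothetical member, not part of xsimd): the same trick counts
            // the number of true lanes.
            static int example_true_lane_count(const batch_type& rhs)
            {
                int mask = _mm_movemask_ps(rhs);
                int count = 0;
                for (int i = 0; i < 4; ++i)
                {
                    count += (mask >> i) & 1;
                }
                return count;
            }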
static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { return _mm_xor_ps(lhs, rhs); } static batch_type bitwise_not(const batch_type& rhs) { return _mm_xor_ps(rhs, _mm_castsi128_ps(_mm_set1_epi32(-1))); } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { return _mm_andnot_ps(lhs, rhs); } static batch_type equal(const batch_type& lhs, const batch_type& rhs) { return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(lhs), _mm_castps_si128(rhs))); } static batch_type not_equal(const batch_type& lhs, const batch_type& rhs) { return _mm_cmpneq_ps(lhs, rhs); } static bool all(const batch_type& rhs) { return _mm_movemask_ps(rhs) == 0x0F; } static bool any(const batch_type& rhs) { return _mm_movemask_ps(rhs) != 0; } }; } /********************************** * batch implementation * **********************************/ inline batch::batch() { } inline batch::batch(float f) : base_type(_mm_set1_ps(f)) { } inline batch::batch(float f0, float f1, float f2, float f3) : base_type(_mm_setr_ps(f0, f1, f2, f3)) { } inline batch::batch(const float* src) : base_type(_mm_loadu_ps(src)) { } inline batch::batch(const float* src, aligned_mode) : base_type(_mm_load_ps(src)) { } inline batch::batch(const float* src, unaligned_mode) : base_type(_mm_loadu_ps(src)) { } inline batch::batch(const __m128& rhs) : base_type(rhs) { } inline batch& batch::operator=(const __m128& rhs) { this->m_value = rhs; return *this; } inline batch::operator __m128() const { return this->m_value; } XSIMD_DEFINE_LOAD_STORE(float, 4, bool, 16) inline batch& batch::load_aligned(const int8_t* src) { __m128i tmp = _mm_loadl_epi64((const __m128i*)src); #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION __m128i tmp1 = _mm_cvtepi8_epi32(tmp); #else __m128i mask = _mm_cmplt_epi8(tmp, _mm_set1_epi8(0)); __m128i tmp2 = _mm_unpacklo_epi8(tmp, mask); mask = _mm_cmplt_epi16(tmp2, _mm_set1_epi16(0)); __m128i tmp1 = _mm_unpacklo_epi16(tmp2, mask); #endif this->m_value = _mm_cvtepi32_ps(tmp1); return *this; } inline batch& batch::load_unaligned(const int8_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const uint8_t* src) { __m128i tmp = _mm_loadl_epi64((const __m128i*)src); #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION __m128i tmp1 = _mm_cvtepu8_epi32(tmp); #else __m128i tmp2 = _mm_unpacklo_epi8(tmp, _mm_set1_epi8(0)); __m128i tmp1 = _mm_unpacklo_epi16(tmp2, _mm_set1_epi16(0)); #endif this->m_value = _mm_cvtepi32_ps(tmp1); return *this; } inline batch& batch::load_unaligned(const uint8_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const int16_t* src) { __m128i tmp = _mm_loadl_epi64((const __m128i*)src); #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION __m128i tmp1 = _mm_cvtepi16_epi32(tmp); #else __m128i mask = _mm_cmplt_epi16(tmp, _mm_set1_epi16(0)); __m128i tmp1 = _mm_unpacklo_epi16(tmp, mask); #endif this->m_value = _mm_cvtepi32_ps(tmp1); return *this; } inline batch& batch::load_unaligned(const int16_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const uint16_t* src) { __m128i tmp = _mm_loadl_epi64((const __m128i*)src); #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION __m128i tmp1 = _mm_cvtepu16_epi32(tmp); #else __m128i tmp1 = _mm_unpacklo_epi16(tmp, _mm_set1_epi16(0)); #endif this->m_value = _mm_cvtepi32_ps(tmp1); return *this; } inline batch& batch::load_unaligned(const uint16_t* src) { return load_aligned(src); } inline batch& batch::load_aligned(const int32_t* src) { this->m_value = 
_mm_cvtepi32_ps(_mm_load_si128((__m128i const*)src)); return *this; } inline batch& batch::load_unaligned(const int32_t* src) { this->m_value = _mm_cvtepi32_ps(_mm_loadu_si128((__m128i const*)src)); return *this; } XSIMD_DEFINE_LOAD_STORE(float, 4, uint32_t, 16) XSIMD_DEFINE_LOAD_STORE(float, 4, int64_t, 16) XSIMD_DEFINE_LOAD_STORE(float, 4, uint64_t, 16) XSIMD_DEFINE_LOAD_STORE_LONG(float, 4, 16) inline batch& batch::load_aligned(const float* src) { this->m_value = _mm_load_ps(src); return *this; } inline batch& batch::load_unaligned(const float* src) { this->m_value = _mm_loadu_ps(src); return *this; } inline batch& batch::load_aligned(const double* src) { __m128 tmp1 = _mm_cvtpd_ps(_mm_load_pd(src)); __m128 tmp2 = _mm_cvtpd_ps(_mm_load_pd(src+2)); this->m_value = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(1, 0, 1, 0)); return *this; } inline batch& batch::load_unaligned(const double* src) { __m128 tmp1 = _mm_cvtpd_ps(_mm_loadu_pd(src)); __m128 tmp2 = _mm_cvtpd_ps(_mm_loadu_pd(src + 2)); this->m_value = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(1, 0, 1, 0)); return *this; } inline void batch::store_aligned(int8_t* dst) const { __m128i tmp = _mm_cvtps_epi32(this->m_value); __m128i tmp1 = _mm_packs_epi32(tmp, _mm_set1_epi32(0)); __m128i tmp2 = _mm_packs_epi16(tmp1, _mm_set1_epi16(0)); _mm_storel_epi64((__m128i*)dst, tmp2); } inline void batch::store_unaligned(int8_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(uint8_t* dst) const { __m128i tmp = _mm_cvtps_epi32(this->m_value); __m128i tmp1 = _mm_packs_epi32(tmp, _mm_set1_epi32(0)); __m128i tmp2 = _mm_packus_epi16(tmp1, _mm_set1_epi16(0)); _mm_storel_epi64((__m128i*)dst, tmp2); } inline void batch::store_unaligned(uint8_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(int16_t* dst) const { __m128i tmp = _mm_cvtps_epi32(this->m_value); __m128i tmp1 = _mm_packs_epi32(tmp, _mm_set1_epi32(0)); _mm_storel_epi64((__m128i*)dst, tmp1); } inline void batch::store_unaligned(int16_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(uint16_t* dst) const { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION __m128i tmp = _mm_cvtps_epi32(this->m_value); __m128i tmp1 = _mm_packus_epi32(tmp, _mm_set1_epi32(0)); _mm_storel_epi64((__m128i*)dst, tmp1); #else alignas(16) float tmp[4]; _mm_store_ps(tmp, this->m_value); unroller<4>([&](std::size_t i){ dst[i] = static_cast(tmp[i]); }); #endif } inline void batch::store_unaligned(uint16_t* dst) const { store_aligned(dst); } inline void batch::store_aligned(int32_t* dst) const { _mm_store_si128((__m128i*)dst, _mm_cvtps_epi32(this->m_value)); } inline void batch::store_unaligned(int32_t* dst) const { _mm_storeu_si128((__m128i*)dst, _mm_cvtps_epi32(this->m_value)); } inline void batch::store_aligned(float* dst) const { _mm_store_ps(dst, this->m_value); } inline void batch::store_unaligned(float* dst) const { _mm_storeu_ps(dst, this->m_value); } inline void batch::store_aligned(double* dst) const { __m128d tmp1 = _mm_cvtps_pd(this->m_value); __m128 ftmp = _mm_shuffle_ps(this->m_value, this->m_value, _MM_SHUFFLE(3, 2, 3, 2)); __m128d tmp2 = _mm_cvtps_pd(ftmp); _mm_store_pd(dst, tmp1); _mm_store_pd(dst + 2, tmp2); } inline void batch::store_unaligned(double* dst) const { __m128d tmp1 = _mm_cvtps_pd(this->m_value); __m128 ftmp = _mm_shuffle_ps(this->m_value, this->m_value, _MM_SHUFFLE(3, 2, 3, 2)); __m128d tmp2 = _mm_cvtps_pd(ftmp); _mm_storeu_pd(dst, tmp1); _mm_storeu_pd(dst + 2, tmp2); } namespace detail { template <> struct batch_kernel { using batch_type = 
batch; using value_type = float; using batch_bool_type = batch_bool; static batch_type neg(const batch_type& rhs) { return _mm_xor_ps(rhs, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } static batch_type add(const batch_type& lhs, const batch_type& rhs) { return _mm_add_ps(lhs, rhs); } static batch_type sub(const batch_type& lhs, const batch_type& rhs) { return _mm_sub_ps(lhs, rhs); } static batch_type sadd(const batch_type& lhs, const batch_type& rhs) { return add(lhs, rhs); //do something special for inf? } static batch_type ssub(const batch_type& lhs, const batch_type& rhs) { return sub(lhs, rhs); //do something special for inf? } static batch_type mul(const batch_type& lhs, const batch_type& rhs) { return _mm_mul_ps(lhs, rhs); } static batch_type div(const batch_type& lhs, const batch_type& rhs) { return _mm_div_ps(lhs, rhs); } static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs) { return _mm_cmpeq_ps(lhs, rhs); } static batch_bool_type neq(const batch_type& lhs, const batch_type& rhs) { return _mm_cmpneq_ps(lhs, rhs); } static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { return _mm_cmplt_ps(lhs, rhs); } static batch_bool_type lte(const batch_type& lhs, const batch_type& rhs) { return _mm_cmple_ps(lhs, rhs); } static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) { return _mm_and_ps(lhs, rhs); } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { return _mm_or_ps(lhs, rhs); } static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { return _mm_xor_ps(lhs, rhs); } static batch_type bitwise_not(const batch_type& rhs) { return _mm_xor_ps(rhs, _mm_castsi128_ps(_mm_set1_epi32(-1))); } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { return _mm_andnot_ps(lhs, rhs); } static batch_type min(const batch_type& lhs, const batch_type& rhs) { return _mm_min_ps(lhs, rhs); } static batch_type max(const batch_type& lhs, const batch_type& rhs) { return _mm_max_ps(lhs, rhs); } static batch_type fmin(const batch_type& lhs, const batch_type& rhs) { return min(lhs, rhs); } static batch_type fmax(const batch_type& lhs, const batch_type& rhs) { return max(lhs, rhs); } static batch_type abs(const batch_type& rhs) { __m128 sign_mask = _mm_set1_ps(-0.f); // -0.f = 1 << 31 return _mm_andnot_ps(sign_mask, rhs); } static batch_type fabs(const batch_type& rhs) { return abs(rhs); } static batch_type sqrt(const batch_type& rhs) { return _mm_sqrt_ps(rhs); } static batch_type fma(const batch_type& x, const batch_type& y, const batch_type& z) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_FMA3_VERSION return _mm_fmadd_ps(x, y, z); #elif XSIMD_X86_INSTR_SET >= XSIMD_X86_AMD_FMA4_VERSION return _mm_macc_ps(x, y, z); #else return x * y + z; #endif } static batch_type fms(const batch_type& x, const batch_type& y, const batch_type& z) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_FMA3_VERSION return _mm_fmsub_ps(x, y, z); #elif XSIMD_X86_INSTR_SET >= XSIMD_X86_AMD_FMA4_VERSION return _mm_msub_ps(x, y, z); #else return x * y - z; #endif } static batch_type fnma(const batch_type& x, const batch_type& y, const batch_type& z) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_FMA3_VERSION return _mm_fnmadd_ps(x, y, z); #elif XSIMD_X86_INSTR_SET >= XSIMD_X86_AMD_FMA4_VERSION return _mm_nmacc_ps(x, y, z); #else return -x * y + z; #endif } static batch_type fnms(const batch_type& x, const batch_type& y, const batch_type& z) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_FMA3_VERSION return _mm_fnmsub_ps(x, y, z); #elif 
XSIMD_X86_INSTR_SET >= XSIMD_X86_AMD_FMA4_VERSION return _mm_nmsub_ps(x, y, z); #else return -x * y - z; #endif } static value_type hadd(const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE3_VERSION __m128 tmp0 = _mm_hadd_ps(rhs, rhs); __m128 tmp1 = _mm_hadd_ps(tmp0, tmp0); #else __m128 tmp0 = _mm_add_ps(rhs, _mm_movehl_ps(rhs, rhs)); __m128 tmp1 = _mm_add_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1)); #endif return _mm_cvtss_f32(tmp1); } static batch_type haddp(const batch_type* row) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE3_VERSION return _mm_hadd_ps(_mm_hadd_ps(row[0], row[1]), _mm_hadd_ps(row[2], row[3])); #else __m128 tmp0 = _mm_unpacklo_ps(row[0], row[1]); __m128 tmp1 = _mm_unpackhi_ps(row[0], row[1]); __m128 tmp2 = _mm_unpackhi_ps(row[2], row[3]); tmp0 = _mm_add_ps(tmp0, tmp1); tmp1 = _mm_unpacklo_ps(row[2], row[3]); tmp1 = _mm_add_ps(tmp1, tmp2); tmp2 = _mm_movehl_ps(tmp1, tmp0); tmp0 = _mm_movelh_ps(tmp0, tmp1); return _mm_add_ps(tmp0, tmp2); #endif } static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION return _mm_blendv_ps(b, a, cond); #else return _mm_or_ps(_mm_and_ps(cond, a), _mm_andnot_ps(cond, b)); #endif } template static batch_type select(const batch_bool_constant& cond, const batch_type& a, const batch_type& b) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION (void)cond; constexpr int mask = batch_bool_constant::mask(); return _mm_blend_ps(b, a, mask); #else return select((batch_bool_type)cond, a, b); #endif } static batch_bool_type isnan(const batch_type& x) { return _mm_cmpunord_ps(x, x); } static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs) { return _mm_unpacklo_ps(lhs, rhs); } static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs) { return _mm_unpackhi_ps(lhs, rhs); } static batch_type extract_pair(const batch_type& lhs, const batch_type& rhs, const int n) { batch_type b_concatenate; for (int i = 0 ; i < (4 - n); ++i) { b_concatenate[i] = lhs[i + n]; if(i < n) { b_concatenate[4 - 1 - i] = rhs[n - 1 - i]; } } return b_concatenate; } }; } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_sse_int16.hpp000066400000000000000000000304111410101234500223010ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_SSE_INT16_HPP #define XSIMD_SSE_INT16_HPP #include #include "xsimd_base.hpp" #include "xsimd_sse_int_base.hpp" namespace xsimd { /************************** * batch_bool * **************************/ template <> struct simd_batch_traits> { using value_type = int16_t; static constexpr std::size_t size = 8; using batch_type = batch; static constexpr std::size_t align = 16; }; template <> struct simd_batch_traits> { using value_type = uint16_t; static constexpr std::size_t size = 8; using batch_type = batch; static constexpr std::size_t align = 16; }; template <> class batch_bool : public sse_int_batch_bool { public: using sse_int_batch_bool::sse_int_batch_bool; }; template <> class batch_bool : public sse_int_batch_bool { public: using sse_int_batch_bool::sse_int_batch_bool; }; namespace detail { template <> struct batch_bool_kernel : public sse_int_batch_bool_kernel { }; template <> struct batch_bool_kernel : public sse_int_batch_bool_kernel { }; } /********************* * batch * *********************/ template <> struct simd_batch_traits> { using value_type = int16_t; static constexpr std::size_t size = 8; using batch_bool_type = batch_bool; static constexpr std::size_t align = 16; using storage_type = __m128i; }; template <> struct simd_batch_traits> { using value_type = uint16_t; static constexpr std::size_t size = 8; using batch_bool_type = batch_bool; static constexpr std::size_t align = 16; using storage_type = __m128i; }; template <> class batch : public sse_int_batch { public: using base_class = sse_int_batch; using base_class::base_class; using base_class::load_aligned; using base_class::load_unaligned; using base_class::store_aligned; using base_class::store_unaligned; XSIMD_DECLARE_LOAD_STORE_INT16(int16_t, 8) XSIMD_DECLARE_LOAD_STORE_LONG(int16_t, 8) }; template <> class batch : public sse_int_batch { public: using base_class = sse_int_batch; using base_class::base_class; using base_class::load_aligned; using base_class::load_unaligned; using base_class::store_aligned; using base_class::store_unaligned; XSIMD_DECLARE_LOAD_STORE_INT16(uint16_t, 8) XSIMD_DECLARE_LOAD_STORE_LONG(uint16_t, 8) }; batch operator<<(const batch& lhs, int32_t rhs); batch operator>>(const batch& lhs, int32_t rhs); batch operator<<(const batch& lhs, const batch& rhs); batch operator>>(const batch& lhs, const batch& rhs); batch operator<<(const batch& lhs, int32_t rhs); batch operator>>(const batch& lhs, int32_t rhs); batch operator<<(const batch& lhs, const batch& rhs); batch operator>>(const batch& lhs, const batch& rhs); /************************************ * batch implementation * ************************************/ namespace detail { template struct sse_int16_batch_kernel : sse_int_kernel_base> { using batch_type = batch; using value_type = T; using batch_bool_type = batch_bool; static constexpr bool is_signed = std::is_signed::value; static batch_type add(const batch_type& lhs, const batch_type& rhs) { return _mm_add_epi16(lhs, rhs); } static batch_type sub(const batch_type& lhs, const batch_type& rhs) { return _mm_sub_epi16(lhs, rhs); } static batch_type sadd(const batch_type& lhs, const batch_type& rhs) { return _mm_adds_epi16(lhs, rhs); } static batch_type ssub(const batch_type& lhs, const batch_type& rhs) { return _mm_subs_epi16(lhs, rhs); } static batch_type mul(const batch_type& lhs, const batch_type& rhs) { return _mm_mullo_epi16(lhs, rhs); } static batch_bool_type eq(const batch_type& lhs, const 
batch_type& rhs) { return _mm_cmpeq_epi16(lhs, rhs); } static value_type hadd(const batch_type& rhs) { // TODO implement with hadd_epi16 alignas(16) T tmp[8]; rhs.store_aligned(tmp); T res = 0; for (int i = 0; i < 8; ++i) { res += tmp[i]; } return res; } static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION return _mm_blendv_epi8(b, a, cond); #else return _mm_or_si128(_mm_and_si128(cond, a), _mm_andnot_si128(cond, b)); #endif } static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs) { return _mm_unpacklo_epi16(lhs, rhs); } static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs) { return _mm_unpackhi_epi16(lhs, rhs); } static batch_type extract_pair(const batch_type& v_lhs, const batch_type& v_rhs, const int num) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSSE3_VERSION const batch_type lhs = v_rhs; const batch_type rhs = v_lhs; const int n = 2 * num; switch(n) { case 0: return rhs; XSIMD_REPEAT_16_v2(_mm_alignr_epi8); default: break; } return batch_type(T(0)); #else batch_type b_concatenate; const int n = num; for (int i = 0 ; i < (8 - n); ++i) { b_concatenate[i] = v_lhs[i + n]; if(i < n) { b_concatenate[8 - 1 - i] = v_rhs[n - 1 - i]; } } return b_concatenate; #endif } }; template <> struct batch_kernel : sse_int16_batch_kernel { static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { return _mm_cmplt_epi16(lhs, rhs); } static batch_type min(const batch_type& lhs, const batch_type& rhs) { return _mm_min_epi16(lhs, rhs); } static batch_type max(const batch_type& lhs, const batch_type& rhs) { return _mm_max_epi16(lhs, rhs); } static batch_type abs(const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSSE3_VERSION return _mm_abs_epi16(rhs); #else __m128i tmp, res; tmp = _mm_cmplt_epi16(rhs, _mm_setzero_si128()); res = _mm_xor_si128(rhs, tmp); res = _mm_sub_epi16(res, tmp); return res; #endif } }; template <> struct batch_kernel : public sse_int16_batch_kernel { static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { return _mm_cmplt_epi16(_mm_xor_si128(lhs, _mm_set1_epi16(std::numeric_limits::min())), _mm_xor_si128(rhs, _mm_set1_epi16(std::numeric_limits::min()))); } static batch_type min(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION return _mm_min_epu16(lhs, rhs); #else return select(lhs < rhs, lhs, rhs); #endif } static batch_type max(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION return _mm_max_epu16(lhs, rhs); #else return select(lhs < rhs, rhs, lhs); #endif } static batch_type abs(const batch_type& rhs) { return rhs; } static batch_type sadd(const batch_type& lhs, const batch_type& rhs) { return _mm_adds_epu16(lhs, rhs); } static batch_type ssub(const batch_type& lhs, const batch_type& rhs) { return _mm_subs_epu16(lhs, rhs); } }; } XSIMD_DEFINE_LOAD_STORE_INT16(int16_t, 8, 16) XSIMD_DEFINE_LOAD_STORE_LONG(int16_t, 8, 16) inline batch operator<<(const batch& lhs, int32_t rhs) { return _mm_slli_epi16(lhs, rhs); } inline batch operator>>(const batch& lhs, int32_t rhs) { return _mm_srai_epi16(lhs, rhs); } inline batch operator<<(const batch& lhs, const batch& rhs) { #if defined(XSIMD_AVX512VL_AVAILABLE) && defined(XSIMD_AVX512BW_AVAILABLE) return _mm_sllv_epi16(lhs, rhs); #else return sse_detail::shift_impl([](int16_t alhs, int16_t s) { return alhs << s; }, lhs, rhs); #endif } inline batch operator>>(const batch& lhs, const 
batch& rhs) { #if defined(XSIMD_AVX512VL_AVAILABLE) && defined(XSIMD_AVX512BW_AVAILABLE) return _mm_srav_epi16(lhs, rhs); #else return sse_detail::shift_impl([](int16_t alhs, int16_t s) { return alhs >> s; }, lhs, rhs); #endif } XSIMD_DEFINE_LOAD_STORE_INT16(uint16_t, 8, 16) XSIMD_DEFINE_LOAD_STORE_LONG(uint16_t, 8, 16) inline batch operator<<(const batch& lhs, int32_t rhs) { return _mm_slli_epi16(lhs, rhs); } inline batch operator>>(const batch& lhs, int32_t rhs) { return _mm_srli_epi16(lhs, rhs); } inline batch operator<<(const batch& lhs, const batch& rhs) { #if defined(XSIMD_AVX512VL_AVAILABLE) && defined(XSIMD_AVX512BW_AVAILABLE) return _mm_sllv_epi16(lhs, rhs); #else return sse_detail::shift_impl([](uint16_t alhs, int16_t s) { return alhs << s; }, lhs, rhs); #endif } inline batch operator>>(const batch& lhs, const batch& rhs) { #if defined(XSIMD_AVX512VL_AVAILABLE) && defined(XSIMD_AVX512BW_AVAILABLE) return _mm_srlv_epi16(lhs, rhs); #else return sse_detail::shift_impl([](uint16_t alhs, int16_t s) { return alhs >> s; }, lhs, rhs); #endif } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_sse_int32.hpp000066400000000000000000000523031410101234500223030ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_SSE_INT32_HPP #define XSIMD_SSE_INT32_HPP #include #include "xsimd_base.hpp" #include "xsimd_sse_int_base.hpp" namespace xsimd { /************************** * batch_bool * **************************/ template <> struct simd_batch_traits> { using value_type = int32_t; static constexpr std::size_t size = 4; using batch_type = batch; static constexpr std::size_t align = 16; }; template <> struct simd_batch_traits> { using value_type = uint32_t; static constexpr std::size_t size = 4; using batch_type = batch; static constexpr std::size_t align = 16; }; template <> class batch_bool : public sse_int_batch_bool { public: using sse_int_batch_bool::sse_int_batch_bool; }; template <> class batch_bool : public sse_int_batch_bool { public: using sse_int_batch_bool::sse_int_batch_bool; }; namespace detail { template <> struct batch_bool_kernel : public sse_int_batch_bool_kernel { }; template <> struct batch_bool_kernel : public sse_int_batch_bool_kernel { }; } /********************* * batch * *********************/ template <> struct simd_batch_traits> { using value_type = int32_t; static constexpr std::size_t size = 4; using batch_bool_type = batch_bool; static constexpr std::size_t align = 16; using storage_type = __m128i; }; template <> struct simd_batch_traits> { using value_type = uint32_t; static constexpr std::size_t size = 4; using batch_bool_type = batch_bool; static constexpr std::size_t align = 16; using storage_type = __m128i; }; template <> class batch : public sse_int_batch { public: using base_type = sse_int_batch; using base_type::base_type; using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; XSIMD_DECLARE_LOAD_STORE_INT32(int32_t, 4) XSIMD_DECLARE_LOAD_STORE_LONG(int32_t, 4) }; template <> class batch : public sse_int_batch { public: using base_type = sse_int_batch; using base_type::base_type; using 
base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; XSIMD_DECLARE_LOAD_STORE_INT32(uint32_t, 4) XSIMD_DECLARE_LOAD_STORE_LONG(uint32_t, 4) }; batch operator<<(const batch& lhs, int32_t rhs); batch operator>>(const batch& lhs, int32_t rhs); batch operator<<(const batch& lhs, const batch& rhs); batch operator>>(const batch& lhs, const batch& rhs); batch operator<<(const batch& lhs, int32_t rhs); batch operator>>(const batch& lhs, int32_t rhs); batch operator<<(const batch& lhs, const batch& rhs); batch operator>>(const batch& lhs, const batch& rhs); /************************************ * batch implementation * ************************************/ namespace sse_detail { inline __m128i load_aligned_int32(const int8_t* src) { __m128i tmp = _mm_loadl_epi64((const __m128i*)src); #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION __m128i res = _mm_cvtepi8_epi32(tmp); #else __m128i mask = _mm_cmplt_epi8(tmp, _mm_set1_epi8(0)); __m128i tmp1 = _mm_unpacklo_epi8(tmp, mask); mask = _mm_cmplt_epi16(tmp1, _mm_set1_epi16(0)); __m128i res = _mm_unpacklo_epi16(tmp1, mask); #endif return res; } inline __m128i load_aligned_int32(const uint8_t* src) { __m128i tmp = _mm_loadl_epi64((const __m128i*)src); #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION __m128i res = _mm_cvtepu8_epi32(tmp); #else __m128i tmp2 = _mm_unpacklo_epi8(tmp, _mm_set1_epi8(0)); __m128i res = _mm_unpacklo_epi16(tmp2, _mm_set1_epi16(0)); #endif return res; } inline __m128i load_aligned_int32(const int16_t* src) { __m128i tmp = _mm_loadl_epi64((const __m128i*)src); #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION __m128i res = _mm_cvtepi16_epi32(tmp); #else __m128i mask = _mm_cmplt_epi16(tmp, _mm_set1_epi16(0)); __m128i res = _mm_unpacklo_epi16(tmp, mask); #endif return res; } inline __m128i load_aligned_int32(const uint16_t* src) { __m128i tmp = _mm_loadl_epi64((const __m128i*)src); #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION __m128i res = _mm_cvtepu16_epi32(tmp); #else __m128i res = _mm_unpacklo_epi16(tmp, _mm_set1_epi16(0)); #endif return res; } inline void store_aligned_int32(__m128i src, int8_t* dst) { __m128i tmp1 = _mm_packs_epi32(src, src); __m128i tmp2 = _mm_packs_epi16(tmp1, tmp1); _mm_storel_epi64((__m128i*)dst, tmp2); } inline void store_aligned_int32(__m128i src, uint8_t* dst) { __m128i tmp1 = _mm_packs_epi32(src, src); __m128i tmp2 = _mm_packus_epi16(tmp1, tmp1); _mm_storel_epi64((__m128i*)dst, tmp2); } inline void store_aligned_int32(__m128i src, int16_t* dst) { __m128i tmp1 = _mm_packs_epi32(src, src); _mm_storel_epi64((__m128i*)dst, tmp1); } inline void store_aligned_int32(__m128i src, uint16_t* dst) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION __m128i tmp = _mm_packus_epi32(src, src); _mm_storel_epi64((__m128i*)dst, tmp); #else alignas(16) int32_t tmp[4]; _mm_store_si128((__m128i*)tmp, src); unroller<4>([&](std::size_t i){ dst[i] = static_cast(tmp[i]); }); #endif } } #define SSE_DEFINE_LOAD_STORE_INT32(TYPE, CVT_TYPE) \ inline batch& batch::load_aligned(const CVT_TYPE* src) \ { \ this->m_value = sse_detail::load_aligned_int32(src); \ return *this; \ } \ inline batch& batch::load_unaligned(const CVT_TYPE* src) \ { \ return load_aligned(src); \ } \ inline void batch::store_aligned(CVT_TYPE* dst) const \ { \ sse_detail::store_aligned_int32(this->m_value, dst); \ } \ inline void batch::store_unaligned(CVT_TYPE* dst) const \ { \ store_aligned(dst); \ } XSIMD_DEFINE_LOAD_STORE(int32_t, 4, bool, 16) 
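    // Each SSE_DEFINE_LOAD_STORE_INT32 invocation below expands (per the macro above) into
    // the four aligned/unaligned load and store overloads that convert between a 4-lane
    // 32-bit integer batch and the narrower integer type, using the load_aligned_int32 /
    // store_aligned_int32 helpers from sse_detail.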
SSE_DEFINE_LOAD_STORE_INT32(int32_t, int8_t) SSE_DEFINE_LOAD_STORE_INT32(int32_t, uint8_t) SSE_DEFINE_LOAD_STORE_INT32(int32_t, int16_t) SSE_DEFINE_LOAD_STORE_INT32(int32_t, uint16_t) XSIMD_DEFINE_LOAD_STORE_LONG(int32_t, 4, 16) XSIMD_DEFINE_LOAD_STORE(int32_t, 4, int64_t, 16) XSIMD_DEFINE_LOAD_STORE(int32_t, 4, uint64_t, 16) inline batch& batch::load_aligned(const float* src) { this->m_value = _mm_cvtps_epi32(_mm_load_ps(src)); return *this; } inline batch& batch::load_unaligned(const float* src) { this->m_value = _mm_cvtps_epi32(_mm_loadu_ps(src)); return *this; } inline batch& batch::load_aligned(const double* src) { __m128i tmp1 = _mm_cvtpd_epi32(_mm_load_pd(src)); __m128i tmp2 = _mm_cvtpd_epi32(_mm_load_pd(src + 2)); this->m_value = _mm_unpacklo_epi64(tmp1, tmp2); return *this; } inline batch& batch::load_unaligned(const double* src) { __m128i tmp1 = _mm_cvtpd_epi32(_mm_loadu_pd(src)); __m128i tmp2 = _mm_cvtpd_epi32(_mm_loadu_pd(src + 2)); this->m_value = _mm_unpacklo_epi64(tmp1, tmp2); return *this; } inline void batch::store_aligned(float* dst) const { _mm_store_ps(dst, _mm_cvtepi32_ps(this->m_value)); } inline void batch::store_unaligned(float* dst) const { _mm_storeu_ps(dst, _mm_cvtepi32_ps(this->m_value)); } inline void batch::store_aligned(double* dst) const { __m128d tmp1 = _mm_cvtepi32_pd(this->m_value); __m128d tmp2 = _mm_cvtepi32_pd(_mm_unpackhi_epi64(this->m_value, this->m_value)); _mm_store_pd(dst, tmp1); _mm_store_pd(dst + 2, tmp2); } inline void batch::store_unaligned(double* dst) const { __m128d tmp1 = _mm_cvtepi32_pd(this->m_value); __m128d tmp2 = _mm_cvtepi32_pd(_mm_unpackhi_epi64(this->m_value, this->m_value)); _mm_storeu_pd(dst, tmp1); _mm_storeu_pd(dst + 2, tmp2); } XSIMD_DEFINE_LOAD_STORE(uint32_t, 4, bool, 16) SSE_DEFINE_LOAD_STORE_INT32(uint32_t, int8_t) SSE_DEFINE_LOAD_STORE_INT32(uint32_t, uint8_t) SSE_DEFINE_LOAD_STORE_INT32(uint32_t, int16_t) SSE_DEFINE_LOAD_STORE_INT32(uint32_t, uint16_t) XSIMD_DEFINE_LOAD_STORE_LONG(uint32_t, 4, 16) XSIMD_DEFINE_LOAD_STORE(uint32_t, 4, int64_t, 16) XSIMD_DEFINE_LOAD_STORE(uint32_t, 4, uint64_t, 16) XSIMD_DEFINE_LOAD_STORE(uint32_t, 4, float, 16) XSIMD_DEFINE_LOAD_STORE(uint32_t, 4, double, 16) #undef SSE_DEFINE_LOAD_STORE_INT32 namespace detail { template struct sse_int32_batch_kernel : sse_int_kernel_base> { using batch_type = batch; using value_type = T; using batch_bool_type = batch_bool; static batch_type add(const batch_type& lhs, const batch_type& rhs) { return _mm_add_epi32(lhs, rhs); } static batch_type sub(const batch_type& lhs, const batch_type& rhs) { return _mm_sub_epi32(lhs, rhs); } static batch_type sadd(const batch_type& lhs, const batch_type& rhs) { batch_type mask = rhs >> (8 * sizeof(value_type) - 1); batch_type lhs_pos_branch = min(std::numeric_limits::max() - rhs, lhs); batch_type lhs_neg_branch = max(std::numeric_limits::min() - rhs, lhs); return rhs + select((typename batch_type::storage_type)mask, lhs_neg_branch, lhs_pos_branch); } static batch_type ssub(const batch_type& lhs, const batch_type& rhs) { return sadd(lhs, sub(_mm_setzero_si128(), rhs)); } static batch_type mul(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION return _mm_mullo_epi32(lhs, rhs); #else __m128i a13 = _mm_shuffle_epi32(lhs, 0xF5); __m128i b13 = _mm_shuffle_epi32(rhs, 0xF5); __m128i prod02 = _mm_mul_epu32(lhs, rhs); __m128i prod13 = _mm_mul_epu32(a13, b13); __m128i prod01 = _mm_unpacklo_epi32(prod02, prod13); __m128i prod23 = _mm_unpackhi_epi32(prod02, prod13); return 
_mm_unpacklo_epi64(prod01, prod23); #endif } static batch_type div(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_FAST_INTEGER_DIVISION) return _mm_cvttps_epi32(_mm_div_ps(_mm_cvtepi32_ps(lhs), _mm_cvtepi32_ps(rhs))); #else XSIMD_MACRO_UNROLL_BINARY(/); #endif } static batch_type mod(const batch_type& lhs, const batch_type& rhs) { XSIMD_MACRO_UNROLL_BINARY(%); } static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs) { return _mm_cmpeq_epi32(lhs, rhs); } static value_type hadd(const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSSE3_VERSION __m128i tmp1 = _mm_hadd_epi32(rhs, rhs); __m128i tmp2 = _mm_hadd_epi32(tmp1, tmp1); return _mm_cvtsi128_si32(tmp2); #else __m128i tmp1 = _mm_shuffle_epi32(rhs, 0x0E); __m128i tmp2 = _mm_add_epi32(rhs, tmp1); __m128i tmp3 = _mm_shuffle_epi32(tmp2, 0x01); __m128i tmp4 = _mm_add_epi32(tmp2, tmp3); return _mm_cvtsi128_si32(tmp4); #endif } static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION return _mm_blendv_epi8(b, a, cond); #else return _mm_or_si128(_mm_and_si128(cond, a), _mm_andnot_si128(cond, b)); #endif } static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs) { return _mm_unpacklo_epi32(lhs, rhs); } static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs) { return _mm_unpackhi_epi32(lhs, rhs); } static batch_type extract_pair(const batch_type& v_lhs, const batch_type& v_rhs, const int num) { #if defined(XSIMD_AVX512VL_AVAILABLE) const batch_type lhs = v_rhs; const batch_type rhs = v_lhs; const int n = num; switch(n) { case 0: return rhs; XSIMD_REPEAT_4(_mm_alignr_epi32); default: break; } return batch_type(T(0)); #else #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSSE3_VERSION const batch_type lhs = v_rhs; const batch_type rhs = v_lhs; const int n = 4 * num; switch(n) { case 0: return rhs; XSIMD_REPEAT_16_v2(_mm_alignr_epi8); default: break; } return batch_type(T(0)); #else batch_type b_concatenate; const int n = num; for (int i = 0 ; i < (4 - n); ++i) { b_concatenate[i] = v_lhs[i + n]; if(i < n) { b_concatenate[4 - 1 - i] = v_rhs[n - 1 - i]; } } return b_concatenate; #endif #endif } }; template <> struct batch_kernel : sse_int32_batch_kernel { static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { return _mm_cmplt_epi32(lhs, rhs); } static batch_type min(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION return _mm_min_epi32(lhs, rhs); #else __m128i greater = _mm_cmpgt_epi32(lhs, rhs); return select(greater, rhs, lhs); #endif } static batch_type max(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION return _mm_max_epi32(lhs, rhs); #else __m128i greater = _mm_cmpgt_epi32(lhs, rhs); return select(greater, lhs, rhs); #endif } static batch_type abs(const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSSE3_VERSION return _mm_abs_epi32(rhs); #else __m128i sign = _mm_srai_epi32(rhs, 31); __m128i inv = _mm_xor_si128(rhs, sign); return _mm_sub_epi32(inv, sign); #endif } }; template <> struct batch_kernel : sse_int32_batch_kernel { static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { auto xlhs = _mm_xor_si128(lhs, _mm_set1_epi32(std::numeric_limits::lowest())); auto xrhs = _mm_xor_si128(rhs, _mm_set1_epi32(std::numeric_limits::lowest())); return _mm_cmplt_epi32(xlhs, xrhs); } static batch_type min(const batch_type& lhs, const batch_type& rhs) { #if 
XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION return _mm_min_epu32(lhs, rhs); #else auto mask = lhs < rhs; return select(mask, lhs, rhs); #endif } static batch_type max(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION return _mm_max_epu32(lhs, rhs); #else auto mask = lhs < rhs; return select(mask, rhs, lhs); #endif } static batch_type abs(const batch_type& rhs) { return rhs; } static batch_type sadd(const batch_type& lhs, const batch_type& rhs) { const auto diffmax = std::numeric_limits::max() - lhs; const auto mindiff = min(diffmax, rhs); return lhs + mindiff; } static batch_type ssub(const batch_type& lhs, const batch_type& rhs) { const auto diff = min(lhs, rhs); return lhs - diff; } }; } inline batch operator<<(const batch& lhs, int32_t rhs) { return _mm_slli_epi32(lhs, rhs); } inline batch operator>>(const batch& lhs, int32_t rhs) { return _mm_srai_epi32(lhs, rhs); } inline batch operator<<(const batch& lhs, const batch& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm_sllv_epi32(lhs, rhs); #else return sse_detail::shift_impl([](int32_t alhs, int32_t s) { return alhs << s; }, lhs, rhs); #endif } inline batch operator>>(const batch& lhs, const batch& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm_srav_epi32(lhs, rhs); #else return sse_detail::shift_impl([](int32_t alhs, int32_t s) { return alhs >> s; }, lhs, rhs); #endif } inline batch operator<<(const batch& lhs, int32_t rhs) { return _mm_slli_epi32(lhs, rhs); } inline batch operator>>(const batch& lhs, int32_t rhs) { return _mm_srli_epi32(lhs, rhs); } inline batch operator<<(const batch& lhs, const batch& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm_sllv_epi32(lhs, rhs); #else return sse_detail::shift_impl([](uint32_t alhs, int32_t s) { return alhs << s; }, lhs, rhs); #endif } inline batch operator>>(const batch& lhs, const batch& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm_srlv_epi32(lhs, rhs); #else return sse_detail::shift_impl([](uint32_t alhs, int32_t s) { return alhs >> s; }, lhs, rhs); #endif } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_sse_int64.hpp000066400000000000000000000475271410101234500223240ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_SSE_INT64_HPP #define XSIMD_SSE_INT64_HPP #include #include "xsimd_base.hpp" #include "xsimd_sse_int_base.hpp" namespace xsimd { /************************** * batch_bool * **************************/ template <> struct simd_batch_traits> { using value_type = int64_t; static constexpr std::size_t size = 2; using batch_type = batch; static constexpr std::size_t align = 16; }; template <> struct simd_batch_traits> { using value_type = uint64_t; static constexpr std::size_t size = 2; using batch_type = batch; static constexpr std::size_t align = 16; }; template <> class batch_bool : public sse_int_batch_bool { public: using sse_int_batch_bool::sse_int_batch_bool; }; template <> class batch_bool : public sse_int_batch_bool { public: using sse_int_batch_bool::sse_int_batch_bool; }; namespace detail { template <> struct batch_bool_kernel : public sse_int_batch_bool_kernel { }; template <> struct batch_bool_kernel : public sse_int_batch_bool_kernel { }; } /********************* * batch * *********************/ template <> struct simd_batch_traits> { using value_type = int64_t; static constexpr std::size_t size = 2; using batch_bool_type = batch_bool; static constexpr std::size_t align = 16; using storage_type = __m128i; }; template <> struct simd_batch_traits> { using value_type = uint64_t; static constexpr std::size_t size = 2; using batch_bool_type = batch_bool; static constexpr std::size_t align = 16; using storage_type = __m128i; }; template <> class batch : public sse_int_batch { public: using base_type = sse_int_batch; using base_type::base_type; using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; XSIMD_DECLARE_LOAD_STORE_INT64(int64_t, 2) XSIMD_DECLARE_LOAD_STORE_LONG(int64_t, 2) }; template <> class batch : public sse_int_batch { public: using base_type = sse_int_batch; using base_type::base_type; using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; XSIMD_DECLARE_LOAD_STORE_INT64(uint64_t, 2) XSIMD_DECLARE_LOAD_STORE_LONG(uint64_t, 2) }; batch operator<<(const batch& lhs, int32_t rhs); batch operator>>(const batch& lhs, int32_t rhs); batch operator<<(const batch& lhs, const batch& rhs); batch operator>>(const batch& lhs, const batch& rhs); batch operator<<(const batch& lhs, int32_t rhs); batch operator>>(const batch& lhs, int32_t rhs); batch operator<<(const batch& lhs, const batch& rhs); batch operator>>(const batch& lhs, const batch& rhs); /************************************ * batch implementation * ************************************/ namespace sse_detail { inline __m128i load_aligned_int64(const int8_t* src) { __m128i tmp = _mm_loadl_epi64((const __m128i*)src); #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION __m128i res = _mm_cvtepi8_epi64(tmp); #else __m128i mask = _mm_cmplt_epi8(tmp, _mm_set1_epi8(0)); __m128i tmp1 = _mm_unpacklo_epi8(tmp, mask); mask = _mm_cmplt_epi16(tmp1, _mm_set1_epi16(0)); __m128i tmp2 = _mm_unpacklo_epi16(tmp1, mask); mask = _mm_cmplt_epi32(tmp2, _mm_set1_epi32(0)); __m128i res = _mm_unpacklo_epi32(tmp2, mask); #endif return res; } inline __m128i load_aligned_int64(const uint8_t* src) { __m128i tmp = _mm_loadl_epi64((const __m128i*)src); #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION __m128i res = _mm_cvtepu8_epi64(tmp); #else __m128i tmp1 = _mm_unpacklo_epi8(tmp, _mm_set1_epi8(0)); __m128i tmp2 = 
_mm_unpacklo_epi16(tmp1, _mm_set1_epi16(0)); __m128i res = _mm_unpacklo_epi32(tmp2, _mm_set1_epi32(0)); #endif return res; } inline __m128i load_aligned_int64(const int16_t* src) { __m128i tmp = _mm_loadl_epi64((const __m128i*)src); #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION __m128i res = _mm_cvtepi16_epi64(tmp); #else __m128i mask = _mm_cmplt_epi16(tmp, _mm_set1_epi16(0)); __m128i tmp1 = _mm_unpacklo_epi16(tmp, mask); mask = _mm_cmplt_epi32(tmp1, _mm_set1_epi32(0)); __m128i res = _mm_unpacklo_epi32(tmp1, mask); #endif return res; } inline __m128i load_aligned_int64(const uint16_t* src) { __m128i tmp = _mm_loadl_epi64((const __m128i*)src); #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION __m128i res = _mm_cvtepu16_epi64(tmp); #else __m128i tmp1 = _mm_unpacklo_epi16(tmp, _mm_set1_epi16(0)); __m128i res = _mm_unpacklo_epi32(tmp1, _mm_set1_epi32(0)); #endif return res; } inline void store_aligned_int64(__m128i src, int8_t* dst) { alignas(16) int64_t tmp[2]; _mm_store_si128((__m128i*)tmp, src); dst[0] = static_cast(tmp[0]); dst[1] = static_cast(tmp[1]); } inline void store_aligned_int64(__m128i src, uint8_t* dst) { alignas(16) int64_t tmp[2]; _mm_store_si128((__m128i*)tmp, src); dst[0] = static_cast(tmp[0]); dst[1] = static_cast(tmp[1]); } inline void store_aligned_int64(__m128i src, int16_t* dst) { alignas(16) int64_t tmp[2]; _mm_store_si128((__m128i*)tmp, src); dst[0] = static_cast(tmp[0]); dst[1] = static_cast(tmp[1]); } inline void store_aligned_int64(__m128i src, uint16_t* dst) { alignas(16) int64_t tmp[2]; _mm_store_si128((__m128i*)tmp, src); dst[0] = static_cast(tmp[0]); dst[1] = static_cast(tmp[1]); } } #define SSE_DEFINE_LOAD_STORE_INT64(TYPE, CVT_TYPE) \ inline batch& batch::load_aligned(const CVT_TYPE* src) \ { \ this->m_value = sse_detail::load_aligned_int64(src); \ return *this; \ } \ inline batch& batch::load_unaligned(const CVT_TYPE* src) \ { \ return load_aligned(src); \ } \ inline void batch::store_aligned(CVT_TYPE* dst) const \ { \ sse_detail::store_aligned_int64(this->m_value, dst); \ } \ inline void batch::store_unaligned(CVT_TYPE* dst) const \ { \ store_aligned(dst); \ } XSIMD_DEFINE_LOAD_STORE(int64_t, 2, bool, 16) SSE_DEFINE_LOAD_STORE_INT64(int64_t, int8_t) SSE_DEFINE_LOAD_STORE_INT64(int64_t, uint8_t) SSE_DEFINE_LOAD_STORE_INT64(int64_t, int16_t) SSE_DEFINE_LOAD_STORE_INT64(int64_t, uint16_t) XSIMD_DEFINE_LOAD_STORE(int64_t, 2, int32_t, 16) XSIMD_DEFINE_LOAD_STORE(int64_t, 2, uint32_t, 16) XSIMD_DEFINE_LOAD_STORE_LONG(int64_t, 2, 16) XSIMD_DEFINE_LOAD_STORE(int64_t, 2, float, 16) XSIMD_DEFINE_LOAD_STORE(int64_t, 2, double, 16) XSIMD_DEFINE_LOAD_STORE(uint64_t, 2, bool, 16) SSE_DEFINE_LOAD_STORE_INT64(uint64_t, int8_t) SSE_DEFINE_LOAD_STORE_INT64(uint64_t, uint8_t) SSE_DEFINE_LOAD_STORE_INT64(uint64_t, int16_t) SSE_DEFINE_LOAD_STORE_INT64(uint64_t, uint16_t) XSIMD_DEFINE_LOAD_STORE(uint64_t, 2, int32_t, 16) XSIMD_DEFINE_LOAD_STORE(uint64_t, 2, uint32_t, 16) XSIMD_DEFINE_LOAD_STORE_LONG(uint64_t, 2, 16) XSIMD_DEFINE_LOAD_STORE(uint64_t, 2, float, 16) XSIMD_DEFINE_LOAD_STORE(uint64_t, 2, double, 16) #undef SSE_DEFINE_LOAD_STORE_INT64 namespace detail { template struct sse_int64_batch_kernel : sse_int_kernel_base> { using batch_type = batch; using value_type = T; using batch_bool_type = batch_bool; static batch_type add(const batch_type& lhs, const batch_type& rhs) { return _mm_add_epi64(lhs, rhs); } static batch_type sub(const batch_type& lhs, const batch_type& rhs) { return _mm_sub_epi64(lhs, rhs); } static batch_type sadd(const batch_type& lhs, const 
batch_type& rhs) { batch_type mask = rhs >> (8 * sizeof(value_type) - 1); batch_type lhs_pos_branch = min(std::numeric_limits::max() - rhs, lhs); batch_type lhs_neg_branch = max(std::numeric_limits::min() - rhs, lhs); return rhs + select((typename batch_type::storage_type)mask, lhs_neg_branch, lhs_pos_branch); } static batch_type ssub(const batch_type& lhs, const batch_type& rhs) { return sadd(lhs, sub(_mm_setzero_si128(), rhs)); } static batch_type mul(const batch_type& lhs, const batch_type& rhs) { XSIMD_MACRO_UNROLL_BINARY(*); } static batch_type div(const batch_type& lhs, const batch_type& rhs) { #if defined(XSIMD_FAST_INTEGER_DIVISION) __m128d dlhs = _mm_setr_pd(static_cast(lhs[0]), static_cast(lhs[1])); __m128d drhs = _mm_setr_pd(static_cast(rhs[0]), static_cast(rhs[1])); __m128i tmp = _mm_cvttpd_epi32(_mm_div_pd(dlhs, drhs)); using batch_int = batch; return _mm_unpacklo_epi32(tmp, batch_int(tmp) < batch_int(int64_t(0))); #else XSIMD_MACRO_UNROLL_BINARY(/); #endif } static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION return _mm_cmpeq_epi64(lhs, rhs); #else return sse_detail::cmpeq_epi64_sse2(lhs, rhs); #endif } static batch_type min(const batch_type& lhs, const batch_type& rhs) { return select(lhs < rhs, lhs, rhs); } static batch_type max(const batch_type& lhs, const batch_type& rhs) { return select(lhs > rhs, lhs, rhs); } static value_type hadd(const batch_type& rhs) { __m128i tmp1 = _mm_shuffle_epi32(rhs, 0x0E); __m128i tmp2 = _mm_add_epi64(rhs, tmp1); #if defined(__x86_64__) return _mm_cvtsi128_si64(tmp2); #else union { int64_t i; __m128i m; } u; _mm_storel_epi64(&u.m, tmp2); return u.i; #endif } static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION return _mm_blendv_epi8(b, a, cond); #else return _mm_or_si128(_mm_and_si128(cond, a), _mm_andnot_si128(cond, b)); #endif } static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs) { return _mm_unpacklo_epi64(lhs, rhs); } static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs) { return _mm_unpackhi_epi64(lhs, rhs); } static batch_type extract_pair(const batch_type& v_lhs, const batch_type& v_rhs, const int num) { #if defined(XSIMD_AVX512VL_AVAILABLE) const batch_type lhs = v_rhs; const batch_type rhs = v_lhs; const int n = num; switch(n) { case 0: return rhs; XSIMD_REPEAT_4(_mm_alignr_epi64); default: break; } return batch_type(T(0)); #else #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSSE3_VERSION const batch_type lhs = v_rhs; const batch_type rhs = v_lhs; const int n = 8 * num; switch(n) { case 0: return rhs; XSIMD_REPEAT_16_v2(_mm_alignr_epi8); default: break; } return batch_type(T(0)); #else batch_type b_concatenate; const int n = num; for (int i = 0 ; i < (2 - n); ++i) { b_concatenate[i] = v_lhs[i + n]; if(i < n) { b_concatenate[2 - 1 - i] = v_rhs[n - 1 - i]; } } return b_concatenate; #endif #endif } }; template <> struct batch_kernel : sse_int64_batch_kernel { static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_2_VERSION return _mm_cmpgt_epi64(rhs, lhs); #else __m128i tmp1 = _mm_sub_epi64(lhs, rhs); __m128i tmp2 = _mm_xor_si128(lhs, rhs); __m128i tmp3 = _mm_andnot_si128(rhs, lhs); __m128i tmp4 = _mm_andnot_si128(tmp2, tmp1); __m128i tmp5 = _mm_or_si128(tmp3, tmp4); __m128i tmp6 = _mm_srai_epi32(tmp5, 31); return _mm_shuffle_epi32(tmp6, 0xF5); #endif } static batch_type abs(const 
batch_type& rhs) { #if defined(XSIMD_AVX512VL_AVAILABLE) return _mm_abs_epi64(rhs); #else #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_2_VERSION __m128i sign = _mm_cmpgt_epi64(_mm_setzero_si128(), rhs); #else __m128i signh = _mm_srai_epi32(rhs, 31); __m128i sign = _mm_shuffle_epi32(signh, 0xF5); #endif __m128i inv = _mm_xor_si128(rhs, sign); return _mm_sub_epi64(inv, sign); #endif } }; template <> struct batch_kernel : sse_int64_batch_kernel { static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { auto xlhs = _mm_xor_si128(lhs, _mm_set1_epi64x(std::numeric_limits::lowest())); auto xrhs = _mm_xor_si128(rhs, _mm_set1_epi64x(std::numeric_limits::lowest())); #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_2_VERSION return _mm_cmpgt_epi64(xrhs, xlhs); #else __m128i tmp1 = _mm_sub_epi64(xlhs, xrhs); __m128i tmp2 = _mm_xor_si128(xlhs, xrhs); __m128i tmp3 = _mm_andnot_si128(xrhs, xlhs); __m128i tmp4 = _mm_andnot_si128(tmp2, tmp1); __m128i tmp5 = _mm_or_si128(tmp3, tmp4); __m128i tmp6 = _mm_srai_epi32(tmp5, 31); return _mm_shuffle_epi32(tmp6, 0xF5); #endif } static batch_type abs(const batch_type& rhs) { return rhs; } static batch_type sadd(const batch_type& lhs, const batch_type& rhs) { const auto diffmax = batch_type(std::numeric_limits::max()) - lhs; const auto mindiff = min(diffmax, rhs); return lhs + mindiff; } static batch_type ssub(const batch_type& lhs, const batch_type& rhs) { const auto diff = min(lhs, rhs); return lhs - diff; } }; } inline batch operator<<(const batch& lhs, int32_t rhs) { return _mm_slli_epi64(lhs, rhs); } inline batch operator>>(const batch& lhs, int32_t rhs) { #if defined(XSIMD_AVX512VL_AVAILABLE) return _mm_srai_epi64(lhs, rhs); #else return sse_detail::shift_impl([](int64_t alhs, int32_t s) { return alhs >> s; }, lhs, rhs); #endif } inline batch operator<<(const batch& lhs, const batch& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm_sllv_epi64(lhs, rhs); #else return sse_detail::shift_impl([](int64_t alhs, int64_t s) { return alhs << s; }, lhs, rhs); #endif } inline batch operator>>(const batch& lhs, const batch& rhs) { #if defined(XSIMD_AVX512VL_AVAILABLE) return _mm_srav_epi64(lhs, rhs); #else return sse_detail::shift_impl([](int64_t alhs, int64_t s) { return alhs >> s; }, lhs, rhs); #endif } inline batch operator<<(const batch& lhs, int32_t rhs) { return _mm_slli_epi64(lhs, rhs); } inline batch operator>>(const batch& lhs, int32_t rhs) { return _mm_srli_epi64(lhs, rhs); } inline batch operator<<(const batch& lhs, const batch& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm_sllv_epi64(lhs, rhs); #else return sse_detail::shift_impl([](uint64_t alhs, int64_t s) { return alhs << s; }, lhs, rhs); #endif } inline batch operator>>(const batch& lhs, const batch& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX2_VERSION return _mm_srlv_epi64(lhs, rhs); #else return sse_detail::shift_impl([](uint64_t alhs, int64_t s) { return alhs >> s; }, lhs, rhs); #endif } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_sse_int8.hpp000066400000000000000000000317361410101234500222350ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_SSE_INT8_HPP #define XSIMD_SSE_INT8_HPP #include #include #include "xsimd_base.hpp" #include "xsimd_sse_int_base.hpp" namespace xsimd { /************************** * batch_bool * **************************/ template <> struct simd_batch_traits> { using value_type = int8_t; static constexpr std::size_t size = 16; using batch_type = batch; static constexpr std::size_t align = 16; }; template <> struct simd_batch_traits> { using value_type = uint8_t; static constexpr std::size_t size = 16; using batch_type = batch; static constexpr std::size_t align = 16; }; template <> class batch_bool : public sse_int_batch_bool { public: using sse_int_batch_bool::sse_int_batch_bool; }; template <> class batch_bool : public sse_int_batch_bool { public: using sse_int_batch_bool::sse_int_batch_bool; }; namespace detail { template <> struct batch_bool_kernel : public sse_int_batch_bool_kernel { }; template <> struct batch_bool_kernel : public sse_int_batch_bool_kernel { }; } /********************* * batch * *********************/ template <> struct simd_batch_traits> { using value_type = int8_t; static constexpr std::size_t size = 16; using batch_bool_type = batch_bool; static constexpr std::size_t align = 16; using storage_type = __m128i; }; template <> struct simd_batch_traits> { using value_type = uint8_t; static constexpr std::size_t size = 16; using batch_bool_type = batch_bool; static constexpr std::size_t align = 16; using storage_type = __m128i; }; template <> class batch : public sse_int_batch { public: using base_class = sse_int_batch; using base_class::base_class; using base_class::load_aligned; using base_class::load_unaligned; using base_class::store_aligned; using base_class::store_unaligned; batch() = default; explicit batch(const char* src) : batch(reinterpret_cast(src)) { } batch(const char* src, aligned_mode) : batch(reinterpret_cast(src), aligned_mode{}) { } batch(const char* src, unaligned_mode) : batch(reinterpret_cast(src), unaligned_mode{}) { } XSIMD_DECLARE_LOAD_STORE_INT8(int8_t, 16) XSIMD_DECLARE_LOAD_STORE_LONG(int8_t, 16) }; template <> class batch : public sse_int_batch { public: using base_class = sse_int_batch; using base_class::base_class; using base_class::load_aligned; using base_class::load_unaligned; using base_class::store_aligned; using base_class::store_unaligned; XSIMD_DECLARE_LOAD_STORE_INT8(uint8_t, 16) XSIMD_DECLARE_LOAD_STORE_LONG(uint8_t, 16) }; batch operator<<(const batch& lhs, int32_t rhs); batch operator>>(const batch& lhs, int32_t rhs); batch operator<<(const batch& lhs, const batch& rhs); batch operator>>(const batch& lhs, const batch& rhs); batch operator<<(const batch& lhs, int32_t rhs); batch operator>>(const batch& lhs, int32_t rhs); batch operator<<(const batch& lhs, const batch& rhs); batch operator>>(const batch& lhs, const batch& rhs); /************************************ * batch implementation * ************************************/ namespace detail { template struct sse_int8_batch_kernel : sse_int_kernel_base> { using batch_type = batch; using value_type = T; using batch_bool_type = batch_bool; static constexpr bool is_signed = std::is_signed::value; static batch_type add(const batch_type& lhs, const batch_type& rhs) { return _mm_add_epi8(lhs, rhs); } static batch_type sub(const batch_type& lhs, const batch_type& rhs) { return _mm_sub_epi8(lhs, rhs); } static batch_type sadd(const batch_type& lhs, const batch_type& rhs) { return _mm_adds_epi8(lhs, rhs); } 
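            // Saturating arithmetic: _mm_adds_epi8 / _mm_subs_epi8 clamp the result to the
            // int8_t range instead of wrapping on overflow.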
static batch_type ssub(const batch_type& lhs, const batch_type& rhs) { return _mm_subs_epi8(lhs,rhs); } static batch_type mul(const batch_type& lhs, const batch_type& rhs) { return sse_int8_batch_kernel::bitwise_or( sse_int8_batch_kernel::bitwise_and(_mm_mullo_epi16(lhs, rhs), _mm_srli_epi16(_mm_set1_epi32(-1), 8)), _mm_slli_epi16(_mm_mullo_epi16(_mm_srli_si128(lhs, 1), _mm_srli_si128(rhs, 1)), 8) ); } static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs) { return _mm_cmpeq_epi8(lhs, rhs); } static value_type hadd(const batch_type& rhs) { // TODO implement with hadd_epi16 alignas(16) T tmp[16]; rhs.store_aligned(tmp); T res = 0; for (int i = 0; i < 16; ++i) { res += tmp[i]; } return res; } static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION return _mm_blendv_epi8(b, a, cond); #else return _mm_or_si128(_mm_and_si128(cond, a), _mm_andnot_si128(cond, b)); #endif } static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs) { return _mm_unpacklo_epi8(lhs, rhs); } static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs) { return _mm_unpackhi_epi8(lhs, rhs); } static batch_type extract_pair(const batch_type& v_lhs, const batch_type& v_rhs, const int n) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSSE3_VERSION const batch_type lhs = v_rhs; const batch_type rhs = v_lhs; switch(n) { case 0: return rhs; XSIMD_REPEAT_16_v2(_mm_alignr_epi8); default: break; } return batch_type(T(0)); #else batch_type b_concatenate; for (int i = 0 ; i < (16 - n); ++i) { b_concatenate[i] = v_lhs[i + n]; if(i < n) { b_concatenate[16 - 1 - i] = v_rhs[n - 1 - i]; } } return b_concatenate; #endif } }; template <> struct batch_kernel : sse_int8_batch_kernel { static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { return _mm_cmplt_epi8(lhs, rhs); } static batch_type min(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION return _mm_min_epi8(lhs, rhs); #else __m128i greater = _mm_cmpgt_epi8(lhs, rhs); return select(greater, rhs, lhs); #endif } static batch_type max(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION return _mm_max_epi8(lhs, rhs); #else __m128i greater = _mm_cmpgt_epi8(lhs, rhs); return select(greater, lhs, rhs); #endif } static batch_type abs(const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSSE3_VERSION return _mm_abs_epi8(rhs); #else __m128i neg = _mm_sub_epi8(_mm_setzero_si128(), rhs); return _mm_min_epu8(rhs, neg); #endif } }; template <> struct batch_kernel : public sse_int8_batch_kernel { static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs) { return _mm_cmplt_epi8(_mm_xor_si128(lhs, _mm_set1_epi8(std::numeric_limits::min())), _mm_xor_si128(rhs, _mm_set1_epi8(std::numeric_limits::min()))); } static batch_type min(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION return _mm_min_epu8(lhs, rhs); #else return select(lhs < rhs, lhs, rhs); #endif } static batch_type max(const batch_type& lhs, const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION return _mm_max_epu8(lhs, rhs); #else return select(lhs < rhs, rhs, lhs); #endif } static batch_type abs(const batch_type& rhs) { return rhs; } static batch_type sadd(const batch_type& lhs, const batch_type& rhs) { return _mm_adds_epu8(lhs, rhs); } static batch_type ssub(const batch_type& lhs, const batch_type& rhs) { return 
_mm_subs_epu8(lhs,rhs); } }; } XSIMD_DEFINE_LOAD_STORE_INT8(int8_t, 16, 16) XSIMD_DEFINE_LOAD_STORE_LONG(int8_t, 16, 16) inline batch operator<<(const batch& lhs, int32_t rhs) { return _mm_and_si128(_mm_set1_epi8(0xFF << rhs), _mm_slli_epi32(lhs, rhs)); } inline batch operator>>(const batch& lhs, int32_t rhs) { __m128i sign_mask = _mm_set1_epi16((0xFF00 >> rhs) & 0x00FF); __m128i cmp_is_negative = _mm_cmpgt_epi8(_mm_setzero_si128(), lhs); __m128i res = _mm_srai_epi16(lhs, rhs); return _mm_or_si128(_mm_and_si128(sign_mask, cmp_is_negative), _mm_andnot_si128(sign_mask, res)); } inline batch operator<<(const batch& lhs, const batch& rhs) { return sse_detail::shift_impl([](int8_t alhs, int8_t s) { return alhs << s; }, lhs, rhs); } inline batch operator>>(const batch& lhs, const batch& rhs) { return sse_detail::shift_impl([](int8_t alhs, int8_t s) { return alhs >> s; }, lhs, rhs); } XSIMD_DEFINE_LOAD_STORE_INT8(uint8_t, 16, 16) XSIMD_DEFINE_LOAD_STORE_LONG(uint8_t, 16, 16) inline batch operator<<(const batch& lhs, int32_t rhs) { return _mm_and_si128(_mm_set1_epi8(0xFF << rhs), _mm_slli_epi32(lhs, rhs)); } inline batch operator>>(const batch& lhs, int32_t rhs) { return _mm_and_si128(_mm_set1_epi8(0xFF >> rhs), _mm_srli_epi32(lhs, rhs)); } inline batch operator<<(const batch& lhs, const batch& rhs) { return sse_detail::shift_impl([](uint8_t alhs, int8_t s) { return alhs << s; }, lhs, rhs); } inline batch operator>>(const batch& lhs, const batch& rhs) { return sse_detail::shift_impl([](uint8_t alhs, int8_t s) { return alhs >> s; }, lhs, rhs); } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_sse_int_base.hpp000066400000000000000000000417471410101234500231420ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_SSE_INT_BASE_HPP #define XSIMD_SSE_INT_BASE_HPP #include #include "xsimd_base.hpp" namespace xsimd { /******************** * batch_bool * ********************/ template class sse_int_batch_bool : public simd_batch_bool> { public: sse_int_batch_bool(); explicit sse_int_batch_bool(bool b); template > sse_int_batch_bool(Args... args); sse_int_batch_bool(const __m128i& rhs); sse_int_batch_bool& operator=(const __m128i& rhs); operator __m128i() const; bool_proxy operator[](std::size_t index); bool operator[](std::size_t index) const; __m128i get_value() const; private: template batch_bool& load_values(Args... args); union { __m128i m_value; T m_array[N]; }; friend class simd_batch_bool>; }; /*********************** * sse_int_batch * ***********************/ template class sse_int_batch : public simd_batch> { public: using base_type = simd_batch>; using batch_bool_type = typename base_type::batch_bool_type; sse_int_batch(); explicit sse_int_batch(T i); template > constexpr sse_int_batch(Args... 
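// [Illustrative sketch, not part of xsimd] The operator<< / operator>> overloads
// above shift whole 32-bit lanes and then mask each byte, because SSE2 has no
// 8-bit shift. The helper names below are hypothetical; the test assumes a shift
// count in [0, 7] and checks the unsigned (logical) case against scalar shifts.
#include <emmintrin.h>
#include <cstdint>
#include <cstdio>

inline __m128i sll_epi8_sketch(__m128i v, int n)
{
    // Bits dragged in from the byte below land in the low n bits: mask them off.
    return _mm_and_si128(_mm_set1_epi8((char)((0xFF << n) & 0xFF)), _mm_slli_epi32(v, n));
}

inline __m128i srl_epi8_sketch(__m128i v, int n)
{
    // Bits dragged in from the byte above land in the high n bits: mask them off.
    return _mm_and_si128(_mm_set1_epi8((char)(0xFF >> n)), _mm_srli_epi32(v, n));
}

int main()
{
    alignas(16) uint8_t in[16], lo[16], hi[16];
    for (int i = 0; i < 16; ++i) in[i] = uint8_t(17 * i + 3);
    __m128i v = _mm_load_si128((const __m128i*)in);
    _mm_store_si128((__m128i*)lo, sll_epi8_sketch(v, 3));
    _mm_store_si128((__m128i*)hi, srl_epi8_sketch(v, 3));
    for (int i = 0; i < 16; ++i)
        if (lo[i] != uint8_t(in[i] << 3) || hi[i] != uint8_t(in[i] >> 3))
        { std::printf("mismatch at %d\n", i); return 1; }
    std::printf("8-bit shift emulation OK\n");
    return 0;
}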
args); explicit sse_int_batch(const T* src); sse_int_batch(const T* src, aligned_mode); sse_int_batch(const T* src, unaligned_mode); sse_int_batch(const __m128i& rhs); sse_int_batch(const batch_bool_type& rhs); batch& operator=(const __m128i& rhs); batch& operator=(const batch_bool_type& rhs); operator __m128i() const; batch& load_aligned(const T* src); batch& load_unaligned(const T* src); batch& load_aligned(const flipped_sign_type_t* src); batch& load_unaligned(const flipped_sign_type_t* src); void store_aligned(T* dst) const; void store_unaligned(T* dst) const; void store_aligned(flipped_sign_type_t* src) const; void store_unaligned(flipped_sign_type_t* src) const; using base_type::load_aligned; using base_type::load_unaligned; using base_type::store_aligned; using base_type::store_unaligned; }; /******************** * helper functions * ********************/ namespace sse_detail { template inline __m128i int_init(std::integral_constant, Args... args) { return _mm_setr_epi8(args...); } template inline __m128i int_init(std::integral_constant, Args... args) { return _mm_setr_epi16(args...); } template inline __m128i int_init(std::integral_constant, Args... args) { return _mm_setr_epi32(args...); } template inline __m128i int_init(std::integral_constant, I0 i0, I1 i1) { return _mm_set_epi64x(i1, i0); } template inline __m128i int_set(std::integral_constant, T v) { return _mm_set1_epi8(v); } template inline __m128i int_set(std::integral_constant, T v) { return _mm_set1_epi16(v); } template inline __m128i int_set(std::integral_constant, T v) { return _mm_set1_epi32(v); } template inline __m128i int_set(std::integral_constant, T v) { return _mm_set1_epi64x(v); } inline __m128i cmpeq_epi64_sse2(__m128i lhs, __m128i rhs) { __m128i tmp1 = _mm_cmpeq_epi32(lhs, rhs); __m128i tmp2 = _mm_shuffle_epi32(tmp1, 0xB1); __m128i tmp3 = _mm_and_si128(tmp1, tmp2); __m128i tmp4 = _mm_srai_epi32(tmp3, 31); return _mm_shuffle_epi32(tmp4, 0xF5); } } /*********************************** * batch_bool implementation * ***********************************/ template inline sse_int_batch_bool::sse_int_batch_bool() { } template inline sse_int_batch_bool::sse_int_batch_bool(bool b) : m_value(_mm_set1_epi32(-(int)b)) { } template template inline sse_int_batch_bool::sse_int_batch_bool(Args... args) : m_value(sse_detail::int_init(std::integral_constant{}, static_cast(args ? typename std::make_signed::type{-1} : 0)...)) { } template inline sse_int_batch_bool::sse_int_batch_bool(const __m128i& rhs) : m_value(rhs) { } template inline sse_int_batch_bool& sse_int_batch_bool::operator=(const __m128i& rhs) { m_value = rhs; return *this; } template inline sse_int_batch_bool::operator __m128i() const { return m_value; } template inline bool_proxy sse_int_batch_bool::operator[](std::size_t index) { return bool_proxy(m_array[index & (N - 1)]); } template inline bool sse_int_batch_bool::operator[](std::size_t index) const { return static_cast(m_array[index & (N - 1)]); } template inline __m128i sse_int_batch_bool::get_value() const { return m_value; } template template inline batch_bool& sse_int_batch_bool::load_values(Args... args) { m_value = sse_detail::int_init(std::integral_constant{}, static_cast(args ? 
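// [Illustrative sketch, not part of xsimd] sse_detail::cmpeq_epi64_sse2 above builds
// a 64-bit equality mask out of 32-bit compares for targets older than SSE4.1
// (which provides _mm_cmpeq_epi64). The same sequence is reproduced here as a
// standalone function with a tiny check; the _sketch suffix marks it as ours.
#include <emmintrin.h>
#include <cstdint>
#include <cstdio>

inline __m128i cmpeq_epi64_sse2_sketch(__m128i lhs, __m128i rhs)
{
    __m128i tmp1 = _mm_cmpeq_epi32(lhs, rhs);     // per-32-bit equality
    __m128i tmp2 = _mm_shuffle_epi32(tmp1, 0xB1); // swap the two halves of each 64-bit lane
    __m128i tmp3 = _mm_and_si128(tmp1, tmp2);     // both halves must match
    __m128i tmp4 = _mm_srai_epi32(tmp3, 31);      // normalize to 0 / -1
    return _mm_shuffle_epi32(tmp4, 0xF5);         // broadcast inside each 64-bit lane
}

int main()
{
    alignas(16) int64_t a[2] = { 42, -7 };
    alignas(16) int64_t b[2] = { 42,  8 };
    alignas(16) int64_t r[2];
    _mm_store_si128((__m128i*)r,
                    cmpeq_epi64_sse2_sketch(_mm_load_si128((const __m128i*)a),
                                            _mm_load_si128((const __m128i*)b)));
    std::printf("lane0 %s, lane1 %s\n",
                r[0] == -1 ? "equal" : "different",   // equal
                r[1] == -1 ? "equal" : "different");  // different
    return 0;
}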
typename std::make_signed::type{-1} : 0)...); return (*this)(); } namespace detail { template struct sse_int_batch_bool_kernel { using batch_type = batch_bool; static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) { return _mm_and_si128(lhs, rhs); } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { return _mm_or_si128(lhs, rhs); } static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { return _mm_xor_si128(lhs, rhs); } static batch_type bitwise_not(const batch_type& rhs) { return _mm_xor_si128(rhs, _mm_set1_epi32(-1)); } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { return _mm_andnot_si128(lhs, rhs); } static batch_type equal(const batch_type& lhs, const batch_type& rhs) { switch(sizeof(T)) { case 1: return _mm_cmpeq_epi8(lhs, rhs); case 2: return _mm_cmpeq_epi16(lhs, rhs); case 4: return _mm_cmpeq_epi32(lhs, rhs); case 8: { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION return _mm_cmpeq_epi64(lhs, rhs); #else return sse_detail::cmpeq_epi64_sse2(lhs, rhs); #endif } } } static batch_type not_equal(const batch_type& lhs, const batch_type& rhs) { return ~(lhs == rhs); } static bool all(const batch_type& rhs) { return _mm_movemask_epi8(rhs) == 0xFFFF; } static bool any(const batch_type& rhs) { #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION return !_mm_testz_si128(rhs, rhs); #else return _mm_movemask_epi8(rhs) != 0; #endif } }; } /************************************** * sse_int_batch implementation * **************************************/ template inline sse_int_batch::sse_int_batch() { } template inline sse_int_batch::sse_int_batch(T i) : base_type(sse_detail::int_set(std::integral_constant{}, i)) { } template template constexpr inline sse_int_batch::sse_int_batch(Args... 
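// [Illustrative sketch, not part of xsimd] all() and any() in the bool kernel above
// reduce a lane mask to a single bool: _mm_movemask_epi8 gathers the sign bit of all
// 16 bytes, so a fully-set mask yields 0xFFFF and an empty one yields 0 (SSE4.1 can
// use _mm_testz_si128 for any() instead). The helper names below are hypothetical.
#include <emmintrin.h>
#include <cstdio>

inline bool all_sketch(__m128i mask) { return _mm_movemask_epi8(mask) == 0xFFFF; }
inline bool any_sketch(__m128i mask) { return _mm_movemask_epi8(mask) != 0; }

int main()
{
    __m128i a = _mm_set1_epi32(1);
    __m128i all_eq  = _mm_cmpeq_epi32(a, _mm_set1_epi32(1));          // every lane true
    __m128i some_eq = _mm_cmpeq_epi32(a, _mm_setr_epi32(1, 2, 1, 1)); // one lane false
    std::printf("all_eq:  all=%d any=%d\n", all_sketch(all_eq), any_sketch(all_eq));   // 1 1
    std::printf("some_eq: all=%d any=%d\n", all_sketch(some_eq), any_sketch(some_eq)); // 0 1
    return 0;
}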
args) : base_type(sse_detail::int_init(std::integral_constant{}, args...)) { } template inline sse_int_batch::sse_int_batch(const T* src) : base_type(_mm_loadu_si128((__m128i const*)src)) { } template inline sse_int_batch::sse_int_batch(const T* src, aligned_mode) : base_type(_mm_load_si128((__m128i const*)src)) { } template inline sse_int_batch::sse_int_batch(const T* src, unaligned_mode) : base_type(_mm_loadu_si128((__m128i const*)src)) { } template inline sse_int_batch::sse_int_batch(const __m128i& rhs) : base_type(rhs) { } template inline sse_int_batch::sse_int_batch(const batch_bool_type& rhs) : base_type(_mm_and_si128(rhs, batch(1))) { } template inline batch& sse_int_batch::operator=(const __m128i& rhs) { this->m_value = rhs; return (*this)(); } template inline batch& sse_int_batch::operator=(const batch_bool_type& rhs) { this->m_value = _mm_and_si128(rhs, batch(1)); return (*this)(); } template inline sse_int_batch::operator __m128i() const { return this->m_value; } template inline batch& sse_int_batch::load_aligned(const T* src) { this->m_value = _mm_load_si128((__m128i const*)src); return (*this)(); } template inline batch& sse_int_batch::load_unaligned(const T* src) { this->m_value = _mm_loadu_si128((__m128i const*)src); return (*this)(); } template inline batch& sse_int_batch::load_aligned(const flipped_sign_type_t* src) { this->m_value = _mm_load_si128((__m128i const*)src); return (*this)(); } template inline batch& sse_int_batch::load_unaligned(const flipped_sign_type_t* src) { this->m_value = _mm_loadu_si128((__m128i const*)src); return (*this)(); } template inline void sse_int_batch::store_aligned(T* dst) const { _mm_store_si128((__m128i*)dst, this->m_value); } template inline void sse_int_batch::store_unaligned(T* dst) const { _mm_storeu_si128((__m128i*)dst, this->m_value); } template inline void sse_int_batch::store_aligned(flipped_sign_type_t* dst) const { _mm_store_si128((__m128i*)dst, this->m_value); } template inline void sse_int_batch::store_unaligned(flipped_sign_type_t* dst) const { _mm_storeu_si128((__m128i*)dst, this->m_value); } namespace detail { template struct sse_int_kernel_base { using batch_type = B; using batch_bool_type = typename simd_batch_traits::batch_bool_type; static constexpr std::size_t size = simd_batch_traits::size; static constexpr std::size_t align = simd_batch_traits::align; static batch_type neg(const batch_type& rhs) { return batch_type(_mm_setzero_si128()) - rhs; } static batch_type div(const batch_type& lhs, const batch_type& rhs) { // TODO implement divison as floating point! 
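// [Illustrative sketch, not part of xsimd] The load_aligned/load_unaligned pairs above
// only differ in the intrinsic they forward to: _mm_load_si128 requires a 16-byte
// aligned address, _mm_loadu_si128 accepts any address. A minimal standalone check:
#include <emmintrin.h>
#include <cstdint>
#include <cstdio>

int main()
{
    alignas(16) int32_t buf[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
    __m128i aligned   = _mm_load_si128((const __m128i*)buf);        // buf is 16-byte aligned
    __m128i unaligned = _mm_loadu_si128((const __m128i*)(buf + 1)); // buf + 1 is not
    alignas(16) int32_t out[4];
    _mm_store_si128((__m128i*)out, _mm_add_epi32(aligned, unaligned));
    std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); // 1 3 5 7
    return 0;
}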
XSIMD_MACRO_UNROLL_BINARY(/); } static batch_type mod(const batch_type& lhs, const batch_type& rhs) { XSIMD_MACRO_UNROLL_BINARY(%); } static batch_bool_type neq(const batch_type& lhs, const batch_type& rhs) { return ~(lhs == rhs); } static batch_bool_type lte(const batch_type& lhs, const batch_type& rhs) { return ~(rhs < lhs); } static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs) { return _mm_and_si128(lhs, rhs); } static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs) { return _mm_or_si128(lhs, rhs); } static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs) { return _mm_xor_si128(lhs, rhs); } static batch_type bitwise_not(const batch_type& rhs) { return _mm_xor_si128(rhs, _mm_set1_epi8(-1)); } static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs) { return _mm_andnot_si128(lhs, rhs); } static batch_type fmin(const batch_type& lhs, const batch_type& rhs) { return min(lhs, rhs); } static batch_type fmax(const batch_type& lhs, const batch_type& rhs) { return max(lhs, rhs); } static batch_type fabs(const batch_type& rhs) { return abs(rhs); } static batch_type fma(const batch_type& x, const batch_type& y, const batch_type& z) { return x * y + z; } static batch_type fms(const batch_type& x, const batch_type& y, const batch_type& z) { return x * y - z; } static batch_type fnma(const batch_type& x, const batch_type& y, const batch_type& z) { return -x * y + z; } static batch_type fnms(const batch_type& x, const batch_type& y, const batch_type& z) { return -x * y - z; } }; } namespace sse_detail { template inline batch shift_impl(F&& f, const batch& lhs, int32_t rhs) { alignas(16) T tmp_lhs[N], tmp_res[N]; lhs.store_aligned(&tmp_lhs[0]); unroller([&](std::size_t i) { tmp_res[i] = f(tmp_lhs[i], rhs); }); return batch(tmp_res, aligned_mode()); } template inline batch shift_impl(F&& f, const batch& lhs, const batch& rhs) { alignas(16) T tmp_lhs[N], tmp_res[N]; alignas(16) S tmp_rhs[N]; lhs.store_aligned(&tmp_lhs[0]); rhs.store_aligned(&tmp_rhs[0]); unroller([&](std::size_t i) { tmp_res[i] = f(tmp_lhs[i], tmp_rhs[i]); }); return batch(tmp_res, aligned_mode()); } } } #endif xsimd-7.6.0/include/xsimd/types/xsimd_traits.hpp000066400000000000000000000336161410101234500220060ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
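// [Illustrative sketch, not part of xsimd] div() above falls back to scalar code via
// XSIMD_MACRO_UNROLL_BINARY, and sse_detail::shift_impl uses the same pattern: store
// both operands to aligned temporaries, apply the scalar operation element by element,
// and reload. Here is that pattern written out by hand for 32-bit integer division
// (SSE2 has no integer divide instruction); the function name is ours.
#include <emmintrin.h>
#include <cstdint>
#include <cstdio>

inline __m128i div_epi32_fallback_sketch(__m128i lhs, __m128i rhs)
{
    alignas(16) int32_t tl[4], tr[4], tres[4];
    _mm_store_si128((__m128i*)tl, lhs);
    _mm_store_si128((__m128i*)tr, rhs);
    for (int i = 0; i < 4; ++i)
        tres[i] = tl[i] / tr[i];
    return _mm_load_si128((const __m128i*)tres);
}

int main()
{
    alignas(16) int32_t out[4];
    _mm_store_si128((__m128i*)out,
                    div_epi32_fallback_sketch(_mm_setr_epi32(10, 21, -9, 100),
                                              _mm_setr_epi32( 2,  7,  3, -25)));
    std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); // 5 3 -3 -4
    return 0;
}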
* ****************************************************************************/ #ifndef XSIMD_TRAITS_HPP #define XSIMD_TRAITS_HPP #include #include "../types/xsimd_base.hpp" #include "xsimd_types_include.hpp" #undef XSIMD_BATCH_INT_SIZE #undef XSIMD_BATCH_FLOAT_SIZE #undef XSIMD_BATCH_DOUBLE_SIZE #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX512_VERSION #define XSIMD_BATCH_INT8_SIZE 64 #define XSIMD_BATCH_INT16_SIZE 32 #define XSIMD_BATCH_INT32_SIZE 16 #define XSIMD_BATCH_INT64_SIZE 8 #define XSIMD_BATCH_FLOAT_SIZE 16 #define XSIMD_BATCH_DOUBLE_SIZE 8 #elif XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION #define XSIMD_BATCH_INT8_SIZE 32 #define XSIMD_BATCH_INT16_SIZE 16 #define XSIMD_BATCH_INT32_SIZE 8 #define XSIMD_BATCH_INT64_SIZE 4 #define XSIMD_BATCH_FLOAT_SIZE 8 #define XSIMD_BATCH_DOUBLE_SIZE 4 #elif XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION #define XSIMD_BATCH_INT8_SIZE 16 #define XSIMD_BATCH_INT16_SIZE 8 #define XSIMD_BATCH_INT32_SIZE 4 #define XSIMD_BATCH_INT64_SIZE 2 #define XSIMD_BATCH_FLOAT_SIZE 4 #define XSIMD_BATCH_DOUBLE_SIZE 2 #elif XSIMD_ARM_INSTR_SET >= XSIMD_ARM7_NEON_VERSION #define XSIMD_BATCH_INT8_SIZE 16 #define XSIMD_BATCH_INT16_SIZE 8 #define XSIMD_BATCH_INT32_SIZE 4 #define XSIMD_BATCH_INT64_SIZE 2 #define XSIMD_BATCH_FLOAT_SIZE 4 #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION #define XSIMD_BATCH_DOUBLE_SIZE 2 #elif defined(XSIMD_ENABLE_FALLBACK) #define XSIMD_BATCH_DOUBLE_SIZE 1 #endif #endif namespace xsimd { template struct simd_traits { using type = T; using bool_type = bool; static constexpr size_t size = 1; }; template struct revert_simd_traits { using type = T; static constexpr size_t size = simd_traits::size; }; template using simd_type = typename simd_traits::type; template using simd_bool_type = typename simd_traits::bool_type; template using revert_simd_type = typename revert_simd_traits::type; #ifdef XSIMD_BATCH_FLOAT_SIZE template <> struct simd_traits { using type = batch; using bool_type = simd_batch_traits::batch_bool_type; static constexpr size_t size = type::size; }; template <> struct revert_simd_traits> { using type = int8_t; static constexpr size_t size = simd_traits::size; }; template <> struct simd_traits { using type = batch; using bool_type = simd_batch_traits::batch_bool_type; static constexpr size_t size = type::size; }; template <> struct revert_simd_traits> { using type = uint8_t; static constexpr size_t size = simd_traits::size; }; template <> struct simd_traits { using type = batch; using bool_type = simd_batch_traits::batch_bool_type; static constexpr size_t size = type::size; }; template <> struct revert_simd_traits> { using type = int16_t; static constexpr size_t size = simd_traits::size; }; template <> struct simd_traits { using type = batch; using bool_type = simd_batch_traits::batch_bool_type; static constexpr size_t size = type::size; }; template <> struct revert_simd_traits> { using type = uint16_t; static constexpr size_t size = simd_traits::size; }; template <> struct simd_traits : std::conditional::value, simd_traits, simd_traits>::type { }; template <> struct simd_traits { using type = batch; using bool_type = simd_batch_traits::batch_bool_type; static constexpr size_t size = type::size; }; template <> struct revert_simd_traits> { using type = int32_t; static constexpr size_t size = simd_traits::size; }; template <> struct simd_traits { using type = batch; using bool_type = simd_batch_traits::batch_bool_type; static constexpr size_t size = type::size; }; template <> struct revert_simd_traits> { using type = uint32_t; 
static constexpr size_t size = simd_traits::size; }; // On some architectures long is a different type from int32_t or int64_t #ifdef XSIMD_32_BIT_ABI template <> struct simd_traits : simd_traits { }; template <> struct revert_simd_traits> { using type = long; static constexpr size_t size = simd_traits::size; }; template <> struct simd_traits : simd_traits { }; template <> struct revert_simd_traits> { using type = unsigned long; static constexpr size_t size = simd_traits::size; }; #endif // XSIMD_32_BIT_ABI template <> struct simd_traits { using type = batch; using bool_type = simd_batch_traits::batch_bool_type; static constexpr size_t size = type::size; }; template <> struct revert_simd_traits> { using type = int64_t; static constexpr size_t size = simd_traits::size; }; template <> struct simd_traits { using type = batch; using bool_type = simd_batch_traits::batch_bool_type; static constexpr size_t size = type::size; }; template <> struct revert_simd_traits> { using type = uint64_t; static constexpr size_t size = simd_traits::size; }; template <> struct simd_traits { using type = batch; using bool_type = simd_batch_traits::batch_bool_type; static constexpr size_t size = type::size; }; template <> struct revert_simd_traits> { using type = float; static constexpr size_t size = simd_traits::size; }; template <> struct simd_traits> { using type = batch, XSIMD_BATCH_FLOAT_SIZE>; using bool_type = simd_batch_traits::batch_bool_type; static constexpr size_t size = type::size; }; template <> struct revert_simd_traits, XSIMD_BATCH_FLOAT_SIZE>> { using type = std::complex; static constexpr size_t size = simd_traits::size; }; #ifdef XSIMD_ENABLE_XTL_COMPLEX template struct simd_traits> { using type = batch, XSIMD_BATCH_FLOAT_SIZE>; using bool_type = typename simd_batch_traits::batch_bool_type; static constexpr size_t size = type::size; }; template struct revert_simd_traits, XSIMD_BATCH_FLOAT_SIZE>> { using type = xtl::xcomplex; static constexpr size_t size = simd_traits::size; }; #endif // XSIMD_ENABLE_XTL_COMPLEX #endif // XSIMD_BATCH_FLOAT_SIZE #ifdef XSIMD_BATCH_DOUBLE_SIZE template <> struct simd_traits { using type = batch; using bool_type = simd_batch_traits::batch_bool_type; static constexpr size_t size = type::size; }; template <> struct revert_simd_traits> { using type = double; static constexpr size_t size = simd_traits::size; }; template <> struct simd_traits> { using type = batch, XSIMD_BATCH_DOUBLE_SIZE>; using bool_type = simd_batch_traits::batch_bool_type; static constexpr size_t size = type::size; }; template <> struct revert_simd_traits, XSIMD_BATCH_DOUBLE_SIZE>> { using type = std::complex; static constexpr size_t size = simd_traits::size; }; #ifdef XSIMD_ENABLE_XTL_COMPLEX template struct simd_traits> { using type = batch, XSIMD_BATCH_DOUBLE_SIZE>; using bool_type = typename simd_batch_traits::batch_bool_type; static constexpr size_t size = type::size; }; template struct revert_simd_traits, XSIMD_BATCH_DOUBLE_SIZE>> { using type = xtl::xcomplex; static constexpr size_t size = simd_traits::size; }; #endif // XSIMD_ENABLE_XTL_COMPLEX #endif // XSIMD_BATCH_DOUBLE_SIZE /******************** * simd_return_type * ********************/ namespace detail { template struct simd_condition { static constexpr bool value = (std::is_same::value && !std::is_same::value) || (std::is_same::value && !std::is_same::value) || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || 
std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || detail::is_complex::value; }; template struct simd_return_type_impl : std::enable_if::value, batch> { }; template struct simd_return_type_impl : std::conditional::value, simd_return_type_impl, simd_return_type_impl>::type { }; template struct simd_return_type_impl : std::enable_if::value, batch_bool> { }; template struct simd_return_type_impl, T2, N> : std::enable_if::value, batch, N>> { }; template struct simd_return_type_impl, std::complex, N> : std::enable_if::value, batch, N>> { }; #ifdef XSIMD_ENABLE_XTL_COMPLEX template struct simd_return_type_impl, T2, N> : std::enable_if::value, batch, N>> { }; template struct simd_return_type_impl, xtl::xcomplex, N> : std::enable_if::value, batch, N>> { }; #endif // XSIMD_ENABLE_XTL_COMPLEX } template ::size> using simd_return_type = typename detail::simd_return_type_impl::type; /************ * is_batch * ************/ template struct is_batch : std::false_type { }; template struct is_batch> : std::true_type { }; /***************** * is_batch_bool * *****************/ template struct is_batch_bool : std::false_type { }; template struct is_batch_bool> : std::true_type { }; /******************** * is_batch_complex * ********************/ template struct is_batch_complex : std::false_type { }; template struct is_batch_complex, N>> : std::true_type { }; #ifdef XSIMD_ENABLE_XTL_COMPLEX template struct is_batch_complex, N>> : std::true_type { }; #endif //XSIMD_ENABLE_XTL_COMPLEX } #endif xsimd-7.6.0/include/xsimd/types/xsimd_types_include.hpp000066400000000000000000000063371410101234500233470ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
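// [Illustrative sketch, not part of xsimd] How the traits defined above are meant to
// be consumed: simd_type<T> maps a scalar type to the widest batch enabled by the
// XSIMD_BATCH_*_SIZE macros, or to T itself when no SIMD instruction set is active.
// This assumes the umbrella header is on the include path; the names used
// (simd_type, simd_traits, is_batch) are the ones declared in this traits file.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include "xsimd/xsimd.hpp"

int main()
{
    using float_batch = xsimd::simd_type<float>;
    static_assert(xsimd::is_batch<float_batch>::value || xsimd::simd_traits<float>::size == 1,
                  "simd_type is either a batch or the scalar fallback");
    std::size_t nf = xsimd::simd_traits<float>::size;
    std::size_t ni = xsimd::simd_traits<int8_t>::size;
    std::printf("float lanes: %zu, int8_t lanes: %zu\n", nf, ni); // e.g. 4 and 16 with SSE2
    return 0;
}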
* ****************************************************************************/ #ifndef XSIMD_TYPES_INCLUDE_HPP #define XSIMD_TYPES_INCLUDE_HPP #include "../config/xsimd_include.hpp" #if defined(XSIMD_ENABLE_FALLBACK) #include "xsimd_fallback.hpp" #endif #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION #include "xsimd_sse_conversion.hpp" #include "xsimd_sse_double.hpp" #include "xsimd_sse_float.hpp" #include "xsimd_sse_int8.hpp" #include "xsimd_sse_int16.hpp" #include "xsimd_sse_int32.hpp" #include "xsimd_sse_int64.hpp" #include "xsimd_sse_complex.hpp" #endif #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION #include "xsimd_avx_conversion.hpp" #include "xsimd_avx_double.hpp" #include "xsimd_avx_float.hpp" #include "xsimd_avx_int8.hpp" #include "xsimd_avx_int16.hpp" #include "xsimd_avx_int32.hpp" #include "xsimd_avx_int64.hpp" #include "xsimd_avx_complex.hpp" #endif #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX512_VERSION #include "xsimd_avx512_conversion.hpp" #include "xsimd_avx512_bool.hpp" #include "xsimd_avx512_double.hpp" #include "xsimd_avx512_float.hpp" #include "xsimd_avx512_int8.hpp" #include "xsimd_avx512_int16.hpp" #include "xsimd_avx512_int32.hpp" #include "xsimd_avx512_int64.hpp" #include "xsimd_avx512_complex.hpp" #endif #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM7_NEON_VERSION #include "xsimd_neon_conversion.hpp" #include "xsimd_neon_bool.hpp" #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION #include "xsimd_neon_double.hpp" #endif #include "xsimd_neon_float.hpp" #include "xsimd_neon_int8.hpp" #include "xsimd_neon_uint8.hpp" #include "xsimd_neon_int16.hpp" #include "xsimd_neon_uint16.hpp" #include "xsimd_neon_int32.hpp" #include "xsimd_neon_uint32.hpp" #include "xsimd_neon_int64.hpp" #include "xsimd_neon_uint64.hpp" #include "xsimd_neon_complex.hpp" #endif #if !defined(XSIMD_INSTR_SET_AVAILABLE) #ifndef XSIMD_SKIP_ON_WERROR #if defined(XSIMD_ENABLE_FALLBACK) #ifdef _MSC_VER #pragma message("Warning: No SIMD instructions set detected, using fallback mode.") #else #warning "No SIMD instructions set detected, using fallback mode." #endif #else #ifdef _MSC_VER #pragma message("Warning: No SIMD instructions set detected, please enable SIMD instructions or activate the fallback mode. (e.g. for x86 -march=native or for ARM -mfpu=neon)") #else #warning "No SIMD instructions set detected, please enable SIMD instructions or activate the fallback mode. (e.g. for x86 -march=native or for ARM -mfpu=neon)" #endif #endif #endif #endif #include "xsimd_utils.hpp" #endif xsimd-7.6.0/include/xsimd/types/xsimd_utils.hpp000066400000000000000000000356441410101234500216430ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_UTILS_HPP #define XSIMD_UTILS_HPP #include #include #include #ifdef XSIMD_ENABLE_XTL_COMPLEX #include "xtl/xcomplex.hpp" #endif /* For Shift instruction: vshlq_n_u8/vshrq_n_u8 (lhs, n), * 'n' must be a constant and is the compile-time literal constant. * * This Macro is to fix compiling issues from llvm(clang): * "argument must be a constant..." * */ #define EXPAND(...) 
__VA_ARGS__ #define CASE_LHS(op, i) \ case i: return op(lhs, i); #define XSIMD_REPEAT_8_0(op, addx) \ CASE_LHS(EXPAND(op), 1 + addx); \ CASE_LHS(EXPAND(op), 2 + addx); \ CASE_LHS(EXPAND(op), 3 + addx); \ CASE_LHS(EXPAND(op), 4 + addx); \ CASE_LHS(EXPAND(op), 5 + addx); \ CASE_LHS(EXPAND(op), 6 + addx); \ CASE_LHS(EXPAND(op), 7 + addx); #define XSIMD_REPEAT_8_N(op, addx) \ CASE_LHS(EXPAND(op), 0 + addx); \ XSIMD_REPEAT_8_0(op, addx); #define XSIMD_REPEAT_8(op) \ XSIMD_REPEAT_8_0(op, 0); #define XSIMD_REPEAT_16_0(op, addx) \ XSIMD_REPEAT_8_0(op, 0 + addx); \ XSIMD_REPEAT_8_N(op, 8 + addx); #define XSIMD_REPEAT_16_N(op, addx) \ XSIMD_REPEAT_8_N(op, 0 + addx); \ XSIMD_REPEAT_8_N(op, 8 + addx); #define XSIMD_REPEAT_16(op) \ XSIMD_REPEAT_16_0(op, 0); #define XSIMD_REPEAT_32_0(op, addx) \ XSIMD_REPEAT_16_0(op, 0 + addx); \ XSIMD_REPEAT_16_N(op, 16 + addx); #define XSIMD_REPEAT_32_N(op, addx) \ XSIMD_REPEAT_16_N(op, 0 + addx); \ XSIMD_REPEAT_16_N(op, 16 + addx); #define XSIMD_REPEAT_32(op) \ XSIMD_REPEAT_32_0(op, 0); #define XSIMD_REPEAT_64(op) \ XSIMD_REPEAT_32_0(op, 0); \ XSIMD_REPEAT_32_N(op, 32); /* The Macro is for vext (lhs, rhs, n) * * _mm_alignr_epi8, _mm_alignr_epi32 ... */ #define CASE_LHS_RHS(op, i) \ case i: return op(lhs, rhs, i); #define XSIMD_REPEAT_2_0(op, addx) \ CASE_LHS_RHS(EXPAND(op), 1 + addx); #define XSIMD_REPEAT_2_N(op, addx) \ CASE_LHS_RHS(EXPAND(op), 0 + addx); \ XSIMD_REPEAT_2_0(op, addx); #define XSIMD_REPEAT_2(op) \ XSIMD_REPEAT_2_0(op, 0); #define XSIMD_REPEAT_4_0(op, addx) \ XSIMD_REPEAT_2_0(op, 0 + addx); \ XSIMD_REPEAT_2_N(op, 2 + addx); #define XSIMD_REPEAT_4_N(op, addx) \ XSIMD_REPEAT_2_N(op, 0 + addx); \ XSIMD_REPEAT_2_N(op, 2 + addx); #define XSIMD_REPEAT_4(op) \ XSIMD_REPEAT_4_0(op, 0); #define XSIMD_REPEAT_8_0_v2(op, addx) \ XSIMD_REPEAT_4_0(op, 0 + addx); \ XSIMD_REPEAT_4_N(op, 4 + addx); #define XSIMD_REPEAT_8_N_v2(op, addx) \ XSIMD_REPEAT_4_N(op, 0 + addx); \ XSIMD_REPEAT_4_N(op, 4 + addx); #define XSIMD_REPEAT_8_v2(op) \ XSIMD_REPEAT_8_0_v2(op, 0); #define XSIMD_REPEAT_16_0_v2(op, addx) \ XSIMD_REPEAT_8_0_v2(op, 0 + addx); \ XSIMD_REPEAT_8_N_v2(op, 8 + addx); #define XSIMD_REPEAT_16_N_v2(op, addx) \ XSIMD_REPEAT_8_N_v2(op, 0 + addx); \ XSIMD_REPEAT_8_N_v2(op, 8 + addx); #define XSIMD_REPEAT_16_v2(op) \ XSIMD_REPEAT_16_0_v2(op, 0); #define XSIMD_REPEAT_32_0_v2(op, addx) \ XSIMD_REPEAT_16_0_v2(op, 0 + addx); \ XSIMD_REPEAT_16_N_v2(op, 16 + addx); #define XSIMD_REPEAT_32_N_v2(op, addx) \ XSIMD_REPEAT_16_N_v2(op, 0 + addx); \ XSIMD_REPEAT_16_N_v2(op, 16 + addx); #define XSIMD_REPEAT_32_v2(op) \ XSIMD_REPEAT_32_0_v2(op, 0); #define XSIMD_REPEAT_64_v2(op) \ XSIMD_REPEAT_32_0_v2(op, 0); \ XSIMD_REPEAT_32_N_v2(op, 32); namespace xsimd { template class batch; template class batch_bool; /************** * as_integer * **************/ template struct as_integer : std::make_signed { }; template <> struct as_integer { using type = int32_t; }; template <> struct as_integer { using type = int64_t; }; template struct as_integer> { using type = batch::type, N>; }; template using as_integer_t = typename as_integer::type; /*********************** * as_unsigned_integer * ***********************/ template struct as_unsigned_integer : std::make_unsigned { }; template <> struct as_unsigned_integer { using type = uint32_t; }; template <> struct as_unsigned_integer { using type = uint64_t; }; template struct as_unsigned_integer> { using type = batch::type, N>; }; template using as_unsigned_integer_t = typename as_unsigned_integer::type; /****************** * flip_sign_type * 
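// [Illustrative sketch, not part of xsimd] What one expansion of the CASE_LHS /
// XSIMD_REPEAT_* machinery above boils down to: intrinsics such as _mm_srli_si128,
// _mm_alignr_epi8 or vshlq_n_u8 only accept a compile-time immediate, so a runtime
// count has to be dispatched through a switch in which every case uses a literal.
// The real macros expand 16/32/64 cases; four are enough to show the shape.
#include <emmintrin.h>

inline __m128i byte_shift_right_sketch(__m128i lhs, int n) // hypothetical helper
{
    switch (n)
    {
        case 0: return lhs;
        case 1: return _mm_srli_si128(lhs, 1);
        case 2: return _mm_srli_si128(lhs, 2);
        case 3: return _mm_srli_si128(lhs, 3);
        default: return _mm_setzero_si128(); // counts >= 4 not handled in this sketch
    }
}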
******************/ namespace detail { template struct flipped_sign_type_impl : std::make_signed { }; template struct flipped_sign_type_impl : std::make_unsigned { }; } template struct flipped_sign_type : detail::flipped_sign_type_impl::value> { }; template using flipped_sign_type_t = typename flipped_sign_type::type; /*********** * as_float * ************/ template struct as_float; template <> struct as_float { using type = float; }; template <> struct as_float { using type = double; }; template struct as_float> { using type = batch::type, N>; }; template using as_float_t = typename as_float::type; /************** * as_logical * **************/ template struct as_logical; template struct as_logical> { using type = batch_bool; }; template using as_logical_t = typename as_logical::type; /******************** * primitive caster * ********************/ namespace detail { template union generic_caster { UI ui; I i; F f; constexpr generic_caster(UI t) : ui(t) {} constexpr generic_caster(I t) : i(t) {} constexpr generic_caster(F t) : f(t) {} }; using caster32_t = generic_caster; using caster64_t = generic_caster; template struct caster; template <> struct caster { using type = caster32_t; }; template <> struct caster { using type = caster64_t; }; template using caster_t = typename caster::type; } /**************************** * to/from_unsigned_integer * ****************************/ namespace detail { template union unsigned_convertor { T data; as_unsigned_integer_t bits; }; template as_unsigned_integer_t to_unsigned_integer(const T& input) { unsigned_convertor convertor; convertor.data = input; return convertor.bits; } template T from_unsigned_integer(const as_unsigned_integer_t& input) { unsigned_convertor convertor; convertor.bits = input; return convertor.data; } } /***************************************** * Backport of index_sequence from c++14 * *****************************************/ // TODO: Remove this once we drop C++11 support namespace detail { template struct identity { using type = T; }; #ifdef __cpp_lib_integer_sequence using std::integer_sequence; using std::index_sequence; using std::make_index_sequence; using std::index_sequence_for; #else template struct integer_sequence { using value_type = T; static constexpr std::size_t size() noexcept { return sizeof...(Is); } }; template using index_sequence = integer_sequence; template struct make_index_sequence_concat; template struct make_index_sequence_concat, index_sequence> : identity> {}; template struct make_index_sequence_impl; template using make_index_sequence = typename make_index_sequence_impl::type; template struct make_index_sequence_impl : make_index_sequence_concat, make_index_sequence> {}; template <> struct make_index_sequence_impl<0> : identity> {}; template <> struct make_index_sequence_impl<1> : identity> {}; template using index_sequence_for = make_index_sequence; #endif } #define XSIMD_MACRO_UNROLL_BINARY(FUNC) \ constexpr std::size_t size = simd_batch_traits::size; \ using tmp_value_type = typename simd_batch_traits::value_type; \ alignas(simd_batch_traits::align) tmp_value_type tmp_lhs[size], tmp_rhs[size], tmp_res[size]; \ lhs.store_aligned(tmp_lhs); \ rhs.store_aligned(tmp_rhs); \ unroller([&](std::size_t i) { \ tmp_res[i] = tmp_lhs[i] FUNC tmp_rhs[i]; \ }); \ return batch_type(&tmp_res[0], aligned_mode()); template inline void unroller_impl(F&& f, detail::index_sequence) { static_cast(std::initializer_list{(f(I), 0)...}); } template inline void unroller(F&& f) { unroller_impl(f, 
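// [Illustrative sketch, not part of xsimd] detail::caster32_t and to_unsigned_integer
// above reinterpret the bits of a value through a union; std::memcpy is the strictly
// portable way to do the same thing. Both variants below print 0xBFC00000 for -1.5f.
#include <cstdint>
#include <cstring>
#include <cstdio>

int main()
{
    float f = -1.5f;

    union { float f; uint32_t u; } caster; // same idea as detail::caster32_t
    caster.f = f;
    std::printf("union bits:  0x%08X\n", (unsigned)caster.u);

    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));  // portable equivalent
    std::printf("memcpy bits: 0x%08X\n", (unsigned)bits);
    return 0;
}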
detail::make_index_sequence{}); } /***************************************** * Supplementary std::array constructors * *****************************************/ namespace detail { // std::array constructor from scalar value ("broadcast") template constexpr std::array array_from_scalar_impl(const T& scalar, index_sequence) { // You can safely ignore this silly ternary, the "scalar" is all // that matters. The rest is just a dirty workaround... return std::array{ (Is+1) ? scalar : T() ... }; } template constexpr std::array array_from_scalar(const T& scalar) { return array_from_scalar_impl(scalar, make_index_sequence()); } // std::array constructor from C-style pointer (handled as an array) template constexpr std::array array_from_pointer_impl(const T* c_array, index_sequence) { return std::array{ c_array[Is]... }; } template constexpr std::array array_from_pointer(const T* c_array) { return array_from_pointer_impl(c_array, make_index_sequence()); } } /************************ * is_array_initializer * ************************/ namespace detail { template struct bool_pack; template using all_true = std::is_same< bool_pack, bool_pack >; template using is_all_convertible = all_true::value...>; template using is_array_initializer = std::enable_if< (sizeof...(Args) == N) && is_all_convertible::value >; // Check that a variadic argument pack is a list of N values of type T, // as usable for instantiating a value of type std::array. template using is_array_initializer_t = typename is_array_initializer::type; } /************** * is_complex * **************/ // This is used in both xsimd_complex_base.hpp and xsimd_traits.hpp // However xsimd_traits.hpp indirectly includes xsimd_complex_base.hpp // so we cannot define is_complex in xsimd_traits.hpp. Besides, if // no file defining batches is included, we still need this definition // in xsimd_traits.hpp, so let's define it here. namespace detail { template struct is_complex : std::false_type { }; template struct is_complex> : std::true_type { }; #ifdef XSIMD_ENABLE_XTL_COMPLEX template struct is_complex> : std::true_type { }; #endif } } #endif xsimd-7.6.0/include/xsimd/xsimd.hpp000066400000000000000000000017701410101234500172500ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
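// [Illustrative sketch, not part of xsimd] The unroller() helper above expands a pack
// of indices into an initializer_list so that f(0), f(1), ..., f(N-1) are emitted
// without a runtime loop counter. The same trick, written against the std:: C++14
// facilities (the detail:: backport above exists so this also works under C++11):
#include <cstddef>
#include <cstdio>
#include <initializer_list>
#include <utility>

template <class F, std::size_t... I>
inline void unroller_impl_sketch(F&& f, std::index_sequence<I...>)
{
    static_cast<void>(std::initializer_list<int>{ (f(I), 0)... });
}

template <std::size_t N, class F>
inline void unroller_sketch(F&& f)
{
    unroller_impl_sketch(std::forward<F>(f), std::make_index_sequence<N>{});
}

int main()
{
    int sum = 0;
    unroller_sketch<4>([&](std::size_t i) { sum += int(i) * int(i); });
    std::printf("sum of squares 0..3 = %d\n", sum); // 14
    return 0;
}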
* ****************************************************************************/ #ifndef XSIMD_HPP #define XSIMD_HPP #include "memory/xsimd_alignment.hpp" #include "config/xsimd_config.hpp" #include "types/xsimd_traits.hpp" #include "math/xsimd_math.hpp" #include "math/xsimd_math_complex.hpp" #include "memory/xsimd_load_store.hpp" #include "stl/algorithms.hpp" #include "stl/iterator.hpp" #endif xsimd-7.6.0/install_sde.sh000066400000000000000000000026311410101234500154760ustar00rootroot00000000000000#git clone https://github.com/marehr/intel-sde-downloader #cd intel-sde-downloader #pip install -r requirements.txt #python ./intel-sde-downloader.py sde-external-8.35.0-2019-03-11-lin.tar.bz2 #wget http://software.intel.com/content/dam/develop/external/us/en/protected/sde-external-8.50.0-2020-03-26-lin.tar.bz2 curl 'https://software.intel.com/content/dam/develop/external/us/en/documents/sde-external-8.56.0-2020-07-05-lin.tar.bz2' -H 'User-Agent: Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:80.0) Gecko/20100101 Firefox/80.0' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' -H 'Accept-Language: en-US,en;q=0.5' --compressed -H 'Connection: keep-alive' -H 'Referer: https://software.intel.com/content/www/us/en/develop/articles/pre-release-license-agreement-for-intel-software-development-emulator-accept-end-user-license-agreement-and-download.html' -H 'Cookie: AWSALB=xdEyuBs+g/QOsC4iy4IiI9PDOuiExTuglYNXQzDO4xoupxFgOsrOCkq4CfnhHc7XBY2SbhWsjn83d6MtgpFxtcCQvBqy9VPBWg+W885Kz5aCj6uHJlTH7gS+t6NS; AWSALBCORS=xdEyuBs+g/QOsC4iy4IiI9PDOuiExTuglYNXQzDO4xoupxFgOsrOCkq4CfnhHc7XBY2SbhWsjn83d6MtgpFxtcCQvBqy9VPBWg+W885Kz5aCj6uHJlTH7gS+t6NS; ref=; OldBrowsersCookie=Cookie for old browser popup message' -H 'Upgrade-Insecure-Requests: 1' --output sde-external-8.56.0-2020-07-05-lin.tar.bz2 tar xvf sde-external-8.56.0-2020-07-05-lin.tar.bz2 sudo sh -c "echo 0 > /proc/sys/kernel/yama/ptrace_scope" xsimd-7.6.0/readthedocs.yml000066400000000000000000000000461410101234500156470ustar00rootroot00000000000000conda: file: docs/environment.yml xsimd-7.6.0/test/000077500000000000000000000000001410101234500136165ustar00rootroot00000000000000xsimd-7.6.0/test/CMakeLists.txt000066400000000000000000000225751410101234500163710ustar00rootroot00000000000000############################################################################ # Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and # # Martin Renou # # Copyright (c) QuantStack # # # # Distributed under the terms of the BSD 3-Clause License. # # # # The full license is in the file LICENSE, distributed with this software. # ############################################################################ cmake_minimum_required(VERSION 3.1) if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR) project(xsimd-test) enable_testing() find_package(xsimd REQUIRED CONFIG) set(XSIMD_INCLUDE_DIR ${xsimd_INCLUDE_DIRS}) endif () if(NOT CMAKE_BUILD_TYPE) message(STATUS "Setting tests build type to Release") set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." 
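// [Illustrative sketch, not part of xsimd] A minimal end-to-end use of the umbrella
// header included above: load two batches, add them, store the result. It assumes a
// 128-bit target (SSE2 or NEON) so that batch<float, 4> exists; simd_type<float>
// would pick the widest available width instead.
#include <cstdio>
#include "xsimd/xsimd.hpp"

int main()
{
    alignas(16) float a[4] = { 1.f, 2.f, 3.f, 4.f };
    alignas(16) float b[4] = { 10.f, 20.f, 30.f, 40.f };
    alignas(16) float r[4];

    xsimd::batch<float, 4> va(a, xsimd::aligned_mode());
    xsimd::batch<float, 4> vb(b, xsimd::aligned_mode());
    (va + vb).store_aligned(r);

    std::printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]); // 11 22 33 44
    return 0;
}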
FORCE) else() message(STATUS "Tests build type is ${CMAKE_BUILD_TYPE}") endif() include(CheckCXXCompilerFlag) string(TOUPPER "${CMAKE_BUILD_TYPE}" U_CMAKE_BUILD_TYPE) OPTION(XSIMD_ENABLE_WERROR "Turn on -Werror" OFF) ################ # ARM SETTINGS # ################ OPTION(CROSS_COMPILE_ARM "cross compile for ARM targets" OFF) # Note: to compile on ARM (or cross compile), you may need to add the following: # -DTARGET_ARCH="armv8-a -mfpu=neon -mfloat-abi=softfp -target arm-linux-gnueabi" set(TARGET_ARCH "native" CACHE STRING "Target architecture arguments") set(ARM_ARCH_DIRECTORY "arm-linux-gnueabi" CACHE STRING "ARM arch header dir") set(ARM_GCC_VER "4.7.3" CACHE STRING "ARM GCC header dir") if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Intel") if (CROSS_COMPILE_ARM) # We're cross-compiling with clang++ on Azure Pipelines, this is all pretty specific and just for testing set(CMAKE_SHARED_LIBRARY_LINK_C_FLAGS) set(CMAKE_SHARED_LIBRARY_LINK_CXX_FLAGS) set(CMAKE_THREAD_LIBS_INIT) set(CMAKE_SYSTEM_PROCESSOR arm) set(CMAKE_C_COMPILER_TARGET arm-linux-gnueabi) set(CMAKE_CXX_COMPILER_TARGET arm-linux-gnueabi) include_directories(/usr/${ARM_ARCH_DIRECTORY}/include/c++/${ARM_GCC_VER}/${ARM_ARCH_DIRECTORY}/) include_directories(/usr/${ARM_ARCH_DIRECTORY}/include/c++/${ARM_GCC_VER}/) include_directories(/usr/${ARM_ARCH_DIRECTORY}/include/) if(NOT CMAKE_CXX_FLAGS MATCHES "-march") message(STATUS "SETTING ARCH TO ${TARGET_ARCH}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${TARGET_ARCH}") endif() if(ARM_ARCH_DIRECTORY MATCHES "arm-linux-gnueabi") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -mfloat-abi=softfp -target arm-linux-gnueabi") else () # delegating to gcc here endif() message(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}") message(STATUS "CMAKE_CXX_LINK_EXECUTABLE: ${CMAKE_CXX_LINK_EXECUTABLE}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wunused-parameter -Wextra -Wreorder -std=c++11") elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "^ppc64") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -fPIC -mcpu=${TARGET_ARCH} -mtune=${TARGET_ARCH} -Wunused-parameter -Wextra -Wreorder -std=c++11") elseif(NOT WIN32) if(NOT CMAKE_CXX_FLAGS MATCHES "-march") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${TARGET_ARCH}") endif() CHECK_CXX_COMPILER_FLAG("-std=c++11" HAS_CPP11_FLAG) if (ENABLE_XTL_COMPLEX) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -fPIC -Wunused-parameter -Wextra -Wreorder -std=c++14") elseif (HAS_CPP11_FLAG) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -fPIC -Wunused-parameter -Wextra -Wreorder -std=c++11") else() message(FATAL_ERROR "Unsupported compiler -- xsimd requires C++11 support!") endif() endif() endif() if(CMAKE_CXX_COMPILER_ID MATCHES MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc /MP /bigobj") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2") set(CMAKE_EXE_LINKER_FLAGS /MANIFEST:NO) endif() if(CMAKE_CXX_COMPILER_ID MATCHES Clang AND WIN32) # We are using clang-cl add_compile_options(/EHsc /bigobj) set(CMAKE_EXE_LINKER_FLAGS /MANIFEST:NO) endif() if(DOWNLOAD_GTEST OR GTEST_SRC_DIR) if(DOWNLOAD_GTEST) # Download and unpack googletest at configure time configure_file(downloadGTest.cmake.in googletest-download/CMakeLists.txt) else() # Copy local source of googletest at configure time configure_file(copyGTest.cmake.in googletest-download/CMakeLists.txt) endif() execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . 
RESULT_VARIABLE result WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download ) if(result) message(FATAL_ERROR "CMake step for googletest failed: ${result}") endif() execute_process(COMMAND ${CMAKE_COMMAND} --build . RESULT_VARIABLE result WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download ) if(result) message(FATAL_ERROR "Build step for googletest failed: ${result}") endif() set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) # Add googletest directly to our build. This defines # the gtest and gtest_main targets. add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/googletest-src ${CMAKE_CURRENT_BINARY_DIR}/googletest-build EXCLUDE_FROM_ALL) set(GTEST_INCLUDE_DIRS "${gtest_SOURCE_DIR}/include") add_library(GTest::GTest INTERFACE IMPORTED) target_link_libraries(GTest::GTest INTERFACE gtest) add_library(GTest::Main INTERFACE IMPORTED) target_link_libraries(GTest::Main INTERFACE gtest_main) else() find_package(GTest REQUIRED) endif() find_package(Threads) include_directories(${GTEST_INCLUDE_DIRS}) set(XSIMD_TESTS main.cpp test_algorithms.cpp test_api.cpp test_basic_math.cpp test_batch.cpp test_batch_bool.cpp test_batch_cast.cpp test_batch_complex.cpp test_batch_float.cpp test_batch_int.cpp test_bitwise_cast.cpp test_constant_batch.cpp test_complex_exponential.cpp test_complex_hyperbolic.cpp test_complex_power.cpp test_complex_trigonometric.cpp test_conversion.cpp test_error_gamma.cpp test_exponential.cpp test_extract_pair.cpp test_fp_manipulation.cpp test_hyperbolic.cpp test_load_store.cpp test_memory.cpp test_poly_evaluation.cpp test_power.cpp test_rounding.cpp test_select.cpp test_shuffle_128.cpp test_trigonometric.cpp test_utils.hpp #[[ xsimd_api_test.hpp xsimd_api_test.cpp xsimd_algorithms.cpp xsimd_basic_test.hpp xsimd_basic_test.cpp xsimd_basic_math_test.hpp xsimd_basic_math_test.cpp xsimd_complex_basic_test.hpp xsimd_complex_tester.hpp xsimd_error_gamma_test.hpp xsimd_error_gamma_test.cpp xsimd_exponential_test.hpp xsimd_exponential_test.cpp xsimd_cexponential_test.hpp xsimd_cexponential_test.cpp xsimd_chyperbolic_test.hpp xsimd_chyperbolic_test.cpp xsimd_cpower_test.hpp xsimd_cpower_test.cpp xsimd_ctrigonometric_test.hpp xsimd_ctrigonometric_test.cpp xsimd_fp_manipulation_test.hpp xsimd_fp_manipulation_test.cpp xsimd_hyperbolic_test.hpp xsimd_hyperbolic_test.cpp xsimd_interface_test.cpp xsimd_memory_test.cpp xsimd_poly_evaluation_test.cpp xsimd_poly_evaluation_test.hpp xsimd_power_test.hpp xsimd_power_test.cpp xsimd_rounding_test.hpp xsimd_rounding_test.cpp xsimd_tester.hpp xsimd_test_utils.hpp xsimd_trigonometric_test.hpp xsimd_trigonometric_test.cpp]] ) add_executable(test_xsimd ${XSIMD_TESTS} ${XSIMD_HEADERS}) target_link_libraries(test_xsimd xsimd GTest::GTest GTest::Main ${CMAKE_THREAD_LIBS_INIT}) target_include_directories(test_xsimd PRIVATE ${XSIMD_INCLUDE_DIR}) add_test(NAME test_xsimd COMMAND test_xsimd) if(DEFINED XSIMD_FORCE_X86_INSTR_SET) message("Forcing XSIMD_FORCE_X86_INSTR_SET to ${XSIMD_FORCE_X86_INSTR_SET}") target_compile_definitions(test_xsimd PRIVATE XSIMD_FORCE_X86_INSTR_SET=${XSIMD_FORCE_X86_INSTR_SET}) endif() if(DEFINED XSIMD_FORCE_X86_AMD_INSTR_SET) message("Forcing XSIMD_FORCE_X86_AMD_INSTR_SET to ${XSIMD_FORCE_X86_AMD_INSTR_SET}") target_compile_definitions(test_xsimd PRIVATE XSIMD_FORCE_X86_AMD_INSTR_SET=${XSIMD_FORCE_X86_AMD_INSTR_SET}) endif() if(DEFINED XSIMD_FORCE_PPC_INSTR_SET) message("Forcing XSIMD_FORCE_PPC_INSTR_SET to ${XSIMD_FORCE_PPC_INSTR_SET}") target_compile_definitions(test_xsimd PRIVATE 
XSIMD_FORCE_PPC_INSTR_SET=${XSIMD_FORCE_PPC_INSTR_SET}) endif() if(DEFINED XSIMD_FORCE_ARM_INSTR_SET) message("Forcing XSIMD_FORCE_ARM_INSTR_SET to ${XSIMD_FORCE_ARM_INSTR_SET}") target_compile_definitions(test_xsimd PRIVATE XSIMD_FORCE_ARM_INSTR_SET=${XSIMD_FORCE_ARM_INSTR_SET}) endif() if (CROSS_COMPILE_ARM) add_custom_target(xtest COMMAND qemu-arm -L /usr/arm-linux-gnueabi/ test_xsimd DEPENDS test_xsimd) else() add_custom_target(xtest COMMAND test_xsimd DEPENDS test_xsimd) endif() if (XSIMD_ENABLE_WERROR) target_compile_options(test_xsimd PRIVATE -Werror -Wall -DXSIMD_SKIP_ON_WERROR) endif() xsimd-7.6.0/test/copyGTest.cmake.in000066400000000000000000000021201410101234500171410ustar00rootroot00000000000000############################################################################ # Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and # # Martin Renou # # Copyright (c) QuantStack # # # # Distributed under the terms of the BSD 3-Clause License. # # # # The full license is in the file LICENSE, distributed with this software. # ############################################################################ cmake_minimum_required(VERSION 2.8.2) project(googletest-download NONE) include(ExternalProject) ExternalProject_Add(googletest URL "${GTEST_SRC_DIR}" SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-src" BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-build" CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" TEST_COMMAND "" ) xsimd-7.6.0/test/downloadGTest.cmake.in000066400000000000000000000022141410101234500200020ustar00rootroot00000000000000############################################################################ # Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and # # Martin Renou # # Copyright (c) QuantStack # # # # Distributed under the terms of the BSD 3-Clause License. # # # # The full license is in the file LICENSE, distributed with this software. # ############################################################################ cmake_minimum_required(VERSION 2.8.2) project(googletest-download NONE) include(ExternalProject) ExternalProject_Add(googletest GIT_REPOSITORY https://github.com/JohanMabille/googletest.git GIT_TAG no_werror SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-src" BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-build" CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" TEST_COMMAND "" ) xsimd-7.6.0/test/main.cpp000066400000000000000000000051271410101234500152530ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #include #include #include #include #include "gtest/gtest.h" #include "xsimd/config/xsimd_instruction_set.hpp" using info_map_type = std::map; info_map_type init_instruction_map() { info_map_type res; #ifdef XSIMD_X86_INSTR_SET_AVAILABLE res[XSIMD_X86_SSE_VERSION] = "Intel SSE"; res[XSIMD_X86_SSE2_VERSION] = "Intel SSE2"; res[XSIMD_X86_SSE3_VERSION] = "Intel SSE3"; res[XSIMD_X86_SSSE3_VERSION] = "Intel SSSE3"; res[XSIMD_X86_SSE4_1_VERSION] = "Intel SSE4.1"; res[XSIMD_X86_SSE4_2_VERSION] = "Intel SSE4.2"; res[XSIMD_X86_AVX_VERSION] = "Intel AVX"; res[XSIMD_X86_AVX512_VERSION] = "Intel AVX 512"; res[XSIMD_X86_FMA3_VERSION] = "Intel FMA3"; res[XSIMD_X86_AVX2_VERSION] = "Intel AVX2"; res[XSIMD_X86_MIC_VERSION] = "Intel MIC"; res[XSIMD_X86_AMD_SSE4A_VERSION] = "AMD SSE4A"; res[XSIMD_X86_AMD_FMA4_VERSION] = "AMD FMA4"; res[XSIMD_X86_AMD_XOP_VERSION] = "AMD XOP"; #elif defined(XSIMD_PPC_INSTR_SET_AVAILABLE) res[XSIMD_PPC_VMX_VERSION] = "PowerPC VM"; res[XSIMD_PPC_VSX_VERSION] = "PowerPC VSX"; res[XSIMD_PPC_QPX_VERSION] = "PowerPC QPX"; #else res[XSIMD_ARM7_NEON_VERSION] = "ARMv7 Neon"; res[XSIMD_ARM8_32_NEON_VERSION] = "ARMv8 32bit Neon"; res[XSIMD_ARM8_64_NEON_VERSION] = "ARMv8 64bit Neon"; res[XSIMD_VERSION_NUMBER_NOT_AVAILABLE] = "No SIMD available"; #endif return res; } std::string get_instruction_set_name() { static info_map_type info_map(init_instruction_map()); return info_map[XSIMD_INSTR_SET]; } int main(int argc, char* argv[]) { std::ofstream out("log/xsimd_info.log", std::ios_base::out); std::string instruction_set = get_instruction_set_name(); out << "Instruction set: " << instruction_set << std::endl; std::cout << "Instruction set: " << instruction_set << std::endl; ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } xsimd-7.6.0/test/test_algorithms.cpp000066400000000000000000000217621410101234500175420ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #include #include "test_utils.hpp" struct binary_functor { template T operator()(const T& a, const T& b) const { return a + b; } }; struct unary_functor { template T operator()(const T& a) const { return -a; } }; #ifdef XSIMD_DEFAULT_ALIGNMENT template using test_allocator_type = xsimd::aligned_allocator; #else template using test_allocator_type = std::allocator; #endif TEST(algorithms, binary_transform) { std::vector expected(93); std::vector a(93, 123), b(93, 123), c(93); std::vector> aa(93, 123), ba(93, 123), ca(93); std::transform(a.begin(), a.end(), b.begin(), expected.begin(), binary_functor{}); xsimd::transform(a.begin(), a.end(), b.begin(), c.begin(), binary_functor{}); EXPECT_TRUE(std::equal(expected.begin(), expected.end(), c.begin()) && expected.size() == c.size()); std::fill(c.begin(), c.end(), -1); // erase xsimd::transform(aa.begin(), aa.end(), ba.begin(), c.begin(), binary_functor{}); EXPECT_TRUE(std::equal(expected.begin(), expected.end(), c.begin()) && expected.size() == c.size()); std::fill(c.begin(), c.end(), -1); // erase xsimd::transform(aa.begin(), aa.end(), b.begin(), c.begin(), binary_functor{}); EXPECT_TRUE(std::equal(expected.begin(), expected.end(), c.begin()) && expected.size() == c.size()); std::fill(c.begin(), c.end(), -1); // erase xsimd::transform(a.begin(), a.end(), ba.begin(), c.begin(), binary_functor{}); EXPECT_TRUE(std::equal(expected.begin(), expected.end(), c.begin()) && expected.size() == c.size()); std::fill(c.begin(), c.end(), -1); // erase xsimd::transform(aa.begin(), aa.end(), ba.begin(), ca.begin(), binary_functor{}); EXPECT_TRUE(std::equal(expected.begin(), expected.end(), ca.begin()) && expected.size() == ca.size()); std::fill(ca.begin(), ca.end(), -1); // erase xsimd::transform(aa.begin(), aa.end(), b.begin(), ca.begin(), binary_functor{}); EXPECT_TRUE(std::equal(expected.begin(), expected.end(), ca.begin()) && expected.size() == ca.size()); std::fill(ca.begin(), ca.end(), -1); // erase xsimd::transform(a.begin(), a.end(), ba.begin(), ca.begin(), binary_functor{}); EXPECT_TRUE(std::equal(expected.begin(), expected.end(), ca.begin()) && expected.size() == ca.size()); std::fill(ca.begin(), ca.end(), -1); // erase } TEST(algorithms, unary_transform) { std::vector expected(93); std::vector a(93, 123), c(93); std::vector> aa(93, 123), ca(93); std::transform(a.begin(), a.end(), expected.begin(), unary_functor{}); xsimd::transform(a.begin(), a.end(), c.begin(), unary_functor{}); EXPECT_TRUE(std::equal(expected.begin(), expected.end(), c.begin()) && expected.size() == c.size()); std::fill(c.begin(), c.end(), -1); // erase xsimd::transform(aa.begin(), aa.end(), c.begin(), unary_functor{}); EXPECT_TRUE(std::equal(expected.begin(), expected.end(), c.begin()) && expected.size() == c.size()); std::fill(c.begin(), c.end(), -1); // erase xsimd::transform(a.begin(), a.end(), ca.begin(), unary_functor{}); EXPECT_TRUE(std::equal(expected.begin(), expected.end(), ca.begin()) && expected.size() == ca.size()); std::fill(ca.begin(), ca.end(), -1); // erase xsimd::transform(aa.begin(), aa.end(), ca.begin(), unary_functor{}); EXPECT_TRUE(std::equal(expected.begin(), expected.end(), ca.begin()) && expected.size() == ca.size()); std::fill(ca.begin(), ca.end(), -1); // erase } class xsimd_reduce : public ::testing::Test { public: using aligned_vec_t = std::vector>; static constexpr std::size_t num_elements = 4 * xsimd::simd_traits::size; static constexpr std::size_t small_num = 
xsimd::simd_traits::size - 1; aligned_vec_t vec = aligned_vec_t(num_elements, 123.); aligned_vec_t small_vec = aligned_vec_t(small_num, 42.); double init = 1337.; struct multiply { template T operator()(const T& a, const T& b) const { return a * b; } }; }; TEST_F(xsimd_reduce, unaligned_begin_unaligned_end) { auto const begin = std::next(vec.begin()); auto const end = std::prev(vec.end()); EXPECT_EQ(std::accumulate(begin, end, init), xsimd::reduce(begin, end, init)); if(small_vec.size() > 1) { auto const sbegin = std::next(small_vec.begin()); auto const send = std::prev(small_vec.end()); EXPECT_EQ(std::accumulate(sbegin, send, init), xsimd::reduce(sbegin, send, init)); } } TEST_F(xsimd_reduce, unaligned_begin_aligned_end) { auto const begin = std::next(vec.begin()); auto const end = vec.end(); EXPECT_EQ(std::accumulate(begin, end, init), xsimd::reduce(begin, end, init)); if(small_vec.size() > 1) { auto const sbegin = std::next(small_vec.begin()); auto const send = small_vec.end(); EXPECT_EQ(std::accumulate(sbegin, send, init), xsimd::reduce(sbegin, send, init)); } } TEST_F(xsimd_reduce, aligned_begin_unaligned_end) { auto const begin = vec.begin(); auto const end = std::prev(vec.end()); EXPECT_EQ(std::accumulate(begin, end, init), xsimd::reduce(begin, end, init)); if(small_vec.size() > 1) { auto const sbegin = small_vec.begin(); auto const send = std::prev(small_vec.end()); EXPECT_EQ(std::accumulate(sbegin, send, init), xsimd::reduce(sbegin, send, init)); } } TEST_F(xsimd_reduce, aligned_begin_aligned_end) { auto const begin = vec.begin(); auto const end = vec.end(); EXPECT_EQ(std::accumulate(begin, end, init), xsimd::reduce(begin, end, init)); if(small_vec.size() > 1) { auto const sbegin = small_vec.begin(); auto const send = small_vec.end(); EXPECT_EQ(std::accumulate(sbegin, send, init), xsimd::reduce(sbegin, send, init)); } } TEST_F(xsimd_reduce, using_custom_binary_function) { auto const begin = vec.begin(); auto const end = vec.end(); EXPECT_DOUBLE_EQ(std::accumulate(begin, end, init, multiply{}), xsimd::reduce(begin, end, init, multiply{})); if(small_vec.size() > 1) { auto const sbegin = small_vec.begin(); auto const send = small_vec.end(); EXPECT_DOUBLE_EQ(std::accumulate(sbegin, send, init, multiply{}), xsimd::reduce(sbegin, send, init, multiply{})); } } #if XSIMD_X86_INSTR_SET > XSIMD_VERSION_NUMBER_NOT_AVAILABLE || XSIMD_ARM_INSTR_SET > XSIMD_VERSION_NUMBER_NOT_AVAILABLE TEST(algorithms, iterator) { std::vector> a(10 * 16, 0.2), b(1000, 2.), c(1000, 3.); std::iota(a.begin(), a.end(), 0.f); std::vector a_cpy(a.begin(), a.end()); using batch_type = typename xsimd::simd_traits::type; auto begin = xsimd::aligned_iterator(&a[0]); auto end = xsimd::aligned_iterator(&a[0] + a.size()); for (; begin != end; ++begin) { *begin = *begin / 2.f; } for (auto& el : a_cpy) { el /= 2.f; } EXPECT_TRUE(a.size() == a_cpy.size() && std::equal(a.begin(), a.end(), a_cpy.begin())); begin = xsimd::aligned_iterator(&a[0]); *begin = sin(*begin); for (std::size_t i = 0; i < batch_type::size; ++i) { EXPECT_NEAR(a[i], sinf(a_cpy[i]), 1e-6); } #ifdef XSIMD_BATCH_DOUBLE_SIZE std::vector, test_allocator_type>> ca(10 * 16, std::complex(0.2)); using cbatch_type = typename xsimd::simd_traits>::type; auto cbegin = xsimd::aligned_iterator(&ca[0]); auto cend = xsimd::aligned_iterator(&ca[0] + a.size()); for (; cbegin != cend; ++cbegin) { *cbegin = (*cbegin + std::complex(0, .3)) / 2.; } cbegin = xsimd::aligned_iterator(&ca[0]); *cbegin = sin(*cbegin); *cbegin = sqrt(*cbegin); auto real_part = abs(*(cbegin)); 
(void)real_part; #endif } #endif xsimd-7.6.0/test/test_api.cpp000066400000000000000000000213441410101234500161360ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #include #include "test_utils.hpp" template class xsimd_api_test : public testing::Test { protected: using batch_type = B; using value_type = typename B::value_type; static constexpr size_t size = B::size; using array_type = std::array; using int8_vector_type = std::vector; using uint8_vector_type = std::vector; using int16_vector_type = std::vector; using uint16_vector_type = std::vector; using int32_vector_type = std::vector; using uint32_vector_type = std::vector; using int64_vector_type = std::vector; using uint64_vector_type = std::vector; using float_vector_type = std::vector; using double_vector_type = std::vector; int8_vector_type i8_vec; uint8_vector_type ui8_vec; int16_vector_type i16_vec; uint16_vector_type ui16_vec; int32_vector_type i32_vec; uint32_vector_type ui32_vec; int64_vector_type i64_vec; uint64_vector_type ui64_vec; float_vector_type f_vec; double_vector_type d_vec; array_type expected; xsimd_api_test() { init_test_vector(i8_vec); init_test_vector(ui8_vec); init_test_vector(i16_vec); init_test_vector(ui16_vec); init_test_vector(i32_vec); init_test_vector(ui32_vec); init_test_vector(i64_vec); init_test_vector(ui64_vec); init_test_vector(f_vec); init_test_vector(d_vec); } void test_load() { test_load_impl(i8_vec, "load int8_t"); test_load_impl(ui8_vec, "load uint8_t"); test_load_impl(i16_vec, "load int16_t"); test_load_impl(ui16_vec, "load uint16_t"); test_load_impl(i32_vec, "load int32_t"); test_load_impl(ui32_vec, "load uint32_t"); test_load_impl(i64_vec, "load int64_t"); test_load_impl(ui64_vec, "load uint64_t"); test_load_impl(f_vec, "load float"); test_load_impl(d_vec, "load double"); } void test_store() { test_store_impl(i8_vec, "load int8_t"); test_store_impl(ui8_vec, "load uint8_t"); test_store_impl(i16_vec, "load int16_t"); test_store_impl(ui16_vec, "load uint16_t"); test_store_impl(i32_vec, "load int32_t"); test_store_impl(ui32_vec, "load uint32_t"); test_store_impl(i64_vec, "load int64_t"); test_store_impl(ui64_vec, "load uint64_t"); test_store_impl(f_vec, "load float"); test_store_impl(d_vec, "load double"); } void test_set() { test_set_impl("set int8_t"); test_set_impl("set uint8_t"); test_set_impl("set int16_t"); test_set_impl("set uint16_t"); test_set_impl("set int32_t"); test_set_impl("set uint32_t"); test_set_impl("set int64_t"); test_set_impl("set uint64_t"); test_set_impl("set float"); test_set_impl("set double"); } private: template void test_load_impl(const V& v, const std::string& name) { using src_value_type = typename V::value_type; batch_type b; std::copy(v.cbegin(), v.cend(), expected.begin()); b = xsimd::load_simd(v.data(), xsimd::unaligned_mode()); EXPECT_BATCH_EQ(b, expected) << print_function_name(name + " unaligned"); b = xsimd::load_simd(v.data(), xsimd::aligned_mode()); EXPECT_BATCH_EQ(b, expected) << print_function_name(name + " aligned"); } template void test_store_impl(const V& v, const std::string& name) { using src_value_type = typename V::value_type; batch_type b = 
xsimd::load_simd(v.data(), xsimd::aligned_mode()); V res(size); xsimd::store_simd(res.data(), b, xsimd::unaligned_mode()); EXPECT_VECTOR_EQ(res, v) << print_function_name(name + " unaligned"); xsimd::store_simd(res.data(), b, xsimd::aligned_mode()); EXPECT_VECTOR_EQ(res, v) << print_function_name(name + " aligned"); } template void test_set_impl(const std::string& name) { T v = T(1); batch_type expected(v); batch_type res = xsimd::set_simd(v); EXPECT_BATCH_EQ(res, expected) << print_function_name(name); } template void init_test_vector(V& vec) { vec.resize(size); value_type min = value_type(0); value_type max = value_type(100); std::default_random_engine generator; std::uniform_int_distribution distribution(min, max); auto gen = [&distribution, &generator](){ return static_cast(distribution(generator)); }; std::generate(vec.begin(), vec.end(), gen); } }; using xsimd_api_types = testing::Types< #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX512_VERSION xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch #elif XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch #elif XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch #elif XSIMD_ARM_INSTR_SET >= XSIMD_ARM7_NEON_VERSION xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch #if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION , xsimd::batch #endif #endif >; TYPED_TEST_SUITE(xsimd_api_test, xsimd_api_types, simd_test_names); TYPED_TEST(xsimd_api_test, load) { this->test_load(); } TYPED_TEST(xsimd_api_test, store) { this->test_store(); } #ifdef XSIMD_BATCH_DOUBLE_SIZE TYPED_TEST(xsimd_api_test, set) { this->test_set(); } #endif xsimd-7.6.0/test/test_basic_math.cpp000066400000000000000000000126231410101234500174570ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #include "test_utils.hpp" namespace detail { template ::value> struct infinity_tester { static void test_isfinite() { T input(1.); EXPECT_TRUE(xsimd::all(xsimd::isfinite(input))) << print_function_name("isfinite"); } static void test_isinf() { T input(1.); EXPECT_FALSE(xsimd::any(xsimd::isinf(input))) << print_function_name("isfinite"); } }; template struct infinity_tester { static void test_isfinite() { T input = xsimd::infinity(); EXPECT_FALSE(xsimd::any(xsimd::isfinite(input))) << print_function_name("isfinite"); } static void test_isinf() { T input = xsimd::infinity(); EXPECT_TRUE(xsimd::all(xsimd::isinf(input))) << print_function_name("isfinite"); } }; } template class basic_math_test : public testing::Test { protected: using batch_type = B; using value_type = typename B::value_type; static constexpr size_t size = B::size; using array_type = std::array; array_type lhs; array_type rhs; array_type clip_input; array_type from_input; basic_math_test() { for (size_t i = 0; i < size; ++i) { lhs[i] = value_type(i) / 4 + value_type(1.2) * std::sqrt(value_type(i + 0.25)) + value_type(1.); rhs[i] = value_type(10.2) / (i + 2) + value_type(0.25) + value_type(1.); clip_input[i] = i * value_type(0.25); from_input[i] = rhs[i] - value_type(1); } } void test_basic_functions() const { // fmod { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return std::fmod(l, r); }); batch_type res = xsimd::fmod(batch_lhs(), batch_rhs()); EXPECT_BATCH_EQ(res, expected) << print_function_name("fmod"); } // remainder { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return std::remainder(l, r); }); batch_type res = xsimd::remainder(batch_lhs(), batch_rhs()); EXPECT_BATCH_EQ(res, expected) << print_function_name("remainder"); } // fdim { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return std::fdim(l, r); }); batch_type res = xsimd::fdim(batch_lhs(), batch_rhs()); EXPECT_BATCH_EQ(res, expected) << print_function_name("fdim"); } // clip { value_type clip_lo = static_cast(0.5); value_type clip_hi = static_cast(1.); array_type expected; std::transform(clip_input.cbegin(), clip_input.cend(), expected.begin(), [clip_lo, clip_hi](const value_type& l) { return l < clip_lo ? clip_lo : clip_hi < l ? 
clip_hi : l; }); batch_type res = xsimd::clip(batch_clip_input(), batch_type(clip_lo), batch_type(clip_hi)); EXPECT_BATCH_EQ(res, expected) << print_function_name("clip"); } // isfinite { detail::infinity_tester::test_isfinite(); } // isinf { detail::infinity_tester::test_isinf(); } // nextafter { array_type expected; std::transform(from_input.cbegin(), from_input.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return std::nextafter(l, r); }); batch_type res = xsimd::nextafter(batch_from_input(), batch_rhs()); EXPECT_BATCH_EQ(res, expected) << print_function_name("nextafter"); } } private: batch_type batch_lhs() const { return batch_type(lhs.data()); } batch_type batch_rhs() const { return batch_type(rhs.data()); } batch_type batch_clip_input() const { return batch_type(clip_input.data()); } batch_type batch_from_input() const { return batch_type(from_input.data()); } }; TYPED_TEST_SUITE(basic_math_test, batch_math_types, simd_test_names); TYPED_TEST(basic_math_test, basic_functions) { this->test_basic_functions(); } xsimd-7.6.0/test/test_batch.cpp000066400000000000000000000641441410101234500164530ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #include #include #include #include "test_utils.hpp" using namespace std::placeholders; template class batch_test : public testing::Test { protected: using batch_type = B; using value_type = typename B::value_type; static constexpr size_t size = B::size; using array_type = std::array; using bool_array_type = std::array; array_type lhs; array_type rhs; value_type scalar; batch_test() { init_operands(); } void test_load_store() const { array_type res; batch_type b; b.load_unaligned(lhs.data()); b.store_unaligned(res.data()); EXPECT_EQ(res, lhs) << print_function_name("load_unaligned / store_unaligned"); alignas(XSIMD_DEFAULT_ALIGNMENT) array_type arhs(this->rhs); alignas(XSIMD_DEFAULT_ALIGNMENT) array_type ares; b.load_aligned(arhs.data()); b.store_aligned(ares.data()); EXPECT_EQ(ares, rhs) << print_function_name("load_aligned / store_aligned"); } void test_constructors() const { array_type tmp; std::fill(tmp.begin(), tmp.end(), value_type(2)); batch_type b0(2); EXPECT_EQ(b0, tmp) << print_function_name("batch(value_type)"); batch_type b1(lhs.data()); EXPECT_EQ(b1, lhs) << print_function_name("batch(value_type*)"); } void test_static_builders() const { { array_type expected; std::fill(expected.begin(), expected.end(), value_type(2)); auto res = batch_type::broadcast(value_type(2)); EXPECT_EQ(res, expected) << print_function_name("batch::broadcast"); } { array_type res; auto b = batch_type::from_unaligned(lhs.data()); b.store_unaligned(res.data()); EXPECT_EQ(res, lhs) << print_function_name("batch::from_unaligned"); } { alignas(XSIMD_DEFAULT_ALIGNMENT) array_type arhs(this->rhs); alignas(XSIMD_DEFAULT_ALIGNMENT) array_type ares; auto b = batch_type::from_aligned(arhs.data()); b.store_aligned(ares.data()); EXPECT_EQ(ares, rhs) << print_function_name("batch::from_aligned"); } } void test_access_operator() const { batch_type res = batch_lhs(); for (size_t i = 0; i < size; ++i) { EXPECT_EQ(res[i], lhs[i]) << 
print_function_name("operator[](") << i << ")"; } } void test_arithmetic() const { // +batch { array_type expected = lhs; batch_type res = +batch_lhs(); EXPECT_BATCH_EQ(res, expected) << print_function_name("+batch"); } // -batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::negate()); batch_type res = -batch_lhs(); EXPECT_BATCH_EQ(res, expected) << print_function_name("-batch"); } // batch + batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), std::plus()); batch_type res = batch_lhs() + batch_rhs(); EXPECT_BATCH_EQ(res, expected) << print_function_name("batch + batch"); } // batch + scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::plus(), _1, scalar)); batch_type lres = batch_lhs() + scalar; EXPECT_BATCH_EQ(lres, expected) << print_function_name("batch + scalar"); batch_type rres = scalar + batch_lhs(); EXPECT_BATCH_EQ(rres, expected) << print_function_name("scalar + batch"); } // batch - batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), std::minus()); batch_type res = batch_lhs() - batch_rhs(); EXPECT_BATCH_EQ(res, expected) << print_function_name("batch - batch"); } // batch - scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::minus(), _1, scalar)); batch_type lres = batch_lhs() - scalar; EXPECT_BATCH_EQ(lres, expected) << print_function_name("batch - scalar"); std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::minus(), scalar, _1)); batch_type rres = scalar - batch_lhs(); EXPECT_BATCH_EQ(rres, expected) << print_function_name("scalar - batch"); } // batch * batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), std::multiplies()); batch_type res = batch_lhs() * batch_rhs(); EXPECT_BATCH_EQ(res, expected) << print_function_name("batch * batch"); } // batch * scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::multiplies(), _1, scalar)); batch_type lres = batch_lhs() * scalar; EXPECT_BATCH_EQ(lres, expected) << print_function_name("batch * scalar"); batch_type rres = scalar * batch_lhs(); EXPECT_BATCH_EQ(rres, expected) << print_function_name("scalar * batch"); } // batch / batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), std::divides()); batch_type res = batch_lhs() / batch_rhs(); EXPECT_BATCH_EQ(res, expected) << print_function_name("batch / batch"); } // batch / scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::divides(), _1, scalar)); batch_type lres = batch_lhs() / scalar; EXPECT_BATCH_EQ(lres, expected) << print_function_name("batch / scalar"); std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::divides(), scalar, _1)); batch_type rres = scalar / batch_lhs(); EXPECT_BATCH_EQ(rres, expected) << print_function_name("scalar / batch"); } } void test_saturated_arithmetic() const { // batch + batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), xsimd::sadd); batch_type res = xsimd::sadd(batch_lhs(), batch_rhs()); EXPECT_BATCH_EQ(res, expected) << print_function_name("sadd(batch, batch)"); } // batch + scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(xsimd::sadd, _1, scalar)); batch_type lres = 
xsimd::sadd(batch_lhs(), scalar); EXPECT_BATCH_EQ(lres, expected) << print_function_name("sadd(batch, scalar)"); batch_type rres = xsimd::sadd(scalar, batch_lhs()); EXPECT_BATCH_EQ(rres, expected) << print_function_name("sadd(scalar, batch)"); } // batch - batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), xsimd::ssub); batch_type res = xsimd::ssub(batch_lhs(), batch_rhs()); EXPECT_BATCH_EQ(res, expected) << print_function_name("ssub(batch, batch)"); } // batch - scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(xsimd::ssub, _1, scalar)); batch_type lres = xsimd::ssub(batch_lhs(), scalar); EXPECT_BATCH_EQ(lres, expected) << print_function_name("ssub(batch, scalar)"); std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(xsimd::ssub, scalar, _1)); batch_type rres = xsimd::ssub(scalar, batch_lhs()); EXPECT_BATCH_EQ(rres, expected) << print_function_name("ssub(scalar, batch)"); } } void test_computed_assignment() const { // batch += batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), std::plus()); batch_type res = batch_lhs(); res += batch_rhs(); EXPECT_BATCH_EQ(res, expected) << print_function_name("batch += batch"); } // batch += scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::plus(), _1, scalar)); batch_type res = batch_lhs(); res += scalar; EXPECT_BATCH_EQ(res, expected) << print_function_name("batch += scalar"); } // batch -= batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), std::minus()); batch_type res = batch_lhs(); res -= batch_rhs(); EXPECT_BATCH_EQ(res, expected) << print_function_name("batch -= batch"); } // batch -= scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::minus(), _1, scalar)); batch_type res = batch_lhs(); res -= scalar; EXPECT_BATCH_EQ(res, expected) << print_function_name("batch -= scalar"); } // batch *= batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), std::multiplies()); batch_type res = batch_lhs(); res *= batch_rhs(); EXPECT_BATCH_EQ(res, expected) << print_function_name("batch *= batch"); } // batch *= scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::multiplies(), _1, scalar)); batch_type res = batch_lhs(); res *= scalar; EXPECT_BATCH_EQ(res, expected) << print_function_name("batch *= scalar"); } // batch /= batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), std::divides()); batch_type res = batch_lhs(); res /= batch_rhs(); EXPECT_BATCH_EQ(res, expected) << print_function_name("batch /= batch"); } // batch /= scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::divides(), _1, scalar)); batch_type res = batch_lhs(); res /= scalar; EXPECT_BATCH_EQ(res, expected) << print_function_name("batch /= scalar"); } } void test_comparison() const { // batch == batch { bool_array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return l == r; }); auto res = batch_lhs() == batch_rhs(); EXPECT_BATCH_EQ(res, expected) << print_function_name("batch == batch"); } // batch == scalar { bool_array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), 
[this](const value_type& l) { return l == scalar; }); auto res = batch_lhs() == scalar; EXPECT_BATCH_EQ(res, expected) << print_function_name("batch == scalar"); } // batch != batch { bool_array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return l != r; }); auto res = batch_lhs() != batch_rhs(); EXPECT_BATCH_EQ(res, expected) << print_function_name("batch != batch"); } // batch != scalar { bool_array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [this](const value_type& l) { return l != scalar; }); auto res = batch_lhs() != scalar; EXPECT_BATCH_EQ(res, expected) << print_function_name("batch != scalar"); } // batch < batch { bool_array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return l < r; }); auto res = batch_lhs() < batch_rhs(); EXPECT_BATCH_EQ(res, expected) << print_function_name("batch < batch"); } // batch < scalar { bool_array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [this](const value_type& l) { return l < scalar; }); auto res = batch_lhs() < scalar; EXPECT_BATCH_EQ(res, expected) << print_function_name("batch < scalar"); } // batch <= batch { bool_array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return l <= r; }); auto res = batch_lhs() <= batch_rhs(); EXPECT_BATCH_EQ(res, expected) << print_function_name("batch <= batch"); } // batch <= scalar { bool_array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [this](const value_type& l) { return l <= scalar; }); auto res = batch_lhs() <= scalar; EXPECT_BATCH_EQ(res, expected) << print_function_name("batch <= scalar"); } // batch > batch { bool_array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return l > r; }); auto res = batch_lhs() > batch_rhs(); EXPECT_BATCH_EQ(res, expected) << print_function_name("batch > batch"); } // batch > scalar { bool_array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [this](const value_type& l) { return l > scalar; }); auto res = batch_lhs() > scalar; EXPECT_BATCH_EQ(res, expected) << print_function_name("batch > scalar"); } // batch >= batch { bool_array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return l >= r; }); auto res = batch_lhs() >= batch_rhs(); EXPECT_BATCH_EQ(res, expected) << print_function_name("batch >= batch"); } // batch >= scalar { bool_array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [this](const value_type& l) { return l >= scalar; }); auto res = batch_lhs() >= scalar; EXPECT_BATCH_EQ(res, expected) << print_function_name("batch >= scalar"); } } void test_min_max() const { // min { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return std::min(l, r); }); batch_type res = min(batch_lhs(), batch_rhs()); EXPECT_BATCH_EQ(res, expected) << print_function_name("min"); } // min limit case { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& , const value_type& r) { return std::min(std::numeric_limits::min(), r); }); batch_type res = 
xsimd::min(batch_type(std::numeric_limits::min()), batch_rhs()); EXPECT_BATCH_EQ(res, expected) << print_function_name("min limit"); } // fmin { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return std::fmin(l, r); }); batch_type res = min(batch_lhs(), batch_rhs()); EXPECT_BATCH_EQ(res, expected) << print_function_name("fmin"); } // max { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return std::max(l, r); }); batch_type res = max(batch_lhs(), batch_rhs()); EXPECT_BATCH_EQ(res, expected) << print_function_name("max"); } // max limit case { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& , const value_type& r) { return std::max(std::numeric_limits::max(), r); }); batch_type res = xsimd::max(batch_type(std::numeric_limits::max()), batch_rhs()); EXPECT_BATCH_EQ(res, expected) << print_function_name("max limit"); } // fmax { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return std::fmax(l, r); }); batch_type res = fmax(batch_lhs(), batch_rhs()); EXPECT_BATCH_EQ(res, expected) << print_function_name("fmax"); } } void test_fused_operations() const { // fma { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.begin(), expected.begin(), [](const value_type& l, const value_type& r) { return l * r + r; }); // Warning: ADL seems to not work correctly on Windows, thus the full qualified call batch_type res = xsimd::fma(batch_lhs(), batch_rhs(), batch_rhs()); EXPECT_BATCH_EQ(res, expected) << print_function_name("fma"); } // fms { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.begin(), expected.begin(), [](const value_type& l, const value_type& r) { return l * r - r; }); batch_type res = fms(batch_lhs(), batch_rhs(), batch_rhs()); EXPECT_BATCH_EQ(res, expected) << print_function_name("fms"); } // fnma { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.begin(), expected.begin(), [](const value_type& l, const value_type& r) { return -l * r + r; }); batch_type res = fnma(batch_lhs(), batch_rhs(), batch_rhs()); EXPECT_BATCH_EQ(res, expected) << print_function_name("fnma"); } // fnms { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.begin(), expected.begin(), [](const value_type& l, const value_type& r) { return -l * r - r; }); batch_type res = fnms(batch_lhs(), batch_rhs(), batch_rhs()); EXPECT_BATCH_EQ(res, expected) << print_function_name("fnms"); } } void test_abs() const { // abs { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [](const value_type& l) { return ::detail::uabs(l); }); batch_type res = abs(batch_lhs()); EXPECT_BATCH_EQ(res, expected) << print_function_name("abs"); } // fabs { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [](const value_type& l) { return std::fabs(l); }); batch_type res = fabs(batch_lhs()); EXPECT_BATCH_EQ(res, expected) << print_function_name("fabs"); } } void test_horizontal_operations() const { // hadd { value_type expected = std::accumulate(lhs.cbegin(), lhs.cend(), value_type(0)); value_type res = hadd(batch_lhs()); EXPECT_SCALAR_EQ(res, expected) << print_function_name("hadd"); } } void test_boolean_conversions() const { using batch_bool_type = typename batch_type::batch_bool_type; // 
batch = true { batch_bool_type tbt(true); batch_type expected = batch_type(value_type(1)); batch_type res = tbt; EXPECT_BATCH_EQ(res, expected) << print_function_name("batch = true"); } // batch = false { batch_bool_type fbt(false); batch_type expected = batch_type(value_type(0)); batch_type res = fbt; EXPECT_BATCH_EQ(res, expected) << print_function_name("batch = false"); } // !batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [](const value_type& l) { return !l; }); batch_type res = !batch_lhs(); EXPECT_BATCH_EQ(res, expected) << print_function_name("!batch"); } // bitwise_cast { batch_bool_type fbt(false); batch_type expected = batch_type(value_type(0)); batch_type res = bitwise_cast(fbt); EXPECT_BATCH_EQ(res, expected) << print_function_name("bitwise_cast"); } // bitwise not { batch_bool_type fbt(true); batch_type expected = batch_type(value_type(0)); batch_type res = ~bitwise_cast(fbt); EXPECT_BATCH_EQ(res, expected) << print_function_name("~batch"); } } void test_iterator() const { array_type expected = lhs; batch_type v = batch_lhs(); array_type res; // iterator { std::copy(v.begin(), v.end(), res.begin()); EXPECT_EQ(res, expected) << print_function_name("iterator"); } // constant iterator { std::copy(v.cbegin(), v.cend(), res.begin()); EXPECT_EQ(res, expected) << print_function_name("const iterator"); } // reverse iterator { std::copy(v.rbegin(), v.rend(), res.rbegin()); EXPECT_EQ(res, expected) << print_function_name("reverse iterator"); } // constant reverse iterator { std::copy(v.crbegin(), v.crend(), res.rbegin()); EXPECT_EQ(res, expected) << print_function_name("const reverse iterator"); } } private: batch_type batch_lhs() const { return batch_type(lhs.data()); } batch_type batch_rhs() const { return batch_type(rhs.data()); } template xsimd::enable_integral_t init_operands() { for (size_t i = 0; i < size; ++i) { bool negative_lhs = std::is_signed::value && (i % 2 == 1); lhs[i] = value_type(i) * (negative_lhs ? 
-10 : 10); if (lhs[i] == value_type(0)) { lhs[i] += value_type(1); } rhs[i] = value_type(i) + value_type(4); } scalar = value_type(3); } template xsimd::enable_floating_point_t init_operands() { for (size_t i = 0; i < size; ++i) { lhs[i] = value_type(i) / 4 + value_type(1.2) * std::sqrt(value_type(i + 0.25)); if (lhs[i] == value_type(0)) { lhs[i] += value_type(0.1); } rhs[i] = value_type(10.2) / (i + 2) + value_type(0.25); } scalar = value_type(1.2); } }; TYPED_TEST_SUITE(batch_test, batch_types, simd_test_names); TYPED_TEST(batch_test, load_store) { this->test_load_store(); } TYPED_TEST(batch_test, constructors) { this->test_constructors(); } TYPED_TEST(batch_test, static_builders) { this->test_static_builders(); } TYPED_TEST(batch_test, access_operator) { this->test_access_operator(); } TYPED_TEST(batch_test, arithmetic) { this->test_arithmetic(); } TYPED_TEST(batch_test, saturated_arithmetic) { this->test_saturated_arithmetic(); } TYPED_TEST(batch_test, computed_assignment) { this->test_computed_assignment(); } TYPED_TEST(batch_test, comparison) { this->test_comparison(); } TYPED_TEST(batch_test, min_max) { this->test_min_max(); } TYPED_TEST(batch_test, fused_operations) { this->test_fused_operations(); } TYPED_TEST(batch_test, abs) { this->test_abs(); } TYPED_TEST(batch_test, horizontal_operations) { this->test_horizontal_operations(); } TYPED_TEST(batch_test, boolean_conversions) { this->test_boolean_conversions(); } TYPED_TEST(batch_test, iterator) { this-> test_iterator(); } xsimd-7.6.0/test/test_batch_bool.cpp000066400000000000000000000262601410101234500174630ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #include #include "test_utils.hpp" namespace xsimd { template struct get_bool_base { using vector_type = std::array; std::vector almost_all_false() { std::vector vectors; vectors.reserve(N); for (size_t i = 0; i < N; ++i) { vector_type v; v.fill(false); v[i] = true; vectors.push_back(std::move(v)); } return vectors; } std::vector almost_all_true() { auto vectors = almost_all_false(); flip(vectors); return vectors; } void flip(vector_type& vec) { std::transform(vec.begin(), vec.end(), vec.begin(), std::logical_not{}); } void flip(std::vector& vectors) { for (auto& vec : vectors) { flip(vec); } } }; template struct get_bool; template struct get_bool> : public get_bool_base { using type = batch_bool; type all_true = type(true); type all_false = type(false); type half = type(0, 1); type ihalf = type(1, 0); type interspersed = type(0, 1); }; template struct get_bool> : public get_bool_base { using type = batch_bool; type all_true = type(1); type all_false = type(0); type half = type(0, 0, 1, 1); type ihalf = type(1, 1, 0, 0); type interspersed = type(0, 1, 0, 1); }; template struct get_bool> : public get_bool_base { using type = batch_bool; type all_true = type(true); type all_false = type(false); type half = type(0, 0, 0, 0, 1, 1, 1, 1); type ihalf = type(1, 1, 1, 1, 0, 0, 0, 0); type interspersed = type(0, 1, 0, 1, 0, 1, 0, 1); }; template struct get_bool> : public get_bool_base { using type = batch_bool; type all_true = type(true); type all_false = type(false); type half = type(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1); type ihalf = type(1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0); type interspersed = type(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); }; template struct get_bool> : public get_bool_base { using type = batch_bool; type all_true = type(true); type all_false = type(false); type half = type(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); type ihalf = type(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); type interspersed = type(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); }; template struct get_bool> : public get_bool_base { using type = batch_bool; type all_true = type(true); type all_false = type(false); type half = type (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); type ihalf = type(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); type interspersed = type(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); }; // For fallbacks template struct get_bool> : public get_bool_base { using type = batch_bool; type all_true = type(true); type all_false = type(false); type half = type(false, false, true); type ihalf = type(true, true, false); }; template struct get_bool> : public get_bool_base { using type = batch_bool; type all_true = type(true); type all_false = type(false); type half = type(false, false, false, false, true, true, true); type ihalf = type(true, true, true, true, false, false, false); }; } template class 
batch_bool_test : public testing::Test { protected: using batch_type = B; using value_type = typename B::value_type; static constexpr size_t size = B::size; using batch_bool_type = typename B::batch_bool_type; using array_type = std::array; using bool_array_type = std::array; array_type lhs; array_type rhs; bool_array_type ba; batch_bool_test() { for (size_t i = 0; i < size; ++i) { lhs[i] = value_type(i); rhs[i] = i == 0%2 ? lhs[i] : lhs[i] * 2; ba[i] = i == 0%2 ? true : false; } } void test_load_store() const { bool_array_type res; batch_bool_type b; b.load_unaligned(ba); b.store_unaligned(res.data()); EXPECT_EQ(res, ba) << print_function_name("load_unaligned / store_unaligned"); alignas(XSIMD_DEFAULT_ALIGNMENT) bool_array_type arhs(this->ba); alignas(XSIMD_DEFAULT_ALIGNMENT) bool_array_type ares; b.load_aligned(arhs.data()); b.store_aligned(ares.data()); EXPECT_EQ(ares, arhs) << print_function_name("load_aligned / store_aligned"); } void test_any_all() const { auto bool_g = xsimd::get_bool{}; // any { auto any_check_false = (batch_lhs() != batch_lhs()); bool any_res_false = xsimd::any(any_check_false); EXPECT_FALSE(any_res_false) << print_function_name("any (false)"); auto any_check_true = (batch_lhs() == batch_rhs()); bool any_res_true = xsimd::any(any_check_true); EXPECT_TRUE(any_res_true) << print_function_name("any (true)"); for (const auto& vec : bool_g.almost_all_false()) { batch_bool_type b; b.load_unaligned(vec.data()); bool any_res = xsimd::any(b); EXPECT_TRUE(any_res) << print_function_name("any (almost_all_false)"); } for (const auto& vec : bool_g.almost_all_true()) { batch_bool_type b; b.load_unaligned(vec.data()); bool any_res = xsimd::any(b); EXPECT_TRUE(any_res) << print_function_name("any (almost_all_true)"); } } // all { auto all_check_false = (batch_lhs() == batch_rhs()); bool all_res_false = xsimd::all(all_check_false); EXPECT_FALSE(all_res_false) << print_function_name("all (false)"); auto all_check_true = (batch_lhs() == batch_lhs()); bool all_res_true = xsimd::all(all_check_true); EXPECT_TRUE(all_res_true) << print_function_name("all (true)"); for (const auto& vec : bool_g.almost_all_false()) { // TODO: implement batch_bool(bool*) // It currently compiles (need to understand why) but does not // give expected result batch_bool_type b; b.load_unaligned(vec.data()); bool all_res = xsimd::all(b); EXPECT_FALSE(all_res) << print_function_name("all (almost_all_false)"); } for (const auto& vec : bool_g.almost_all_true()) { batch_bool_type b; b.load_unaligned(vec.data()); bool all_res = xsimd::all(b); EXPECT_FALSE(all_res) << print_function_name("all (almost_all_true)"); } } } void test_logical_operations() const { auto bool_g = xsimd::get_bool{}; size_t s = size; // operator!= { bool res = xsimd::all(bool_g.half != bool_g.ihalf); EXPECT_TRUE(res) << print_function_name("operator!="); } // operator== { bool res = xsimd::all(bool_g.half == !bool_g.ihalf); EXPECT_TRUE(res) << print_function_name("operator=="); } // operator && { batch_bool_type res = bool_g.half && bool_g.ihalf; bool_array_type ares; res.store_unaligned(ares.data()); size_t nb_false = std::count(ares.cbegin(), ares.cend(), false); EXPECT_EQ(nb_false, s) << print_function_name("operator&&"); } // operator || { batch_bool_type res = bool_g.half || bool_g.ihalf; bool_array_type ares; res.store_unaligned(ares.data()); size_t nb_false = std::count(ares.cbegin(), ares.cend(), true); EXPECT_EQ(nb_false, s) << print_function_name("operator||"); } } void test_bitwise_operations() const { auto bool_g = 
xsimd::get_bool{}; // operator~ { bool res = xsimd::all(bool_g.half == ~bool_g.ihalf); EXPECT_TRUE(res) << print_function_name("operator~"); } // operator| { bool res = xsimd::all((bool_g.half | bool_g.ihalf) == bool_g.all_true); EXPECT_TRUE(res) << print_function_name("operator|"); } // operator& { bool res = xsimd::all((bool_g.half & bool_g.ihalf) == bool_g.all_false); EXPECT_TRUE(res) << print_function_name("operator&"); } } private: batch_type batch_lhs() const { return batch_type(lhs.data()); } batch_type batch_rhs() const { return batch_type(rhs.data()); } }; TYPED_TEST_SUITE(batch_bool_test, batch_types, simd_test_names); TYPED_TEST(batch_bool_test, load_store) { this->test_load_store(); } TYPED_TEST(batch_bool_test, any_all) { this->test_any_all(); } TYPED_TEST(batch_bool_test, logical_operations) { this->test_logical_operations(); } TYPED_TEST(batch_bool_test, bitwise_operations) { this->test_bitwise_operations(); } xsimd-7.6.0/test/test_batch_cast.cpp000066400000000000000000000367651410101234500174750ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #include "test_utils.hpp" namespace detail { template inline typename std::enable_if::value && std::is_integral::value, bool>::type is_convertible(T_in value) { return static_cast(value) <= static_cast(std::numeric_limits::max()); } template inline typename std::enable_if::value && std::is_signed::value && std::is_integral::value && std::is_signed::value, bool>::type is_convertible(T_in value) { int64_t signed_value = static_cast(value); return signed_value <= static_cast(std::numeric_limits::max()) && signed_value >= static_cast(std::numeric_limits::lowest()); } template inline typename std::enable_if::value && std::is_signed::value && std::is_unsigned::value, bool>::type is_convertible(T_in value) { return value >= 0 && is_convertible(static_cast(value)); } template inline typename std::enable_if::value && std::is_integral::value, bool>::type is_convertible(T_in value) { return value <= static_cast(std::numeric_limits::max()) && value >= static_cast(std::numeric_limits::lowest()); } template inline typename std::enable_if::value, bool>::type is_convertible(T_in) { return true; } } template class batch_cast_test : public testing::Test { protected: static constexpr size_t N = CP::size; static constexpr size_t A = CP::alignment; using int8_batch = xsimd::batch; using uint8_batch = xsimd::batch; using int16_batch = xsimd::batch; using uint16_batch = xsimd::batch; using int32_batch = xsimd::batch; using uint32_batch = xsimd::batch; using int64_batch = xsimd::batch; using uint64_batch = xsimd::batch; using float_batch = xsimd::batch; using double_batch = xsimd::batch; std::vector int_test_values; std::vector float_test_values; std::vector double_test_values; batch_cast_test() { int_test_values = { 0, 0x01, 0x7f, 0x80, 0xff, 0x0100, 0x7fff, 0x8000, 0xffff, 0x00010000, 0x7fffffff, 0x80000000, 0xffffffff, 0x0000000100000000, 0x7fffffffffffffff, 0x8000000000000000, 0xffffffffffffffff }; float_test_values = { 0.0f, 1.0f, -1.0f, 127.0f, 128.0f, -128.0f, 255.0f, 256.0f, -256.0f, 32767.0f, 32768.0f, -32768.0f, 65535.0f, 65536.0f, -65536.0f, 2147483647.0f, 
2147483648.0f, -2147483648.0f, 4294967167.0f }; double_test_values = { 0.0, 1.0, -1.0, 127.0, 128.0, -128.0, 255.0, 256.0, -256.0, 32767.0, 32768.0, -32768.0, 65535.0, 65536.0, -65536.0, 2147483647.0, 2147483648.0, -2147483648.0, 4294967295.0, 4294967296.0, -4294967296.0, 9223372036854775807.0, 9223372036854775808.0, -9223372036854775808.0, 18446744073709550591.0 }; } void test_cast() const { for (const auto& test_value : int_test_values) { test_cast_impl(test_value, "batch cast int8 -> int8"); test_cast_impl(test_value, "batch cast int8 -> uint8"); test_cast_impl(test_value, "batch cast uint8 -> int8"); test_cast_impl(test_value, "batch cast uint8 -> uint8"); test_cast_impl(test_value, "batch cast int16 -> int16"); test_cast_impl(test_value, "batch cast int16 -> uint16"); test_cast_impl(test_value, "batch cast uint16 -> int16"); test_cast_impl(test_value, "batch cast uint16 -> uint16"); test_cast_impl(test_value, "batch cast int32 -> int32"); test_cast_impl(test_value, "batch cast int32 -> uint32"); test_cast_impl(test_value, "batch cast int32 -> float"); test_cast_impl(test_value, "batch cast uint32 -> int32"); test_cast_impl(test_value, "batch cast uint32 -> uint32"); test_cast_impl(test_value, "batch cast uint32 -> float"); test_cast_impl(test_value, "batch cast int64 -> int64"); test_cast_impl(test_value, "batch cast int64 -> uint64"); test_cast_impl(test_value, "batch cast int64 -> double"); test_cast_impl(test_value, "batch cast uint64 -> int64"); test_cast_impl(test_value, "batch cast uint64 -> uint64"); test_cast_impl(test_value, "batch cast uint64 -> double"); } for (const auto& test_value : float_test_values) { test_cast_impl(test_value, "batch cast float -> int32"); test_cast_impl(test_value, "batch cast float -> uint32"); test_cast_impl(test_value, "batch cast float -> float"); } for (const auto& test_value : double_test_values) { test_cast_impl(test_value, "batch cast double -> int64"); test_cast_impl(test_value, "batch cast double -> uint64"); test_cast_impl(test_value, "batch cast double -> double"); } } #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION template typename std::enable_if= 32, void>::type test_cast_sizeshift1() const { for (const auto& test_value : int_test_values) { test_cast_impl(test_value, "batch cast int8 -> int16"); test_cast_impl(test_value, "batch cast int8 -> uint16"); test_cast_impl(test_value, "batch cast uint8 -> int16"); test_cast_impl(test_value, "batch cast uint8 -> uint16"); test_cast_impl(test_value, "batch cast int16 -> int8"); test_cast_impl(test_value, "batch cast int16 -> uint8"); test_cast_impl(test_value, "batch cast int16 -> int32"); test_cast_impl(test_value, "batch cast int16 -> uint32"); test_cast_impl(test_value, "batch cast int16 -> float"); test_cast_impl(test_value, "batch cast uint16 -> int8"); test_cast_impl(test_value, "batch cast uint16 -> uint8"); test_cast_impl(test_value, "batch cast uint16 -> int32"); test_cast_impl(test_value, "batch cast uint16 -> uint32"); test_cast_impl(test_value, "batch cast uint16 -> float"); test_cast_impl(test_value, "batch cast int32 -> int16"); test_cast_impl(test_value, "batch cast int32 -> uint16"); test_cast_impl(test_value, "batch cast int32 -> int64"); test_cast_impl(test_value, "batch cast int32 -> uint64"); test_cast_impl(test_value, "batch cast int32 -> double"); test_cast_impl(test_value, "batch cast uint32 -> int16"); test_cast_impl(test_value, "batch cast uint32 -> uint16"); test_cast_impl(test_value, "batch cast uint32 -> int64"); test_cast_impl(test_value, "batch cast uint32 -> 
uint64"); test_cast_impl(test_value, "batch cast uint32 -> double"); test_cast_impl(test_value, "batch cast int64 -> int32"); test_cast_impl(test_value, "batch cast int64 -> uint32"); test_cast_impl(test_value, "batch cast int64 -> float"); test_cast_impl(test_value, "batch cast uint64 -> int32"); test_cast_impl(test_value, "batch cast uint64 -> uint32"); test_cast_impl(test_value, "batch cast uint64 -> float"); } for (const auto& test_value : float_test_values) { test_cast_impl(test_value, "batch cast float -> int16"); test_cast_impl(test_value, "batch cast float -> uint16"); test_cast_impl(test_value, "batch cast float -> int64"); test_cast_impl(test_value, "batch cast float -> uint64"); test_cast_impl(test_value, "batch cast float -> double"); } for (const auto& test_value : double_test_values) { test_cast_impl(test_value, "batch cast double -> int32"); test_cast_impl(test_value, "batch cast double -> uint32"); test_cast_impl(test_value, "batch cast double -> float"); } } template typename std::enable_if::type test_cast_sizeshift1() const { } #endif #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX512_VERSION template typename std::enable_if= 64, void>::type test_cast_sizeshift2() const { for (const auto& test_value : int_test_values) { test_cast_impl(test_value, "batch cast int8 -> int32"); test_cast_impl(test_value, "batch cast int8 -> uint32"); test_cast_impl(test_value, "batch cast int8 -> float"); test_cast_impl(test_value, "batch cast uint8 -> int32"); test_cast_impl(test_value, "batch cast uint8 -> uint32"); test_cast_impl(test_value, "batch cast uint8 -> float"); test_cast_impl(test_value, "batch cast int16 -> int64"); test_cast_impl(test_value, "batch cast int16 -> uint64"); test_cast_impl(test_value, "batch cast int16 -> double"); test_cast_impl(test_value, "batch cast uint16 -> int64"); test_cast_impl(test_value, "batch cast uint16 -> uint64"); test_cast_impl(test_value, "batch cast uint16 -> double"); test_cast_impl(test_value, "batch cast int32 -> int8"); test_cast_impl(test_value, "batch cast int32 -> uint8"); test_cast_impl(test_value, "batch cast uint32 -> int8"); test_cast_impl(test_value, "batch cast uint32 -> uint8"); test_cast_impl(test_value, "batch cast int64 -> int16"); test_cast_impl(test_value, "batch cast int64 -> uint16"); test_cast_impl(test_value, "batch cast uint64 -> int16"); test_cast_impl(test_value, "batch cast uint64 -> uint16"); } for (const auto& test_value : float_test_values) { test_cast_impl(test_value, "batch cast float -> int8"); test_cast_impl(test_value, "batch cast float -> uint8"); } for (const auto& test_value : double_test_values) { test_cast_impl(test_value, "batch cast double -> int16"); test_cast_impl(test_value, "batch cast double -> uint16"); } } template typename std::enable_if::type test_cast_sizeshift2() const { } #endif private: template void test_cast_impl(T test_value, const std::string& name) const { using T_in = typename B_in::value_type; using T_out = typename B_out::value_type; static constexpr std::size_t N_common = B_in::size < B_out::size ? 
B_in::size : B_out::size; using B_common_in = xsimd::batch; using B_common_out = xsimd::batch; T_in in_test_value = static_cast(test_value); if (detail::is_convertible(in_test_value)) { B_common_out res = xsimd::batch_cast(B_common_in(in_test_value)); EXPECT_SCALAR_EQ(res[0], static_cast(in_test_value)) << print_function_name(name); } } }; TYPED_TEST_SUITE(batch_cast_test, conversion_types, conversion_test_names); TYPED_TEST(batch_cast_test, cast) { this->test_cast(); } #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION TYPED_TEST(batch_cast_test, cast_sizeshift1) { this->test_cast_sizeshift1(); } #endif #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX512_VERSION TYPED_TEST(batch_cast_test, cast_sizeshift2) { this->test_cast_sizeshift2(); } #endif xsimd-7.6.0/test/test_batch_complex.cpp000066400000000000000000000607071410101234500202030ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #include #include #include #include "test_utils.hpp" using namespace std::placeholders; template class batch_complex_test : public testing::Test { protected: using batch_type = B; using real_batch_type = typename B::real_batch; using value_type = typename B::value_type; using real_value_type = typename value_type::value_type; static constexpr size_t size = B::size; using array_type = std::array; using bool_array_type = std::array; using real_array_type = std::array; array_type lhs; array_type rhs; value_type scalar; real_value_type real_scalar; batch_complex_test() { scalar = value_type(real_value_type(1.4), real_value_type(2.3)); real_scalar = scalar.real(); for (size_t i = 0; i < size; ++i) { lhs[i] = value_type(real_value_type(i) / real_value_type(4) + real_value_type(1.2) * std::sqrt(real_value_type(i + 0.25)), real_value_type(i) / real_value_type(5)); rhs[i] = value_type(real_value_type(10.2) / real_value_type(i + 2) + real_value_type(0.25), real_value_type(i) / real_value_type(3.2)); } } void test_load_store() const { { array_type res; batch_type b; b.load_unaligned(lhs.data()); b.store_unaligned(res.data()); EXPECT_EQ(res, lhs) << print_function_name("load_unaligned / store_unaligned complex*"); alignas(XSIMD_DEFAULT_ALIGNMENT) array_type arhs(this->rhs); alignas(XSIMD_DEFAULT_ALIGNMENT) array_type ares; b.load_aligned(arhs.data()); b.store_aligned(ares.data()); EXPECT_EQ(ares, rhs) << print_function_name("load_aligned / store_aligned complex*"); } { real_array_type real, imag, res_real, res_imag; for (size_t i = 0; i < size; ++i) { real[i] = lhs[i].real(); imag[i] = lhs[i].imag(); } batch_type b; b.load_unaligned(real.data(), imag.data()); b.store_unaligned(res_real.data(), res_imag.data()); EXPECT_EQ(res_real, real) << print_function_name("load_unaligned / store_unaligned (real*, real*)"); alignas(XSIMD_DEFAULT_ALIGNMENT) real_array_type areal, aimag, ares_real, ares_imag; for (size_t i = 0; i < size; ++i) { areal[i] = lhs[i].real(); aimag[i] = lhs[i].imag(); } b.load_aligned(areal.data(), aimag.data()); b.store_aligned(ares_real.data(), ares_imag.data()); EXPECT_EQ(ares_real, areal) << print_function_name("load_aligned / store_aligned (real*, real*)"); } { real_array_type real, res_real; for (size_t i = 0; i < size; ++i) 
{ real[i] = lhs[i].real(); } batch_type b; b.load_unaligned(real.data()); b.store_unaligned(res_real.data()); EXPECT_EQ(res_real, real) << print_function_name("load_unaligned / store_unaligned (real*)"); alignas(XSIMD_DEFAULT_ALIGNMENT) real_array_type areal, ares_real; for (size_t i = 0; i < size; ++i) { areal[i] = lhs[i].real(); } b.load_aligned(areal.data()); b.store_aligned(ares_real.data()); EXPECT_EQ(ares_real, areal) << print_function_name("load_aligned / store_aligned (real*)"); } } void test_constructors() const { array_type tmp; std::fill(tmp.begin(), tmp.end(), value_type(2, 3)); batch_type b0(value_type(2, 3)); EXPECT_EQ(b0, tmp) << print_function_name("batch(value_type)"); std::fill(tmp.begin(), tmp.end(), value_type(real_scalar)); batch_type b1(real_scalar); EXPECT_EQ(b1, tmp) << print_function_name("batch(real_value_type)"); real_array_type real, imag; for (size_t i = 0; i < size; ++i) { real[i] = lhs[i].real(); imag[i] = lhs[i].imag(); tmp[i] = value_type(real[i]); } batch_type b2(real.data()); EXPECT_EQ(b2, tmp) << print_function_name("batch(real_batch)"); batch_type b3(real.data(), imag.data()); EXPECT_EQ(b3, lhs) << print_function_name("batch(real_batch, real_batch)"); batch_type b4(real_batch_type(real.data())); EXPECT_EQ(b4, tmp) << print_function_name("batch(real_ptr)"); batch_type b5(real_batch_type(real.data()), real_batch_type(imag.data())); EXPECT_EQ(b5, lhs) << print_function_name("batch(real_ptr, real_ptr)"); } void test_access_operator() const { batch_type res = batch_lhs(); for (size_t i = 0; i < size; ++i) { EXPECT_EQ(res[i], lhs[i]) << print_function_name("operator[](") << i << ")"; } } void test_arithmetic() const { // +batch { array_type expected = lhs; batch_type res = +batch_lhs(); EXPECT_BATCH_EQ(res, expected) << print_function_name("+batch"); } // -batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::negate()); batch_type res = -batch_lhs(); EXPECT_BATCH_EQ(res, expected) << print_function_name("-batch"); } // batch + batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), std::plus()); batch_type res = batch_lhs() + batch_rhs(); EXPECT_BATCH_EQ(res, expected) << print_function_name("batch + batch"); } // batch + scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::plus(), _1, scalar)); batch_type lres = batch_lhs() + scalar; EXPECT_BATCH_EQ(lres, expected) << print_function_name("batch + scalar"); batch_type rres = scalar + batch_lhs(); EXPECT_BATCH_EQ(rres, expected) << print_function_name("scalar + batch"); } // batch + real_batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return l + r.real(); }); batch_type lres = batch_lhs() + batch_rhs().real(); EXPECT_BATCH_EQ(lres, expected) << print_function_name("batch + real_batch"); batch_type rres = batch_rhs().real() + batch_lhs(); EXPECT_BATCH_EQ(rres, expected) << print_function_name("real_batch + batch"); } // batch + real_scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::plus(), _1, real_scalar)); batch_type lres = batch_lhs() + real_scalar; EXPECT_BATCH_EQ(lres, expected) << print_function_name("batch + real_scalar"); batch_type rres = real_scalar + batch_lhs(); EXPECT_BATCH_EQ(rres, expected) << print_function_name("real_scalar + batch"); } // batch - batch { array_type expected; std::transform(lhs.cbegin(), 
lhs.cend(), rhs.cbegin(), expected.begin(), std::minus()); batch_type res = batch_lhs() - batch_rhs(); EXPECT_BATCH_EQ(res, expected) << print_function_name("batch - batch"); } // batch - scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::minus(), _1, scalar)); batch_type lres = batch_lhs() - scalar; EXPECT_BATCH_EQ(lres, expected) << print_function_name("batch - scalar"); std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::minus(), scalar, _1)); batch_type rres = scalar - batch_lhs(); EXPECT_BATCH_EQ(rres, expected) << print_function_name("scalar - batch"); } // batch - real_batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return l - r.real(); }); batch_type lres = batch_lhs() - batch_rhs().real(); EXPECT_BATCH_EQ(lres, expected) << print_function_name("batch - real_batch"); std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return r.real() - l; }); batch_type rres = batch_rhs().real() - batch_lhs(); EXPECT_BATCH_EQ(rres, expected) << print_function_name("real_batch - batch"); } // batch - real_scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::minus(), _1, real_scalar)); batch_type lres = batch_lhs() - real_scalar; EXPECT_BATCH_EQ(lres, expected) << print_function_name("batch - real_scalar"); std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::minus(), real_scalar, _1)); batch_type rres = real_scalar - batch_lhs(); EXPECT_BATCH_EQ(rres, expected) << print_function_name("real_scalar - batch"); } // batch * batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), std::multiplies()); batch_type res = batch_lhs() * batch_rhs(); EXPECT_BATCH_EQ(res, expected) << print_function_name("batch * batch"); } // batch * scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::multiplies(), _1, scalar)); batch_type lres = batch_lhs() * scalar; EXPECT_BATCH_EQ(lres, expected) << print_function_name("batch * scalar"); batch_type rres = scalar * batch_lhs(); EXPECT_BATCH_EQ(rres, expected) << print_function_name("scalar * batch"); } // batch * real_batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return l * r.real(); }); batch_type lres = batch_lhs() * batch_rhs().real(); EXPECT_BATCH_EQ(lres, expected) << print_function_name("batch * real_batch"); batch_type rres = batch_rhs().real() * batch_lhs(); EXPECT_BATCH_EQ(rres, expected) << print_function_name("real_batch * batch"); } // batch * real_scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::multiplies(), _1, real_scalar)); batch_type lres = batch_lhs() * real_scalar; EXPECT_BATCH_EQ(lres, expected) << print_function_name("batch * real_scalar"); batch_type rres = real_scalar * batch_lhs(); EXPECT_BATCH_EQ(rres, expected) << print_function_name("real_scalar * batch"); } // batch / batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), std::divides()); batch_type res = batch_lhs() / batch_rhs(); EXPECT_BATCH_EQ(res, expected) << print_function_name("batch / batch"); } // batch / scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), 
expected.begin(), std::bind(std::divides(), _1, scalar)); batch_type lres = batch_lhs() / scalar; EXPECT_BATCH_EQ(lres, expected) << print_function_name("batch / scalar"); std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::divides(), scalar, _1)); batch_type rres = scalar / batch_lhs(); EXPECT_BATCH_EQ(rres, expected) << print_function_name("scalar / batch"); } // batch / real_batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return l / r.real(); }); batch_type lres = batch_lhs() / batch_rhs().real(); EXPECT_BATCH_EQ(lres, expected) << print_function_name("batch / real_batch"); std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return r.real() / l; }); batch_type rres = batch_rhs().real() / batch_lhs(); EXPECT_BATCH_EQ(rres, expected) << print_function_name("real_batch / batch"); } // batch - real_scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::divides(), _1, real_scalar)); batch_type lres = batch_lhs() / real_scalar; EXPECT_BATCH_EQ(lres, expected) << print_function_name("batch / real_scalar"); std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::divides(), real_scalar, _1)); batch_type rres = real_scalar / batch_lhs(); EXPECT_BATCH_EQ(rres, expected) << print_function_name("real_scalar / batch"); } } void test_computed_assignment() const { // batch += batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), std::plus()); batch_type res = batch_lhs(); res += batch_rhs(); EXPECT_BATCH_EQ(res, expected) << print_function_name("batch += batch"); } // batch += scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::plus(), _1, scalar)); batch_type res = batch_lhs(); res += scalar; EXPECT_BATCH_EQ(res, expected) << print_function_name("batch += scalar"); } // batch += real_batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return l + r.real(); }); batch_type res = batch_lhs(); res += batch_rhs().real(); EXPECT_BATCH_EQ(res, expected) << print_function_name("batch += real_batch"); } // batch += real_scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::plus(), _1, real_scalar)); batch_type res = batch_lhs(); res += real_scalar; EXPECT_BATCH_EQ(res, expected) << print_function_name("batch += real_scalar"); } // batch -= batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), std::minus()); batch_type res = batch_lhs(); res -= batch_rhs(); EXPECT_BATCH_EQ(res, expected) << print_function_name("batch -= batch"); } // batch -= scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::minus(), _1, scalar)); batch_type res = batch_lhs(); res -= scalar; EXPECT_BATCH_EQ(res, expected) << print_function_name("batch -= scalar"); } // batch -= real_batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return l - r.real(); }); batch_type res = batch_lhs(); res -= batch_rhs().real(); EXPECT_BATCH_EQ(res, expected) << print_function_name("batch -= real_batch"); } // batch -= real_scalar { array_type expected; 
std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::minus(), _1, real_scalar)); batch_type res = batch_lhs(); res -= real_scalar; EXPECT_BATCH_EQ(res, expected) << print_function_name("batch -= real_scalar"); } // batch *= batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), std::multiplies()); batch_type res = batch_lhs(); res *= batch_rhs(); EXPECT_BATCH_EQ(res, expected) << print_function_name("batch *= batch"); } // batch *= scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::multiplies(), _1, scalar)); batch_type res = batch_lhs(); res *= scalar; EXPECT_BATCH_EQ(res, expected) << print_function_name("batch *= scalar"); } // batch *= real_batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return l * r.real(); }); batch_type res = batch_lhs(); res *= batch_rhs().real(); EXPECT_BATCH_EQ(res, expected) << print_function_name("batch *= real_batch"); } // batch *= real_scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::multiplies(), _1, real_scalar)); batch_type res = batch_lhs(); res *= real_scalar; EXPECT_BATCH_EQ(res, expected) << print_function_name("batch *= real_scalar"); } // batch /= batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), std::divides()); batch_type res = batch_lhs(); res /= batch_rhs(); EXPECT_BATCH_EQ(res, expected) << print_function_name("batch /= batch"); } // batch /= scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::divides(), _1, scalar)); batch_type res = batch_lhs(); res /= scalar; EXPECT_BATCH_EQ(res, expected) << print_function_name("batch /= scalar"); } // batch /= real_batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return l / r.real(); }); batch_type res = batch_lhs(); res /= batch_rhs().real(); EXPECT_BATCH_EQ(res, expected) << print_function_name("batch /= real_batch"); } // batch /= real_scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::divides(), _1, real_scalar)); batch_type res = batch_lhs(); res /= real_scalar; EXPECT_BATCH_EQ(res, expected) << print_function_name("batch /= real_scalar"); } } void test_conj_norm_proj() const { // conj { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [](const value_type& v) { using std::conj; return conj(v); }); batch_type res = conj(batch_lhs()); EXPECT_BATCH_EQ(res, expected) << print_function_name("conj"); } // norm { real_array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [](const value_type& v) { using std::norm; return norm(v); }); real_batch_type res = norm(batch_lhs()); EXPECT_BATCH_EQ(res, expected) << print_function_name("norm"); } // proj { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [](const value_type& v) { using std::proj; return proj(v); }); batch_type res = proj(batch_lhs()); EXPECT_BATCH_EQ(res, expected) << print_function_name("proj"); } } void test_horizontal_operations() const { // hadd { value_type expected = std::accumulate(lhs.cbegin(), lhs.cend(), value_type(0)); value_type res = hadd(batch_lhs()); EXPECT_SCALAR_EQ(res, expected) << print_function_name("hadd"); } } void 
test_fused_operations() const { // fma { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.begin(), expected.begin(), [](const value_type& l, const value_type& r) { return l * r + r; }); batch_type res = xsimd::fma(batch_lhs(), batch_rhs(), batch_rhs()); EXPECT_BATCH_EQ(res, expected) << print_function_name("fma"); } // fms { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.begin(), expected.begin(), [](const value_type& l, const value_type& r) { return l * r - r; }); batch_type res = fms(batch_lhs(), batch_rhs(), batch_rhs()); EXPECT_BATCH_EQ(res, expected) << print_function_name("fms"); } // fnma { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.begin(), expected.begin(), [](const value_type& l, const value_type& r) { return -l * r + r; }); batch_type res = fnma(batch_lhs(), batch_rhs(), batch_rhs()); EXPECT_BATCH_EQ(res, expected) << print_function_name("fnma"); } // fnms { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.begin(), expected.begin(), [](const value_type& l, const value_type& r) { return -l * r - r; }); batch_type res = fnms(batch_lhs(), batch_rhs(), batch_rhs()); EXPECT_BATCH_EQ(res, expected) << print_function_name("fnms"); } } private: batch_type batch_lhs() const { batch_type res; res.load_unaligned(lhs.data()); return res; } batch_type batch_rhs() const { batch_type res; res.load_unaligned(rhs.data()); return res; } }; TYPED_TEST_SUITE(batch_complex_test, batch_complex_types, simd_test_names); TYPED_TEST(batch_complex_test, load_store) { this->test_load_store(); } TYPED_TEST(batch_complex_test, constructors) { this->test_constructors(); } TYPED_TEST(batch_complex_test, access_operator) { this->test_access_operator(); } TYPED_TEST(batch_complex_test, arithmetic) { this->test_arithmetic(); } TYPED_TEST(batch_complex_test, computed_assignment) { this->test_computed_assignment(); } TYPED_TEST(batch_complex_test, conj_norm_proj) { this->test_conj_norm_proj(); } TYPED_TEST(batch_complex_test, horizontal_operations) { this->test_horizontal_operations(); } TYPED_TEST(batch_complex_test, fused_operations) { this->test_fused_operations(); } xsimd-7.6.0/test/test_batch_float.cpp000066400000000000000000000056601410101234500176360ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #include "test_utils.hpp" template class batch_float_test : public testing::Test { protected: using batch_type = B; using value_type = typename B::value_type; static constexpr size_t size = B::size; using array_type = std::array; using bool_array_type = std::array; array_type lhs; array_type rhs; batch_float_test() { for (size_t i = 0; i < size; ++i) { lhs[i] = value_type(i) / 4 + value_type(1.2) * std::sqrt(value_type(i + 0.25)); if (lhs[i] == value_type(0)) { lhs[i] += value_type(0.1); } rhs[i] = value_type(10.2) / (i + 2) + value_type(0.25); } } void test_sqrt() const { // sqrt { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [](const value_type& l) { return std::sqrt(l); }); batch_type res = sqrt(batch_lhs()); EXPECT_BATCH_EQ(res, expected) << print_function_name("sqrt"); } } void test_haddp() const { batch_type haddp_input[size]; for(size_t i = 0; i < size; i += 2) { haddp_input[i] = batch_lhs(); if(i + 1 < size) { haddp_input[i+1] = batch_rhs(); } } array_type expected; std::fill(expected.begin(), expected.end(), value_type(0)); for(size_t i = 0; i < size; ++i) { for(size_t j = 0; j < size; j += 2) { expected[j] += lhs[i]; if(j + 1 < size) { expected[j + 1] += rhs[i]; } } } auto res = haddp(haddp_input); EXPECT_BATCH_EQ(res, expected) << print_function_name("haddp"); } private: batch_type batch_lhs() const { return batch_type(lhs.data()); } batch_type batch_rhs() const { return batch_type(rhs.data()); } }; TYPED_TEST_SUITE(batch_float_test, batch_float_types, simd_test_names); TYPED_TEST(batch_float_test, sqrt) { this->test_sqrt(); } TYPED_TEST(batch_float_test, haddp) { this->test_haddp(); } xsimd-7.6.0/test/test_batch_int.cpp000066400000000000000000000253261410101234500173240ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #include "test_utils.hpp" namespace xsimd { template struct test_int_min_max { bool run() { return true; } }; template struct test_int_min_max> { void run() { using B = batch; using BB = batch_bool; using A = std::array; T max = std::numeric_limits::max(); T min = std::numeric_limits::min(); std::array maxmin_cmp{{max, min}}; B maxmin(max, min); EXPECT_BATCH_EQ(maxmin, maxmin_cmp) << print_function_name("numeric max and min"); B a(1, 3); B b(2); B c(2, 3); auto r1 = xsimd::max(a, c); auto r3 = xsimd::min(a, c); EXPECT_BATCH_EQ(r1, (A{{2, 3}})) << print_function_name("max"); EXPECT_BATCH_EQ(r3, (A{{1, 3}})) << print_function_name("min"); auto r4 = a < b; // test lt BB e4(1, 0); EXPECT_TRUE(xsimd::all(r4 == e4)); } }; template struct test_int_min_max> { void run() { using B = batch; using BB = batch_bool; using A = std::array; B a(1,3,1,1); B b(2); B c(2,3,2,3); auto r1 = xsimd::max(a, c); auto r3 = xsimd::min(a, c); EXPECT_BATCH_EQ(r1, (A{{2, 3, 2, 3}})) << print_function_name("max"); EXPECT_BATCH_EQ(r3, (A{{1, 3, 1, 1}})) << print_function_name("min"); auto r4 = a < b; // test lt BB e4(1,0,1,1); EXPECT_TRUE(xsimd::all(r4 == e4)); } }; template struct test_int_min_max> { void run() { using B = batch; using BB = batch_bool; using A = std::array; T max = std::numeric_limits::max(); T min = std::numeric_limits::min(); std::array maxmin_cmp{{0, 0, max, 0, min, 0, 0, 0}}; B maxmin(0, 0, max, 0, min, 0, 0, 0); EXPECT_BATCH_EQ(maxmin, maxmin_cmp) << print_function_name("numeric max and min"); B a(1,3,1,3, 1,1,3,3); B b(2); B c(2,3,2,3, 2,3,2,3); auto r1 = xsimd::max(a, c); auto r3 = xsimd::min(a, c); auto r4 = a < b; // test lt EXPECT_BATCH_EQ(r1, (A{{2, 3, 2, 3, 2, 3, 3, 3}})) << print_function_name("max"); EXPECT_BATCH_EQ(r3, (A{{1, 3, 1, 3, 1, 1, 2, 3}})) << print_function_name("min"); BB e4(1,0,1,0, 1,1,0,0); EXPECT_TRUE(xsimd::all(r4 == e4)); } }; template struct test_int_min_max> { void run() { using B = batch; using BB = batch_bool; using A = std::array; T max = std::numeric_limits::max(); T min = std::numeric_limits::min(); std::array maxmin_cmp{{0, 0, max, 0, min, 0, 0, 0, 0, 0, max, 0, min, 0, 0, 0}}; B maxmin(0, 0, max, 0, min, 0, 0, 0, 0, 0, max, 0, min, 0, 0, 0); EXPECT_BATCH_EQ(maxmin, maxmin_cmp) << print_function_name("numeric max and min"); B a(1,3,1,3, 1,3,1,3, 3,3,3,3, min,max,max,min); B b(2); B c(2,3,2,3, 2,3,2,3, 2,3,2,3, 2,3,2,3); auto r1 = xsimd::max(a, b); auto r3 = xsimd::min(a, b); auto r4 = a < b; // test lt auto r5 = a == c; auto r6 = a != c; EXPECT_BATCH_EQ(r1, (A{{2,3,2,3, 2,3,2,3, 3,3,3,3, 2,max,max,2}})) << print_function_name("max"); EXPECT_BATCH_EQ(r3, (A{{1,2,1,2, 1,2,1,2, 2,2,2,2, min,2,2,min}})) << print_function_name("min"); BB e4(1,0,1,0, 1,0,1,0, 0,0,0,0, 1,0,0,1); EXPECT_TRUE(xsimd::all(r4 == e4)); BB e5(0,1,0,1, 0,1,0,1, 0,1,0,1, 0,0,0,0); EXPECT_TRUE(xsimd::all(r5 == e5)); EXPECT_TRUE(xsimd::all(r6 == !e5)); } }; template struct test_int_min_max> { void run() { using B = batch; using BB = batch_bool; using A = std::array; T max = std::numeric_limits::max(); T min = std::numeric_limits::min(); B a(1,3,1,3, 1,3,1,3, 1,3,1,3, 1,3,1,3, 1,3,1,3, 1,3,1,3, 3,3,3,3, min,max,max,min); B b(2); B c(2,3,2,3, 2,3,2,3, 2,3,2,3, 2,3,2,3, 2,3,2,3, 2,3,2,3, 2,3,2,3, 2,3,2,3); auto r1 = xsimd::max(a, b); auto r3 = xsimd::min(a, b); auto r4 = a < b; // test lt EXPECT_BATCH_EQ(r1, (A{{2,3,2,3, 2,3,2,3, 2,3,2,3, 2,3,2,3, 2,3,2,3, 2,3,2,3, 3,3,3,3, 2,max,max,2}})) << print_function_name("max"); 
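// Note on the 32-lane case: the max check above clamps every lane of a that is below the splatted b(2) up to 2 and leaves the larger lanes (3, max) untouched; the min check below is its mirror image, keeping the smaller operand per lane, so the 3s collapse to 2 and the min sentinels survive.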
EXPECT_BATCH_EQ(r3, (A{{1,2,1,2, 1,2,1,2, 1,2,1,2, 1,2,1,2, 1,2,1,2, 1,2,1,2, 2,2,2,2, min,2,2,min}})) << print_function_name("min"); BB e4(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0, 0,0,0,0, 1,0,0,1); EXPECT_TRUE(xsimd::all(r4 == e4)); } }; } template class batch_int_test : public testing::Test { protected: using batch_type = B; using value_type = typename B::value_type; static constexpr size_t size = B::size; using array_type = std::array; using bool_array_type = std::array; array_type lhs; array_type rhs; array_type shift; batch_int_test() { using signed_value_type = typename std::make_signed::type; for (size_t i = 0; i < size; ++i) { bool negative_lhs = std::is_signed::value && (i % 2 == 1); lhs[i] = value_type(i) * (negative_lhs ? -10 : 10); if (lhs[i] == value_type(0)) { lhs[i] += value_type(1); } rhs[i] = value_type(i) + value_type(4); shift[i] = signed_value_type(i) % (CHAR_BIT * sizeof(value_type)); } } void test_modulo() const { // batch % batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return l % r; }); batch_type res = batch_lhs() % batch_rhs(); EXPECT_BATCH_EQ(res, expected) << print_function_name("batch % batch"); } } void test_shift() const { int32_t nb_sh = 3; // batch << scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [nb_sh](const value_type& v) { return v << nb_sh; }); batch_type res = batch_lhs() << nb_sh; EXPECT_BATCH_EQ(res, expected) << print_function_name("batch << scalar"); } // batch << batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), shift.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return l << r; }); batch_type res = batch_lhs() << batch_shift(); EXPECT_BATCH_EQ(res, expected) << print_function_name("batch << batch"); } // batch >> scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [nb_sh](const value_type& v) { return v >> nb_sh; }); batch_type res = batch_lhs() >> nb_sh; EXPECT_BATCH_EQ(res, expected) << print_function_name("batch >> scalar"); } // batch >> batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), shift.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return l >> r; }); batch_type res = batch_lhs() >> batch_shift(); EXPECT_BATCH_EQ(res, expected) << print_function_name("batch >> batch"); } } void test_more_shift() const { int32_t s = static_cast(sizeof(value_type) * 8); batch_type lhs = batch_type(value_type(1)); batch_type res; for (int32_t i = 0; i < s; ++i) { res = lhs << i; value_type expected = value_type(1) << i; for (std::size_t j = 0; j < size; ++j) { EXPECT_EQ(res[j], expected); } } lhs = batch_type(std::numeric_limits::max()); for (int32_t i = 0; i < s; ++i) { res = lhs >> value_type(i); value_type expected = std::numeric_limits::max() >> i; for (std::size_t j = 0; j < size; ++j) { EXPECT_EQ(res[j], expected); } } } void test_min_max() const { xsimd::test_int_min_max t; t.run(); } void test_less_than_underflow() const { batch_type test_negative_compare = batch_type(5) - 6; if (std::is_unsigned::value) { EXPECT_FALSE(xsimd::any(test_negative_compare < 1)); } else { EXPECT_TRUE(xsimd::all(test_negative_compare < 1)); } } private: batch_type batch_lhs() const { return batch_type(lhs.data()); } batch_type batch_rhs() const { return batch_type(rhs.data()); } batch_type batch_shift() const { return batch_type(shift.data()); } }; TYPED_TEST_SUITE(batch_int_test, 
batch_int_types, simd_test_names); TYPED_TEST(batch_int_test, modulo) { this->test_modulo(); } TYPED_TEST(batch_int_test, shift) { this->test_shift(); } TYPED_TEST(batch_int_test, more_shift) { this->test_more_shift(); } TYPED_TEST(batch_int_test, min_max) { this->test_min_max(); } TYPED_TEST(batch_int_test, less_than_underflow) { this->test_less_than_underflow(); } xsimd-7.6.0/test/test_bitwise_cast.cpp000066400000000000000000000155531410101234500200520ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #include "test_utils.hpp" template class bitwise_cast_test : public testing::Test { protected: static constexpr size_t N = CP::size; static constexpr size_t A = CP::alignment; using int32_batch = xsimd::batch; using int64_batch = xsimd::batch; using float_batch = xsimd::batch; using double_batch = xsimd::batch; using int32_vector = std::vector>; using int64_vector = std::vector>; using float_vector = std::vector>; using double_vector = std::vector>; int32_vector ftoi32_res; int32_vector dtoi32_res; int64_vector ftoi64_res; int64_vector dtoi64_res; float_vector i32tof_res; float_vector i64tof_res; float_vector dtof_res; double_vector i32tod_res; double_vector i64tod_res; double_vector ftod_res; bitwise_cast_test() : ftoi32_res(2 * N), dtoi32_res(2 * N), ftoi64_res(N), dtoi64_res(N), i32tof_res(2 * N), i64tof_res(2 * N), dtof_res(2 * N), i32tod_res(N), i64tod_res(N), ftod_res(N) { { int32_batch input = i32_input(); bitcast b; b.i32[0] = input[0]; b.i32[1] = input[1]; std::fill(i32tof_res.begin(), i32tof_res.end(), b.f[0]); std::fill(i32tod_res.begin(), i32tod_res.end(), b.d); } { int64_batch input = i64_input(); bitcast b; b.i64 = input[0]; std::fill(i64tod_res.begin(), i64tod_res.end(), b.d); for (size_t i = 0; i < N; ++i) { i64tof_res[2 * i] = b.f[0]; i64tof_res[2 * i + 1] = b.f[1]; } } { float_batch input = f_input(); bitcast b; b.f[0] = input[0]; b.f[1] = input[1]; std::fill(ftoi32_res.begin(), ftoi32_res.end(), b.i32[0]); std::fill(ftoi64_res.begin(), ftoi64_res.end(), b.i64); std::fill(ftod_res.begin(), ftod_res.end(), b.d); } { double_batch input = d_input(); bitcast b; b.d = input[0]; //std::fill(dtoi32_res.begin(), dtoi32_res.end(), b.i32[0]); std::fill(dtoi64_res.begin(), dtoi64_res.end(), b.i64); for (size_t i = 0; i < N; ++i) { dtoi32_res[2 * i] = b.i32[0]; dtoi32_res[2 * i + 1] = b.i32[1]; dtof_res[2 * i] = b.f[0]; dtof_res[2 * i + 1] = b.f[1]; } } } void test_to_int32() { int32_vector i32vres(int32_batch::size); { int32_batch i32bres = xsimd::bitwise_cast(f_input()); i32bres.store_aligned(i32vres.data()); EXPECT_VECTOR_EQ(i32vres, ftoi32_res) << print_function_name("to_int32(float)"); } { int32_batch i32bres = xsimd::bitwise_cast(d_input()); i32bres.store_aligned(i32vres.data()); EXPECT_VECTOR_EQ(i32vres, dtoi32_res) << print_function_name("to_int32(double)"); } } void test_to_int64() { int64_vector i64vres(int64_batch::size); { int64_batch i64bres = xsimd::bitwise_cast(f_input()); i64bres.store_aligned(i64vres.data()); EXPECT_VECTOR_EQ(i64vres, ftoi64_res) << print_function_name("to_int64(float)"); } { int64_batch i64bres = xsimd::bitwise_cast(d_input()); 
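// bitwise_cast reinterprets the raw bits of the double batch as int64_t lanes; no numeric conversion takes place. The store below is checked against dtoi64_res, which the constructor filled from the same bit pattern through the bitcast union (b.d = input[0]; ... b.i64).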
i64bres.store_aligned(i64vres.data()); EXPECT_VECTOR_EQ(i64vres, dtoi64_res) << print_function_name("to_int64(double)"); } } void test_to_float() { float_vector fvres(float_batch::size); { float_batch fbres = xsimd::bitwise_cast(i32_input()); fbres.store_aligned(fvres.data()); EXPECT_VECTOR_EQ(fvres, i32tof_res) << print_function_name("to_float(int32_t)"); } { float_batch fbres = xsimd::bitwise_cast(i64_input()); fbres.store_aligned(fvres.data()); EXPECT_VECTOR_EQ(fvres, i64tof_res) << print_function_name("to_float(int64_t)"); } { float_batch fbres = xsimd::bitwise_cast(d_input()); fbres.store_aligned(fvres.data()); EXPECT_VECTOR_EQ(fvres, dtof_res) << print_function_name("to_float(double)"); } } void test_to_double() { double_vector dvres(double_batch::size); { double_batch dbres = xsimd::bitwise_cast(i32_input()); dbres.store_aligned(dvres.data()); EXPECT_VECTOR_EQ(dvres, i32tod_res) << print_function_name("to_double(int32_t)"); } { double_batch dbres = xsimd::bitwise_cast(i64_input()); dbres.store_aligned(dvres.data()); EXPECT_VECTOR_EQ(dvres, i64tod_res) << print_function_name("to_double(int64_t)"); } { double_batch dbres = xsimd::bitwise_cast(f_input()); dbres.store_aligned(dvres.data()); EXPECT_VECTOR_EQ(dvres, ftod_res) << print_function_name("to_double(float)"); } } private: int32_batch i32_input() const { return int32_batch(2); } int64_batch i64_input() const { return int64_batch(2); } float_batch f_input() const { return float_batch(3.); } double_batch d_input() const { return double_batch(2.5e17); } union bitcast { float f[2]; int32_t i32[2]; int64_t i64; double d; }; }; TYPED_TEST_SUITE(bitwise_cast_test, conversion_types, conversion_test_names); TYPED_TEST(bitwise_cast_test, to_int32) { this->test_to_int32(); } TYPED_TEST(bitwise_cast_test, to_int64) { this->test_to_int64(); } TYPED_TEST(bitwise_cast_test, to_float) { this->test_to_float(); } TYPED_TEST(bitwise_cast_test, to_double) { this->test_to_double(); } xsimd-7.6.0/test/test_complex_exponential.cpp000066400000000000000000000156661410101234500214540ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #include "xsimd/math/xsimd_math_complex.hpp" #include "test_utils.hpp" template class complex_exponential_test : public testing::Test { protected: using batch_type = B; using real_batch_type = typename B::real_batch; using value_type = typename B::value_type; using real_value_type = typename value_type::value_type; static constexpr size_t size = B::size; using vector_type = std::vector; size_t nb_input; vector_type exp_input; vector_type huge_exp_input; vector_type log_input; vector_type expected; vector_type res; complex_exponential_test() { nb_input = 10000 * size; exp_input.resize(nb_input); huge_exp_input.resize(nb_input); log_input.resize(nb_input); for (size_t i = 0; i < nb_input; ++i) { exp_input[i] = value_type(real_value_type(-1.5) + i * real_value_type(3) / nb_input, real_value_type(-1.3) + i * real_value_type(2) / nb_input); huge_exp_input[i] = value_type(real_value_type(0), real_value_type(102.12) + i * real_value_type(100.) 
/ nb_input); log_input[i] = value_type(real_value_type(0.001 + i * 100 / nb_input), real_value_type(0.002 + i * 110 / nb_input)); } expected.resize(nb_input); res.resize(nb_input); } void test_exp() { std::transform(exp_input.cbegin(), exp_input.cend(), expected.begin(), [](const value_type& v) { using std::exp; return exp(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, exp_input, i); out = exp(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("exp"); } void test_expm1() { std::transform(exp_input.cbegin(), exp_input.cend(), expected.begin(), [](const value_type& v) { using xsimd::expm1; return expm1(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, exp_input, i); out = expm1(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("expm1"); } void test_huge_exp() { std::transform(huge_exp_input.cbegin(), huge_exp_input.cend(), expected.begin(), [](const value_type& v) { using std::exp; return exp(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, huge_exp_input, i); out = exp(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("huge exp"); } void test_log() { std::transform(log_input.cbegin(), log_input.cend(), expected.begin(), [](const value_type& v) { using std::log; return log(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, log_input, i); out = log(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("log"); } void test_log2() { std::transform(log_input.cbegin(), log_input.cend(), expected.begin(), [](const value_type& v) { using xsimd::log2; return log2(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, log_input, i); out = log2(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("log2"); } void test_log10() { std::transform(log_input.cbegin(), log_input.cend(), expected.begin(), [](const value_type& v) { using std::log10; return log10(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, log_input, i); out = log10(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("log10"); } void test_log1p() { std::transform(log_input.cbegin(), log_input.cend(), expected.begin(), [](const value_type& v) { using xsimd::log1p; return log1p(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, log_input, i); out = log1p(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("log1p"); } void test_sign() { std::transform(log_input.cbegin(), log_input.cend(), expected.begin(), [](const value_type& v) { using xsimd::sign; return sign(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, log_input, i); out = sign(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("sign"); } }; 
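// The tests above all follow the same load / transform / store pattern that user code
// would use with these complex batches. A minimal, self-contained sketch of that
// pattern (the lane count 2 and the function name exp_inplace are illustrative
// assumptions for an SSE-class target, not part of this test):
//
//     #include <complex>
//     #include <vector>
//     #include "xsimd/xsimd.hpp"
//
//     void exp_inplace(std::vector<std::complex<double>>& data)
//     {
//         using batch = xsimd::batch<std::complex<double>, 2>;
//         std::size_t i = 0;
//         for (; i + batch::size <= data.size(); i += batch::size)
//         {
//             batch b;
//             b.load_unaligned(data.data() + i);   // complex lanes, unaligned load
//             b = exp(b);                          // ADL picks up xsimd::exp
//             b.store_unaligned(data.data() + i);
//         }
//         for (; i < data.size(); ++i)             // scalar tail
//         {
//             data[i] = std::exp(data[i]);
//         }
//     }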
TYPED_TEST_SUITE(complex_exponential_test, batch_complex_types, simd_test_names); TYPED_TEST(complex_exponential_test, exp) { this->test_exp(); } TYPED_TEST(complex_exponential_test, expm1) { this->test_expm1(); } TYPED_TEST(complex_exponential_test, huge_exp) { this->test_huge_exp(); } TYPED_TEST(complex_exponential_test, log) { this->test_log(); } TYPED_TEST(complex_exponential_test, log2) { this->test_log2(); } TYPED_TEST(complex_exponential_test, log10) { this->test_log10(); } TYPED_TEST(complex_exponential_test, log1p) { this->test_log1p(); } TYPED_TEST(complex_exponential_test, sign) { this->test_sign(); } xsimd-7.6.0/test/test_complex_hyperbolic.cpp000066400000000000000000000133411410101234500212520ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #include "test_utils.hpp" template class complex_hyperbolic_test : public testing::Test { protected: using batch_type = B; using real_batch_type = typename B::real_batch; using value_type = typename B::value_type; using real_value_type = typename value_type::value_type; static constexpr size_t size = B::size; using vector_type = std::vector; size_t nb_input; vector_type input; vector_type acosh_input; vector_type atanh_input; vector_type expected; vector_type res; complex_hyperbolic_test() { nb_input = 10000 * size; input.resize(nb_input); acosh_input.resize(nb_input); atanh_input.resize(nb_input); for (size_t i = 0; i < nb_input; ++i) { input[i] = value_type(real_value_type(-1.5) + i * real_value_type(3) / nb_input, real_value_type(-1.3) + i * real_value_type(2.5) / nb_input); acosh_input[i] = value_type(real_value_type(1.) 
+ i * real_value_type(3) / nb_input, real_value_type(1.2) + i * real_value_type(2.7) / nb_input); atanh_input[i] = value_type(real_value_type(-0.95) + i * real_value_type(1.9) / nb_input, real_value_type(-0.94) + i * real_value_type(1.8) / nb_input); } expected.resize(nb_input); res.resize(nb_input); } void test_sinh() { std::transform(input.cbegin(), input.cend(), expected.begin(), [](const value_type& v) { using std::sinh; return sinh(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, input, i); out = sinh(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("sinh"); } void test_cosh() { std::transform(input.cbegin(), input.cend(), expected.begin(), [](const value_type& v) { using std::cosh; return cosh(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, input, i); out = cosh(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("cosh"); } void test_tanh() { std::transform(input.cbegin(), input.cend(), expected.begin(), [](const value_type& v) { using std::tanh; return tanh(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, input, i); out = tanh(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("tanh"); } void test_asinh() { std::transform(input.cbegin(), input.cend(), expected.begin(), [](const value_type& v) { using std::asinh; return asinh(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, input, i); out = asinh(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("asinh"); } void test_acosh() { std::transform(acosh_input.cbegin(), acosh_input.cend(), expected.begin(), [](const value_type& v) { using std::acosh; return acosh(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, acosh_input, i); out = acosh(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("acosh"); } void test_atanh() { std::transform(atanh_input.cbegin(), atanh_input.cend(), expected.begin(), [](const value_type& v) { using std::atanh; return atanh(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, atanh_input, i); out = atanh(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("atanh"); } }; TYPED_TEST_SUITE(complex_hyperbolic_test, batch_complex_types, simd_test_names); TYPED_TEST(complex_hyperbolic_test, sinh) { this->test_sinh(); } TYPED_TEST(complex_hyperbolic_test, cosh) { this->test_cosh(); } TYPED_TEST(complex_hyperbolic_test, tanh) { this->test_tanh(); } TYPED_TEST(complex_hyperbolic_test, asinh) { this->test_asinh(); } TYPED_TEST(complex_hyperbolic_test, acosh) { this->test_acosh(); } TYPED_TEST(complex_hyperbolic_test, atanh) { this->test_atanh(); } xsimd-7.6.0/test/test_complex_power.cpp000066400000000000000000000165451410101234500202570ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * 
Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #include "test_utils.hpp" template class complex_power_test : public testing::Test { protected: using batch_type = B; using real_batch_type = typename B::real_batch; using value_type = typename B::value_type; using real_value_type = typename value_type::value_type; static constexpr size_t size = B::size; using vector_type = std::vector; using real_vector_type = std::vector; size_t nb_input; vector_type lhs_nn; vector_type lhs_pn; vector_type lhs_np; vector_type lhs_pp; vector_type rhs; vector_type expected; vector_type res; complex_power_test() { nb_input = 10000 * size; lhs_nn.resize(nb_input); lhs_pn.resize(nb_input); lhs_np.resize(nb_input); lhs_pp.resize(nb_input); rhs.resize(nb_input); for (size_t i = 0; i < nb_input; ++i) { real_value_type real = (real_value_type(i) / 4 + real_value_type(1.2) * std::sqrt(real_value_type(i + 0.25)))/ 100; real_value_type imag = (real_value_type(i) / 7 + real_value_type(1.7) * std::sqrt(real_value_type(i + 0.37))) / 100; lhs_nn[i] = value_type(-real, -imag); lhs_pn[i] = value_type(real, -imag); lhs_np[i] = value_type(-real, imag); lhs_pp[i] = value_type(real, imag); rhs[i] = value_type(real_value_type(10.2) / (i + 2) + real_value_type(0.25), real_value_type(9.1) / (i + 3) + real_value_type(0.45)); } expected.resize(nb_input); res.resize(nb_input); } void test_abs() { real_vector_type real_expected(nb_input), real_res(nb_input); std::transform(lhs_np.cbegin(), lhs_np.cend(), real_expected.begin(), [](const value_type& v) { using std::abs; return abs(v); }); batch_type in; real_batch_type out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, lhs_np, i); out = abs(in); detail::store_batch(out, real_res, i); } size_t diff = detail::get_nb_diff(real_res, real_expected); EXPECT_EQ(diff, 0) << print_function_name("abs"); } void test_arg() { real_vector_type real_expected(nb_input), real_res(nb_input); std::transform(lhs_np.cbegin(), lhs_np.cend(), real_expected.begin(), [](const value_type& v) { using std::arg; return arg(v); }); batch_type in; real_batch_type out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, lhs_np, i); out = arg(in); detail::store_batch(out, real_res, i); } size_t diff = detail::get_nb_diff(real_res, real_expected); EXPECT_EQ(diff, 0) << print_function_name("arg"); } void test_pow() { test_conditional_pow(); } void test_sqrt_nn() { std::transform(lhs_nn.cbegin(), lhs_nn.cend(), expected.begin(), [](const value_type& v) { using std::sqrt; return sqrt(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, lhs_nn, i); out = sqrt(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("sqrt_nn"); } void test_sqrt_pn() { std::transform(lhs_pn.cbegin(), lhs_pn.cend(), expected.begin(), [](const value_type& v) { using std::sqrt; return sqrt(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, lhs_pn, i); out = sqrt(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("sqrt_pn"); } void test_sqrt_np() { std::transform(lhs_np.cbegin(), lhs_np.cend(), expected.begin(), [](const value_type& v) { using std::sqrt; return sqrt(v); }); batch_type in, out; for 
(size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, lhs_np, i); out = sqrt(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("sqrt_nn"); } void test_sqrt_pp() { std::transform(lhs_pp.cbegin(), lhs_pp.cend(), expected.begin(), [](const value_type& v) { using std::sqrt; return sqrt(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, lhs_pp, i); out = sqrt(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("sqrt_pp"); } private: void test_pow_impl() { std::transform(lhs_np.cbegin(), lhs_np.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { using std::pow; return pow(l, r); }); batch_type lhs_in, rhs_in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(lhs_in, lhs_np, i); detail::load_batch(rhs_in, rhs, i); out = pow(lhs_in, rhs_in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("pow"); } template ::value, int>::type = 0> void test_conditional_pow() { test_pow_impl(); } template ::value, int>::type = 0> void test_conditional_pow() { #if (XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX512_VERSION) || (XSIMD_ARM_INSTR_SET >= XSIMD_ARM7_NEON_VERSION) #if DEBUG_ACCURACY test_pow_impl(); #endif #else test_pow_impl(); #endif } }; TYPED_TEST_SUITE(complex_power_test, batch_complex_types, simd_test_names); TYPED_TEST(complex_power_test, abs) { this->test_abs(); } TYPED_TEST(complex_power_test, arg) { this->test_arg(); } TYPED_TEST(complex_power_test, pow) { this->test_pow(); } TYPED_TEST(complex_power_test, sqrt_nn) { this->test_sqrt_nn(); } TYPED_TEST(complex_power_test, sqrt_pn) { this->test_sqrt_pn(); } TYPED_TEST(complex_power_test, sqrt_np) { this->test_sqrt_np(); } TYPED_TEST(complex_power_test, sqrt_pp) { this->test_sqrt_pp(); } xsimd-7.6.0/test/test_complex_trigonometric.cpp000066400000000000000000000163361410101234500220060ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #include "xsimd/math/xsimd_math_complex.hpp" #include "test_utils.hpp" template class complex_trigonometric_test : public testing::Test { protected: using batch_type = B; using real_batch_type = typename B::real_batch; using value_type = typename B::value_type; using real_value_type = typename value_type::value_type; static constexpr size_t size = B::size; using vector_type = std::vector; size_t nb_input; vector_type input; vector_type ainput; vector_type atan_input; vector_type expected; vector_type res; complex_trigonometric_test() { nb_input = size * 10000; input.resize(nb_input); ainput.resize(nb_input); atan_input.resize(nb_input); for (size_t i = 0; i < nb_input; ++i) { input[i] = value_type(real_value_type(0.) + i * real_value_type(80.) / nb_input, real_value_type(0.1) + i * real_value_type(56.) / nb_input); ainput[i] = value_type(real_value_type(-1.) + real_value_type(2.) 
* i / nb_input, real_value_type(-1.1) + real_value_type(2.1) * i / nb_input); atan_input[i] = value_type(real_value_type(-10.) + i * real_value_type(20.) / nb_input, real_value_type(-9.) + i * real_value_type(21.) / nb_input); } expected.resize(nb_input); res.resize(nb_input); } void test_sin() { std::transform(input.cbegin(), input.cend(), expected.begin(), [](const value_type& v) { using std::sin; return sin(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, input, i); out = sin(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("sin"); } void test_cos() { std::transform(input.cbegin(), input.cend(), expected.begin(), [](const value_type& v) { using std::cos; return cos(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, input, i); out = cos(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("cos"); } void test_sincos() { vector_type expected2(nb_input), res2(nb_input); std::transform(input.cbegin(), input.cend(), expected.begin(), [](const value_type& v) { using std::sin; return sin(v); }); std::transform(input.cbegin(), input.cend(), expected2.begin(), [](const value_type& v) { using std::cos; return cos(v); }); batch_type in, out1, out2; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, input, i); sincos(in, out1, out2); detail::store_batch(out1, res, i); detail::store_batch(out2, res2, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("sincos(sin)"); diff = detail::get_nb_diff(res2, expected2); EXPECT_EQ(diff, 0) << print_function_name("sincos(cos)"); } void test_tan() { test_conditional_tan(); } void test_asin() { std::transform(ainput.cbegin(), ainput.cend(), expected.begin(), [](const value_type& v) { using std::asin; return asin(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, ainput, i); out = asin(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("asin"); } void test_acos() { std::transform(ainput.cbegin(), ainput.cend(), expected.begin(), [](const value_type& v) { using std::acos; return acos(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, ainput, i); out = acos(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("acos"); } void test_atan() { std::transform(atan_input.cbegin(), atan_input.cend(), expected.begin(), [](const value_type& v) { using std::atan; return atan(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, atan_input, i); out = atan(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("atan"); } private: void test_tan_impl() { std::transform(input.cbegin(), input.cend(), expected.begin(), [](const value_type& v) { using std::tan; return tan(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, input, i); out = tan(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("tan"); } template ::value, int>::type = 0> void test_conditional_tan() { 
test_tan_impl(); } template ::value, int>::type = 0> void test_conditional_tan() { #if (XSIMD_ARM_INSTR_SET >= XSIMD_ARM7_NEON_VERSION) #if DEBUG_ACCURACY test_tan_impl(); #endif #else test_tan_impl(); #endif } }; TYPED_TEST_SUITE(complex_trigonometric_test, batch_complex_types, simd_test_names); TYPED_TEST(complex_trigonometric_test, sin) { this->test_sin(); } TYPED_TEST(complex_trigonometric_test, cos) { this->test_cos(); } TYPED_TEST(complex_trigonometric_test, sincos) { this->test_sincos(); } TYPED_TEST(complex_trigonometric_test, tan) { this->test_tan(); } TYPED_TEST(complex_trigonometric_test, asin) { this->test_asin(); } TYPED_TEST(complex_trigonometric_test, acos) { this->test_acos(); } TYPED_TEST(complex_trigonometric_test, atan) { this->test_atan(); } xsimd-7.6.0/test/test_constant_batch.cpp000066400000000000000000000110751410101234500203570ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Serge Guelton * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #include "test_utils.hpp" using namespace std::placeholders; template class constant_batch_test : public testing::Test { protected: using batch_type = B; using value_type = typename B::value_type; static constexpr size_t size = B::size; using array_type = std::array; using bool_array_type = std::array; struct generator { static constexpr value_type get(size_t index, size_t /*size*/) { return index % 2 ? 0 : 1; } }; void test_init_from_generator() const { array_type expected; size_t i = 0; std::generate(expected.begin(), expected.end(), [&i]() { return generator::get(i++, size); }); constexpr auto b = xsimd::make_batch_constant(); EXPECT_BATCH_EQ(b(), expected) << print_function_name("batch(value_type)"); } struct arange { static constexpr value_type get(size_t index, size_t /*size*/) { return index; } }; void test_init_from_generator_arange() const { array_type expected; size_t i = 0; std::generate(expected.begin(), expected.end(), [&i]() { return arange::get(i++, size); }); constexpr auto b = xsimd::make_batch_constant(); EXPECT_BATCH_EQ(b(), expected) << print_function_name("batch(value_type)"); } struct constant { static constexpr value_type get(size_t /*index*/, size_t /*size*/) { return 3; } }; void test_init_from_constant() const { array_type expected; std::fill(expected.begin(), expected.end(), constant::get(0, 0)); constexpr auto b = xsimd::make_batch_constant(); EXPECT_BATCH_EQ(b(), expected) << print_function_name("batch(value_type)"); } }; TYPED_TEST_SUITE(constant_batch_test, batch_int_types, simd_test_names); TYPED_TEST(constant_batch_test, init_from_generator) { this->test_init_from_generator(); } TYPED_TEST(constant_batch_test, init_from_generator_arange) { this->test_init_from_generator_arange(); } TYPED_TEST(constant_batch_test, init_from_constant) { this->test_init_from_constant(); } template class constant_bool_batch_test : public testing::Test { protected: using batch_type = B; using value_type = typename B::value_type; static constexpr size_t size = B::size; using array_type = std::array; using bool_array_type = std::array; struct generator { static constexpr bool get(size_t index, size_t /*size*/) { return index % 2; } }; void test_init_from_generator() const { bool_array_type expected; size_t i = 0; std::generate(expected.begin(), 
expected.end(), [&i]() { return generator::get(i++, size); }); constexpr auto b = xsimd::make_batch_bool_constant(); EXPECT_BATCH_EQ(b(), expected) << print_function_name("batch_bool_constant(value_type)"); } struct split { static constexpr bool get(size_t index, size_t size) { return index < size / 2; } }; void test_init_from_generator_split() const { bool_array_type expected; size_t i = 0; std::generate(expected.begin(), expected.end(), [&i]() { return split::get(i++, size); }); constexpr auto b = xsimd::make_batch_bool_constant(); EXPECT_BATCH_EQ(b(), expected) << print_function_name("batch_bool_constant(value_type)"); } }; TYPED_TEST_SUITE(constant_bool_batch_test, batch_int_types, simd_test_names); TYPED_TEST(constant_bool_batch_test, init_from_generator) { this->test_init_from_generator(); } TYPED_TEST(constant_bool_batch_test, init_from_generator_split) { this->test_init_from_generator_split(); } xsimd-7.6.0/test/test_conversion.cpp000066400000000000000000000137331410101234500175550ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #include "test_utils.hpp" template class conversion_test : public testing::Test { protected: static constexpr size_t N = CP::size; static constexpr size_t A = CP::alignment; using int32_batch = xsimd::batch; using int64_batch = xsimd::batch; using float_batch = xsimd::batch; using double_batch = xsimd::batch; using uint8_batch = xsimd::batch; using uint16_batch = xsimd::batch; using uint32_batch = xsimd::batch; using uint64_batch = xsimd::batch; using int32_vector = std::vector>; using int64_vector = std::vector>; using float_vector = std::vector>; using double_vector = std::vector>; using uint8_vector = std::vector>; /*int32_batch i32pos; int32_batch i32neg; int64_batch i64pos; int64_batch i64neg; float_batch fpos; float_batch fneg; double_batch dpos; double_batch dneg;*/ int32_vector fposres; int32_vector fnegres; int64_vector dposres; int64_vector dnegres; float_vector i32posres; float_vector i32negres; double_vector i64posres; double_vector i64negres; uint8_vector ui8res; conversion_test() : fposres(2 * N, 7), fnegres(2 * N, -6), dposres(N, 5), dnegres(N, -1), i32posres(2 * N, float(2)), i32negres(2 * N, float(-3)), i64posres(N, double(2)), i64negres(N, double(-3)), ui8res(8 * N, 4) { } void test_to_int32() { float_batch fpos(float(7.4)), fneg(float(-6.2)); int32_vector fvres(int32_batch::size); { int32_batch fbres = to_int(fpos); fbres.store_aligned(fvres.data()); EXPECT_VECTOR_EQ(fvres, fposres) << print_function_name("to_int(positive float)"); } { int32_batch fbres = to_int(fneg); fbres.store_aligned(fvres.data()); EXPECT_VECTOR_EQ(fvres, fnegres) << print_function_name("to_int(negative float)"); } } void test_to_int64() { double_batch dpos(double(5.4)), dneg(double(-1.2)); int64_vector dvres(int64_batch::size); { int64_batch dbres = to_int(dpos); dbres.store_aligned(dvres.data()); EXPECT_VECTOR_EQ(dvres, dposres) << print_function_name("to_int(positive double)"); } { int64_batch dbres = to_int(dneg); dbres.store_aligned(dvres.data()); EXPECT_VECTOR_EQ(dvres, dnegres) << print_function_name("to_int(negative double)"); } } void test_to_float() { int32_batch 
i32pos(2), i32neg(-3); float_vector i32vres(float_batch::size); { float_batch i32bres = to_float(i32pos); i32bres.store_aligned(i32vres.data()); EXPECT_VECTOR_EQ(i32vres, i32posres) << print_function_name("to_float(positive int32)"); } { float_batch i32bres = to_float(i32neg); i32bres.store_aligned(i32vres.data()); EXPECT_VECTOR_EQ(i32vres, i32negres) << print_function_name("to_float(negative int32)"); } } void test_to_double() { int64_batch i64pos(2), i64neg(-3); double_vector i64vres(double_batch::size); { double_batch i64bres = to_float(i64pos); i64bres.store_aligned(i64vres.data()); EXPECT_VECTOR_EQ(i64vres, i64posres) << print_function_name("to_float(positive int64)"); } { double_batch i64bres = to_float(i64neg); i64bres.store_aligned(i64vres.data()); EXPECT_VECTOR_EQ(i64vres, i64negres) << print_function_name("to_float(negative int64)"); } } void test_u8_casting() { uint8_batch ui8tmp(4); uint8_vector ui8vres(uint8_batch::size); { uint16_batch ui16casting = u8_to_u16(ui8tmp); uint8_batch ui8casting = u16_to_u8(ui16casting); ui8casting.store_aligned(ui8vres.data()); EXPECT_VECTOR_EQ(ui8vres, ui8res) << print_function_name("u8_to_16"); } { uint32_batch ui32casting = u8_to_u32(ui8tmp); uint8_batch ui8casting = u32_to_u8(ui32casting); ui8casting.store_aligned(ui8vres.data()); EXPECT_VECTOR_EQ(ui8vres, ui8res) << print_function_name("u8_to_32"); } { uint64_batch ui64casting = u8_to_u64(ui8tmp); uint8_batch ui8casting = u64_to_u8(ui64casting); ui8casting.store_aligned(ui8vres.data()); EXPECT_VECTOR_EQ(ui8vres, ui8res) << print_function_name("u8_to_64"); } } }; TYPED_TEST_SUITE(conversion_test, conversion_types, conversion_test_names); TYPED_TEST(conversion_test, to_int32) { this->test_to_int32(); } TYPED_TEST(conversion_test, to_int64) { this->test_to_int64(); } TYPED_TEST(conversion_test, to_float) { this->test_to_float(); } TYPED_TEST(conversion_test, to_double) { this->test_to_double(); } TYPED_TEST(conversion_test, u8_casting) { this->test_u8_casting(); } xsimd-7.6.0/test/test_error_gamma.cpp000066400000000000000000000126321410101234500176600ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #include "test_utils.hpp" template class error_gamma_test : public testing::Test { protected: using batch_type = B; using value_type = typename B::value_type; static constexpr size_t size = B::size; using vector_type = std::vector; size_t nb_input; vector_type input; vector_type gamma_input; vector_type gamma_neg_input; vector_type expected; vector_type res; error_gamma_test() { nb_input = size * 10000; input.resize(nb_input); gamma_input.resize(nb_input); gamma_neg_input.resize(nb_input); for (size_t i = 0; i < nb_input; ++i) { input[i] = value_type(-1.5) + i * value_type(3) / nb_input; gamma_input[i] = value_type(0.5) + i * value_type(3) / nb_input; gamma_neg_input[i] = value_type(-3.99) + i * value_type(0.9) / nb_input; } expected.resize(nb_input); res.resize(nb_input); } void test_error_functions() { // erf { std::transform(input.cbegin(), input.cend(), expected.begin(), [](const value_type& v) { return std::erf(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, input, i); out = erf(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("erf"); } // erfc { std::transform(input.cbegin(), input.cend(), expected.begin(), [](const value_type& v) { return std::erfc(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, input, i); out = erfc(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("erfc"); } } void test_gamma_functions() { // tgamma { std::transform(gamma_input.cbegin(), gamma_input.cend(), expected.begin(), [](const value_type& v) { return std::tgamma(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, gamma_input, i); out = tgamma(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("tgamma"); } // tgamma (negative input) { std::transform(gamma_neg_input.cbegin(), gamma_neg_input.cend(), expected.begin(), [](const value_type& v) { return std::tgamma(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, gamma_neg_input, i); out = tgamma(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("tgamma (negative input)"); } // lgamma { std::transform(gamma_input.cbegin(), gamma_input.cend(), expected.begin(), [](const value_type& v) { return std::lgamma(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, gamma_input, i); out = lgamma(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("lgamma"); } // tgamma (negative input) { std::transform(gamma_neg_input.cbegin(), gamma_neg_input.cend(), expected.begin(), [](const value_type& v) { return std::lgamma(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, gamma_neg_input, i); out = lgamma(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("lgamma (negative input)"); } } }; TYPED_TEST_SUITE(error_gamma_test, batch_float_types, simd_test_names); TYPED_TEST(error_gamma_test, error) { this->test_error_functions(); } 
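// The same striding pattern applies outside the harness when erf/erfc are applied to a
// plain buffer. A minimal sketch (the lane count 4 and the helper name vectorized_erf
// are assumptions for illustration, not taken from this file):
//
//     #include <cmath>
//     #include <vector>
//     #include "xsimd/xsimd.hpp"
//
//     void vectorized_erf(std::vector<float>& v)
//     {
//         using batch = xsimd::batch<float, 4>;
//         std::size_t i = 0;
//         for (; i + batch::size <= v.size(); i += batch::size)
//         {
//             batch b(v.data() + i);                // unaligned load from pointer
//             erf(b).store_unaligned(v.data() + i); // xsimd::erf found via ADL
//         }
//         for (; i < v.size(); ++i)                 // scalar remainder
//         {
//             v[i] = std::erf(v[i]);
//         }
//     }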
TYPED_TEST(error_gamma_test, gamma) { this->test_gamma_functions(); } xsimd-7.6.0/test/test_exponential.cpp000066400000000000000000000133021410101234500177060ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #include "test_utils.hpp" template class exponential_test : public testing::Test { protected: using batch_type = B; using value_type = typename B::value_type; static constexpr size_t size = B::size; using vector_type = std::vector; size_t nb_input; vector_type exp_input; vector_type log_input; vector_type expected; vector_type res; exponential_test() { nb_input = size * 10000; exp_input.resize(nb_input); log_input.resize(nb_input); for (size_t i = 0; i < nb_input; ++i) { exp_input[i] = value_type(-1.5) + i * value_type(3) / nb_input; log_input[i] = value_type(0.001 + i * 100 / nb_input); } expected.resize(nb_input); res.resize(nb_input); } void test_exponential_functions() { // exp { std::transform(exp_input.cbegin(), exp_input.cend(), expected.begin(), [](const value_type& v) { return std::exp(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, exp_input, i); out = exp(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("exp"); } // exp2 { std::transform(exp_input.cbegin(), exp_input.cend(), expected.begin(), [](const value_type& v) { return std::exp2(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, exp_input, i); out = exp2(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("exp2"); } // expm1 { std::transform(exp_input.cbegin(), exp_input.cend(), expected.begin(), [](const value_type& v) { return std::expm1(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, exp_input, i); out = expm1(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("expm1"); } } void test_log_functions() { // log { std::transform(log_input.cbegin(), log_input.cend(), expected.begin(), [](const value_type& v) { return std::log(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, log_input, i); out = log(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("log"); } // log2 { std::transform(log_input.cbegin(), log_input.cend(), expected.begin(), [](const value_type& v) { return std::log2(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, log_input, i); out = log2(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("log2"); } // log10 { std::transform(log_input.cbegin(), log_input.cend(), expected.begin(), [](const value_type& v) { return std::log10(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, log_input, i); out = log10(in); detail::store_batch(out, res, 
i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("log10"); } // log1p { std::transform(log_input.cbegin(), log_input.cend(), expected.begin(), [](const value_type& v) { return std::log1p(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, log_input, i); out = log1p(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("log1p"); } } }; TYPED_TEST_SUITE(exponential_test, batch_float_types, simd_test_names); TYPED_TEST(exponential_test, exp) { this->test_exponential_functions(); } TYPED_TEST(exponential_test, log) { this->test_log_functions(); } xsimd-7.6.0/test/test_extract_pair.cpp000066400000000000000000000056151410101234500200550ustar00rootroot00000000000000/*************************************************************************** * * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #include "test_utils.hpp" namespace xsimd { template struct init_extract_pair_base { using extract_vector_type = std::array; extract_vector_type lhs_in, rhs_in, exped; std::vector create_extract_vectors(const int index) { std::vector vects; vects.reserve(3); int num = static_cast(N); /* Generate input data: lhs, rhs */ for (int i = 0; i < num; ++i) { lhs_in[i] = 2*i + 1; rhs_in[i] = 2*i + 2; } vects.push_back(std::move(lhs_in)); vects.push_back(std::move(rhs_in)); /* Expected shuffle data */ for (int i = 0 ; i < (num - index); ++i) { exped[i] = lhs_in[i + index]; if(i < index) { exped[num - 1 - i] = rhs_in[index - 1 - i]; } } vects.push_back(std::move(exped)); return vects; } }; } template class extract_pair_test : public testing::Test { protected: using batch_type = B; using value_type = typename B::value_type; static constexpr size_t size = B::size; extract_pair_test() { std::cout << "shffle_extract_pair tests" << std::endl; } void extract_pair_128() { xsimd::init_extract_pair_base extract_pair_base; auto extract_pair_vecs = extract_pair_base.create_extract_vectors(1); auto v_lhs = extract_pair_vecs[0]; auto v_rhs = extract_pair_vecs[1]; auto v_exped = extract_pair_vecs[2]; B b_lhs, b_rhs, b_exped, b_res; b_lhs.load_unaligned(v_lhs.data()); b_rhs.load_unaligned(v_rhs.data()); b_exped.load_unaligned(v_exped.data()); /* Only Test 128bit */ if ((sizeof(value_type) * size) == 16) { b_res = xsimd::extract_pair(b_lhs, b_rhs, 1); EXPECT_BATCH_EQ(b_res, b_exped) << print_function_name("extract_pair 128 test"); } } }; TYPED_TEST_SUITE(extract_pair_test, batch_types, simd_test_names); TYPED_TEST(extract_pair_test, extract_pair_128) { this->extract_pair_128(); } xsimd-7.6.0/test/test_fp_manipulation.cpp000066400000000000000000000046721410101234500205570ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #include "test_utils.hpp" template class fp_manipulation_test : public testing::Test { protected: using batch_type = B; using value_type = typename B::value_type; static constexpr size_t size = B::size; using array_type = std::array; using int_value_type = xsimd::as_integer_t; using int_batch_type = xsimd::batch; array_type input; int_value_type exponent; fp_manipulation_test() { exponent = 5; for (size_t i = 0; i < size; ++i) { input[i] = value_type(i) / 4 + value_type(1.2) * std::sqrt(value_type(i + 0.25)); } } void test_fp_manipulations() const { int_batch_type bexp(exponent); // ldexp { array_type expected; std::transform(input.cbegin(), input.cend(), expected.begin(), [this](const value_type& v) { return std::ldexp(v, exponent); }); batch_type res = xsimd::ldexp(batch_input(), bexp); EXPECT_BATCH_EQ(res, expected) << print_function_name("ldexp"); } // frexp { array_type expected; std::transform(input.cbegin(), input.cend(), expected.begin(), [](const value_type& v) { int tmp; return std::frexp(v, &tmp); }); batch_type res = xsimd::frexp(batch_input(), bexp); EXPECT_BATCH_EQ(res, expected) << print_function_name("frexp"); } } private: batch_type batch_input() const { return batch_type(input.data()); } }; TYPED_TEST_SUITE(fp_manipulation_test, batch_float_types, simd_test_names); TYPED_TEST(fp_manipulation_test, fp_manipulations) { this->test_fp_manipulations(); } xsimd-7.6.0/test/test_hyperbolic.cpp000066400000000000000000000124251410101234500175250ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #include "test_utils.hpp" template class hyperbolic_test : public testing::Test { protected: using batch_type = B; using value_type = typename B::value_type; static constexpr size_t size = B::size; using vector_type = std::vector; size_t nb_input; vector_type input; vector_type acosh_input; vector_type atanh_input; vector_type expected; vector_type res; hyperbolic_test() { nb_input = size * 10000; input.resize(nb_input); acosh_input.resize(nb_input); atanh_input.resize(nb_input); for (size_t i = 0; i < nb_input; ++i) { input[i] = value_type(-1.5) + i * value_type(3) / nb_input; acosh_input[i] = value_type(1.) 
+ i * value_type(3) / nb_input; atanh_input[i] = value_type(-0.95) + i * value_type(1.9) / nb_input; } expected.resize(nb_input); res.resize(nb_input); } void test_hyperbolic_functions() { // sinh { std::transform(input.cbegin(), input.cend(), expected.begin(), [](const value_type& v) { return std::sinh(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, input, i); out = sinh(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("sinh"); } // cosh { std::transform(input.cbegin(), input.cend(), expected.begin(), [](const value_type& v) { return std::cosh(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, input, i); out = cosh(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("cosh"); } // tanh { std::transform(input.cbegin(), input.cend(), expected.begin(), [](const value_type& v) { return std::tanh(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, input, i); out = tanh(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("tanh"); } } void test_reciprocal_functions() { // asinh { std::transform(input.cbegin(), input.cend(), expected.begin(), [](const value_type& v) { return std::asinh(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, input, i); out = asinh(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("asinh"); } // acosh { std::transform(acosh_input.cbegin(), acosh_input.cend(), expected.begin(), [](const value_type& v) { return std::acosh(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, acosh_input, i); out = acosh(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("acosh"); } // atanh { std::transform(atanh_input.cbegin(), atanh_input.cend(), expected.begin(), [](const value_type& v) { return std::atanh(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, atanh_input, i); out = atanh(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("atanh"); } } }; TYPED_TEST_SUITE(hyperbolic_test, batch_float_types, simd_test_names); TYPED_TEST(hyperbolic_test, hyperbolic) { this->test_hyperbolic_functions(); } TYPED_TEST(hyperbolic_test, reciprocal) { this->test_reciprocal_functions(); } xsimd-7.6.0/test/test_interface.cpp000066400000000000000000000077721410101234500173360ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #include #include #include #include "gtest/gtest.h" #include "xsimd/config/xsimd_instruction_set.hpp" #ifdef XSIMD_INSTR_SET_AVAILABLE #include "xsimd/xsimd.hpp" struct interface_tester { std::vector> fvec; std::vector> ivec; std::vector> fres; std::vector> ires; interface_tester(); static const std::size_t SIZE = xsimd::simd_traits::size; }; interface_tester::interface_tester() : fvec(SIZE), ivec(SIZE), fres(SIZE), ires(SIZE) { std::iota(fvec.begin(), fvec.end(), 1.f); std::iota(ivec.begin(), ivec.end(), 1); } TEST(xsimd, set_simd) { interface_tester t; xsimd::simd_type r1 = xsimd::set_simd(t.fvec[0]); EXPECT_EQ(r1[0], t.fvec[0]); xsimd::simd_type r2 = xsimd::set_simd(t.ivec[0]); EXPECT_EQ(r2[0], t.fvec[0]); } TEST(xsimd, load_store_aligned) { interface_tester t; xsimd::simd_type r1 = xsimd::load_aligned(&t.fvec[0]); xsimd::store_aligned(&t.fres[0], r1); EXPECT_EQ(t.fvec, t.fres); xsimd::simd_type r2 = xsimd::load_aligned(&t.ivec[0]); xsimd::store_aligned(&t.fres[0], r2); EXPECT_EQ(t.fvec, t.fres); xsimd::simd_type r3 = xsimd::load_aligned(&t.fvec[0]); xsimd::store_aligned(&t.ires[0], r3); EXPECT_EQ(t.ivec, t.ires); } TEST(xsimd, load_store_unaligned) { interface_tester t; xsimd::simd_type r1 = xsimd::load_unaligned(&t.fvec[0]); xsimd::store_unaligned(&t.fres[0], r1); EXPECT_EQ(t.fvec, t.fres); xsimd::simd_type r2 = xsimd::load_unaligned(&t.ivec[0]); xsimd::store_unaligned(&t.fres[0], r2); EXPECT_EQ(t.fvec, t.fres); xsimd::simd_type r3 = xsimd::load_unaligned(&t.fvec[0]); xsimd::store_unaligned(&t.ires[0], r3); EXPECT_EQ(t.ivec, t.ires); } TEST(xsimd, load_store_simd_aligned) { interface_tester t; xsimd::simd_type r1 = xsimd::load_simd(&t.fvec[0], xsimd::aligned_mode()); xsimd::store_simd(&t.fres[0], r1, xsimd::aligned_mode()); EXPECT_EQ(t.fvec, t.fres); xsimd::simd_type r2 = xsimd::load_simd(&t.ivec[0], xsimd::aligned_mode()); xsimd::store_simd(&t.fres[0], r2, xsimd::aligned_mode()); EXPECT_EQ(t.fvec, t.fres); xsimd::simd_type r3 = xsimd::load_simd(&t.fvec[0], xsimd::aligned_mode()); xsimd::store_simd(&t.ires[0], r3, xsimd::aligned_mode()); EXPECT_EQ(t.ivec, t.ires); } TEST(xsimd, load_store_simd_unaligned) { interface_tester t; xsimd::simd_type r1 = xsimd::load_simd(&t.fvec[0], xsimd::unaligned_mode()); xsimd::store_simd(&t.fres[0], r1, xsimd::unaligned_mode()); EXPECT_EQ(t.fvec, t.fres); xsimd::simd_type r2 = xsimd::load_simd(&t.ivec[0], xsimd::unaligned_mode()); xsimd::store_simd(&t.fres[0], r2, xsimd::unaligned_mode()); EXPECT_EQ(t.fvec, t.fres); xsimd::simd_type r3 = xsimd::load_simd(&t.fvec[0], xsimd::unaligned_mode()); xsimd::store_simd(&t.ires[0], r3, xsimd::unaligned_mode()); EXPECT_EQ(t.ivec, t.ires); } #endif // XSIMD_INSTR_SET_AVAILABLE xsimd-7.6.0/test/test_load_store.cpp000066400000000000000000000132531410101234500175200ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #include #include "test_utils.hpp" template class load_store_test : public testing::Test { protected: using batch_type = B; using value_type = typename B::value_type; static constexpr size_t size = B::size; using array_type = std::array; using int8_vector_type = std::vector; using uint8_vector_type = std::vector; using int16_vector_type = std::vector; using uint16_vector_type = std::vector; using int32_vector_type = std::vector; using uint32_vector_type = std::vector; using int64_vector_type = std::vector; using uint64_vector_type = std::vector; #ifdef XSIMD_32_BIT_ABI using long_vector_type = std::vector; using ulong_vector_type = std::vector; #endif using float_vector_type = std::vector; using double_vector_type = std::vector; int8_vector_type i8_vec; uint8_vector_type ui8_vec; int16_vector_type i16_vec; uint16_vector_type ui16_vec; int32_vector_type i32_vec; uint32_vector_type ui32_vec; int64_vector_type i64_vec; uint64_vector_type ui64_vec; #ifdef XSIMD_32_BIT_ABI long_vector_type l_vec; ulong_vector_type ul_vec; #endif float_vector_type f_vec; double_vector_type d_vec; array_type expected; load_store_test() { init_test_vector(i8_vec); init_test_vector(ui8_vec); init_test_vector(i16_vec); init_test_vector(ui16_vec); init_test_vector(i32_vec); init_test_vector(ui32_vec); init_test_vector(i64_vec); init_test_vector(ui64_vec); #ifdef XSIMD_32_BIT_ABI init_test_vector(l_vec); init_test_vector(ul_vec); #endif init_test_vector(f_vec); init_test_vector(d_vec); } void test_load() { test_load_impl(i8_vec, "load int8_t"); test_load_impl(ui8_vec, "load uint8_t"); test_load_impl(i16_vec, "load int16_t"); test_load_impl(ui16_vec, "load uint16_t"); test_load_impl(i32_vec, "load int32_t"); test_load_impl(ui32_vec, "load uint32_t"); test_load_impl(i64_vec, "load int64_t"); test_load_impl(ui64_vec, "load uint64_t"); #ifdef XSIMD_32_BIT_ABI test_load_impl(l_vec, "load long"); test_load_impl(ul_vec, "load unsigned long"); #endif test_load_impl(f_vec, "load float"); test_load_impl(d_vec, "load double"); } void test_store() { test_store_impl(i8_vec, "load int8_t"); test_store_impl(ui8_vec, "load uint8_t"); test_store_impl(i16_vec, "load int16_t"); test_store_impl(ui16_vec, "load uint16_t"); test_store_impl(i32_vec, "load int32_t"); test_store_impl(ui32_vec, "load uint32_t"); test_store_impl(i64_vec, "load int64_t"); test_store_impl(ui64_vec, "load uint64_t"); #ifdef XSIMD_32_BIT_ABI test_store_impl(l_vec, "load long"); test_store_impl(ul_vec, "load unsigned long"); #endif test_store_impl(f_vec, "load float"); test_store_impl(d_vec, "load double"); } private: template void test_load_impl(const V& v, const std::string& name) { batch_type b; std::copy(v.cbegin(), v.cend(), expected.begin()); b.load_unaligned(v.data()); EXPECT_BATCH_EQ(b, expected) << print_function_name(name + " unaligned"); b.load_aligned(v.data()); EXPECT_BATCH_EQ(b, expected) << print_function_name(name + " aligned"); } template void test_store_impl(const V& v, const std::string& name) { batch_type b; b.load_aligned(v.data()); V res(size); b.store_unaligned(res.data()); EXPECT_VECTOR_EQ(res, v) << print_function_name(name + " unaligned"); b.store_aligned(res.data()); EXPECT_VECTOR_EQ(res, v) << print_function_name(name + " aligned"); } template void init_test_vector(V& vec) { vec.resize(size); value_type min = value_type(0); value_type max = value_type(100); std::default_random_engine generator; std::uniform_int_distribution distribution(min, max); auto gen = 
[&distribution, &generator](){ return static_cast(distribution(generator)); }; std::generate(vec.begin(), vec.end(), gen); } }; TYPED_TEST_SUITE(load_store_test, batch_types, simd_test_names); TYPED_TEST(load_store_test, load) { this->test_load(); } TYPED_TEST(load_store_test, store) { this->test_store(); } xsimd-7.6.0/test/test_memory.cpp000066400000000000000000000030561410101234500166750ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #include #include #include "gtest/gtest.h" #include "xsimd/config/xsimd_instruction_set.hpp" #ifdef XSIMD_INSTR_SET_AVAILABLE #include "xsimd/memory/xsimd_alignment.hpp" struct mock_container {}; TEST(xsimd, alignment) { using u_vector_type = std::vector; using a_vector_type = std::vector>; using u_vector_align = xsimd::container_alignment_t; using a_vector_align = xsimd::container_alignment_t; using mock_align = xsimd::container_alignment_t; EXPECT_TRUE((std::is_same::value)); EXPECT_TRUE((std::is_same::value)); EXPECT_TRUE((std::is_same::value)); } #endif // XSIMD_INSTR_SET_AVAILABLE xsimd-7.6.0/test/test_poly_evaluation.cpp000066400000000000000000000042451410101234500206000ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #include "test_utils.hpp" template class poly_evaluation_test : public testing::Test { protected: using batch_type = B; using value_type = typename B::value_type; static constexpr size_t size = B::size; using vector_type = std::vector; size_t nb_input; vector_type input; vector_type horner_res; vector_type estrin_res; poly_evaluation_test() { nb_input = size * 10000; input.resize(nb_input); for (size_t i = 0; i < nb_input; ++i) { input[i] = value_type(i) / 4 + value_type(1.2) * std::sqrt(value_type(i + 0.25)); } horner_res.resize(nb_input); estrin_res.resize(nb_input); } void test_poly_evaluation() { batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, input, i); out = xsimd::horner(in); detail::store_batch(out, horner_res, i); out = xsimd::estrin(in); detail::store_batch(out, estrin_res, i); } size_t diff = detail::get_nb_diff(horner_res, estrin_res); EXPECT_EQ(diff, 0) << print_function_name("estrin"); } }; TYPED_TEST_SUITE(poly_evaluation_test, batch_float_types, simd_test_names); TYPED_TEST(poly_evaluation_test, poly_evaluation) { this->test_poly_evaluation(); } xsimd-7.6.0/test/test_power.cpp000066400000000000000000000104641410101234500165220ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #include "test_utils.hpp" template class power_test : public testing::Test { protected: using batch_type = B; using value_type = typename B::value_type; static constexpr size_t size = B::size; using vector_type = std::vector; size_t nb_input; vector_type lhs_input; vector_type rhs_input; vector_type expected; vector_type res; power_test() { nb_input = size * 10000; lhs_input.resize(nb_input); rhs_input.resize(nb_input); for (size_t i = 0; i < nb_input; ++i) { lhs_input[i] = value_type(i) / 4 + value_type(1.2) * std::sqrt(value_type(i + 0.25)); rhs_input[i] = value_type(10.2) / (i + 2) + value_type(0.25); } expected.resize(nb_input); res.resize(nb_input); } void test_power_functions() { // pow { std::transform(lhs_input.cbegin(), lhs_input.cend(), rhs_input.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return std::pow(l, r); }); batch_type lhs_in, rhs_in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(lhs_in, lhs_input, i); detail::load_batch(rhs_in, rhs_input, i); out = pow(lhs_in, rhs_in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("pow"); } // ipow { long k = 0; std::transform(lhs_input.cbegin(), lhs_input.cend(), expected.begin(), [&k, this](const value_type& l) { auto arg = k / size - nb_input / size / 2; ++k; return std::pow(l, arg); }); batch_type lhs_in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(lhs_in, lhs_input, i); out = pow(lhs_in, i/size - nb_input / size / 2); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("ipow"); } // hypot { std::transform(lhs_input.cbegin(), lhs_input.cend(), rhs_input.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return std::hypot(l, r); }); batch_type lhs_in, rhs_in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(lhs_in, lhs_input, i); detail::load_batch(rhs_in, rhs_input, i); out = hypot(lhs_in, rhs_in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("hypot"); } // cbrt { std::transform(lhs_input.cbegin(), lhs_input.cend(), expected.begin(), [](const value_type& l) { return std::cbrt(l); }); batch_type lhs_in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(lhs_in, lhs_input, i); out = cbrt(lhs_in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("cbrt"); } // hypot } }; TYPED_TEST_SUITE(power_test, batch_float_types, simd_test_names); TYPED_TEST(power_test, power) { this->test_power_functions(); } xsimd-7.6.0/test/test_rounding.cpp000066400000000000000000000133661410101234500172170ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #include "test_utils.hpp" template class rounding_test : public testing::Test { protected: using batch_type = B; using value_type = typename B::value_type; static constexpr size_t size = B::size; static constexpr size_t nb_input = 8; static constexpr size_t nb_batches = nb_input / size; std::array input; std::array expected; std::array res; rounding_test() { input[0] = value_type(-3.5); input[1] = value_type(-2.7); input[2] = value_type(-2.5); input[3] = value_type(-2.3); input[4] = value_type(2.3); input[5] = value_type(2.5); input[6] = value_type(2.7); input[7] = value_type(3.5); } void test_rounding_functions() { // ceil { std::transform(input.cbegin(), input.cend(), expected.begin(), [](const value_type& v) { return std::ceil(v); }); batch_type in, out; for (size_t i = 0; i < nb_batches; i += size) { detail::load_batch(in, input, i); out = ceil(in); detail::store_batch(out, res, i); } for (size_t i = nb_batches; i < nb_input; ++i) { res[i] = std::ceil(input[i]); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("ceil"); } // floor { std::transform(input.cbegin(), input.cend(), expected.begin(), [](const value_type& v) { return std::floor(v); }); batch_type in, out; for (size_t i = 0; i < nb_batches; i += size) { detail::load_batch(in, input, i); out = floor(in); detail::store_batch(out, res, i); } for (size_t i = nb_batches; i < nb_input; ++i) { res[i] = std::floor(input[i]); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("floor"); } // trunc { std::transform(input.cbegin(), input.cend(), expected.begin(), [](const value_type& v) { return std::trunc(v); }); batch_type in, out; for (size_t i = 0; i < nb_batches; i += size) { detail::load_batch(in, input, i); out = trunc(in); detail::store_batch(out, res, i); } for (size_t i = nb_batches; i < nb_input; ++i) { res[i] = std::trunc(input[i]); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("trunc"); } // round { std::transform(input.cbegin(), input.cend(), expected.begin(), [](const value_type& v) { return std::round(v); }); batch_type in, out; for (size_t i = 0; i < nb_batches; i += size) { detail::load_batch(in, input, i); out = round(in); detail::store_batch(out, res, i); } for (size_t i = nb_batches; i < nb_input; ++i) { res[i] = std::round(input[i]); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("round"); } // nearbyint { std::transform(input.cbegin(), input.cend(), expected.begin(), [](const value_type& v) { return std::nearbyint(v); }); batch_type in, out; for (size_t i = 0; i < nb_batches; i += size) { detail::load_batch(in, input, i); out = nearbyint(in); detail::store_batch(out, res, i); } for (size_t i = nb_batches; i < nb_input; ++i) { res[i] = std::nearbyint(input[i]); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("nearbyint"); } // rint { std::transform(input.cbegin(), input.cend(), expected.begin(), [](const value_type& v) { return std::rint(v); }); batch_type in, out; for (size_t i = 0; i < nb_batches; i += size) { detail::load_batch(in, input, i); out = rint(in); detail::store_batch(out, res, i); } for (size_t i = nb_batches; i < nb_input; ++i) { res[i] = std::rint(input[i]); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("rint"); } } }; TYPED_TEST_SUITE(rounding_test, 
batch_float_types, simd_test_names); TYPED_TEST(rounding_test, rounding) { this->test_rounding_functions(); } xsimd-7.6.0/test/test_select.cpp000066400000000000000000000063121410101234500166420ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #include "test_utils.hpp" template class select_test : public testing::Test { protected: using batch_type = B; using value_type = typename B::value_type; static constexpr size_t size = B::size; using vector_type = std::vector; size_t nb_input; vector_type lhs_input; vector_type rhs_input; vector_type expected; vector_type res; select_test() { nb_input = size * 10000; lhs_input.resize(nb_input); rhs_input.resize(nb_input); for (size_t i = 0; i < nb_input; ++i) { lhs_input[i] = value_type(i) / 4 + value_type(1.2) * std::sqrt(value_type(i + 0.25)); rhs_input[i] = value_type(10.2) / (i + 2) + value_type(0.25); } expected.resize(nb_input); res.resize(nb_input); } void test_select_dynamic() { for (size_t i = 0; i < nb_input; ++i) { expected[i] = lhs_input[i] > value_type(3) ? lhs_input[i] : rhs_input[i]; } batch_type lhs_in, rhs_in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(lhs_in, lhs_input, i); detail::load_batch(rhs_in, rhs_input, i); out = xsimd::select(lhs_in > value_type(3), lhs_in, rhs_in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("pow"); } struct pattern { static constexpr bool get(std::size_t i, std::size_t) { return i % 2; } }; void test_select_static() { constexpr auto mask = xsimd::make_batch_bool_constant(); for (size_t i = 0; i < nb_input; ++i) { expected[i] = mask[i % size] ? lhs_input[i] : rhs_input[i]; } batch_type lhs_in, rhs_in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(lhs_in, lhs_input, i); detail::load_batch(rhs_in, rhs_input, i); out = xsimd::select(mask, lhs_in, rhs_in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("pow"); } }; TYPED_TEST_SUITE(select_test, batch_types, simd_test_names); TYPED_TEST(select_test, select_dynamic) { this->test_select_dynamic(); } TYPED_TEST(select_test, select_static) { this->test_select_static(); } xsimd-7.6.0/test/test_shuffle_128.cpp000066400000000000000000000062041410101234500174110ustar00rootroot00000000000000/*************************************************************************** * * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #include "test_utils.hpp" namespace xsimd { template struct init_shuffle_128_base { using shuffle_vector_type = std::array; shuffle_vector_type lhs_in, rhs_in, exp_lo, exp_hi; std::vector create_vectors() { std::vector vects; vects.reserve(4); /* Generate input data: lhs, rhs */ for (size_t i = 0; i < N; ++i) { lhs_in[i] = 2*i + 1; rhs_in[i] = 2*i + 2; } vects.push_back(std::move(lhs_in)); vects.push_back(std::move(rhs_in)); /* Expected shuffle data */ for (size_t i = 0, j= 0; i < N/2; ++i, j=j+2) { exp_lo[j] = lhs_in[i]; exp_hi[j] = lhs_in[i + N/2]; exp_lo[j + 1] = rhs_in[i]; exp_hi[j + 1] = rhs_in[i + N/2]; } vects.push_back(std::move(exp_lo)); vects.push_back(std::move(exp_hi)); return vects; } }; } template class shuffle_128_test : public testing::Test { protected: using batch_type = B; using value_type = typename B::value_type; static constexpr size_t size = B::size; shuffle_128_test() { std::cout << "shuffle-128 test" << std::endl; } void shuffle_128_low_high() { xsimd::init_shuffle_128_base shuffle_base; auto shuffle_base_vecs = shuffle_base.create_vectors(); auto v_lhs = shuffle_base_vecs[0]; auto v_rhs = shuffle_base_vecs[1]; auto v_exp_lo = shuffle_base_vecs[2]; auto v_exp_hi = shuffle_base_vecs[3]; B b_lhs, b_rhs, b_exp_lo, b_exp_hi, b_res_lo, b_res_hi; b_lhs.load_unaligned(v_lhs.data()); b_rhs.load_unaligned(v_rhs.data()); b_exp_lo.load_unaligned(v_exp_lo.data()); b_exp_hi.load_unaligned(v_exp_hi.data()); /* Only Test 128bit */ if ((sizeof(value_type) * size) == 16) { b_res_lo = xsimd::zip_lo(b_lhs, b_rhs); EXPECT_BATCH_EQ(b_res_lo, b_exp_lo) << print_function_name("shuffle-128 low test"); b_res_hi = xsimd::zip_hi(b_lhs, b_rhs); EXPECT_BATCH_EQ(b_res_hi, b_exp_hi) << print_function_name("shuffle-128 high test"); } } }; TYPED_TEST_SUITE(shuffle_128_test, batch_types, simd_test_names); TYPED_TEST(shuffle_128_test, shuffle_128_low_high) { this->shuffle_128_low_high(); } xsimd-7.6.0/test/test_trigonometric.cpp000066400000000000000000000155521410101234500202560ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #include "test_utils.hpp" template class trigonometric_test : public testing::Test { protected: using batch_type = B; using value_type = typename B::value_type; static constexpr size_t size = B::size; using vector_type = std::vector; size_t nb_input; vector_type input; vector_type ainput; vector_type atan_input; vector_type expected; vector_type res; trigonometric_test() { nb_input = size * 10000; input.resize(nb_input); ainput.resize(nb_input); atan_input.resize(nb_input); for (size_t i = 0; i < nb_input; ++i) { input[i] = value_type(0.) + i * value_type(80.) / nb_input; ainput[i] = value_type(-1.) + value_type(2.) * i / nb_input; atan_input[i] = value_type(-10.) + i * value_type(20.) 
/ nb_input; } expected.resize(nb_input); res.resize(nb_input); } void test_trigonometric_functions() { // sin { std::transform(input.cbegin(), input.cend(), expected.begin(), [](const value_type& v) { return std::sin(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, input, i); out = sin(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("sin"); } // cos { std::transform(input.cbegin(), input.cend(), expected.begin(), [](const value_type& v) { return std::cos(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, input, i); out = cos(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("cos"); } // sincos { vector_type expected2(nb_input), res2(nb_input); std::transform(input.cbegin(), input.cend(), expected.begin(), [](const value_type& v) { return std::sin(v); }); std::transform(input.cbegin(), input.cend(), expected2.begin(), [](const value_type& v) { return std::cos(v); }); batch_type in, out1, out2; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, input, i); sincos(in, out1, out2); detail::store_batch(out1, res, i); detail::store_batch(out2, res2, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("sincos(sin)"); diff = detail::get_nb_diff(res2, expected2); EXPECT_EQ(diff, 0) << print_function_name("sincos(cos)"); } // tan { std::transform(input.cbegin(), input.cend(), expected.begin(), [](const value_type& v) { return std::tan(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, input, i); out = tan(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("tan"); } } void test_reciprocal_functions() { // asin { std::transform(ainput.cbegin(), ainput.cend(), expected.begin(), [](const value_type& v) { return std::asin(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, ainput, i); out = asin(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("asin"); } // acos { std::transform(ainput.cbegin(), ainput.cend(), expected.begin(), [](const value_type& v) { return std::acos(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, ainput, i); out = acos(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("acos"); } // atan { std::transform(atan_input.cbegin(), atan_input.cend(), expected.begin(), [](const value_type& v) { return std::atan(v); }); batch_type in, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, atan_input, i); out = atan(in); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, expected); EXPECT_EQ(diff, 0) << print_function_name("atan"); } // atan2 { std::transform(atan_input.cbegin(), atan_input.cend(), input.cbegin(), expected.begin(), [](const value_type& v, const value_type& r) { return std::atan2(v, r); }); batch_type in, rhs, out; for (size_t i = 0; i < nb_input; i += size) { detail::load_batch(in, atan_input, i); detail::load_batch(rhs, input, i); out = atan2(in, rhs); detail::store_batch(out, res, i); } size_t diff = detail::get_nb_diff(res, 
expected); EXPECT_EQ(diff, 0) << print_function_name("atan2"); } } }; TYPED_TEST_SUITE(trigonometric_test, batch_float_types, simd_test_names); TYPED_TEST(trigonometric_test, trigonometric) { this->test_trigonometric_functions(); } TYPED_TEST(trigonometric_test, reciprocal) { this->test_reciprocal_functions(); } xsimd-7.6.0/test/test_utils.hpp000066400000000000000000000633701410101234500165370ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #include #include #include #include #include #include #include "gtest/gtest.h" #include "xsimd/xsimd.hpp" #ifndef XSIMD_TEST_UTILS_HPP #define XSIMD_TEST_UTILS_HPP /******************* * Pretty printers * *******************/ class simd_test_names { public: template static std::string GetName(int) { using value_type = typename T::value_type; std::string prefix = "fallback_"; #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION size_t register_size = T::size * sizeof(value_type) * CHAR_BIT; if (register_size == size_t(128)) { prefix = "sse_"; } else if (register_size == size_t(256)) { prefix = "avx_"; } else if (register_size == size_t(512)) { prefix = "avx512_"; } #elif XSIMD_ARM_INSTR_SET >= XSIMD_ARM7_NEON_VERSION size_t register_size = T::size * sizeof(value_type) * CHAR_BIT; if (register_size == size_t(128)) { prefix = "arm_"; } #endif if (std::is_same::value) { return prefix + "uint8_t"; } if (std::is_same::value) { return prefix + "int8_t"; } if (std::is_same::value) { return prefix + "uint16_t"; } if (std::is_same::value) { return prefix + "int16_t"; } if (std::is_same::value) { return prefix + "uint32_t"; } if (std::is_same::value) { return prefix + "int32_t"; } if (std::is_same::value) { return prefix + "uint64_t"; } if (std::is_same::value) { return prefix + "int64_t"; } if (std::is_same::value) { return prefix + "float"; } if (std::is_same::value) { return prefix + "double"; } if (std::is_same>::value) { return prefix + "complex"; } if (std::is_same>::value) { return prefix + "complex"; } #ifdef XSIMD_ENABLE_XTL_COMPLEX if (std::is_same>::value) { return prefix + "xcomplex"; } if (std::is_same>::value) { return prefix + "xcomplex"; } #endif return prefix + "unknow_type"; } }; inline std::string print_function_name(const std::string& func) { return std::string(" while testing ") + func; } /************************ * Comparison functions * ************************/ namespace xsimd { template inline bool operator==(const batch& lhs, const std::array& rhs) { std::array tmp; lhs.store_unaligned(tmp.data()); return tmp == rhs; } template inline bool operator==(const std::array& lhs, const batch& rhs) { return rhs == lhs; } } namespace detail { namespace utils { // define some overloads here as integer versions do not exist for msvc template inline typename std::enable_if::value, bool>::type isinf(const T& c) { return std::isinf(c); } template inline typename std::enable_if::value, bool>::type isinf(const T&) { return false; } template inline typename std::enable_if::value, bool>::type isnan(const T& c) { return std::isnan(c); } template inline typename std::enable_if::value, bool>::type isnan(const T&) { return false; } } inline unsigned char uabs(unsigned char 
val) { return val; } inline unsigned short uabs(unsigned short val) { return val; } inline unsigned int uabs(unsigned int val) { return val; } inline unsigned long uabs(unsigned long val) { return val; } inline unsigned long long uabs(unsigned long long val) { return val; } template inline T uabs(T val) { return std::abs(val); } template bool check_is_small(const T& value, const T& tolerance) { using std::abs; return uabs(value) < uabs(tolerance); } template T safe_division(const T& lhs, const T& rhs) { if (rhs < static_cast(1) && lhs > rhs * (std::numeric_limits::max)()) { return (std::numeric_limits::max)(); } if ((lhs == static_cast(0)) || (rhs > static_cast(1) && lhs < rhs * (std::numeric_limits::min)())) { return static_cast(0); } return lhs / rhs; } template bool check_is_close(const T& lhs, const T& rhs, const T& relative_precision) { using std::abs; T diff = uabs(lhs - rhs); T d1 = safe_division(diff, T(uabs(rhs))); T d2 = safe_division(diff, T(uabs(lhs))); return d1 <= relative_precision && d2 <= relative_precision; } template struct scalar_comparison_near { static bool run(const T& lhs, const T& rhs) { using std::max; using std::abs; // direct compare integers -- but need tolerance for inexact double conversion if (std::is_integral::value && lhs < 10e6 && rhs < 10e6) { return lhs == rhs; } if (utils::isnan(lhs)) { return utils::isnan(rhs); } if (utils::isinf(lhs)) { return utils::isinf(rhs) && (lhs * rhs > 0) /* same sign */; } T relative_precision = 2048 * std::numeric_limits::epsilon(); T absolute_zero_prox = 2048 * std::numeric_limits::epsilon(); if (max(uabs(lhs), uabs(rhs)) < T(1e-3)) { using res_type = decltype(lhs - rhs); return detail::check_is_small(lhs - rhs, res_type(absolute_zero_prox)); } else { return detail::check_is_close(lhs, rhs, relative_precision); } } }; template struct scalar_comparison { static bool run(const T& lhs, const T& rhs) { return lhs == rhs; } }; template <> struct scalar_comparison : scalar_comparison_near { }; template <> struct scalar_comparison : scalar_comparison_near { }; template struct scalar_comparison> { static bool run(const std::complex& lhs, const std::complex& rhs) { using real_comparison = scalar_comparison; return real_comparison::run(lhs.real(), rhs.real()) && real_comparison::run(lhs.imag(), rhs.imag()); } }; #ifdef XSIMD_ENABLE_XTL_COMPLEX template struct scalar_comparison> { static bool run(const xtl::xcomplex& lhs, const xtl::xcomplex& rhs) { using real_comparison = scalar_comparison; return real_comparison::run(lhs.real(), rhs.real()) && real_comparison::run(lhs.imag(), rhs.imag()); } }; #endif template struct vector_comparison { static bool run(const V& lhs, const V& rhs) { using value_type = typename V::value_type; for (size_t i = 0; i < lhs.size(); ++i) { if (!scalar_comparison::run(lhs[i], rhs[i])) return false; } return true; } }; template testing::AssertionResult expect_scalar_near(const char* lhs_expression, const char* rhs_expression, const T& lhs, const T& rhs) { if (scalar_comparison::run(lhs, rhs)) { return testing::AssertionSuccess(); } std::stringstream lhs_ss; lhs_ss << std::setprecision(std::numeric_limits::digits10 + 2) << lhs; std::stringstream rhs_ss; rhs_ss << std::setprecision(std::numeric_limits::digits10 + 2) << rhs; return testing::internal::EqFailure(lhs_expression, rhs_expression, lhs_ss.str(), rhs_ss.str(), false); } template testing::AssertionResult expect_container_near(const char* lhs_expression, const char* rhs_expression, const V& lhs, const V& rhs) { if (vector_comparison::run(lhs, rhs)) { 
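// Every corresponding element of lhs and rhs matched under scalar_comparison
// (exact for integral types, tolerance-based for floating point), so the
// gtest assertion succeeds.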
return testing::AssertionSuccess(); } using value_type = typename V::value_type; std::stringstream lhs_ss; lhs_ss << std::setprecision(std::numeric_limits::digits10 + 2); testing::internal::PrintTo(lhs, &lhs_ss); std::stringstream rhs_ss; rhs_ss << std::setprecision(std::numeric_limits::digits10 + 2); testing::internal::PrintTo(rhs, &rhs_ss); return testing::internal::EqFailure(lhs_expression, rhs_expression, lhs_ss.str(), rhs_ss.str(), false); } template testing::AssertionResult expect_array_near(const char* lhs_expression, const char* rhs_expression, const std::array& lhs, const std::array& rhs) { return expect_container_near(lhs_expression, rhs_expression, lhs, rhs); } template testing::AssertionResult expect_vector_near(const char* lhs_expression, const char* rhs_expression, const std::vector& lhs, const std::vector& rhs) { return expect_container_near(lhs_expression, rhs_expression, lhs, rhs); } template testing::AssertionResult expect_batch_near(const char* lhs_expression, const char* rhs_expression, const ::xsimd::batch& lhs, const std::array& rhs) { std::array tmp; lhs.store_unaligned(tmp.data()); return expect_array_near(lhs_expression, rhs_expression, tmp, rhs); } template testing::AssertionResult expect_batch_near(const char* lhs_expression, const char* rhs_expression, const std::array& lhs, const ::xsimd::batch& rhs) { std::array tmp; rhs.store_unaligned(tmp.data()); return expect_array_near(lhs_expression, rhs_expression, lhs, tmp); } template testing::AssertionResult expect_batch_near(const char* lhs_expression, const char* rhs_expression, const ::xsimd::batch& lhs, const ::xsimd::batch& rhs) { std::array tmp; lhs.store_unaligned(tmp.data()); return expect_batch_near(lhs_expression, rhs_expression, tmp, rhs); } template testing::AssertionResult expect_batch_near(const char* lhs_expression, const char* rhs_expression, const ::xsimd::batch_bool& lhs, const std::array& rhs) { std::array tmp; lhs.store_unaligned(tmp.data()); return expect_array_near(lhs_expression, rhs_expression, tmp, rhs); } template testing::AssertionResult expect_batch_near(const char* lhs_expression, const char* rhs_expression, const std::array& lhs, const ::xsimd::batch_bool& rhs) { std::array tmp; rhs.store_unaligned(tmp.data()); return expect_array_near(lhs_expression, rhs_expression, lhs, tmp); } template testing::AssertionResult expect_batch_near(const char* lhs_expression, const char* rhs_expression, const ::xsimd::batch_bool& lhs, const ::xsimd::batch_bool& rhs) { std::array tmp; lhs.store_unaligned(tmp.data()); return expect_batch_near(lhs_expression, rhs_expression, tmp, rhs); } template size_t get_nb_diff(It lhs_begin, It lhs_end, It rhs_begin) { size_t res = 0; using value_type = typename std::iterator_traits::value_type; while (lhs_begin != lhs_end) { if (!scalar_comparison::run(*lhs_begin++, *rhs_begin++)) { ++res; } } return res; } template size_t get_nb_diff(const std::vector& lhs, const std::vector& rhs) { return get_nb_diff(lhs.begin(), lhs.end(), rhs.begin()); } template size_t get_nb_diff(const std::array& lhs, const std::array& rhs) { return get_nb_diff(lhs.begin(), lhs.end(), rhs.begin()); } template void load_batch(B& b, const S& src, size_t i = 0) { b.load_unaligned(src.data() + i); } template void store_batch(const B& b, D& dst, size_t i = 0) { b.store_unaligned(dst.data() + i); } } #define EXPECT_BATCH_EQ(b1, b2) EXPECT_PRED_FORMAT2(::detail::expect_batch_near, b1, b2) #define EXPECT_SCALAR_EQ(s1, s2) EXPECT_PRED_FORMAT2(::detail::expect_scalar_near, s1, s2) #define 
EXPECT_VECTOR_EQ(v1, v2) EXPECT_PRED_FORMAT2(::detail::expect_vector_near, v1, v2) namespace xsimd { /************************ * Enable metafunctions * ************************/ // Backport of C++14 std::enable_if template using enable_if_t = typename std::enable_if::type; template using enable_integral_t = enable_if_t::value, R>; template using enable_floating_point_t = enable_if_t::value, R>; namespace mpl { /************** * types_list * **************/ template struct type_list {}; /*************** * concatenate * ***************/ template struct concatenate; template