pax_global_header00006660000000000000000000000064132070311120014500gustar00rootroot0000000000000052 comment=9423e30fe203d94c20cc52554b1c59c2ededad47 Vc-1.3.3/000077500000000000000000000000001320703111200120545ustar00rootroot00000000000000Vc-1.3.3/.appveyor.yml000066400000000000000000000005151320703111200145230ustar00rootroot00000000000000version: 1.0.{build} os: Visual Studio 2015 configuration: Release platform: - x64 #- x86 clone_depth: 50 environment: matrix: - subset: sse - subset: avx build_script: - cmd: >- CALL "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" %PLATFORM% ctest -VV -S C:\projects\vc\test.cmake Vc-1.3.3/.clang-format000066400000000000000000000143241320703111200144330ustar00rootroot00000000000000BasedOnStyle: Google # The extra indent or outdent of access modifiers, e.g. public:. AccessModifierOffset: -4 # If true, aligns escaped newlines as far left as possible. Otherwise puts them into the right-most column. AlignEscapedNewlinesLeft: false # If true, aligns trailing comments. AlignTrailingComments: true # Allow putting all parameters of a function declaration onto the next line even if BinPackParameters is false. AllowAllParametersOfDeclarationOnNextLine: false # If true, if (a) return; can be put on a single line. AllowShortIfStatementsOnASingleLine: false # If true, while (true) continue; can be put on a single line. AllowShortLoopsOnASingleLine: false AllowShortFunctionsOnASingleLine: true # If true, always break before multiline string literals. AlwaysBreakBeforeMultilineStrings: false # If true, always break after the template<...> of a template declaration. AlwaysBreakTemplateDeclarations: false # If false, a function call’s or function definition’s parameters will either all be on the same line or will have one line each. BinPackParameters: true # If true, binary operators will be placed after line breaks. BreakBeforeBinaryOperators: false # The brace breaking style to use. # Possible values: # BS_Attach (in configuration: Attach) Always attach braces to surrounding context. # BS_Linux (in configuration: Linux) Like Attach, but break before braces on function, namespace and class definitions. # BS_Stroustrup (in configuration: Stroustrup) Like Attach, but break before function definitions. # BS_Allman (in configuration: Allman) Always break before braces. BreakBeforeBraces: Linux # Always break constructor initializers before commas and align the commas with the colon. BreakConstructorInitializersBeforeComma: true # The column limit. # A column limit of 0 means that there is no column limit. In this case, clang-format will respect the input’s line breaking decisions within statements. ColumnLimit: 90 # If the constructor initializers don’t fit on a line, put each initializer on its own line. #ConstructorInitializerAllOnOneLineOrOnePerLine (bool) # The number of characters to use for indentation of constructor initializer lists. #ConstructorInitializerIndentWidth (unsigned) # If true, format braced lists as best suited for C++11 braced lists. # Important differences: - No spaces inside the braced list. - No line break before the closing brace. - Indentation with the continuation indent, not with the block indent. # Fundamentally, C++11 braced lists are formatted exactly like function calls would be formatted in their place. If the braced list follows a name (e.g. a type or variable name), clang-format formats as if the {} were the parentheses of a function call with that name. If there is no name, a zero-length name is assumed. 
Cpp11BracedListStyle: true # If true, analyze the formatted file for the most common binding. #DerivePointerBinding (bool) # If true, clang-format detects whether function calls and definitions are formatted with one parameter per line. # Each call can be bin-packed, one-per-line or inconclusive. If it is inconclusive, e.g. completely on one line, but a decision needs to be made, clang-format analyzes whether there are other bin-packed cases in the input file and act accordingly. # NOTE: This is an experimental flag, that might go away or be renamed. Do not use this in config files, etc. Use at your own risk. #ExperimentalAutoDetectBinPacking (bool) # Indent case labels one level from the switch statement. # When false, use the same indentation level as for the switch statement. Switch statement body is always indented one level more than case labels. IndentCaseLabels: false # If true, indent when breaking function declarations which are not also definitions after the type. #IndentFunctionDeclarationAfterType (bool) # The number of characters to use for indentation. IndentWidth: 4 # The maximum number of consecutive empty lines to keep. MaxEmptyLinesToKeep: 1 # The indentation used for namespaces. # Possible values: # NI_None (in configuration: None) Don’t indent in namespaces. # NI_Inner (in configuration: Inner) Indent only in inner namespaces (nested in other namespaces). # NI_All (in configuration: All) Indent in all namespaces. NamespaceIndentation: None # Add a space in front of an Objective-C protocol list, i.e. use Foo instead of Foo. #ObjCSpaceBeforeProtocolList (bool) # The penalty for each line break introduced inside a comment. #PenaltyBreakComment (unsigned) # The penalty for breaking before the first <<. #PenaltyBreakFirstLessLess (unsigned) # The penalty for each line break introduced inside a string literal. #PenaltyBreakString (unsigned) # The penalty for each character outside of the column limit. #PenaltyExcessCharacter (unsigned) # Penalty for putting the return type of a function onto its own line. #PenaltyReturnTypeOnItsOwnLine (unsigned) # Set whether & and * bind to the type as opposed to the variable. #PointerBindsToType: false # If true, spaces will be inserted between ‘for’/’if’/’while’/... and ‘(‘. #SpaceAfterControlStatementKeyword: true # If false, spaces will be removed before ‘=’, ‘+=’, etc. #SpaceBeforeAssignmentOperators: true # If false, spaces may be inserted into ‘()’. #SpaceInEmptyParentheses: false # The number of spaces to before trailing line comments. #SpacesBeforeTrailingComments (unsigned) # If false, spaces may be inserted into C style casts. #SpacesInCStyleCastParentheses (bool) # If true, spaces will be inserted after every ‘(‘ and before every ‘)’. SpacesInParentheses: false # Format compatible with this standard, e.g. use A > instead of A> for LS_Cpp03. # Possible values: # LS_Cpp03 (in configuration: Cpp03) Use C++03-compatible syntax. # LS_Cpp11 (in configuration: Cpp11) Use features of C++11 (e.g. A> instead of A >). # LS_Auto (in configuration: Auto) Automatic detection based on the input. Standard: Cpp11 # If true, IndentWidth consecutive spaces will be replaced with tab characters. UseTab: false # vim: ft=yaml Vc-1.3.3/.github/000077500000000000000000000000001320703111200134145ustar00rootroot00000000000000Vc-1.3.3/.github/CONTRIBUTING.md000066400000000000000000000065431320703111200156550ustar00rootroot00000000000000## Copyright and License Vc is licensed with the [3-clause BSD license](http://opensource.org/licenses/BSD-3-Clause). 
Your contributions to Vc must be released under the same license. You must add
your copyright information to the files you modified/added.

## Code Formatting & Style

The recommended way is to format the code according to `clang-format` using the
`.clang-format` file in the repository. In addition to the `clang-format` style,
`if`, `else`, `for`, `while`, and `do` *must* use braces.

If, for some reason, you cannot use `clang-format`, here's a quick overview of the
style rules:
* Constrain the code to no more than 90 characters per line.
* Use four spaces for indent. No tabs.
* Opening braces attach to the preceding expression, except for functions,
  namespaces, and classes/structs/unions/enums.
* Namespaces introduce no additional indent.
* `case` labels are aligned with the `switch` statement.
* No more than one empty line.
* No spaces in parentheses, but spaces between keywords and opening paren, i.e.
  `if (foo) { bar(); }`

### Naming Rules

* Naming is very important. Take time to choose a name that clearly explains the
  intended functionality & usage of the entity.
* Type names typically use `CamelCase`. No underscores.
* Function and variable names use `camelCase`. No underscores.
* Acronyms that appear in camel case names must use lowercase letters for all
  characters after the first character. (e.g. `SimdArray`, `simdFunction`)
* Traits use `lower_case_with_underscores`.
* Macros are prefixed with `Vc_` and use `Vc_ALL_CAPITALS_WITH_UNDERSCORES`. Macro
  arguments use a single underscore suffix. Include guards are prefixed with `VC_`
  instead.
* File names use `alllowercasewithoutunderscores`. Basically, it is the type name
  declared/defined in the file with all letters in lower case.
* There are exceptions and inconsistencies in the code. Don't bother.

### Design Guidelines

* *Avoid out parameters.* Use the return value instead. Use `std::tuple` if you
  need to return multiple values. (See the sketch at the end of this guide.)
* *Look for alternatives to in-out parameters.* An obvious exception (and thus
  design alternative) is the implicit `this` parameter to non-static member
  functions.
* Consequently, *pass function parameters by const-ref or by value.* Use const-ref
  for types that (potentially) require more than two CPU registers. (Consider
  fundamental types and the fundamental `Vector` types to require one register,
  each.) By value otherwise.
* *Ensure const-correctness.* Member functions use the `const` qualifier if they
  do not modify observable state. Use `mutable` members for unobservable state.
* *Avoid macros.* Possible alternatives are constexpr variables and template code.

## Git History

Git history should be flat, if feasible. Feel free to use merges on your private
branch. However, once you submit a pull request, the history should apply cleanly
on top of master. Use `git rebase [-i]` to straighten the history. Use different
branches for different issues.

## Git Commit Logs

1. Write meaningful summaries and strive to use no more than 50 characters
1. Use imperative mood in the subject line (and possibly in bullet points in the summary)
1. Wrap the body at 72 characters
1. Use the body to explain *what* and *why* (normally it is irrelevant *how* you did it)

See also [Chris Beams' article](http://chris.beams.io/posts/git-commit/).
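## Style Sketch

A minimal, hypothetical sketch of how the formatting and design guidelines above combine
(the names `DataBuffer`, `entryCount`, and `minMax` are made up for illustration and do
not exist in the Vc sources): braces on all control statements, `CamelCase`/`camelCase`
naming, const-correct member functions, and a `std::tuple` return value instead of out
parameters.

```cpp
#include <cstddef>
#include <tuple>
#include <vector>

// Type names use CamelCase; the class brace goes on its own line.
class DataBuffer
{
public:
    // const member function: it does not modify observable state.
    std::size_t entryCount() const { return values.size(); }

    // Return a std::tuple instead of using out parameters.
    std::tuple<float, float> minMax() const
    {
        float lo = values.front();
        float hi = values.front();
        for (float x : values) {  // braces are required even for one-liners
            if (x < lo) {
                lo = x;
            } else if (x > hi) {
                hi = x;
            }
        }
        return std::make_tuple(lo, hi);
    }

private:
    std::vector<float> values = {1.f, -2.f, 3.f};
};

int main()
{
    DataBuffer buffer;
    float lo, hi;
    std::tie(lo, hi) = buffer.minMax();
    return (lo <= hi && buffer.entryCount() == 3) ? 0 : 1;
}
```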
Vc-1.3.3/.github/ISSUE_TEMPLATE.md000066400000000000000000000005751320703111200161300ustar00rootroot00000000000000Vc version / revision | Operating System | Compiler & Version | Compiler Flags | Assembler & Version | CPU ----------------------|------------------|--------------------|----------------|---------------------|---- | | | | | ## Testcase ```cpp ``` ## Actual Results ## Expected Results Vc-1.3.3/.gitignore000066400000000000000000000001321320703111200140400ustar00rootroot00000000000000doc/html doc/latex doc/man vc-benchmarks *.swp *~ .makeApidox.stamp .makeApidox.stamp.new Vc-1.3.3/.travis.yml000066400000000000000000000116401320703111200141670ustar00rootroot00000000000000language: cpp cache: ccache env: - arch=x86_64 CXX_VERSION=current - arch=x86_64 CXX_VERSION=previous - arch=x86_64 CXX_VERSION=default - arch=aarch64 CXX_VERSION=default os: - linux - osx compiler: - gcc - clang matrix: exclude: - env: arch=x86_64 CXX_VERSION=previous os: osx compiler: clang - env: arch=aarch64 CXX_VERSION=default os: osx # The following gives us Ubuntu 14.04 LTS instead of 12.04 LTS sudo: required dist: trusty install: - case "$TRAVIS_OS_NAME-$CXX-$CXX_VERSION-$arch" in linux-g++-current-x86_64) sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y && sudo apt-get update -q && sudo apt-get install g++-6 -y && export CXX=g++-6 CC=gcc-6;; linux-g++-previous-x86_64) sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y && sudo apt-get update -q && sudo apt-get install g++-5 -y && export CXX=g++-5 CC=gcc-5;; linux-g++-default-x86_64) ;; linux-clang++-current-x86_64) sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y && wget -O - http://apt.llvm.org/llvm-snapshot.gpg.key|sudo apt-key add - && sudo tee /etc/apt/sources.list.d/llvm.list <<< "deb http://apt.llvm.org/trusty/ llvm-toolchain-trusty-5.0 main" && sudo apt-get update -q && sudo apt-get install clang-5.0 -y && export CXX=clang++-5.0 CC=clang-5.0;; linux-clang++-previous-x86_64) sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y && wget -O - http://apt.llvm.org/llvm-snapshot.gpg.key|sudo apt-key add - && sudo tee /etc/apt/sources.list.d/llvm.list <<< "deb http://apt.llvm.org/trusty/ llvm-toolchain-trusty-4.0 main" && sudo apt-get update -q && sudo apt-get install clang-4.0 -y && export CXX=clang++-4.0 CC=clang-4.0;; linux-clang++-default-x86_64) sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y && sudo apt-get update -q && sudo apt-get install clang-3.8 libstdc++-6-dev libc++-dev libc++abi-dev -y && export CXX=clang++-3.8 CC=clang-3.8;; osx-g++-current-x86_64) brew update && brew install gcc6 && export CXX=g++-6 CC=gcc-6;; osx-g++-previous-x86_64) brew update && brew install gcc5 && export CXX=g++-5 CC=gcc-5;; osx-g++-default-x86_64) export CXX=g++-4.9 CC=gcc-4.9;; osx-clang++-current-x86_64) wget http://releases.llvm.org/4.0.0/clang+llvm-4.0.0-x86_64-apple-darwin.tar.xz && tar xf clang+llvm-4.0.0-x86_64-apple-darwin.tar.xz && cd clang+llvm-4.0.0-x86_64-apple-darwin && export CXX=$PWD/bin/clang++ CC=$PWD/bin/clang && export LDFLAGS="-L$PWD/lib -Wl,-rpath,$PWD/lib" && export CPPFLAGS=-I$PWD/include && cd ..;; osx-clang++-previous-x86_64) exit 1;; osx-clang++-default-x86_64) ;; linux-g++-default-aarch64) sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y && sudo add-apt-repository -y 'deb http://de.archive.ubuntu.com/ubuntu/ xenial main' && sudo add-apt-repository -y 'deb http://de.archive.ubuntu.com/ubuntu/ xenial-updates main' && sudo apt-get update -q && sudo apt-get install -y g++-5-aarch64-linux-gnu && export 
CXX=aarch64-linux-gnu-g++-5 CC=aarch64-linux-gnu-gcc-5 CMAKE_TOOLCHAIN_FILE=cmake/toolchain-arm-linux.cmake;; linux-clang++-default-aarch64) wget -O - http://apt.llvm.org/llvm-snapshot.gpg.key|sudo apt-key add - && sudo tee /etc/apt/sources.list.d/llvm.list <<< "deb http://apt.llvm.org/trusty/ llvm-toolchain-trusty-5.0 main" && sudo apt-get update -q && sudo apt-get install -y clang-5.0 && sudo add-apt-repository -y 'deb http://de.archive.ubuntu.com/ubuntu/ xenial main' && sudo add-apt-repository -y 'deb http://de.archive.ubuntu.com/ubuntu/ xenial-updates main' && sudo apt-get update -q && sudo apt-get install -y libstdc++-5-dev-arm64-cross gcc-5-aarch64-linux-gnu && export CPATH=/usr/aarch64-linux-gnu/include && export CXX="clang++-5.0 -target aarch64-linux-gnu" CC="clang-5.0 -target aarch64-linux-gnu" && export CMAKE_TOOLCHAIN_FILE=cmake/toolchain-arm-linux.cmake;; esac before_script: - $CXX --version - cmake --version - export dashboard_model=Experimental - export build_type=Release - export NUMBER_OF_PROCESSORS=2 script: - ctest -V -S test.cmake notifications: email: false irc: channels: - "chat.freenode.net##Vc" on_success: change # [always|never|change] # default: always on_failure: always # [always|never|change] # default: always use_notice: true skip_join: true Vc-1.3.3/CMakeLists.txt000066400000000000000000000245771320703111200146330ustar00rootroot00000000000000cmake_minimum_required(VERSION 3.0) if(CMAKE_SOURCE_DIR STREQUAL CMAKE_BINARY_DIR) message(FATAL_ERROR "You don't want to configure in the source directory!") endif() if(NOT DEFINED CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebug RelWithDebInfo MinSizeRel." FORCE) endif() project(Vc) set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") set(disabled_targets) include (VcMacros) include (AddTargetProperty) include (OptimizeForArchitecture) vc_determine_compiler() if("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(x86|AMD64|amd64)") set(Vc_X86 TRUE) find_package(MIC) elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(arm|aarch32|aarch64)") message(WARNING "No optimized implementation of the Vc types available for ${CMAKE_SYSTEM_PROCESSOR}") set(Vc_ARM TRUE) else() message(WARNING "No optimized implementation of the Vc types available for ${CMAKE_SYSTEM_PROCESSOR}") endif() option(USE_CCACHE "If enabled, ccache will be used (if it exists on the system) to speed up recompiles." OFF) if(USE_CCACHE) find_program(CCACHE_COMMAND ccache) if(CCACHE_COMMAND) mark_as_advanced(CCACHE_COMMAND) set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CCACHE_COMMAND}") endif() endif() # TODO: check that 'decltype' compiles # TODO: check that 'constexpr' compiles if(NOT Vc_COMPILER_IS_MSVC) # MSVC doesn't provide a switch to turn C++11 on/off AFAIK AddCompilerFlag("-std=c++14" CXX_RESULT _ok MIC_CXX_RESULT _mic_ok CXX_FLAGS CMAKE_CXX_FLAGS MIC_CXX_FLAGS Vc_MIC_CXX_FLAGS) if(MIC_NATIVE_FOUND AND NOT _mic_ok) AddCompilerFlag("-std=c++1y" MIC_CXX_RESULT _mic_ok MIC_CXX_FLAGS Vc_MIC_CXX_FLAGS) if(NOT _mic_ok) AddCompilerFlag("-std=c++11" MIC_CXX_RESULT _mic_ok MIC_CXX_FLAGS Vc_MIC_CXX_FLAGS) if(NOT _mic_ok) AddCompilerFlag("-std=c++0x" MIC_CXX_RESULT _mic_ok MIC_CXX_FLAGS Vc_MIC_CXX_FLAGS) if(NOT _mic_ok) message(FATAL_ERROR "Vc 1.x requires C++11, better even C++14. 
The MIC native compiler does not support any of the C++11 language flags.")
            endif()
         endif()
      endif()
   endif()
   if(NOT _ok)
      AddCompilerFlag("-std=c++1y" CXX_RESULT _ok CXX_FLAGS CMAKE_CXX_FLAGS)
      if(NOT _ok)
         AddCompilerFlag("-std=c++11" CXX_RESULT _ok CXX_FLAGS CMAKE_CXX_FLAGS)
         if(NOT _ok)
            AddCompilerFlag("-std=c++0x" CXX_RESULT _ok CXX_FLAGS CMAKE_CXX_FLAGS)
            if(NOT _ok)
               message(FATAL_ERROR "Vc 1.x requires C++11, better even C++14. It seems this is not available. If this was incorrectly determined please notify vc-devel@compeng.uni-frankfurt.de")
            endif()
         endif()
      endif()
   endif()
elseif(Vc_MSVC_VERSION LESS 180021114)
   message(FATAL_ERROR "Vc 1.x requires C++11 support. This requires at least Visual Studio 2013 with the Nov 2013 CTP.")
endif()

if(Vc_COMPILER_IS_GCC)
   if(Vc_GCC_VERSION VERSION_GREATER "5.0.0" AND Vc_GCC_VERSION VERSION_LESS "6.0.0")
      UserWarning("GCC 5 goes into an endless loop compiling example_scaling_scalar. Therefore, this target is disabled.")
      list(APPEND disabled_targets
         example_scaling_scalar
         )
   endif()
elseif(Vc_COMPILER_IS_MSVC)
   if(MSVC_VERSION LESS 1700)
      # MSVC before 2012 has a broken std::vector::resize implementation. STL + Vc code will probably not compile.
      # UserWarning in VcMacros.cmake
      list(APPEND disabled_targets
         stlcontainer_sse
         stlcontainer_avx
         )
   endif()
   # Disable warning "C++ exception specification ignored except to indicate a function is not __declspec(nothrow)"
   # MSVC emits the warning for the _UnitTest_Compare destructor which needs the throw declaration so that it doesn't std::terminate
   AddCompilerFlag("/wd4290")
endif()

if(MIC_NATIVE_FOUND)
   if("${Vc_MIC_ICC_VERSION}" VERSION_LESS "16.1.0")
      UserWarning("ICC for MIC uses an incompatible STL. Disabling simdize_mic.")
      list(APPEND disabled_targets
         simdize_mic example_simdize_mic
         )
   endif()
endif()

vc_set_preferred_compiler_flags(WARNING_FLAGS BUILDTYPE_FLAGS)
add_definitions(${Vc_DEFINITIONS})
add_compile_options(${Vc_COMPILE_FLAGS})

if(Vc_COMPILER_IS_INTEL)
   # by default icc is not IEEE compliant, but we need that for verification
   AddCompilerFlag("-fp-model source")
endif()

if(CMAKE_BUILD_TYPE STREQUAL "" AND NOT CMAKE_CXX_FLAGS MATCHES "-O[123]")
   message(STATUS "WARNING! It seems you are compiling without optimization.
Please set CMAKE_BUILD_TYPE.") endif(CMAKE_BUILD_TYPE STREQUAL "" AND NOT CMAKE_CXX_FLAGS MATCHES "-O[123]") include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/include) add_custom_target(other VERBATIM) add_custom_target(Scalar COMMENT "build Scalar code" VERBATIM) add_custom_target(SSE COMMENT "build SSE code" VERBATIM) add_custom_target(AVX COMMENT "build AVX code" VERBATIM) add_custom_target(AVX2 COMMENT "build AVX2 code" VERBATIM) add_custom_target(MIC COMMENT "build MIC code" VERBATIM) AddCompilerFlag(-ftemplate-depth=128 CXX_FLAGS CMAKE_CXX_FLAGS MIC_CXX_FLAGS CMAKE_MIC_CXX_FLAGS) set(libvc_compile_flags "-DVc_COMPILE_LIB") set(libvc_mic_compile_flags "-DVc_COMPILE_LIB") AddCompilerFlag("-fPIC" CXX_FLAGS libvc_compile_flags MIC_CXX_FLAGS libvc_mic_compile_flags) if(MIC_FOUND) mic_add_library(Vc_MIC STATIC src/mic_const.cpp src/cpuid.cpp src/support_x86.cpp src/mic_sorthelper.cpp COMPILE_FLAGS ${libvc_mic_compile_flags}) add_target_property(Vc_MIC LABELS "MIC") add_dependencies(MIC Vc_MIC) get_target_property(outputName Vc_MIC OUTPUT_NAME) install(FILES ${outputName} DESTINATION lib${LIB_SUFFIX}) endif() set(_srcs src/const.cpp) if(Vc_X86) list(APPEND _srcs src/cpuid.cpp src/support_x86.cpp) vc_compile_for_all_implementations(_srcs src/trigonometric.cpp ONLY SSE2 SSE3 SSSE3 SSE4_1 AVX SSE+XOP+FMA4 AVX+XOP+FMA4 AVX+XOP+FMA AVX+FMA AVX2+FMA+BMI2) vc_compile_for_all_implementations(_srcs src/sse_sorthelper.cpp ONLY SSE2 SSE4_1 AVX AVX2+FMA+BMI2) vc_compile_for_all_implementations(_srcs src/avx_sorthelper.cpp ONLY AVX AVX2+FMA+BMI2) elseif(Vc_ARM) list(APPEND _srcs src/support_dummy.cpp) else() message(FATAL_ERROR "Unsupported target architecture '${CMAKE_SYSTEM_PROCESSOR}'. No support_???.cpp file exists for this architecture.") endif() add_library(Vc STATIC ${_srcs}) set_property(TARGET Vc APPEND PROPERTY COMPILE_OPTIONS ${libvc_compile_flags}) add_target_property(Vc LABELS "other") if(XCODE) # TODO: document what this does and why it has no counterpart in the non-XCODE logic set_target_properties(Vc PROPERTIES XCODE_ATTRIBUTE_GCC_INLINES_ARE_PRIVATE_EXTERN "NO") set_target_properties(Vc PROPERTIES XCODE_ATTRIBUTE_GCC_SYMBOLS_PRIVATE_EXTERN "YES") set_target_properties(Vc PROPERTIES XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD "c++0x") set_target_properties(Vc PROPERTIES XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY "libc++") elseif(UNIX AND Vc_COMPILER_IS_CLANG) # On UNIX (Linux) the standard library used by default typically is libstdc++ (GCC). # To get the full clang deal we rather want to build against libc++. This requires # additionally the libc++abi and libsupc++ libraries in all linker invokations. option(USE_LIBC++ "Use libc++ instead of the system default C++ standard library." 
ON) if(USE_LIBC++) AddCompilerFlag(-stdlib=libc++ CXX_FLAGS CMAKE_CXX_FLAGS CXX_RESULT _use_libcxx) if(_use_libcxx) find_library(LIBC++ABI c++abi) mark_as_advanced(LIBC++ABI) if(LIBC++ABI) set(CMAKE_REQUIRED_LIBRARIES "${LIBC++ABI};supc++") CHECK_CXX_SOURCE_COMPILES("#include #include void foo() { std::cout << 'h' << std::flush << std::endl; throw std::exception(); } int main() { try { foo(); } catch (int) { return 0; } return 1; }" libcxx_compiles) unset(CMAKE_REQUIRED_LIBRARIES) if(libcxx_compiles) link_libraries(${LIBC++ABI} supc++) endif() endif() endif() endif() endif() add_dependencies(other Vc) install(TARGETS Vc DESTINATION lib${LIB_SUFFIX}) install(DIRECTORY include/Vc/ DESTINATION include/Vc) # Install all implementation headers install(DIRECTORY scalar sse avx mic common traits DESTINATION include/Vc FILES_MATCHING REGEX "/*.(h|tcc|def)$") # read version parts from version.h to be put into VcConfig.cmake file(STRINGS ${CMAKE_CURRENT_SOURCE_DIR}/include/Vc/version.h _version_lines REGEX "^#define Vc_VERSION_STRING ") string(REGEX MATCH "([0-9]+)\\.([0-9]+)\\.([0-9]+)" _version_matches "${_version_lines}") set(Vc_VERSION_MAJOR ${CMAKE_MATCH_1}) set(Vc_VERSION_MINOR ${CMAKE_MATCH_2}) set(Vc_VERSION_PATCH ${CMAKE_MATCH_3}) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/VcConfig.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/cmake/VcConfig.cmake @ONLY) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/VcConfigVersion.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/cmake/VcConfigVersion.cmake @ONLY) install(FILES cmake/UserWarning.cmake cmake/VcMacros.cmake cmake/AddCompilerFlag.cmake cmake/CheckCCompilerFlag.cmake cmake/CheckCXXCompilerFlag.cmake cmake/CheckMicCCompilerFlag.cmake cmake/CheckMicCXXCompilerFlag.cmake cmake/FindMIC.cmake cmake/OptimizeForArchitecture.cmake cmake/FindVc.cmake ${CMAKE_CURRENT_BINARY_DIR}/cmake/VcConfig.cmake ${CMAKE_CURRENT_BINARY_DIR}/cmake/VcConfigVersion.cmake DESTINATION lib${LIB_SUFFIX}/cmake/Vc ) option(BUILD_TESTING "Build the testing tree." 
OFF) include (CTest) configure_file(${CMAKE_SOURCE_DIR}/CTestCustom.cmake ${CMAKE_BINARY_DIR}/CTestCustom.cmake COPYONLY) if(BUILD_TESTING) add_custom_target(build_tests ALL VERBATIM) add_subdirectory(tests) endif(BUILD_TESTING) set(BUILD_EXAMPLES FALSE CACHE BOOL "Build examples.") if(BUILD_EXAMPLES) add_subdirectory(examples) endif(BUILD_EXAMPLES) # Hide Vc_IMPL as it is only meant for users of Vc mark_as_advanced(Vc_IMPL) find_program(BIN_CAT cat) mark_as_advanced(BIN_CAT) if(BIN_CAT) file(REMOVE ${CMAKE_BINARY_DIR}/help.txt) add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/help.txt COMMAND ${CMAKE_MAKE_PROGRAM} help > ${CMAKE_BINARY_DIR}/help.txt VERBATIM ) add_custom_target(cached_help ${BIN_CAT} ${CMAKE_BINARY_DIR}/help.txt DEPENDS ${CMAKE_BINARY_DIR}/help.txt VERBATIM ) endif() Vc-1.3.3/CTestConfig.cmake000066400000000000000000000005471320703111200152340ustar00rootroot00000000000000set(CTEST_PROJECT_NAME "Vc") set(CTEST_NIGHTLY_START_TIME "00:00:00 CEST") set(CTEST_DROP_METHOD "http") set(CTEST_DROP_SITE "lxwww53.gsi.de") set(CTEST_DROP_LOCATION "/submit.php?project=Vc") set(CTEST_DROP_SITE_CDASH TRUE) set(CTEST_UPDATE_TYPE "git") find_program(GITCOMMAND git) set(CTEST_UPDATE_COMMAND "${GITCOMMAND}") mark_as_advanced(GITCOMMAND) Vc-1.3.3/CTestCustom.cmake000066400000000000000000000025321320703111200152750ustar00rootroot00000000000000set(CTEST_CUSTOM_WARNING_EXCEPTION ${CTEST_CUSTOM_WARNING_EXCEPTION} " C4723: " # MSVC 2012 can't suppress this warning " C4756: " # MSVC 2012 can't suppress this warning "used uninitialized in this function" "Skipping compilation of tests gatherStruct and gather2dim because of clang bug" # Not a helpful warning for the dashboard "warning is a GCC extension" "^-- " # Ignore output from cmake "AVX disabled per default because of old/broken compiler" # This warning is meant for users not the dashboard "warning: the mangled name of .*typename Vc::{anonymous}::Decltype.* will change in a future version of GCC" "WARNING non-zero return value in ctest from: make" # Ignore output from ctest "ipo: warning #11010:" # Ignore warning about incompatible libraries with ICC -m32 on 64-bit system "include/qt4" # -Wuninitialized in QWeakPointer(X *ptr) " note: " # Notes are additional lines from errors (or warnings) that we don't want to count as additional warnings "clang: warning: argument unused during compilation: '-stdlib=libc" "clang 3.6.x miscompiles AVX code" # a preprocessor warning for users of Vc, irrelevant for the dashboard ) set(CTEST_CUSTOM_ERROR_EXCEPTION ${CTEST_CUSTOM_ERROR_EXCEPTION} "^ICECC" "^make\\[[1-9]\\]: " "^collect2: ld returned . exit status" "^make: \\*\\*\\* \\[.*\\] Error ") Vc-1.3.3/INSTALL000066400000000000000000000000171320703111200131030ustar00rootroot00000000000000See README.md. Vc-1.3.3/LICENSE000066400000000000000000000027461320703111200130720ustar00rootroot00000000000000Copyright © 2009-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
* Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Vc-1.3.3/README.md000066400000000000000000000124511320703111200133360ustar00rootroot00000000000000# Vc: portable, zero-overhead C++ types for explicitly data-parallel programming Recent generations of CPUs, and GPUs in particular, require data-parallel codes for full efficiency. Data parallelism requires that the same sequence of operations is applied to different input data. CPUs and GPUs can thus reduce the necessary hardware for instruction decoding and scheduling in favor of more arithmetic and logic units, which execute the same instructions synchronously. On CPU architectures this is implemented via SIMD registers and instructions. A single SIMD register can store N values and a single SIMD instruction can execute N operations on those values. On GPU architectures N threads run in perfect sync, fed by a single instruction decoder/scheduler. Each thread has local memory and a given index to calculate the offsets in memory for loads and stores. Current C++ compilers can do automatic transformation of scalar codes to SIMD instructions (auto-vectorization). However, the compiler must reconstruct an intrinsic property of the algorithm that was lost when the developer wrote a purely scalar implementation in C++. Consequently, C++ compilers cannot vectorize any given code to its most efficient data-parallel variant. Especially larger data-parallel loops, spanning over multiple functions or even translation units, will often not be transformed into efficient SIMD code. The Vc library provides the missing link. Its types enable explicitly stating data-parallel operations on multiple values. The parallelism is therefore added via the type system. Competing approaches state the parallelism via new control structures and consequently new semantics inside the body of these control structures. Vc is a free software library to ease explicit vectorization of C++ code. It has an intuitive API and provides portability between different compilers and compiler versions as well as portability between different vector instruction sets. 
Thus an application written with Vc can be compiled for:
* AVX and AVX2
* SSE2 up to SSE4.2 or SSE4a
* Scalar
* MIC
* AVX-512 (in development)
* NEON (in development)
* NVIDIA GPUs / CUDA (in development)

## Examples

### Scalar Product

Let's start from the code for calculating a 3D scalar product using builtin floats:

```cpp
using Vec3D = std::array<float, 3>;
float scalar_product(Vec3D a, Vec3D b)
{
    return a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
}
```

Using Vc, we can easily vectorize the code using the `float_v` type:

```cpp
using Vc::float_v;
using Vec3D = std::array<float_v, 3>;
float_v scalar_product(Vec3D a, Vec3D b)
{
    return a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
}
```

The above will scale to 1, 4, 8, 16, etc. scalar products calculated in parallel,
depending on the target hardware's capabilities. (A small standalone usage sketch is
appended at the end of this README.)

For comparison, the same vectorization using Intel SSE intrinsics is more verbose and
uses prefix notation (i.e. function calls):

```cpp
using Vec3D = std::array<__m128, 3>;
__m128 scalar_product(Vec3D a, Vec3D b)
{
    return _mm_add_ps(_mm_add_ps(_mm_mul_ps(a[0], b[0]), _mm_mul_ps(a[1], b[1])),
                      _mm_mul_ps(a[2], b[2]));
}
```

The above will neither scale to AVX, MIC, etc. nor is it portable to other SIMD ISAs.

## Build Requirements

cmake >= 3.0

C++11 Compiler:
* GCC >= 4.8.1
* clang >= 3.4
* ICC >= 15.0.3
* Visual Studio 2015 (64-bit target)

## Building and Installing Vc

* Create a build directory:

```sh
$ mkdir build
$ cd build
```

* Call cmake with the relevant options:

```sh
$ cmake -DCMAKE_INSTALL_PREFIX=/opt/Vc -DBUILD_TESTING=OFF <srcdir>
```

* Build and install:

```sh
$ make -j16
$ make install
```

## Documentation

The documentation is generated via [doxygen](http://doxygen.org). You can build the
documentation by running `doxygen` in the `doc` subdirectory.
Alternatively, you can find nightly builds of the documentation at:

* [master branch](https://web-docs.gsi.de/~mkretz/Vc-master/)
* [1.3.0 release](https://web-docs.gsi.de/~mkretz/Vc-1.3.0/)
* [1.2.0 release](https://web-docs.gsi.de/~mkretz/Vc-1.2.0/)
* [1.1.0 release](https://web-docs.gsi.de/~mkretz/Vc-1.1.0/)
* [0.7 branch](https://web-docs.gsi.de/~mkretz/Vc-0.7/)

## Publications

* [M. Kretz, "Extending C++ for Explicit Data-Parallel Programming via SIMD Vector Types", Goethe University Frankfurt, Dissertation, 2015.](http://publikationen.ub.uni-frankfurt.de/frontdoor/index/index/docId/38415)
* [M. Kretz and V. Lindenstruth, "Vc: A C++ library for explicit vectorization", Software: Practice and Experience, 2011.](http://dx.doi.org/10.1002/spe.1149)
* [M. Kretz, "Efficient Use of Multi- and Many-Core Systems with Vectorization and Multithreading", University of Heidelberg, 2009.](http://code.compeng.uni-frankfurt.de/attachments/13/Diplomarbeit.pdf)

[Work on integrating the functionality of Vc in the C++ standard library.](https://github.com/VcDevel/Vc/wiki/ISO-Standardization-of-the-Vector-classes)

## Communication

A channel on the freenode IRC network is reserved for discussions on Vc:
[##vc on freenode](irc://chat.freenode.net:6667/##vc)
([via SSL](ircs://chat.freenode.net:6697/##vc))

Feel free to use the GitHub issue tracker for questions.
Alternatively, there's a [mailing list for users of Vc](https://compeng.uni-frankfurt.de/mailman/listinfo/vc).

## License

Vc is released under the terms of the [3-clause BSD license](http://opensource.org/licenses/BSD-3-Clause).
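## Appendix: Usage Sketch

A minimal, self-contained driver for the vectorized `scalar_product` from the Examples
section above. This sketch is not part of the Vc sources; the input values are arbitrary
and chosen only to make the expected result easy to check.

```cpp
#include <Vc/Vc>
#include <array>
#include <iostream>

using Vc::float_v;
using Vec3D = std::array<float_v, 3>;

float_v scalar_product(Vec3D a, Vec3D b)
{
    return a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
}

int main()
{
    // Each float_v component broadcasts one scalar to all SIMD lanes, so this
    // call computes float_v::Size identical scalar products at once.
    Vec3D a = {float_v(1.f), float_v(2.f), float_v(3.f)};
    Vec3D b = {float_v(4.f), float_v(5.f), float_v(6.f)};
    const float_v result = scalar_product(a, b);
    std::cout << "SIMD lanes: " << float_v::Size
              << ", first lane: " << result[0] << '\n';  // expected: 32
    return 0;
}
```

Compile against an installed Vc, e.g.
`c++ -std=c++11 -I/opt/Vc/include example.cpp -L/opt/Vc/lib -lVc`
(paths depend on your install prefix).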
Vc-1.3.3/Test_all_compilers.sh000077500000000000000000000067221320703111200162460ustar00rootroot00000000000000#!/bin/sh -e export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games" export LANG="en_US.UTF-8" export LANGUAGE="en_US.UTF-8" export LC_CTYPE="en_US.UTF-8" export LC_NUMERIC="en_US.UTF-8" export LC_TIME="en_US.UTF-8" export LC_MESSAGES="en_US.UTF-8" unset CFLAGS CXXFLAGS cd "`dirname "$0"`" test -z "dashboard_model" && export dashboard_model=Experimental runTest() { CFLAGS="$1" CXXFLAGS="$1" ctest -S test.cmake } tested_compilers="lsakdfjwowleqirjodfisj" runAllTests() { # first make sure we don't test a compiler a second time id="`which $CXX`" id="`readlink -f $id`" echo "$id"|grep -qF "$tested_compilers" && return tested_compilers="$tested_compilers $id" # alright run the ctest script runTest supports32Bit && runTest -m32 supportsx32 && runTest -mx32 } supports32Bit() { test `uname -m` = "x86_64" || return 1 CXX=${CXX:-c++} cat > /tmp/m32test.cpp < #include #include #include void foo(int x) { switch (x) { case 0x0A: break; case 0x0B: break; case 0x0C: break; case 0x0D: break; case 0x0E: break; } } int main() { std::cout << "Hello World!\n"; return 0; } END $CXX -m32 -o /tmp/m32test /tmp/m32test.cpp >/dev/null 2>&1 || return 1 rm /tmp/m32test* return 0 } supportsx32() { test `uname -m` = "x86_64" || return 1 CXX=${CXX:-c++} cat > /tmp/mx32test.cpp < #include #include #include void foo(int x) { switch (x) { case 0x0A: break; case 0x0B: break; case 0x0C: break; case 0x0D: break; case 0x0E: break; } } int main() { std::cout << "Hello World!\n"; return 0; } END $CXX -mx32 -o /tmp/mx32test /tmp/mx32test.cpp >/dev/null 2>&1 || return 1 rm /tmp/mx32test* return 0 } system_compilers() { cxxlist="`find /usr/bin/ /usr/local/bin/ -name '*++-[0-9]*'|grep -v -- -linux-gnu`" if test -z "$cxxlist"; then cxxlist="`find /usr/bin/ /usr/local/bin/ -name '*++'|grep -v -- -linux-gnu`" fi if test -z "$cxxlist"; then # default compiler runAllTests else for CXX in $cxxlist; do CC=`echo "$CXX"|sed 's/clang++/clang/;s/g++/gcc/'` if test -x "$CC" -a -x "$CXX"; then export CC export CXX runAllTests fi done fi } modules_compilers() { if test -r /etc/profile.d/modules.sh; then source /etc/profile.d/modules.sh for mod in `module avail -t 2>&1`; do case `echo $mod|tr '[:upper:]' '[:lower:]'` in *intel*|*icc*) export CC=icc CXX=icpc;; *gnu*|*gcc*) export CC=gcc CXX=g++;; *llvm*|*clang*) export CC=clang CXX=clang++;; *) continue;; esac module load $mod runAllTests module unload $mod done fi } gccbuild_compilers() { for VcEnv in `find /opt/ -mindepth 2 -maxdepth 2 -name Vc.env`; do ( . "$VcEnv" case "$VcEnv" in *-snapshot/Vc.env) ( cd $HOME/src/gcc-build && ./update.sh "`dirname "$VcEnv"`" ) ;; esac runAllTests ) done } icc_compilers() { export CC=icc export CXX=icpc icclist="`find /opt/intel/compiler* -name 'iccvars.sh' | xargs readlink -e | sort -ur`" case `uname -m` in x86_64) COMPILERVARS_ARCHITECTURE=intel64 ;; i[345678]86) COMPILERVARS_ARCHITECTURE=ia32 ;; esac export COMPILERVARS_ARCHITECTURE test -n "$icclist" && for IccEnv in $icclist; do ( . 
$IccEnv $COMPILERVARS_ARCHITECTURE runAllTests ) done } system_compilers modules_compilers gccbuild_compilers icc_compilers Vc-1.3.3/Test_vc.sh000077500000000000000000000010571320703111200140250ustar00rootroot00000000000000#!/bin/bash case "$1" in Experimental|Nightly|Continuous) export dashboard_model=$1 case "$2" in None|Debug|Release|RelWithDebug|RelWithDebInfo|MinSizeRel) export build_type=$2 ;; esac ;; *) echo "Usage: $0 []" echo echo "Possible arguments for model are Nightly, Continuous, or Experimental." echo "Build type may be one of: None Debug Release RelWithDebug RelWithDebInfo MinSizeRel." echo exit 1 ;; esac ctest -S "`dirname $0`/test.cmake" Vc-1.3.3/avx/000077500000000000000000000000001320703111200126525ustar00rootroot00000000000000Vc-1.3.3/avx/README000066400000000000000000000035551320703111200135420ustar00rootroot00000000000000########################################### ################# AVX ################# ########################################### 1. Floating Point =========================================== Uses full 256bit vectors for all operations. 128bit vectors are never used. 2. Integer =========================================== Integer support in AVX is minimal. The 256bit integer vectors are just intended as a supporting type of float operations. Any arithmetic, logical, or comparison operations must be implemented using 128bit operations. int_v/uint_v could be implemented either as 128 or 256 types. I.e. either int_v::Size == 4 or 8. 2.1. 256bit int vectors =========================================== 2.1.1. Implementation Details: This requires the SSE operations to not zero the high bits of the registers. Since the YMM registers are aliased on the XMM registers you need to use SSE ops that are not using the VEX prefix (IIUC). Or you have to use two XMM registers most of the time. Perfect would be the use of union M256I { __m256i ymm; __m128i xmm[2]; }; But as far as I know GCC, this will result in lots of unnecessary loads and stores. (It seems this is due to GCC expecting aliasing, thus making sure the modified values are always up-to-date in memory - like if it were declared volatile.) 2.1.2. Upsides: int_v::Size == float_v::Size 2.1.3. Downsides: Register pressure is increased. 2.2. 128bit int vectors =========================================== 2.2.1. Implementation Details: 2.2.2. Upsides: 2.2.3. Downsides: - Use of int_v for float_v operations involving __m256i arguments require an extra type. This will be hard to generalize 2.3. Mixed approach =========================================== int_v/uint_v are implemented as 256bit while short_v/ushort_v are implemented as 128bit. Thus int_v::Size == short_v::Size (which is the case on LRBni, too). Vc-1.3.3/avx/casts.h000066400000000000000000000414251320703111200141460ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2009-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
* Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ #ifndef VC_AVX_CASTS_H_ #define VC_AVX_CASTS_H_ #include "intrinsics.h" #include "types.h" #include "../sse/casts.h" #include "shuffle.h" #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { namespace AVX { namespace Casts { template Vc_INTRINSIC_L T avx_cast(__m128 v) Vc_INTRINSIC_R; template Vc_INTRINSIC_L T avx_cast(__m128i v) Vc_INTRINSIC_R; template Vc_INTRINSIC_L T avx_cast(__m128d v) Vc_INTRINSIC_R; template Vc_INTRINSIC_L T avx_cast(__m256 v) Vc_INTRINSIC_R; template Vc_INTRINSIC_L T avx_cast(__m256i v) Vc_INTRINSIC_R; template Vc_INTRINSIC_L T avx_cast(__m256d v) Vc_INTRINSIC_R; // 128 -> 128 template<> Vc_INTRINSIC __m128 avx_cast(__m128 v) { return v; } template<> Vc_INTRINSIC __m128 avx_cast(__m128i v) { return _mm_castsi128_ps(v); } template<> Vc_INTRINSIC __m128 avx_cast(__m128d v) { return _mm_castpd_ps(v); } template<> Vc_INTRINSIC __m128i avx_cast(__m128 v) { return _mm_castps_si128(v); } template<> Vc_INTRINSIC __m128i avx_cast(__m128i v) { return v; } template<> Vc_INTRINSIC __m128i avx_cast(__m128d v) { return _mm_castpd_si128(v); } template<> Vc_INTRINSIC __m128d avx_cast(__m128 v) { return _mm_castps_pd(v); } template<> Vc_INTRINSIC __m128d avx_cast(__m128i v) { return _mm_castsi128_pd(v); } template<> Vc_INTRINSIC __m128d avx_cast(__m128d v) { return v; } // 128 -> 256 // FIXME: the following casts leave the upper 128bits undefined. With GCC and ICC I've never // seen the cast not do what I want though: after a VEX-coded SSE instruction the register's // upper 128bits are zero. Thus using the same register as AVX register will have the upper // 128bits zeroed. MSVC, though, implements _mm256_castxx128_xx256 with a 128bit move to memory // + 256bit load. Thus the upper 128bits are really undefined. But there is no intrinsic to do // what I want (i.e. alias the register, disallowing the move to memory in-between). I'm stuck, // do we really want to rely on specific compiler behavior here? 
template<> Vc_INTRINSIC __m256 avx_cast(__m128 v) { return _mm256_castps128_ps256(v); } template<> Vc_INTRINSIC __m256 avx_cast(__m128i v) { return _mm256_castps128_ps256(_mm_castsi128_ps(v)); } template<> Vc_INTRINSIC __m256 avx_cast(__m128d v) { return _mm256_castps128_ps256(_mm_castpd_ps(v)); } template<> Vc_INTRINSIC __m256i avx_cast(__m128 v) { return _mm256_castsi128_si256(_mm_castps_si128(v)); } template<> Vc_INTRINSIC __m256i avx_cast(__m128i v) { return _mm256_castsi128_si256(v); } template<> Vc_INTRINSIC __m256i avx_cast(__m128d v) { return _mm256_castsi128_si256(_mm_castpd_si128(v)); } template<> Vc_INTRINSIC __m256d avx_cast(__m128 v) { return _mm256_castpd128_pd256(_mm_castps_pd(v)); } template<> Vc_INTRINSIC __m256d avx_cast(__m128i v) { return _mm256_castpd128_pd256(_mm_castsi128_pd(v)); } template<> Vc_INTRINSIC __m256d avx_cast(__m128d v) { return _mm256_castpd128_pd256(v); } #if defined Vc_MSVC || defined Vc_CLANG || defined Vc_APPLECLANG static Vc_INTRINSIC Vc_CONST __m256 zeroExtend(__m128 v) { return _mm256_permute2f128_ps (_mm256_castps128_ps256(v), _mm256_castps128_ps256(v), 0x80); } static Vc_INTRINSIC Vc_CONST __m256i zeroExtend(__m128i v) { return _mm256_permute2f128_si256(_mm256_castsi128_si256(v), _mm256_castsi128_si256(v), 0x80); } static Vc_INTRINSIC Vc_CONST __m256d zeroExtend(__m128d v) { return _mm256_permute2f128_pd (_mm256_castpd128_pd256(v), _mm256_castpd128_pd256(v), 0x80); } #else static Vc_INTRINSIC Vc_CONST __m256 zeroExtend(__m128 v) { return _mm256_castps128_ps256(v); } static Vc_INTRINSIC Vc_CONST __m256i zeroExtend(__m128i v) { return _mm256_castsi128_si256(v); } static Vc_INTRINSIC Vc_CONST __m256d zeroExtend(__m128d v) { return _mm256_castpd128_pd256(v); } #endif // 256 -> 128 template<> Vc_INTRINSIC __m128 avx_cast(__m256 v) { return _mm256_castps256_ps128(v); } template<> Vc_INTRINSIC __m128 avx_cast(__m256i v) { return _mm256_castps256_ps128(_mm256_castsi256_ps(v)); } template<> Vc_INTRINSIC __m128 avx_cast(__m256d v) { return _mm256_castps256_ps128(_mm256_castpd_ps(v)); } template<> Vc_INTRINSIC __m128i avx_cast(__m256 v) { return _mm256_castsi256_si128(_mm256_castps_si256(v)); } template<> Vc_INTRINSIC __m128i avx_cast(__m256i v) { return _mm256_castsi256_si128(v); } template<> Vc_INTRINSIC __m128i avx_cast(__m256d v) { return _mm256_castsi256_si128(_mm256_castpd_si256(v)); } template<> Vc_INTRINSIC __m128d avx_cast(__m256 v) { return _mm256_castpd256_pd128(_mm256_castps_pd(v)); } template<> Vc_INTRINSIC __m128d avx_cast(__m256i v) { return _mm256_castpd256_pd128(_mm256_castsi256_pd(v)); } template<> Vc_INTRINSIC __m128d avx_cast(__m256d v) { return _mm256_castpd256_pd128(v); } // 256 -> 256 template<> Vc_INTRINSIC __m256 avx_cast(__m256 v) { return v; } template<> Vc_INTRINSIC __m256 avx_cast(__m256i v) { return _mm256_castsi256_ps(v); } template<> Vc_INTRINSIC __m256 avx_cast(__m256d v) { return _mm256_castpd_ps(v); } template<> Vc_INTRINSIC __m256i avx_cast(__m256 v) { return _mm256_castps_si256(v); } template<> Vc_INTRINSIC __m256i avx_cast(__m256i v) { return v; } template<> Vc_INTRINSIC __m256i avx_cast(__m256d v) { return _mm256_castpd_si256(v); } template<> Vc_INTRINSIC __m256d avx_cast(__m256 v) { return _mm256_castps_pd(v); } template<> Vc_INTRINSIC __m256d avx_cast(__m256i v) { return _mm256_castsi256_pd(v); } template<> Vc_INTRINSIC __m256d avx_cast(__m256d v) { return v; } // simplify splitting 256-bit registers in 128-bit registers Vc_INTRINSIC Vc_CONST __m128 lo128(__m256 v) { return avx_cast<__m128>(v); } Vc_INTRINSIC 
Vc_CONST __m128d lo128(__m256d v) { return avx_cast<__m128d>(v); } Vc_INTRINSIC Vc_CONST __m128i lo128(__m256i v) { return avx_cast<__m128i>(v); } Vc_INTRINSIC Vc_CONST __m128 hi128(__m256 v) { return extract128<1>(v); } Vc_INTRINSIC Vc_CONST __m128d hi128(__m256d v) { return extract128<1>(v); } Vc_INTRINSIC Vc_CONST __m128i hi128(__m256i v) { return extract128<1>(v); } // simplify combining 128-bit registers in 256-bit registers Vc_INTRINSIC Vc_CONST __m256 concat(__m128 a, __m128 b) { return insert128<1>(avx_cast<__m256 >(a), b); } Vc_INTRINSIC Vc_CONST __m256d concat(__m128d a, __m128d b) { return insert128<1>(avx_cast<__m256d>(a), b); } Vc_INTRINSIC Vc_CONST __m256i concat(__m128i a, __m128i b) { return insert128<1>(avx_cast<__m256i>(a), b); } } // namespace Casts using namespace Casts; } // namespace AVX namespace AVX2 { using namespace AVX::Casts; } // namespace AVX2 namespace AVX { template struct ConvertTag {}; Vc_INTRINSIC __m256i convert(__m256 v, ConvertTag) { return _mm256_cvttps_epi32(v); } Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag) { return _mm256_cvttpd_epi32(v); } Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag) { return v; } Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag) { return v; } Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag) { #ifdef Vc_IMPL_AVX2 return _mm256_cvtepi16_epi32(v); #else return AVX::srai_epi32<16>( concat(_mm_unpacklo_epi16(v, v), _mm_unpackhi_epi16(v, v))); #endif } Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag) { #ifdef Vc_IMPL_AVX2 return _mm256_cvtepu16_epi32(v); #else return AVX::srli_epi32<16>( concat(_mm_unpacklo_epi16(v, v), _mm_unpackhi_epi16(v, v))); #endif } Vc_INTRINSIC __m256i convert(__m256 v, ConvertTag) { using namespace AVX; return _mm256_castps_si256(_mm256_blendv_ps( _mm256_castsi256_ps(_mm256_cvttps_epi32(v)), _mm256_castsi256_ps(add_epi32(_mm256_cvttps_epi32(_mm256_sub_ps(v, set2power31_ps())), set2power31_epu32())), cmpge_ps(v, set2power31_ps()))); } Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag) { using namespace AVX; return _mm_xor_si128( _mm256_cvttpd_epi32(_mm256_sub_pd(_mm256_floor_pd(v), set1_pd(0x80000000u))), _mm_set2power31_epu32()); } Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag) { return v; } Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag) { return v; } Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag) { #ifdef Vc_IMPL_AVX2 return _mm256_cvtepi16_epi32(v); #else return AVX::srai_epi32<16>( concat(_mm_unpacklo_epi16(v, v), _mm_unpackhi_epi16(v, v))); #endif } Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag) { #ifdef Vc_IMPL_AVX2 return _mm256_cvtepu16_epi32(v); #else return AVX::srli_epi32<16>( concat(_mm_unpacklo_epi16(v, v), _mm_unpackhi_epi16(v, v))); #endif } Vc_INTRINSIC __m256 convert(__m256 v, ConvertTag) { return v; } Vc_INTRINSIC __m128 convert(__m256d v, ConvertTag) { return _mm256_cvtpd_ps(v); } Vc_INTRINSIC __m256 convert(__m256i v, ConvertTag) { return _mm256_cvtepi32_ps(v); } Vc_INTRINSIC __m256 convert(__m256i v, ConvertTag) { // this is complicated because cvtepi32_ps only supports signed input. Thus, all // input values with the MSB set would produce a negative result. We can reuse the // cvtepi32_ps instruction if we unset the MSB. But then the rounding results can be // different. Since float uses 24 bits for the mantissa (effectively), the 9-bit LSB // determines the rounding direction. (Consider the bits ...8'7654'3210. The bits [0:7] // need to be dropped and if > 0x80 round up, if < 0x80 round down. 
If [0:7] == 0x80 // then the rounding direction is determined by bit [8] for round to even. That's why // the 9th bit is relevant for the rounding decision.) // If the MSB of the input is set to 0, the cvtepi32_ps instruction makes its rounding // decision on the lowest 8 bits instead. A second rounding decision is made when // float(0x8000'0000) is added. This will rarely fix the rounding issue. // // Here's what the standard rounding mode expects: // 0xc0000080 should cvt to 0xc0000000 // 0xc0000081 should cvt to 0xc0000100 // -- should cvt to 0xc0000100 // 0xc000017f should cvt to 0xc0000100 // 0xc0000180 should cvt to 0xc0000200 // // However: using float(input ^ 0x8000'0000) + float(0x8000'0000) we get: // 0xc0000081 would cvt to 0xc0000000 // 0xc00000c0 would cvt to 0xc0000000 // 0xc00000c1 would cvt to 0xc0000100 // 0xc000013f would cvt to 0xc0000100 // 0xc0000140 would cvt to 0xc0000200 // // Solution: float(input & 0x7fff'fe00) + (float(0x8000'0000) + float(input & 0x1ff)) // This ensures the rounding decision is made on the 9-bit LSB when 0x8000'0000 is // added to the float value of the low 8 bits of the input. using namespace AVX; return _mm256_blendv_ps( _mm256_cvtepi32_ps(v), _mm256_add_ps(_mm256_cvtepi32_ps(and_si256(v, set1_epi32(0x7ffffe00))), _mm256_add_ps(set2power31_ps(), _mm256_cvtepi32_ps(and_si256( v, set1_epi32(0x000001ff))))), _mm256_castsi256_ps(cmplt_epi32(v, _mm256_setzero_si256()))); } Vc_INTRINSIC __m256 convert(__m128i v, ConvertTag) { return _mm256_cvtepi32_ps(convert(v, ConvertTag< short, int>())); } Vc_INTRINSIC __m256 convert(__m128i v, ConvertTag) { return _mm256_cvtepi32_ps(convert(v, ConvertTag())); } Vc_INTRINSIC __m256d convert(__m128 v, ConvertTag) { return _mm256_cvtps_pd(v); } Vc_INTRINSIC __m256d convert(__m256d v, ConvertTag) { return v; } Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag) { return _mm256_cvtepi32_pd(v); } Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag) { using namespace AVX; return _mm256_add_pd( _mm256_cvtepi32_pd(_mm_xor_si128(v, _mm_setmin_epi32())), set1_pd(1u << 31)); } Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag) { return convert(convert(v, SSE::ConvertTag< short, int>()), ConvertTag()); } Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag) { return convert(convert(v, SSE::ConvertTag()), ConvertTag()); } Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag) { const auto tmp0 = _mm_unpacklo_epi16(lo128(v), hi128(v)); const auto tmp1 = _mm_unpackhi_epi16(lo128(v), hi128(v)); const auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1); const auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1); return _mm_unpacklo_epi16(tmp2, tmp3); } Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag) { const auto tmp0 = _mm_unpacklo_epi16(lo128(v), hi128(v)); const auto tmp1 = _mm_unpackhi_epi16(lo128(v), hi128(v)); const auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1); const auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1); return _mm_unpacklo_epi16(tmp2, tmp3); } Vc_INTRINSIC __m128i convert(__m256 v, ConvertTag) { return convert(convert(v, ConvertTag()), ConvertTag()); } Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag) { return convert(convert(v, ConvertTag()), SSE::ConvertTag()); } Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag) { return v; } Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag) { return v; } Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag) { auto tmp0 = _mm_unpacklo_epi16(lo128(v), hi128(v)); auto tmp1 = _mm_unpackhi_epi16(lo128(v), hi128(v)); auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1); auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1); 
return _mm_unpacklo_epi16(tmp2, tmp3); } Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag) { auto tmp0 = _mm_unpacklo_epi16(lo128(v), hi128(v)); auto tmp1 = _mm_unpackhi_epi16(lo128(v), hi128(v)); auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1); auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1); return _mm_unpacklo_epi16(tmp2, tmp3); } Vc_INTRINSIC __m128i convert(__m256 v, ConvertTag) { return convert(convert(v, ConvertTag()), ConvertTag()); } Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag) { return convert(convert(v, ConvertTag()), SSE::ConvertTag()); } Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag) { return v; } Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag) { return v; } template Vc_INTRINSIC auto convert( typename std::conditional<(sizeof(From) < sizeof(To)), typename SSE::VectorTraits::VectorType, typename AVX::VectorTypeHelper::Type>::type v) -> decltype(convert(v, ConvertTag())) { return convert(v, ConvertTag()); } template > Vc_INTRINSIC auto convert(typename AVX::VectorTypeHelper::Type v) -> decltype(convert(lo128(v), ConvertTag())) { return convert(lo128(v), ConvertTag()); } } // namespace AVX } // namespace Vc #endif // VC_AVX_CASTS_H_ Vc-1.3.3/avx/const.h000066400000000000000000000156341320703111200141620ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2009-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
}}}*/ #ifndef VC_AVX_CONST_H_ #define VC_AVX_CONST_H_ #include #include "types.h" #include "const_data.h" #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { namespace AVX { template struct IndexesFromZeroData; template<> struct IndexesFromZeroData { static Vc_ALWAYS_INLINE Vc_CONST const int *address() { return reinterpret_cast(&_IndexesFromZero32[0]); } }; template<> struct IndexesFromZeroData { static Vc_ALWAYS_INLINE Vc_CONST const unsigned int *address() { return &_IndexesFromZero32[0]; } }; template<> struct IndexesFromZeroData { static Vc_ALWAYS_INLINE Vc_CONST const short *address() { return reinterpret_cast(&_IndexesFromZero16[0]); } }; template<> struct IndexesFromZeroData { static Vc_ALWAYS_INLINE Vc_CONST const unsigned short *address() { return &_IndexesFromZero16[0]; } }; template<> struct IndexesFromZeroData { static Vc_ALWAYS_INLINE Vc_CONST const signed char *address() { return reinterpret_cast(&_IndexesFromZero8[0]); } }; template<> struct IndexesFromZeroData { static Vc_ALWAYS_INLINE Vc_CONST const char *address() { return reinterpret_cast(&_IndexesFromZero8[0]); } }; template<> struct IndexesFromZeroData { static Vc_ALWAYS_INLINE Vc_CONST const unsigned char *address() { return &_IndexesFromZero8[0]; } }; template struct Const { typedef Vector<_T> V; typedef typename V::EntryType T; typedef typename V::Mask M; static Vc_ALWAYS_INLINE Vc_CONST V _pi_4() { return V(c_trig::data[0]); } static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_hi() { return V(c_trig::data[1]); } static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_rem1() { return V(c_trig::data[2]); } static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_rem2() { return V(c_trig::data[3]); } static Vc_ALWAYS_INLINE Vc_CONST V _1_16() { return V(c_trig::data[4]); } static Vc_ALWAYS_INLINE Vc_CONST V _16() { return V(c_trig::data[5]); } static Vc_ALWAYS_INLINE Vc_CONST V cosCoeff(int i) { return V(c_trig::data[( 8 + i)]); } static Vc_ALWAYS_INLINE Vc_CONST V sinCoeff(int i) { return V(c_trig::data[(14 + i)]); } static Vc_ALWAYS_INLINE Vc_CONST V atanP(int i) { return V(c_trig::data[(24 + i)]); } static Vc_ALWAYS_INLINE Vc_CONST V atanQ(int i) { return V(c_trig::data[(29 + i)]); } static Vc_ALWAYS_INLINE Vc_CONST V atanThrsHi() { return V(c_trig::data[34]); } static Vc_ALWAYS_INLINE Vc_CONST V atanThrsLo() { return V(c_trig::data[35]); } static Vc_ALWAYS_INLINE Vc_CONST V _pi_2_rem() { return V(c_trig::data[36]); } static Vc_ALWAYS_INLINE Vc_CONST V lossThreshold() { return V(c_trig::data[20]); } static Vc_ALWAYS_INLINE Vc_CONST V _4_pi() { return V(c_trig::data[21]); } static Vc_ALWAYS_INLINE Vc_CONST V _pi_2() { return V(c_trig::data[22]); } static Vc_ALWAYS_INLINE Vc_CONST V _pi() { return V(c_trig::data[23]); } static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff0(int i) { return V(c_trig::data[(40 + i)]); } static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff1(int i) { return V(c_trig::data[(45 + i)]); } static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff2(int i) { return V(c_trig::data[(49 + i)]); } static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff3(int i) { return V(c_trig::data[(55 + i)]); } static Vc_ALWAYS_INLINE Vc_CONST V smallAsinInput() { return V(c_trig::data[37]); } static Vc_ALWAYS_INLINE Vc_CONST V largeAsinInput() { return V(c_trig::data[38]); } static Vc_ALWAYS_INLINE Vc_CONST M exponentMask() { return M(V(c_log::d(1)).data()); } static Vc_ALWAYS_INLINE Vc_CONST V _1_2() { return V(c_log::d(18)); } static Vc_ALWAYS_INLINE Vc_CONST V _1_sqrt2() { return V(c_log::d(15)); } static Vc_ALWAYS_INLINE Vc_CONST V P(int i) { return V(c_log::d(2 + i)); } static 
Vc_ALWAYS_INLINE Vc_CONST V Q(int i) { return V(c_log::d(8 + i)); } static Vc_ALWAYS_INLINE Vc_CONST V min() { return V(c_log::d(14)); } static Vc_ALWAYS_INLINE Vc_CONST V ln2_small() { return V(c_log::d(17)); } static Vc_ALWAYS_INLINE Vc_CONST V ln2_large() { return V(c_log::d(16)); } static Vc_ALWAYS_INLINE Vc_CONST V neginf() { return V(c_log::d(13)); } static Vc_ALWAYS_INLINE Vc_CONST V log10_e() { return V(c_log::d(19)); } static Vc_ALWAYS_INLINE Vc_CONST V log2_e() { return V(c_log::d(20)); } static Vc_ALWAYS_INLINE_L Vc_CONST_L V highMask() Vc_ALWAYS_INLINE_R Vc_CONST_R; }; template<> Vc_ALWAYS_INLINE Vc_CONST Vector Const::highMask() { return _mm256_broadcast_ss(reinterpret_cast(&c_general::highMaskFloat)); } template<> Vc_ALWAYS_INLINE Vc_CONST Vector Const::highMask() { return _mm256_broadcast_sd(reinterpret_cast(&c_general::highMaskDouble)); } } // namespace AVX namespace AVX2 { using AVX::IndexesFromZeroData; using AVX::Const; } // namespace AVX2 } // namespace Vc #endif // VC_AVX_CONST_H_ Vc-1.3.3/avx/const_data.h000066400000000000000000000067561320703111200151600ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2012-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
}}}*/ #ifndef VC_AVX_CONST_DATA_H_ #define VC_AVX_CONST_DATA_H_ #include "../common/data.h" #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { namespace AVX { alignas(64) extern const unsigned int _IndexesFromZero32[ 8]; alignas(16) extern const unsigned short _IndexesFromZero16[16]; alignas(16) extern const unsigned char _IndexesFromZero8 [32]; struct alignas(64) c_general { static const float oneFloat; static const unsigned int absMaskFloat[2]; static const unsigned int signMaskFloat[2]; static const unsigned int highMaskFloat; static const unsigned short minShort[2]; static const unsigned short one16[2]; static const float _2power31; static const double oneDouble; static const unsigned long long frexpMask; static const unsigned long long highMaskDouble; }; template struct c_trig { alignas(64) static const T data[]; }; #ifndef Vc_MSVC template <> alignas(64) const float c_trig::data[]; template <> alignas(64) const double c_trig::data[]; #endif template struct c_log { typedef float floatAlias Vc_MAY_ALIAS; static Vc_ALWAYS_INLINE float d(int i) { return *reinterpret_cast(&data[i]); } alignas(64) static const unsigned int data[21]; }; #ifndef Vc_MSVC template<> alignas(64) const unsigned int c_log::data[21]; #endif template<> struct c_log { enum VectorSize { Size = 16 / sizeof(double) }; typedef double doubleAlias Vc_MAY_ALIAS; static Vc_ALWAYS_INLINE double d(int i) { return *reinterpret_cast(&data[i]); } alignas(64) static const unsigned long long data[21]; }; } // namespace AVX } // namespace Vc namespace Vc_VERSIONED_NAMESPACE { namespace AVX2 { using AVX::_IndexesFromZero8; using AVX::_IndexesFromZero16; using AVX::_IndexesFromZero32; using AVX::c_general; using AVX::c_trig; using AVX::c_log; } // namespace AVX2 } // namespace Vc #endif // VC_AVX_CONST_DATA_H_ Vc-1.3.3/avx/debug.h000066400000000000000000000100771320703111200141160ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2011-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
}}}*/ #ifndef VC_AVX_DEBUG_H_ #define VC_AVX_DEBUG_H_ #ifndef NDEBUG #include "vector.h" #include #include #endif namespace Vc_VERSIONED_NAMESPACE { namespace AVX { template struct AddType { const U &d; }; template AddType addType(const U &x) { return {x}; } #ifdef NDEBUG class DebugStream { public: DebugStream(const char *, const char *, int) {} template inline DebugStream &operator<<(const T &) { return *this; } }; #else class DebugStream { private: template static void printVector(V _x) { enum { Size = sizeof(V) / sizeof(T) }; union { V v; T m[Size]; } x = { _x }; std::cerr << '[' << std::setprecision(24) << x.m[0]; for (int i = 1; i < Size; ++i) { std::cerr << ", " << std::setprecision(24) << x.m[i]; } std::cerr << ']'; } public: DebugStream(const char *func, const char *file, int line) { std::cerr << "\033[1;40;33mDEBUG: " << file << ':' << line << ' ' << func << ' '; } template DebugStream &operator<<(const T &x) { std::cerr << x; return *this; } template DebugStream &operator<<(AddType &&x) { printVector(x.d); return *this; } DebugStream &operator<<(__m128 x) { printVector(x); return *this; } DebugStream &operator<<(__m256 x) { printVector(x); return *this; } DebugStream &operator<<(__m128d x) { printVector(x); return *this; } DebugStream &operator<<(__m256d x) { printVector(x); return *this; } DebugStream &operator<<(__m128i x) { printVector(x); return *this; } DebugStream &operator<<(__m256i x) { printVector(x); return *this; } ~DebugStream() { std::cerr << "\033[0m" << std::endl; } }; #endif #ifdef Vc_DEBUG #undef Vc_DEBUG #endif #ifdef Vc_MSVC #define Vc_DEBUG Vc::AVX::DebugStream(__FUNCSIG__, __FILE__, __LINE__) #else #define Vc_DEBUG Vc::AVX::DebugStream(__PRETTY_FUNCTION__, __FILE__, __LINE__) #endif } // namespace AVX } // namespace Vc #endif // VC_AVX_DEBUG_H_ Vc-1.3.3/avx/deinterleave.tcc000066400000000000000000000302621320703111200160170ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2010-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
}}}*/ namespace Vc_VERSIONED_NAMESPACE { namespace AVX2 { inline void deinterleave(double_v &Vc_RESTRICT a, double_v &Vc_RESTRICT b, double_v &Vc_RESTRICT c) { // estimated latency (AVX): 4.5 cycles const m256d tmp0 = Mem::shuffle128(a.data(), b.data()); const m256d tmp1 = Mem::shuffle128(a.data(), c.data()); const m256d tmp2 = Mem::shuffle128(b.data(), c.data()); a.data() = Mem::shuffle(tmp0, tmp1); b.data() = Mem::shuffle(tmp0, tmp2); c.data() = Mem::shuffle(tmp1, tmp2); } inline void deinterleave(float_v &Vc_RESTRICT a, float_v &Vc_RESTRICT b, float_v &Vc_RESTRICT c) { // abc abc abc // a = [a0 b0 c0 a1 b1 c1 a2 b2] 332 = 211+121 // b = [c2 a3 b3 c3 a4 b4 c4 a5] 323 = 112+211 // c = [b5 c5 a6 b6 c6 a7 b7 c7] 233 = 121+112 const m256 ac0 = Mem::shuffle128(a.data(), c.data()); // a0 b0 c0 a1 b5 c5 a6 b6 const m256 ac1 = Mem::shuffle128(a.data(), c.data()); // b1 c1 a2 b2 c6 a7 b7 c7 m256 tmp0 = Mem::blend( ac0, b.data()); tmp0 = Mem::blend(tmp0, ac1); // a0 a3 a2 a1 a4 a7 a6 a5 m256 tmp1 = Mem::blend( ac0, b.data()); tmp1 = Mem::blend(tmp1, ac1); // b1 b0 b3 b2 b5 b4 b7 b6 m256 tmp2 = Mem::blend( ac0, b.data()); tmp2 = Mem::blend(tmp2, ac1); // c2 c1 c0 c3 c6 c5 c4 c7 a.data() = Mem::permute(tmp0); b.data() = Mem::permute(tmp1); c.data() = Mem::permute(tmp2); } inline void deinterleave(int_v &Vc_RESTRICT a, int_v &Vc_RESTRICT b, int_v &Vc_RESTRICT c) { deinterleave(reinterpret_cast(a), reinterpret_cast(b), reinterpret_cast(c)); } inline void deinterleave(uint_v &Vc_RESTRICT a, uint_v &Vc_RESTRICT b, uint_v &Vc_RESTRICT c) { deinterleave(reinterpret_cast(a), reinterpret_cast(b), reinterpret_cast(c)); } inline void deinterleave(Vector &Vc_RESTRICT , Vector &Vc_RESTRICT , Vector &Vc_RESTRICT ) { return; /* TODO: // abc abc abc // a = [a0 b0 c0 a1 b1 c1 a2 b2] 332 = 211+121 // b = [c2 a3 b3 c3 a4 b4 c4 a5] 323 = 112+211 // c = [b5 c5 a6 b6 c6 a7 b7 c7] 233 = 121+112 m128i ac0 = _mm_unpacklo_epi64(a.data(), c.data()); // a0 b0 c0 a1 b5 c5 a6 b6 m128i ac1 = _mm_unpackhi_epi64(a.data(), c.data()); // b1 c1 a2 b2 c6 a7 b7 c7 m128i tmp0 = Mem::blend( ac0, b.data()); tmp0 = Mem::blend(tmp0, ac1); // a0 a3 a2 a1 a4 a7 a6 a5 m128i tmp1 = Mem::blend( ac0, b.data()); tmp1 = Mem::blend(tmp1, ac1); // b1 b0 b3 b2 b5 b4 b7 b6 m128i tmp2 = Mem::blend( ac0, b.data()); tmp2 = Mem::blend(tmp2, ac1); // c2 c1 c0 c3 c6 c5 c4 c7 a.data() = Mem::permuteHi(Mem::permuteLo(tmp0)); b.data() = Mem::permuteHi(Mem::permuteLo(tmp1)); c.data() = Mem::permuteHi(Mem::permuteLo(tmp2)); */ } inline void deinterleave(Vector &Vc_RESTRICT a, Vector &Vc_RESTRICT b, Vector &Vc_RESTRICT c) { deinterleave(reinterpret_cast &>(a), reinterpret_cast &>(b), reinterpret_cast &>(c)); } inline void deinterleave(Vector &a, Vector &b) { // a7 a6 a5 a4 a3 a2 a1 a0 // b7 b6 b5 b4 b3 b2 b1 b0 const m256 tmp0 = Reg::permute128(a.data(), b.data()); // b3 b2 b1 b0 a3 a2 a1 a0 const m256 tmp1 = Reg::permute128(a.data(), b.data()); // b7 b6 b5 b4 a7 a6 a5 a4 const m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1); // b5 b1 b4 b0 a5 a1 a4 a0 const m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1); // b7 b3 b6 b2 a7 a3 a6 a2 a.data() = _mm256_unpacklo_ps(tmp2, tmp3); // b6 b4 b2 b0 a6 a4 a2 a0 b.data() = _mm256_unpackhi_ps(tmp2, tmp3); // b7 b5 b3 b1 a7 a5 a3 a1 } inline void deinterleave(Vector &a, // a0 b0 a1 b1 a2 b2 a3 b3 | a4 b4 a5 ... Vector &b) // a8 b8 a9 ... { auto v0 = Mem::shuffle128(a.data(), b.data()); auto v1 = Mem::shuffle128(a.data(), b.data()); auto v2 = AVX::unpacklo_epi16(v0, v1); // a0 a4 ... auto v3 = AVX::unpackhi_epi16(v0, v1); // a2 a6 ... 
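// The unpack cascade above and below halves the stride between elements of the
// same stream on every round (a0 a4 ... -> a0 a2 ... -> a0 a1 ...), so three
// rounds fully separate the interleaved a/b streams. Scalar equivalent of the
// whole deinterleave, for illustration only (hypothetical in/aOut/bOut buffers):
//   for (int i = 0; i < 16; ++i) { aOut[i] = in[2 * i]; bOut[i] = in[2 * i + 1]; }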
v0 = AVX::unpacklo_epi16(v2, v3); // a0 a2 ... v1 = AVX::unpackhi_epi16(v2, v3); // a1 a3 ... a.data() = AVX::unpacklo_epi16(v0, v1); // a0 a1 ... b.data() = AVX::unpackhi_epi16(v0, v1); // b0 b1 ... } inline void deinterleave(Vector &a, Vector &b) { auto v0 = Mem::shuffle128(a.data(), b.data()); auto v1 = Mem::shuffle128(a.data(), b.data()); auto v2 = AVX::unpacklo_epi16(v0, v1); // a0 a4 ... auto v3 = AVX::unpackhi_epi16(v0, v1); // a2 a6 ... v0 = AVX::unpacklo_epi16(v2, v3); // a0 a2 ... v1 = AVX::unpackhi_epi16(v2, v3); // a1 a3 ... a.data() = AVX::unpacklo_epi16(v0, v1); // a0 a1 ... b.data() = AVX::unpackhi_epi16(v0, v1); // b0 b1 ... } } // namespace AVX2 namespace Detail { template inline void deinterleave(AVX2::float_v &a, AVX2::float_v &b, const float *m, Flags align) { a.load(m, align); b.load(m + AVX2::float_v::Size, align); Vc::AVX2::deinterleave(a, b); } template inline void deinterleave(AVX2::float_v &a, AVX2::float_v &b, const short *m, Flags f) { using namespace Vc::AVX2; const auto tmp = Detail::load32(m, f); a.data() = _mm256_cvtepi32_ps(concat(_mm_srai_epi32(_mm_slli_epi32(lo128(tmp), 16), 16), _mm_srai_epi32(_mm_slli_epi32(hi128(tmp), 16), 16))); b.data() = _mm256_cvtepi32_ps( concat(_mm_srai_epi32(lo128(tmp), 16), _mm_srai_epi32(hi128(tmp), 16))); } template inline void deinterleave(AVX2::float_v &a, AVX2::float_v &b, const unsigned short *m, Flags f) { using namespace Vc::AVX2; const auto tmp = Detail::load32(m, f); a.data() = _mm256_cvtepi32_ps( concat(_mm_blend_epi16(lo128(tmp), _mm_setzero_si128(), 0xaa), _mm_blend_epi16(hi128(tmp), _mm_setzero_si128(), 0xaa))); b.data() = _mm256_cvtepi32_ps( concat(_mm_srli_epi32(lo128(tmp), 16), _mm_srli_epi32(hi128(tmp), 16))); } template inline void deinterleave(AVX2::double_v &a, AVX2::double_v &b, const double *m, Flags align) { using namespace Vc::AVX2; a.load(m, align); b.load(m + AVX2::double_v::Size, align); m256d tmp0 = Mem::shuffle128(a.data(), b.data()); // b1 b0 a1 a0 m256d tmp1 = Mem::shuffle128(a.data(), b.data()); // b3 b2 a3 a2 a.data() = _mm256_unpacklo_pd(tmp0, tmp1); // b2 b0 a2 a0 b.data() = _mm256_unpackhi_pd(tmp0, tmp1); // b3 b1 a3 a1 } template inline void deinterleave(AVX2::int_v &a, AVX2::int_v &b, const int *m, Flags align) { using namespace AVX; a.load(m, align); b.load(m + AVX2::int_v::Size, align); const m256 tmp0 = avx_cast(Mem::shuffle128(a.data(), b.data())); const m256 tmp1 = avx_cast(Mem::shuffle128(a.data(), b.data())); const m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1); // b5 b1 b4 b0 a5 a1 a4 a0 const m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1); // b7 b3 b6 b2 a7 a3 a6 a2 a.data() = avx_cast(_mm256_unpacklo_ps(tmp2, tmp3)); // b6 b4 b2 b0 a6 a4 a2 a0 b.data() = avx_cast(_mm256_unpackhi_ps(tmp2, tmp3)); // b7 b5 b3 b1 a7 a5 a3 a1 } template inline void deinterleave(AVX2::int_v &a, AVX2::int_v &b, const short *m, Flags f) { using namespace Vc::AVX; const AVX2::short_v tmp0(m, f); const m256i tmp = tmp0.data(); a.data() = concat( _mm_srai_epi32(_mm_slli_epi32(lo128(tmp), 16), 16), _mm_srai_epi32(_mm_slli_epi32(hi128(tmp), 16), 16)); b.data() = concat( _mm_srai_epi32(lo128(tmp), 16), _mm_srai_epi32(hi128(tmp), 16)); } template inline void deinterleave(AVX2::uint_v &a, AVX2::uint_v &b, const unsigned int *m, Flags align) { using namespace AVX; a.load(m, align); b.load(m + AVX2::uint_v::Size, align); const m256 tmp0 = avx_cast(Mem::shuffle128(a.data(), b.data())); const m256 tmp1 = avx_cast(Mem::shuffle128(a.data(), b.data())); const m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1); // b5 b1 b4 b0 a5 a1 a4 
a0 const m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1); // b7 b3 b6 b2 a7 a3 a6 a2 a.data() = avx_cast(_mm256_unpacklo_ps(tmp2, tmp3)); // b6 b4 b2 b0 a6 a4 a2 a0 b.data() = avx_cast(_mm256_unpackhi_ps(tmp2, tmp3)); // b7 b5 b3 b1 a7 a5 a3 a1 } template inline void deinterleave(AVX2::uint_v &a, AVX2::uint_v &b, const unsigned short *m, Flags f) { using namespace Vc::AVX; const AVX2::ushort_v tmp0(m, f); const m256i tmp = tmp0.data(); a.data() = concat( _mm_srai_epi32(_mm_slli_epi32(lo128(tmp), 16), 16), _mm_srai_epi32(_mm_slli_epi32(hi128(tmp), 16), 16)); b.data() = concat( _mm_srai_epi32(lo128(tmp), 16), _mm_srai_epi32(hi128(tmp), 16)); } template inline void deinterleave(AVX2::short_v &a, AVX2::short_v &b, const short *m, Flags align) { a.load(m, align); b.load(m + AVX2::short_v::Size, align); Vc::AVX2::deinterleave(a, b); } template inline void deinterleave(AVX2::ushort_v &a, AVX2::ushort_v &b, const unsigned short *m, Flags align) { a.load(m, align); b.load(m + AVX2::ushort_v::Size, align); Vc::AVX2::deinterleave(a, b); } // only support M == V::EntryType -> no specialization template Vc_ALWAYS_INLINE void deinterleave(AVX2::Vector &Vc_RESTRICT a, AVX2::Vector &Vc_RESTRICT b, AVX2::Vector &Vc_RESTRICT c, const M *Vc_RESTRICT memory, Flags align) { using V = AVX2::Vector; a.load(&memory[0 * V::Size], align); b.load(&memory[1 * V::Size], align); c.load(&memory[2 * V::Size], align); Vc::AVX2::deinterleave(a, b, c); } } // namespace Detail } // namespace Vc Vc-1.3.3/avx/detail.h000066400000000000000000003367741320703111200143110ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
}}}*/ #ifndef VC_AVX_DETAIL_H_ #define VC_AVX_DETAIL_H_ #include "../sse/detail.h" #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { namespace Detail { // (converting) load functions {{{1 template Vc_INTRINSIC Vc_PURE __m256 load(const float *x, Flags, LoadTag<__m256, float>, typename Flags::EnableIfAligned = nullptr) { return _mm256_load_ps(x); } template Vc_INTRINSIC Vc_PURE __m256 load(const float *x, Flags, LoadTag<__m256, float>, typename Flags::EnableIfUnaligned = nullptr) { return _mm256_loadu_ps(x); } template Vc_INTRINSIC Vc_PURE __m256 load(const float *x, Flags, LoadTag<__m256, float>, typename Flags::EnableIfStreaming = nullptr) { return AvxIntrinsics::stream_load<__m256>(x); } template Vc_INTRINSIC Vc_PURE __m256d load(const double *x, Flags, LoadTag<__m256d, double>, typename Flags::EnableIfAligned = nullptr) { return _mm256_load_pd(x); } template Vc_INTRINSIC Vc_PURE __m256d load(const double *x, Flags, LoadTag<__m256d, double>, typename Flags::EnableIfUnaligned = nullptr) { return _mm256_loadu_pd(x); } template Vc_INTRINSIC Vc_PURE __m256d load(const double *x, Flags, LoadTag<__m256d, double>, typename Flags::EnableIfStreaming = nullptr) { return AvxIntrinsics::stream_load<__m256d>(x); } template ::value>> Vc_INTRINSIC Vc_PURE __m256i load(const T *x, Flags, LoadTag<__m256i, T>, typename Flags::EnableIfAligned = nullptr) { return _mm256_load_si256(reinterpret_cast(x)); } template ::value>> Vc_INTRINSIC Vc_PURE __m256i load(const T *x, Flags, LoadTag<__m256i, T>, typename Flags::EnableIfUnaligned = nullptr) { return _mm256_loadu_si256(reinterpret_cast(x)); } template ::value>> Vc_INTRINSIC Vc_PURE __m256i load(const T *x, Flags, LoadTag<__m256i, T>, typename Flags::EnableIfStreaming = nullptr) { return AvxIntrinsics::stream_load<__m256i>(x); } // load32{{{2 Vc_INTRINSIC __m256 load32(const float *mem, when_aligned) { return _mm256_load_ps(mem); } Vc_INTRINSIC __m256 load32(const float *mem, when_unaligned) { return _mm256_loadu_ps(mem); } Vc_INTRINSIC __m256 load32(const float *mem, when_streaming) { return AvxIntrinsics::stream_load<__m256>(mem); } Vc_INTRINSIC __m256d load32(const double *mem, when_aligned) { return _mm256_load_pd(mem); } Vc_INTRINSIC __m256d load32(const double *mem, when_unaligned) { return _mm256_loadu_pd(mem); } Vc_INTRINSIC __m256d load32(const double *mem, when_streaming) { return AvxIntrinsics::stream_load<__m256d>(mem); } template Vc_INTRINSIC __m256i load32(const T *mem, when_aligned) { static_assert(std::is_integral::value, "load32 is only intended for integral T"); return _mm256_load_si256(reinterpret_cast(mem)); } template Vc_INTRINSIC __m256i load32(const T *mem, when_unaligned) { static_assert(std::is_integral::value, "load32 is only intended for integral T"); return _mm256_loadu_si256(reinterpret_cast(mem)); } template Vc_INTRINSIC __m256i load32(const T *mem, when_streaming) { static_assert(std::is_integral::value, "load32 is only intended for integral T"); return AvxIntrinsics::stream_load<__m256i>(mem); } // MSVC workarounds{{{2 #ifdef Vc_MSVC // work around: "fatal error C1001: An internal error has occurred in the compiler." 
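// The overloads in this block spell out (memory type, vector type) pairs that the
// generic tag-dispatched load templates above already cover; they exist only because
// MSVC crashed (C1001) on the generic versions and are intended to be functionally
// identical to them.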
Vc_INTRINSIC __m256i load(const uint *mem, when_aligned, LoadTag<__m256i, int>) { return _mm256_load_si256(reinterpret_cast(mem)); } Vc_INTRINSIC __m256d load(const double *mem, when_unaligned, LoadTag<__m256d, double>) { return _mm256_loadu_pd(mem); } template Vc_INTRINSIC __m256 load(const float *mem, when_aligned, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return _mm256_load_ps(mem); } template Vc_INTRINSIC __m256 load(const float *mem, when_unaligned, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return _mm256_loadu_ps(mem); } template Vc_INTRINSIC __m256 load(const float *mem, when_streaming, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return AvxIntrinsics::stream_load<__m256>(mem); } template Vc_INTRINSIC __m256d load(const double *mem, when_aligned, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return _mm256_load_pd(mem); } template Vc_INTRINSIC __m256d load(const double *mem, when_unaligned, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return _mm256_loadu_pd(mem); } template Vc_INTRINSIC __m256d load(const double *mem, when_streaming, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return AvxIntrinsics::stream_load<__m256d>(mem); } template Vc_INTRINSIC __m256i load(const uint *mem, when_aligned, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return _mm256_load_si256(reinterpret_cast(mem)); } template Vc_INTRINSIC __m256i load(const uint *mem, when_unaligned, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return _mm256_loadu_si256(reinterpret_cast(mem)); } template Vc_INTRINSIC __m256i load(const uint *mem, when_streaming, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return AvxIntrinsics::stream_load<__m256i>(mem); } template Vc_INTRINSIC __m256i load(const int *mem, when_unaligned, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return _mm256_loadu_si256(reinterpret_cast(mem)); } template Vc_INTRINSIC __m256i load(const int *mem, when_aligned, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return _mm256_load_si256(reinterpret_cast(mem)); } template Vc_INTRINSIC __m256i load(const int *mem, when_streaming, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return AvxIntrinsics::stream_load<__m256i>(mem); } template Vc_INTRINSIC __m256i load(const short *mem, when_unaligned, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return _mm256_loadu_si256(reinterpret_cast(mem)); } template Vc_INTRINSIC __m256i load(const short *mem, when_aligned, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return _mm256_load_si256(reinterpret_cast(mem)); } template Vc_INTRINSIC __m256i load(const short *mem, when_streaming, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return AvxIntrinsics::stream_load<__m256i>(mem); } template Vc_INTRINSIC __m256i load(const ushort *mem, when_unaligned, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return _mm256_loadu_si256(reinterpret_cast(mem)); } template Vc_INTRINSIC __m256i load(const ushort *mem, when_aligned, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return _mm256_load_si256(reinterpret_cast(mem)); } template Vc_INTRINSIC __m256i load(const ushort *mem, when_streaming, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return 
AvxIntrinsics::stream_load<__m256i>(mem); } #endif // Vc_MSVC // short {{{2 template Vc_INTRINSIC __m256i load(const ushort *mem, Flags f, LoadTag<__m256i, short>) { return load32(mem, f); } template Vc_INTRINSIC __m256i load(const uchar *mem, Flags f, LoadTag<__m256i, short>) { return AVX::cvtepu8_epi16(load16(mem, f)); } template Vc_INTRINSIC __m256i load(const schar *mem, Flags f, LoadTag<__m256i, short>) { return AVX::cvtepi8_epi16(load16(mem, f)); } // ushort {{{2 template Vc_INTRINSIC __m256i load(const uchar *mem, Flags f, LoadTag<__m256i, ushort>) { return AVX::cvtepu8_epi16(load16(mem, f)); } // int {{{2 template Vc_INTRINSIC __m256i load(const uint *mem, Flags f, LoadTag<__m256i, int>) { return load32(mem, f); } template Vc_INTRINSIC __m256i load(const ushort *mem, Flags f, LoadTag<__m256i, int>) { return AVX::cvtepu16_epi32(load16(mem, f)); } template Vc_INTRINSIC __m256i load(const short *mem, Flags f, LoadTag<__m256i, int>) { return AVX::cvtepi16_epi32(load16(mem, f)); } template Vc_INTRINSIC __m256i load(const uchar *mem, Flags, LoadTag<__m256i, int>) { return AVX::cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast(mem))); } template Vc_INTRINSIC __m256i load(const schar *mem, Flags, LoadTag<__m256i, int>) { return AVX::cvtepi8_epi32(_mm_loadl_epi64(reinterpret_cast(mem))); } // uint {{{2 template Vc_INTRINSIC __m256i load(const ushort *mem, Flags f, LoadTag<__m256i, uint>) { return AVX::cvtepu16_epi32(load16(mem, f)); } template Vc_INTRINSIC __m256i load(const uchar *mem, Flags, LoadTag<__m256i, uint>) { return AVX::cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast(mem))); } // double {{{2 template Vc_INTRINSIC __m256d load(const float *mem, Flags f, LoadTag<__m256d, double>) { return AVX::convert(load16(mem, f)); } template Vc_INTRINSIC __m256d load(const uint *mem, Flags f, LoadTag<__m256d, double>) { return AVX::convert(load16(mem, f)); } template Vc_INTRINSIC __m256d load(const int *mem, Flags f, LoadTag<__m256d, double>) { return AVX::convert(load16(mem, f)); } template Vc_INTRINSIC __m256d load(const ushort *mem, Flags f, LoadTag<__m256d, double>) { return AVX::convert(load16(mem, f)); } template Vc_INTRINSIC __m256d load(const short *mem, Flags f, LoadTag<__m256d, double>) { return AVX::convert(load16(mem, f)); } template Vc_INTRINSIC __m256d load(const uchar *mem, Flags f, LoadTag<__m256d, double>) { return AVX::convert(load16(mem, f)); } template Vc_INTRINSIC __m256d load(const schar *mem, Flags f, LoadTag<__m256d, double>) { return AVX::convert(load16(mem, f)); } // float {{{2 template Vc_INTRINSIC __m256 load(const double *mem, Flags f, LoadTag<__m256, float>) { return AVX::concat(_mm256_cvtpd_ps(load32(&mem[0], f)), _mm256_cvtpd_ps(load32(&mem[4], f))); } template Vc_INTRINSIC __m256 load(const uint *mem, Flags f, LoadTag<__m256, float>) { const auto v = load32(mem, f); return _mm256_blendv_ps( _mm256_cvtepi32_ps(v), _mm256_add_ps(_mm256_cvtepi32_ps(AVX::sub_epi32(v, AVX::set2power31_epu32())), AVX::set2power31_ps()), _mm256_castsi256_ps(AVX::cmplt_epi32(v, _mm256_setzero_si256()))); } template Vc_INTRINSIC __m256 load(const int *mem, Flags f, LoadTag<__m256, float>) { return AVX::convert(load32(mem, f)); } template ::value>> Vc_INTRINSIC __m256 load(const T *mem, Flags f, LoadTag<__m256, float>) { return _mm256_cvtepi32_ps(load<__m256i, int>(mem, f)); } template Vc_INTRINSIC __m256 load(const ushort *mem, Flags f, LoadTag<__m256, float>) { return AVX::convert(load16(mem, f)); } template Vc_INTRINSIC __m256 load(const short *mem, Flags f, LoadTag<__m256, float>) { 
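// Converting load: 8 short values are fetched with a single 16-byte load and widened
// in registers to the 8 float lanes of a __m256 (in effect a sign extension to int32
// followed by an int32 -> float conversion).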
return AVX::convert(load16(mem, f)); } /* template struct LoadHelper { static __m256 load(const unsigned char *mem, Flags) { return _mm256_cvtepi32_ps( cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast(mem)))); } }; template struct LoadHelper { static __m256 load(const signed char *mem, Flags) { return _mm256_cvtepi32_ps( cvtepi8_epi32(_mm_loadl_epi64(reinterpret_cast(mem)))); } }; */ // shifted{{{1 template Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount >= 16), T> shifted(T k) { return AVX::avx_cast(AVX::zeroExtend( _mm_srli_si128(AVX::hi128(AVX::avx_cast<__m256i>(k)), amount - 16))); } template Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount > 0 && amount < 16), T> shifted(T k) { return AVX::avx_cast( AVX::alignr(Mem::permute128(AVX::avx_cast<__m256i>(k)), AVX::avx_cast<__m256i>(k))); } template Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount <= -16), T> shifted(T k) { return AVX::avx_cast(Mem::permute128(AVX::avx_cast<__m256i>( _mm_slli_si128(AVX::lo128(AVX::avx_cast<__m256i>(k)), -16 - amount)))); } template Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount > -16 && amount < 0), T> shifted(T k) { return AVX::avx_cast( AVX::alignr<16 + amount>(AVX::avx_cast<__m256i>(k), Mem::permute128(AVX::avx_cast<__m256i>(k)))); } // mask_cast{{{1 template Vc_INTRINSIC Vc_CONST R mask_cast(__m256i k) { static_assert(From == To, "Incorrect mask cast."); static_assert(std::is_same::value, "Incorrect mask cast."); return AVX::avx_cast<__m256>(k); } // 4 -> 4 template <> Vc_INTRINSIC Vc_CONST __m128 mask_cast<4, 4, __m128>(__m256i k) { return AVX::avx_cast<__m128>(_mm_packs_epi32(AVX::lo128(k), AVX::hi128(k))); } template <> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 4, __m256>(__m128i k) { const auto kk = _mm_castsi128_ps(k); return AVX::concat(_mm_unpacklo_ps(kk, kk), _mm_unpackhi_ps(kk, kk)); } // 4 -> 8 template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 8, __m256>(__m256i k) { // aabb ccdd -> abcd 0000 return AVX::avx_cast<__m256>(AVX::concat(_mm_packs_epi32(AVX::lo128(k), AVX::hi128(k)), _mm_setzero_si128())); } template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<4, 8, __m128>(__m256i k) { // aaaa bbbb cccc dddd -> abcd 0000 return AVX::avx_cast<__m128>(_mm_packs_epi16(_mm_packs_epi32(AVX::lo128(k), AVX::hi128(k)), _mm_setzero_si128())); } template <> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 8, __m256>(__m128i k) { return AVX::zeroExtend(AVX::avx_cast<__m128>(k)); } // 4 -> 16 template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 16, __m256>(__m256i k) { // aaaa bbbb cccc dddd -> abcd 0000 0000 0000 return AVX::zeroExtend(mask_cast<4, 8, __m128>(k)); } // 8 -> 4 template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 4, __m256>(__m256i k) { // aabb ccdd eeff gghh -> aaaa bbbb cccc dddd const auto lo = AVX::lo128(AVX::avx_cast<__m256>(k)); return AVX::concat(_mm_unpacklo_ps(lo, lo), _mm_unpackhi_ps(lo, lo)); } template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<8, 4, __m128>(__m256i k) { return AVX::avx_cast<__m128>(AVX::lo128(k)); } template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 4, __m256>(__m128i k) { // abcd efgh -> aaaa bbbb cccc dddd const auto tmp = _mm_unpacklo_epi16(k, k); // aa bb cc dd return AVX::avx_cast<__m256>(AVX::concat(_mm_unpacklo_epi32(tmp, tmp), // aaaa bbbb _mm_unpackhi_epi32(tmp, tmp))); // cccc dddd } // 8 -> 8 template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<8, 8, __m128>(__m256i k) { // aabb ccdd eeff gghh -> abcd efgh return AVX::avx_cast<__m128>(_mm_packs_epi16(AVX::lo128(k), AVX::hi128(k))); } template<> Vc_INTRINSIC Vc_CONST __m256 
mask_cast<8, 8, __m256>(__m128i k) { return AVX::avx_cast<__m256>(AVX::concat(_mm_unpacklo_epi16(k, k), _mm_unpackhi_epi16(k, k))); } // 8 -> 16 template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 16, __m256>(__m256i k) { // aabb ccdd eeff gghh -> abcd efgh 0000 0000 return AVX::zeroExtend(mask_cast<8, 8, __m128>(k)); } // 16 -> 8 #ifdef Vc_IMPL_AVX2 template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<16, 8, __m256>(__m256i k) { // abcd efgh ijkl mnop -> aabb ccdd eeff gghh const auto flipped = Mem::permute4x64(k); return _mm256_castsi256_ps(AVX::unpacklo_epi16(flipped, flipped)); } #endif // 16 -> 4 template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<16, 4, __m256>(__m256i k) { // abcd efgh ijkl mnop -> aaaa bbbb cccc dddd const auto tmp = _mm_unpacklo_epi16(AVX::lo128(k), AVX::lo128(k)); // aabb ccdd return _mm256_castsi256_ps(AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp))); } // allone{{{1 template<> Vc_INTRINSIC Vc_CONST __m256 allone<__m256 >() { return AVX::setallone_ps(); } template<> Vc_INTRINSIC Vc_CONST __m256i allone<__m256i>() { return AVX::setallone_si256(); } template<> Vc_INTRINSIC Vc_CONST __m256d allone<__m256d>() { return AVX::setallone_pd(); } // zero{{{1 template<> Vc_INTRINSIC Vc_CONST __m256 zero<__m256 >() { return _mm256_setzero_ps(); } template<> Vc_INTRINSIC Vc_CONST __m256i zero<__m256i>() { return _mm256_setzero_si256(); } template<> Vc_INTRINSIC Vc_CONST __m256d zero<__m256d>() { return _mm256_setzero_pd(); } // one{{{1 Vc_INTRINSIC Vc_CONST __m256 one( float) { return AVX::setone_ps (); } Vc_INTRINSIC Vc_CONST __m256d one(double) { return AVX::setone_pd (); } Vc_INTRINSIC Vc_CONST __m256i one( int) { return AVX::setone_epi32(); } Vc_INTRINSIC Vc_CONST __m256i one( uint) { return AVX::setone_epu32(); } Vc_INTRINSIC Vc_CONST __m256i one( short) { return AVX::setone_epi16(); } Vc_INTRINSIC Vc_CONST __m256i one(ushort) { return AVX::setone_epu16(); } Vc_INTRINSIC Vc_CONST __m256i one( schar) { return AVX::setone_epi8 (); } Vc_INTRINSIC Vc_CONST __m256i one( uchar) { return AVX::setone_epu8 (); } // negate{{{1 Vc_ALWAYS_INLINE Vc_CONST __m256 negate(__m256 v, std::integral_constant) { return _mm256_xor_ps(v, AVX::setsignmask_ps()); } Vc_ALWAYS_INLINE Vc_CONST __m256d negate(__m256d v, std::integral_constant) { return _mm256_xor_pd(v, AVX::setsignmask_pd()); } Vc_ALWAYS_INLINE Vc_CONST __m256i negate(__m256i v, std::integral_constant) { return AVX::sign_epi32(v, Detail::allone<__m256i>()); } Vc_ALWAYS_INLINE Vc_CONST __m256i negate(__m256i v, std::integral_constant) { return AVX::sign_epi16(v, Detail::allone<__m256i>()); } // xor_{{{1 Vc_INTRINSIC __m256 xor_(__m256 a, __m256 b) { return _mm256_xor_ps(a, b); } Vc_INTRINSIC __m256d xor_(__m256d a, __m256d b) { return _mm256_xor_pd(a, b); } Vc_INTRINSIC __m256i xor_(__m256i a, __m256i b) { #ifdef Vc_IMPL_AVX2 return _mm256_xor_si256(a, b); #else return _mm256_castps_si256( _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); #endif } // or_{{{1 Vc_INTRINSIC __m256 or_(__m256 a, __m256 b) { return _mm256_or_ps(a, b); } Vc_INTRINSIC __m256d or_(__m256d a, __m256d b) { return _mm256_or_pd(a, b); } Vc_INTRINSIC __m256i or_(__m256i a, __m256i b) { #ifdef Vc_IMPL_AVX2 return _mm256_or_si256(a, b); #else return _mm256_castps_si256( _mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); #endif } // and_{{{1 Vc_INTRINSIC __m256 and_(__m256 a, __m256 b) { return _mm256_and_ps(a, b); } Vc_INTRINSIC __m256d and_(__m256d a, __m256d b) { return _mm256_and_pd(a, b); } Vc_INTRINSIC __m256i 
and_(__m256i a, __m256i b) { #ifdef Vc_IMPL_AVX2 return _mm256_and_si256(a, b); #else return _mm256_castps_si256( _mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); #endif } // andnot_{{{1 Vc_INTRINSIC __m256 andnot_(__m256 a, __m256 b) { return _mm256_andnot_ps(a, b); } Vc_INTRINSIC __m256d andnot_(__m256d a, __m256d b) { return _mm256_andnot_pd(a, b); } Vc_INTRINSIC __m256i andnot_(__m256i a, __m256i b) { #ifdef Vc_IMPL_AVX2 return _mm256_andnot_si256(a, b); #else return _mm256_castps_si256( _mm256_andnot_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); #endif } // not_{{{1 Vc_INTRINSIC __m256 not_(__m256 a) { return andnot_(a, allone<__m256 >()); } Vc_INTRINSIC __m256d not_(__m256d a) { return andnot_(a, allone<__m256d>()); } Vc_INTRINSIC __m256i not_(__m256i a) { return andnot_(a, allone<__m256i>()); } // blend{{{1 Vc_INTRINSIC __m256 blend(__m256 a, __m256 b, __m256 c) { return _mm256_blendv_ps(a, b, c); } Vc_INTRINSIC __m256d blend(__m256d a, __m256d b, __m256d c) { return _mm256_blendv_pd(a, b, c); } Vc_INTRINSIC __m256i blend(__m256i a, __m256i b, __m256i c) { return AVX::blendv_epi8(a, b, c); } // abs{{{1 Vc_INTRINSIC __m256 abs(__m256 a, float) { return and_(a, AVX::setabsmask_ps()); } Vc_INTRINSIC __m256d abs(__m256d a, double) { return and_(a, AVX::setabsmask_pd()); } Vc_INTRINSIC __m256i abs(__m256i a, int) { return AVX::abs_epi32(a); } Vc_INTRINSIC __m256i abs(__m256i a, uint) { return a; } Vc_INTRINSIC __m256i abs(__m256i a, short) { return AVX::abs_epi16(a); } Vc_INTRINSIC __m256i abs(__m256i a, ushort) { return a; } Vc_INTRINSIC __m256i abs(__m256i a, schar) { return AVX::abs_epi8 (a); } Vc_INTRINSIC __m256i abs(__m256i a, uchar) { return a; } // add{{{1 Vc_INTRINSIC __m256 add(__m256 a, __m256 b, float) { return _mm256_add_ps(a, b); } Vc_INTRINSIC __m256d add(__m256d a, __m256d b, double) { return _mm256_add_pd(a, b); } Vc_INTRINSIC __m256i add(__m256i a, __m256i b, int) { return AVX::add_epi32(a, b); } Vc_INTRINSIC __m256i add(__m256i a, __m256i b, uint) { return AVX::add_epi32(a, b); } Vc_INTRINSIC __m256i add(__m256i a, __m256i b, short) { return AVX::add_epi16(a, b); } Vc_INTRINSIC __m256i add(__m256i a, __m256i b, ushort) { return AVX::add_epi16(a, b); } // sub{{{1 Vc_INTRINSIC __m256 sub(__m256 a, __m256 b, float) { return _mm256_sub_ps(a, b); } Vc_INTRINSIC __m256d sub(__m256d a, __m256d b, double) { return _mm256_sub_pd(a, b); } Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, int) { return AVX::sub_epi32(a, b); } Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, uint) { return AVX::sub_epi32(a, b); } Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, short) { return AVX::sub_epi16(a, b); } Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, ushort) { return AVX::sub_epi16(a, b); } // mul{{{1 Vc_INTRINSIC __m256 mul(__m256 a, __m256 b, float) { return _mm256_mul_ps(a, b); } Vc_INTRINSIC __m256d mul(__m256d a, __m256d b, double) { return _mm256_mul_pd(a, b); } Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, int) { return AVX::mullo_epi32(a, b); } Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, uint) { return AVX::mullo_epi32(a, b); } Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, short) { return AVX::mullo_epi16(a, b); } Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, ushort) { return AVX::mullo_epi16(a, b); } // mul{{{1 Vc_INTRINSIC __m256 div(__m256 a, __m256 b, float) { return _mm256_div_ps(a, b); } Vc_INTRINSIC __m256d div(__m256d a, __m256d b, double) { return _mm256_div_pd(a, b); } Vc_INTRINSIC __m256i div(__m256i a, __m256i b, int) { using namespace AVX; 
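// AVX/AVX2 provide no packed 32-bit integer division, so each 128-bit half is widened
// to double, divided, and truncated back. double represents every int32 exactly and the
// rounded quotient never crosses an integer boundary, so the truncation matches C++
// integer division. Scalar equivalent per lane, for illustration only:
//   int32_t q = int32_t(double(a) / double(b));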
const __m256d lo1 = _mm256_cvtepi32_pd(lo128(a)); const __m256d lo2 = _mm256_cvtepi32_pd(lo128(b)); const __m256d hi1 = _mm256_cvtepi32_pd(hi128(a)); const __m256d hi2 = _mm256_cvtepi32_pd(hi128(b)); return concat(_mm256_cvttpd_epi32(_mm256_div_pd(lo1, lo2)), _mm256_cvttpd_epi32(_mm256_div_pd(hi1, hi2))); } Vc_INTRINSIC __m256i div(__m256i a, __m256i b, uint) { // SSE/AVX only has signed int conversion to doubles. Therefore we first adjust the input before // conversion and take the adjustment back after the conversion. // It could be argued that for b this is not really important because division by a b >= 2^31 is // useless. But for full correctness it cannot be ignored. using namespace AVX; const __m256i aa = add_epi32(a, set1_epi32(-2147483648)); const __m256i bb = add_epi32(b, set1_epi32(-2147483648)); const __m256d loa = _mm256_add_pd(_mm256_cvtepi32_pd(lo128(aa)), set1_pd(2147483648.)); const __m256d hia = _mm256_add_pd(_mm256_cvtepi32_pd(hi128(aa)), set1_pd(2147483648.)); const __m256d lob = _mm256_add_pd(_mm256_cvtepi32_pd(lo128(bb)), set1_pd(2147483648.)); const __m256d hib = _mm256_add_pd(_mm256_cvtepi32_pd(hi128(bb)), set1_pd(2147483648.)); // there is one remaining problem: a >= 2^31 and b == 1 // in that case the return value would be 2^31 return avx_cast<__m256i>(_mm256_blendv_ps( avx_cast<__m256>(concat(_mm256_cvttpd_epi32(_mm256_div_pd(loa, lob)), _mm256_cvttpd_epi32(_mm256_div_pd(hia, hib)))), avx_cast<__m256>(a), avx_cast<__m256>(cmpeq_epi32(b, setone_epi32())))); } Vc_INTRINSIC __m256i div(__m256i a, __m256i b, short) { using namespace AVX; const __m256 lo = _mm256_div_ps(convert(lo128(a)), convert(lo128(b))); const __m256 hi = _mm256_div_ps(convert(hi128(a)), convert(hi128(b))); return concat(convert(lo), convert(hi)); } // horizontal add{{{1 template Vc_INTRINSIC T add(Common::IntrinsicType a, T) { return {add(add(AVX::lo128(a), AVX::hi128(a), T()), T())}; } // horizontal mul{{{1 template Vc_INTRINSIC T mul(Common::IntrinsicType a, T) { return {mul(mul(AVX::lo128(a), AVX::hi128(a), T()), T())}; } // horizontal min{{{1 template Vc_INTRINSIC T min(Common::IntrinsicType a, T) { return {min(min(AVX::lo128(a), AVX::hi128(a), T()), T())}; } // horizontal max{{{1 template Vc_INTRINSIC T max(Common::IntrinsicType a, T) { return {max(max(AVX::lo128(a), AVX::hi128(a), T()), T())}; } // cmpeq{{{1 Vc_INTRINSIC __m256 cmpeq(__m256 a, __m256 b, float) { return AvxIntrinsics::cmpeq_ps(a, b); } Vc_INTRINSIC __m256d cmpeq(__m256d a, __m256d b, double) { return AvxIntrinsics::cmpeq_pd(a, b); } Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, int) { return AvxIntrinsics::cmpeq_epi32(a, b); } Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, uint) { return AvxIntrinsics::cmpeq_epi32(a, b); } Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, short) { return AvxIntrinsics::cmpeq_epi16(a, b); } Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, ushort) { return AvxIntrinsics::cmpeq_epi16(a, b); } // cmpneq{{{1 Vc_INTRINSIC __m256 cmpneq(__m256 a, __m256 b, float) { return AvxIntrinsics::cmpneq_ps(a, b); } Vc_INTRINSIC __m256d cmpneq(__m256d a, __m256d b, double) { return AvxIntrinsics::cmpneq_pd(a, b); } Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, int) { return not_(AvxIntrinsics::cmpeq_epi32(a, b)); } Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, uint) { return not_(AvxIntrinsics::cmpeq_epi32(a, b)); } Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, short) { return not_(AvxIntrinsics::cmpeq_epi16(a, b)); } Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, ushort) { return 
not_(AvxIntrinsics::cmpeq_epi16(a, b)); } Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, schar) { return not_(AvxIntrinsics::cmpeq_epi8 (a, b)); } Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, uchar) { return not_(AvxIntrinsics::cmpeq_epi8 (a, b)); } // cmpgt{{{1 Vc_INTRINSIC __m256 cmpgt(__m256 a, __m256 b, float) { return AVX::cmpgt_ps(a, b); } Vc_INTRINSIC __m256d cmpgt(__m256d a, __m256d b, double) { return AVX::cmpgt_pd(a, b); } Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, int) { return AVX::cmpgt_epi32(a, b); } Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, uint) { return AVX::cmpgt_epu32(a, b); } Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, short) { return AVX::cmpgt_epi16(a, b); } Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, ushort) { return AVX::cmpgt_epu16(a, b); } Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, schar) { return AVX::cmpgt_epi8 (a, b); } Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, uchar) { return AVX::cmpgt_epu8 (a, b); } // cmpge{{{1 Vc_INTRINSIC __m256 cmpge(__m256 a, __m256 b, float) { return AVX::cmpge_ps(a, b); } Vc_INTRINSIC __m256d cmpge(__m256d a, __m256d b, double) { return AVX::cmpge_pd(a, b); } Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, int) { return not_(AVX::cmpgt_epi32(b, a)); } Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, uint) { return not_(AVX::cmpgt_epu32(b, a)); } Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, short) { return not_(AVX::cmpgt_epi16(b, a)); } Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, ushort) { return not_(AVX::cmpgt_epu16(b, a)); } Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, schar) { return not_(AVX::cmpgt_epi8 (b, a)); } Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, uchar) { return not_(AVX::cmpgt_epu8 (b, a)); } // cmple{{{1 Vc_INTRINSIC __m256 cmple(__m256 a, __m256 b, float) { return AVX::cmple_ps(a, b); } Vc_INTRINSIC __m256d cmple(__m256d a, __m256d b, double) { return AVX::cmple_pd(a, b); } Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, int) { return not_(AVX::cmpgt_epi32(a, b)); } Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, uint) { return not_(AVX::cmpgt_epu32(a, b)); } Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, short) { return not_(AVX::cmpgt_epi16(a, b)); } Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, ushort) { return not_(AVX::cmpgt_epu16(a, b)); } Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, schar) { return not_(AVX::cmpgt_epi8 (a, b)); } Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, uchar) { return not_(AVX::cmpgt_epu8 (a, b)); } // cmplt{{{1 Vc_INTRINSIC __m256 cmplt(__m256 a, __m256 b, float) { return AVX::cmplt_ps(a, b); } Vc_INTRINSIC __m256d cmplt(__m256d a, __m256d b, double) { return AVX::cmplt_pd(a, b); } Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, int) { return AVX::cmpgt_epi32(b, a); } Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, uint) { return AVX::cmpgt_epu32(b, a); } Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, short) { return AVX::cmpgt_epi16(b, a); } Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, ushort) { return AVX::cmpgt_epu16(b, a); } Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, schar) { return AVX::cmpgt_epi8 (b, a); } Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, uchar) { return AVX::cmpgt_epu8 (b, a); } // fma{{{1 Vc_INTRINSIC __m256 fma(__m256 a, __m256 b, __m256 c, float) { #ifdef Vc_IMPL_FMA4 return _mm256_macc_ps(a, b, c); #elif defined Vc_IMPL_FMA return _mm256_fmadd_ps(a, b, c); #else using namespace AVX; __m256d v1_0 = _mm256_cvtps_pd(lo128(a)); __m256d v1_1 = _mm256_cvtps_pd(hi128(a)); 
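// Fallback without an FMA instruction: the float inputs are widened to double, where
// the product of two floats is exact (24 + 24 significand bits fit into 53), so only
// the addition of c and the final conversion back to float round the result, which is
// much closer to a fused multiply-add than evaluating a * b + c directly in float.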
__m256d v2_0 = _mm256_cvtps_pd(lo128(b)); __m256d v2_1 = _mm256_cvtps_pd(hi128(b)); __m256d v3_0 = _mm256_cvtps_pd(lo128(c)); __m256d v3_1 = _mm256_cvtps_pd(hi128(c)); return concat(_mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_0, v2_0), v3_0)), _mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_1, v2_1), v3_1))); #endif } Vc_INTRINSIC __m256d fma(__m256d a, __m256d b, __m256d c, double) { #ifdef Vc_IMPL_FMA4 return _mm256_macc_pd(a, b, c); #elif defined Vc_IMPL_FMA return _mm256_fmadd_pd(a, b, c); #else using namespace AVX; __m256d h1 = and_(a, _mm256_broadcast_sd(reinterpret_cast( &c_general::highMaskDouble))); __m256d h2 = and_(b, _mm256_broadcast_sd(reinterpret_cast( &c_general::highMaskDouble))); const __m256d l1 = _mm256_sub_pd(a, h1); const __m256d l2 = _mm256_sub_pd(b, h2); const __m256d ll = mul(l1, l2, double()); const __m256d lh = add(mul(l1, h2, double()), mul(h1, l2, double()), double()); const __m256d hh = mul(h1, h2, double()); // ll < lh < hh for all entries is certain const __m256d lh_lt_v3 = cmplt(abs(lh, double()), abs(c, double()), double()); // |lh| < |c| const __m256d x = _mm256_blendv_pd(c, lh, lh_lt_v3); const __m256d y = _mm256_blendv_pd(lh, c, lh_lt_v3); return add(add(ll, x, double()), add(y, hh, double()), double()); #endif } template Vc_INTRINSIC __m256i fma(__m256i a, __m256i b, __m256i c, T) { return add(mul(a, b, T()), c, T()); } // shiftRight{{{1 template Vc_INTRINSIC __m256i shiftRight(__m256i a, int) { return AVX::srai_epi32(a); } template Vc_INTRINSIC __m256i shiftRight(__m256i a, uint) { return AVX::srli_epi32(a); } template Vc_INTRINSIC __m256i shiftRight(__m256i a, short) { return AVX::srai_epi16(a); } template Vc_INTRINSIC __m256i shiftRight(__m256i a, ushort) { return AVX::srli_epi16(a); } //template Vc_INTRINSIC __m256i shiftRight(__m256i a, schar) { return AVX::srai_epi8 (a); } //template Vc_INTRINSIC __m256i shiftRight(__m256i a, uchar) { return AVX::srli_epi8 (a); } Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, int) { return AVX::sra_epi32(a, _mm_cvtsi32_si128(shift)); } Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, uint) { return AVX::srl_epi32(a, _mm_cvtsi32_si128(shift)); } Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, short) { return AVX::sra_epi16(a, _mm_cvtsi32_si128(shift)); } Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, ushort) { return AVX::srl_epi16(a, _mm_cvtsi32_si128(shift)); } //Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, schar) { return AVX::sra_epi8 (a, _mm_cvtsi32_si128(shift)); } //Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, uchar) { return AVX::srl_epi8 (a, _mm_cvtsi32_si128(shift)); } // shiftLeft{{{1 template Vc_INTRINSIC __m256i shiftLeft(__m256i a, int) { return AVX::slli_epi32(a); } template Vc_INTRINSIC __m256i shiftLeft(__m256i a, uint) { return AVX::slli_epi32(a); } template Vc_INTRINSIC __m256i shiftLeft(__m256i a, short) { return AVX::slli_epi16(a); } template Vc_INTRINSIC __m256i shiftLeft(__m256i a, ushort) { return AVX::slli_epi16(a); } //template Vc_INTRINSIC __m256i shiftLeft(__m256i a, schar) { return AVX::slli_epi8 (a); } //template Vc_INTRINSIC __m256i shiftLeft(__m256i a, uchar) { return AVX::slli_epi8 (a); } Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, int) { return AVX::sll_epi32(a, _mm_cvtsi32_si128(shift)); } Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, uint) { return AVX::sll_epi32(a, _mm_cvtsi32_si128(shift)); } Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, short) { return AVX::sll_epi16(a, _mm_cvtsi32_si128(shift)); } 
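// For left shifts the arithmetic/logical distinction is irrelevant, so the signed and
// unsigned overloads use the same slli/sll intrinsics; only shiftRight above needs the
// sra (arithmetic, sign-preserving) versus srl (logical, zero-filling) split.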
Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, ushort) { return AVX::sll_epi16(a, _mm_cvtsi32_si128(shift)); } //Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, schar) { return AVX::sll_epi8 (a, _mm_cvtsi32_si128(shift)); } //Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, uchar) { return AVX::sll_epi8 (a, _mm_cvtsi32_si128(shift)); } // zeroExtendIfNeeded{{{1 Vc_INTRINSIC __m256 zeroExtendIfNeeded(__m256 x) { return x; } Vc_INTRINSIC __m256d zeroExtendIfNeeded(__m256d x) { return x; } Vc_INTRINSIC __m256i zeroExtendIfNeeded(__m256i x) { return x; } Vc_INTRINSIC __m256 zeroExtendIfNeeded(__m128 x) { return AVX::zeroExtend(x); } Vc_INTRINSIC __m256d zeroExtendIfNeeded(__m128d x) { return AVX::zeroExtend(x); } Vc_INTRINSIC __m256i zeroExtendIfNeeded(__m128i x) { return AVX::zeroExtend(x); } // broadcast{{{1 Vc_INTRINSIC __m256 avx_broadcast( float x) { return _mm256_set1_ps(x); } Vc_INTRINSIC __m256d avx_broadcast(double x) { return _mm256_set1_pd(x); } Vc_INTRINSIC __m256i avx_broadcast( int x) { return _mm256_set1_epi32(x); } Vc_INTRINSIC __m256i avx_broadcast( uint x) { return _mm256_set1_epi32(x); } Vc_INTRINSIC __m256i avx_broadcast( short x) { return _mm256_set1_epi16(x); } Vc_INTRINSIC __m256i avx_broadcast(ushort x) { return _mm256_set1_epi16(x); } Vc_INTRINSIC __m256i avx_broadcast( char x) { return _mm256_set1_epi8(x); } Vc_INTRINSIC __m256i avx_broadcast( schar x) { return _mm256_set1_epi8(x); } Vc_INTRINSIC __m256i avx_broadcast( uchar x) { return _mm256_set1_epi8(x); } // sorted{{{1 template = AVXImpl && Impl <= AVX2Impl)>> Vc_CONST_L AVX2::Vector sorted(AVX2::Vector x) Vc_CONST_R; template Vc_INTRINSIC Vc_CONST AVX2::Vector sorted(AVX2::Vector x) { return sorted(x); } // shifted{{{1 template static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32), V> shifted(V v, int amount) { using namespace AVX; constexpr int S = sizeof(T); switch (amount) { case 0: return v; case 1: return shifted( 1 * S)>(v); case 2: return shifted( 2 * S)>(v); case 3: return shifted( 3 * S)>(v); case -1: return shifted(-1 * S)>(v); case -2: return shifted(-2 * S)>(v); case -3: return shifted(-3 * S)>(v); } if (sizeof(T) <= 4) { switch (amount) { case 4: return shifted( 4 * S)>(v); case 5: return shifted( 5 * S)>(v); case 6: return shifted( 6 * S)>(v); case 7: return shifted( 7 * S)>(v); case -4: return shifted(-4 * S)>(v); case -5: return shifted(-5 * S)>(v); case -6: return shifted(-6 * S)>(v); case -7: return shifted(-7 * S)>(v); } if (sizeof(T) <= 2) { switch (amount) { case 8: return shifted( 8 * S)>(v); case 9: return shifted( 9 * S)>(v); case 10: return shifted( 10 * S)>(v); case 11: return shifted( 11 * S)>(v); case 12: return shifted( 12 * S)>(v); case 13: return shifted( 13 * S)>(v); case 14: return shifted( 14 * S)>(v); case 15: return shifted( 15 * S)>(v); case -8: return shifted(- 8 * S)>(v); case -9: return shifted(- 9 * S)>(v); case -10: return shifted(-10 * S)>(v); case -11: return shifted(-11 * S)>(v); case -12: return shifted(-12 * S)>(v); case -13: return shifted(-13 * S)>(v); case -14: return shifted(-14 * S)>(v); case -15: return shifted(-15 * S)>(v); } if (sizeof(T) == 1) { switch (amount) { case 16: return shifted( 16)>(v); case 17: return shifted( 17)>(v); case 18: return shifted( 18)>(v); case 19: return shifted( 19)>(v); case 20: return shifted( 20)>(v); case 21: return shifted( 21)>(v); case 22: return shifted( 22)>(v); case 23: return shifted( 23)>(v); case 24: return shifted( 24)>(v); case 25: return shifted( 25)>(v); case 26: return shifted( 26)>(v); case 27: 
return shifted( 27)>(v); case 28: return shifted( 28)>(v); case 29: return shifted( 29)>(v); case 30: return shifted( 30)>(v); case 31: return shifted( 31)>(v); case -16: return shifted(-16)>(v); case -17: return shifted(-17)>(v); case -18: return shifted(-18)>(v); case -19: return shifted(-19)>(v); case -20: return shifted(-20)>(v); case -21: return shifted(-21)>(v); case -22: return shifted(-22)>(v); case -23: return shifted(-23)>(v); case -24: return shifted(-24)>(v); case -25: return shifted(-25)>(v); case -26: return shifted(-26)>(v); case -27: return shifted(-27)>(v); case -28: return shifted(-28)>(v); case -29: return shifted(-29)>(v); case -30: return shifted(-30)>(v); case -31: return shifted(-31)>(v); } } } } return avx_cast(_mm256_setzero_ps()); } template static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 16), V> shifted(V v, int amount) { using namespace AVX; switch (amount) { case 0: return v; case 1: return avx_cast(_mm_srli_si128(avx_cast<__m128i>(v), sanitize(1 * sizeof(T)))); case 2: return avx_cast(_mm_srli_si128(avx_cast<__m128i>(v), sanitize(2 * sizeof(T)))); case 3: return avx_cast(_mm_srli_si128(avx_cast<__m128i>(v), sanitize(3 * sizeof(T)))); case -1: return avx_cast(_mm_slli_si128(avx_cast<__m128i>(v), sanitize(1 * sizeof(T)))); case -2: return avx_cast(_mm_slli_si128(avx_cast<__m128i>(v), sanitize(2 * sizeof(T)))); case -3: return avx_cast(_mm_slli_si128(avx_cast<__m128i>(v), sanitize(3 * sizeof(T)))); } if (sizeof(T) <= 2) { switch (amount) { case 4: return avx_cast(_mm_srli_si128(avx_cast<__m128i>(v), sanitize(4 * sizeof(T)))); case 5: return avx_cast(_mm_srli_si128(avx_cast<__m128i>(v), sanitize(5 * sizeof(T)))); case 6: return avx_cast(_mm_srli_si128(avx_cast<__m128i>(v), sanitize(6 * sizeof(T)))); case 7: return avx_cast(_mm_srli_si128(avx_cast<__m128i>(v), sanitize(7 * sizeof(T)))); case -4: return avx_cast(_mm_slli_si128(avx_cast<__m128i>(v), sanitize(4 * sizeof(T)))); case -5: return avx_cast(_mm_slli_si128(avx_cast<__m128i>(v), sanitize(5 * sizeof(T)))); case -6: return avx_cast(_mm_slli_si128(avx_cast<__m128i>(v), sanitize(6 * sizeof(T)))); case -7: return avx_cast(_mm_slli_si128(avx_cast<__m128i>(v), sanitize(7 * sizeof(T)))); } } return avx_cast(_mm_setzero_ps()); } // rotated{{{1 template static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32 && N == 4), V> rotated(V v, int amount) { using namespace AVX; const __m128i vLo = avx_cast<__m128i>(lo128(v)); const __m128i vHi = avx_cast<__m128i>(hi128(v)); switch (static_cast(amount) % N) { case 0: return v; case 1: return avx_cast(concat(SSE::alignr_epi8(vHi, vLo), SSE::alignr_epi8(vLo, vHi))); case 2: return Mem::permute128(v); case 3: return avx_cast(concat(SSE::alignr_epi8(vLo, vHi), SSE::alignr_epi8(vHi, vLo))); } return avx_cast(_mm256_setzero_ps()); } template static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32 && N == 8), V> rotated(V v, int amount) { using namespace AVX; const __m128i vLo = avx_cast<__m128i>(lo128(v)); const __m128i vHi = avx_cast<__m128i>(hi128(v)); switch (static_cast(amount) % N) { case 0: return v; case 1: return avx_cast(concat(SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo), SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi))); case 2: return avx_cast(concat(SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo), SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi))); case 3: return avx_cast(concat(SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo), SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi))); case 4: return Mem::permute128(v); case 5: return avx_cast(concat(SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi), 
SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo))); case 6: return avx_cast(concat(SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi), SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo))); case 7: return avx_cast(concat(SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi), SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo))); } return avx_cast(_mm256_setzero_ps()); } #ifdef Vc_IMPL_AVX2 template static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32 && N == 16), V> rotated( V v, int amount) { using namespace AVX; const __m128i vLo = avx_cast<__m128i>(lo128(v)); const __m128i vHi = avx_cast<__m128i>(hi128(v)); switch (static_cast(amount) % N) { case 0: return v; case 1: return avx_cast(concat(SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo), SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi))); case 2: return avx_cast(concat(SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo), SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi))); case 3: return avx_cast(concat(SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo), SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi))); case 4: return Mem::permute4x64(v); case 5: return avx_cast(concat(SSE::alignr_epi8<5 * sizeof(T)>(vHi, vLo), SSE::alignr_epi8<5 * sizeof(T)>(vLo, vHi))); case 6: return avx_cast(concat(SSE::alignr_epi8<6 * sizeof(T)>(vHi, vLo), SSE::alignr_epi8<6 * sizeof(T)>(vLo, vHi))); case 7: return avx_cast(concat(SSE::alignr_epi8<7 * sizeof(T)>(vHi, vLo), SSE::alignr_epi8<7 * sizeof(T)>(vLo, vHi))); case 8: return Mem::permute128(v); case 9: return avx_cast(concat(SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi), SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo))); case 10: return avx_cast(concat(SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi), SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo))); case 11: return avx_cast(concat(SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi), SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo))); case 12: return Mem::permute4x64(v); case 13: return avx_cast(concat(SSE::alignr_epi8<5 * sizeof(T)>(vLo, vHi), SSE::alignr_epi8<5 * sizeof(T)>(vHi, vLo))); case 14: return avx_cast(concat(SSE::alignr_epi8<6 * sizeof(T)>(vLo, vHi), SSE::alignr_epi8<6 * sizeof(T)>(vHi, vLo))); case 15: return avx_cast(concat(SSE::alignr_epi8<7 * sizeof(T)>(vLo, vHi), SSE::alignr_epi8<7 * sizeof(T)>(vHi, vLo))); } return avx_cast(_mm256_setzero_ps()); } #endif // Vc_IMPL_AVX2 // testc{{{1 Vc_INTRINSIC Vc_CONST int testc(__m128 a, __m128 b) { return _mm_testc_si128(_mm_castps_si128(a), _mm_castps_si128(b)); } Vc_INTRINSIC Vc_CONST int testc(__m256 a, __m256 b) { return _mm256_testc_ps(a, b); } Vc_INTRINSIC Vc_CONST int testc(__m256d a, __m256d b) { return _mm256_testc_pd(a, b); } Vc_INTRINSIC Vc_CONST int testc(__m256i a, __m256i b) { return _mm256_testc_si256(a, b); } // testz{{{1 Vc_INTRINSIC Vc_CONST int testz(__m128 a, __m128 b) { return _mm_testz_si128(_mm_castps_si128(a), _mm_castps_si128(b)); } Vc_INTRINSIC Vc_CONST int testz(__m256 a, __m256 b) { return _mm256_testz_ps(a, b); } Vc_INTRINSIC Vc_CONST int testz(__m256d a, __m256d b) { return _mm256_testz_pd(a, b); } Vc_INTRINSIC Vc_CONST int testz(__m256i a, __m256i b) { return _mm256_testz_si256(a, b); } // testnzc{{{1 Vc_INTRINSIC Vc_CONST int testnzc(__m128 a, __m128 b) { return _mm_testnzc_si128(_mm_castps_si128(a), _mm_castps_si128(b)); } Vc_INTRINSIC Vc_CONST int testnzc(__m256 a, __m256 b) { return _mm256_testnzc_ps(a, b); } Vc_INTRINSIC Vc_CONST int testnzc(__m256d a, __m256d b) { return _mm256_testnzc_pd(a, b); } Vc_INTRINSIC Vc_CONST int testnzc(__m256i a, __m256i b) { return _mm256_testnzc_si256(a, b); } // movemask{{{1 Vc_INTRINSIC Vc_CONST int movemask(__m256i a) { return AVX::movemask_epi8(a); } 
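// The testc/testz/testnzc helpers above and the movemask overloads in this group reduce
// a vector mask to a scalar result. As an illustration only (this sketch is not part of
// Vc and is excluded from compilation; the helper names are made up, and it assumes the
// mask holds all-ones or all-zeros per element, as comparison results do), the same idea
// written directly against the AVX intrinsics:
#if 0
#include <immintrin.h>

static inline bool none_set(__m256 mask)     // true iff no element is selected
{
    // ZF of VTESTPS: sign bits of (mask & mask) are all zero
    return _mm256_testz_ps(mask, mask) != 0;
}

static inline bool all_set(__m256 mask)      // true iff every element is selected
{
    // CF of VTESTPS: sign bits of (~mask & ones) are all zero,
    // i.e. every sign bit of mask is set
    const __m256 ones = _mm256_castsi256_ps(_mm256_set1_epi32(-1));
    return _mm256_testc_ps(mask, ones) != 0;
}

static inline int selected_bits(__m256 mask) // bit i is the sign bit of element i
{
    return _mm256_movemask_ps(mask);
}
#endif
// Reductions of this kind are what the higher-level mask queries (empty, full, bit
// pattern) are built from.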
Vc_INTRINSIC Vc_CONST int movemask(__m128i a) { return _mm_movemask_epi8(a); } Vc_INTRINSIC Vc_CONST int movemask(__m256d a) { return _mm256_movemask_pd(a); } Vc_INTRINSIC Vc_CONST int movemask(__m128d a) { return _mm_movemask_pd(a); } Vc_INTRINSIC Vc_CONST int movemask(__m256 a) { return _mm256_movemask_ps(a); } Vc_INTRINSIC Vc_CONST int movemask(__m128 a) { return _mm_movemask_ps(a); } // mask_store{{{1 template Vc_INTRINSIC void mask_store(__m256i k, bool *mem, Flags) { static_assert( N == 4 || N == 8 || N == 16, "mask_store(__m256i, bool *) is only implemented for 4, 8, and 16 entries"); switch (N) { case 4: *reinterpret_cast *>(mem) = (_mm_movemask_epi8(AVX::lo128(k)) | (_mm_movemask_epi8(AVX::hi128(k)) << 16)) & 0x01010101; break; case 8: { const auto k2 = _mm_srli_epi16(_mm_packs_epi16(AVX::lo128(k), AVX::hi128(k)), 15); const auto k3 = _mm_packs_epi16(k2, _mm_setzero_si128()); #ifdef __x86_64__ *reinterpret_cast *>(mem) = _mm_cvtsi128_si64(k3); #else *reinterpret_cast *>(mem) = _mm_cvtsi128_si32(k3); *reinterpret_cast *>(mem + 4) = _mm_extract_epi32(k3, 1); #endif } break; case 16: { const auto bools = Detail::and_(AVX::_mm_setone_epu8(), _mm_packs_epi16(AVX::lo128(k), AVX::hi128(k))); if (Flags::IsAligned) { _mm_store_si128(reinterpret_cast<__m128i *>(mem), bools); } else { _mm_storeu_si128(reinterpret_cast<__m128i *>(mem), bools); } } break; default: Vc_UNREACHABLE(); } } // mask_load{{{1 template Vc_INTRINSIC R mask_load(const bool *mem, Flags, enable_if::value> = nullarg) { static_assert(N == 4 || N == 8, "mask_load<__m128>(const bool *) is only implemented for 4, 8 entries"); switch (N) { case 4: { __m128i k = _mm_cvtsi32_si128(*reinterpret_cast *>(mem)); k = _mm_unpacklo_epi8(k, k); k = _mm_unpacklo_epi16(k, k); k = _mm_cmpgt_epi32(k, _mm_setzero_si128()); return AVX::avx_cast<__m128>(k); } case 8: { #ifdef __x86_64__ __m128i k = _mm_cvtsi64_si128(*reinterpret_cast *>(mem)); #else __m128i k = _mm_castpd_si128( _mm_load_sd(reinterpret_cast *>(mem))); #endif return AVX::avx_cast<__m128>( _mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128())); } default: Vc_UNREACHABLE(); } } template Vc_INTRINSIC R mask_load(const bool *mem, Flags, enable_if::value> = nullarg) { static_assert( N == 4 || N == 8 || N == 16, "mask_load<__m256>(const bool *) is only implemented for 4, 8, and 16 entries"); switch (N) { case 4: { __m128i k = AVX::avx_cast<__m128i>(_mm_and_ps( _mm_set1_ps(*reinterpret_cast *>(mem)), AVX::avx_cast<__m128>(_mm_setr_epi32(0x1, 0x100, 0x10000, 0x1000000)))); k = _mm_cmpgt_epi32(k, _mm_setzero_si128()); return AVX::avx_cast<__m256>( AVX::concat(_mm_unpacklo_epi32(k, k), _mm_unpackhi_epi32(k, k))); } case 8: { #ifdef __x86_64__ __m128i k = _mm_cvtsi64_si128(*reinterpret_cast *>(mem)); #else __m128i k = _mm_castpd_si128( _mm_load_sd(reinterpret_cast *>(mem))); #endif k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128()); return AVX::avx_cast<__m256>( AVX::concat(_mm_unpacklo_epi16(k, k), _mm_unpackhi_epi16(k, k))); } case 16: { const auto k128 = _mm_cmpgt_epi8( Flags::IsAligned ? 
_mm_load_si128(reinterpret_cast(mem)) : _mm_loadu_si128(reinterpret_cast(mem)), _mm_setzero_si128()); return AVX::avx_cast<__m256>( AVX::concat(_mm_unpacklo_epi8(k128, k128), _mm_unpackhi_epi8(k128, k128))); } default: Vc_UNREACHABLE(); return R(); } } // mask_to_int{{{1 template Vc_INTRINSIC_L Vc_CONST_L int mask_to_int(__m256i x) Vc_INTRINSIC_R Vc_CONST_R; template <> Vc_INTRINSIC Vc_CONST int mask_to_int<4>(__m256i k) { return movemask(AVX::avx_cast<__m256d>(k)); } template <> Vc_INTRINSIC Vc_CONST int mask_to_int<8>(__m256i k) { return movemask(AVX::avx_cast<__m256>(k)); } #ifdef Vc_IMPL_BMI2 template <> Vc_INTRINSIC Vc_CONST int mask_to_int<16>(__m256i k) { return _pext_u32(movemask(k), 0x55555555u); } #endif template <> Vc_INTRINSIC Vc_CONST int mask_to_int<32>(__m256i k) { return movemask(k); } //InterleaveImpl{{{1 template struct InterleaveImpl { template static inline void interleave(typename V::EntryType *const data, const I &i,/*{{{*/ const typename V::AsArg v0, // a0 a1 a2 a3 a4 a5 a6 a7 | a8 a9 ... const typename V::AsArg v1) // b0 b1 b2 b3 b4 b5 b6 b7 | b8 b9 ... { const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v1.data()); // a0 b0 a1 b1 a2 b2 a3 b3 | a8 b8 a9 ... const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v1.data()); // a4 b4 a5 ... using namespace AVX; *reinterpret_cast *>(&data[i[ 0]]) = _mm_cvtsi128_si32(lo128(tmp0)); *reinterpret_cast *>(&data[i[ 1]]) = _mm_extract_epi32(lo128(tmp0), 1); *reinterpret_cast *>(&data[i[ 2]]) = _mm_extract_epi32(lo128(tmp0), 2); *reinterpret_cast *>(&data[i[ 3]]) = _mm_extract_epi32(lo128(tmp0), 3); *reinterpret_cast *>(&data[i[ 4]]) = _mm_cvtsi128_si32(lo128(tmp1)); *reinterpret_cast *>(&data[i[ 5]]) = _mm_extract_epi32(lo128(tmp1), 1); *reinterpret_cast *>(&data[i[ 6]]) = _mm_extract_epi32(lo128(tmp1), 2); *reinterpret_cast *>(&data[i[ 7]]) = _mm_extract_epi32(lo128(tmp1), 3); *reinterpret_cast *>(&data[i[ 8]]) = _mm_cvtsi128_si32(hi128(tmp0)); *reinterpret_cast *>(&data[i[ 9]]) = _mm_extract_epi32(hi128(tmp0), 1); *reinterpret_cast *>(&data[i[10]]) = _mm_extract_epi32(hi128(tmp0), 2); *reinterpret_cast *>(&data[i[11]]) = _mm_extract_epi32(hi128(tmp0), 3); *reinterpret_cast *>(&data[i[12]]) = _mm_cvtsi128_si32(hi128(tmp1)); *reinterpret_cast *>(&data[i[13]]) = _mm_extract_epi32(hi128(tmp1), 1); *reinterpret_cast *>(&data[i[14]]) = _mm_extract_epi32(hi128(tmp1), 2); *reinterpret_cast *>(&data[i[15]]) = _mm_extract_epi32(hi128(tmp1), 3); }/*}}}*/ static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<2> &i,/*{{{*/ const typename V::AsArg v0, const typename V::AsArg v1) { const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v1.data()); // a0 b0 a1 b1 a2 b2 a3 b3 | a8 b8 a9 ... const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v1.data()); // a4 b4 a5 ... V(Mem::shuffle128(tmp0, tmp1)).store(&data[i[0]], Vc::Unaligned); V(Mem::shuffle128(tmp0, tmp1)).store(&data[i[8]], Vc::Unaligned); }/*}}}*/ template static inline void interleave(typename V::EntryType *const data, const I &i,/*{{{*/ const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2) { interleave(data, i, v0, v1); v2.scatter(data + 2, i); }/*}}}*/ template static inline void interleave(typename V::EntryType *const data, const I &i,/*{{{*/ const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3) { const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v2.data()); // a0 c0 a1 c1 a2 c2 a3 c3 | a8 c8 a9 c9 ... 
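// First unpack pass (tmp0 above, tmp1..tmp3 below) interleaves the channel pairs (a,c)
// and (b,d); the second pass (tmp4..tmp7) merges them so that every 64-bit quarter
// holds one element's complete (a,b,c,d) quadruple, which the store() lambda then
// writes out to that element's position &data[i[k]].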
const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v2.data()); // a4 c4 a5 c5 a6 c6 a7 c7 | a12 c12 ... const __m256i tmp2 = AVX::unpacklo_epi16(v1.data(), v3.data()); // b0 d0 b1 d1 b2 d2 b3 d3 | b8 d8 b9 d9 ... const __m256i tmp3 = AVX::unpackhi_epi16(v1.data(), v3.data()); // b4 d4 b5 ... const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2); // a0 b0 c0 d0 a1 b1 c1 d1 | a8 b8 c8 d8 a9 b9 ... const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2); // [abcd]2 [abcd]3 | [abcd]10 [abcd]11 const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3); // [abcd]4 [abcd]5 | [abcd]12 [abcd]13 const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3); // [abcd]6 [abcd]7 | [abcd]14 [abcd]15 using namespace AVX; auto &&store = [&](__m256i x, int offset) { _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[offset + 0]]), lo128(x)); _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[offset + 8]]), hi128(x)); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[offset + 1]]), avx_cast<__m128>(x)); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[offset + 9]]), avx_cast<__m128>(hi128(x))); }; store(tmp4, 0); store(tmp5, 2); store(tmp6, 4); store(tmp7, 6); }/*}}}*/ static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<4> &i,/*{{{*/ const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3) { const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v2.data()); // a0 c0 a1 c1 a2 c2 a3 c3 | a8 c8 a9 c9 ... const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v2.data()); // a4 c4 a5 c5 a6 c6 a7 c7 | a12 c12 ... const __m256i tmp2 = AVX::unpacklo_epi16(v1.data(), v3.data()); // b0 d0 b1 d1 b2 d2 b3 d3 | b8 d8 b9 d9 ... const __m256i tmp3 = AVX::unpackhi_epi16(v1.data(), v3.data()); // b4 d4 b5 ... const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2); // a0 b0 c0 d0 a1 b1 c1 d1 | a8 b8 c8 d8 a9 b9 ... 
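// Same two-pass unpack as above, but the destination entries are consecutive in memory
// (SuccessiveEntries<4>), so no per-element scatter is needed: the four results only
// have their 128-bit lanes regrouped (Mem::shuffle128 below) and are written back with
// four unaligned 256-bit stores.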
const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2); // [abcd]2 [abcd]3 | [abcd]10 [abcd]11 const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3); // [abcd]4 [abcd]5 | [abcd]12 [abcd]13 const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3); // [abcd]6 [abcd]7 | [abcd]14 [abcd]15 V(Mem::shuffle128(tmp4, tmp5)).store(&data[i[0]], ::Vc::Unaligned); V(Mem::shuffle128(tmp6, tmp7)).store(&data[i[4]], ::Vc::Unaligned); V(Mem::shuffle128(tmp4, tmp5)).store(&data[i[8]], ::Vc::Unaligned); V(Mem::shuffle128(tmp6, tmp7)).store(&data[i[12]], ::Vc::Unaligned); }/*}}}*/ template // interleave 5 args {{{2 static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4) { interleave(data, i, v0, v1, v2, v3); v4.scatter(data + 4, i); } template // interleave 6 args {{{2 static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5) { interleave(data, i, v0, v1, v2, v3); interleave(data + 4, i, v4, v5); } template // interleave 7 args {{{2 static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5, const typename V::AsArg v6) { interleave(data, i, v0, v1, v2, v3); interleave(data + 4, i, v4, v5, v6); } template // interleave 8 args {{{2 static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5, const typename V::AsArg v6, const typename V::AsArg v7) { interleave(data, i, v0, v1, v2, v3); interleave(data + 4, i, v4, v5, v6, v7); } //}}}2 template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ const I &i, V &v0, V &v1) { const __m256i tmp4 = // a0 b0 a1 b1 a2 b2 a3 b3 | a8 b8 a9 b9 a10 b10 a11 b11 _mm256_setr_epi32(*reinterpret_cast *>(&data[i[0]]), *reinterpret_cast *>(&data[i[1]]), *reinterpret_cast *>(&data[i[2]]), *reinterpret_cast *>(&data[i[3]]), *reinterpret_cast *>(&data[i[8]]), *reinterpret_cast *>(&data[i[9]]), *reinterpret_cast *>(&data[i[10]]), *reinterpret_cast *>(&data[i[11]])); const __m256i tmp5 = // a4 b4 a5 b5 a6 b6 a7 b7 | a12 b12 a13 b13 a14 b14 a15 b15 _mm256_setr_epi32(*reinterpret_cast *>(&data[i[4]]), *reinterpret_cast *>(&data[i[5]]), *reinterpret_cast *>(&data[i[6]]), *reinterpret_cast *>(&data[i[7]]), *reinterpret_cast *>(&data[i[12]]), *reinterpret_cast *>(&data[i[13]]), *reinterpret_cast *>(&data[i[14]]), *reinterpret_cast *>(&data[i[15]])); const __m256i tmp2 = AVX::unpacklo_epi16(tmp4, tmp5); // a0 a4 b0 b4 a1 a5 b1 b5 | a8 a12 b8 b12 a9 a13 b9 b13 const __m256i tmp3 = AVX::unpackhi_epi16(tmp4, tmp5); // a2 a6 b2 b6 a3 a7 b3 b7 | a10 a14 b10 b14 a11 a15 b11 b15 const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 | a8 a10 a12 a14 b8 ... const __m256i tmp1 = AVX::unpackhi_epi16(tmp2, tmp3); // a1 a3 a5 a7 b1 b3 b5 b7 | a9 a11 a13 a15 b9 ... v0.data() = AVX::unpacklo_epi16(tmp0, tmp1); // a0 a1 a2 a3 a4 a5 a6 a7 | a8 a9 ... v1.data() = AVX::unpackhi_epi16(tmp0, tmp1); // b0 b1 b2 b3 b4 b5 b6 b7 | b8 b9 ... 
}/*}}}*/ template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ const I &i, V &v0, V &v1, V &v2) { using namespace AVX; const __m256i tmp0 = avx_cast<__m256i>(_mm256_setr_pd(*reinterpret_cast *>(&data[i[0]]), *reinterpret_cast *>(&data[i[1]]), *reinterpret_cast *>(&data[i[8]]), *reinterpret_cast *>(&data[i[9]]))); const __m256i tmp1 = avx_cast<__m256i>(_mm256_setr_pd(*reinterpret_cast *>(&data[i[2]]), *reinterpret_cast *>(&data[i[3]]), *reinterpret_cast *>(&data[i[10]]), *reinterpret_cast *>(&data[i[11]]))); const __m256i tmp2 = avx_cast<__m256i>(_mm256_setr_pd(*reinterpret_cast *>(&data[i[4]]), *reinterpret_cast *>(&data[i[5]]), *reinterpret_cast *>(&data[i[12]]), *reinterpret_cast *>(&data[i[13]]))); const __m256i tmp3 = avx_cast<__m256i>(_mm256_setr_pd(*reinterpret_cast *>(&data[i[6]]), *reinterpret_cast *>(&data[i[7]]), *reinterpret_cast *>(&data[i[14]]), *reinterpret_cast *>(&data[i[15]]))); const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2); // a0 a4 b0 b4 c0 c4 XX XX | a8 a12 b8 ... const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2); // a1 a5 ... const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3); // a2 a6 ... const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3); // a3 a7 ... const __m256i tmp8 = AVX::unpacklo_epi16(tmp4, tmp6); // a0 a2 a4 a6 b0 ... const __m256i tmp9 = AVX::unpackhi_epi16(tmp4, tmp6); // c0 c2 c4 c6 XX ... const __m256i tmp10 = AVX::unpacklo_epi16(tmp5, tmp7); // a1 a3 a5 a7 b1 ... const __m256i tmp11 = AVX::unpackhi_epi16(tmp5, tmp7); // c1 c3 c5 c7 XX ... v0.data() = AVX::unpacklo_epi16(tmp8, tmp10); // a0 a1 a2 a3 a4 a5 a6 a7 | a8 ... v1.data() = AVX::unpackhi_epi16(tmp8, tmp10); v2.data() = AVX::unpacklo_epi16(tmp9, tmp11); }/*}}}*/ template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ const I &i, V &v0, V &v1, V &v2, V &v3) { using namespace AVX; const __m256i tmp0 = avx_cast<__m256i>(_mm256_setr_pd(*reinterpret_cast *>(&data[i[0]]), *reinterpret_cast *>(&data[i[1]]), *reinterpret_cast *>(&data[i[8]]), *reinterpret_cast *>(&data[i[9]]))); const __m256i tmp1 = avx_cast<__m256i>(_mm256_setr_pd(*reinterpret_cast *>(&data[i[2]]), *reinterpret_cast *>(&data[i[3]]), *reinterpret_cast *>(&data[i[10]]), *reinterpret_cast *>(&data[i[11]]))); const __m256i tmp2 = avx_cast<__m256i>(_mm256_setr_pd(*reinterpret_cast *>(&data[i[4]]), *reinterpret_cast *>(&data[i[5]]), *reinterpret_cast *>(&data[i[12]]), *reinterpret_cast *>(&data[i[13]]))); const __m256i tmp3 = avx_cast<__m256i>(_mm256_setr_pd(*reinterpret_cast *>(&data[i[6]]), *reinterpret_cast *>(&data[i[7]]), *reinterpret_cast *>(&data[i[14]]), *reinterpret_cast *>(&data[i[15]]))); const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2); // a0 a4 b0 b4 c0 c4 d0 d4 | a8 a12 b8 ... const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2); // a1 a5 ... const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3); // a2 a6 ... const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3); // a3 a7 ... const __m256i tmp8 = AVX::unpacklo_epi16(tmp4, tmp6); // a0 a2 a4 a6 b0 ... const __m256i tmp9 = AVX::unpackhi_epi16(tmp4, tmp6); // c0 c2 c4 c6 d0 ... const __m256i tmp10 = AVX::unpacklo_epi16(tmp5, tmp7); // a1 a3 a5 a7 b1 ... const __m256i tmp11 = AVX::unpackhi_epi16(tmp5, tmp7); // c1 c3 c5 c7 d1 ... v0.data() = AVX::unpacklo_epi16(tmp8, tmp10); // a0 a1 a2 a3 a4 a5 a6 a7 | a8 ... 
v1.data() = AVX::unpackhi_epi16(tmp8, tmp10); v2.data() = AVX::unpacklo_epi16(tmp9, tmp11); v3.data() = AVX::unpackhi_epi16(tmp9, tmp11); }/*}}}*/ template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ const I &i, V &v0, V &v1, V &v2, V &v3, V &v4) { using namespace AVX; const __m256i a = concat(_mm_loadu_si128(reinterpret_cast(&data[i[0]])), _mm_loadu_si128(reinterpret_cast(&data[i[8]]))); const __m256i b = concat(_mm_loadu_si128(reinterpret_cast(&data[i[1]])), _mm_loadu_si128(reinterpret_cast(&data[i[9]]))); const __m256i c = concat(_mm_loadu_si128(reinterpret_cast(&data[i[2]])), _mm_loadu_si128(reinterpret_cast(&data[i[10]]))); const __m256i d = concat(_mm_loadu_si128(reinterpret_cast(&data[i[3]])), _mm_loadu_si128(reinterpret_cast(&data[i[11]]))); const __m256i e = concat(_mm_loadu_si128(reinterpret_cast(&data[i[4]])), _mm_loadu_si128(reinterpret_cast(&data[i[12]]))); const __m256i f = concat(_mm_loadu_si128(reinterpret_cast(&data[i[5]])), _mm_loadu_si128(reinterpret_cast(&data[i[13]]))); const __m256i g = concat(_mm_loadu_si128(reinterpret_cast(&data[i[6]])), _mm_loadu_si128(reinterpret_cast(&data[i[14]]))); const __m256i h = concat(_mm_loadu_si128(reinterpret_cast(&data[i[7]])), _mm_loadu_si128(reinterpret_cast(&data[i[15]]))); const __m256i tmp2 = AVX::unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 | a8 ... const __m256i tmp4 = AVX::unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 const __m256i tmp3 = AVX::unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 const __m256i tmp5 = AVX::unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 const __m256i tmp10 = AVX::unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4 const __m256i tmp11 = AVX::unpackhi_epi16(c, g); // e1 e5 f1 f5 g1 g5 h1 h5 const __m256i tmp12 = AVX::unpackhi_epi16(b, f); // e2 e6 f2 f6 g2 g6 h2 h6 const __m256i tmp13 = AVX::unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7 const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 | a8 ... 
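// tmp0 (above) and tmp1, tmp6..tmp9 (below) separate, per channel, the even-indexed
// from the odd-indexed elements; the final unpacklo/unpackhi pass interleaves the two
// halves back into natural element order, yielding one fully deinterleaved channel per
// result vector (v0 = a, v1 = b, v2 = c, v3 = d, v4 = e).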
const __m256i tmp1 = AVX::unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 const __m256i tmp6 = AVX::unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 const __m256i tmp7 = AVX::unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 const __m256i tmp8 = AVX::unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6 const __m256i tmp9 = AVX::unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7 v0.data() = AVX::unpacklo_epi16(tmp0, tmp1); v1.data() = AVX::unpackhi_epi16(tmp0, tmp1); v2.data() = AVX::unpacklo_epi16(tmp6, tmp7); v3.data() = AVX::unpackhi_epi16(tmp6, tmp7); v4.data() = AVX::unpacklo_epi16(tmp8, tmp9); }/*}}}*/ template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5) { using namespace AVX; const __m256i a = concat(_mm_loadu_si128(reinterpret_cast(&data[i[0]])), _mm_loadu_si128(reinterpret_cast(&data[i[8]]))); const __m256i b = concat(_mm_loadu_si128(reinterpret_cast(&data[i[1]])), _mm_loadu_si128(reinterpret_cast(&data[i[9]]))); const __m256i c = concat(_mm_loadu_si128(reinterpret_cast(&data[i[2]])), _mm_loadu_si128(reinterpret_cast(&data[i[10]]))); const __m256i d = concat(_mm_loadu_si128(reinterpret_cast(&data[i[3]])), _mm_loadu_si128(reinterpret_cast(&data[i[11]]))); const __m256i e = concat(_mm_loadu_si128(reinterpret_cast(&data[i[4]])), _mm_loadu_si128(reinterpret_cast(&data[i[12]]))); const __m256i f = concat(_mm_loadu_si128(reinterpret_cast(&data[i[5]])), _mm_loadu_si128(reinterpret_cast(&data[i[13]]))); const __m256i g = concat(_mm_loadu_si128(reinterpret_cast(&data[i[6]])), _mm_loadu_si128(reinterpret_cast(&data[i[14]]))); const __m256i h = concat(_mm_loadu_si128(reinterpret_cast(&data[i[7]])), _mm_loadu_si128(reinterpret_cast(&data[i[15]]))); const __m256i tmp2 = AVX::unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 | a8 ... const __m256i tmp4 = AVX::unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 const __m256i tmp3 = AVX::unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 const __m256i tmp5 = AVX::unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 const __m256i tmp10 = AVX::unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4 const __m256i tmp11 = AVX::unpackhi_epi16(c, g); // e1 e5 f1 f5 g1 g5 h1 h5 const __m256i tmp12 = AVX::unpackhi_epi16(b, f); // e2 e6 f2 f6 g2 g6 h2 h6 const __m256i tmp13 = AVX::unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7 const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 | a8 ... 
const __m256i tmp1 = AVX::unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 const __m256i tmp6 = AVX::unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 const __m256i tmp7 = AVX::unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 const __m256i tmp8 = AVX::unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6 const __m256i tmp9 = AVX::unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7 v0.data() = AVX::unpacklo_epi16(tmp0, tmp1); v1.data() = AVX::unpackhi_epi16(tmp0, tmp1); v2.data() = AVX::unpacklo_epi16(tmp6, tmp7); v3.data() = AVX::unpackhi_epi16(tmp6, tmp7); v4.data() = AVX::unpacklo_epi16(tmp8, tmp9); v5.data() = AVX::unpackhi_epi16(tmp8, tmp9); }/*}}}*/ template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6) { using namespace AVX; const __m256i a = concat(_mm_loadu_si128(reinterpret_cast(&data[i[0]])), _mm_loadu_si128(reinterpret_cast(&data[i[8]]))); const __m256i b = concat(_mm_loadu_si128(reinterpret_cast(&data[i[1]])), _mm_loadu_si128(reinterpret_cast(&data[i[9]]))); const __m256i c = concat(_mm_loadu_si128(reinterpret_cast(&data[i[2]])), _mm_loadu_si128(reinterpret_cast(&data[i[10]]))); const __m256i d = concat(_mm_loadu_si128(reinterpret_cast(&data[i[3]])), _mm_loadu_si128(reinterpret_cast(&data[i[11]]))); const __m256i e = concat(_mm_loadu_si128(reinterpret_cast(&data[i[4]])), _mm_loadu_si128(reinterpret_cast(&data[i[12]]))); const __m256i f = concat(_mm_loadu_si128(reinterpret_cast(&data[i[5]])), _mm_loadu_si128(reinterpret_cast(&data[i[13]]))); const __m256i g = concat(_mm_loadu_si128(reinterpret_cast(&data[i[6]])), _mm_loadu_si128(reinterpret_cast(&data[i[14]]))); const __m256i h = concat(_mm_loadu_si128(reinterpret_cast(&data[i[7]])), _mm_loadu_si128(reinterpret_cast(&data[i[15]]))); const __m256i tmp2 = AVX::unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 | a8 ... const __m256i tmp4 = AVX::unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 const __m256i tmp3 = AVX::unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 const __m256i tmp5 = AVX::unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 const __m256i tmp10 = AVX::unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4 const __m256i tmp11 = AVX::unpackhi_epi16(c, g); // e1 e5 f1 f5 g1 g5 h1 h5 const __m256i tmp12 = AVX::unpackhi_epi16(b, f); // e2 e6 f2 f6 g2 g6 h2 h6 const __m256i tmp13 = AVX::unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7 const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 | a8 ... 
const __m256i tmp1 = AVX::unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 const __m256i tmp6 = AVX::unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 const __m256i tmp7 = AVX::unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 const __m256i tmp8 = AVX::unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6 const __m256i tmp9 = AVX::unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7 const __m256i tmp14 = AVX::unpackhi_epi16(tmp10, tmp11); // g0 g2 g4 g6 h0 h2 h4 h6 const __m256i tmp15 = AVX::unpackhi_epi16(tmp12, tmp13); // g1 g3 g5 g7 h1 h3 h5 h7 v0.data() = AVX::unpacklo_epi16(tmp0, tmp1); v1.data() = AVX::unpackhi_epi16(tmp0, tmp1); v2.data() = AVX::unpacklo_epi16(tmp6, tmp7); v3.data() = AVX::unpackhi_epi16(tmp6, tmp7); v4.data() = AVX::unpacklo_epi16(tmp8, tmp9); v5.data() = AVX::unpackhi_epi16(tmp8, tmp9); v6.data() = AVX::unpacklo_epi16(tmp14, tmp15); }/*}}}*/ template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7) { using namespace AVX; const __m256i a = concat(_mm_loadu_si128(reinterpret_cast(&data[i[0]])), _mm_loadu_si128(reinterpret_cast(&data[i[8]]))); const __m256i b = concat(_mm_loadu_si128(reinterpret_cast(&data[i[1]])), _mm_loadu_si128(reinterpret_cast(&data[i[9]]))); const __m256i c = concat(_mm_loadu_si128(reinterpret_cast(&data[i[2]])), _mm_loadu_si128(reinterpret_cast(&data[i[10]]))); const __m256i d = concat(_mm_loadu_si128(reinterpret_cast(&data[i[3]])), _mm_loadu_si128(reinterpret_cast(&data[i[11]]))); const __m256i e = concat(_mm_loadu_si128(reinterpret_cast(&data[i[4]])), _mm_loadu_si128(reinterpret_cast(&data[i[12]]))); const __m256i f = concat(_mm_loadu_si128(reinterpret_cast(&data[i[5]])), _mm_loadu_si128(reinterpret_cast(&data[i[13]]))); const __m256i g = concat(_mm_loadu_si128(reinterpret_cast(&data[i[6]])), _mm_loadu_si128(reinterpret_cast(&data[i[14]]))); const __m256i h = concat(_mm_loadu_si128(reinterpret_cast(&data[i[7]])), _mm_loadu_si128(reinterpret_cast(&data[i[15]]))); const __m256i tmp2 = AVX::unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 | a8 ... const __m256i tmp4 = AVX::unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 const __m256i tmp3 = AVX::unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 const __m256i tmp5 = AVX::unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 const __m256i tmp10 = AVX::unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4 const __m256i tmp11 = AVX::unpackhi_epi16(c, g); // e1 e5 f1 f5 g1 g5 h1 h5 const __m256i tmp12 = AVX::unpackhi_epi16(b, f); // e2 e6 f2 f6 g2 g6 h2 h6 const __m256i tmp13 = AVX::unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7 const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 | a8 ... 
const __m256i tmp1 = AVX::unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 const __m256i tmp6 = AVX::unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 const __m256i tmp7 = AVX::unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 const __m256i tmp8 = AVX::unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6 const __m256i tmp9 = AVX::unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7 const __m256i tmp14 = AVX::unpackhi_epi16(tmp10, tmp11); // g0 g2 g4 g6 h0 h2 h4 h6 const __m256i tmp15 = AVX::unpackhi_epi16(tmp12, tmp13); // g1 g3 g5 g7 h1 h3 h5 h7 v0.data() = AVX::unpacklo_epi16(tmp0, tmp1); v1.data() = AVX::unpackhi_epi16(tmp0, tmp1); v2.data() = AVX::unpacklo_epi16(tmp6, tmp7); v3.data() = AVX::unpackhi_epi16(tmp6, tmp7); v4.data() = AVX::unpacklo_epi16(tmp8, tmp9); v5.data() = AVX::unpackhi_epi16(tmp8, tmp9); v6.data() = AVX::unpacklo_epi16(tmp14, tmp15); v7.data() = AVX::unpackhi_epi16(tmp14, tmp15); }/*}}}*/ }; template struct InterleaveImpl { template static inline void interleave(typename V::EntryType *const data, const I &i,/*{{{*/ const typename V::AsArg v0, const typename V::AsArg v1) { using namespace AVX; // [0a 1a 0b 1b 0e 1e 0f 1f]: const m256 tmp0 = _mm256_unpacklo_ps(avx_cast(v0.data()), avx_cast(v1.data())); // [0c 1c 0d 1d 0g 1g 0h 1h]: const m256 tmp1 = _mm256_unpackhi_ps(avx_cast(v0.data()), avx_cast(v1.data())); _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), lo128(tmp0)); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), lo128(tmp0)); _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), lo128(tmp1)); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), lo128(tmp1)); _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[4]]), hi128(tmp0)); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[5]]), hi128(tmp0)); _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[6]]), hi128(tmp1)); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[7]]), hi128(tmp1)); }/*}}}*/ static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<2> &i,/*{{{*/ const typename V::AsArg v0, const typename V::AsArg v1) { using namespace AVX; // [0a 1a 0b 1b 0e 1e 0f 1f]: const m256 tmp0 = _mm256_unpacklo_ps(avx_cast(v0.data()), avx_cast(v1.data())); // [0c 1c 0d 1d 0g 1g 0h 1h]: const m256 tmp1 = _mm256_unpackhi_ps(avx_cast(v0.data()), avx_cast(v1.data())); _mm_storeu_ps(reinterpret_cast *>(&data[i[0]]), lo128(tmp0)); _mm_storeu_ps(reinterpret_cast *>(&data[i[2]]), lo128(tmp1)); _mm_storeu_ps(reinterpret_cast *>(&data[i[4]]), hi128(tmp0)); _mm_storeu_ps(reinterpret_cast *>(&data[i[6]]), hi128(tmp1)); }/*}}}*/ template static inline void interleave(typename V::EntryType *const data, const I &i,/*{{{*/ const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2) { using namespace AVX; #ifdef Vc_USE_MASKMOV_SCATTER // [0a 2a 0b 2b 0e 2e 0f 2f]: const m256 tmp0 = _mm256_unpacklo_ps(avx_cast(v0.data()), avx_cast(v2.data())); // [0c 2c 0d 2d 0g 2g 0h 2h]: const m256 tmp1 = _mm256_unpackhi_ps(avx_cast(v0.data()), avx_cast(v2.data())); // [1a __ 1b __ 1e __ 1f __]: const m256 tmp2 = _mm256_unpacklo_ps(avx_cast(v1.data()), avx_cast(v1.data())); // [1c __ 1d __ 1g __ 1h __]: const m256 tmp3 = _mm256_unpackhi_ps(avx_cast(v1.data()), avx_cast(v1.data())); const m256 tmp4 = _mm256_unpacklo_ps(tmp0, tmp2); const m256 tmp5 = _mm256_unpackhi_ps(tmp0, tmp2); const m256 tmp6 = _mm256_unpacklo_ps(tmp1, tmp3); const m256 tmp7 = _mm256_unpackhi_ps(tmp1, tmp3); const m128i mask = _mm_set_epi32(0, -1, -1, -1); _mm_maskstore_ps(reinterpret_cast 
*>(&data[i[0]]), mask, lo128(tmp4)); _mm_maskstore_ps(reinterpret_cast *>(&data[i[1]]), mask, lo128(tmp5)); _mm_maskstore_ps(reinterpret_cast *>(&data[i[2]]), mask, lo128(tmp6)); _mm_maskstore_ps(reinterpret_cast *>(&data[i[3]]), mask, lo128(tmp7)); _mm_maskstore_ps(reinterpret_cast *>(&data[i[4]]), mask, hi128(tmp4)); _mm_maskstore_ps(reinterpret_cast *>(&data[i[5]]), mask, hi128(tmp5)); _mm_maskstore_ps(reinterpret_cast *>(&data[i[6]]), mask, hi128(tmp6)); _mm_maskstore_ps(reinterpret_cast *>(&data[i[7]]), mask, hi128(tmp7)); #else interleave(data, i, v0, v1); v2.scatter(data + 2, i); #endif }/*}}}*/ template static inline void interleave(typename V::EntryType *const data, const I &i,/*{{{*/ const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3) { using namespace AVX; const m256 tmp0 = _mm256_unpacklo_ps(avx_cast(v0.data()), avx_cast(v2.data())); const m256 tmp1 = _mm256_unpackhi_ps(avx_cast(v0.data()), avx_cast(v2.data())); const m256 tmp2 = _mm256_unpacklo_ps(avx_cast(v1.data()), avx_cast(v3.data())); const m256 tmp3 = _mm256_unpackhi_ps(avx_cast(v1.data()), avx_cast(v3.data())); const m256 tmp4 = _mm256_unpacklo_ps(tmp0, tmp2); const m256 tmp5 = _mm256_unpackhi_ps(tmp0, tmp2); const m256 tmp6 = _mm256_unpacklo_ps(tmp1, tmp3); const m256 tmp7 = _mm256_unpackhi_ps(tmp1, tmp3); _mm_storeu_ps(reinterpret_cast *>(&data[i[0]]), lo128(tmp4)); _mm_storeu_ps(reinterpret_cast *>(&data[i[1]]), lo128(tmp5)); _mm_storeu_ps(reinterpret_cast *>(&data[i[2]]), lo128(tmp6)); _mm_storeu_ps(reinterpret_cast *>(&data[i[3]]), lo128(tmp7)); _mm_storeu_ps(reinterpret_cast *>(&data[i[4]]), hi128(tmp4)); _mm_storeu_ps(reinterpret_cast *>(&data[i[5]]), hi128(tmp5)); _mm_storeu_ps(reinterpret_cast *>(&data[i[6]]), hi128(tmp6)); _mm_storeu_ps(reinterpret_cast *>(&data[i[7]]), hi128(tmp7)); }/*}}}*/ template // interleave 5 args {{{2 static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4) { interleave(data, i, v0, v1, v2, v3); v4.scatter(data + 4, i); } template // interleave 6 args {{{2 static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5) { interleave(data, i, v0, v1, v2, v3); interleave(data + 4, i, v4, v5); } template // interleave 7 args {{{2 static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5, const typename V::AsArg v6) { interleave(data, i, v0, v1, v2, v3); interleave(data + 4, i, v4, v5, v6); } template // interleave 8 args {{{2 static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5, const typename V::AsArg v6, const typename V::AsArg v7) { interleave(data, i, v0, v1, v2, v3); interleave(data + 4, i, v4, v5, v6, v7); } //}}}2 template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ const I &i, V &v0, V &v1) { using namespace AVX; const m128 il0 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 
const *>(&data[i[0]])); // a0 b0 const m128 il2 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[2]])); // a2 b2 const m128 il4 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[4]])); // a4 b4 const m128 il6 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[6]])); // a6 b6 const m128 il01 = _mm_loadh_pi( il0, reinterpret_cast<__m64 const *>(&data[i[1]])); // a0 b0 a1 b1 const m128 il23 = _mm_loadh_pi( il2, reinterpret_cast<__m64 const *>(&data[i[3]])); // a2 b2 a3 b3 const m128 il45 = _mm_loadh_pi( il4, reinterpret_cast<__m64 const *>(&data[i[5]])); // a4 b4 a5 b5 const m128 il67 = _mm_loadh_pi( il6, reinterpret_cast<__m64 const *>(&data[i[7]])); // a6 b6 a7 b7 const m256 tmp2 = concat(il01, il45); const m256 tmp3 = concat(il23, il67); const m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3); const m256 tmp1 = _mm256_unpackhi_ps(tmp2, tmp3); v0.data() = avx_cast(_mm256_unpacklo_ps(tmp0, tmp1)); v1.data() = avx_cast(_mm256_unpackhi_ps(tmp0, tmp1)); }/*}}}*/ static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ const Common::SuccessiveEntries<2> &i, V &v0, V &v1) { using namespace AVX; const m256 il0123 = _mm256_loadu_ps(reinterpret_cast *>(&data[i[0]])); // a0 b0 a1 b1 a2 b2 a3 b3 const m256 il4567 = _mm256_loadu_ps(reinterpret_cast *>(&data[i[4]])); // a4 b4 a5 b5 a6 b6 a7 b7 const m256 tmp2 = Mem::shuffle128(il0123, il4567); const m256 tmp3 = Mem::shuffle128(il0123, il4567); const m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3); const m256 tmp1 = _mm256_unpackhi_ps(tmp2, tmp3); v0.data() = avx_cast(_mm256_unpacklo_ps(tmp0, tmp1)); v1.data() = avx_cast(_mm256_unpackhi_ps(tmp0, tmp1)); }/*}}}*/ template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ const I &i, V &v0, V &v1, V &v2) { using namespace AVX; const m128 il0 = _mm_loadu_ps(reinterpret_cast *>(&data[i[0]])); // a0 b0 c0 d0 const m128 il1 = _mm_loadu_ps(reinterpret_cast *>(&data[i[1]])); // a1 b1 c1 d1 const m128 il2 = _mm_loadu_ps(reinterpret_cast *>(&data[i[2]])); // a2 b2 c2 d2 const m128 il3 = _mm_loadu_ps(reinterpret_cast *>(&data[i[3]])); // a3 b3 c3 d3 const m128 il4 = _mm_loadu_ps(reinterpret_cast *>(&data[i[4]])); // a4 b4 c4 d4 const m128 il5 = _mm_loadu_ps(reinterpret_cast *>(&data[i[5]])); // a5 b5 c5 d5 const m128 il6 = _mm_loadu_ps(reinterpret_cast *>(&data[i[6]])); // a6 b6 c6 d6 const m128 il7 = _mm_loadu_ps(reinterpret_cast *>(&data[i[7]])); // a7 b7 c7 d7 const m256 il04 = concat(il0, il4); const m256 il15 = concat(il1, il5); const m256 il26 = concat(il2, il6); const m256 il37 = concat(il3, il7); const m256 ab0246 = _mm256_unpacklo_ps(il04, il26); const m256 ab1357 = _mm256_unpacklo_ps(il15, il37); const m256 cd0246 = _mm256_unpackhi_ps(il04, il26); const m256 cd1357 = _mm256_unpackhi_ps(il15, il37); v0.data() = avx_cast(_mm256_unpacklo_ps(ab0246, ab1357)); v1.data() = avx_cast(_mm256_unpackhi_ps(ab0246, ab1357)); v2.data() = avx_cast(_mm256_unpacklo_ps(cd0246, cd1357)); }/*}}}*/ template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ const I &i, V &v0, V &v1, V &v2, V &v3) { using namespace AVX; const m128 il0 = _mm_loadu_ps(reinterpret_cast *>(&data[i[0]])); // a0 b0 c0 d0 const m128 il1 = _mm_loadu_ps(reinterpret_cast *>(&data[i[1]])); // a1 b1 c1 d1 const m128 il2 = _mm_loadu_ps(reinterpret_cast *>(&data[i[2]])); // a2 b2 c2 d2 const m128 il3 = _mm_loadu_ps(reinterpret_cast *>(&data[i[3]])); // a3 b3 c3 d3 const m128 il4 = _mm_loadu_ps(reinterpret_cast 
*>(&data[i[4]])); // a4 b4 c4 d4 const m128 il5 = _mm_loadu_ps(reinterpret_cast *>(&data[i[5]])); // a5 b5 c5 d5 const m128 il6 = _mm_loadu_ps(reinterpret_cast *>(&data[i[6]])); // a6 b6 c6 d6 const m128 il7 = _mm_loadu_ps(reinterpret_cast *>(&data[i[7]])); // a7 b7 c7 d7 const m256 il04 = concat(il0, il4); const m256 il15 = concat(il1, il5); const m256 il26 = concat(il2, il6); const m256 il37 = concat(il3, il7); const m256 ab0246 = _mm256_unpacklo_ps(il04, il26); const m256 ab1357 = _mm256_unpacklo_ps(il15, il37); const m256 cd0246 = _mm256_unpackhi_ps(il04, il26); const m256 cd1357 = _mm256_unpackhi_ps(il15, il37); v0.data() = avx_cast(_mm256_unpacklo_ps(ab0246, ab1357)); v1.data() = avx_cast(_mm256_unpackhi_ps(ab0246, ab1357)); v2.data() = avx_cast(_mm256_unpacklo_ps(cd0246, cd1357)); v3.data() = avx_cast(_mm256_unpackhi_ps(cd0246, cd1357)); }/*}}}*/ template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ const I &i, V &v0, V &v1, V &v2, V &v3, V &v4) { v4.gather(data + 4, i); deinterleave(data, i, v0, v1, v2, v3); }/*}}}*/ template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5) { deinterleave(data, i, v0, v1, v2, v3); deinterleave(data + 4, i, v4, v5); }/*}}}*/ static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ const Common::SuccessiveEntries<6> &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5) { using namespace AVX; const m256 a = _mm256_loadu_ps(reinterpret_cast *>(&data[i[0]])); const m256 b = _mm256_loadu_ps(reinterpret_cast *>(&data[i[0] + 1 * V::Size])); const m256 c = _mm256_loadu_ps(reinterpret_cast *>(&data[i[0] + 2 * V::Size])); const m256 d = _mm256_loadu_ps(reinterpret_cast *>(&data[i[0] + 3 * V::Size])); const m256 e = _mm256_loadu_ps(reinterpret_cast *>(&data[i[0] + 4 * V::Size])); const m256 f = _mm256_loadu_ps(reinterpret_cast *>(&data[i[0] + 5 * V::Size])); const __m256 tmp2 = Mem::shuffle128(a, d); const __m256 tmp3 = Mem::shuffle128(b, e); const __m256 tmp4 = Mem::shuffle128(a, d); const __m256 tmp5 = Mem::shuffle128(c, f); const __m256 tmp8 = Mem::shuffle128(b, e); const __m256 tmp9 = Mem::shuffle128(c, f); const __m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3); const __m256 tmp1 = _mm256_unpackhi_ps(tmp4, tmp5); const __m256 tmp6 = _mm256_unpackhi_ps(tmp2, tmp3); const __m256 tmp7 = _mm256_unpacklo_ps(tmp8, tmp9); const __m256 tmp10 = _mm256_unpacklo_ps(tmp4, tmp5); const __m256 tmp11 = _mm256_unpackhi_ps(tmp8, tmp9); v0.data() = avx_cast(_mm256_unpacklo_ps(tmp0, tmp1)); v1.data() = avx_cast(_mm256_unpackhi_ps(tmp0, tmp1)); v2.data() = avx_cast(_mm256_unpacklo_ps(tmp6, tmp7)); v3.data() = avx_cast(_mm256_unpackhi_ps(tmp6, tmp7)); v4.data() = avx_cast(_mm256_unpacklo_ps(tmp10, tmp11)); v5.data() = avx_cast(_mm256_unpackhi_ps(tmp10, tmp11)); }/*}}}*/ template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6) { deinterleave(data, i, v0, v1, v2, v3); deinterleave(data + 4, i, v4, v5, v6); }/*}}}*/ template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7) { deinterleave(data, i, v0, v1, v2, v3); deinterleave(data + 4, i, v4, v5, v6, v7); }/*}}}*/ }; template struct InterleaveImpl { template // interleave 2 args{{{2 static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1) 
{ using namespace AVX; const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data()); const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data()); _mm_storeu_pd(&data[i[0]], lo128(tmp0)); _mm_storeu_pd(&data[i[1]], lo128(tmp1)); _mm_storeu_pd(&data[i[2]], hi128(tmp0)); _mm_storeu_pd(&data[i[3]], hi128(tmp1)); } template // interleave 3 args{{{2 static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2) { using namespace AVX; #ifdef Vc_USE_MASKMOV_SCATTER const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data()); const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data()); const m256d tmp2 = _mm256_unpacklo_pd(v2.data(), v2.data()); const m256d tmp3 = _mm256_unpackhi_pd(v2.data(), v2.data()); #if defined(Vc_MSVC) && (Vc_MSVC < 170000000 || !defined(_WIN64)) // MSVC needs to be at Version 2012 before _mm256_set_epi64x works const m256i mask = concat(_mm_setallone_si128(), _mm_set_epi32(0, 0, -1, -1)); #else const m256i mask = _mm256_set_epi64x(0, -1, -1, -1); #endif _mm256_maskstore_pd(&data[i[0]], mask, Mem::shuffle128(tmp0, tmp2)); _mm256_maskstore_pd(&data[i[1]], mask, Mem::shuffle128(tmp1, tmp3)); _mm256_maskstore_pd(&data[i[2]], mask, Mem::shuffle128(tmp0, tmp2)); _mm256_maskstore_pd(&data[i[3]], mask, Mem::shuffle128(tmp1, tmp3)); #else interleave(data, i, v0, v1); v2.scatter(data + 2, i); #endif } template // interleave 4 args{{{2 static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3) { using namespace AVX; // 0a 1a 0c 1c: const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data()); // 0b 1b 0b 1b: const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data()); // 2a 3a 2c 3c: const m256d tmp2 = _mm256_unpacklo_pd(v2.data(), v3.data()); // 2b 3b 2b 3b: const m256d tmp3 = _mm256_unpackhi_pd(v2.data(), v3.data()); /* The following might be more efficient once 256-bit stores are not split internally into 2 * 128-bit stores. 
_mm256_storeu_pd(&data[i[0]], Mem::shuffle128(tmp0, tmp2)); _mm256_storeu_pd(&data[i[1]], Mem::shuffle128(tmp1, tmp3)); _mm256_storeu_pd(&data[i[2]], Mem::shuffle128(tmp0, tmp2)); _mm256_storeu_pd(&data[i[3]], Mem::shuffle128(tmp1, tmp3)); */ _mm_storeu_pd(&data[i[0] ], lo128(tmp0)); _mm_storeu_pd(&data[i[0]+2], lo128(tmp2)); _mm_storeu_pd(&data[i[1] ], lo128(tmp1)); _mm_storeu_pd(&data[i[1]+2], lo128(tmp3)); _mm_storeu_pd(&data[i[2] ], hi128(tmp0)); _mm_storeu_pd(&data[i[2]+2], hi128(tmp2)); _mm_storeu_pd(&data[i[3] ], hi128(tmp1)); _mm_storeu_pd(&data[i[3]+2], hi128(tmp3)); } template // interleave 5 args {{{2 static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4) { interleave(data, i, v0, v1, v2, v3); v4.scatter(data + 4, i); } template // interleave 6 args {{{2 static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5) { interleave(data, i, v0, v1, v2, v3); interleave(data + 4, i, v4, v5); } template // interleave 7 args {{{2 static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5, const typename V::AsArg v6) { interleave(data, i, v0, v1, v2, v3); interleave(data + 4, i, v4, v5, v6); } template // interleave 8 args {{{2 static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5, const typename V::AsArg v6, const typename V::AsArg v7) { interleave(data, i, v0, v1, v2, v3); interleave(data + 4, i, v4, v5, v6, v7); } //}}}2 template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ const I &i, V &v0, V &v1) { using namespace Vc::AVX; const m256d ab02 = concat(_mm_loadu_pd(&data[i[0]]), _mm_loadu_pd(&data[i[2]])); const m256d ab13 = concat(_mm_loadu_pd(&data[i[1]]), _mm_loadu_pd(&data[i[3]])); v0.data() = _mm256_unpacklo_pd(ab02, ab13); v1.data() = _mm256_unpackhi_pd(ab02, ab13); }/*}}}*/ template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ const I &i, V &v0, V &v1, V &v2) { v2.gather(data + 2, i); deinterleave(data, i, v0, v1); }/*}}}*/ template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ const I &i, V &v0, V &v1, V &v2, V &v3) { deinterleave(data, i, v0, v1); deinterleave(data + 2, i, v2, v3); }/*}}}*/ template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ const I &i, V &v0, V &v1, V &v2, V &v3, V &v4) { v4.gather(data + 4, i); deinterleave(data, i, v0, v1); deinterleave(data + 2, i, v2, v3); }/*}}}*/ template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5) { deinterleave(data, i, v0, v1); deinterleave(data + 2, i, v2, v3); deinterleave(data + 4, i, v4, v5); }/*}}}*/ template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6) { v6.gather(data + 6, i); deinterleave(data, i, 
v0, v1); deinterleave(data + 2, i, v2, v3); deinterleave(data + 4, i, v4, v5); }/*}}}*/ template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7) { deinterleave(data, i, v0, v1); deinterleave(data + 2, i, v2, v3); deinterleave(data + 4, i, v4, v5); deinterleave(data + 6, i, v6, v7); }/*}}}*/ }; //}}}1 } // namespace Detail } // namespace Vc #endif // VC_AVX_DETAIL_H_ // vim: foldmethod=marker Vc-1.3.3/avx/helperimpl.h000066400000000000000000000126571320703111200151770ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2011-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
}}}*/ #ifndef VC_AVX_HELPERIMPL_H_ #define VC_AVX_HELPERIMPL_H_ #include "../sse/helperimpl.h" #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { namespace Detail { template inline void deinterleave(AVX2::float_v &, AVX2::float_v &, const float *, A); template inline void deinterleave(AVX2::float_v &, AVX2::float_v &, const short *, A); template inline void deinterleave(AVX2::float_v &, AVX2::float_v &, const ushort *, A); template inline void deinterleave(AVX2::double_v &, AVX2::double_v &, const double *, A); template inline void deinterleave(AVX2::int_v &, AVX2::int_v &, const int *, A); template inline void deinterleave(AVX2::int_v &, AVX2::int_v &, const short *, A); template inline void deinterleave(AVX2::uint_v &, AVX2::uint_v &, const uint *, A); template inline void deinterleave(AVX2::uint_v &, AVX2::uint_v &, const ushort *, A); template inline void deinterleave(AVX2::short_v &, AVX2::short_v &, const short *, A); template inline void deinterleave(AVX2::ushort_v &, AVX2::ushort_v &, const ushort *, A); template Vc_ALWAYS_INLINE_L void deinterleave(AVX2::Vector &Vc_RESTRICT a, AVX2::Vector &Vc_RESTRICT b, AVX2::Vector &Vc_RESTRICT c, const M *Vc_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R; template Vc_ALWAYS_INLINE_L void deinterleave(AVX2::Vector &Vc_RESTRICT a, AVX2::Vector &Vc_RESTRICT b, AVX2::Vector &Vc_RESTRICT c, AVX2::Vector &Vc_RESTRICT d, const M *Vc_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R; template Vc_ALWAYS_INLINE_L void deinterleave(AVX2::Vector &Vc_RESTRICT a, AVX2::Vector &Vc_RESTRICT b, AVX2::Vector &Vc_RESTRICT c, AVX2::Vector &Vc_RESTRICT d, AVX2::Vector &Vc_RESTRICT e, const M *Vc_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R; template Vc_ALWAYS_INLINE_L void deinterleave( AVX2::Vector &Vc_RESTRICT a, AVX2::Vector &Vc_RESTRICT b, AVX2::Vector &Vc_RESTRICT c, AVX2::Vector &Vc_RESTRICT d, AVX2::Vector &Vc_RESTRICT e, AVX2::Vector &Vc_RESTRICT f, const M *Vc_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R; template Vc_ALWAYS_INLINE_L void deinterleave( AVX2::Vector &Vc_RESTRICT a, AVX2::Vector &Vc_RESTRICT b, AVX2::Vector &Vc_RESTRICT c, AVX2::Vector &Vc_RESTRICT d, AVX2::Vector &Vc_RESTRICT e, AVX2::Vector &Vc_RESTRICT f, AVX2::Vector &Vc_RESTRICT g, AVX2::Vector &Vc_RESTRICT h, const M *Vc_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R; Vc_ALWAYS_INLINE void prefetchForOneRead(const void *addr, VectorAbi::Avx) { prefetchForOneRead(addr, VectorAbi::Sse()); } Vc_ALWAYS_INLINE void prefetchForModify(const void *addr, VectorAbi::Avx) { prefetchForModify(addr, VectorAbi::Sse()); } Vc_ALWAYS_INLINE void prefetchClose(const void *addr, VectorAbi::Avx) { prefetchClose(addr, VectorAbi::Sse()); } Vc_ALWAYS_INLINE void prefetchMid(const void *addr, VectorAbi::Avx) { prefetchMid(addr, VectorAbi::Sse()); } Vc_ALWAYS_INLINE void prefetchFar(const void *addr, VectorAbi::Avx) { prefetchFar(addr, VectorAbi::Sse()); } } // namespace Detail } // namespace Vc #include "deinterleave.tcc" #endif // VC_AVX_HELPERIMPL_H_ Vc-1.3.3/avx/intrinsics.h000066400000000000000000001110011320703111200152020ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2009-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ #ifndef VC_AVX_INTRINSICS_H_ #define VC_AVX_INTRINSICS_H_ #include #include "../traits/type_traits.h" // see comment in sse/intrinsics.h extern "C" { // AVX #include #if (defined(Vc_IMPL_XOP) || defined(Vc_IMPL_FMA4)) && !defined(Vc_MSVC) #include #endif } #include "../common/fix_clang_emmintrin.h" #include "const_data.h" #include "../common/types.h" #include "macros.h" #include #if (defined Vc_CLANG && Vc_CLANG >= 0x30900) #ifdef _mm256_permute2f128_si256 #undef _mm256_permute2f128_si256 #define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \ (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \ (__v8si)(__m256i)(V2), (char)(M)); }) #endif #ifdef _mm256_permute2f128_ps #undef _mm256_permute2f128_ps #define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \ (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \ (__v8sf)(__m256)(V2), (char)(M)); }) #endif #ifdef _mm256_permute2x128_si256 #undef _mm256_permute2x128_si256 #define _mm256_permute2x128_si256(V1, V2, M) __extension__ ({ \ (__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (char)(M)); }) #endif #endif namespace Vc_VERSIONED_NAMESPACE { namespace AvxIntrinsics { using AVX::c_general; using AVX::_IndexesFromZero32; using AVX::_IndexesFromZero16; using AVX::_IndexesFromZero8; typedef __m128 m128 ; typedef __m128d m128d; typedef __m128i m128i; typedef __m256 m256 ; typedef __m256d m256d; typedef __m256i m256i; typedef const m128 param128 ; typedef const m128d param128d; typedef const m128i param128i; typedef const m256 param256 ; typedef const m256d param256d; typedef const m256i param256i; #ifdef Vc_GCC // Redefine the mul/add/sub intrinsics to use GCC-specific operators instead of builtin // functions. This way the fp-contraction optimization step kicks in and creates FMAs! 
:) static Vc_INTRINSIC Vc_CONST m256d _mm256_mul_pd(m256d a, m256d b) { return static_cast(static_cast<__v4df>(a) * static_cast<__v4df>(b)); } static Vc_INTRINSIC Vc_CONST m256d _mm256_add_pd(m256d a, m256d b) { return static_cast(static_cast<__v4df>(a) + static_cast<__v4df>(b)); } static Vc_INTRINSIC Vc_CONST m256d _mm256_sub_pd(m256d a, m256d b) { return static_cast(static_cast<__v4df>(a) - static_cast<__v4df>(b)); } static Vc_INTRINSIC Vc_CONST m256 _mm256_mul_ps(m256 a, m256 b) { return static_cast(static_cast<__v8sf>(a) * static_cast<__v8sf>(b)); } static Vc_INTRINSIC Vc_CONST m256 _mm256_add_ps(m256 a, m256 b) { return static_cast(static_cast<__v8sf>(a) + static_cast<__v8sf>(b)); } static Vc_INTRINSIC Vc_CONST m256 _mm256_sub_ps(m256 a, m256 b) { return static_cast(static_cast<__v8sf>(a) - static_cast<__v8sf>(b)); } #endif static Vc_INTRINSIC m256 Vc_CONST set1_ps (float a) { return _mm256_set1_ps (a); } static Vc_INTRINSIC m256d Vc_CONST set1_pd (double a) { return _mm256_set1_pd (a); } static Vc_INTRINSIC m256i Vc_CONST set1_epi32(int a) { return _mm256_set1_epi32(a); } //static Vc_INTRINSIC m256i Vc_CONST _mm256_set1_epu32(unsigned int a) { return ::_mm256_set1_epu32(a); } static Vc_INTRINSIC Vc_CONST m128i _mm_setallone_si128() { return _mm_load_si128(reinterpret_cast(Common::AllBitsSet)); } static Vc_INTRINSIC Vc_CONST m128 _mm_setallone_ps() { return _mm_load_ps(reinterpret_cast(Common::AllBitsSet)); } static Vc_INTRINSIC Vc_CONST m128d _mm_setallone_pd() { return _mm_load_pd(reinterpret_cast(Common::AllBitsSet)); } static Vc_INTRINSIC Vc_CONST m256i setallone_si256() { return _mm256_castps_si256(_mm256_load_ps(reinterpret_cast(Common::AllBitsSet))); } static Vc_INTRINSIC Vc_CONST m256d setallone_pd() { return _mm256_load_pd(reinterpret_cast(Common::AllBitsSet)); } static Vc_INTRINSIC Vc_CONST m256 setallone_ps() { return _mm256_load_ps(reinterpret_cast(Common::AllBitsSet)); } static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epi8 () { return _mm_set1_epi8(1); } static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epu8 () { return _mm_setone_epi8(); } static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epi16() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast(c_general::one16))); } static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epu16() { return _mm_setone_epi16(); } static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epi32() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast(&_IndexesFromZero32[1]))); } static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epu32() { return _mm_setone_epi32(); } static Vc_INTRINSIC m256i Vc_CONST setone_epi8 () { return _mm256_set1_epi8(1); } static Vc_INTRINSIC m256i Vc_CONST setone_epu8 () { return setone_epi8(); } static Vc_INTRINSIC m256i Vc_CONST setone_epi16() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast(c_general::one16))); } static Vc_INTRINSIC m256i Vc_CONST setone_epu16() { return setone_epi16(); } static Vc_INTRINSIC m256i Vc_CONST setone_epi32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast(&_IndexesFromZero32[1]))); } static Vc_INTRINSIC m256i Vc_CONST setone_epu32() { return setone_epi32(); } static Vc_INTRINSIC m256 Vc_CONST setone_ps() { return _mm256_broadcast_ss(&c_general::oneFloat); } static Vc_INTRINSIC m256d Vc_CONST setone_pd() { return _mm256_broadcast_sd(&c_general::oneDouble); } static Vc_INTRINSIC m256d Vc_CONST setabsmask_pd() { return _mm256_broadcast_sd(reinterpret_cast(&c_general::absMaskFloat[0])); } static Vc_INTRINSIC m256 Vc_CONST setabsmask_ps() { return 
_mm256_broadcast_ss(reinterpret_cast(&c_general::absMaskFloat[1])); } static Vc_INTRINSIC m256d Vc_CONST setsignmask_pd(){ return _mm256_broadcast_sd(reinterpret_cast(&c_general::signMaskFloat[0])); } static Vc_INTRINSIC m256 Vc_CONST setsignmask_ps(){ return _mm256_broadcast_ss(reinterpret_cast(&c_general::signMaskFloat[1])); } static Vc_INTRINSIC m256 Vc_CONST set2power31_ps() { return _mm256_broadcast_ss(&c_general::_2power31); } static Vc_INTRINSIC m128 Vc_CONST _mm_set2power31_ps() { return _mm_broadcast_ss(&c_general::_2power31); } static Vc_INTRINSIC m256i Vc_CONST set2power31_epu32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast(&c_general::signMaskFloat[1]))); } static Vc_INTRINSIC m128i Vc_CONST _mm_set2power31_epu32() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast(&c_general::signMaskFloat[1]))); } static Vc_INTRINSIC m256i Vc_CONST setmin_epi8 () { return _mm256_set1_epi8(-0x80); } static Vc_INTRINSIC m128i Vc_CONST _mm_setmin_epi16() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast(c_general::minShort))); } static Vc_INTRINSIC m128i Vc_CONST _mm_setmin_epi32() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast(&c_general::signMaskFloat[1]))); } static Vc_INTRINSIC m256i Vc_CONST setmin_epi16() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast(c_general::minShort))); } static Vc_INTRINSIC m256i Vc_CONST setmin_epi32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast(&c_general::signMaskFloat[1]))); } template static Vc_INTRINSIC Vc_CONST unsigned char extract_epu8(__m128i x) { return _mm_extract_epi8(x, i); } template static Vc_INTRINSIC Vc_CONST unsigned short extract_epu16(__m128i x) { return _mm_extract_epi16(x, i); } template static Vc_INTRINSIC Vc_CONST unsigned int extract_epu32(__m128i x) { return _mm_extract_epi32(x, i); } template Vc_INTRINSIC __m256 insert128(__m256 a, __m128 b) { return _mm256_insertf128_ps(a, b, offset); } template Vc_INTRINSIC __m256d insert128(__m256d a, __m128d b) { return _mm256_insertf128_pd(a, b, offset); } template Vc_INTRINSIC __m256i insert128(__m256i a, __m128i b) { #ifdef Vc_IMPL_AVX2 return _mm256_inserti128_si256(a, b, offset); #else return _mm256_insertf128_si256(a, b, offset); #endif } template Vc_INTRINSIC __m128 extract128(__m256 a) { return _mm256_extractf128_ps(a, offset); } template Vc_INTRINSIC __m128d extract128(__m256d a) { return _mm256_extractf128_pd(a, offset); } template Vc_INTRINSIC __m128i extract128(__m256i a) { #ifdef Vc_IMPL_AVX2 return _mm256_extracti128_si256(a, offset); #else return _mm256_extractf128_si256(a, offset); #endif } /////////////////////// COMPARE OPS /////////////////////// static Vc_INTRINSIC m256d Vc_CONST cmpeq_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ); } static Vc_INTRINSIC m256d Vc_CONST cmpneq_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ); } static Vc_INTRINSIC m256d Vc_CONST cmplt_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS); } static Vc_INTRINSIC m256d Vc_CONST cmpnlt_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_NLT_US); } static Vc_INTRINSIC m256d Vc_CONST cmpge_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_NLT_US); } static Vc_INTRINSIC m256d Vc_CONST cmple_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS); } static Vc_INTRINSIC m256d Vc_CONST cmpnle_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_NLE_US); } static Vc_INTRINSIC m256d Vc_CONST cmpgt_pd (__m256d a, __m256d b) { return 
_mm256_cmp_pd(a, b, _CMP_NLE_US); } static Vc_INTRINSIC m256d Vc_CONST cmpord_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_ORD_Q); } static Vc_INTRINSIC m256d Vc_CONST cmpunord_pd(__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_UNORD_Q); } static Vc_INTRINSIC m256 Vc_CONST cmpeq_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); } static Vc_INTRINSIC m256 Vc_CONST cmpneq_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); } static Vc_INTRINSIC m256 Vc_CONST cmplt_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS); } static Vc_INTRINSIC m256 Vc_CONST cmpnlt_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); } static Vc_INTRINSIC m256 Vc_CONST cmpge_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); } static Vc_INTRINSIC m256 Vc_CONST cmple_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_LE_OS); } static Vc_INTRINSIC m256 Vc_CONST cmpnle_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); } static Vc_INTRINSIC m256 Vc_CONST cmpgt_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); } static Vc_INTRINSIC m256 Vc_CONST cmpord_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_ORD_Q); } static Vc_INTRINSIC m256 Vc_CONST cmpunord_ps(__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_UNORD_Q); } #if defined(Vc_IMPL_XOP) static Vc_INTRINSIC m128i cmplt_epu16(__m128i a, __m128i b) { return _mm_comlt_epu16(a, b); } static Vc_INTRINSIC m128i cmpgt_epu16(__m128i a, __m128i b) { return _mm_comgt_epu16(a, b); } #else static Vc_INTRINSIC m128i cmplt_epu16(__m128i a, __m128i b) { return _mm_cmplt_epi16(_mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16())); } static Vc_INTRINSIC m128i cmpgt_epu16(__m128i a, __m128i b) { return _mm_cmpgt_epi16(_mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16())); } #endif #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST m256i alignr(__m256i s1, __m256i s2) { return _mm256_alignr_epi8(s1, s2, shift); } #else template Vc_INTRINSIC Vc_CONST m256i alignr(__m256i s1, __m256i s2) { return insert128<1>( _mm256_castsi128_si256(_mm_alignr_epi8(_mm256_castsi256_si128(s1), _mm256_castsi256_si128(s2), shift)), _mm_alignr_epi8(extract128<1>(s1), extract128<1>(s2), shift)); } #endif #ifdef Vc_IMPL_AVX2 #define Vc_AVX_TO_SSE_2_NEW(name) \ Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m256i b0) \ { \ return _mm256_##name(a0, b0); \ } #define Vc_AVX_TO_SSE_256_128(name) \ Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m128i b0) \ { \ return _mm256_##name(a0, b0); \ } #define Vc_AVX_TO_SSE_1i(name) \ template Vc_INTRINSIC Vc_CONST m256i name(__m256i a0) \ { \ return _mm256_##name(a0, i); \ } #define Vc_AVX_TO_SSE_1(name) \ Vc_INTRINSIC Vc_CONST __m256i name(__m256i a0) { return _mm256_##name(a0); } #define Vc_AVX_TO_SSE_1_128(name, shift__) \ Vc_INTRINSIC Vc_CONST __m256i name(__m128i a0) { return _mm256_##name(a0); } #else /**\internal * Defines the function \p name, which takes to __m256i arguments and calls `_mm_##name` on the low * and high 128 bit halfs of the arguments. * * In case the AVX2 intrinsics are enabled, the arguments are directly passed to a single * `_mm256_##name` call. 
*/ #define Vc_AVX_TO_SSE_1(name) \ Vc_INTRINSIC Vc_CONST __m256i name(__m256i a0) \ { \ __m128i a1 = extract128<1>(a0); \ __m128i r0 = _mm_##name(_mm256_castsi256_si128(a0)); \ __m128i r1 = _mm_##name(a1); \ return insert128<1>(_mm256_castsi128_si256(r0), r1); \ } #define Vc_AVX_TO_SSE_1_128(name, shift__) \ Vc_INTRINSIC Vc_CONST __m256i name(__m128i a0) \ { \ __m128i r0 = _mm_##name(a0); \ __m128i r1 = _mm_##name(_mm_srli_si128(a0, shift__)); \ return insert128<1>(_mm256_castsi128_si256(r0), r1); \ } #define Vc_AVX_TO_SSE_2_NEW(name) \ Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m256i b0) \ { \ m128i a1 = extract128<1>(a0); \ m128i b1 = extract128<1>(b0); \ m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0)); \ m128i r1 = _mm_##name(a1, b1); \ return insert128<1>(_mm256_castsi128_si256(r0), r1); \ } #define Vc_AVX_TO_SSE_256_128(name) \ Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m128i b0) \ { \ m128i a1 = extract128<1>(a0); \ m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), b0); \ m128i r1 = _mm_##name(a1, b0); \ return insert128<1>(_mm256_castsi128_si256(r0), r1); \ } #define Vc_AVX_TO_SSE_1i(name) \ template Vc_INTRINSIC Vc_CONST m256i name(__m256i a0) \ { \ m128i a1 = extract128<1>(a0); \ m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), i); \ m128i r1 = _mm_##name(a1, i); \ return insert128<1>(_mm256_castsi128_si256(r0), r1); \ } #endif Vc_INTRINSIC Vc_CONST __m128i sll_epi16(__m128i a, __m128i b) { return _mm_sll_epi16(a, b); } Vc_INTRINSIC Vc_CONST __m128i sll_epi32(__m128i a, __m128i b) { return _mm_sll_epi32(a, b); } Vc_INTRINSIC Vc_CONST __m128i sll_epi64(__m128i a, __m128i b) { return _mm_sll_epi64(a, b); } Vc_INTRINSIC Vc_CONST __m128i srl_epi16(__m128i a, __m128i b) { return _mm_srl_epi16(a, b); } Vc_INTRINSIC Vc_CONST __m128i srl_epi32(__m128i a, __m128i b) { return _mm_srl_epi32(a, b); } Vc_INTRINSIC Vc_CONST __m128i srl_epi64(__m128i a, __m128i b) { return _mm_srl_epi64(a, b); } Vc_INTRINSIC Vc_CONST __m128i sra_epi16(__m128i a, __m128i b) { return _mm_sra_epi16(a, b); } Vc_INTRINSIC Vc_CONST __m128i sra_epi32(__m128i a, __m128i b) { return _mm_sra_epi32(a, b); } Vc_AVX_TO_SSE_1i(slli_epi16) Vc_AVX_TO_SSE_1i(slli_epi32) Vc_AVX_TO_SSE_1i(slli_epi64) Vc_AVX_TO_SSE_1i(srai_epi16) Vc_AVX_TO_SSE_1i(srai_epi32) Vc_AVX_TO_SSE_1i(srli_epi16) Vc_AVX_TO_SSE_1i(srli_epi32) Vc_AVX_TO_SSE_1i(srli_epi64) Vc_AVX_TO_SSE_256_128(sll_epi16) Vc_AVX_TO_SSE_256_128(sll_epi32) Vc_AVX_TO_SSE_256_128(sll_epi64) Vc_AVX_TO_SSE_256_128(srl_epi16) Vc_AVX_TO_SSE_256_128(srl_epi32) Vc_AVX_TO_SSE_256_128(srl_epi64) Vc_AVX_TO_SSE_256_128(sra_epi16) Vc_AVX_TO_SSE_256_128(sra_epi32) Vc_AVX_TO_SSE_2_NEW(cmpeq_epi8) Vc_AVX_TO_SSE_2_NEW(cmpeq_epi16) Vc_AVX_TO_SSE_2_NEW(cmpeq_epi32) Vc_AVX_TO_SSE_2_NEW(cmpeq_epi64) Vc_AVX_TO_SSE_2_NEW(cmpgt_epi8) Vc_AVX_TO_SSE_2_NEW(cmpgt_epi16) Vc_AVX_TO_SSE_2_NEW(cmpgt_epi32) Vc_AVX_TO_SSE_2_NEW(cmpgt_epi64) Vc_AVX_TO_SSE_2_NEW(packs_epi16) Vc_AVX_TO_SSE_2_NEW(packs_epi32) Vc_AVX_TO_SSE_2_NEW(packus_epi16) Vc_AVX_TO_SSE_2_NEW(unpackhi_epi8) Vc_AVX_TO_SSE_2_NEW(unpackhi_epi16) Vc_AVX_TO_SSE_2_NEW(unpackhi_epi32) Vc_AVX_TO_SSE_2_NEW(unpackhi_epi64) Vc_AVX_TO_SSE_2_NEW(unpacklo_epi8) Vc_AVX_TO_SSE_2_NEW(unpacklo_epi16) Vc_AVX_TO_SSE_2_NEW(unpacklo_epi32) Vc_AVX_TO_SSE_2_NEW(unpacklo_epi64) Vc_AVX_TO_SSE_2_NEW(add_epi8) Vc_AVX_TO_SSE_2_NEW(add_epi16) Vc_AVX_TO_SSE_2_NEW(add_epi32) Vc_AVX_TO_SSE_2_NEW(add_epi64) Vc_AVX_TO_SSE_2_NEW(adds_epi8) Vc_AVX_TO_SSE_2_NEW(adds_epi16) Vc_AVX_TO_SSE_2_NEW(adds_epu8) Vc_AVX_TO_SSE_2_NEW(adds_epu16) 
Vc_AVX_TO_SSE_2_NEW(sub_epi8) Vc_AVX_TO_SSE_2_NEW(sub_epi16) Vc_AVX_TO_SSE_2_NEW(sub_epi32) Vc_AVX_TO_SSE_2_NEW(sub_epi64) Vc_AVX_TO_SSE_2_NEW(subs_epi8) Vc_AVX_TO_SSE_2_NEW(subs_epi16) Vc_AVX_TO_SSE_2_NEW(subs_epu8) Vc_AVX_TO_SSE_2_NEW(subs_epu16) Vc_AVX_TO_SSE_2_NEW(madd_epi16) Vc_AVX_TO_SSE_2_NEW(mulhi_epi16) Vc_AVX_TO_SSE_2_NEW(mullo_epi16) Vc_AVX_TO_SSE_2_NEW(mul_epu32) Vc_AVX_TO_SSE_2_NEW(max_epi16) Vc_AVX_TO_SSE_2_NEW(max_epu8) Vc_AVX_TO_SSE_2_NEW(min_epi16) Vc_AVX_TO_SSE_2_NEW(min_epu8) Vc_AVX_TO_SSE_2_NEW(mulhi_epu16) // shufflehi_epi16 // shufflelo_epi16 (__m128i __A, const int __mask) // shuffle_epi32 (__m128i __A, const int __mask) // maskmoveu_si128 (__m128i __A, __m128i __B, char *__C) Vc_AVX_TO_SSE_2_NEW(avg_epu8) Vc_AVX_TO_SSE_2_NEW(avg_epu16) Vc_AVX_TO_SSE_2_NEW(sad_epu8) // stream_si32 (int *__A, int __B) // stream_si128 (__m128i *__A, __m128i __B) // cvtsi32_si128 (int __A) // cvtsi64_si128 (long long __A) // cvtsi64x_si128 (long long __A) Vc_AVX_TO_SSE_2_NEW(hadd_epi16) Vc_AVX_TO_SSE_2_NEW(hadd_epi32) Vc_AVX_TO_SSE_2_NEW(hadds_epi16) Vc_AVX_TO_SSE_2_NEW(hsub_epi16) Vc_AVX_TO_SSE_2_NEW(hsub_epi32) Vc_AVX_TO_SSE_2_NEW(hsubs_epi16) Vc_AVX_TO_SSE_2_NEW(maddubs_epi16) Vc_AVX_TO_SSE_2_NEW(mulhrs_epi16) Vc_AVX_TO_SSE_2_NEW(shuffle_epi8) Vc_AVX_TO_SSE_2_NEW(sign_epi8) Vc_AVX_TO_SSE_2_NEW(sign_epi16) Vc_AVX_TO_SSE_2_NEW(sign_epi32) Vc_AVX_TO_SSE_2_NEW(min_epi8) Vc_AVX_TO_SSE_2_NEW(max_epi8) Vc_AVX_TO_SSE_2_NEW(min_epu16) Vc_AVX_TO_SSE_2_NEW(max_epu16) Vc_AVX_TO_SSE_2_NEW(min_epi32) Vc_AVX_TO_SSE_2_NEW(max_epi32) Vc_AVX_TO_SSE_2_NEW(min_epu32) Vc_AVX_TO_SSE_2_NEW(max_epu32) Vc_AVX_TO_SSE_2_NEW(mullo_epi32) Vc_AVX_TO_SSE_2_NEW(mul_epi32) Vc_AVX_TO_SSE_1(abs_epi8) Vc_AVX_TO_SSE_1(abs_epi16) Vc_AVX_TO_SSE_1(abs_epi32) Vc_AVX_TO_SSE_1_128(cvtepi8_epi16, 8) Vc_AVX_TO_SSE_1_128(cvtepi8_epi32, 4) Vc_AVX_TO_SSE_1_128(cvtepi8_epi64, 2) Vc_AVX_TO_SSE_1_128(cvtepi16_epi32, 8) Vc_AVX_TO_SSE_1_128(cvtepi16_epi64, 4) Vc_AVX_TO_SSE_1_128(cvtepi32_epi64, 8) Vc_AVX_TO_SSE_1_128(cvtepu8_epi16, 8) Vc_AVX_TO_SSE_1_128(cvtepu8_epi32, 4) Vc_AVX_TO_SSE_1_128(cvtepu8_epi64, 2) Vc_AVX_TO_SSE_1_128(cvtepu16_epi32, 8) Vc_AVX_TO_SSE_1_128(cvtepu16_epi64, 4) Vc_AVX_TO_SSE_1_128(cvtepu32_epi64, 8) Vc_AVX_TO_SSE_2_NEW(packus_epi32) #ifndef Vc_IMPL_AVX2 ///////////////////////////////////////////////////////////////////////// // implementation of the intrinsics missing in AVX ///////////////////////////////////////////////////////////////////////// template Vc_INTRINSIC Vc_CONST __m256i srli_si256(__m256i a0) { const __m128i vLo = _mm256_castsi256_si128(a0); const __m128i vHi = extract128<1>(a0); return insert128<1>(_mm256_castsi128_si256(_mm_srli_si128(vLo, i)), _mm_srli_si128(vHi, i)); } template Vc_INTRINSIC Vc_CONST __m256i slli_si256(__m256i a0) { const __m128i vLo = _mm256_castsi256_si128(a0); const __m128i vHi = extract128<1>(a0); return insert128<1>(_mm256_castsi128_si256(_mm_slli_si128(vLo, i)), _mm_slli_si128(vHi, i)); } static Vc_INTRINSIC m256i Vc_CONST and_si256(__m256i x, __m256i y) { return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y))); } static Vc_INTRINSIC m256i Vc_CONST andnot_si256(__m256i x, __m256i y) { return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y))); } static Vc_INTRINSIC m256i Vc_CONST or_si256(__m256i x, __m256i y) { return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y))); } static Vc_INTRINSIC m256i Vc_CONST xor_si256(__m256i x, __m256i y) { return 
_mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y))); } Vc_INTRINSIC Vc_CONST int movemask_epi8(__m256i a0) { m128i a1 = extract128<1>(a0); return (_mm_movemask_epi8(a1) << 16) | _mm_movemask_epi8(_mm256_castsi256_si128(a0)); } template Vc_INTRINSIC Vc_CONST m256i blend_epi16(param256i a0, param256i b0) { m128i a1 = extract128<1>(a0); m128i b1 = extract128<1>(b0); m128i r0 = _mm_blend_epi16(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0), m & 0xff); m128i r1 = _mm_blend_epi16(a1, b1, m >> 8); return insert128<1>(_mm256_castsi128_si256(r0), r1); } Vc_INTRINSIC Vc_CONST m256i blendv_epi8(param256i a0, param256i b0, param256i m0) { m128i a1 = extract128<1>(a0); m128i b1 = extract128<1>(b0); m128i m1 = extract128<1>(m0); m128i r0 = _mm_blendv_epi8(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0), _mm256_castsi256_si128(m0)); m128i r1 = _mm_blendv_epi8(a1, b1, m1); return insert128<1>(_mm256_castsi128_si256(r0), r1); } // mpsadbw_epu8 (__m128i __X, __m128i __Y, const int __M) // stream_load_si128 (__m128i *__X) #else // Vc_IMPL_AVX2 static Vc_INTRINSIC Vc_CONST m256i xor_si256(__m256i x, __m256i y) { return _mm256_xor_si256(x, y); } static Vc_INTRINSIC Vc_CONST m256i or_si256(__m256i x, __m256i y) { return _mm256_or_si256(x, y); } static Vc_INTRINSIC Vc_CONST m256i and_si256(__m256i x, __m256i y) { return _mm256_and_si256(x, y); } static Vc_INTRINSIC Vc_CONST m256i andnot_si256(__m256i x, __m256i y) { return _mm256_andnot_si256(x, y); } template Vc_INTRINSIC Vc_CONST __m256i srli_si256(__m256i a0) { return _mm256_srli_si256(a0, i); } template Vc_INTRINSIC Vc_CONST __m256i slli_si256(__m256i a0) { return _mm256_slli_si256(a0, i); } ///////////////////////////////////////////////////////////////////////// // implementation of the intrinsics missing in AVX2 ///////////////////////////////////////////////////////////////////////// Vc_INTRINSIC Vc_CONST m256i blendv_epi8(__m256i a0, __m256i b0, __m256i m0) { return _mm256_blendv_epi8(a0, b0, m0); } Vc_INTRINSIC Vc_CONST int movemask_epi8(__m256i a0) { return _mm256_movemask_epi8(a0); } #endif // Vc_IMPL_AVX2 ///////////////////////////////////////////////////////////////////////// // implementation of intrinsics missing in AVX and AVX2 ///////////////////////////////////////////////////////////////////////// static Vc_INTRINSIC m256i cmplt_epi64(__m256i a, __m256i b) { return cmpgt_epi64(b, a); } static Vc_INTRINSIC m256i cmplt_epi32(__m256i a, __m256i b) { return cmpgt_epi32(b, a); } static Vc_INTRINSIC m256i cmplt_epi16(__m256i a, __m256i b) { return cmpgt_epi16(b, a); } static Vc_INTRINSIC m256i cmplt_epi8(__m256i a, __m256i b) { return cmpgt_epi8(b, a); } static Vc_INTRINSIC m256i cmplt_epu8(__m256i a, __m256i b) { return cmplt_epi8(xor_si256(a, setmin_epi8()), xor_si256(b, setmin_epi8())); } static Vc_INTRINSIC m256i cmpgt_epu8(__m256i a, __m256i b) { return cmpgt_epi8(xor_si256(a, setmin_epi8()), xor_si256(b, setmin_epi8())); } #if defined(Vc_IMPL_XOP) Vc_AVX_TO_SSE_2_NEW(comlt_epu32) Vc_AVX_TO_SSE_2_NEW(comgt_epu32) Vc_AVX_TO_SSE_2_NEW(comlt_epu16) Vc_AVX_TO_SSE_2_NEW(comgt_epu16) static Vc_INTRINSIC m256i Vc_CONST cmplt_epu32(__m256i a, __m256i b) { return comlt_epu32(a, b); } static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu32(__m256i a, __m256i b) { return comgt_epu32(a, b); } static Vc_INTRINSIC m256i Vc_CONST cmplt_epu16(__m256i a, __m256i b) { return comlt_epu16(a, b); } static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu16(__m256i a, __m256i b) { return comgt_epu16(a, b); } #else static Vc_INTRINSIC m256i 
Vc_CONST cmplt_epu32(__m256i _a, __m256i _b) { m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi32()))); m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi32()))); return cmplt_epi32(a, b); } static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu32(__m256i _a, __m256i _b) { m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi32()))); m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi32()))); return cmpgt_epi32(a, b); } static Vc_INTRINSIC m256i Vc_CONST cmplt_epu16(__m256i _a, __m256i _b) { m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi16()))); m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi16()))); return cmplt_epi16(a, b); } static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu16(__m256i _a, __m256i _b) { m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi16()))); m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi16()))); return cmpgt_epi16(a, b); } #endif static Vc_INTRINSIC void _mm256_maskstore(float *mem, const __m256 mask, const __m256 v) { _mm256_maskstore_ps(mem, _mm256_castps_si256(mask), v); } static Vc_INTRINSIC void _mm256_maskstore(double *mem, const __m256d mask, const __m256d v) { _mm256_maskstore_pd(mem, _mm256_castpd_si256(mask), v); } static Vc_INTRINSIC void _mm256_maskstore(int *mem, const __m256i mask, const __m256i v) { #ifdef Vc_IMPL_AVX2 _mm256_maskstore_epi32(mem, mask, v); #else _mm256_maskstore_ps(reinterpret_cast(mem), mask, _mm256_castsi256_ps(v)); #endif } static Vc_INTRINSIC void _mm256_maskstore(unsigned int *mem, const __m256i mask, const __m256i v) { _mm256_maskstore(reinterpret_cast(mem), mask, v); } static Vc_INTRINSIC void _mm256_maskstore(short *mem, const __m256i mask, const __m256i v) { using namespace AVX; _mm_maskmoveu_si128(_mm256_castsi256_si128(v), _mm256_castsi256_si128(mask), reinterpret_cast(&mem[0])); _mm_maskmoveu_si128(extract128<1>(v), extract128<1>(mask), reinterpret_cast(&mem[8])); } static Vc_INTRINSIC void _mm256_maskstore(unsigned short *mem, const __m256i mask, const __m256i v) { _mm256_maskstore(reinterpret_cast(mem), mask, v); } #undef Vc_AVX_TO_SSE_1 #undef Vc_AVX_TO_SSE_1_128 #undef Vc_AVX_TO_SSE_2_NEW #undef Vc_AVX_TO_SSE_256_128 #undef Vc_AVX_TO_SSE_1i template Vc_INTRINSIC_L R stream_load(const float *mem) Vc_INTRINSIC_R; template<> Vc_INTRINSIC m128 stream_load(const float *mem) { return _mm_castsi128_ps(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast(mem)))); } template<> Vc_INTRINSIC m256 stream_load(const float *mem) { return insert128<1>(_mm256_castps128_ps256(stream_load(mem)), stream_load(mem + 4)); } template Vc_INTRINSIC_L R stream_load(const double *mem) Vc_INTRINSIC_R; template<> Vc_INTRINSIC m128d stream_load(const double *mem) { return _mm_castsi128_pd(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast(mem)))); } template<> Vc_INTRINSIC m256d stream_load(const double *mem) { return insert128<1>(_mm256_castpd128_pd256(stream_load(mem)), stream_load(mem + 2)); } template Vc_INTRINSIC_L R stream_load(const void *mem) Vc_INTRINSIC_R; template<> Vc_INTRINSIC m128i stream_load(const void *mem) { return _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast(mem))); } template<> Vc_INTRINSIC m256i stream_load(const void 
*mem) { return insert128<1>(_mm256_castsi128_si256(stream_load(mem)), stream_load(static_cast(mem) + 1)); } Vc_INTRINSIC void stream_store(float *mem, __m128 value, __m128 mask) { _mm_maskmoveu_si128(_mm_castps_si128(value), _mm_castps_si128(mask), reinterpret_cast(mem)); } Vc_INTRINSIC void stream_store(float *mem, __m256 value, __m256 mask) { stream_store(mem, _mm256_castps256_ps128(value), _mm256_castps256_ps128(mask)); stream_store(mem + 4, extract128<1>(value), extract128<1>(mask)); } Vc_INTRINSIC void stream_store(double *mem, __m128d value, __m128d mask) { _mm_maskmoveu_si128(_mm_castpd_si128(value), _mm_castpd_si128(mask), reinterpret_cast(mem)); } Vc_INTRINSIC void stream_store(double *mem, __m256d value, __m256d mask) { stream_store(mem, _mm256_castpd256_pd128(value), _mm256_castpd256_pd128(mask)); stream_store(mem + 2, extract128<1>(value), extract128<1>(mask)); } Vc_INTRINSIC void stream_store(void *mem, __m128i value, __m128i mask) { _mm_maskmoveu_si128(value, mask, reinterpret_cast(mem)); } Vc_INTRINSIC void stream_store(void *mem, __m256i value, __m256i mask) { stream_store(mem, _mm256_castsi256_si128(value), _mm256_castsi256_si128(mask)); stream_store(static_cast<__m128i *>(mem) + 1, extract128<1>(value), extract128<1>(mask)); } #ifndef __x86_64__ Vc_INTRINSIC Vc_PURE __m128i _mm_cvtsi64_si128(int64_t x) { return _mm_castpd_si128(_mm_load_sd(reinterpret_cast(&x))); } #endif } // namespace AvxIntrinsics } // namespace Vc namespace Vc_VERSIONED_NAMESPACE { namespace AVX { using namespace AvxIntrinsics; } // namespace AVX namespace AVX2 { using namespace AvxIntrinsics; } // namespace AVX2 namespace AVX { template struct VectorTypeHelper; template<> struct VectorTypeHelper< char > { typedef __m256i Type; }; template<> struct VectorTypeHelper< signed char > { typedef __m256i Type; }; template<> struct VectorTypeHelper { typedef __m256i Type; }; template<> struct VectorTypeHelper< short> { typedef __m256i Type; }; template<> struct VectorTypeHelper { typedef __m256i Type; }; template<> struct VectorTypeHelper< int > { typedef __m256i Type; }; template<> struct VectorTypeHelper { typedef __m256i Type; }; template<> struct VectorTypeHelper< long > { typedef __m256i Type; }; template<> struct VectorTypeHelper { typedef __m256i Type; }; template<> struct VectorTypeHelper< long long> { typedef __m256i Type; }; template<> struct VectorTypeHelper { typedef __m256i Type; }; template<> struct VectorTypeHelper< float> { typedef __m256 Type; }; template<> struct VectorTypeHelper< double> { typedef __m256d Type; }; template struct SseVectorType; template<> struct SseVectorType<__m256 > { typedef __m128 Type; }; template<> struct SseVectorType<__m256i> { typedef __m128i Type; }; template<> struct SseVectorType<__m256d> { typedef __m128d Type; }; template<> struct SseVectorType<__m128 > { typedef __m128 Type; }; template<> struct SseVectorType<__m128i> { typedef __m128i Type; }; template<> struct SseVectorType<__m128d> { typedef __m128d Type; }; template using IntegerVectorType = typename std::conditional::type; template using DoubleVectorType = typename std::conditional::type; template using FloatVectorType = typename std::conditional::type; template struct VectorHelper {}; template struct GatherHelper; template struct ScatterHelper; template struct HasVectorDivisionHelper { enum { Value = 1 }; }; template struct VectorHelperSize; } // namespace AVX } // namespace Vc #endif // VC_AVX_INTRINSICS_H_ 
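// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the Vc sources: the Vc_AVX_TO_SSE_* macros
// above backfill 256-bit integer operations on AVX1-only targets by running
// the corresponding 128-bit SSE intrinsic on the low and high halves and
// recombining the result, and the cmp*_epu* helpers emulate the missing
// unsigned compares by flipping the sign bit before a signed compare. The two
// standalone functions below show these patterns with plain intrinsics from
// <immintrin.h>; the function names are made up for this example and the code
// assumes compilation with at least -mavx.
#include <immintrin.h>

// 256-bit integer add emulated on AVX1: split, operate per 128-bit half, merge.
static inline __m256i add_epi32_avx1_emulation(__m256i a, __m256i b)
{
    const __m128i a_lo = _mm256_castsi256_si128(a);       // low half of a
    const __m128i a_hi = _mm256_extractf128_si256(a, 1);  // high half of a
    const __m128i b_lo = _mm256_castsi256_si128(b);
    const __m128i b_hi = _mm256_extractf128_si256(b, 1);
    const __m128i r_lo = _mm_add_epi32(a_lo, b_lo);       // SSE2 op per half
    const __m128i r_hi = _mm_add_epi32(a_hi, b_hi);
    return _mm256_insertf128_si256(_mm256_castsi128_si256(r_lo), r_hi, 1);
}

// Unsigned 32-bit less-than without a native unsigned compare intrinsic:
// xor-ing the sign bit into both operands maps unsigned order onto signed
// order, so the signed compare gives the unsigned result.
static inline __m128i cmplt_epu32_via_bias(__m128i a, __m128i b)
{
    const __m128i bias = _mm_set1_epi32(int(0x80000000u));
    return _mm_cmplt_epi32(_mm_xor_si128(a, bias), _mm_xor_si128(b, bias));
}
// ---------------------------------------------------------------------------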
Vc-1.3.3/avx/limits.h000066400000000000000000000130101320703111200143170ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2009-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ #ifndef VC_AVX_LIMITS_H_ #define VC_AVX_LIMITS_H_ #include "intrinsics.h" #include "types.h" #include "macros.h" namespace std { #define Vc_NUM_LIM(T, _max, _min) \ template <> struct numeric_limits> : public numeric_limits { \ static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector max() Vc_NOEXCEPT \ { \ return _max; \ } \ static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector min() Vc_NOEXCEPT \ { \ return _min; \ } \ static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector lowest() Vc_NOEXCEPT \ { \ return min(); \ } \ static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector epsilon() Vc_NOEXCEPT \ { \ return Vc::AVX2::Vector::Zero(); \ } \ static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector round_error() Vc_NOEXCEPT \ { \ return Vc::AVX2::Vector::Zero(); \ } \ static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector infinity() Vc_NOEXCEPT \ { \ return Vc::AVX2::Vector::Zero(); \ } \ static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector quiet_NaN() Vc_NOEXCEPT \ { \ return Vc::AVX2::Vector::Zero(); \ } \ static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector signaling_NaN() Vc_NOEXCEPT \ { \ return Vc::AVX2::Vector::Zero(); \ } \ static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector denorm_min() Vc_NOEXCEPT \ { \ return Vc::AVX2::Vector::Zero(); \ } \ } #ifdef Vc_IMPL_AVX2 Vc_NUM_LIM(unsigned short, Vc::Detail::allone<__m256i>(), Vc::Detail::zero<__m256i>()); Vc_NUM_LIM( short, _mm256_srli_epi16(Vc::Detail::allone<__m256i>(), 1), Vc::AVX::setmin_epi16()); Vc_NUM_LIM( unsigned int, Vc::Detail::allone<__m256i>(), Vc::Detail::zero<__m256i>()); Vc_NUM_LIM( int, _mm256_srli_epi32(Vc::Detail::allone<__m256i>(), 1), Vc::AVX::setmin_epi32()); #endif #undef Vc_NUM_LIM } // namespace std #endif // VC_AVX_LIMITS_H_ Vc-1.3.3/avx/macros.h000066400000000000000000000032101320703111200143030ustar00rootroot00000000000000/* This file is part of the Vc library. 
{{{ Copyright © 2009-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ #include "../common/macros.h" #ifndef VC_AVX_MACROS_H_ #define VC_AVX_MACROS_H_ #endif // VC_AVX_MACROS_H_ Vc-1.3.3/avx/mask.h000066400000000000000000000224051320703111200137610ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2009-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ #ifndef VC_AVX_MASK_H_ #define VC_AVX_MASK_H_ #include #include "intrinsics.h" #include "../common/storage.h" #include "../common/bitscanintrinsics.h" #include "../common/maskbool.h" #include "detail.h" #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { template class Mask { public: using abi = VectorAbi::Avx; /** * The \c EntryType of masks is always bool, independent of \c T. 
*/ typedef bool EntryType; using value_type = EntryType; using MaskBool = Common::MaskBool; /** * The \c VectorEntryType, in contrast to \c EntryType, reveals information about the SIMD * implementation. This type is useful for the \c sizeof operator in generic functions. */ using VectorEntryType = MaskBool; /** * The associated Vector type. */ using Vector = AVX2::Vector; ///\internal using VectorTypeF = AVX::FloatVectorType::Type>; ///\internal using VectorTypeD = AVX::DoubleVectorType; ///\internal using VectorTypeI = AVX::IntegerVectorType; private: typedef const VectorTypeF VArg; typedef const VectorTypeD VdArg; typedef const VectorTypeI ViArg; public: static constexpr size_t Size = sizeof(VectorTypeF) / sizeof(T); static constexpr size_t MemoryAlignment = Size; static constexpr std::size_t size() { return Size; } Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(VectorType)); private: typedef Common::Storage Storage; public: /** * The \c VectorType reveals the implementation-specific internal type used for the * SIMD type. */ using VectorType = typename Storage::VectorType; using EntryReference = Vc::Detail::ElementReference; using reference = EntryReference; // abstracts the way Masks are passed to functions, it can easily be changed to const ref here #if defined Vc_MSVC && defined _WIN32 typedef const Mask &AsArg; #else typedef const Mask AsArg; #endif Vc_INTRINSIC Mask() {} Vc_INTRINSIC Mask(VArg x) : d(AVX::avx_cast(x)) {} Vc_INTRINSIC Mask(VdArg x) : d(AVX::avx_cast(x)) {} Vc_INTRINSIC Mask(ViArg x) : d(AVX::avx_cast(x)) {} Vc_INTRINSIC explicit Mask(VectorSpecialInitializerZero) : d(Detail::zero()) {} Vc_INTRINSIC explicit Mask(VectorSpecialInitializerOne) : d(Detail::allone()) {} Vc_INTRINSIC explicit Mask(bool b) : d(b ? Detail::allone() : Detail::zero()) { } Vc_INTRINSIC static Mask Zero() { return Mask{Vc::Zero}; } Vc_INTRINSIC static Mask One() { return Mask{Vc::One}; } // implicit cast template Vc_INTRINSIC Mask(U &&rhs, Common::enable_if_mask_converts_implicitly = nullarg) : d(AVX::avx_cast( Detail::mask_cast::Size, Size, VectorTypeF>( rhs.dataI()))) { } #if Vc_IS_VERSION_1 // explicit cast, implemented via simd_cast (in avx/simd_cast_caller.h) template Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between " "mask types") Vc_INTRINSIC explicit Mask(U &&rhs, Common::enable_if_mask_converts_explicitly = nullarg); #endif template Vc_INTRINSIC explicit Mask(const bool *mem, Flags f = Flags()) { load(mem, f); } template Vc_INTRINSIC void load(const bool *mem, Flags = Flags()); template Vc_INTRINSIC void store(bool *mem, Flags = Flags()) const; Vc_INTRINSIC Mask &operator=(const Mask &) = default; Vc_INTRINSIC_L Mask &operator=(const std::array &values) Vc_INTRINSIC_R; Vc_INTRINSIC_L operator std::array() const Vc_INTRINSIC_R; // specializations in mask.tcc Vc_INTRINSIC Vc_PURE bool operator==(const Mask &rhs) const { return Detail::movemask(d.v()) == Detail::movemask(rhs.d.v()); } Vc_INTRINSIC Vc_PURE bool operator!=(const Mask &rhs) const { return !operator==(rhs); } Vc_INTRINSIC Mask operator!() const { return Detail::andnot_(data(), Detail::allone()); } Vc_INTRINSIC Mask &operator&=(const Mask &rhs) { d.v() = AVX::avx_cast(Detail::and_(data(), rhs.data())); return *this; } Vc_INTRINSIC Mask &operator|=(const Mask &rhs) { d.v() = AVX::avx_cast(Detail::or_ (data(), rhs.data())); return *this; } Vc_INTRINSIC Mask &operator^=(const Mask &rhs) { d.v() = AVX::avx_cast(Detail::xor_(data(), rhs.data())); return *this; } Vc_INTRINSIC Vc_PURE Mask operator&(const 
Mask &rhs) const { return Detail::and_(data(), rhs.data()); } Vc_INTRINSIC Vc_PURE Mask operator|(const Mask &rhs) const { return Detail::or_(data(), rhs.data()); } Vc_INTRINSIC Vc_PURE Mask operator^(const Mask &rhs) const { return Detail::xor_(data(), rhs.data()); } Vc_INTRINSIC Vc_PURE Mask operator&&(const Mask &rhs) const { return Detail::and_(data(), rhs.data()); } Vc_INTRINSIC Vc_PURE Mask operator||(const Mask &rhs) const { return Detail::or_(data(), rhs.data()); } // no need for expression template optimizations because cmp(n)eq for floats are not bitwise // compares Vc_INTRINSIC_L bool isNotEmpty() const Vc_INTRINSIC_R; Vc_INTRINSIC_L bool isEmpty() const Vc_INTRINSIC_R; Vc_INTRINSIC_L bool isFull() const Vc_INTRINSIC_R; Vc_INTRINSIC_L bool isMix() const Vc_INTRINSIC_R; Vc_INTRINSIC Vc_PURE int shiftMask() const { return Detail::movemask(dataI()); } Vc_INTRINSIC Vc_PURE int toInt() const { return Detail::mask_to_int(dataI()); } Vc_INTRINSIC VectorTypeF data () const { return AVX::avx_cast(d.v()); } Vc_INTRINSIC VectorTypeI dataI() const { return AVX::avx_cast(d.v()); } Vc_INTRINSIC VectorTypeD dataD() const { return AVX::avx_cast(d.v()); } private: friend reference; static Vc_INTRINSIC Vc_PURE value_type get(const Mask &m, int i) noexcept { return m.toInt() & (1 << i); } template static Vc_INTRINSIC void set(Mask &m, int i, U &&v) noexcept(noexcept(MaskBool(std::declval()))) { m.d.set(i, MaskBool(std::forward(v))); } public: /** * \note the returned object models the concept of a reference and * as such it can exist longer than the data it is referencing. * \note to avoid lifetime issues, we strongly advice not to store * any reference objects. */ Vc_ALWAYS_INLINE reference operator[](size_t index) noexcept { return {*this, int(index)}; } Vc_ALWAYS_INLINE Vc_PURE value_type operator[](size_t index) const noexcept { return get(*this, index); } Vc_INTRINSIC Vc_PURE int count() const { return Detail::popcnt16(toInt()); } Vc_INTRINSIC Vc_PURE int firstOne() const { return _bit_scan_forward(toInt()); } template static Vc_INTRINSIC_L Mask generate(G &&gen) Vc_INTRINSIC_R; Vc_INTRINSIC_L Vc_PURE_L Mask shifted(int amount) const Vc_INTRINSIC_R Vc_PURE_R; private: #ifdef Vc_COMPILE_BENCHMARKS public: #endif Storage d; }; template constexpr size_t Mask::Size; template constexpr size_t Mask::MemoryAlignment; } // namespace Vc #include "mask.tcc" #endif // VC_AVX_MASK_H_ Vc-1.3.3/avx/mask.tcc000066400000000000000000000320321320703111200143000ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2011-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ namespace Vc_VERSIONED_NAMESPACE { // store {{{1 template template Vc_INTRINSIC void Mask::store(bool *mem, Flags f) const { Detail::mask_store(dataI(), mem, f); } // load {{{1 template template Vc_INTRINSIC void Mask::load(const bool *mem, Flags f) { d.v() = AVX::avx_cast(Detail::mask_load(mem, f)); } // operator[] {{{1 #ifdef Vc_IMPL_AVX2 template <> Vc_INTRINSIC Vc_PURE bool AVX2::Mask::get(const AVX2::Mask &m, int index) noexcept { return m.shiftMask() & (1 << 2 * index); } template <> Vc_INTRINSIC Vc_PURE bool AVX2::Mask::get(const AVX2::Mask &m, int index) noexcept { return m.shiftMask() & (1 << 2 * index); } #endif // operator== {{{1 template <> Vc_INTRINSIC Vc_PURE bool AVX2::double_m::operator==(const AVX2::double_m &rhs) const { return Detail::movemask(dataD()) == Detail::movemask(rhs.dataD()); } #ifdef Vc_IMPL_AVX2 template <> Vc_INTRINSIC Vc_PURE bool AVX2::short_m::operator==(const AVX2::short_m &rhs) const { return Detail::movemask(dataI()) == Detail::movemask(rhs.dataI()); } template <> Vc_INTRINSIC Vc_PURE bool AVX2::ushort_m::operator==(const AVX2::ushort_m &rhs) const { return Detail::movemask(dataI()) == Detail::movemask(rhs.dataI()); } #endif // isFull, isNotEmpty, isEmpty, isMix specializations{{{1 template Vc_INTRINSIC bool Mask::isFull() const { if (sizeof(T) == 8) { return 0 != Detail::testc(dataD(), Detail::allone()); } else if (sizeof(T) == 4) { return 0 != Detail::testc(data (), Detail::allone()); } else { return 0 != Detail::testc(dataI(), Detail::allone()); } } template Vc_INTRINSIC bool Mask::isNotEmpty() const { if (sizeof(T) == 8) { return 0 == Detail::testz(dataD(), dataD()); } else if (sizeof(T) == 4) { return 0 == Detail::testz(data (), data ()); } else { return 0 == Detail::testz(dataI(), dataI()); } } template Vc_INTRINSIC bool Mask::isEmpty() const { if (sizeof(T) == 8) { return 0 != Detail::testz(dataD(), dataD()); } else if (sizeof(T) == 4) { return 0 != Detail::testz(data (), data ()); } else { return 0 != Detail::testz(dataI(), dataI()); } } template Vc_INTRINSIC bool Mask::isMix() const { if (sizeof(T) == 8) { return 0 != Detail::testnzc(dataD(), Detail::allone()); } else if (sizeof(T) == 4) { return 0 != Detail::testnzc(data (), Detail::allone()); } else { return 0 != Detail::testnzc(dataI(), Detail::allone()); } } // generate {{{1 template Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant) { return _mm256_setr_epi64x( gen(0) ? 0xffffffffffffffffull : 0, gen(1) ? 0xffffffffffffffffull : 0, gen(2) ? 0xffffffffffffffffull : 0, gen(3) ? 0xffffffffffffffffull : 0); } template Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant) { return _mm256_setr_epi32(gen(0) ? 0xfffffffful : 0, gen(1) ? 0xfffffffful : 0, gen(2) ? 0xfffffffful : 0, gen(3) ? 0xfffffffful : 0, gen(4) ? 0xfffffffful : 0, gen(5) ? 0xfffffffful : 0, gen(6) ? 0xfffffffful : 0, gen(7) ? 0xfffffffful : 0); } template Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant) { return _mm256_setr_epi16(gen(0) ? 0xfffful : 0, gen(1) ? 0xfffful : 0, gen(2) ? 
0xfffful : 0, gen(3) ? 0xfffful : 0, gen(4) ? 0xfffful : 0, gen(5) ? 0xfffful : 0, gen(6) ? 0xfffful : 0, gen(7) ? 0xfffful : 0, gen(8) ? 0xfffful : 0, gen(9) ? 0xfffful : 0, gen(10) ? 0xfffful : 0, gen(11) ? 0xfffful : 0, gen(12) ? 0xfffful : 0, gen(13) ? 0xfffful : 0, gen(14) ? 0xfffful : 0, gen(15) ? 0xfffful : 0); } template template Vc_INTRINSIC AVX2::Mask Mask::generate(G &&gen) { return generate_impl>(std::forward(gen), std::integral_constant()); } // shifted {{{1 template Vc_INTRINSIC Vc_PURE AVX2::Mask Mask::shifted(int amount) const { switch (amount * int(sizeof(VectorEntryType))) { case 0: return *this; case 1: return Detail::shifted< 1>(dataI()); case 2: return Detail::shifted< 2>(dataI()); case 3: return Detail::shifted< 3>(dataI()); case 4: return Detail::shifted< 4>(dataI()); case 5: return Detail::shifted< 5>(dataI()); case 6: return Detail::shifted< 6>(dataI()); case 7: return Detail::shifted< 7>(dataI()); case 8: return Detail::shifted< 8>(dataI()); case 9: return Detail::shifted< 9>(dataI()); case 10: return Detail::shifted< 10>(dataI()); case 11: return Detail::shifted< 11>(dataI()); case 12: return Detail::shifted< 12>(dataI()); case 13: return Detail::shifted< 13>(dataI()); case 14: return Detail::shifted< 14>(dataI()); case 15: return Detail::shifted< 15>(dataI()); case 16: return Detail::shifted< 16>(dataI()); case 17: return Detail::shifted< 17>(dataI()); case 18: return Detail::shifted< 18>(dataI()); case 19: return Detail::shifted< 19>(dataI()); case 20: return Detail::shifted< 20>(dataI()); case 21: return Detail::shifted< 21>(dataI()); case 22: return Detail::shifted< 22>(dataI()); case 23: return Detail::shifted< 23>(dataI()); case 24: return Detail::shifted< 24>(dataI()); case 25: return Detail::shifted< 25>(dataI()); case 26: return Detail::shifted< 26>(dataI()); case 27: return Detail::shifted< 27>(dataI()); case 28: return Detail::shifted< 28>(dataI()); case 29: return Detail::shifted< 29>(dataI()); case 30: return Detail::shifted< 30>(dataI()); case 31: return Detail::shifted< 31>(dataI()); case -1: return Detail::shifted< -1>(dataI()); case -2: return Detail::shifted< -2>(dataI()); case -3: return Detail::shifted< -3>(dataI()); case -4: return Detail::shifted< -4>(dataI()); case -5: return Detail::shifted< -5>(dataI()); case -6: return Detail::shifted< -6>(dataI()); case -7: return Detail::shifted< -7>(dataI()); case -8: return Detail::shifted< -8>(dataI()); case -9: return Detail::shifted< -9>(dataI()); case -10: return Detail::shifted<-10>(dataI()); case -11: return Detail::shifted<-11>(dataI()); case -12: return Detail::shifted<-12>(dataI()); case -13: return Detail::shifted<-13>(dataI()); case -14: return Detail::shifted<-14>(dataI()); case -15: return Detail::shifted<-15>(dataI()); case -16: return Detail::shifted<-16>(dataI()); case -17: return Detail::shifted<-17>(dataI()); case -18: return Detail::shifted<-18>(dataI()); case -19: return Detail::shifted<-19>(dataI()); case -20: return Detail::shifted<-20>(dataI()); case -21: return Detail::shifted<-21>(dataI()); case -22: return Detail::shifted<-22>(dataI()); case -23: return Detail::shifted<-23>(dataI()); case -24: return Detail::shifted<-24>(dataI()); case -25: return Detail::shifted<-25>(dataI()); case -26: return Detail::shifted<-26>(dataI()); case -27: return Detail::shifted<-27>(dataI()); case -28: return Detail::shifted<-28>(dataI()); case -29: return Detail::shifted<-29>(dataI()); case -30: return Detail::shifted<-30>(dataI()); case -31: return Detail::shifted<-31>(dataI()); } return 
Zero(); } // }}}1 /* template<> Vc_INTRINSIC AVX2::Mask< 4, 32> &AVX2::Mask< 4, 32>::operator=(const std::array &values) { static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte"); unsigned int x = *reinterpret_cast(values.data()); x *= 0xffu; __m128i y = _mm_cvtsi32_si128(x); // 4 Bytes y = _mm_unpacklo_epi8(y, y); // 8 Bytes y = _mm_unpacklo_epi16(y, y); // 16 Bytes d.v() = AVX::avx_cast<__m256>(AVX::concat(_mm_unpacklo_epi32(y, y), _mm_unpackhi_epi32(y, y))); return *this; } template<> Vc_INTRINSIC AVX2::Mask< 8, 32> &AVX2::Mask< 8, 32>::operator=(const std::array &values) { static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte"); unsigned long long x = *reinterpret_cast(values.data()); x *= 0xffull; __m128i y = _mm_cvtsi64_si128(x); // 8 Bytes y = _mm_unpacklo_epi8(y, y); // 16 Bytes d.v() = AVX::avx_cast<__m256>(AVX::concat(_mm_unpacklo_epi16(y, y), _mm_unpackhi_epi16(y, y))); return *this; } template<> Vc_INTRINSIC AVX2::Mask< 8, 16> &AVX2::Mask< 8, 16>::operator=(const std::array &values) { static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte"); unsigned long long x = *reinterpret_cast(values.data()); x *= 0xffull; __m128i y = _mm_cvtsi64_si128(x); // 8 Bytes d.v() = AVX::avx_cast<__m128>(_mm_unpacklo_epi8(y, y)); return *this; } template<> Vc_INTRINSIC AVX2::Mask<16, 16> &AVX2::Mask<16, 16>::operator=(const std::array &values) { static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte"); __m128i x = _mm_loadu_si128(reinterpret_cast(values.data())); d.v() = _mm_andnot_ps(AVX::_mm_setallone_ps(), AVX::avx_cast<__m128>(_mm_sub_epi8(x, _mm_set1_epi8(1)))); return *this; } template<> Vc_INTRINSIC AVX2::Mask< 4, 32>::operator std::array() const { static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte"); __m128i x = _mm_packs_epi32(AVX::lo128(dataI()), AVX::hi128(dataI())); // 64bit -> 32bit x = _mm_packs_epi32(x, x); // 32bit -> 16bit x = _mm_srli_epi16(x, 15); x = _mm_packs_epi16(x, x); // 16bit -> 8bit std::array r; asm volatile("vmovd %1,%0" : "=m"(*r.data()) : "x"(x)); return r; } template<> Vc_INTRINSIC AVX2::Mask< 8, 32>::operator std::array() const { static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte"); __m128i x = _mm_packs_epi32(AVX::lo128(dataI()), AVX::hi128(dataI())); // 32bit -> 16bit x = _mm_srli_epi16(x, 15); x = _mm_packs_epi16(x, x); // 16bit -> 8bit std::array r; asm volatile("vmovq %1,%0" : "=m"(*r.data()) : "x"(x)); return r; } template<> Vc_INTRINSIC AVX2::Mask< 8, 16>::operator std::array() const { static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte"); __m128i x = _mm_srli_epi16(dataI(), 15); x = _mm_packs_epi16(x, x); // 16bit -> 8bit std::array r; asm volatile("vmovq %1,%0" : "=m"(*r.data()) : "x"(x)); return r; } template<> Vc_INTRINSIC AVX2::Mask<16, 16>::operator std::array() const { static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte"); __m128 x = _mm_and_ps(d.v(), AVX::avx_cast<__m128>(_mm_set1_epi32(0x01010101))); std::array r; asm volatile("vmovups %1,%0" : "=m"(*r.data()) : "x"(x)); return r; } */ } // vim: foldmethod=marker Vc-1.3.3/avx/math.h000066400000000000000000000310421320703111200137540ustar00rootroot00000000000000/* This file is part of the Vc library. 
{{{ Copyright © 2009-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ #ifndef VC_AVX_MATH_H_ #define VC_AVX_MATH_H_ #include "const.h" #include "limits.h" #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { // min & max {{{1 #ifdef Vc_IMPL_AVX2 Vc_ALWAYS_INLINE AVX2::int_v min(const AVX2::int_v &x, const AVX2::int_v &y) { return _mm256_min_epi32(x.data(), y.data()); } Vc_ALWAYS_INLINE AVX2::uint_v min(const AVX2::uint_v &x, const AVX2::uint_v &y) { return _mm256_min_epu32(x.data(), y.data()); } Vc_ALWAYS_INLINE AVX2::short_v min(const AVX2::short_v &x, const AVX2::short_v &y) { return _mm256_min_epi16(x.data(), y.data()); } Vc_ALWAYS_INLINE AVX2::ushort_v min(const AVX2::ushort_v &x, const AVX2::ushort_v &y) { return _mm256_min_epu16(x.data(), y.data()); } Vc_ALWAYS_INLINE AVX2::int_v max(const AVX2::int_v &x, const AVX2::int_v &y) { return _mm256_max_epi32(x.data(), y.data()); } Vc_ALWAYS_INLINE AVX2::uint_v max(const AVX2::uint_v &x, const AVX2::uint_v &y) { return _mm256_max_epu32(x.data(), y.data()); } Vc_ALWAYS_INLINE AVX2::short_v max(const AVX2::short_v &x, const AVX2::short_v &y) { return _mm256_max_epi16(x.data(), y.data()); } Vc_ALWAYS_INLINE AVX2::ushort_v max(const AVX2::ushort_v &x, const AVX2::ushort_v &y) { return _mm256_max_epu16(x.data(), y.data()); } #endif Vc_ALWAYS_INLINE AVX2::float_v min(const AVX2::float_v &x, const AVX2::float_v &y) { return _mm256_min_ps(x.data(), y.data()); } Vc_ALWAYS_INLINE AVX2::double_v min(const AVX2::double_v &x, const AVX2::double_v &y) { return _mm256_min_pd(x.data(), y.data()); } Vc_ALWAYS_INLINE AVX2::float_v max(const AVX2::float_v &x, const AVX2::float_v &y) { return _mm256_max_ps(x.data(), y.data()); } Vc_ALWAYS_INLINE AVX2::double_v max(const AVX2::double_v &x, const AVX2::double_v &y) { return _mm256_max_pd(x.data(), y.data()); } // sqrt {{{1 template Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector sqrt(const AVX2::Vector &x) { return AVX::VectorHelper::sqrt(x.data()); } // rsqrt {{{1 template Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector rsqrt(const AVX2::Vector &x) { return AVX::VectorHelper::rsqrt(x.data()); } // reciprocal {{{1 template Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector 
reciprocal(const AVX2::Vector &x) { return AVX::VectorHelper::reciprocal(x.data()); } // round {{{1 template Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector round(const AVX2::Vector &x) { return AVX::VectorHelper::round(x.data()); } // abs {{{1 Vc_INTRINSIC Vc_CONST AVX2::double_v abs(AVX2::double_v x) { return Detail::and_(x.data(), AVX::setabsmask_pd()); } Vc_INTRINSIC Vc_CONST AVX2::float_v abs(AVX2::float_v x) { return Detail::and_(x.data(), AVX::setabsmask_ps()); } #ifdef Vc_IMPL_AVX2 Vc_INTRINSIC Vc_CONST AVX2::int_v abs(AVX2::int_v x) { return _mm256_abs_epi32(x.data()); } Vc_INTRINSIC Vc_CONST AVX2::short_v abs(AVX2::short_v x) { return _mm256_abs_epi16(x.data()); } #endif // isfinite {{{1 Vc_ALWAYS_INLINE Vc_PURE AVX2::double_m isfinite(const AVX2::double_v &x) { return AVX::cmpord_pd(x.data(), _mm256_mul_pd(Detail::zero<__m256d>(), x.data())); } Vc_ALWAYS_INLINE Vc_PURE AVX2::float_m isfinite(const AVX2::float_v &x) { return AVX::cmpord_ps(x.data(), _mm256_mul_ps(Detail::zero<__m256>(), x.data())); } // isinf {{{1 Vc_ALWAYS_INLINE Vc_PURE AVX2::double_m isinf(const AVX2::double_v &x) { return _mm256_castsi256_pd(AVX::cmpeq_epi64( _mm256_castpd_si256(abs(x).data()), _mm256_castpd_si256(Detail::avx_broadcast(AVX::c_log::d(1))))); } Vc_ALWAYS_INLINE Vc_PURE AVX2::float_m isinf(const AVX2::float_v &x) { return _mm256_castsi256_ps( AVX::cmpeq_epi32(_mm256_castps_si256(abs(x).data()), _mm256_castps_si256(Detail::avx_broadcast(AVX::c_log::d(1))))); } // isnan {{{1 Vc_ALWAYS_INLINE Vc_PURE AVX2::double_m isnan(const AVX2::double_v &x) { return AVX::cmpunord_pd(x.data(), x.data()); } Vc_ALWAYS_INLINE Vc_PURE AVX2::float_m isnan(const AVX2::float_v &x) { return AVX::cmpunord_ps(x.data(), x.data()); } // copysign {{{1 Vc_INTRINSIC Vc_CONST AVX2::float_v copysign(AVX2::float_v mag, AVX2::float_v sign) { return _mm256_or_ps(_mm256_and_ps(sign.data(), AVX::setsignmask_ps()), _mm256_and_ps(mag.data(), AVX::setabsmask_ps())); } Vc_INTRINSIC Vc_CONST AVX2::double_v copysign(AVX2::double_v::AsArg mag, AVX2::double_v::AsArg sign) { return _mm256_or_pd(_mm256_and_pd(sign.data(), AVX::setsignmask_pd()), _mm256_and_pd(mag.data(), AVX::setabsmask_pd())); } //}}}1 // frexp {{{1 /** * splits \p v into exponent and mantissa, the sign is kept with the mantissa * * The return value will be in the range [0.5, 1.0[ * The \p e value will be an integer defining the power-of-two exponent */ inline AVX2::double_v frexp(AVX2::double_v::AsArg v, SimdArray *e) { const __m256d exponentBits = AVX::Const::exponentMask().dataD(); const __m256d exponentPart = _mm256_and_pd(v.data(), exponentBits); auto lo = AVX::avx_cast<__m128i>(AVX::lo128(exponentPart)); auto hi = AVX::avx_cast<__m128i>(AVX::hi128(exponentPart)); lo = _mm_sub_epi32(_mm_srli_epi64(lo, 52), _mm_set1_epi64x(0x3fe)); hi = _mm_sub_epi32(_mm_srli_epi64(hi, 52), _mm_set1_epi64x(0x3fe)); SSE::int_v exponent = Mem::shuffle(lo, hi); const __m256d exponentMaximized = _mm256_or_pd(v.data(), exponentBits); AVX2::double_v ret = _mm256_and_pd(exponentMaximized, _mm256_broadcast_sd(reinterpret_cast(&AVX::c_general::frexpMask))); const double_m zeroMask = v == AVX2::double_v::Zero(); ret(isnan(v) || !isfinite(v) || zeroMask) = v; exponent.setZero(simd_cast(zeroMask)); internal_data(*e) = exponent; return ret; } #ifdef Vc_IMPL_AVX2 inline SimdArray frexp( const SimdArray &v, SimdArray *e) { const __m256d exponentBits = AVX::Const::exponentMask().dataD(); const __m256d w[2] = {internal_data(internal_data0(v)).data(), internal_data(internal_data1(v)).data()}; const __m256i 
exponentPart[2] = { _mm256_castpd_si256(_mm256_and_pd(w[0], exponentBits)), _mm256_castpd_si256(_mm256_and_pd(w[1], exponentBits))}; const __m256i lo = _mm256_sub_epi32(_mm256_srli_epi64(exponentPart[0], 52), _mm256_set1_epi32(0x3fe)); // 0.1. 2.3. const __m256i hi = _mm256_sub_epi32(_mm256_srli_epi64(exponentPart[1], 52), _mm256_set1_epi32(0x3fe)); // 4.5. 6.7. const __m256i a = _mm256_unpacklo_epi32(lo, hi); // 04.. 26.. const __m256i b = _mm256_unpackhi_epi32(lo, hi); // 15.. 37.. const __m256i tmp = _mm256_unpacklo_epi32(a, b); // 0145 2367 const __m256i exponent = AVX::concat(_mm_unpacklo_epi64(AVX::lo128(tmp), AVX::hi128(tmp)), _mm_unpackhi_epi64(AVX::lo128(tmp), AVX::hi128(tmp))); // 0123 4567 const __m256d exponentMaximized[2] = {_mm256_or_pd(w[0], exponentBits), _mm256_or_pd(w[1], exponentBits)}; const auto frexpMask = _mm256_broadcast_sd(reinterpret_cast(&AVX::c_general::frexpMask)); SimdArray ret = { SimdArray( _mm256_and_pd(exponentMaximized[0], frexpMask)), SimdArray( _mm256_and_pd(exponentMaximized[1], frexpMask))}; const auto zeroMask = v == v.Zero(); ret(isnan(v) || !isfinite(v) || zeroMask) = v; internal_data(*e) = Detail::andnot_(simd_cast(zeroMask).dataI(), exponent); return ret; } #endif // Vc_IMPL_AVX2 namespace Detail { Vc_INTRINSIC AVX2::float_v::IndexType extractExponent(__m256 e) { SimdArray exponentPart; const auto ee = AVX::avx_cast<__m256i>(e); #ifdef Vc_IMPL_AVX2 exponentPart = AVX2::uint_v(ee); #else internal_data(internal_data0(exponentPart)) = AVX::lo128(ee); internal_data(internal_data1(exponentPart)) = AVX::hi128(ee); #endif return (exponentPart >> 23) - 0x7e; } } // namespace Detail inline AVX2::float_v frexp(AVX2::float_v::AsArg v, AVX2::float_v::IndexType *e) { using namespace Detail; using namespace AVX2; const __m256 exponentBits = Const::exponentMask().data(); *e = extractExponent(and_(v.data(), exponentBits)); const __m256 exponentMaximized = or_(v.data(), exponentBits); AVX2::float_v ret = _mm256_and_ps(exponentMaximized, avx_cast<__m256>(set1_epi32(0xbf7fffffu))); ret(isnan(v) || !isfinite(v) || v == AVX2::float_v::Zero()) = v; e->setZero(simd_cast(v == AVX2::float_v::Zero())); return ret; } // ldexp {{{1 /* -> x * 2^e * x == NaN -> NaN * x == (-)inf -> (-)inf */ inline AVX2::double_v ldexp(AVX2::double_v::AsArg v, const SimdArray &_e) { SSE::int_v e = internal_data(_e); e.setZero(simd_cast(v == AVX2::double_v::Zero())); const __m256i exponentBits = AVX::concat(_mm_slli_epi64(_mm_unpacklo_epi32(e.data(), e.data()), 52), _mm_slli_epi64(_mm_unpackhi_epi32(e.data(), e.data()), 52)); return AVX::avx_cast<__m256d>( AVX::add_epi64(AVX::avx_cast<__m256i>(v.data()), exponentBits)); } inline AVX2::float_v ldexp(AVX2::float_v::AsArg v, SimdArray e) { e.setZero(simd_cast(v == AVX2::float_v::Zero())); e <<= 23; return {AVX::avx_cast<__m256>( AVX::concat(_mm_add_epi32(AVX::avx_cast<__m128i>(AVX::lo128(v.data())), internal_data(internal_data0(e)).data()), _mm_add_epi32(AVX::avx_cast<__m128i>(AVX::hi128(v.data())), internal_data(internal_data1(e)).data())))}; } // trunc {{{1 Vc_ALWAYS_INLINE AVX2::float_v trunc(AVX2::float_v::AsArg v) { return _mm256_round_ps(v.data(), 0x3); } Vc_ALWAYS_INLINE AVX2::double_v trunc(AVX2::double_v::AsArg v) { return _mm256_round_pd(v.data(), 0x3); } // floor {{{1 Vc_ALWAYS_INLINE AVX2::float_v floor(AVX2::float_v::AsArg v) { return _mm256_floor_ps(v.data()); } Vc_ALWAYS_INLINE AVX2::double_v floor(AVX2::double_v::AsArg v) { return _mm256_floor_pd(v.data()); } // ceil {{{1 Vc_ALWAYS_INLINE AVX2::float_v ceil(AVX2::float_v::AsArg v) 
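// Illustrative usage sketch for the frexp()/ldexp() overloads defined above
// (added for clarification; not part of the original header). It is written
// against the public Vc 1.3 API as described in the doc comments, so treat
// the exact names and signatures as assumptions:
//
//   Vc::double_v v = 8.0;                              // 8 == 0.5 * 2^4
//   Vc::SimdArray<int, Vc::double_v::Size> e;
//   Vc::double_v m = Vc::frexp(v, &e);                 // m == 0.5, e == 4
//   Vc::double_v r = Vc::ldexp(m, e);                  // r == v again
//
// frexp() keeps the sign with the mantissa and returns it in [0.5, 1.0);
// NaN, +-inf and 0 inputs are passed through unchanged (with the exponent
// zeroed for zero inputs), and ldexp() maps NaN -> NaN and +-inf -> +-inf,
// matching the comments above.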
{ return _mm256_ceil_ps(v.data()); } Vc_ALWAYS_INLINE AVX2::double_v ceil(AVX2::double_v::AsArg v) { return _mm256_ceil_pd(v.data()); } // fma {{{1 template Vc_ALWAYS_INLINE Vector fma(Vector a, Vector b, Vector c) { return Detail::fma(a.data(), b.data(), c.data(), T()); } // }}}1 } // namespace Vc #endif // VC_AVX_MATH_H_ // vim: foldmethod=marker Vc-1.3.3/avx/shuffle.h000066400000000000000000000457061320703111200144730ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2011-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
}}}*/ #ifndef VC_AVX_SHUFFLE_H_ #define VC_AVX_SHUFFLE_H_ #include "../sse/shuffle.h" #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { namespace Detail { template struct Permutation {}; template struct Mask {}; #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST __m256i blend(__m256i a, __m256i b, Mask) { static_assert((Sel0 == 0 || Sel0 == 1) && (Sel1 == 0 || Sel1 == 1) && (Sel2 == 0 || Sel2 == 1) && (Sel3 == 0 || Sel3 == 1) && (Sel4 == 0 || Sel4 == 1) && (Sel5 == 0 || Sel5 == 1) && (Sel6 == 0 || Sel6 == 1) && (Sel7 == 0 || Sel7 == 1) && (Sel8 == 0 || Sel8 == 1) && (Sel9 == 0 || Sel9 == 1) && (Sel10 == 0 || Sel10 == 1) && (Sel11 == 0 || Sel11 == 1) && (Sel12 == 0 || Sel12 == 1) && (Sel13 == 0 || Sel13 == 1) && (Sel14 == 0 || Sel14 == 1) && (Sel15 == 0 || Sel15 == 1), "Selectors must be 0 or 1 to select the value from a or b"); constexpr uint8_t mask = static_cast( (Sel0 << 0 ) | (Sel1 << 1 ) | (Sel2 << 2 ) | (Sel3 << 3 ) | (Sel4 << 4 ) | (Sel5 << 5 ) | (Sel6 << 6 ) | (Sel7 << 7 ) | (Sel8 << 8 ) | (Sel9 << 9 ) | (Sel10 << 10) | (Sel11 << 11) | (Sel12 << 12) | (Sel13 << 13) | (Sel14 << 14) | (Sel15 << 15)); return _mm256_blend_epi16(a, b, mask); } #endif // Vc_IMPL_AVX2 } // namespace Detail namespace Mem { #ifdef Vc_IMPL_AVX2 template static Vc_ALWAYS_INLINE __m256i Vc_CONST permuteLo(__m256i x) { static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range"); static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range"); return _mm256_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); } template static Vc_ALWAYS_INLINE __m256i Vc_CONST permuteHi(__m256i x) { static_assert(Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4, "Incorrect_Range"); static_assert(Dst0 <= X7 && Dst1 <= X7 && Dst2 <= X7 && Dst3 <= X7, "Incorrect_Range"); return _mm256_shufflehi_epi16(x, (Dst0 - X4) + (Dst1 - X4) * 4 + (Dst2 - X4) * 16 + (Dst3 - X4) * 64); } #endif // Vc_IMPL_AVX2 template static Vc_ALWAYS_INLINE __m256 Vc_CONST permute128(__m256 x) { static_assert((L >= X0 && L <= X1) || L == Const0, "Incorrect_Range"); static_assert((H >= X0 && H <= X1) || H == Const0, "Incorrect_Range"); return _mm256_permute2f128_ps( x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4))); } template static Vc_ALWAYS_INLINE __m256d Vc_CONST permute128(__m256d x) { static_assert((L >= X0 && L <= X1) || L == Const0, "Incorrect_Range"); static_assert((H >= X0 && H <= X1) || H == Const0, "Incorrect_Range"); return _mm256_permute2f128_pd( x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4))); } template static Vc_ALWAYS_INLINE __m256i Vc_CONST permute128(__m256i x) { static_assert((L >= X0 && L <= X1) || L == Const0, "Incorrect_Range"); static_assert((H >= X0 && H <= X1) || H == Const0, "Incorrect_Range"); #ifdef Vc_IMPL_AVX2 return _mm256_permute2x128_si256( x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4))); #else return _mm256_permute2f128_si256( x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4))); #endif } template static Vc_ALWAYS_INLINE __m256 Vc_CONST shuffle128(__m256 x, __m256 y) { static_assert(L >= X0 && H >= X0, "Incorrect_Range"); static_assert(L <= Y1 && H <= Y1, "Incorrect_Range"); return _mm256_permute2f128_ps(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? 
H : H - Y0 + 2) * (1 << 4)); } template static Vc_ALWAYS_INLINE __m256i Vc_CONST shuffle128(__m256i x, __m256i y) { static_assert(L >= X0 && H >= X0, "Incorrect_Range"); static_assert(L <= Y1 && H <= Y1, "Incorrect_Range"); #ifdef Vc_IMPL_AVX2 return _mm256_permute2x128_si256( x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4)); #else return _mm256_permute2f128_si256( x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4)); #endif } template static Vc_ALWAYS_INLINE __m256d Vc_CONST shuffle128(__m256d x, __m256d y) { static_assert(L >= X0 && H >= X0, "Incorrect_Range"); static_assert(L <= Y1 && H <= Y1, "Incorrect_Range"); return _mm256_permute2f128_pd(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4)); } template static Vc_ALWAYS_INLINE __m256d Vc_CONST permute(__m256d x) { static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X2 && Dst3 >= X2, "Incorrect_Range"); static_assert(Dst0 <= X1 && Dst1 <= X1 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range"); return _mm256_permute_pd(x, Dst0 + Dst1 * 2 + (Dst2 - X2) * 4 + (Dst3 - X2) * 8); } template static Vc_ALWAYS_INLINE __m256 Vc_CONST permute(__m256 x) { static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range"); static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range"); return _mm256_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); } template static Vc_ALWAYS_INLINE __m256i Vc_CONST permute(__m256i x) { return _mm256_castps_si256(permute(_mm256_castsi256_ps(x))); } #ifdef Vc_IMPL_AVX2 template static Vc_ALWAYS_INLINE __m256i Vc_CONST permute4x64(__m256i x) { static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range"); static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range"); return _mm256_permute4x64_epi64(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); } #endif // Vc_IMPL_AVX2 template static Vc_ALWAYS_INLINE __m256d Vc_CONST shuffle(__m256d x, __m256d y) { static_assert(Dst0 >= X0 && Dst1 >= Y0 && Dst2 >= X2 && Dst3 >= Y2, "Incorrect_Range"); static_assert(Dst0 <= X1 && Dst1 <= Y1 && Dst2 <= X3 && Dst3 <= Y3, "Incorrect_Range"); return _mm256_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2 + (Dst2 - X2) * 4 + (Dst3 - Y2) * 8); } template static Vc_ALWAYS_INLINE __m256 Vc_CONST shuffle(__m256 x, __m256 y) { static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, "Incorrect_Range"); static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, "Incorrect_Range"); return _mm256_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64); } template static Vc_ALWAYS_INLINE __m256 Vc_CONST blend(__m256 x, __m256 y) { static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range"); static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range"); static_assert(Dst2 == X2 || Dst2 == Y2, "Incorrect_Range"); static_assert(Dst3 == X3 || Dst3 == Y3, "Incorrect_Range"); static_assert(Dst4 == X4 || Dst4 == Y4, "Incorrect_Range"); static_assert(Dst5 == X5 || Dst5 == Y5, "Incorrect_Range"); static_assert(Dst6 == X6 || Dst6 == Y6, "Incorrect_Range"); static_assert(Dst7 == X7 || Dst7 == Y7, "Incorrect_Range"); return _mm256_blend_ps(x, y, (Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 + (Dst2 / Y2) * 4 + (Dst3 / Y3) * 8 + (Dst4 / Y4) * 16 + (Dst5 / Y5) * 32 + (Dst6 / Y6) * 64 + (Dst7 / Y7) *128 ); } template static Vc_ALWAYS_INLINE __m256i Vc_CONST blend(__m256i x, __m256i y) { return _mm256_castps_si256(blend(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y))); } template struct ScaleForBlend 
{ enum { Value = Dst >= X4 ? Dst - X4 + Y0 : Dst }; }; template static Vc_ALWAYS_INLINE __m256 Vc_CONST permute(__m256 x) { static_assert(Dst0 >= X0 && Dst0 <= X7, "Incorrect_Range"); static_assert(Dst1 >= X0 && Dst1 <= X7, "Incorrect_Range"); static_assert(Dst2 >= X0 && Dst2 <= X7, "Incorrect_Range"); static_assert(Dst3 >= X0 && Dst3 <= X7, "Incorrect_Range"); static_assert(Dst4 >= X0 && Dst4 <= X7, "Incorrect_Range"); static_assert(Dst5 >= X0 && Dst5 <= X7, "Incorrect_Range"); static_assert(Dst6 >= X0 && Dst6 <= X7, "Incorrect_Range"); static_assert(Dst7 >= X0 && Dst7 <= X7, "Incorrect_Range"); if (Dst0 + X4 == Dst4 && Dst1 + X4 == Dst5 && Dst2 + X4 == Dst6 && Dst3 + X4 == Dst7) { return permute(x); } const __m128 loIn = _mm256_castps256_ps128(x); const __m128 hiIn = _mm256_extractf128_ps(x, 1); __m128 lo, hi; if (Dst0 < X4 && Dst1 < X4 && Dst2 < X4 && Dst3 < X4) { lo = _mm_permute_ps(loIn, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); } else if (Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4) { lo = _mm_permute_ps(hiIn, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); } else if (Dst0 < X4 && Dst1 < X4 && Dst2 >= X4 && Dst3 >= X4) { lo = shuffle(loIn, hiIn); } else if (Dst0 >= X4 && Dst1 >= X4 && Dst2 < X4 && Dst3 < X4) { lo = shuffle(hiIn, loIn); } else if (Dst0 == X0 && Dst1 == X4 && Dst2 == X1 && Dst3 == X5) { lo = _mm_unpacklo_ps(loIn, hiIn); } else if (Dst0 == X4 && Dst1 == X0 && Dst2 == X5 && Dst3 == X1) { lo = _mm_unpacklo_ps(hiIn, loIn); } else if (Dst0 == X2 && Dst1 == X6 && Dst2 == X3 && Dst3 == X7) { lo = _mm_unpackhi_ps(loIn, hiIn); } else if (Dst0 == X6 && Dst1 == X2 && Dst2 == X7 && Dst3 == X3) { lo = _mm_unpackhi_ps(hiIn, loIn); } else if (Dst0 % X4 == 0 && Dst1 % X4 == 1 && Dst2 % X4 == 2 && Dst3 % X4 == 3) { lo = blend::Value, ScaleForBlend::Value, ScaleForBlend::Value, ScaleForBlend::Value>(loIn, hiIn); } if (Dst4 >= X4 && Dst5 >= X4 && Dst6 >= X4 && Dst7 >= X4) { hi = _mm_permute_ps(hiIn, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64); } else if (Dst4 < X4 && Dst5 < X4 && Dst6 < X4 && Dst7 < X4) { hi = _mm_permute_ps(loIn, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64); } else if (Dst4 < X4 && Dst5 < X4 && Dst6 >= X4 && Dst7 >= X4) { hi = shuffle(loIn, hiIn); } else if (Dst4 >= X4 && Dst5 >= X4 && Dst6 < X4 && Dst7 < X4) { hi = shuffle(hiIn, loIn); } else if (Dst4 == X0 && Dst5 == X4 && Dst6 == X1 && Dst7 == X5) { hi = _mm_unpacklo_ps(loIn, hiIn); } else if (Dst4 == X4 && Dst5 == X0 && Dst6 == X5 && Dst7 == X1) { hi = _mm_unpacklo_ps(hiIn, loIn); } else if (Dst4 == X2 && Dst5 == X6 && Dst6 == X3 && Dst7 == X7) { hi = _mm_unpackhi_ps(loIn, hiIn); } else if (Dst4 == X6 && Dst5 == X2 && Dst6 == X7 && Dst7 == X3) { hi = _mm_unpackhi_ps(hiIn, loIn); } else if (Dst4 % X4 == 0 && Dst5 % X4 == 1 && Dst6 % X4 == 2 && Dst7 % X4 == 3) { hi = blend::Value, ScaleForBlend::Value, ScaleForBlend::Value, ScaleForBlend::Value>(loIn, hiIn); } return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1); } } // namespace Mem } // namespace Vc // little endian has the lo bits on the right and high bits on the left // with vectors this becomes greatly confusing: // Mem: abcd // Reg: dcba // // The shuffles and permutes above use memory ordering. 
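// Hedged example of the memory ordering used by the Vc::Mem functions above
// (added for illustration; not part of the original header; assumes the Vc
// and <immintrin.h> headers are included). With memory ordering, index X0
// refers to the element at the lowest address, i.e. the element read first
// from an array:
//
//   alignas(32) const float data[8] = {0, 1, 2, 3, 4, 5, 6, 7};
//   __m256 v = _mm256_load_ps(data);                    // memory order 0..7
//   __m256 s = Vc::Mem::permute128<Vc::X1, Vc::X0>(v);  // swap 128-bit halves
//   // s now holds {4, 5, 6, 7, 0, 1, 2, 3} in memory order, whereas the
//   // register view of the same value reads right-to-left as described above.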
The ones below use register ordering: namespace Vc_VERSIONED_NAMESPACE { namespace Reg { template static Vc_ALWAYS_INLINE __m256 Vc_CONST permute128(__m256 x, __m256 y) { static_assert(L >= X0 && H >= X0, "Incorrect_Range"); static_assert(L <= Y1 && H <= Y1, "Incorrect_Range"); return _mm256_permute2f128_ps(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4)); } template static Vc_ALWAYS_INLINE __m256i Vc_CONST permute128(__m256i x, __m256i y) { static_assert(L >= X0 && H >= X0, "Incorrect_Range"); static_assert(L <= Y1 && H <= Y1, "Incorrect_Range"); #ifdef Vc_IMPL_AVX2 return _mm256_permute2x128_si256( x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4)); #else return _mm256_permute2f128_si256( x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4)); #endif } template static Vc_ALWAYS_INLINE __m256d Vc_CONST permute128(__m256d x, __m256d y) { static_assert(L >= X0 && H >= X0, "Incorrect_Range"); static_assert(L <= Y1 && H <= Y1, "Incorrect_Range"); return _mm256_permute2f128_pd(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4)); } template static Vc_ALWAYS_INLINE __m256d Vc_CONST permute(__m256d x) { static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X2 && Dst3 >= X2, "Incorrect_Range"); static_assert(Dst0 <= X1 && Dst1 <= X1 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range"); return _mm256_permute_pd(x, Dst0 + Dst1 * 2 + (Dst2 - X2) * 4 + (Dst3 - X2) * 8); } template static Vc_ALWAYS_INLINE __m256 Vc_CONST permute(__m256 x) { static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range"); static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range"); return _mm256_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); } template static Vc_ALWAYS_INLINE __m128d Vc_CONST permute(__m128d x) { static_assert(Dst0 >= X0 && Dst1 >= X0, "Incorrect_Range"); static_assert(Dst0 <= X1 && Dst1 <= X1, "Incorrect_Range"); return _mm_permute_pd(x, Dst0 + Dst1 * 2); } template static Vc_ALWAYS_INLINE __m128 Vc_CONST permute(__m128 x) { static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range"); static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range"); return _mm_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); } template static Vc_ALWAYS_INLINE __m256d Vc_CONST shuffle(__m256d x, __m256d y) { static_assert(Dst0 >= X0 && Dst1 >= Y0 && Dst2 >= X2 && Dst3 >= Y2, "Incorrect_Range"); static_assert(Dst0 <= X1 && Dst1 <= Y1 && Dst2 <= X3 && Dst3 <= Y3, "Incorrect_Range"); return _mm256_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2 + (Dst2 - X2) * 4 + (Dst3 - Y2) * 8); } template static Vc_ALWAYS_INLINE __m256 Vc_CONST shuffle(__m256 x, __m256 y) { static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, "Incorrect_Range"); static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, "Incorrect_Range"); return _mm256_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64); } } // namespace Reg } // namespace Vc #endif // VC_AVX_SHUFFLE_H_ Vc-1.3.3/avx/simd_cast.h000066400000000000000000004135421320703111200150020ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2014-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ #ifndef VC_AVX_SIMD_CAST_H_ #define VC_AVX_SIMD_CAST_H_ #ifndef VC_AVX_VECTOR_H_ #error "Vc/avx/vector.h needs to be included before Vc/avx/simd_cast.h" #endif #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { // Declarations: helper macros Vc_SIMD_CAST_AVX_[124] & Vc_SIMD_CAST_[124] {{{1 #define Vc_SIMD_CAST_AVX_1(from_, to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast( \ AVX2::from_ x, enable_if::value> = nullarg) #define Vc_SIMD_CAST_AVX_2(from_, to_) \ static_assert(AVX2::from_::size() * 2 <= AVX2::to_::size(), \ "this type combination is wrong"); \ template \ Vc_INTRINSIC Vc_CONST To simd_cast( \ AVX2::from_ x0, AVX2::from_ x1, \ enable_if::value> = nullarg) #define Vc_SIMD_CAST_AVX_3(from_, to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast( \ AVX2::from_ x0, AVX2::from_ x1, AVX2::from_ x2, \ enable_if::value> = nullarg) #define Vc_SIMD_CAST_AVX_4(from_, to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast( \ AVX2::from_ x0, AVX2::from_ x1, AVX2::from_ x2, AVX2::from_ x3, \ enable_if::value> = nullarg) #define Vc_SIMD_CAST_1(from_, to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast( \ from_ x, enable_if::value> = nullarg) #define Vc_SIMD_CAST_2(from_, to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast( \ from_ x0, from_ x1, enable_if::value> = nullarg) #define Vc_SIMD_CAST_3(from_, to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast( \ from_ x0, from_ x1, from_ x2, enable_if::value> = nullarg) #define Vc_SIMD_CAST_4(from_, to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast( \ from_ x0, from_ x1, from_ x2, from_ x3, \ enable_if::value> = nullarg) #define Vc_SIMD_CAST_5(from_, to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast( \ from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, \ enable_if::value> = nullarg) #define Vc_SIMD_CAST_6(from_, to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast( \ from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, from_ x5, \ enable_if::value> = nullarg) #define Vc_SIMD_CAST_7(from_, to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast( \ from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, from_ x5, from_ x6, \ enable_if::value> = nullarg) #define Vc_SIMD_CAST_8(from_, to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast( \ from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, from_ x5, from_ x6, from_ x7, \ enable_if::value> = nullarg) #define Vc_SIMD_CAST_OFFSET(from_, to_, offset_) \ static_assert(from_::size() >= 
to_::size() * (offset_ + 1), \ "this offset cannot exist for this type combination"); \ template \ Vc_INTRINSIC Vc_CONST To simd_cast( \ from_ x, \ enable_if<(offset == offset_ && std::is_same::value)> = nullarg) // Declaration: SSE -> AVX where the AVX Vector is integral and thus of equal size() {{{1 // as the equivalent SSE Vector template Vc_INTRINSIC Vc_CONST To simd_cast(From x, enable_if<(AVX2::is_vector::value && SSE::is_vector::value && SSE::Vector::Size == To::Size)> = nullarg); template Vc_INTRINSIC Vc_CONST To simd_cast( From x0, From x1, enable_if<(AVX2::is_vector::value && SSE::is_vector::value && SSE::Vector::Size == To::Size)> = nullarg); template Vc_INTRINSIC Vc_CONST To simd_cast( From x0, From x1, From x2, enable_if<(AVX2::is_vector::value && SSE::is_vector::value && SSE::Vector::Size == To::Size)> = nullarg); template Vc_INTRINSIC Vc_CONST To simd_cast( From x0, From x1, From x2, From x3, enable_if<(AVX2::is_vector::value && SSE::is_vector::value && SSE::Vector::Size == To::Size)> = nullarg); template Vc_INTRINSIC Vc_CONST To simd_cast( From x0, From x1, From x2, From x3, From x4, From x5, From x6, From x7, enable_if<(AVX2::is_vector::value && SSE::is_vector::value && SSE::Vector::Size == To::Size)> = nullarg); // Declarations: Vector casts without offset {{{1 // AVX2::Vector {{{2 Vc_SIMD_CAST_AVX_1( float_v, double_v); Vc_SIMD_CAST_AVX_1(double_v, float_v); Vc_SIMD_CAST_AVX_2(double_v, float_v); #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_1( int_v, double_v); Vc_SIMD_CAST_AVX_1( uint_v, double_v); Vc_SIMD_CAST_AVX_1( short_v, double_v); Vc_SIMD_CAST_AVX_1(ushort_v, double_v); Vc_SIMD_CAST_AVX_1( int_v, float_v); Vc_SIMD_CAST_AVX_1( uint_v, float_v); Vc_SIMD_CAST_AVX_1( short_v, float_v); Vc_SIMD_CAST_AVX_1(ushort_v, float_v); Vc_SIMD_CAST_AVX_1(double_v, int_v); Vc_SIMD_CAST_AVX_1( float_v, int_v); Vc_SIMD_CAST_AVX_1( uint_v, int_v); Vc_SIMD_CAST_AVX_1( short_v, int_v); Vc_SIMD_CAST_AVX_1(ushort_v, int_v); Vc_SIMD_CAST_AVX_2(double_v, int_v); Vc_SIMD_CAST_AVX_1(double_v, uint_v); Vc_SIMD_CAST_AVX_1( float_v, uint_v); Vc_SIMD_CAST_AVX_1( int_v, uint_v); Vc_SIMD_CAST_AVX_1( short_v, uint_v); Vc_SIMD_CAST_AVX_1(ushort_v, uint_v); Vc_SIMD_CAST_AVX_2(double_v, uint_v); Vc_SIMD_CAST_AVX_1(double_v, short_v); Vc_SIMD_CAST_AVX_1( float_v, short_v); Vc_SIMD_CAST_AVX_1( int_v, short_v); Vc_SIMD_CAST_AVX_1( uint_v, short_v); Vc_SIMD_CAST_AVX_1(ushort_v, short_v); Vc_SIMD_CAST_AVX_2(double_v, short_v); Vc_SIMD_CAST_AVX_2( float_v, short_v); Vc_SIMD_CAST_AVX_2( int_v, short_v); Vc_SIMD_CAST_AVX_2( uint_v, short_v); Vc_SIMD_CAST_AVX_3(double_v, short_v); Vc_SIMD_CAST_AVX_4(double_v, short_v); Vc_SIMD_CAST_AVX_1(double_v, ushort_v); Vc_SIMD_CAST_AVX_1( float_v, ushort_v); Vc_SIMD_CAST_AVX_1( int_v, ushort_v); Vc_SIMD_CAST_AVX_1( uint_v, ushort_v); Vc_SIMD_CAST_AVX_1( short_v, ushort_v); Vc_SIMD_CAST_AVX_2(double_v, ushort_v); Vc_SIMD_CAST_AVX_2( float_v, ushort_v); Vc_SIMD_CAST_AVX_2( int_v, ushort_v); Vc_SIMD_CAST_AVX_2( uint_v, ushort_v); Vc_SIMD_CAST_AVX_3(double_v, ushort_v); Vc_SIMD_CAST_AVX_4(double_v, ushort_v); #endif // 1 SSE::Vector to 1 AVX2::Vector {{{2 Vc_SIMD_CAST_1(SSE::double_v, AVX2::double_v); Vc_SIMD_CAST_1(SSE:: float_v, AVX2::double_v); Vc_SIMD_CAST_1(SSE:: int_v, AVX2::double_v); Vc_SIMD_CAST_1(SSE:: uint_v, AVX2::double_v); Vc_SIMD_CAST_1(SSE:: short_v, AVX2::double_v); Vc_SIMD_CAST_1(SSE::ushort_v, AVX2::double_v); Vc_SIMD_CAST_1(SSE::double_v, AVX2:: float_v); Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: float_v); Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: float_v); 
Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: float_v); Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: float_v); Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: float_v); #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_1(SSE::double_v, AVX2:: int_v); Vc_SIMD_CAST_1(SSE::double_v, AVX2:: uint_v); Vc_SIMD_CAST_1(SSE::double_v, AVX2:: short_v); Vc_SIMD_CAST_1(SSE::double_v, AVX2::ushort_v); Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: int_v); Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: uint_v); Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: short_v); Vc_SIMD_CAST_1(SSE:: float_v, AVX2::ushort_v); Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: int_v); Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: int_v); Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: int_v); Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: int_v); Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: uint_v); Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: uint_v); Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: uint_v); Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: uint_v); Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: short_v); Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: short_v); Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: short_v); Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: short_v); Vc_SIMD_CAST_1(SSE:: int_v, AVX2::ushort_v); Vc_SIMD_CAST_1(SSE:: uint_v, AVX2::ushort_v); Vc_SIMD_CAST_1(SSE:: short_v, AVX2::ushort_v); Vc_SIMD_CAST_1(SSE::ushort_v, AVX2::ushort_v); #endif // 2 SSE::Vector to 1 AVX2::Vector {{{2 Vc_SIMD_CAST_2(SSE::double_v, AVX2::double_v); Vc_SIMD_CAST_2(SSE::double_v, AVX2:: float_v); Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: float_v); Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: float_v); Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: float_v); #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_2(SSE::double_v, AVX2:: int_v); Vc_SIMD_CAST_2(SSE::double_v, AVX2:: uint_v); Vc_SIMD_CAST_2(SSE::double_v, AVX2:: short_v); Vc_SIMD_CAST_2(SSE::double_v, AVX2::ushort_v); Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: int_v); Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: uint_v); Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: short_v); Vc_SIMD_CAST_2(SSE:: float_v, AVX2::ushort_v); Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: int_v); Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: int_v); Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: uint_v); Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: uint_v); Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: short_v); Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: short_v); Vc_SIMD_CAST_2(SSE:: short_v, AVX2:: short_v); Vc_SIMD_CAST_2(SSE::ushort_v, AVX2:: short_v); Vc_SIMD_CAST_2(SSE:: int_v, AVX2::ushort_v); Vc_SIMD_CAST_2(SSE:: uint_v, AVX2::ushort_v); Vc_SIMD_CAST_2(SSE:: short_v, AVX2::ushort_v); Vc_SIMD_CAST_2(SSE::ushort_v, AVX2::ushort_v); #endif // 3 SSE::Vector to 1 AVX2::Vector {{{2 Vc_SIMD_CAST_3(SSE::double_v, AVX2:: float_v); #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_3(SSE::double_v, AVX2:: int_v); Vc_SIMD_CAST_3(SSE::double_v, AVX2:: uint_v); Vc_SIMD_CAST_3(SSE::double_v, AVX2:: short_v); Vc_SIMD_CAST_3(SSE::double_v, AVX2::ushort_v); Vc_SIMD_CAST_3(SSE:: float_v, AVX2:: short_v); Vc_SIMD_CAST_3(SSE:: float_v, AVX2::ushort_v); Vc_SIMD_CAST_3(SSE:: int_v, AVX2:: short_v); Vc_SIMD_CAST_3(SSE:: uint_v, AVX2:: short_v); Vc_SIMD_CAST_3(SSE:: int_v, AVX2::ushort_v); Vc_SIMD_CAST_3(SSE:: uint_v, AVX2::ushort_v); #endif // 4 SSE::Vector to 1 AVX2::Vector {{{2 Vc_SIMD_CAST_4(SSE::double_v, AVX2:: float_v); #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_4(SSE::double_v, AVX2:: int_v); Vc_SIMD_CAST_4(SSE::double_v, AVX2:: uint_v); Vc_SIMD_CAST_4(SSE::double_v, AVX2:: short_v); Vc_SIMD_CAST_4(SSE::double_v, AVX2::ushort_v); Vc_SIMD_CAST_4(SSE:: float_v, AVX2:: short_v); Vc_SIMD_CAST_4(SSE:: float_v, AVX2::ushort_v); Vc_SIMD_CAST_4(SSE:: int_v, AVX2:: short_v); Vc_SIMD_CAST_4(SSE:: uint_v, AVX2:: short_v); Vc_SIMD_CAST_4(SSE:: 
int_v, AVX2::ushort_v); Vc_SIMD_CAST_4(SSE:: uint_v, AVX2::ushort_v); #endif // 5 SSE::Vector to 1 AVX2::Vector {{{2 #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_5(SSE::double_v, AVX2:: short_v); Vc_SIMD_CAST_5(SSE::double_v, AVX2::ushort_v); #endif // 6 SSE::Vector to 1 AVX2::Vector {{{2 #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_6(SSE::double_v, AVX2:: short_v); Vc_SIMD_CAST_6(SSE::double_v, AVX2::ushort_v); #endif // 7 SSE::Vector to 1 AVX2::Vector {{{2 #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_7(SSE::double_v, AVX2:: short_v); Vc_SIMD_CAST_7(SSE::double_v, AVX2::ushort_v); #endif // 8 SSE::Vector to 1 AVX2::Vector {{{2 #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_8(SSE::double_v, AVX2:: short_v); Vc_SIMD_CAST_8(SSE::double_v, AVX2::ushort_v); #endif // 1 AVX2::Vector to 1 SSE::Vector {{{2 Vc_SIMD_CAST_1(AVX2::double_v, SSE::double_v); Vc_SIMD_CAST_1(AVX2::double_v, SSE:: float_v); Vc_SIMD_CAST_1(AVX2::double_v, SSE:: int_v); Vc_SIMD_CAST_1(AVX2::double_v, SSE:: uint_v); Vc_SIMD_CAST_1(AVX2::double_v, SSE:: short_v); Vc_SIMD_CAST_1(AVX2::double_v, SSE::ushort_v); Vc_SIMD_CAST_1(AVX2:: float_v, SSE::double_v); Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: float_v); Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: int_v); Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: uint_v); Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: short_v); Vc_SIMD_CAST_1(AVX2:: float_v, SSE::ushort_v); #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_1(AVX2:: int_v, SSE::double_v); Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: float_v); Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: uint_v); Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: int_v); Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: short_v); Vc_SIMD_CAST_1(AVX2:: int_v, SSE::ushort_v); Vc_SIMD_CAST_1(AVX2:: uint_v, SSE::double_v); Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: float_v); Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: int_v); Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: uint_v); Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: short_v); Vc_SIMD_CAST_1(AVX2:: uint_v, SSE::ushort_v); Vc_SIMD_CAST_1(AVX2:: short_v, SSE::double_v); Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: float_v); Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: int_v); Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: uint_v); Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: short_v); Vc_SIMD_CAST_1(AVX2:: short_v, SSE::ushort_v); Vc_SIMD_CAST_1(AVX2::ushort_v, SSE::double_v); Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: float_v); Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: int_v); Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: uint_v); Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: short_v); Vc_SIMD_CAST_1(AVX2::ushort_v, SSE::ushort_v); #endif // 2 AVX2::Vector to 1 SSE::Vector {{{2 Vc_SIMD_CAST_2(AVX2::double_v, SSE:: short_v); Vc_SIMD_CAST_2(AVX2::double_v, SSE::ushort_v); // 1 Scalar::Vector to 1 AVX2::Vector {{{2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x, enable_if::value> = nullarg); #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x, enable_if::value> = nullarg); #endif // 2 Scalar::Vector to 1 AVX2::Vector {{{2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, enable_if::value> = nullarg); #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST 
Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, enable_if::value> = nullarg); #endif // 3 Scalar::Vector to 1 AVX2::Vector {{{2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, enable_if::value> = nullarg); #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, enable_if::value> = nullarg); #endif // 4 Scalar::Vector to 1 AVX2::Vector {{{2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, enable_if::value> = nullarg); #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, enable_if::value> = nullarg); #endif // 5 Scalar::Vector to 1 AVX2::Vector {{{2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, enable_if::value> = nullarg); #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, enable_if::value> = nullarg); #endif // 6 Scalar::Vector to 1 AVX2::Vector {{{2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, enable_if::value> = nullarg); #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return 
simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, enable_if::value> = nullarg); #endif // 7 Scalar::Vector to 1 AVX2::Vector {{{2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, enable_if::value> = nullarg); #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, enable_if::value> = nullarg); #endif // 8 Scalar::Vector to 1 AVX2::Vector {{{2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, enable_if::value> = nullarg); #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, enable_if::value> = nullarg); #endif // 9 Scalar::Vector to 1 AVX2::Vector {{{2 #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector 
x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, enable_if::value> = nullarg); #endif // 10 Scalar::Vector to 1 AVX2::Vector {{{2 #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, enable_if::value> = nullarg); #endif // 11 Scalar::Vector to 1 AVX2::Vector {{{2 #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, enable_if::value> = nullarg); #endif // 12 Scalar::Vector to 1 AVX2::Vector {{{2 #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, enable_if::value> = nullarg); #endif // 13 Scalar::Vector to 1 AVX2::Vector {{{2 #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, Scalar::Vector x12, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, Scalar::Vector x12, enable_if::value> = nullarg); #endif // 14 Scalar::Vector to 1 AVX2::Vector {{{2 #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, Scalar::Vector x12, Scalar::Vector x13, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, Scalar::Vector x12, Scalar::Vector x13, enable_if::value> = nullarg); #endif // 15 Scalar::Vector 
to 1 AVX2::Vector {{{2 #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, Scalar::Vector x12, Scalar::Vector x13, Scalar::Vector x14, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, Scalar::Vector x12, Scalar::Vector x13, Scalar::Vector x14, enable_if::value> = nullarg); #endif // 16 Scalar::Vector to 1 AVX2::Vector {{{2 #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, Scalar::Vector x12, Scalar::Vector x13, Scalar::Vector x14, Scalar::Vector x15, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, Scalar::Vector x12, Scalar::Vector x13, Scalar::Vector x14, Scalar::Vector x15, enable_if::value> = nullarg); #endif // 1 AVX2::Vector to 1 Scalar::Vector {{{2 template Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::Vector x, enable_if::value> = nullarg); // Declarations: Mask casts without offset {{{1 // 1 AVX2::Mask to 1 AVX2::Mask {{{2 template Vc_INTRINSIC Vc_CONST Return simd_cast(const AVX2::Mask &k, enable_if::value> = nullarg); // 2 AVX2::Mask to 1 AVX2::Mask {{{2 Vc_SIMD_CAST_AVX_2(double_m, float_m); #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_2(double_m, int_m); Vc_SIMD_CAST_AVX_2(double_m, uint_m); Vc_SIMD_CAST_AVX_2(double_m, short_m); Vc_SIMD_CAST_AVX_2(double_m, ushort_m); Vc_SIMD_CAST_AVX_2( float_m, short_m); Vc_SIMD_CAST_AVX_2( float_m, ushort_m); Vc_SIMD_CAST_AVX_2( int_m, short_m); Vc_SIMD_CAST_AVX_2( int_m, ushort_m); Vc_SIMD_CAST_AVX_2( uint_m, short_m); Vc_SIMD_CAST_AVX_2( uint_m, ushort_m); #endif // 4 AVX2::Mask to 1 AVX2::Mask {{{2 #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_4(double_m, short_m); Vc_SIMD_CAST_AVX_4(double_m, ushort_m); #endif // 1 SSE::Mask to 1 AVX2::Mask {{{2 Vc_SIMD_CAST_1(SSE::double_m, AVX2::double_m); Vc_SIMD_CAST_1(SSE::double_m, AVX2:: float_m); #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_1(SSE::double_m, AVX2:: int_m); Vc_SIMD_CAST_1(SSE::double_m, AVX2:: uint_m); Vc_SIMD_CAST_1(SSE::double_m, AVX2:: short_m); Vc_SIMD_CAST_1(SSE::double_m, AVX2::ushort_m); #endif Vc_SIMD_CAST_1(SSE:: float_m, AVX2::double_m); Vc_SIMD_CAST_1(SSE:: int_m, AVX2::double_m); Vc_SIMD_CAST_1(SSE:: uint_m, AVX2::double_m); Vc_SIMD_CAST_1(SSE:: short_m, AVX2::double_m); Vc_SIMD_CAST_1(SSE::ushort_m, AVX2::double_m); Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: float_m); Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: float_m); Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: float_m); Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: float_m); Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: float_m); #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: int_m); Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: uint_m); Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: int_m); 
Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: uint_m); Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: int_m); Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: uint_m); Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: short_m); Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: short_m); Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: short_m); Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: short_m); Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: short_m); Vc_SIMD_CAST_1(SSE:: float_m, AVX2::ushort_m); Vc_SIMD_CAST_1(SSE:: int_m, AVX2::ushort_m); Vc_SIMD_CAST_1(SSE:: uint_m, AVX2::ushort_m); Vc_SIMD_CAST_1(SSE:: short_m, AVX2::ushort_m); Vc_SIMD_CAST_1(SSE::ushort_m, AVX2::ushort_m); Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: int_m); Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: uint_m); Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: int_m); Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: uint_m); #endif // 2 SSE::Mask to 1 AVX2::Mask {{{2 Vc_SIMD_CAST_2(SSE::double_m, AVX2::double_m); Vc_SIMD_CAST_2(SSE::double_m, AVX2:: float_m); Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: float_m); Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: float_m); Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: float_m); #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_2(SSE::double_m, AVX2:: int_m); Vc_SIMD_CAST_2(SSE::double_m, AVX2:: uint_m); Vc_SIMD_CAST_2(SSE::double_m, AVX2:: short_m); Vc_SIMD_CAST_2(SSE::double_m, AVX2::ushort_m); Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: int_m); Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: uint_m); Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: short_m); Vc_SIMD_CAST_2(SSE:: float_m, AVX2::ushort_m); Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: int_m); Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: uint_m); Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: short_m); Vc_SIMD_CAST_2(SSE:: int_m, AVX2::ushort_m); Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: int_m); Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: uint_m); Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: short_m); Vc_SIMD_CAST_2(SSE:: uint_m, AVX2::ushort_m); Vc_SIMD_CAST_2(SSE:: short_m, AVX2:: short_m); Vc_SIMD_CAST_2(SSE:: short_m, AVX2::ushort_m); Vc_SIMD_CAST_2(SSE::ushort_m, AVX2:: short_m); Vc_SIMD_CAST_2(SSE::ushort_m, AVX2::ushort_m); #endif // 4 SSE::Mask to 1 AVX2::Mask {{{2 Vc_SIMD_CAST_4(SSE::double_m, AVX2:: float_m); #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_4(SSE::double_m, AVX2:: int_m); Vc_SIMD_CAST_4(SSE::double_m, AVX2:: uint_m); Vc_SIMD_CAST_4(SSE::double_m, AVX2:: short_m); Vc_SIMD_CAST_4(SSE::double_m, AVX2::ushort_m); Vc_SIMD_CAST_4(SSE:: float_m, AVX2:: short_m); Vc_SIMD_CAST_4(SSE:: float_m, AVX2::ushort_m); Vc_SIMD_CAST_4(SSE:: int_m, AVX2:: short_m); Vc_SIMD_CAST_4(SSE:: int_m, AVX2::ushort_m); Vc_SIMD_CAST_4(SSE:: uint_m, AVX2:: short_m); Vc_SIMD_CAST_4(SSE:: uint_m, AVX2::ushort_m); #endif // 1 Scalar::Mask to 1 AVX2::Mask {{{2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Mask k, enable_if::value> = nullarg); // 2 Scalar::Mask to 1 AVX2::Mask {{{2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Mask k0, Scalar::Mask k1, enable_if::value> = nullarg); // 4 Scalar::Mask to 1 AVX2::Mask {{{2 template Vc_INTRINSIC Vc_CONST Return simd_cast( Scalar::Mask k0, Scalar::Mask k1, Scalar::Mask k2, Scalar::Mask k3, enable_if<(AVX2::is_mask::value && Return::Size >= 4)> = nullarg); // 8 Scalar::Mask to 1 AVX2::Mask {{{2 template Vc_INTRINSIC Vc_CONST Return simd_cast( Scalar::Mask k0, Scalar::Mask k1, Scalar::Mask k2, Scalar::Mask k3, Scalar::Mask k4, Scalar::Mask k5, Scalar::Mask k6, Scalar::Mask k7, enable_if<(AVX2::is_mask::value && Return::Size >= 8)> = nullarg); // 16 Scalar::Mask to 1 AVX2::Mask {{{2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Mask k0, Scalar::Mask k1, Scalar::Mask k2, Scalar::Mask k3, Scalar::Mask k4, Scalar::Mask 
k5, Scalar::Mask k6, Scalar::Mask k7, Scalar::Mask k8, Scalar::Mask k9, Scalar::Mask k10, Scalar::Mask k11, Scalar::Mask k12, Scalar::Mask k13, Scalar::Mask k14, Scalar::Mask k15, enable_if<(AVX2::is_mask::value && Return::Size >= 16)> = nullarg); // 1 AVX2::Mask to 1 SSE::Mask {{{2 Vc_SIMD_CAST_1(AVX2::double_m, SSE::double_m); Vc_SIMD_CAST_1(AVX2::double_m, SSE:: float_m); Vc_SIMD_CAST_1(AVX2::double_m, SSE:: int_m); Vc_SIMD_CAST_1(AVX2::double_m, SSE:: uint_m); Vc_SIMD_CAST_1(AVX2::double_m, SSE:: short_m); Vc_SIMD_CAST_1(AVX2::double_m, SSE::ushort_m); Vc_SIMD_CAST_1(AVX2:: float_m, SSE::double_m); Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: float_m); Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: int_m); Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: uint_m); Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: short_m); Vc_SIMD_CAST_1(AVX2:: float_m, SSE::ushort_m); #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_1(AVX2:: int_m, SSE::double_m); Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: float_m); Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: int_m); Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: uint_m); Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: short_m); Vc_SIMD_CAST_1(AVX2:: int_m, SSE::ushort_m); Vc_SIMD_CAST_1(AVX2:: uint_m, SSE::double_m); Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: float_m); Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: int_m); Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: uint_m); Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: short_m); Vc_SIMD_CAST_1(AVX2:: uint_m, SSE::ushort_m); Vc_SIMD_CAST_1(AVX2:: short_m, SSE::double_m); Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: float_m); Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: int_m); Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: uint_m); Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: short_m); Vc_SIMD_CAST_1(AVX2:: short_m, SSE::ushort_m); Vc_SIMD_CAST_1(AVX2::ushort_m, SSE::double_m); Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: float_m); Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: int_m); Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: uint_m); Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: short_m); Vc_SIMD_CAST_1(AVX2::ushort_m, SSE::ushort_m); #endif // 2 AVX2::Mask to 1 SSE::Mask {{{2 Vc_SIMD_CAST_2(AVX2::double_m, SSE:: short_m); Vc_SIMD_CAST_2(AVX2::double_m, SSE::ushort_m); // 1 AVX2::Mask to 1 Scalar::Mask {{{2 template Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::Mask x, enable_if::value> = nullarg); // Declaration: offset == 0 | convert from AVX2::Mask/Vector {{{1 template Vc_INTRINSIC Vc_CONST enable_if< (offset == 0 && ((AVX2::is_vector::value && !Scalar::is_vector::value && Traits::is_simd_vector::value && !Traits::isSimdArray::value) || (AVX2::is_mask::value && !Scalar::is_mask::value && Traits::is_simd_mask::value && !Traits::isSimdMaskArray::value))), Return> simd_cast(const From &x); // Declaration: offset == 0 | convert from SSE::Mask/Vector to AVX2::Mask/Vector {{{1 template Vc_INTRINSIC Vc_CONST Return simd_cast( const From &x, enable_if::value && AVX2::is_vector::value) || (SSE::is_mask::value && AVX2::is_mask::value))> = nullarg); // Declarations: Vector casts with offset {{{1 // AVX2 to AVX2 {{{2 template Vc_INTRINSIC Vc_CONST enable_if<(AVX2::is_vector::value && offset != 0), Return> simd_cast(AVX2::Vector x); // AVX2 to SSE (Vector) {{{2 template Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_vector::value && sizeof(AVX2::Vector) == 32), Return> simd_cast(AVX2::Vector x); template Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_vector::value && sizeof(AVX2::Vector) == 16), Return> simd_cast(AVX2::Vector x); // SSE to AVX2 {{{2 Vc_SIMD_CAST_OFFSET(SSE:: short_v, AVX2::double_v, 1); Vc_SIMD_CAST_OFFSET(SSE::ushort_v, AVX2::double_v, 1); // Declarations: Mask casts with 
offset {{{1 // 1 AVX2::Mask to N AVX2::Mask {{{2 /* This declaration confuses GCC (4.9.2). If the declarations are there the definitions * are ignored by the compiler. ;-( template Vc_INTRINSIC_L Vc_CONST_L Return simd_cast(const AVX2::Mask &k, enable_if::value> = nullarg) Vc_INTRINSIC_R Vc_CONST_R; template Vc_INTRINSIC_L Vc_CONST_L Return simd_cast(const AVX2::Mask &k, enable_if::value> = nullarg) Vc_INTRINSIC_R Vc_CONST_R; template Vc_INTRINSIC_L Vc_CONST_L Return simd_cast(const AVX2::Mask &k, enable_if::value> = nullarg) Vc_INTRINSIC_R Vc_CONST_R; template Vc_INTRINSIC_L Vc_CONST_L Return simd_cast(const AVX2::Mask &k, enable_if::value> = nullarg) Vc_INTRINSIC_R Vc_CONST_R; */ // 1 SSE::Mask to N AVX2(2)::Mask {{{2 Vc_SIMD_CAST_OFFSET(SSE:: short_m, AVX2::double_m, 1); Vc_SIMD_CAST_OFFSET(SSE::ushort_m, AVX2::double_m, 1); // AVX2 to SSE (Mask) {{{2 template Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_mask::value && sizeof(AVX2::Mask) == 32), Return> simd_cast(AVX2::Mask x); template Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_mask::value && sizeof(AVX2::Mask) == 16), Return> simd_cast(AVX2::Mask x); // helper macros Vc_SIMD_CAST_AVX_[124] & Vc_SIMD_CAST_[124] {{{1 #undef Vc_SIMD_CAST_AVX_1 #define Vc_SIMD_CAST_AVX_1(from_, to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::from_ x, \ enable_if::value>) #undef Vc_SIMD_CAST_AVX_2 #define Vc_SIMD_CAST_AVX_2(from_, to_) \ static_assert(AVX2::from_::size() * 2 <= AVX2::to_::size(), \ "this type combination is wrong"); \ template \ Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::from_ x0, AVX2::from_ x1, \ enable_if::value>) #undef Vc_SIMD_CAST_AVX_3 #define Vc_SIMD_CAST_AVX_3(from_, to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::from_ x0, AVX2::from_ x1, AVX2::from_ x2, \ enable_if::value>) #undef Vc_SIMD_CAST_AVX_4 #define Vc_SIMD_CAST_AVX_4(from_, to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::from_ x0, AVX2::from_ x1, AVX2::from_ x2, \ AVX2::from_ x3, \ enable_if::value>) #undef Vc_SIMD_CAST_1 #define Vc_SIMD_CAST_1(from_, to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast(from_ x, enable_if::value>) #undef Vc_SIMD_CAST_2 #define Vc_SIMD_CAST_2(from_, to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, \ enable_if::value>) #undef Vc_SIMD_CAST_3 #define Vc_SIMD_CAST_3(from_, to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, \ enable_if::value>) #undef Vc_SIMD_CAST_4 #define Vc_SIMD_CAST_4(from_, to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, \ enable_if::value>) #undef Vc_SIMD_CAST_5 #define Vc_SIMD_CAST_5(from_, to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, \ enable_if::value>) #undef Vc_SIMD_CAST_6 #define Vc_SIMD_CAST_6(from_, to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, \ from_ x5, \ enable_if::value>) #undef Vc_SIMD_CAST_7 #define Vc_SIMD_CAST_7(from_, to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, \ from_ x5, from_ x6, \ enable_if::value>) #undef Vc_SIMD_CAST_8 #define Vc_SIMD_CAST_8(from_, to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, \ from_ x5, from_ x6, from_ x7, \ enable_if::value>) #undef Vc_SIMD_CAST_OFFSET #define Vc_SIMD_CAST_OFFSET(from_, to_, offset_) \ static_assert(from_::size() >= to_::size() * (offset_ + 1), \ "this offset cannot exist for this 
type combination"); \ template \ Vc_INTRINSIC Vc_CONST To simd_cast( \ from_ x, enable_if<(offset == offset_ && std::is_same::value)>) // SSE -> AVX2 where the AVX2 Vector is integral and thus of equal size() as the {{{1 // equivalent SSE Vector template Vc_INTRINSIC Vc_CONST To simd_cast(From x, enable_if<(AVX2::is_vector::value && SSE::is_vector::value && SSE::Vector::Size == To::Size)>) { return simd_cast>(x).data(); } template Vc_INTRINSIC Vc_CONST To simd_cast(From x0, From x1, enable_if<(AVX2::is_vector::value && SSE::is_vector::value && SSE::Vector::Size == To::Size)>) { return simd_cast>(x0, x1).data(); } template Vc_INTRINSIC Vc_CONST To simd_cast(From x0, From x1, From x2, enable_if<(AVX2::is_vector::value && SSE::is_vector::value && SSE::Vector::Size == To::Size)>) { return simd_cast>(x0, x1, x2).data(); } template Vc_INTRINSIC Vc_CONST To simd_cast(From x0, From x1, From x2, From x3, enable_if<(AVX2::is_vector::value && SSE::is_vector::value && SSE::Vector::Size == To::Size)>) { return simd_cast>(x0, x1, x2, x3).data(); } template Vc_INTRINSIC Vc_CONST To simd_cast(From x0, From x1, From x2, From x3, From x4, From x5, From x6, From x7, enable_if<(AVX2::is_vector::value && SSE::is_vector::value && SSE::Vector::Size == To::Size)>) { return simd_cast>(x0, x1, x2, x3, x4, x5, x6, x7) .data(); } // Vector casts without offset {{{1 // AVX2::Vector {{{2 // 1: to double_v {{{3 Vc_SIMD_CAST_AVX_1( float_v, double_v) { return _mm256_cvtps_pd(AVX::lo128(x.data())); } #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_1( int_v, double_v) { return AVX::convert< int, double>(AVX::lo128(x.data())); } Vc_SIMD_CAST_AVX_1( uint_v, double_v) { return AVX::convert< uint, double>(AVX::lo128(x.data())); } Vc_SIMD_CAST_AVX_1( short_v, double_v) { return AVX::convert< short, double>(AVX::lo128(x.data())); } Vc_SIMD_CAST_AVX_1(ushort_v, double_v) { return AVX::convert(AVX::lo128(x.data())); } #endif // 1: to float_v {{{3 Vc_SIMD_CAST_AVX_1(double_v, float_v) { return AVX::zeroExtend(_mm256_cvtpd_ps(x.data())); } #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_1( int_v, float_v) { return AVX::convert< int, float>(x.data()); } Vc_SIMD_CAST_AVX_1( uint_v, float_v) { return AVX::convert< uint, float>(x.data()); } Vc_SIMD_CAST_AVX_1( short_v, float_v) { return AVX::convert< short, float>(AVX::lo128(x.data())); } Vc_SIMD_CAST_AVX_1(ushort_v, float_v) { return AVX::convert(AVX::lo128(x.data())); } #endif // 2: to float_v {{{3 Vc_SIMD_CAST_AVX_2(double_v, float_v) { return AVX::concat(_mm256_cvtpd_ps(x0.data()), _mm256_cvtpd_ps(x1.data())); } // 1: to int_v {{{3 #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_1(double_v, int_v) { return AVX::zeroExtend(_mm256_cvttpd_epi32(x.data())); } Vc_SIMD_CAST_AVX_1( float_v, int_v) { return _mm256_cvttps_epi32(x.data()); } Vc_SIMD_CAST_AVX_1( uint_v, int_v) { return x.data(); } Vc_SIMD_CAST_AVX_1( short_v, int_v) { const auto tmp = Mem::permute4x64(x.data()); return _mm256_srai_epi32(_mm256_unpacklo_epi16(tmp, tmp), 16); } Vc_SIMD_CAST_AVX_1(ushort_v, int_v) { const auto tmp = Mem::permute4x64(x.data()); return _mm256_srli_epi32(_mm256_unpacklo_epi16(tmp, tmp), 16); } #endif // 2: to int_v {{{3 #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_2(double_v, int_v) { return AVX::concat(_mm256_cvttpd_epi32(x0.data()), _mm256_cvttpd_epi32(x1.data())); } #endif // 1: to uint_v {{{3 #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_1(double_v, uint_v) { return AVX::zeroExtend(AVX::convert(x.data())); } Vc_SIMD_CAST_AVX_1( float_v, uint_v) { return _mm256_blendv_epi8( _mm256_cvttps_epi32(x.data()), _mm256_add_epi32( 
_mm256_cvttps_epi32(_mm256_sub_ps(x.data(), AVX::set2power31_ps())), AVX::set2power31_epu32()), _mm256_castps_si256(AVX::cmpge_ps(x.data(), AVX::set2power31_ps()))); } Vc_SIMD_CAST_AVX_1( int_v, uint_v) { return x.data(); } Vc_SIMD_CAST_AVX_1( short_v, uint_v) { const auto tmp = Mem::permute4x64(x.data()); return _mm256_srai_epi32(_mm256_unpacklo_epi16(tmp, tmp), 16); } Vc_SIMD_CAST_AVX_1(ushort_v, uint_v) { const auto tmp = Mem::permute4x64(x.data()); return _mm256_srli_epi32(_mm256_unpacklo_epi16(tmp, tmp), 16); } #endif // 2: to uint_v {{{3 #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_2(double_v, uint_v) { return AVX::concat(AVX::convert(x0.data()), AVX::convert(x1.data())); } #endif // 1: to short_v {{{3 #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_1(double_v, short_v) { return AVX::zeroExtend(_mm_packs_epi32(_mm256_cvttpd_epi32(x.data()), _mm_setzero_si128())); } Vc_SIMD_CAST_AVX_1( float_v, short_v) { const auto tmp = _mm256_cvttps_epi32(x.data()); return AVX::zeroExtend(_mm_packs_epi32(AVX::lo128(tmp), AVX::hi128(tmp))); } Vc_SIMD_CAST_AVX_1( int_v, short_v) { return AVX::zeroExtend(AVX::convert< int, short>(x.data())); } Vc_SIMD_CAST_AVX_1( uint_v, short_v) { return AVX::zeroExtend(AVX::convert(x.data())); } Vc_SIMD_CAST_AVX_1(ushort_v, short_v) { return x.data(); } #endif // 2: to short_v {{{3 #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_2(double_v, short_v) { const auto tmp0 = _mm256_cvttpd_epi32(x0.data()); const auto tmp1 = _mm256_cvttpd_epi32(x1.data()); return AVX::zeroExtend(_mm_packs_epi32(tmp0, tmp1)); } Vc_SIMD_CAST_AVX_2( float_v, short_v) { using AVX2::short_v; using AVX2::int_v; return simd_cast(simd_cast(x0), simd_cast(x1)); } Vc_SIMD_CAST_AVX_2( int_v, short_v) { auto tmp0 = _mm256_unpacklo_epi16(x0.data(), x1.data()); auto tmp1 = _mm256_unpackhi_epi16(x0.data(), x1.data()); auto tmp2 = _mm256_unpacklo_epi16(tmp0, tmp1); auto tmp3 = _mm256_unpackhi_epi16(tmp0, tmp1); return Mem::permute4x64(_mm256_unpacklo_epi16(tmp2, tmp3)); } Vc_SIMD_CAST_AVX_2( uint_v, short_v) { auto tmp0 = _mm256_unpacklo_epi16(x0.data(), x1.data()); auto tmp1 = _mm256_unpackhi_epi16(x0.data(), x1.data()); auto tmp2 = _mm256_unpacklo_epi16(tmp0, tmp1); auto tmp3 = _mm256_unpackhi_epi16(tmp0, tmp1); return Mem::permute4x64(_mm256_unpacklo_epi16(tmp2, tmp3)); } #endif // 3: to short_v {{{3 #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_3(double_v, short_v) { const auto tmp0 = _mm256_cvttpd_epi32(x0.data()); const auto tmp1 = _mm256_cvttpd_epi32(x1.data()); const auto tmp2 = _mm256_cvttpd_epi32(x2.data()); return AVX::concat(_mm_packs_epi32(tmp0, tmp1), _mm_packs_epi32(tmp2, _mm_setzero_si128())); } #endif // 4: to short_v {{{3 #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_4(double_v, short_v) { const auto tmp0 = _mm256_cvttpd_epi32(x0.data()); const auto tmp1 = _mm256_cvttpd_epi32(x1.data()); const auto tmp2 = _mm256_cvttpd_epi32(x2.data()); const auto tmp3 = _mm256_cvttpd_epi32(x3.data()); return AVX::concat(_mm_packs_epi32(tmp0, tmp1), _mm_packs_epi32(tmp2, tmp3)); } #endif // 1: to ushort_v {{{3 #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_1(double_v, ushort_v) { const auto tmp = _mm256_cvttpd_epi32(x.data()); return AVX::zeroExtend(_mm_packs_epi32(tmp, _mm_setzero_si128())); } Vc_SIMD_CAST_AVX_1( float_v, ushort_v) { const auto tmp = _mm256_cvttps_epi32(x.data()); return AVX::zeroExtend(_mm_packs_epi32(AVX::lo128(tmp), AVX::hi128(tmp))); } Vc_SIMD_CAST_AVX_1( int_v, ushort_v) { return AVX::zeroExtend(AVX::convert< int, ushort>(x.data())); } Vc_SIMD_CAST_AVX_1( uint_v, ushort_v) { return AVX::zeroExtend(AVX::convert(x.data())); } 
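// Note on the N-argument overloads in this section: when the destination has more
// lanes than a single source vector (e.g. AVX2::double_v has 4 lanes, AVX2::short_v
// has 16), several source vectors are consumed at once and concatenated after the
// element conversion. The sketch below is illustrative only (not compiled) and
// assumes an AVX2 build where double_v::Size == 4 and short_v::Size == 16;
// example_narrowing_cast is a hypothetical user function, not part of Vc.
#if 0
#include <Vc/Vc>
void example_narrowing_cast(const Vc::double_v &a, const Vc::double_v &b,
                            const Vc::double_v &c, const Vc::double_v &d)
{
    // Four 4-lane double vectors supply the 16 values of one short_v; this call
    // resolves to the 4-argument double_v -> short_v overload defined above.
    const Vc::short_v packed = Vc::simd_cast<Vc::short_v>(a, b, c, d);
    (void)packed;
}
#endif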
Vc_SIMD_CAST_AVX_1( short_v, ushort_v) { return x.data(); } #endif // 2: to ushort_v {{{3 #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_2(double_v, ushort_v) { const auto tmp0 = _mm256_cvttpd_epi32(x0.data()); const auto tmp1 = _mm256_cvttpd_epi32(x1.data()); return AVX::zeroExtend(_mm_packs_epi32(tmp0, tmp1)); } Vc_SIMD_CAST_AVX_2( float_v, ushort_v) { using AVX2::ushort_v; using AVX2::int_v; return simd_cast(simd_cast(x0), simd_cast(x1)); } Vc_SIMD_CAST_AVX_2( int_v, ushort_v) { auto tmp0 = _mm256_unpacklo_epi16(x0.data(), x1.data()); auto tmp1 = _mm256_unpackhi_epi16(x0.data(), x1.data()); auto tmp2 = _mm256_unpacklo_epi16(tmp0, tmp1); auto tmp3 = _mm256_unpackhi_epi16(tmp0, tmp1); return Mem::permute4x64(_mm256_unpacklo_epi16(tmp2, tmp3)); } Vc_SIMD_CAST_AVX_2( uint_v, ushort_v) { auto tmp0 = _mm256_unpacklo_epi16(x0.data(), x1.data()); auto tmp1 = _mm256_unpackhi_epi16(x0.data(), x1.data()); auto tmp2 = _mm256_unpacklo_epi16(tmp0, tmp1); auto tmp3 = _mm256_unpackhi_epi16(tmp0, tmp1); return Mem::permute4x64(_mm256_unpacklo_epi16(tmp2, tmp3)); } #endif // 3: to ushort_v {{{3 #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_3(double_v, ushort_v) { const auto tmp0 = _mm256_cvttpd_epi32(x0.data()); const auto tmp1 = _mm256_cvttpd_epi32(x1.data()); const auto tmp2 = _mm256_cvttpd_epi32(x2.data()); return AVX::concat(_mm_packs_epi32(tmp0, tmp1), _mm_packs_epi32(tmp2, _mm_setzero_si128())); } #endif // 4: to ushort_v {{{3 #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_4(double_v, ushort_v) { const auto tmp0 = _mm256_cvttpd_epi32(x0.data()); const auto tmp1 = _mm256_cvttpd_epi32(x1.data()); const auto tmp2 = _mm256_cvttpd_epi32(x2.data()); const auto tmp3 = _mm256_cvttpd_epi32(x3.data()); return AVX::concat(_mm_packs_epi32(tmp0, tmp1), _mm_packs_epi32(tmp2, tmp3)); } #endif // 1 SSE::Vector to 1 AVX2::Vector {{{2 Vc_SIMD_CAST_1(SSE::double_v, AVX2::double_v) { return AVX::zeroExtend(x.data()); } Vc_SIMD_CAST_1(SSE:: float_v, AVX2::double_v) { return _mm256_cvtps_pd(x.data()); } Vc_SIMD_CAST_1(SSE:: int_v, AVX2::double_v) { return _mm256_cvtepi32_pd(x.data()); } Vc_SIMD_CAST_1(SSE:: uint_v, AVX2::double_v) { using namespace AvxIntrinsics; return _mm256_add_pd(_mm256_cvtepi32_pd(_mm_sub_epi32(x.data(), _mm_setmin_epi32())), set1_pd(1u << 31)); } Vc_SIMD_CAST_1(SSE:: short_v, AVX2::double_v) { return simd_cast(simd_cast(x)); } Vc_SIMD_CAST_1(SSE::ushort_v, AVX2::double_v) { return simd_cast(simd_cast(x)); } Vc_SIMD_CAST_1(SSE::double_v, AVX2:: float_v) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: float_v) { return AVX::zeroExtend(x.data()); } Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: float_v) { return AVX::zeroExtend(_mm_cvtepi32_ps(x.data())); } Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: float_v) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: float_v) { return AVX::convert< short, float>(x.data()); } Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: float_v) { return AVX::convert(x.data()); } #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_1(SSE::double_v, AVX2:: int_v) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE::double_v, AVX2:: uint_v) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE::double_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE::double_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: int_v) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: uint_v) { return AVX::zeroExtend(simd_cast(x).data()); } 
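// Note on the 1-argument SSE -> AVX2 overloads in this section: when the source has
// fewer lanes than the destination (e.g. SSE::int_v with 4 lanes into AVX2::int_v
// with 8), a single 128-bit input only fills the low half of the 256-bit register,
// so these implementations zero-extend and the upper lanes of the result are zero.
// Passing two SSE vectors (see the 2-argument overloads below) fills the destination
// completely. Illustrative sketch only (not compiled); example_widening_cast is a
// hypothetical user function and assumes an AVX2 build.
#if 0
#include <Vc/Vc>
void example_widening_cast(const Vc::SSE::int_v &lo, const Vc::SSE::int_v &hi)
{
    const Vc::AVX2::int_v half = Vc::simd_cast<Vc::AVX2::int_v>(lo);     // lanes 4..7 are zero
    const Vc::AVX2::int_v full = Vc::simd_cast<Vc::AVX2::int_v>(lo, hi); // all 8 lanes filled
    (void)half;
    (void)full;
}
#endif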
Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: float_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: int_v) { return AVX::zeroExtend(x.data()); } Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: int_v) { return AVX::zeroExtend(x.data()); } Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: int_v) { return AVX::convert< short, int>(x.data()); } Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: int_v) { return AVX::convert(x.data()); } Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: uint_v) { return AVX::zeroExtend(x.data()); } Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: uint_v) { return AVX::zeroExtend(x.data()); } Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: uint_v) { return AVX::convert< short, uint>(x.data()); } Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: uint_v) { return AVX::convert(x.data()); } Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: short_v) { return AVX::zeroExtend(x.data()); } Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: short_v) { return AVX::zeroExtend(x.data()); } Vc_SIMD_CAST_1(SSE:: int_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: uint_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: short_v, AVX2::ushort_v) { return AVX::zeroExtend(x.data()); } Vc_SIMD_CAST_1(SSE::ushort_v, AVX2::ushort_v) { return AVX::zeroExtend(x.data()); } #endif // 2 SSE::Vector to 1 AVX2::Vector {{{2 Vc_SIMD_CAST_2(SSE::double_v, AVX2::double_v) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE::double_v, AVX2:: float_v) { return AVX::zeroExtend(simd_cast(x0, x1).data()); } Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: float_v) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: float_v) { return AVX::convert< int, float>(AVX::concat(x0.data(), x1.data())); } Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: float_v) { return AVX::convert(AVX::concat(x0.data(), x1.data())); } #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_2(SSE::double_v, AVX2:: int_v) { return AVX::zeroExtend(simd_cast(x0, x1).data()); } Vc_SIMD_CAST_2(SSE::double_v, AVX2:: uint_v) { return AVX::zeroExtend(simd_cast(x0, x1).data()); } Vc_SIMD_CAST_2(SSE::double_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast(x0, x1).data()); } Vc_SIMD_CAST_2(SSE::double_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast(x0, x1).data()); } Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: int_v) { return simd_cast(simd_cast(x0, x1)); } Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: uint_v) { return simd_cast(simd_cast(x0, x1)); } Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast(x0, x1).data()); } Vc_SIMD_CAST_2(SSE:: float_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast(x0, x1).data()); } Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: int_v) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: int_v) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: uint_v) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: uint_v) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast(x0, x1).data()); } Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast(x0, x1).data()); } Vc_SIMD_CAST_2(SSE:: short_v, AVX2:: short_v) { return AVX::concat(x0.data(), x1.data()); } 
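// Note on the same-width 2-argument overloads here: two full SSE vectors of the same
// element type are simply concatenated into one AVX2 register (AVX::concat). The
// offset casts implemented further down in this file perform the inverse, extracting
// the 128-bit halves again. Illustrative sketch only (not compiled);
// example_round_trip is a hypothetical user function and assumes an AVX2 build.
#if 0
#include <Vc/Vc>
void example_round_trip(const Vc::SSE::short_v &a, const Vc::SSE::short_v &b)
{
    const Vc::AVX2::short_v joined = Vc::simd_cast<Vc::AVX2::short_v>(a, b);
    const Vc::SSE::short_v lo = Vc::simd_cast<Vc::SSE::short_v, 0>(joined); // low half  == a
    const Vc::SSE::short_v hi = Vc::simd_cast<Vc::SSE::short_v, 1>(joined); // high half == b
    (void)lo;
    (void)hi;
}
#endif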
Vc_SIMD_CAST_2(SSE::ushort_v, AVX2:: short_v) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE:: int_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast(x0, x1).data()); } Vc_SIMD_CAST_2(SSE:: uint_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast(x0, x1).data()); } Vc_SIMD_CAST_2(SSE:: short_v, AVX2::ushort_v) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE::ushort_v, AVX2::ushort_v) { return AVX::concat(x0.data(), x1.data()); } #endif // 3 SSE::Vector to 1 AVX2::Vector {{{2 Vc_SIMD_CAST_3(SSE::double_v, AVX2:: float_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2)); } #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_3(SSE::double_v, AVX2:: int_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2)); } Vc_SIMD_CAST_3(SSE::double_v, AVX2:: uint_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2)); } Vc_SIMD_CAST_3(SSE::double_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast(x0, x1, x2).data()); } Vc_SIMD_CAST_3(SSE::double_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast(x0, x1, x2).data()); } Vc_SIMD_CAST_3(SSE:: float_v, AVX2:: short_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2)); } Vc_SIMD_CAST_3(SSE:: float_v, AVX2::ushort_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2)); } Vc_SIMD_CAST_3(SSE:: int_v, AVX2:: short_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2)); } Vc_SIMD_CAST_3(SSE:: uint_v, AVX2:: short_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2)); } Vc_SIMD_CAST_3(SSE:: int_v, AVX2::ushort_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2)); } Vc_SIMD_CAST_3(SSE:: uint_v, AVX2::ushort_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2)); } #endif // 4 SSE::Vector to 1 AVX2::Vector {{{2 Vc_SIMD_CAST_4(SSE::double_v, AVX2:: float_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3)); } #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_4(SSE::double_v, AVX2:: int_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3)); } Vc_SIMD_CAST_4(SSE::double_v, AVX2:: uint_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3)); } Vc_SIMD_CAST_4(SSE::double_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast(x0, x1, x2, x3).data()); } Vc_SIMD_CAST_4(SSE::double_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast(x0, x1, x2, x3).data()); } Vc_SIMD_CAST_4(SSE:: float_v, AVX2:: short_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3)); } Vc_SIMD_CAST_4(SSE:: float_v, AVX2::ushort_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3)); } Vc_SIMD_CAST_4(SSE:: int_v, AVX2:: short_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3)); } Vc_SIMD_CAST_4(SSE:: uint_v, AVX2:: short_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3)); } Vc_SIMD_CAST_4(SSE:: int_v, AVX2::ushort_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3)); } Vc_SIMD_CAST_4(SSE:: uint_v, AVX2::ushort_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3)); } #endif // 5 SSE::Vector to 1 AVX2::Vector {{{2 #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_5(SSE::double_v, AVX2:: short_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3), simd_cast(x4)); } Vc_SIMD_CAST_5(SSE::double_v, AVX2::ushort_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3), simd_cast(x4)); } #endif // 6 SSE::Vector to 1 AVX2::Vector {{{2 #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_6(SSE::double_v, AVX2:: short_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3), simd_cast(x4, x5)); } Vc_SIMD_CAST_6(SSE::double_v, AVX2::ushort_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3), simd_cast(x4, x5)); } #endif // 7 SSE::Vector to 1 AVX2::Vector 
{{{2 #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_7(SSE::double_v, AVX2:: short_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3), simd_cast(x4, x5), simd_cast(x6)); } Vc_SIMD_CAST_7(SSE::double_v, AVX2::ushort_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3), simd_cast(x4, x5), simd_cast(x6)); } #endif // 8 SSE::Vector to 1 AVX2::Vector {{{2 #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_8(SSE::double_v, AVX2:: short_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3), simd_cast(x4, x5), simd_cast(x6, x7)); } Vc_SIMD_CAST_8(SSE::double_v, AVX2::ushort_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3), simd_cast(x4, x5), simd_cast(x6, x7)); } #endif // 1 AVX2::Vector to 1 SSE::Vector {{{2 Vc_SIMD_CAST_1(AVX2::double_v, SSE::double_v) { return AVX::lo128(x.data()); } Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: float_v) { return AVX::lo128(x.data()); } #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: int_v) { return AVX::lo128(x.data()); } Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: uint_v) { return AVX::lo128(x.data()); } Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: short_v) { return AVX::lo128(x.data()); } Vc_SIMD_CAST_1(AVX2::ushort_v, SSE::ushort_v) { return AVX::lo128(x.data()); } #endif Vc_SIMD_CAST_1(AVX2::double_v, SSE:: float_v) { return simd_cast(simd_cast(x)); } Vc_SIMD_CAST_1(AVX2::double_v, SSE:: int_v) { return AVX::convert(x.data()); } Vc_SIMD_CAST_1(AVX2::double_v, SSE:: uint_v) { return AVX::convert(x.data()); } Vc_SIMD_CAST_1(AVX2::double_v, SSE:: short_v) { return AVX::convert(x.data()); } Vc_SIMD_CAST_1(AVX2::double_v, SSE::ushort_v) { return AVX::convert(x.data()); } Vc_SIMD_CAST_1(AVX2:: float_v, SSE::double_v) { return simd_cast(simd_cast(x)); } Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: int_v) { return simd_cast(simd_cast(x)); } Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: uint_v) { return simd_cast(simd_cast(x)); } Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: short_v) { return AVX::convert(x.data()); } Vc_SIMD_CAST_1(AVX2:: float_v, SSE::ushort_v) { return AVX::convert(x.data()); } #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_1(AVX2:: int_v, SSE::double_v) { return SSE::convert(AVX::lo128(x.data())); } Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: float_v) { return SSE::convert(AVX::lo128(x.data())); } Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: uint_v) { return AVX::lo128(x.data()); } Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: short_v) { return AVX::convert(x.data()); } Vc_SIMD_CAST_1(AVX2:: int_v, SSE::ushort_v) { return AVX::convert(x.data()); } Vc_SIMD_CAST_1(AVX2:: uint_v, SSE::double_v) { return SSE::convert(AVX::lo128(x.data())); } Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: float_v) { return SSE::convert(AVX::lo128(x.data())); } Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: int_v) { return AVX::lo128(x.data()); } Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: short_v) { return AVX::convert(x.data()); } Vc_SIMD_CAST_1(AVX2:: uint_v, SSE::ushort_v) { return AVX::convert(x.data()); } Vc_SIMD_CAST_1(AVX2:: short_v, SSE::double_v) { return simd_cast(simd_cast(x)); } Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: float_v) { return simd_cast(simd_cast(x)); } Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: int_v) { return simd_cast(simd_cast(x)); } Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: uint_v) { return simd_cast(simd_cast(x)); } Vc_SIMD_CAST_1(AVX2:: short_v, SSE::ushort_v) { return simd_cast(simd_cast(x)); } Vc_SIMD_CAST_1(AVX2::ushort_v, SSE::double_v) { return simd_cast(simd_cast(x)); } Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: float_v) { return simd_cast(simd_cast(x)); } Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: int_v) { return simd_cast(simd_cast(x)); } Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: 
uint_v) { return simd_cast(simd_cast(x)); } Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: short_v) { return simd_cast(simd_cast(x)); } #endif // 2 AVX2::Vector to 1 SSE::Vector {{{2 Vc_SIMD_CAST_2(AVX2::double_v, SSE:: short_v) { const auto tmp0 = _mm256_cvttpd_epi32(x0.data()); const auto tmp1 = _mm256_cvttpd_epi32(x1.data()); return _mm_packs_epi32(tmp0, tmp1); } Vc_SIMD_CAST_2(AVX2::double_v, SSE::ushort_v) { const auto tmp0 = _mm256_cvttpd_epi32(x0.data()); const auto tmp1 = _mm256_cvttpd_epi32(x1.data()); return _mm_packs_epi32(tmp0, tmp1); } // 1 Scalar::Vector to 1 AVX2::Vector {{{2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x, enable_if::value>) { return AVX::zeroExtend(_mm_setr_pd(x.data(), 0.)); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x, enable_if::value>) { return AVX::zeroExtend(_mm_setr_ps(x.data(), 0.f, 0.f, 0.f)); } #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x, enable_if::value>) { return _mm256_setr_epi32(x.data(), 0, 0, 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x, enable_if::value>) { return _mm256_setr_epi32(uint(x.data()), 0, 0, 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x, enable_if::value>) { return _mm256_setr_epi16(x.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x, enable_if::value>) { return _mm256_setr_epi16(x.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } #endif // 2 Scalar::Vector to 1 AVX2::Vector {{{2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, enable_if::value>) { return AVX::zeroExtend(_mm_setr_pd(x0.data(), x1.data())); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, enable_if::value>) { return AVX::zeroExtend(_mm_setr_ps(x0.data(), x1.data(), 0.f, 0.f)); } #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, enable_if::value>) { return _mm256_setr_epi32(x0.data(), x1.data(), 0, 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, enable_if::value>) { return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), 0, 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } #endif // 3 Scalar::Vector to 1 AVX2::Vector {{{2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, enable_if::value>) { return _mm256_setr_pd(x0.data(), x1.data(), x2.data(), 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, enable_if::value>) { return AVX::zeroExtend(_mm_setr_ps(x0.data(), x1.data(), x2.data(), 0)); } #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, enable_if::value>) { return _mm256_setr_epi32(x0.data(), x1.data(), x2.data(), 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, enable_if::value>) { return _mm256_setr_epi32(uint(x0.data()), 
uint(x1.data()), uint(x2.data()), 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } #endif // 4 Scalar::Vector to 1 AVX2::Vector {{{2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, enable_if::value>) { return _mm256_setr_pd(x0.data(), x1.data(), x2.data(), x3.data()); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, enable_if::value>) { return AVX::zeroExtend(_mm_setr_ps(x0.data(), x1.data(), x2.data(), x3.data())); } #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, enable_if::value>) { return _mm256_setr_epi32(x0.data(), x1.data(), x2.data(), x3.data(), 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, enable_if::value>) { return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()), uint(x3.data()), 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } #endif // 5 Scalar::Vector to 1 AVX2::Vector {{{2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, enable_if::value>) { return _mm256_setr_ps(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), 0, 0, 0); } #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, enable_if::value>) { return _mm256_setr_epi32(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, enable_if::value>) { return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()), uint(x3.data()), uint(x4.data()), 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } #endif // 6 Scalar::Vector to 1 AVX2::Vector {{{2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, 
Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, enable_if::value>) { return _mm256_setr_ps(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), 0, 0); } #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, enable_if::value>) { return _mm256_setr_epi32(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, enable_if::value>) { return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()), uint(x3.data()), uint(x4.data()), uint(x5.data()), 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } #endif // 7 Scalar::Vector to 1 AVX2::Vector {{{2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, enable_if::value>) { return _mm256_setr_ps(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), 0); } #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, enable_if::value>) { return _mm256_setr_epi32(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, enable_if::value>) { return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()), uint(x3.data()), uint(x4.data()), uint(x5.data()), uint(x6.data()), 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0); } #endif // 8 Scalar::Vector to 1 AVX2::Vector {{{2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, enable_if::value>) { return _mm256_setr_ps(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data()); } #ifdef 
Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, enable_if::value>) { return _mm256_setr_epi32(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data()); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, enable_if::value>) { return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()), uint(x3.data()), uint(x4.data()), uint(x5.data()), uint(x6.data()), uint(x7.data())); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data(), 0, 0, 0, 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data(), 0, 0, 0, 0, 0, 0, 0, 0); } #endif // 9 Scalar::Vector to 1 AVX2::Vector {{{2 #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data(), x8.data(), 0, 0, 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data(), x8.data(), 0, 0, 0, 0, 0, 0, 0); } #endif // 10 Scalar::Vector to 1 AVX2::Vector {{{2 #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), 0, 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), 0, 0, 0, 0, 0, 0); } #endif // 11 Scalar::Vector to 1 AVX2::Vector {{{2 #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, enable_if::value>) { 
return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), x10.data(), 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), x10.data(), 0, 0, 0, 0, 0); } #endif // 12 Scalar::Vector to 1 AVX2::Vector {{{2 #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), x10.data(), x11.data(), 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), x10.data(), x11.data(), 0, 0, 0, 0); } #endif // 13 Scalar::Vector to 1 AVX2::Vector {{{2 #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, Scalar::Vector x12, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), x10.data(), x11.data(), x12.data(), 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, Scalar::Vector x12, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), x10.data(), x11.data(), x12.data(), 0, 0, 0); } #endif // 14 Scalar::Vector to 1 AVX2::Vector {{{2 #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, Scalar::Vector x12, Scalar::Vector x13, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), x10.data(), x11.data(), x12.data(), x13.data(), 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, 
Scalar::Vector x10, Scalar::Vector x11, Scalar::Vector x12, Scalar::Vector x13, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), x10.data(), x11.data(), x12.data(), x13.data(), 0, 0); } #endif // 15 Scalar::Vector to 1 AVX2::Vector {{{2 #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, Scalar::Vector x12, Scalar::Vector x13, Scalar::Vector x14, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), x10.data(), x11.data(), x12.data(), x13.data(), x14.data(), 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, Scalar::Vector x12, Scalar::Vector x13, Scalar::Vector x14, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), x10.data(), x11.data(), x12.data(), x13.data(), x14.data(), 0); } #endif // 16 Scalar::Vector to 1 AVX2::Vector {{{2 #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, Scalar::Vector x12, Scalar::Vector x13, Scalar::Vector x14, Scalar::Vector x15, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), x10.data(), x11.data(), x12.data(), x13.data(), x14.data(), x15.data()); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, Scalar::Vector x12, Scalar::Vector x13, Scalar::Vector x14, Scalar::Vector x15, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), x10.data(), x11.data(), x12.data(), x13.data(), x14.data(), x15.data()); } #endif // 1 AVX2::Vector to 1 Scalar::Vector {{{2 template Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::Vector x, enable_if::value>) { return static_cast(x[0]); } // Mask casts without offset {{{1 // 1 AVX2::Mask to 1 AVX2::Mask {{{2 template Vc_INTRINSIC Vc_CONST Return simd_cast(const AVX2::Mask &k, enable_if::value>) { return {Detail::mask_cast::Size, Return::Size, typename Return::VectorTypeF>(k.dataI())}; } // 2 AVX2::Mask to 1 AVX2::Mask {{{2 Vc_SIMD_CAST_AVX_2(double_m, float_m) { return AVX::concat(_mm_packs_epi32(AVX::lo128(x0.dataI()), AVX::hi128(x0.dataI())), _mm_packs_epi32(AVX::lo128(x1.dataI()), AVX::hi128(x1.dataI()))); } #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_2(double_m, int_m) { return Mem::permute4x64(_mm256_packs_epi32(x0.dataI(), x1.dataI())); } Vc_SIMD_CAST_AVX_2(double_m, uint_m) { return 
Mem::permute4x64(_mm256_packs_epi32(x0.dataI(), x1.dataI())); } Vc_SIMD_CAST_AVX_2(double_m, short_m) { return AVX::zeroExtend(_mm_packs_epi16(_mm_packs_epi32(AVX::lo128(x0.dataI()), AVX::hi128(x0.dataI())), _mm_packs_epi32(AVX::lo128(x1.dataI()), AVX::hi128(x1.dataI())))); } Vc_SIMD_CAST_AVX_2(double_m, ushort_m) { return AVX::zeroExtend(_mm_packs_epi16(_mm_packs_epi32(AVX::lo128(x0.dataI()), AVX::hi128(x0.dataI())), _mm_packs_epi32(AVX::lo128(x1.dataI()), AVX::hi128(x1.dataI())))); } Vc_SIMD_CAST_AVX_2( float_m, short_m) { return Mem::permute4x64(_mm256_packs_epi16(x0.dataI(), x1.dataI())); } Vc_SIMD_CAST_AVX_2( float_m, ushort_m) { return Mem::permute4x64(_mm256_packs_epi16(x0.dataI(), x1.dataI())); } Vc_SIMD_CAST_AVX_2( int_m, short_m) { return Mem::permute4x64(_mm256_packs_epi16(x0.dataI(), x1.dataI())); } Vc_SIMD_CAST_AVX_2( int_m, ushort_m) { return Mem::permute4x64(_mm256_packs_epi16(x0.dataI(), x1.dataI())); } Vc_SIMD_CAST_AVX_2( uint_m, short_m) { return Mem::permute4x64(_mm256_packs_epi16(x0.dataI(), x1.dataI())); } Vc_SIMD_CAST_AVX_2( uint_m, ushort_m) { return Mem::permute4x64(_mm256_packs_epi16(x0.dataI(), x1.dataI())); } #endif // 4 AVX2::Mask to 1 AVX2::Mask {{{2 #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_4(double_m, short_m) { using namespace AVX; const auto tmp = _mm256_packs_epi32( _mm256_packs_epi32(x0.dataI(), x1.dataI()) // a0 a1 b0 b1 a2 a3 b2 b3 , _mm256_packs_epi32(x2.dataI(), x3.dataI()) // c0 c1 d0 d1 c2 c3 d2 d3 ); // a0 a1 b0 b1 c0 c1 d0 d1 a2 a3 b2 b3 c2 c3 d2 d3 return concat(_mm_unpacklo_epi32(lo128(tmp), hi128(tmp)), // a0 a1 a2 a3 b0 b1 b2 b3 _mm_unpackhi_epi32(lo128(tmp), hi128(tmp))); // c0 c1 c2 c3 d0 d1 d2 d3 } Vc_SIMD_CAST_AVX_4(double_m, ushort_m) { return simd_cast(x0, x1, x2, x3).data(); } #endif // 1 SSE::Mask to 1 AVX2::Mask {{{2 Vc_SIMD_CAST_1(SSE::double_m, AVX2::double_m) { return AVX::zeroExtend(x.data()); } Vc_SIMD_CAST_1(SSE::double_m, AVX2:: float_m) { return AVX::zeroExtend(simd_cast(x).data()); } #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_1(SSE::double_m, AVX2:: int_m) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE::double_m, AVX2:: uint_m) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE::double_m, AVX2:: short_m) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE::double_m, AVX2::ushort_m) { return AVX::zeroExtend(simd_cast(x).data()); } #endif Vc_SIMD_CAST_1(SSE:: float_m, AVX2::double_m) { return AVX::concat(_mm_unpacklo_ps(x.data(), x.data()), _mm_unpackhi_ps(x.data(), x.data())); } Vc_SIMD_CAST_1(SSE:: int_m, AVX2::double_m) { return AVX::concat(_mm_unpacklo_ps(x.data(), x.data()), _mm_unpackhi_ps(x.data(), x.data())); } Vc_SIMD_CAST_1(SSE:: uint_m, AVX2::double_m) { return AVX::concat(_mm_unpacklo_ps(x.data(), x.data()), _mm_unpackhi_ps(x.data(), x.data())); } Vc_SIMD_CAST_1(SSE:: short_m, AVX2::double_m) { auto tmp = _mm_unpacklo_epi16(x.dataI(), x.dataI()); return AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp)); } Vc_SIMD_CAST_1(SSE::ushort_m, AVX2::double_m) { auto tmp = _mm_unpacklo_epi16(x.dataI(), x.dataI()); return AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp)); } Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: float_m) { return AVX::zeroExtend(x.data()); } Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: float_m) { return AVX::zeroExtend(x.data()); } Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: float_m) { return AVX::zeroExtend(x.data()); } Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: float_m) { return AVX::concat(_mm_unpacklo_epi16(x.dataI(), x.dataI()), 
_mm_unpackhi_epi16(x.dataI(), x.dataI())); } Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: float_m) { return AVX::concat(_mm_unpacklo_epi16(x.dataI(), x.dataI()), _mm_unpackhi_epi16(x.dataI(), x.dataI())); } #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: int_m) { return AVX::zeroExtend(x.data()); } Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: uint_m) { return AVX::zeroExtend(x.data()); } Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: int_m) { return AVX::zeroExtend(x.data()); } Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: uint_m) { return AVX::zeroExtend(x.data()); } Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: int_m) { return AVX::zeroExtend(x.data()); } Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: uint_m) { return AVX::zeroExtend(x.data()); } Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: short_m) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: short_m) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: short_m) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: short_m) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: short_m) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: float_m, AVX2::ushort_m) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: int_m, AVX2::ushort_m) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: uint_m, AVX2::ushort_m) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: short_m, AVX2::ushort_m) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE::ushort_m, AVX2::ushort_m) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: int_m) { const auto v = Mem::permute4x64(AVX::avx_cast<__m256i>(x.data())); return _mm256_unpacklo_epi16(v, v); } Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: uint_m) { const auto v = Mem::permute4x64(AVX::avx_cast<__m256i>(x.data())); return _mm256_unpacklo_epi16(v, v); } Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: int_m) { const auto v = Mem::permute4x64(AVX::avx_cast<__m256i>(x.data())); return _mm256_unpacklo_epi16(v, v); } Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: uint_m) { const auto v = Mem::permute4x64(AVX::avx_cast<__m256i>(x.data())); return _mm256_unpacklo_epi16(v, v); } #endif // 2 SSE::Mask to 1 AVX2::Mask {{{2 Vc_SIMD_CAST_2(SSE::double_m, AVX2::double_m) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE::double_m, AVX2:: float_m) { return AVX::zeroExtend(_mm_packs_epi32(x0.dataI(), x1.dataI())); } Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: float_m) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: float_m) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: float_m) { return AVX::concat(x0.data(), x1.data()); } #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_2(SSE::double_m, AVX2:: int_m) { return AVX::zeroExtend(_mm_packs_epi32(x0.dataI(), x1.dataI())); } Vc_SIMD_CAST_2(SSE::double_m, AVX2:: uint_m) { return AVX::zeroExtend(_mm_packs_epi32(x0.dataI(), x1.dataI())); } Vc_SIMD_CAST_2(SSE::double_m, AVX2:: short_m) { return AVX::zeroExtend(_mm_packs_epi16(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_setzero_si128())); } Vc_SIMD_CAST_2(SSE::double_m, AVX2::ushort_m) { return AVX::zeroExtend(_mm_packs_epi16(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_setzero_si128())); } Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: int_m) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: uint_m) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: 
short_m) { return AVX::zeroExtend(_mm_packs_epi16(x0.dataI(), x1.dataI())); } Vc_SIMD_CAST_2(SSE:: float_m, AVX2::ushort_m) { return AVX::zeroExtend(_mm_packs_epi16(x0.dataI(), x1.dataI())); } Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: int_m) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: uint_m) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: short_m) { return AVX::zeroExtend(_mm_packs_epi16(x0.dataI(), x1.dataI())); } Vc_SIMD_CAST_2(SSE:: int_m, AVX2::ushort_m) { return AVX::zeroExtend(_mm_packs_epi16(x0.dataI(), x1.dataI())); } Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: int_m) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: uint_m) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: short_m) { return AVX::zeroExtend(_mm_packs_epi16(x0.dataI(), x1.dataI())); } Vc_SIMD_CAST_2(SSE:: uint_m, AVX2::ushort_m) { return AVX::zeroExtend(_mm_packs_epi16(x0.dataI(), x1.dataI())); } Vc_SIMD_CAST_2(SSE:: short_m, AVX2:: short_m) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE:: short_m, AVX2::ushort_m) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE::ushort_m, AVX2:: short_m) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE::ushort_m, AVX2::ushort_m) { return AVX::concat(x0.data(), x1.data()); } #endif // 4 SSE::Mask to 1 AVX2::Mask {{{2 Vc_SIMD_CAST_4(SSE::double_m, AVX2:: float_m) { return AVX::concat(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_packs_epi32(x2.dataI(), x3.dataI())); } #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_4(SSE::double_m, AVX2:: int_m) { return AVX::concat(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_packs_epi32(x2.dataI(), x3.dataI())); } Vc_SIMD_CAST_4(SSE::double_m, AVX2:: uint_m) { return AVX::concat(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_packs_epi32(x2.dataI(), x3.dataI())); } Vc_SIMD_CAST_4(SSE::double_m, AVX2:: short_m) { return AVX::zeroExtend(_mm_packs_epi16(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_packs_epi32(x2.dataI(), x3.dataI()))); } Vc_SIMD_CAST_4(SSE::double_m, AVX2::ushort_m) { return AVX::zeroExtend(_mm_packs_epi16(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_packs_epi32(x2.dataI(), x3.dataI()))); } Vc_SIMD_CAST_4(SSE:: float_m, AVX2:: short_m) { return AVX::concat(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI())); } Vc_SIMD_CAST_4(SSE:: float_m, AVX2::ushort_m) { return AVX::concat(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI())); } Vc_SIMD_CAST_4(SSE:: int_m, AVX2:: short_m) { return AVX::concat(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI())); } Vc_SIMD_CAST_4(SSE:: int_m, AVX2::ushort_m) { return AVX::concat(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI())); } Vc_SIMD_CAST_4(SSE:: uint_m, AVX2:: short_m) { return AVX::concat(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI())); } Vc_SIMD_CAST_4(SSE:: uint_m, AVX2::ushort_m) { return AVX::concat(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI())); } #endif // 1 Scalar::Mask to 1 AVX2::Mask {{{2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Mask k, enable_if::value>) { Return r{false}; r[0] = k.data(); return r; } // 2 Scalar::Mask to 1 AVX2::Mask {{{2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Mask k0, Scalar::Mask k1, enable_if::value>) { Return r{false}; r[0] = k0.data(); r[1] = k1.data(); return r; } // 4 Scalar::Mask to 1 
AVX2::Mask {{{2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Mask k0, Scalar::Mask k1, Scalar::Mask k2, Scalar::Mask k3, enable_if<(AVX2::is_mask::value && Return::Size >= 4)>) { Return r{false}; r[0] = k0.data(); r[1] = k1.data(); r[2] = k2.data(); r[3] = k3.data(); return r; } // 8 Scalar::Mask to 1 AVX2::Mask {{{2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Mask k0, Scalar::Mask k1, Scalar::Mask k2, Scalar::Mask k3, Scalar::Mask k4, Scalar::Mask k5, Scalar::Mask k6, Scalar::Mask k7, enable_if<(AVX2::is_mask::value && Return::Size >= 8)>) { Return r{false}; r[0] = k0.data(); r[1] = k1.data(); r[2] = k2.data(); r[3] = k3.data(); r[4] = k4.data(); r[5] = k5.data(); r[6] = k6.data(); r[7] = k7.data(); return r; } // 16 Scalar::Mask to 1 AVX2::Mask {{{2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Mask k0, Scalar::Mask k1, Scalar::Mask k2, Scalar::Mask k3, Scalar::Mask k4, Scalar::Mask k5, Scalar::Mask k6, Scalar::Mask k7, Scalar::Mask k8, Scalar::Mask k9, Scalar::Mask k10, Scalar::Mask k11, Scalar::Mask k12, Scalar::Mask k13, Scalar::Mask k14, Scalar::Mask k15, enable_if<(AVX2::is_mask::value && Return::Size >= 16)>) { Return r{false}; r[0] = k0.data(); r[1] = k1.data(); r[2] = k2.data(); r[3] = k3.data(); r[4] = k4.data(); r[5] = k5.data(); r[6] = k6.data(); r[7] = k7.data(); r[8] = k8.data(); r[9] = k9.data(); r[10] = k10.data(); r[11] = k11.data(); r[12] = k12.data(); r[13] = k13.data(); r[14] = k14.data(); r[15] = k15.data(); return r; } // 1 AVX2::Mask to 1 SSE::Mask {{{2 Vc_SIMD_CAST_1(AVX2::double_m, SSE::double_m) { return AVX::lo128(x.data()); } Vc_SIMD_CAST_1(AVX2::double_m, SSE:: float_m) { return _mm_packs_epi32(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); } Vc_SIMD_CAST_1(AVX2::double_m, SSE:: int_m) { return _mm_packs_epi32(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); } Vc_SIMD_CAST_1(AVX2::double_m, SSE:: uint_m) { return _mm_packs_epi32(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); } Vc_SIMD_CAST_1(AVX2::double_m, SSE:: short_m) { return _mm_packs_epi16(_mm_packs_epi32(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())), _mm_setzero_si128()); } Vc_SIMD_CAST_1(AVX2::double_m, SSE::ushort_m) { return _mm_packs_epi16(_mm_packs_epi32(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())), _mm_setzero_si128()); } Vc_SIMD_CAST_1(AVX2:: float_m, SSE::double_m) { return _mm_unpacklo_ps(AVX::lo128(x.data()), AVX::lo128(x.data())); } Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: float_m) { return AVX::lo128(x.data()); } Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: int_m) { return AVX::lo128(x.data()); } Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: uint_m) { return AVX::lo128(x.data()); } Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: short_m) { return _mm_packs_epi16(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); } Vc_SIMD_CAST_1(AVX2:: float_m, SSE::ushort_m) { return _mm_packs_epi16(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); } #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_1(AVX2:: int_m, SSE::double_m) { return _mm_unpacklo_epi32(AVX::lo128(x.dataI()), AVX::lo128(x.dataI())); } Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: float_m) { return AVX::lo128(x.dataI()); } Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: int_m) { return AVX::lo128(x.dataI()); } Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: uint_m) { return AVX::lo128(x.dataI()); } Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: short_m) { return _mm_packs_epi16(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); } Vc_SIMD_CAST_1(AVX2:: int_m, SSE::ushort_m) { return _mm_packs_epi16(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); } Vc_SIMD_CAST_1(AVX2:: uint_m, SSE::double_m) { return 
_mm_unpacklo_epi32(AVX::lo128(x.dataI()), AVX::lo128(x.dataI())); } Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: float_m) { return AVX::lo128(x.dataI()); } Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: int_m) { return AVX::lo128(x.dataI()); } Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: uint_m) { return AVX::lo128(x.dataI()); } Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: short_m) { return _mm_packs_epi16(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); } Vc_SIMD_CAST_1(AVX2:: uint_m, SSE::ushort_m) { return _mm_packs_epi16(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); } Vc_SIMD_CAST_1(AVX2:: short_m, SSE::double_m) { return simd_cast(SSE::short_m(AVX::lo128(x.data()))); } Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: float_m) { return simd_cast(SSE::short_m(AVX::lo128(x.data()))); } Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: int_m) { return simd_cast(SSE::short_m(AVX::lo128(x.data()))); } Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: uint_m) { return simd_cast(SSE::short_m(AVX::lo128(x.data()))); } Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: short_m) { return simd_cast(SSE::short_m(AVX::lo128(x.data()))); } Vc_SIMD_CAST_1(AVX2:: short_m, SSE::ushort_m) { return simd_cast(SSE::short_m(AVX::lo128(x.data()))); } Vc_SIMD_CAST_1(AVX2::ushort_m, SSE::double_m) { return simd_cast(SSE::ushort_m(AVX::lo128(x.data()))); } Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: float_m) { return simd_cast(SSE::ushort_m(AVX::lo128(x.data()))); } Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: int_m) { return simd_cast(SSE::ushort_m(AVX::lo128(x.data()))); } Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: uint_m) { return simd_cast(SSE::ushort_m(AVX::lo128(x.data()))); } Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: short_m) { return simd_cast(SSE::ushort_m(AVX::lo128(x.data()))); } Vc_SIMD_CAST_1(AVX2::ushort_m, SSE::ushort_m) { return simd_cast(SSE::ushort_m(AVX::lo128(x.data()))); } #endif // 2 AVX2::Mask to 1 SSE::Mask {{{2 Vc_SIMD_CAST_2(AVX2::double_m, SSE:: short_m) { return _mm_packs_epi16(_mm_packs_epi32(AVX::lo128(x0.dataI()), AVX::hi128(x0.dataI())), _mm_packs_epi32(AVX::lo128(x1.dataI()), AVX::hi128(x1.dataI()))); } Vc_SIMD_CAST_2(AVX2::double_m, SSE::ushort_m) { return _mm_packs_epi16(_mm_packs_epi32(AVX::lo128(x0.dataI()), AVX::hi128(x0.dataI())), _mm_packs_epi32(AVX::lo128(x1.dataI()), AVX::hi128(x1.dataI()))); } // 1 AVX2::Mask to 1 Scalar::Mask {{{2 template Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::Mask x, enable_if::value>) { return static_cast(x[0]); } // offset == 0 | convert from AVX2::Mask/Vector {{{1 template Vc_INTRINSIC Vc_CONST enable_if< (offset == 0 && ((AVX2::is_vector::value && !Scalar::is_vector::value && Traits::is_simd_vector::value && !Traits::isSimdArray::value) || (AVX2::is_mask::value && !Scalar::is_mask::value && Traits::is_simd_mask::value && !Traits::isSimdMaskArray::value))), Return> simd_cast(const From &x) { return simd_cast(x); } // offset == 0 | convert from SSE::Mask/Vector to AVX2::Mask/Vector {{{1 template Vc_INTRINSIC Vc_CONST Return simd_cast(const From &x, enable_if::value && AVX2::is_vector::value) || (SSE::is_mask::value && AVX2::is_mask::value))>) { return simd_cast(x); } // Vector casts with offset {{{1 // AVX2 to AVX2 {{{2 template Vc_INTRINSIC Vc_CONST enable_if<(AVX2::is_vector::value && offset != 0), Return> simd_cast(AVX2::Vector x) { // TODO: there certainly is potential for leaving out the shift/permute // instruction at the cost of a lot more specializations using V = AVX2::Vector; constexpr int shift = sizeof(T) * offset * Return::Size; static_assert(shift > 0 && shift < sizeof(x), ""); if (shift < 16) { return simd_cast(V{AVX::avx_cast( 
_mm_srli_si128(AVX::avx_cast<__m128i>(AVX::lo128(x.data())), shift))}); } else if (shift == 16) { return simd_cast(V{Mem::permute128(x.data())}); } else { #ifdef Vc_MSVC #pragma warning(push) #pragma warning(disable : 4556) // value of intrinsic immediate argument '-8' is out of // range '0 - 255' #endif return simd_cast(V{AVX::avx_cast( _mm_srli_si128(AVX::avx_cast<__m128i>(AVX::hi128(x.data())), shift - 16))}); #ifdef Vc_MSVC #pragma warning(pop) #endif } } // AVX2 to SSE (Vector) {{{2 template Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_vector::value && sizeof(AVX2::Vector) == 32), Return> simd_cast(AVX2::Vector x) { using V = AVX2::Vector; constexpr int shift = sizeof(V) / V::Size * offset * Return::Size; static_assert(shift > 0, ""); static_assert(shift < sizeof(V), ""); using SseVector = SSE::Vector; if (shift == 16) { return simd_cast(SseVector{AVX::hi128(x.data())}); } using Intrin = typename SseVector::VectorType; return simd_cast(SseVector{AVX::avx_cast( _mm_alignr_epi8(AVX::avx_cast<__m128i>(AVX::hi128(x.data())), AVX::avx_cast<__m128i>(AVX::lo128(x.data())), shift))}); } template Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_vector::value && sizeof(AVX2::Vector) == 16), Return> simd_cast(AVX2::Vector x) { using V = AVX2::Vector; constexpr int shift = sizeof(V) / V::Size * offset * Return::Size; static_assert(shift > 0, ""); static_assert(shift < sizeof(V), ""); using SseVector = SSE::Vector; return simd_cast(SseVector{_mm_srli_si128(x.data(), shift)}); } // SSE to AVX2 {{{2 Vc_SIMD_CAST_OFFSET(SSE:: short_v, AVX2::double_v, 1) { return simd_cast(simd_cast(x)); } Vc_SIMD_CAST_OFFSET(SSE::ushort_v, AVX2::double_v, 1) { return simd_cast(simd_cast(x)); } // Mask casts with offset {{{1 // 1 AVX2::Mask to N AVX2::Mask {{{2 // float_v and (u)int_v have size 8, double_v has size 4, and (u)short_v have size 16. Consequently, // offset can 0, 1, 2, or 3. // - offset == 0 is already done. 
// - offset == 1 can be 16 -> 8, 16 -> 4, 8 -> 4, and 16 -> 4 // - offset == 2 && offset == 3 can only be 16 -> 4 template Vc_INTRINSIC Vc_CONST Return simd_cast(const AVX2::Mask &k, enable_if<(AVX2::is_mask::value && offset == 1 && AVX2::Mask::Size == Return::Size * 2)> = nullarg) { const auto tmp = AVX::hi128(k.dataI()); return AVX::concat(_mm_unpacklo_epi8(tmp, tmp), _mm_unpackhi_epi8(tmp, tmp)); } template Vc_INTRINSIC Vc_CONST Return simd_cast(const AVX2::Mask &k, enable_if<(AVX2::is_mask::value && offset == 1 && AVX2::Mask::Size == Return::Size * 4)> = nullarg) { auto tmp = AVX::lo128(k.dataI()); tmp = _mm_unpackhi_epi8(tmp, tmp); return AVX::concat(_mm_unpacklo_epi16(tmp, tmp), _mm_unpackhi_epi16(tmp, tmp)); } template Vc_INTRINSIC Vc_CONST Return simd_cast(const AVX2::Mask &k, enable_if<(AVX2::is_mask::value && offset == 2 && AVX2::Mask::Size == Return::Size * 4)> = nullarg) { auto tmp = AVX::hi128(k.dataI()); tmp = _mm_unpacklo_epi8(tmp, tmp); return AVX::concat(_mm_unpacklo_epi16(tmp, tmp), _mm_unpackhi_epi16(tmp, tmp)); } template Vc_INTRINSIC Vc_CONST Return simd_cast(const AVX2::Mask &k, enable_if<(AVX2::is_mask::value && offset == 3 && AVX2::Mask::Size == Return::Size * 4)> = nullarg) { auto tmp = AVX::hi128(k.dataI()); tmp = _mm_unpackhi_epi8(tmp, tmp); return AVX::concat(_mm_unpacklo_epi16(tmp, tmp), _mm_unpackhi_epi16(tmp, tmp)); } // 1 SSE::Mask to N AVX2::Mask {{{2 Vc_SIMD_CAST_OFFSET(SSE:: short_m, AVX2::double_m, 1) { auto tmp = _mm_unpackhi_epi16(x.dataI(), x.dataI()); return AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp)); } Vc_SIMD_CAST_OFFSET(SSE::ushort_m, AVX2::double_m, 1) { auto tmp = _mm_unpackhi_epi16(x.dataI(), x.dataI()); return AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp)); } // AVX2 to SSE (Mask) {{{2 template Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_mask::value && sizeof(AVX2::Mask) == 32), Return> simd_cast(AVX2::Mask x) { using M = AVX2::Mask; constexpr int shift = sizeof(M) / M::Size * offset * Return::Size; static_assert(shift > 0, ""); static_assert(shift < sizeof(M), ""); using SseVector = SSE::Mask>; if (shift == 16) { return simd_cast(SseVector{AVX::hi128(x.data())}); } using Intrin = typename SseVector::VectorType; return simd_cast(SseVector{AVX::avx_cast( _mm_alignr_epi8(AVX::hi128(x.dataI()), AVX::lo128(x.dataI()), shift))}); } template Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_mask::value && sizeof(AVX2::Mask) == 16), Return> simd_cast(AVX2::Mask x) { return simd_cast(simd_cast>(x)); } // undef Vc_SIMD_CAST_AVX_[1234] & Vc_SIMD_CAST_[12345678] {{{1 #undef Vc_SIMD_CAST_AVX_1 #undef Vc_SIMD_CAST_AVX_2 #undef Vc_SIMD_CAST_AVX_3 #undef Vc_SIMD_CAST_AVX_4 #undef Vc_SIMD_CAST_1 #undef Vc_SIMD_CAST_2 #undef Vc_SIMD_CAST_3 #undef Vc_SIMD_CAST_4 #undef Vc_SIMD_CAST_5 #undef Vc_SIMD_CAST_6 #undef Vc_SIMD_CAST_7 #undef Vc_SIMD_CAST_8 #undef Vc_SIMD_CAST_OFFSET // }}}1 } // namespace Vc #endif // VC_AVX_SIMD_CAST_H_ // vim: foldmethod=marker Vc-1.3.3/avx/simd_cast_caller.tcc000066400000000000000000000042431320703111200166400ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2014-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ #ifndef Vc_AVX_SIMD_CAST_CALLER_TCC_ #define Vc_AVX_SIMD_CAST_CALLER_TCC_ #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { #if Vc_IS_VERSION_1 template template Vc_INTRINSIC Vector::Vector(U &&x) : d(simd_cast(std::forward(x)).data()) { } template template Vc_INTRINSIC Mask::Mask(U &&rhs, Common::enable_if_mask_converts_explicitly) : Mask(simd_cast(std::forward(rhs))) { } #endif // Vc_IS_VERSION_1 } #endif // Vc_AVX_SIMD_CAST_CALLER_TCC_ // vim: foldmethod=marker Vc-1.3.3/avx/types.h000066400000000000000000000105321320703111200141700ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2009-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
}}}*/ #ifndef VC_AVX_TYPES_H_ #define VC_AVX_TYPES_H_ #include "../sse/types.h" #include "../traits/type_traits.h" #include "macros.h" #ifdef Vc_DEFAULT_IMPL_AVX2 #define Vc_DOUBLE_V_SIZE 4 #define Vc_FLOAT_V_SIZE 8 #define Vc_INT_V_SIZE 8 #define Vc_UINT_V_SIZE 8 #define Vc_SHORT_V_SIZE 16 #define Vc_USHORT_V_SIZE 16 #elif defined Vc_DEFAULT_IMPL_AVX #define Vc_DOUBLE_V_SIZE 4 #define Vc_FLOAT_V_SIZE 8 #define Vc_INT_V_SIZE 4 #define Vc_UINT_V_SIZE 4 #define Vc_SHORT_V_SIZE 8 #define Vc_USHORT_V_SIZE 8 #endif namespace Vc_VERSIONED_NAMESPACE { namespace AVX { template using Vector = Vc::Vector>; typedef Vector double_v; typedef Vector float_v; typedef Vector int_v; typedef Vector uint_v; typedef Vector short_v; typedef Vector ushort_v; template using Mask = Vc::Mask>; typedef Mask double_m; typedef Mask float_m; typedef Mask int_m; typedef Mask uint_m; typedef Mask short_m; typedef Mask ushort_m; template struct Const; template struct is_vector : public std::false_type {}; template struct is_vector> : public std::true_type {}; template struct is_mask : public std::false_type {}; template struct is_mask> : public std::true_type {}; } // namespace AVX namespace AVX2 { template using Vector = Vc::Vector; using double_v = Vector; using float_v = Vector< float>; using int_v = Vector< int>; using uint_v = Vector< uint>; using short_v = Vector< short>; using ushort_v = Vector; template using Mask = Vc::Mask; using double_m = Mask; using float_m = Mask< float>; using llong_m = Mask< llong>; using ullong_m = Mask; using long_m = Mask< long>; using ulong_m = Mask< ulong>; using int_m = Mask< int>; using uint_m = Mask< uint>; using short_m = Mask< short>; using ushort_m = Mask; using schar_m = Mask< schar>; using uchar_m = Mask< uchar>; template struct is_vector : public std::false_type {}; template struct is_vector> : public std::true_type {}; template struct is_mask : public std::false_type {}; template struct is_mask> : public std::true_type {}; } // namespace AVX2 namespace Traits { template struct is_simd_vector_internal> : public is_valid_vector_argument {}; template struct is_simd_mask_internal> : public std::true_type {}; } // namespace Traits } // namespace Vc #endif // VC_AVX_TYPES_H_ Vc-1.3.3/avx/vector.h000066400000000000000000000542351320703111200143360ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2009-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ #ifndef VC_AVX_VECTOR_H_ #define VC_AVX_VECTOR_H_ #include "intrinsics.h" #include "casts.h" #include "../sse/vector.h" #include "shuffle.h" #include "vectorhelper.h" #include "mask.h" #include #include #include "../common/aliasingentryhelper.h" #include "../common/memoryfwd.h" #include "../common/where.h" #include "macros.h" #ifdef isfinite #undef isfinite #endif #ifdef isnan #undef isnan #endif namespace Vc_VERSIONED_NAMESPACE { namespace Detail { template struct VectorTraits { using mask_type = Vc::Mask; using vector_type = Vc::Vector; using writemasked_vector_type = Common::WriteMaskedVector; using intrinsic_type = typename AVX::VectorTypeHelper::Type; }; } // namespace Detail #define Vc_CURRENT_CLASS_NAME Vector template class Vector { public: using abi = VectorAbi::Avx; private: using traits_type = Detail::VectorTraits; static_assert( std::is_arithmetic::value, "Vector only accepts arithmetic builtin types as template parameter T."); using WriteMaskedVector = typename traits_type::writemasked_vector_type; public: using VectorType = typename traits_type::intrinsic_type; using vector_type = VectorType; using mask_type = typename traits_type::mask_type; using Mask = mask_type; using MaskType = mask_type; using MaskArg Vc_DEPRECATED_ALIAS("Use MaskArgument instead.") = typename Mask::AsArg; using MaskArgument = typename Mask::AsArg; using reference = Detail::ElementReference; Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(VectorType)); using EntryType = typename Common::ensure_alignment_equals_sizeof::type; using value_type = EntryType; typedef EntryType VectorEntryType; static constexpr size_t Size = sizeof(VectorType) / sizeof(EntryType); static constexpr size_t MemoryAlignment = alignof(VectorType); enum Constants { HasVectorDivision = AVX::HasVectorDivisionHelper::Value }; #ifdef Vc_IMPL_AVX2 typedef typename std::conditional< (Size >= 8), SimdArray, typename std::conditional<(Size >= 4), SimdArray, SimdArray>::type>::type IndexType; #else typedef typename std::conditional<(Size >= 4), SimdArray, SimdArray>::type IndexType; #endif typedef Vector AsArg; typedef VectorType VectorTypeArg; protected: template using V = Vector; // helper that specializes on VectorType typedef AVX::VectorHelper HV; // helper that specializes on T typedef AVX::VectorHelper HT; // cast any m256/m128 to VectorType template static Vc_INTRINSIC VectorType _cast(V v) { return AVX::avx_cast(v); } typedef Common::VectorMemoryUnion StorageType; StorageType d; using WidthT = Common::WidthT; // ICC can't compile this: // static constexpr WidthT Width = WidthT(); public: #include "../common/generalinterface.h" static Vc_ALWAYS_INLINE_L Vector Random() Vc_ALWAYS_INLINE_R; /////////////////////////////////////////////////////////////////////////////////////////// // internal: required to enable returning objects of VectorType Vc_ALWAYS_INLINE Vector(VectorTypeArg x) : d(x) {} // implict conversion from compatible Vector template Vc_INTRINSIC Vector( V x, typename std::enable_if::value, void *>::type = nullptr) : 
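/* MemoryAlignment above is the alignment that the Aligned load/store flags
   require. A short usage sketch, assuming the public load constructor that
   common/loadinterface.h (included further below) provides:

   alignas(Vc::float_v::MemoryAlignment) float buf[Vc::float_v::Size];
   // ... fill buf ...
   Vc::float_v v(&buf[0], Vc::Aligned);  // aligned load; Vc::Unaligned also works
*/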
d(AVX::convert(x.data())) { } #if Vc_IS_VERSION_1 // static_cast from the remaining Vector template Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between " "vector types") Vc_INTRINSIC explicit Vector( V x, typename std::enable_if::value, void *>::type = nullptr) : d(Detail::zeroExtendIfNeeded(AVX::convert(x.data()))) { } // static_cast from other types, implemented via the non-member simd_cast function in // simd_cast_caller.tcc template ::value && !std::is_same>::value>> Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between " "vector types") Vc_INTRINSIC_L explicit Vector(U &&x) Vc_INTRINSIC_R; #endif /////////////////////////////////////////////////////////////////////////////////////////// // broadcast Vc_INTRINSIC Vector(EntryType a) : d(Detail::avx_broadcast(a)) {} template Vc_INTRINSIC Vector(U a, typename std::enable_if::value && !std::is_same::value, void *>::type = nullptr) : Vector(static_cast(a)) { } //template explicit Vector(std::initializer_list) { static_assert(std::is_same::value, "A SIMD vector object cannot be initialized from an initializer list " "because the number of entries in the vector is target-dependent."); } #include "../common/loadinterface.h" #include "../common/storeinterface.h" /////////////////////////////////////////////////////////////////////////////////////////// // zeroing Vc_INTRINSIC_L void setZero() Vc_INTRINSIC_R; Vc_INTRINSIC_L void setZero(const Mask &k) Vc_INTRINSIC_R; Vc_INTRINSIC_L void setZeroInverted(const Mask &k) Vc_INTRINSIC_R; Vc_INTRINSIC_L void setQnan() Vc_INTRINSIC_R; Vc_INTRINSIC_L void setQnan(MaskArgument k) Vc_INTRINSIC_R; #include "../common/gatherinterface.h" #include "../common/scatterinterface.h" #if defined Vc_IMPL_AVX2 && !defined Vc_MSVC // skip this code for MSVC because it fails to do overload resolution correctly Vc_INTRINSIC_L void gatherImplementation( const EntryType *mem, typename std::conditional< Size == 8, AVX2::int_v, typename std::conditional::type>::type indexes) Vc_INTRINSIC_R; template Vc_INTRINSIC enable_if::value && std::is_integral::value && (sizeof(MT) >= sizeof(short)), void> gatherImplementation(const MT *mem, const SimdArray &indexes) { *this = simd_cast(SimdArray(mem, indexes)); } template Vc_INTRINSIC enable_if::value && sizeof(EntryType) == 2, void> gatherImplementation(const EntryType *mem, const SimdArray &indexes) { const auto lo = simd_cast(indexes); const auto hi = simd_cast(indexes); *this = simd_cast( AVX2::int_v(_mm256_i32gather_epi32( reinterpret_cast *>(mem), lo.data(), 2)), AVX2::int_v(_mm256_i32gather_epi32( reinterpret_cast *>(mem), hi.data(), 2))); } template Vc_INTRINSIC enable_if::value && Size == 8, void> gatherImplementation(const EntryType *mem, const SimdArray &indexes) { gatherImplementation(mem, simd_cast(indexes)); } template Vc_INTRINSIC enable_if::value && Size == 4, void> gatherImplementation(const EntryType *mem, const SimdArray &indexes) { gatherImplementation(mem, simd_cast(indexes)); } #endif // Vc_IMPL_AVX2 && !MSVC /////////////////////////////////////////////////////////////////////////////////////////// //prefix Vc_ALWAYS_INLINE Vector &operator++() { data() = Detail::add(data(), Detail::one(T()), T()); return *this; } Vc_ALWAYS_INLINE Vector &operator--() { data() = Detail::sub(data(), Detail::one(T()), T()); return *this; } //postfix Vc_ALWAYS_INLINE Vector operator++(int) { const Vector r = *this; data() = Detail::add(data(), Detail::one(T()), T()); return r; } Vc_ALWAYS_INLINE Vector operator--(int) { const 
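/* The sizeof(EntryType) == 2 gather above has no direct hardware counterpart:
   AVX2 only offers 32- and 64-bit gathers. It therefore issues 32-bit gathers
   with scale 2, so each lane loads the two bytes at mem + 2*index plus two
   don't-care bytes, and only the low 16 bits per lane are kept afterwards. A
   standalone sketch of the same trick for eight values (hypothetical helper,
   plain intrinsics; the library wraps the pointer cast in a may-alias type):

   #include <immintrin.h>
   static inline __m128i gather8_epi16_via_epi32(const short *mem, __m256i idx)
   {
       const __m256i raw  = _mm256_i32gather_epi32(
           reinterpret_cast<const int *>(mem), idx, 2);            // scale in bytes
       const __m256i lo16 = _mm256_and_si256(raw, _mm256_set1_epi32(0xffff));
       return _mm_packus_epi32(_mm256_castsi256_si128(lo16),       // values fit in 16
                               _mm256_extracti128_si256(lo16, 1)); // bits, no saturation
   }
   // Caveat shared with the code above: every lane reads 4 bytes, i.e. 2 bytes
   // past the addressed element.
*/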
Vector r = *this; data() = Detail::sub(data(), Detail::one(T()), T()); return r; } private: friend reference; Vc_INTRINSIC static value_type get(const Vector &o, int i) noexcept { return o.d.m(i); } template Vc_INTRINSIC static void set(Vector &o, int i, U &&v) noexcept( noexcept(std::declval() = v)) { return o.d.set(i, v); } public: /** * \note the returned object models the concept of a reference and * as such it can exist longer than the data it is referencing. * \note to avoid lifetime issues, we strongly advice not to store * any reference objects. */ Vc_ALWAYS_INLINE reference operator[](size_t index) noexcept { static_assert(noexcept(reference{std::declval(), int()}), ""); return {*this, int(index)}; } Vc_ALWAYS_INLINE value_type operator[](size_t index) const noexcept { return d.m(index); } Vc_INTRINSIC_L Vc_PURE_L Vector operator[](Permutation::ReversedTag) const Vc_INTRINSIC_R Vc_PURE_R; Vc_INTRINSIC_L Vc_PURE_L Vector operator[](const IndexType &perm) const Vc_INTRINSIC_R Vc_PURE_R; Vc_INTRINSIC Vc_PURE Mask operator!() const { return *this == Zero(); } Vc_ALWAYS_INLINE Vector operator~() const { #ifndef Vc_ENABLE_FLOAT_BIT_OPERATORS static_assert(std::is_integral::value, "bit-complement can only be used with Vectors of integral type"); #endif return Detail::andnot_(data(), Detail::allone()); } Vc_ALWAYS_INLINE_L Vc_PURE_L Vector operator-() const Vc_ALWAYS_INLINE_R Vc_PURE_R; Vc_INTRINSIC Vc_PURE Vector operator+() const { return *this; } // shifts #define Vc_OP_VEC(op) \ Vc_INTRINSIC Vector &operator op##=(AsArg x); \ Vc_INTRINSIC Vc_PURE Vector operator op(AsArg x) const \ { \ static_assert( \ std::is_integral::value, \ "bitwise-operators can only be used with Vectors of integral type"); \ } Vc_ALL_SHIFTS(Vc_OP_VEC); #undef Vc_OP_VEC Vc_ALWAYS_INLINE_L Vector &operator>>=(int x) Vc_ALWAYS_INLINE_R; Vc_ALWAYS_INLINE_L Vector &operator<<=(int x) Vc_ALWAYS_INLINE_R; Vc_ALWAYS_INLINE_L Vector operator>>(int x) const Vc_ALWAYS_INLINE_R; Vc_ALWAYS_INLINE_L Vector operator<<(int x) const Vc_ALWAYS_INLINE_R; Vc_DEPRECATED("use isnegative(x) instead") Vc_INTRINSIC Vc_PURE Mask isNegative() const { return Vc::isnegative(*this); } Vc_ALWAYS_INLINE void assign( const Vector &v, const Mask &mask ) { const VectorType k = _cast(mask.data()); data() = Detail::blend(data(), v.data(), k); } template Vc_DEPRECATED("Use simd_cast instead of Vector::staticCast") Vc_ALWAYS_INLINE V2 staticCast() const { return V2(*this); } template Vc_DEPRECATED("use reinterpret_components_cast instead") Vc_ALWAYS_INLINE V2 reinterpretCast() const { return AVX::avx_cast(data()); } Vc_ALWAYS_INLINE WriteMaskedVector operator()(const Mask &k) { return {*this, k}; } Vc_ALWAYS_INLINE VectorType &data() { return d.v(); } Vc_ALWAYS_INLINE const VectorType &data() const { return d.v(); } template Vc_INTRINSIC_L Vector broadcast() const Vc_INTRINSIC_R; Vc_INTRINSIC_L std::pair minIndex() const Vc_INTRINSIC_R; Vc_INTRINSIC_L std::pair maxIndex() const Vc_INTRINSIC_R; Vc_ALWAYS_INLINE EntryType min() const { return Detail::min(data(), T()); } Vc_ALWAYS_INLINE EntryType max() const { return Detail::max(data(), T()); } Vc_ALWAYS_INLINE EntryType product() const { return Detail::mul(data(), T()); } Vc_ALWAYS_INLINE EntryType sum() const { return Detail::add(data(), T()); } Vc_ALWAYS_INLINE_L Vector partialSum() const Vc_ALWAYS_INLINE_R; //template Vc_ALWAYS_INLINE_L Vector partialSum(BinaryOperation op) const Vc_ALWAYS_INLINE_R; Vc_ALWAYS_INLINE_L EntryType min(MaskArgument m) const Vc_ALWAYS_INLINE_R; Vc_ALWAYS_INLINE_L 
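/* Usage notes for the element access and write-masking members above, in
   terms of the public API:

   Vc::float_v v = Vc::float_v::Zero();
   v[2] = 1.f;            // fine: the reference proxy is used and discarded at once
   const float x = v[2];  // plain value read
   // auto r = v[2];      // discouraged (see the note above): r is a proxy that
                          // refers into v and can outlive it
   v(v < 0.5f) = 0.f;     // operator()(mask) writes only the selected lanes
*/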
EntryType max(MaskArgument m) const Vc_ALWAYS_INLINE_R; Vc_ALWAYS_INLINE_L EntryType product(MaskArgument m) const Vc_ALWAYS_INLINE_R; Vc_ALWAYS_INLINE_L EntryType sum(MaskArgument m) const Vc_ALWAYS_INLINE_R; Vc_INTRINSIC_L Vector shifted(int amount, Vector shiftIn) const Vc_INTRINSIC_R; Vc_INTRINSIC_L Vector shifted(int amount) const Vc_INTRINSIC_R; Vc_INTRINSIC_L Vector rotated(int amount) const Vc_INTRINSIC_R; Vc_INTRINSIC_L Vc_PURE_L Vector reversed() const Vc_INTRINSIC_R Vc_PURE_R; Vc_ALWAYS_INLINE_L Vc_PURE_L Vector sorted() const Vc_ALWAYS_INLINE_R Vc_PURE_R; template void callWithValuesSorted(F &&f) { EntryType value = d.m(0); f(value); for (size_t i = 1; i < Size; ++i) { if (d.m(i) != value) { value = d.m(i); f(value); } } } template Vc_INTRINSIC void call(F &&f) const { Common::for_all_vector_entries([&](size_t i) { f(EntryType(d.m(i))); }); } template Vc_INTRINSIC void call(F &&f, const Mask &mask) const { for (size_t i : where(mask)) { f(EntryType(d.m(i))); } } template Vc_INTRINSIC Vector apply(F &&f) const { Vector r; Common::for_all_vector_entries( [&](size_t i) { r.d.set(i, f(EntryType(d.m(i)))); }); return r; } template Vc_INTRINSIC Vector apply(F &&f, const Mask &mask) const { Vector r(*this); for (size_t i : where(mask)) { r.d.set(i, f(EntryType(r.d.m(i)))); } return r; } template Vc_INTRINSIC void fill(EntryType (&f)(IndexT)) { Common::for_all_vector_entries([&](size_t i) { d.set(i, f(i)); }); } Vc_INTRINSIC void fill(EntryType (&f)()) { Common::for_all_vector_entries([&](size_t i) { d.set(i, f()); }); } template static Vc_INTRINSIC_L Vector generate(G gen) Vc_INTRINSIC_R; Vc_DEPRECATED("use copysign(x, y) instead") Vc_INTRINSIC Vector copySign(AsArg reference) const { return Vc::copysign(*this, reference); } Vc_DEPRECATED("use exponent(x) instead") Vc_INTRINSIC Vector exponent() const { Vc::exponent(*this); } Vc_INTRINSIC_L Vector interleaveLow(Vector x) const Vc_INTRINSIC_R; Vc_INTRINSIC_L Vector interleaveHigh(Vector x) const Vc_INTRINSIC_R; }; #undef Vc_CURRENT_CLASS_NAME template constexpr size_t Vector::Size; template constexpr size_t Vector::MemoryAlignment; static_assert(Traits::is_simd_vector::value, "is_simd_vector::value"); static_assert(Traits::is_simd_vector::value, "is_simd_vector< float_v>::value"); static_assert(Traits::is_simd_vector::value, "is_simd_vector< int_v>::value"); static_assert(Traits::is_simd_vector::value, "is_simd_vector< uint_v>::value"); static_assert(Traits::is_simd_vector::value, "is_simd_vector< short_v>::value"); static_assert(Traits::is_simd_vector::value, "is_simd_vector::value"); static_assert(Traits::is_simd_mask ::value, "is_simd_mask ::value"); static_assert(Traits::is_simd_mask ::value, "is_simd_mask < float_m>::value"); static_assert(Traits::is_simd_mask ::value, "is_simd_mask < int_m>::value"); static_assert(Traits::is_simd_mask ::value, "is_simd_mask < uint_m>::value"); static_assert(Traits::is_simd_mask ::value, "is_simd_mask < short_m>::value"); static_assert(Traits::is_simd_mask ::value, "is_simd_mask ::value"); #ifdef Vc_IMPL_AVX2 static_assert(!std::is_convertible::value, "A float* should never implicitly convert to short_v. Something is broken."); static_assert(!std::is_convertible::value, "An int* should never implicitly convert to short_v. Something is broken."); static_assert(!std::is_convertible::value, "A short* should never implicitly convert to short_v. 
Something is broken."); #endif #define Vc_CONDITIONAL_ASSIGN(name_, op_) \ template \ Vc_INTRINSIC enable_if conditional_assign( \ AVX2::Vector &lhs, M &&mask, U &&rhs) \ { \ lhs(mask) op_ rhs; \ } \ Vc_NOTHING_EXPECTING_SEMICOLON Vc_CONDITIONAL_ASSIGN( Assign, =); Vc_CONDITIONAL_ASSIGN( PlusAssign, +=); Vc_CONDITIONAL_ASSIGN( MinusAssign, -=); Vc_CONDITIONAL_ASSIGN( MultiplyAssign, *=); Vc_CONDITIONAL_ASSIGN( DivideAssign, /=); Vc_CONDITIONAL_ASSIGN( RemainderAssign, %=); Vc_CONDITIONAL_ASSIGN( XorAssign, ^=); Vc_CONDITIONAL_ASSIGN( AndAssign, &=); Vc_CONDITIONAL_ASSIGN( OrAssign, |=); Vc_CONDITIONAL_ASSIGN( LeftShiftAssign,<<=); Vc_CONDITIONAL_ASSIGN(RightShiftAssign,>>=); #undef Vc_CONDITIONAL_ASSIGN #define Vc_CONDITIONAL_ASSIGN(name_, expr_) \ template \ Vc_INTRINSIC enable_if> conditional_assign( \ AVX2::Vector &lhs, M &&mask) \ { \ return expr_; \ } \ Vc_NOTHING_EXPECTING_SEMICOLON Vc_CONDITIONAL_ASSIGN(PostIncrement, lhs(mask)++); Vc_CONDITIONAL_ASSIGN( PreIncrement, ++lhs(mask)); Vc_CONDITIONAL_ASSIGN(PostDecrement, lhs(mask)--); Vc_CONDITIONAL_ASSIGN( PreDecrement, --lhs(mask)); #undef Vc_CONDITIONAL_ASSIGN } // namespace Vc #include "vector.tcc" #include "simd_cast.h" #endif // VC_AVX_VECTOR_H_ Vc-1.3.3/avx/vector.tcc000066400000000000000000001246331320703111200146600ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2011-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
}}}*/ #include "../common/x86_prefetches.h" #include "../common/gatherimplementation.h" #include "../common/scatterimplementation.h" #include "limits.h" #include "const.h" #include "../common/set.h" #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { namespace Detail { // compare operators {{{1 Vc_INTRINSIC AVX2::double_m operator==(AVX2::double_v a, AVX2::double_v b) { return AVX::cmpeq_pd(a.data(), b.data()); } Vc_INTRINSIC AVX2:: float_m operator==(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpeq_ps(a.data(), b.data()); } Vc_INTRINSIC AVX2:: int_m operator==(AVX2:: int_v a, AVX2:: int_v b) { return AVX::cmpeq_epi32(a.data(), b.data()); } Vc_INTRINSIC AVX2:: uint_m operator==(AVX2:: uint_v a, AVX2:: uint_v b) { return AVX::cmpeq_epi32(a.data(), b.data()); } Vc_INTRINSIC AVX2:: short_m operator==(AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmpeq_epi16(a.data(), b.data()); } Vc_INTRINSIC AVX2::ushort_m operator==(AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmpeq_epi16(a.data(), b.data()); } Vc_INTRINSIC AVX2::double_m operator!=(AVX2::double_v a, AVX2::double_v b) { return AVX::cmpneq_pd(a.data(), b.data()); } Vc_INTRINSIC AVX2:: float_m operator!=(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpneq_ps(a.data(), b.data()); } Vc_INTRINSIC AVX2:: int_m operator!=(AVX2:: int_v a, AVX2:: int_v b) { return not_(AVX::cmpeq_epi32(a.data(), b.data())); } Vc_INTRINSIC AVX2:: uint_m operator!=(AVX2:: uint_v a, AVX2:: uint_v b) { return not_(AVX::cmpeq_epi32(a.data(), b.data())); } Vc_INTRINSIC AVX2:: short_m operator!=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmpeq_epi16(a.data(), b.data())); } Vc_INTRINSIC AVX2::ushort_m operator!=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmpeq_epi16(a.data(), b.data())); } Vc_INTRINSIC AVX2::double_m operator>=(AVX2::double_v a, AVX2::double_v b) { return AVX::cmpnlt_pd(a.data(), b.data()); } Vc_INTRINSIC AVX2:: float_m operator>=(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpnlt_ps(a.data(), b.data()); } Vc_INTRINSIC AVX2:: int_m operator>=(AVX2:: int_v a, AVX2:: int_v b) { return not_(AVX::cmplt_epi32(a.data(), b.data())); } Vc_INTRINSIC AVX2:: uint_m operator>=(AVX2:: uint_v a, AVX2:: uint_v b) { return not_(AVX::cmplt_epu32(a.data(), b.data())); } Vc_INTRINSIC AVX2:: short_m operator>=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmplt_epi16(a.data(), b.data())); } Vc_INTRINSIC AVX2::ushort_m operator>=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmplt_epu16(a.data(), b.data())); } Vc_INTRINSIC AVX2::double_m operator<=(AVX2::double_v a, AVX2::double_v b) { return AVX::cmple_pd(a.data(), b.data()); } Vc_INTRINSIC AVX2:: float_m operator<=(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmple_ps(a.data(), b.data()); } Vc_INTRINSIC AVX2:: int_m operator<=(AVX2:: int_v a, AVX2:: int_v b) { return not_(AVX::cmpgt_epi32(a.data(), b.data())); } Vc_INTRINSIC AVX2:: uint_m operator<=(AVX2:: uint_v a, AVX2:: uint_v b) { return not_(AVX::cmpgt_epu32(a.data(), b.data())); } Vc_INTRINSIC AVX2:: short_m operator<=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmpgt_epi16(a.data(), b.data())); } Vc_INTRINSIC AVX2::ushort_m operator<=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmpgt_epu16(a.data(), b.data())); } Vc_INTRINSIC AVX2::double_m operator> (AVX2::double_v a, AVX2::double_v b) { return AVX::cmpgt_pd(a.data(), b.data()); } Vc_INTRINSIC AVX2:: float_m operator> (AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpgt_ps(a.data(), b.data()); } Vc_INTRINSIC AVX2:: 
int_m operator> (AVX2:: int_v a, AVX2:: int_v b) { return AVX::cmpgt_epi32(a.data(), b.data()); } Vc_INTRINSIC AVX2:: uint_m operator> (AVX2:: uint_v a, AVX2:: uint_v b) { return AVX::cmpgt_epu32(a.data(), b.data()); } Vc_INTRINSIC AVX2:: short_m operator> (AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmpgt_epi16(a.data(), b.data()); } Vc_INTRINSIC AVX2::ushort_m operator> (AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmpgt_epu16(a.data(), b.data()); } Vc_INTRINSIC AVX2::double_m operator< (AVX2::double_v a, AVX2::double_v b) { return AVX::cmplt_pd(a.data(), b.data()); } Vc_INTRINSIC AVX2:: float_m operator< (AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmplt_ps(a.data(), b.data()); } Vc_INTRINSIC AVX2:: int_m operator< (AVX2:: int_v a, AVX2:: int_v b) { return AVX::cmplt_epi32(a.data(), b.data()); } Vc_INTRINSIC AVX2:: uint_m operator< (AVX2:: uint_v a, AVX2:: uint_v b) { return AVX::cmplt_epu32(a.data(), b.data()); } Vc_INTRINSIC AVX2:: short_m operator< (AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmplt_epi16(a.data(), b.data()); } Vc_INTRINSIC AVX2::ushort_m operator< (AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmplt_epu16(a.data(), b.data()); } // bitwise operators {{{1 template Vc_INTRINSIC AVX2::Vector operator^(AVX2::Vector a, AVX2::Vector b) { return xor_(a.data(), b.data()); } template Vc_INTRINSIC AVX2::Vector operator&(AVX2::Vector a, AVX2::Vector b) { return and_(a.data(), b.data()); } template Vc_INTRINSIC AVX2::Vector operator|(AVX2::Vector a, AVX2::Vector b) { return or_(a.data(), b.data()); } // }}}1 // arithmetic operators {{{1 template Vc_INTRINSIC AVX2::Vector operator+(AVX2::Vector a, AVX2::Vector b) { return add(a.data(), b.data(), T()); } template Vc_INTRINSIC AVX2::Vector operator-(AVX2::Vector a, AVX2::Vector b) { return sub(a.data(), b.data(), T()); } template Vc_INTRINSIC AVX2::Vector operator*(AVX2::Vector a, AVX2::Vector b) { return mul(a.data(), b.data(), T()); } template Vc_INTRINSIC AVX2::Vector operator/(AVX2::Vector a, AVX2::Vector b) { return div(a.data(), b.data(), T()); } Vc_INTRINSIC AVX2::Vector operator/(AVX2::Vector a, AVX2::Vector b) { using namespace AVX; const __m256 lo = _mm256_div_ps(convert(lo128(a.data())), convert(lo128(b.data()))); const __m256 hi = _mm256_div_ps(convert(hi128(a.data())), convert(hi128(b.data()))); const float_v threshold = 32767.f; using Detail::operator>; const __m128i loShort = (Vc_IS_UNLIKELY((float_v(lo) > threshold).isNotEmpty())) ? convert(lo) : convert(lo); const __m128i hiShort = (Vc_IS_UNLIKELY((float_v(hi) > threshold).isNotEmpty())) ? 
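/* Why the 16-bit division above goes through float: every 16-bit integer is
   exactly representable in single precision (2^16 < 2^24), and for operands in
   that range the rounded float quotient stays strictly between the neighbouring
   integers of the exact quotient, so truncating it back reproduces C++ integer
   division. The 32767 threshold selects a conversion path that stays correct
   when unsigned quotients exceed the signed 16-bit range. Scalar model of the
   exactness argument (ignoring division by zero; div16 is a made-up name):

   static inline unsigned short div16(unsigned short a, unsigned short b)
   {
       return static_cast<unsigned short>(float(a) / float(b));
   }
*/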
convert(hi) : convert(hi); return concat(loShort, hiShort); } template Vc_INTRINSIC enable_if::value, AVX2::Vector> operator%( AVX2::Vector a, AVX2::Vector b) { return a - a / b * b; } // }}}1 } // namespace Detail /////////////////////////////////////////////////////////////////////////////////////////// // generate {{{1 template <> template Vc_INTRINSIC AVX2::double_v AVX2::double_v::generate(G gen) { const auto tmp0 = gen(0); const auto tmp1 = gen(1); const auto tmp2 = gen(2); const auto tmp3 = gen(3); return _mm256_setr_pd(tmp0, tmp1, tmp2, tmp3); } template <> template Vc_INTRINSIC AVX2::float_v AVX2::float_v::generate(G gen) { const auto tmp0 = gen(0); const auto tmp1 = gen(1); const auto tmp2 = gen(2); const auto tmp3 = gen(3); const auto tmp4 = gen(4); const auto tmp5 = gen(5); const auto tmp6 = gen(6); const auto tmp7 = gen(7); return _mm256_setr_ps(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); } #ifdef Vc_IMPL_AVX2 template <> template Vc_INTRINSIC AVX2::int_v AVX2::int_v::generate(G gen) { const auto tmp0 = gen(0); const auto tmp1 = gen(1); const auto tmp2 = gen(2); const auto tmp3 = gen(3); const auto tmp4 = gen(4); const auto tmp5 = gen(5); const auto tmp6 = gen(6); const auto tmp7 = gen(7); return _mm256_setr_epi32(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); } template <> template Vc_INTRINSIC AVX2::uint_v AVX2::uint_v::generate(G gen) { const auto tmp0 = gen(0); const auto tmp1 = gen(1); const auto tmp2 = gen(2); const auto tmp3 = gen(3); const auto tmp4 = gen(4); const auto tmp5 = gen(5); const auto tmp6 = gen(6); const auto tmp7 = gen(7); return _mm256_setr_epi32(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); } template <> template Vc_INTRINSIC AVX2::short_v AVX2::short_v::generate(G gen) { const auto tmp0 = gen(0); const auto tmp1 = gen(1); const auto tmp2 = gen(2); const auto tmp3 = gen(3); const auto tmp4 = gen(4); const auto tmp5 = gen(5); const auto tmp6 = gen(6); const auto tmp7 = gen(7); const auto tmp8 = gen(8); const auto tmp9 = gen(9); const auto tmp10 = gen(10); const auto tmp11 = gen(11); const auto tmp12 = gen(12); const auto tmp13 = gen(13); const auto tmp14 = gen(14); const auto tmp15 = gen(15); return _mm256_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15); } template <> template Vc_INTRINSIC AVX2::ushort_v AVX2::ushort_v::generate(G gen) { const auto tmp0 = gen(0); const auto tmp1 = gen(1); const auto tmp2 = gen(2); const auto tmp3 = gen(3); const auto tmp4 = gen(4); const auto tmp5 = gen(5); const auto tmp6 = gen(6); const auto tmp7 = gen(7); const auto tmp8 = gen(8); const auto tmp9 = gen(9); const auto tmp10 = gen(10); const auto tmp11 = gen(11); const auto tmp12 = gen(12); const auto tmp13 = gen(13); const auto tmp14 = gen(14); const auto tmp15 = gen(15); return _mm256_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15); } #endif // constants {{{1 template Vc_INTRINSIC Vector::Vector(VectorSpecialInitializerZero) : d{} {} template <> Vc_INTRINSIC Vector::Vector(VectorSpecialInitializerOne) : d(AVX::setone_pd()) {} template <> Vc_INTRINSIC Vector< float, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_ps()) {} #ifdef Vc_IMPL_AVX2 template <> Vc_INTRINSIC Vector< int, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epi32()) {} template <> Vc_INTRINSIC Vector< uint, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epu32()) {} template <> Vc_INTRINSIC Vector< 
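/* generate(gen) above simply evaluates gen(0) ... gen(Size - 1) in lane order
   and packs the results with a setr intrinsic. Typical public-API use:

   const Vc::float_v ramp =
       Vc::float_v::generate([](int i) { return 0.5f * i; });  // {0, 0.5, 1, ...}
*/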
short, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epi16()) {} template <> Vc_INTRINSIC Vector::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epu16()) {} template <> Vc_INTRINSIC Vector< schar, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epi8()) {} template <> Vc_INTRINSIC Vector< uchar, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epu8()) {} #endif template Vc_ALWAYS_INLINE Vector::Vector( VectorSpecialInitializerIndexesFromZero) : Vector(AVX::IndexesFromZeroData::address(), Vc::Aligned) { } template <> Vc_ALWAYS_INLINE Vector::Vector(VectorSpecialInitializerIndexesFromZero) : Vector(AVX::IndexesFromZeroData::address(), Vc::Aligned) { } template <> Vc_ALWAYS_INLINE Vector::Vector(VectorSpecialInitializerIndexesFromZero) : Vector(AVX::IndexesFromZeroData::address(), Vc::Aligned) { } /////////////////////////////////////////////////////////////////////////////////////////// // load member functions {{{1 // general load, implemented via LoadHelper {{{2 template template Vc_INTRINSIC typename Vector:: #ifndef Vc_MSVC template #endif load_concept::type Vector::load(const SrcT *mem, Flags flags) { Common::handleLoadPrefetches(mem, flags); d.v() = Detail::load(mem, flags); } /////////////////////////////////////////////////////////////////////////////////////////// // zeroing {{{1 template Vc_INTRINSIC void Vector::setZero() { data() = Detail::zero(); } template Vc_INTRINSIC void Vector::setZero(const Mask &k) { data() = Detail::andnot_(AVX::avx_cast(k.data()), data()); } template Vc_INTRINSIC void Vector::setZeroInverted(const Mask &k) { data() = Detail::and_(AVX::avx_cast(k.data()), data()); } template<> Vc_INTRINSIC void Vector::setQnan() { data() = Detail::allone(); } template<> Vc_INTRINSIC void Vector::setQnan(MaskArgument k) { data() = _mm256_or_pd(data(), k.dataD()); } template<> Vc_INTRINSIC void Vector::setQnan() { data() = Detail::allone(); } template<> Vc_INTRINSIC void Vector::setQnan(MaskArgument k) { data() = _mm256_or_ps(data(), k.data()); } /////////////////////////////////////////////////////////////////////////////////////////// // stores {{{1 template template Vc_INTRINSIC void Vector::store(U *mem, Flags flags) const { Common::handleStorePrefetches(mem, flags); HV::template store(mem, data()); } template template Vc_INTRINSIC void Vector::store(U *mem, Mask mask, Flags flags) const { Common::handleStorePrefetches(mem, flags); HV::template store(mem, data(), AVX::avx_cast(mask.data())); } /////////////////////////////////////////////////////////////////////////////////////////// // integer ops {{{1 #ifdef Vc_IMPL_AVX2 template <> Vc_ALWAYS_INLINE AVX2::Vector< int> Vector< int, VectorAbi::Avx>::operator<<(AsArg x) const { return _mm256_sllv_epi32(d.v(), x.d.v()); } template <> Vc_ALWAYS_INLINE AVX2::Vector< uint> Vector< uint, VectorAbi::Avx>::operator<<(AsArg x) const { return _mm256_sllv_epi32(d.v(), x.d.v()); } template <> Vc_ALWAYS_INLINE AVX2::Vector< int> Vector< int, VectorAbi::Avx>::operator>>(AsArg x) const { return _mm256_srav_epi32(d.v(), x.d.v()); } template <> Vc_ALWAYS_INLINE AVX2::Vector< uint> Vector< uint, VectorAbi::Avx>::operator>>(AsArg x) const { return _mm256_srlv_epi32(d.v(), x.d.v()); } template <> Vc_ALWAYS_INLINE AVX2::Vector< short> Vector< short, VectorAbi::Avx>::operator<<(AsArg x) const { return generate([&](int i) { return get(*this, i) << get(x, i); }); } template <> Vc_ALWAYS_INLINE AVX2::Vector Vector::operator<<(AsArg x) const { return generate([&](int i) { 
return get(*this, i) << get(x, i); }); } template <> Vc_ALWAYS_INLINE AVX2::Vector< short> Vector< short, VectorAbi::Avx>::operator>>(AsArg x) const { return generate([&](int i) { return get(*this, i) >> get(x, i); }); } template <> Vc_ALWAYS_INLINE AVX2::Vector Vector::operator>>(AsArg x) const { return generate([&](int i) { return get(*this, i) >> get(x, i); }); } template Vc_ALWAYS_INLINE AVX2::Vector &Vector::operator<<=(AsArg x) { static_assert(std::is_integral::value, "bitwise-operators can only be used with Vectors of integral type"); return *this = *this << x; } template Vc_ALWAYS_INLINE AVX2::Vector &Vector::operator>>=(AsArg x) { static_assert(std::is_integral::value, "bitwise-operators can only be used with Vectors of integral type"); return *this = *this >> x; } #endif template Vc_ALWAYS_INLINE AVX2::Vector &Vector::operator>>=(int shift) { d.v() = Detail::shiftRight(d.v(), shift, T()); return *static_cast *>(this); } template Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector Vector::operator>>(int shift) const { return Detail::shiftRight(d.v(), shift, T()); } template Vc_ALWAYS_INLINE AVX2::Vector &Vector::operator<<=(int shift) { d.v() = Detail::shiftLeft(d.v(), shift, T()); return *static_cast *>(this); } template Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector Vector::operator<<(int shift) const { return Detail::shiftLeft(d.v(), shift, T()); } // isnegative {{{1 Vc_INTRINSIC Vc_CONST AVX2::float_m isnegative(AVX2::float_v x) { return AVX::avx_cast<__m256>(AVX::srai_epi32<31>( AVX::avx_cast<__m256i>(_mm256_and_ps(AVX::setsignmask_ps(), x.data())))); } Vc_INTRINSIC Vc_CONST AVX2::double_m isnegative(AVX2::double_v x) { return Mem::permute(AVX::avx_cast<__m256>(AVX::srai_epi32<31>( AVX::avx_cast<__m256i>(_mm256_and_pd(AVX::setsignmask_pd(), x.data()))))); } // gathers {{{1 template <> template inline void AVX2::double_v::gatherImplementation(const MT *mem, const IT &indexes) { d.v() = _mm256_setr_pd(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]); } template <> template inline void AVX2::float_v::gatherImplementation(const MT *mem, const IT &indexes) { d.v() = _mm256_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); } #ifdef Vc_IMPL_AVX2 #ifndef Vc_MSVC // skip this code for MSVC because it fails to do overload resolution correctly template <> Vc_INTRINSIC void AVX2::double_v::gatherImplementation(const double *mem, SSE::int_v indexes) { d.v() = _mm256_i32gather_pd(mem, indexes.data(), sizeof(double)); } template <> Vc_INTRINSIC void AVX2::float_v::gatherImplementation(const float *mem, AVX2::int_v indexes) { d.v() = _mm256_i32gather_ps(mem, indexes.data(), sizeof(float)); } template <> Vc_INTRINSIC void AVX2::int_v::gatherImplementation(const int *mem, AVX2::int_v indexes) { d.v() = _mm256_i32gather_epi32(mem, indexes.data(), sizeof(int)); } template <> Vc_INTRINSIC void AVX2::uint_v::gatherImplementation(const uint *mem, AVX2::int_v indexes) { d.v() = _mm256_i32gather_epi32(reinterpret_cast *>(mem), indexes.data(), sizeof(unsigned)); } #endif // !Vc_MSVC template <> template inline void AVX2::int_v::gatherImplementation(const MT *mem, const IT &indexes) { d.v() = _mm256_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); } template <> template inline void AVX2::uint_v::gatherImplementation(const MT *mem, const IT &indexes) { d.v() = _mm256_setr_epi32(mem[indexes[0]], mem[indexes[1]], 
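/* Public-API view of the gather paths defined here: lane i of the result is
   mem[indexes[i]]. With AVX2 the float/int/uint/double specializations above
   map to a single hardware gather; otherwise the generic overload assembles
   the vector from Size scalar loads. Sketch (table and lookup are made-up
   names):

   float table[256];
   Vc::float_v lookup(const Vc::float_v::IndexType &idx)  // all idx lanes < 256
   {
       Vc::float_v r;
       r.gather(table, idx);
       return r;
   }
*/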
mem[indexes[2]], mem[indexes[3]], mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); } template <> template inline void AVX2::short_v::gatherImplementation(const MT *mem, const IT &indexes) { d.v() = _mm256_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]], mem[indexes[8]], mem[indexes[9]], mem[indexes[10]], mem[indexes[11]], mem[indexes[12]], mem[indexes[13]], mem[indexes[14]], mem[indexes[15]]); } template <> template inline void AVX2::ushort_v::gatherImplementation(const MT *mem, const IT &indexes) { d.v() = _mm256_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]], mem[indexes[8]], mem[indexes[9]], mem[indexes[10]], mem[indexes[11]], mem[indexes[12]], mem[indexes[13]], mem[indexes[14]], mem[indexes[15]]); } #endif template template inline void Vector::gatherImplementation(const MT *mem, const IT &indexes, MaskArgument mask) { using Selector = std::integral_constant < Common::GatherScatterImplementation, #ifdef Vc_USE_SET_GATHERS Traits::is_simd_vector::value ? Common::GatherScatterImplementation::SetIndexZero : #endif #ifdef Vc_USE_BSF_GATHERS Common::GatherScatterImplementation::BitScanLoop #elif defined Vc_USE_POPCNT_BSF_GATHERS Common::GatherScatterImplementation::PopcntSwitch #else Common::GatherScatterImplementation::SimpleLoop #endif > ; Common::executeGather(Selector(), *this, mem, indexes, mask); } template template inline void Vector::scatterImplementation(MT *mem, IT &&indexes) const { Common::unrolled_loop([&](std::size_t i) { mem[indexes[i]] = d.m(i); }); } template template inline void Vector::scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const { using Selector = std::integral_constant < Common::GatherScatterImplementation, #ifdef Vc_USE_SET_GATHERS Traits::is_simd_vector::value ? 
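/* Scatter is the mirror image of gather: lane i is stored to mem[indexes[i]],
   as the unrolled loop above shows; if two lanes carry the same index, only
   one of the stores survives. Public-API sketch of a gather/update/scatter
   round trip (accumulate, bins, idx and w are made-up names):

   void accumulate(float *bins, const Vc::float_v::IndexType &idx, Vc::float_v w)
   {
       Vc::float_v tmp;
       tmp.gather(bins, idx);
       (tmp + w).scatter(bins, idx);  // totals are wrong if idx has repeated lanes
   }
*/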
Common::GatherScatterImplementation::SetIndexZero : #endif #ifdef Vc_USE_BSF_GATHERS Common::GatherScatterImplementation::BitScanLoop #elif defined Vc_USE_POPCNT_BSF_GATHERS Common::GatherScatterImplementation::PopcntSwitch #else Common::GatherScatterImplementation::SimpleLoop #endif > ; Common::executeScatter(Selector(), *this, mem, std::forward(indexes), mask); } /////////////////////////////////////////////////////////////////////////////////////////// // operator- {{{1 #ifdef Vc_USE_BUILTIN_VECTOR_TYPES template Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector Vector::operator-() const { return VectorType(-d.builtin()); } #else template Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector Vector::operator-() const { return Detail::negate(d.v(), std::integral_constant()); } #endif /////////////////////////////////////////////////////////////////////////////////////////// // horizontal ops {{{1 template Vc_INTRINSIC std::pair, int> Vector::minIndex() const { AVX2::Vector x = min(); return std::make_pair(x, (*this == x).firstOne()); } template Vc_INTRINSIC std::pair, int> Vector::maxIndex() const { AVX2::Vector x = max(); return std::make_pair(x, (*this == x).firstOne()); } template <> Vc_INTRINSIC std::pair AVX2::float_v::minIndex() const { /* // 28 cycles latency: __m256 x = _mm256_min_ps(Mem::permute128(d.v()), d.v()); x = _mm256_min_ps(x, Reg::permute(x)); AVX2::float_v xx = _mm256_min_ps(x, Reg::permute(x)); AVX2::uint_v idx = AVX2::uint_v::IndexesFromZero(); idx = _mm256_castps_si256( _mm256_or_ps((*this != xx).data(), _mm256_castsi256_ps(idx.data()))); return std::make_pair(xx, (*this == xx).firstOne()); __m128 loData = AVX::lo128(d.v()); __m128 hiData = AVX::hi128(d.v()); const __m128 less2 = _mm_cmplt_ps(hiData, loData); loData = _mm_min_ps(loData, hiData); hiData = Mem::permute(loData); const __m128 less1 = _mm_cmplt_ps(hiData, loData); loData = _mm_min_ps(loData, hiData); hiData = Mem::permute(loData); const __m128 less0 = _mm_cmplt_ps(hiData, loData); unsigned bits = _mm_movemask_ps(less0) & 0x1; bits |= ((_mm_movemask_ps(less1) << 1) - bits) & 0x2; bits |= ((_mm_movemask_ps(less2) << 3) - bits) & 0x4; loData = _mm_min_ps(loData, hiData); return std::make_pair(AVX::concat(loData, loData), bits); */ // 28 cycles Latency: __m256 x = d.v(); __m256 idx = Vector::IndexesFromZero().data(); __m256 y = Mem::permute128(x); __m256 idy = Mem::permute128(idx); __m256 less = AVX::cmplt_ps(x, y); x = _mm256_blendv_ps(y, x, less); idx = _mm256_blendv_ps(idy, idx, less); y = Reg::permute(x); idy = Reg::permute(idx); less = AVX::cmplt_ps(x, y); x = _mm256_blendv_ps(y, x, less); idx = _mm256_blendv_ps(idy, idx, less); y = Reg::permute(x); idy = Reg::permute(idx); less = AVX::cmplt_ps(x, y); idx = _mm256_blendv_ps(idy, idx, less); const auto index = _mm_cvtsi128_si32(AVX::avx_cast<__m128i>(idx)); #ifdef Vc_GNU_ASM __asm__ __volatile__(""); // help GCC to order the instructions better #endif x = _mm256_blendv_ps(y, x, less); return std::make_pair(x, index); } template Vc_ALWAYS_INLINE AVX2::Vector Vector::partialSum() const { // a b c d e f g h // + a b c d e f g -> a ab bc cd de ef fg gh // + a ab bc cd de ef -> a ab abc abcd bcde cdef defg efgh // + a ab abc abcd -> a ab abc abcd abcde abcdef abcdefg abcdefgh AVX2::Vector tmp = *this; if (Size > 1) tmp += tmp.shifted(-1); if (Size > 2) tmp += tmp.shifted(-2); if (Size > 4) tmp += tmp.shifted(-4); if (Size > 8) tmp += tmp.shifted(-8); if (Size > 16) tmp += tmp.shifted(-16); return tmp; } /* This function requires correct masking because the neutral element of \p op is 
not necessarily 0 * template template Vc_ALWAYS_INLINE AVX2::Vector Vector::partialSum(BinaryOperation op) const { // a b c d e f g h // + a b c d e f g -> a ab bc cd de ef fg gh // + a ab bc cd de ef -> a ab abc abcd bcde cdef defg efgh // + a ab abc abcd -> a ab abc abcd abcde abcdef abcdefg abcdefgh AVX2::Vector tmp = *this; Mask mask(true); if (Size > 1) tmp(mask) = op(tmp, tmp.shifted(-1)); if (Size > 2) tmp(mask) = op(tmp, tmp.shifted(-2)); if (Size > 4) tmp(mask) = op(tmp, tmp.shifted(-4)); if (Size > 8) tmp(mask) = op(tmp, tmp.shifted(-8)); if (Size > 16) tmp(mask) = op(tmp, tmp.shifted(-16)); return tmp; } */ template Vc_ALWAYS_INLINE typename Vector::EntryType Vector::min(MaskArgument m) const { AVX2::Vector tmp = std::numeric_limits >::max(); tmp(m) = *this; return tmp.min(); } template Vc_ALWAYS_INLINE typename Vector::EntryType Vector::max(MaskArgument m) const { AVX2::Vector tmp = std::numeric_limits >::min(); tmp(m) = *this; return tmp.max(); } template Vc_ALWAYS_INLINE typename Vector::EntryType Vector::product(MaskArgument m) const { AVX2::Vector tmp(Vc::One); tmp(m) = *this; return tmp.product(); } template Vc_ALWAYS_INLINE typename Vector::EntryType Vector::sum(MaskArgument m) const { AVX2::Vector tmp(Vc::Zero); tmp(m) = *this; return tmp.sum(); }//}}} // exponent {{{1 namespace Detail { Vc_INTRINSIC Vc_CONST __m256 exponent(__m256 v) { using namespace AVX; __m128i tmp0 = _mm_srli_epi32(avx_cast<__m128i>(v), 23); __m128i tmp1 = _mm_srli_epi32(avx_cast<__m128i>(hi128(v)), 23); tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x7f)); tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x7f)); return _mm256_cvtepi32_ps(concat(tmp0, tmp1)); } Vc_INTRINSIC Vc_CONST __m256d exponent(__m256d v) { using namespace AVX; __m128i tmp0 = _mm_srli_epi64(avx_cast<__m128i>(v), 52); __m128i tmp1 = _mm_srli_epi64(avx_cast<__m128i>(hi128(v)), 52); tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x3ff)); tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x3ff)); return _mm256_cvtepi32_pd(avx_cast<__m128i>(Mem::shuffle(avx_cast<__m128>(tmp0), avx_cast<__m128>(tmp1)))); } } // namespace Detail Vc_INTRINSIC Vc_CONST AVX2::float_v exponent(AVX2::float_v x) { using Detail::operator>=; Vc_ASSERT((x >= x.Zero()).isFull()); return Detail::exponent(x.data()); } Vc_INTRINSIC Vc_CONST AVX2::double_v exponent(AVX2::double_v x) { using Detail::operator>=; Vc_ASSERT((x >= x.Zero()).isFull()); return Detail::exponent(x.data()); } // }}}1 // Random {{{1 static Vc_ALWAYS_INLINE __m256i _doRandomStep() { using Detail::operator*; using Detail::operator+; #ifdef Vc_IMPL_AVX2 using AVX2::uint_v; uint_v state0(&Common::RandomState[0]); uint_v state1(&Common::RandomState[uint_v::Size]); (state1 * uint_v(0xdeece66du) + uint_v(11)).store(&Common::RandomState[uint_v::Size]); uint_v(Detail::xor_((state0 * uint_v(0xdeece66du) + uint_v(11)).data(), _mm256_srli_epi32(state1.data(), 16))) .store(&Common::RandomState[0]); return state0.data(); #else using SSE::uint_v; uint_v state0(&Common::RandomState[0]); uint_v state1(&Common::RandomState[uint_v::Size]); uint_v state2(&Common::RandomState[2 * uint_v::Size]); uint_v state3(&Common::RandomState[3 * uint_v::Size]); (state2 * uint_v(0xdeece66du) + uint_v(11)) .store(&Common::RandomState[2 * uint_v::Size]); (state3 * uint_v(0xdeece66du) + uint_v(11)) .store(&Common::RandomState[3 * uint_v::Size]); uint_v(Detail::xor_((state0 * uint_v(0xdeece66du) + uint_v(11)).data(), _mm_srli_epi32(state2.data(), 16))) .store(&Common::RandomState[0]); uint_v(Detail::xor_((state1 * uint_v(0xdeece66du) + 
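/* Scalar model of one lane of _doRandomStep() above: two linear congruential
   streams using the drand48 multiplier (0xdeece66d is its low 32 bits; the
   full 0x5deece66d appears in double_v::Random() below), where the high half
   of the second stream is XORed into the first because an LCG's low bits are
   weak. s0 and s1 model one lane of RandomState; the helper name is made up.

   static inline unsigned random_step(unsigned &s0, unsigned &s1)
   {
       const unsigned out = s0;                // Random() returns the pre-update state
       const unsigned t0 = s0 * 0xdeece66du + 11u;
       const unsigned t1 = s1 * 0xdeece66du + 11u;
       s0 = t0 ^ (s1 >> 16);                   // mix in high bits of the old second stream
       s1 = t1;
       return out;
   }
*/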
uint_v(11)).data(), _mm_srli_epi32(state3.data(), 16))) .store(&Common::RandomState[uint_v::Size]); return AVX::concat(state0.data(), state1.data()); #endif } #ifdef Vc_IMPL_AVX2 template Vc_ALWAYS_INLINE AVX2::Vector Vector::Random() { return {_doRandomStep()}; } #endif template <> Vc_ALWAYS_INLINE AVX2::float_v AVX2::float_v::Random() { return HT::sub(Detail::or_(_cast(AVX::srli_epi32<2>(_doRandomStep())), HT::one()), HT::one()); } template<> Vc_ALWAYS_INLINE AVX2::double_v AVX2::double_v::Random() { const __m256i state = Detail::load(&Common::RandomState[0], Vc::Aligned, Detail::LoadTag<__m256i, int>()); for (size_t k = 0; k < 8; k += 2) { typedef unsigned long long uint64 Vc_MAY_ALIAS; const uint64 stateX = *reinterpret_cast(&Common::RandomState[k]); *reinterpret_cast(&Common::RandomState[k]) = (stateX * 0x5deece66dull + 11); } return HT::sub(Detail::or_(_cast(AVX::srli_epi64<12>(state)), HT::one()), HT::one()); } // }}}1 // shifted / rotated {{{1 template Vc_INTRINSIC AVX2::Vector Vector::shifted(int amount) const { return Detail::shifted(d.v(), amount); } template Vc_INTRINSIC Vc_CONST VectorType shifted_shortcut(VectorType left, VectorType right, Common::WidthT<__m128>) { return Mem::shuffle(left, right); } template Vc_INTRINSIC Vc_CONST VectorType shifted_shortcut(VectorType left, VectorType right, Common::WidthT<__m256>) { return Mem::shuffle128(left, right); } template Vc_INTRINSIC AVX2::Vector Vector::shifted(int amount, Vector shiftIn) const { #ifdef __GNUC__ if (__builtin_constant_p(amount)) { const __m256i a = AVX::avx_cast<__m256i>(d.v()); const __m256i b = AVX::avx_cast<__m256i>(shiftIn.d.v()); if (amount * 2 == int(Size)) { return shifted_shortcut(d.v(), shiftIn.d.v(), WidthT()); } if (amount * 2 == -int(Size)) { return shifted_shortcut(shiftIn.d.v(), d.v(), WidthT()); } switch (amount) { case 1: return AVX::avx_cast( #ifdef Vc_IMPL_AVX2 _mm256_alignr_epi8(_mm256_permute2x128_si256(a, b, 0x21), a, sizeof(EntryType)) #else // Vc_IMPL_AVX2 AVX::concat( _mm_alignr_epi8(AVX::hi128(a), AVX::lo128(a), sizeof(EntryType)), _mm_alignr_epi8(AVX::lo128(b), AVX::hi128(a), sizeof(EntryType))) #endif // Vc_IMPL_AVX2 ); case 2: return AVX::avx_cast( #ifdef Vc_IMPL_AVX2 _mm256_alignr_epi8(_mm256_permute2x128_si256(a, b, 0x21), a, 2 * sizeof(EntryType)) #else // Vc_IMPL_AVX2 AVX::concat( _mm_alignr_epi8(AVX::hi128(a), AVX::lo128(a), 2 * sizeof(EntryType)), _mm_alignr_epi8(AVX::lo128(b), AVX::hi128(a), 2 * sizeof(EntryType))) #endif // Vc_IMPL_AVX2 ); case 3: if (6u < Size) { return AVX::avx_cast( #ifdef Vc_IMPL_AVX2 _mm256_alignr_epi8(_mm256_permute2x128_si256(a, b, 0x21), a, 3 * sizeof(EntryType)) #else // Vc_IMPL_AVX2 AVX::concat(_mm_alignr_epi8(AVX::hi128(a), AVX::lo128(a), 3 * sizeof(EntryType)), _mm_alignr_epi8(AVX::lo128(b), AVX::hi128(a), 3 * sizeof(EntryType))) #endif // Vc_IMPL_AVX2 ); // TODO: } else { } } } #endif using Detail::operator|; return shifted(amount) | (amount > 0 ? 
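/* How float_v::Random() above turns 32 random bits into a value in [0, 1):
   shift right by 2 and OR with the bit pattern of 1.0f (0x3f800000); the
   random bits that land on exponent positions are already set in that pattern,
   so the result is a float in [1, 2) with a random mantissa, and subtracting
   1.0f maps it to [0, 1). Scalar model (hypothetical helper):

   #include <cstring>
   static inline float bits_to_unit_float(unsigned bits)
   {
       unsigned u = 0x3f800000u | (bits >> 2);  // exponent of 1.0f is preserved
       float f;
       std::memcpy(&f, &u, sizeof f);
       return f - 1.0f;                         // uniformly spaced values in [0.0f, 1.0f)
   }
*/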
shiftIn.shifted(amount - Size) : shiftIn.shifted(Size + amount)); } template Vc_INTRINSIC AVX2::Vector Vector::rotated(int amount) const { return Detail::rotated(d.v(), amount); } // sorted {{{1 template Vc_ALWAYS_INLINE Vc_PURE Vector Vector::sorted() const { return Detail::sorted(*this); } // interleaveLow/-High {{{1 template <> Vc_INTRINSIC AVX2::double_v AVX2::double_v::interleaveLow(AVX2::double_v x) const { return Mem::shuffle128(_mm256_unpacklo_pd(data(), x.data()), _mm256_unpackhi_pd(data(), x.data())); } template <> Vc_INTRINSIC AVX2::double_v AVX2::double_v::interleaveHigh(AVX2::double_v x) const { return Mem::shuffle128(_mm256_unpacklo_pd(data(), x.data()), _mm256_unpackhi_pd(data(), x.data())); } template <> Vc_INTRINSIC AVX2::float_v AVX2::float_v::interleaveLow(AVX2::float_v x) const { return Mem::shuffle128(_mm256_unpacklo_ps(data(), x.data()), _mm256_unpackhi_ps(data(), x.data())); } template <> Vc_INTRINSIC AVX2::float_v AVX2::float_v::interleaveHigh(AVX2::float_v x) const { return Mem::shuffle128(_mm256_unpacklo_ps(data(), x.data()), _mm256_unpackhi_ps(data(), x.data())); } #ifdef Vc_IMPL_AVX2 template <> Vc_INTRINSIC AVX2::int_v AVX2::int_v::interleaveLow ( AVX2::int_v x) const { return Mem::shuffle128(_mm256_unpacklo_epi32(data(), x.data()), _mm256_unpackhi_epi32(data(), x.data())); } template <> Vc_INTRINSIC AVX2::int_v AVX2::int_v::interleaveHigh( AVX2::int_v x) const { return Mem::shuffle128(_mm256_unpacklo_epi32(data(), x.data()), _mm256_unpackhi_epi32(data(), x.data())); } template <> Vc_INTRINSIC AVX2::uint_v AVX2::uint_v::interleaveLow ( AVX2::uint_v x) const { return Mem::shuffle128(_mm256_unpacklo_epi32(data(), x.data()), _mm256_unpackhi_epi32(data(), x.data())); } template <> Vc_INTRINSIC AVX2::uint_v AVX2::uint_v::interleaveHigh( AVX2::uint_v x) const { return Mem::shuffle128(_mm256_unpacklo_epi32(data(), x.data()), _mm256_unpackhi_epi32(data(), x.data())); } template <> Vc_INTRINSIC AVX2::short_v AVX2::short_v::interleaveLow ( AVX2::short_v x) const { return Mem::shuffle128(_mm256_unpacklo_epi16(data(), x.data()), _mm256_unpackhi_epi16(data(), x.data())); } template <> Vc_INTRINSIC AVX2::short_v AVX2::short_v::interleaveHigh( AVX2::short_v x) const { return Mem::shuffle128(_mm256_unpacklo_epi16(data(), x.data()), _mm256_unpackhi_epi16(data(), x.data())); } template <> Vc_INTRINSIC AVX2::ushort_v AVX2::ushort_v::interleaveLow (AVX2::ushort_v x) const { return Mem::shuffle128(_mm256_unpacklo_epi16(data(), x.data()), _mm256_unpackhi_epi16(data(), x.data())); } template <> Vc_INTRINSIC AVX2::ushort_v AVX2::ushort_v::interleaveHigh(AVX2::ushort_v x) const { return Mem::shuffle128(_mm256_unpacklo_epi16(data(), x.data()), _mm256_unpackhi_epi16(data(), x.data())); } #endif // permutation via operator[] {{{1 template <> Vc_INTRINSIC Vc_PURE AVX2::double_v AVX2::double_v::operator[](Permutation::ReversedTag) const { return Mem::permute128(Mem::permute(d.v())); } template <> Vc_INTRINSIC Vc_PURE AVX2::float_v AVX2::float_v::operator[](Permutation::ReversedTag) const { return Mem::permute128(Mem::permute(d.v())); } #ifdef Vc_IMPL_AVX2 template <> Vc_INTRINSIC Vc_PURE AVX2::int_v AVX2::int_v::operator[](Permutation::ReversedTag) const { return Mem::permute128(Mem::permute(d.v())); } template <> Vc_INTRINSIC Vc_PURE AVX2::uint_v AVX2::uint_v::operator[](Permutation::ReversedTag) const { return Mem::permute128(Mem::permute(d.v())); } template <> Vc_INTRINSIC Vc_PURE AVX2::short_v AVX2::short_v::operator[]( Permutation::ReversedTag) const { return 
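    // Reversal is composed from lane-local shuffles plus one cross-lane swap, since AVX
    // in-lane shuffles cannot move data across the 128-bit boundary in a single step. For
    // float_v above this is Mem::permute (reverse within each 128-bit lane) followed by
    // Mem::permute128 (swap the two lanes); schematically, with illustrative values:
    //   input                : [a0 a1 a2 a3 | a4 a5 a6 a7]
    //   reverse within lanes : [a3 a2 a1 a0 | a7 a6 a5 a4]
    //   swap 128-bit lanes   : [a7 a6 a5 a4 | a3 a2 a1 a0]
    // The 16-bit element cases (short_v/ushort_v) additionally need a permuteHi/permuteLo
    // pair per lane first, because each of those shuffles only rearranges one 64-bit half
    // of a lane.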
Mem::permute128(AVX::avx_cast<__m256i>(Mem::shuffle( AVX::avx_cast<__m256d>(Mem::permuteHi(d.v())), AVX::avx_cast<__m256d>(Mem::permuteLo(d.v()))))); } template <> Vc_INTRINSIC Vc_PURE AVX2::ushort_v AVX2::ushort_v::operator[]( Permutation::ReversedTag) const { return Mem::permute128(AVX::avx_cast<__m256i>(Mem::shuffle( AVX::avx_cast<__m256d>(Mem::permuteHi(d.v())), AVX::avx_cast<__m256d>(Mem::permuteLo(d.v()))))); } #endif template <> Vc_INTRINSIC AVX2::float_v Vector::operator[](const IndexType &/*perm*/) const { // TODO return *this; #ifdef Vc_IMPL_AVX2 #else /* const int_m cross128 = AVX::concat(_mm_cmpgt_epi32(AVX::lo128(perm.data()), _mm_set1_epi32(3)), _mm_cmplt_epi32(AVX::hi128(perm.data()), _mm_set1_epi32(4))); if (cross128.isNotEmpty()) { AVX2::float_v x = _mm256_permutevar_ps(d.v(), perm.data()); x(cross128) = _mm256_permutevar_ps(Mem::permute128(d.v()), perm.data()); return x; } else { */ #endif } // reversed {{{1 template Vc_INTRINSIC Vc_PURE Vector Vector::reversed() const { return (*this)[Permutation::Reversed]; } // broadcast from constexpr index {{{1 template <> template Vc_INTRINSIC AVX2::float_v AVX2::float_v::broadcast() const { constexpr VecPos Inner = static_cast(Index & 0x3); constexpr VecPos Outer = static_cast((Index & 0x4) / 4); return Mem::permute(Mem::permute128(d.v())); } template <> template Vc_INTRINSIC AVX2::double_v AVX2::double_v::broadcast() const { constexpr VecPos Inner = static_cast(Index & 0x1); constexpr VecPos Outer = static_cast((Index & 0x2) / 2); return Mem::permute(Mem::permute128(d.v())); } // }}}1 } // namespace Vc // vim: foldmethod=marker Vc-1.3.3/avx/vectorhelper.h000066400000000000000000000365661320703111200155450ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2009-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
}}}*/ #ifndef VC_AVX_VECTORHELPER_H_ #define VC_AVX_VECTORHELPER_H_ #include #include "types.h" #include "intrinsics.h" #include "casts.h" #include "../common/loadstoreflags.h" #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { namespace AVX { template<> struct VectorHelper<__m256> { typedef __m256 VectorType; typedef const VectorType VTArg; template static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, typename Flags::EnableIfAligned = nullptr) { _mm256_store_ps(mem, x); } template static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm256_storeu_ps(mem, x); } template static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, typename Flags::EnableIfStreaming = nullptr) { _mm256_stream_ps(mem, x); } template static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { AvxIntrinsics::stream_store(mem, x, setallone_ps()); } template static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, VTArg m, typename std::enable_if::type = nullptr) { _mm256_maskstore(mem, m, x); } template static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, VTArg m, typename std::enable_if< Flags::IsStreaming, void *>::type = nullptr) { AvxIntrinsics::stream_store(mem, x, m); } }; template<> struct VectorHelper<__m256d> { typedef __m256d VectorType; typedef const VectorType VTArg; template static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, typename Flags::EnableIfAligned = nullptr) { _mm256_store_pd(mem, x); } template static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm256_storeu_pd(mem, x); } template static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, typename Flags::EnableIfStreaming = nullptr) { _mm256_stream_pd(mem, x); } template static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { AvxIntrinsics::stream_store(mem, x, setallone_pd()); } template static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, VTArg m, typename std::enable_if::type = nullptr) { _mm256_maskstore(mem, m, x); } template static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, VTArg m, typename std::enable_if< Flags::IsStreaming, void *>::type = nullptr) { AvxIntrinsics::stream_store(mem, x, m); } }; template<> struct VectorHelper<__m256i> { typedef __m256i VectorType; typedef const VectorType VTArg; template static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfAligned = nullptr) { _mm256_store_si256(reinterpret_cast<__m256i *>(mem), x); } template static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm256_storeu_si256(reinterpret_cast<__m256i *>(mem), x); } template static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfStreaming = nullptr) { _mm256_stream_si256(reinterpret_cast<__m256i *>(mem), x); } template static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { AvxIntrinsics::stream_store(mem, x, setallone_si256()); } template static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, VTArg m, typename std::enable_if::type = nullptr) { _mm256_maskstore(mem, m, x); } template static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, VTArg m, typename std::enable_if< Flags::IsStreaming, void *>::type = nullptr) { AvxIntrinsics::stream_store(mem, x, m); } }; #define Vc_OP1(op) \ static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a) { 
return Vc_CAT2(_mm256_##op##_, Vc_SUFFIX)(a); } #define Vc_OP(op) \ static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return Vc_CAT2(op##_ , Vc_SUFFIX)(a, b); } #define Vc_OP_(op) \ static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return Vc_CAT2(_mm256_##op , Vc_SUFFIX)(a, b); } #define Vc_OPx(op, op2) \ static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return Vc_CAT2(_mm256_##op2##_, Vc_SUFFIX)(a, b); } template<> struct VectorHelper { typedef __m256d VectorType; typedef const VectorType VTArg; typedef double EntryType; #define Vc_SUFFIX pd static Vc_ALWAYS_INLINE VectorType notMaskedToZero(VTArg a, __m256 mask) { return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(_mm256_castps_pd(mask), a); } static Vc_ALWAYS_INLINE VectorType set(const double a) { return Vc_CAT2(_mm256_set1_, Vc_SUFFIX)(a); } static Vc_ALWAYS_INLINE VectorType set(const double a, const double b, const double c, const double d) { return Vc_CAT2(_mm256_set_, Vc_SUFFIX)(a, b, c, d); } static Vc_ALWAYS_INLINE VectorType zero() { return Vc_CAT2(_mm256_setzero_, Vc_SUFFIX)(); } static Vc_ALWAYS_INLINE VectorType one() { return Vc_CAT2(setone_, Vc_SUFFIX)(); }// set(1.); } static inline void fma(VectorType &v1, VTArg v2, VTArg v3) { #ifdef Vc_IMPL_FMA4 v1 = _mm256_macc_pd(v1, v2, v3); #else VectorType h1 = _mm256_and_pd(v1, _mm256_broadcast_sd(reinterpret_cast(&c_general::highMaskDouble))); VectorType h2 = _mm256_and_pd(v2, _mm256_broadcast_sd(reinterpret_cast(&c_general::highMaskDouble))); #if defined(Vc_GCC) && Vc_GCC < 0x40703 // GCC before 4.7.3 uses an incorrect optimization where it replaces the subtraction with an andnot // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=54703 asm("":"+x"(h1), "+x"(h2)); #endif const VectorType l1 = _mm256_sub_pd(v1, h1); const VectorType l2 = _mm256_sub_pd(v2, h2); const VectorType ll = mul(l1, l2); const VectorType lh = add(mul(l1, h2), mul(h1, l2)); const VectorType hh = mul(h1, h2); // ll < lh < hh for all entries is certain const VectorType lh_lt_v3 = cmplt_pd(abs(lh), abs(v3)); // |lh| < |v3| const VectorType b = _mm256_blendv_pd(v3, lh, lh_lt_v3); const VectorType c = _mm256_blendv_pd(lh, v3, lh_lt_v3); v1 = add(add(ll, b), add(c, hh)); #endif } static Vc_INTRINSIC VectorType Vc_CONST add(VTArg a, VTArg b) { return _mm256_add_pd(a,b); } static Vc_INTRINSIC VectorType Vc_CONST sub(VTArg a, VTArg b) { return _mm256_sub_pd(a,b); } static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm256_mul_pd(a,b); } Vc_OP1(sqrt) static Vc_ALWAYS_INLINE Vc_CONST VectorType rsqrt(VTArg x) { return _mm256_div_pd(one(), sqrt(x)); } static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VTArg x) { return _mm256_div_pd(one(), x); } static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(VTArg a) { return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(a, setabsmask_pd()); } static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return _mm256_min_pd(a, b); } static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return _mm256_max_pd(a, b); } static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VTArg a) { __m128d b = _mm_min_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1)); b = _mm_min_sd(b, _mm_unpackhi_pd(b, b)); return _mm_cvtsd_f64(b); } static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VTArg a) { __m128d b = _mm_max_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1)); b = _mm_max_sd(b, _mm_unpackhi_pd(b, b)); return _mm_cvtsd_f64(b); } static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VTArg a) { __m128d b = _mm_mul_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 
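    // Note on fma() above (the fallback taken when Vc_IMPL_FMA4 is unavailable): each
    // operand is split into a high part h, selected by c_general::highMaskDouble, and an
    // exact remainder l = v - h, so the partial products h1*h2, h1*l2, l1*h2 and l1*l2
    // carry (almost) no rounding error; accumulating them smallest-first then
    // approximates the fused v1*v2 + v3 far better than a plain multiply followed by an
    // add. Scalar sketch of the splitting step (bitwiseAnd is an illustrative helper,
    // not part of Vc):
    //
    //   double h1 = bitwiseAnd(v1, highMaskDouble);  // upper portion of the mantissa
    //   double l1 = v1 - h1;                         // exact remainder: the lower bits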
1)); b = _mm_mul_sd(b, _mm_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1))); return _mm_cvtsd_f64(b); } static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VTArg a) { __m128d b = _mm_add_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1)); b = _mm_hadd_pd(b, b); // or: b = _mm_add_sd(b, _mm256_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1))); return _mm_cvtsd_f64(b); } #undef Vc_SUFFIX static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) { return _mm256_round_pd(a, _MM_FROUND_NINT); } }; template<> struct VectorHelper { typedef float EntryType; typedef __m256 VectorType; typedef const VectorType VTArg; #define Vc_SUFFIX ps static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VTArg a, __m256 mask) { return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(mask, a); } static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a) { return Vc_CAT2(_mm256_set1_, Vc_SUFFIX)(a); } static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d, const float e, const float f, const float g, const float h) { return Vc_CAT2(_mm256_set_, Vc_SUFFIX)(a, b, c, d, e, f, g, h); } static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm256_setzero_, Vc_SUFFIX)(); } static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(setone_, Vc_SUFFIX)(); }// set(1.f); } static Vc_ALWAYS_INLINE Vc_CONST __m256 concat(__m256d a, __m256d b) { return _mm256_insertf128_ps(avx_cast<__m256>(_mm256_cvtpd_ps(a)), _mm256_cvtpd_ps(b), 1); } static inline void fma(VectorType &v1, VTArg v2, VTArg v3) { #ifdef Vc_IMPL_FMA4 v1 = _mm256_macc_ps(v1, v2, v3); #else __m256d v1_0 = _mm256_cvtps_pd(lo128(v1)); __m256d v1_1 = _mm256_cvtps_pd(hi128(v1)); __m256d v2_0 = _mm256_cvtps_pd(lo128(v2)); __m256d v2_1 = _mm256_cvtps_pd(hi128(v2)); __m256d v3_0 = _mm256_cvtps_pd(lo128(v3)); __m256d v3_1 = _mm256_cvtps_pd(hi128(v3)); v1 = AVX::concat( _mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_0, v2_0), v3_0)), _mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_1, v2_1), v3_1))); #endif } static Vc_INTRINSIC VectorType Vc_CONST add(VTArg a, VTArg b) { return _mm256_add_ps(a, b); } static Vc_INTRINSIC VectorType Vc_CONST sub(VTArg a, VTArg b) { return _mm256_sub_ps(a, b); } static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm256_mul_ps(a, b); } Vc_OP1(sqrt) Vc_OP1(rsqrt) static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VTArg x) { return _mm256_rcp_ps(x); } static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(VTArg a) { return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(a, setabsmask_ps()); } static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return _mm256_min_ps(a, b); } static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return _mm256_max_ps(a, b); } static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VTArg a) { __m128 b = _mm_min_ps(lo128(a), hi128(a)); b = _mm_min_ps(b, _mm_movehl_ps(b, b)); // b = min(a0, a2), min(a1, a3), min(a2, a2), min(a3, a3) b = _mm_min_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); // b = min(a0, a1), a1, a2, a3 return _mm_cvtss_f32(b); } static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VTArg a) { __m128 b = _mm_max_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1)); b = _mm_max_ps(b, _mm_movehl_ps(b, b)); // b = max(a0, a2), max(a1, a3), max(a2, a2), max(a3, a3) b = _mm_max_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); // b = max(a0, a1), a1, a2, a3 return _mm_cvtss_f32(b); } static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VTArg a) { __m128 b = _mm_mul_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1)); b = _mm_mul_ps(b, _mm_shuffle_ps(b, 
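    // All horizontal reductions in this helper follow the same pattern: fold the two
    // 128-bit halves of the 256-bit register into one __m128, then keep halving within
    // that register via shuffles until the result sits in lane 0. The scalar meaning of
    // this mul() reduction is simply the product of all eight lanes (up to the order in
    // which the partial products are associated):
    //
    //   float r = 1.f;                          // with float a[8] holding the lanes
    //   for (int i = 0; i < 8; ++i) r *= a[i];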
b, _MM_SHUFFLE(0, 1, 2, 3))); b = _mm_mul_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1))); return _mm_cvtss_f32(b); } static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VTArg a) { __m128 b = _mm_add_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1)); b = _mm_add_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3))); b = _mm_add_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1))); return _mm_cvtss_f32(b); } #undef Vc_SUFFIX static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) { return _mm256_round_ps(a, _MM_FROUND_NINT); } }; #undef Vc_OP1 #undef Vc_OP #undef Vc_OP_ #undef Vc_OPx } // namespace AVX(2) } // namespace Vc #endif // VC_AVX_VECTORHELPER_H_ Vc-1.3.3/changeVersion.sh000077500000000000000000000017041320703111200152100ustar00rootroot00000000000000#!/bin/bash cd "`dirname "$0"`" # Read version number eval `awk '/VC_VERSION_NUMBER 0x[0-9]+/ { h=$3 } END { major=strtonum(substr(h, 1, 4)) minor=strtonum("0x" substr(h, 5, 2)) patch=strtonum("0x" substr(h, 7, 2)) / 2 printf "oldVersion=\"%d.%d.%d\"\n", major, minor, patch printf "newVersion=\"%d.%d.%d\"\n", major, minor, patch + 1 }' include/Vc/version.h` echo "current version: $oldVersion" echo -n " new version: " read -e -i "$newVersion" newVersion versionString="$newVersion-dev" versionNumber=`echo $newVersion | awk '{ split($0, v, "."); printf "0x%02x%02x%02x", v[1], v[2], v[3] * 2 }'` versionNumber=`echo $versionNumber | awk '{ printf "0x%06x", (strtonum($0) + 1) }'` sed -i "s/^PROJECT_NUMBER = .*\$/PROJECT_NUMBER = $versionString/" doc/Doxyfile sed -i \ -e "s/VC_VERSION_STRING \".*\"\$/VC_VERSION_STRING \"$versionString\"/" \ -e "s/VC_VERSION_NUMBER 0x.*\$/VC_VERSION_NUMBER $versionNumber/" \ include/Vc/version.h Vc-1.3.3/cmake/000077500000000000000000000000001320703111200131345ustar00rootroot00000000000000Vc-1.3.3/cmake/AddCompilerFlag.cmake000066400000000000000000000157711320703111200171260ustar00rootroot00000000000000# - Add a given compiler flag to flags variables. # AddCompilerFlag( []) # or # AddCompilerFlag( [C_FLAGS ] [CXX_FLAGS ] [C_RESULT ] # [CXX_RESULT ]) #============================================================================= # Copyright 2010-2015 Matthias Kretz # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # # * Neither the names of contributing organizations nor the # names of its contributors may be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #============================================================================= get_filename_component(_currentDir "${CMAKE_CURRENT_LIST_FILE}" PATH) include("${_currentDir}/CheckCCompilerFlag.cmake") include("${_currentDir}/CheckCXXCompilerFlag.cmake") include("${_currentDir}/CheckMicCCompilerFlag.cmake") include("${_currentDir}/CheckMicCXXCompilerFlag.cmake") macro(AddCompilerFlag _flag) string(REGEX REPLACE "[-.+/:= ]" "_" _flag_esc "${_flag}") set(_c_flags "CMAKE_C_FLAGS") set(_cxx_flags "CMAKE_CXX_FLAGS") set(_mic_c_flags "CMAKE_MIC_C_FLAGS") set(_mic_cxx_flags "CMAKE_MIC_CXX_FLAGS") set(_c_result tmp) set(_cxx_result tmp) set(_mic_c_result) set(_mic_cxx_result) if(${ARGC} EQUAL 2) message(WARNING "Deprecated use of the AddCompilerFlag macro.") unset(_c_result) set(_cxx_result ${ARGV1}) elseif(${ARGC} GREATER 2) set(state 0) unset(_c_flags) unset(_cxx_flags) unset(_mic_c_flags) unset(_mic_cxx_flags) unset(_c_result) unset(_cxx_result) unset(_mic_c_result) unset(_mic_cxx_result) foreach(_arg ${ARGN}) if("x${_arg}" STREQUAL "xC_FLAGS") set(state 1) if(NOT DEFINED _c_result) set(_c_result tmp0) endif() elseif("x${_arg}" STREQUAL "xCXX_FLAGS") set(state 2) if(NOT DEFINED _cxx_result) set(_cxx_result tmp1) endif() elseif("x${_arg}" STREQUAL "xC_RESULT") set(state 3) elseif("x${_arg}" STREQUAL "xCXX_RESULT") set(state 4) elseif("x${_arg}" STREQUAL "xMIC_C_RESULT") set(state 5) elseif("x${_arg}" STREQUAL "xMIC_CXX_RESULT") set(state 6) elseif("x${_arg}" STREQUAL "xMIC_C_FLAGS") if(NOT DEFINED _mic_c_result) set(_mic_c_result tmp2) endif() set(state 7) elseif("x${_arg}" STREQUAL "xMIC_CXX_FLAGS") if(NOT DEFINED _mic_cxx_result) set(_mic_cxx_result tmp3) endif() set(state 8) elseif(state EQUAL 1) set(_c_flags "${_arg}") elseif(state EQUAL 2) set(_cxx_flags "${_arg}") elseif(state EQUAL 3) set(_c_result "${_arg}") elseif(state EQUAL 4) set(_cxx_result "${_arg}") elseif(state EQUAL 5) set(_mic_c_result "${_arg}") elseif(state EQUAL 6) set(_mic_cxx_result "${_arg}") elseif(state EQUAL 7) set(_mic_c_flags "${_arg}") elseif(state EQUAL 8) set(_mic_cxx_flags "${_arg}") else() message(FATAL_ERROR "Syntax error for AddCompilerFlag") endif() endforeach() endif() set(_c_code "int main() { return 0; }") set(_cxx_code "int main() { return 0; }") if("${_flag}" STREQUAL "-mfma") # Compiling with FMA3 support may fail only at the assembler level. 
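    # (for example, GCC may accept -mfma while an older system assembler still rejects
    # the vfmadd instructions the test then emits). A caller probing for FMA support
    # would typically invoke this macro as, e.g. (illustrative only):
    #   AddCompilerFlag("-mfma" CXX_FLAGS CMAKE_CXX_FLAGS CXX_RESULT USE_FMA)
    # so the flag is only appended to CMAKE_CXX_FLAGS when the check succeeds.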
# In that case we need to have such an instruction in the test code set(_c_code "#include __m128 foo(__m128 x) { return _mm_fmadd_ps(x, x, x); } int main() { return 0; }") set(_cxx_code "${_c_code}") elseif("${_flag}" STREQUAL "-stdlib=libc++") # Compiling with libc++ not only requires a compiler that understands it, but also # the libc++ headers itself set(_cxx_code "#include #include int main() { return 0; }") else() set(_cxx_code "#include int main() { return 0; }") endif() if(DEFINED _c_result) check_c_compiler_flag("${_flag}" check_c_compiler_flag_${_flag_esc} "${_c_code}") set(${_c_result} ${check_c_compiler_flag_${_flag_esc}}) endif() if(DEFINED _cxx_result) check_cxx_compiler_flag("${_flag}" check_cxx_compiler_flag_${_flag_esc} "${_cxx_code}") set(${_cxx_result} ${check_cxx_compiler_flag_${_flag_esc}}) endif() macro(my_append _list _flag _special) if("x${_list}" STREQUAL "x${_special}") set(${_list} "${${_list}} ${_flag}") else() list(APPEND ${_list} "${_flag}") endif() endmacro() if(check_c_compiler_flag_${_flag_esc} AND DEFINED _c_flags) my_append(${_c_flags} "${_flag}" CMAKE_C_FLAGS) endif() if(check_cxx_compiler_flag_${_flag_esc} AND DEFINED _cxx_flags) my_append(${_cxx_flags} "${_flag}" CMAKE_CXX_FLAGS) endif() if(MIC_NATIVE_FOUND) if(DEFINED _mic_c_result) check_mic_c_compiler_flag("${_flag}" check_mic_c_compiler_flag_${_flag_esc} "${_c_code}") set(${_mic_c_result} ${check_mic_c_compiler_flag_${_flag_esc}}) endif() if(DEFINED _mic_cxx_result) check_mic_cxx_compiler_flag("${_flag}" check_mic_cxx_compiler_flag_${_flag_esc} "${_cxx_code}") set(${_mic_cxx_result} ${check_mic_cxx_compiler_flag_${_flag_esc}}) endif() if(check_mic_c_compiler_flag_${_flag_esc} AND DEFINED _mic_c_flags) my_append(${_mic_c_flags} "${_flag}" CMAKE_MIC_C_FLAGS) endif() if(check_mic_cxx_compiler_flag_${_flag_esc} AND DEFINED _mic_cxx_flags) my_append(${_mic_cxx_flags} "${_flag}" CMAKE_MIC_CXX_FLAGS) endif() endif() endmacro(AddCompilerFlag) Vc-1.3.3/cmake/AddTargetProperty.cmake000066400000000000000000000040101320703111200175350ustar00rootroot00000000000000#============================================================================= # Copyright 2010-2015 Matthias Kretz # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # * Neither the names of contributing organizations nor the # names of its contributors may be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #============================================================================= macro(add_target_property _target _prop _value) get_target_property(_oldprop "${_target}" ${_prop}) if(NOT _oldprop) set_target_properties("${_target}" PROPERTIES ${_prop} "${_value}") else(NOT _oldprop) set_target_properties("${_target}" PROPERTIES ${_prop} "${_oldprop} ${_value}") endif(NOT _oldprop) endmacro(add_target_property) Vc-1.3.3/cmake/CheckCCompilerFlag.cmake000066400000000000000000000073701320703111200175520ustar00rootroot00000000000000# - Check whether the C compiler supports a given flag. # CHECK_C_COMPILER_FLAG( ) # - the compiler flag # - variable to store the result # This internally calls the check_c_source_compiles macro. # See help for CheckCSourceCompiles for a listing of variables # that can modify the build. #============================================================================= # Copyright 2006-2009 Kitware, Inc. # Copyright 2006 Alexander Neundorf # Copyright 2011-2013 Matthias Kretz # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # # * The names of Kitware, Inc., the Insight Consortium, or the names of # any consortium members, or of any contributors, may not be used to # endorse or promote products derived from this software without # specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
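# Example (illustrative): test whether the C compiler accepts -std=c11 and record the
# answer in HAVE_STD_C11; an optional third argument can replace the default
# "int main() { return 0; }" test program:
#   check_c_compiler_flag("-std=c11" HAVE_STD_C11)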
#============================================================================= INCLUDE(CheckCSourceCompiles) MACRO (CHECK_C_COMPILER_FLAG _FLAG _RESULT) SET(SAFE_CMAKE_REQUIRED_DEFINITIONS "${CMAKE_REQUIRED_DEFINITIONS}") SET(CMAKE_REQUIRED_DEFINITIONS "${_FLAG}") if(${ARGC} GREATER 2) SET(TEST_SOURCE "${ARGV2}") else() SET(TEST_SOURCE "int main() { return 0;}") endif() CHECK_C_SOURCE_COMPILES("${TEST_SOURCE}" ${_RESULT} # Some compilers do not fail with a bad flag FAIL_REGEX "error: bad value (.*) for .* switch" # GNU FAIL_REGEX "argument unused during compilation" # clang FAIL_REGEX "is valid for .* but not for C" # GNU FAIL_REGEX "unrecognized .*option" # GNU FAIL_REGEX "ignored for target" # GNU FAIL_REGEX "ignoring unknown option" # MSVC FAIL_REGEX "warning D9002" # MSVC FAIL_REGEX "[Uu]nknown option" # HP FAIL_REGEX "[Ww]arning: [Oo]ption" # SunPro FAIL_REGEX "command option .* is not recognized" # XL FAIL_REGEX "WARNING: unknown flag:" # Open64 FAIL_REGEX "command line error" # ICC FAIL_REGEX "command line warning" # ICC FAIL_REGEX "#10236:" # ICC: File not found FAIL_REGEX " #10159: " # ICC FAIL_REGEX " #10353: " # ICC: option '-mfma' ignored, suggest using '-march=core-avx2' ) SET (CMAKE_REQUIRED_DEFINITIONS "${SAFE_CMAKE_REQUIRED_DEFINITIONS}") ENDMACRO (CHECK_C_COMPILER_FLAG) Vc-1.3.3/cmake/CheckCXXCompilerFlag.cmake000066400000000000000000000074111320703111200200260ustar00rootroot00000000000000# - Check whether the CXX compiler supports a given flag. # CHECK_CXX_COMPILER_FLAG( ) # - the compiler flag # - variable to store the result # This internally calls the check_cxx_source_compiles macro. See help # for CheckCXXSourceCompiles for a listing of variables that can # modify the build. #============================================================================= # Copyright 2006-2009 Kitware, Inc. # Copyright 2006 Alexander Neundorf # Copyright 2011-2013 Matthias Kretz # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # # * The names of Kitware, Inc., the Insight Consortium, or the names of # any consortium members, or of any contributors, may not be used to # endorse or promote products derived from this software without # specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#============================================================================= INCLUDE(CheckCXXSourceCompiles) MACRO (CHECK_CXX_COMPILER_FLAG _FLAG _RESULT) SET(SAFE_CMAKE_REQUIRED_DEFINITIONS "${CMAKE_REQUIRED_DEFINITIONS}") SET(CMAKE_REQUIRED_DEFINITIONS "${_FLAG}") if(${ARGC} GREATER 2) SET(TEST_SOURCE "${ARGV2}") else() SET(TEST_SOURCE "int main() { return 0;}") endif() CHECK_CXX_SOURCE_COMPILES("${TEST_SOURCE}" ${_RESULT} # Some compilers do not fail with a bad flag FAIL_REGEX "error: bad value (.*) for .* switch" # GNU FAIL_REGEX "argument unused during compilation" # clang FAIL_REGEX "is valid for .* but not for C\\\\+\\\\+" # GNU FAIL_REGEX "unrecognized .*option" # GNU FAIL_REGEX "ignored for target" # GNU FAIL_REGEX "ignoring unknown option" # MSVC FAIL_REGEX "warning D9002" # MSVC FAIL_REGEX "[Uu]nknown option" # HP FAIL_REGEX "[Ww]arning: [Oo]ption" # SunPro FAIL_REGEX "command option .* is not recognized" # XL FAIL_REGEX "WARNING: unknown flag:" # Open64 FAIL_REGEX "command line error" # ICC FAIL_REGEX "command line warning" # ICC FAIL_REGEX "#10236:" # ICC: File not found FAIL_REGEX " #10159: " # ICC FAIL_REGEX " #10353: " # ICC: option '-mfma' ignored, suggest using '-march=core-avx2' ) SET (CMAKE_REQUIRED_DEFINITIONS "${SAFE_CMAKE_REQUIRED_DEFINITIONS}") ENDMACRO (CHECK_CXX_COMPILER_FLAG) Vc-1.3.3/cmake/CheckMicCCompilerFlag.cmake000066400000000000000000000113571320703111200202030ustar00rootroot00000000000000# - Check whether the MIC C compiler supports a given flag. # CHECK_MIC_C_COMPILER_FLAG( ) # - the compiler flag # - variable to store the result # This internally calls the check_c_source_compiles macro. See help # for CheckCSourceCompiles for a listing of variables that can # modify the build. #============================================================================= # Copyright 2006-2009 Kitware, Inc. # Copyright 2006 Alexander Neundorf # Copyright 2011-2013 Matthias Kretz # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # # * The names of Kitware, Inc., the Insight Consortium, or the names of # any consortium members, or of any contributors, may not be used to # endorse or promote products derived from this software without # specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
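# Unlike the host-side checks above, this macro does not go through try_compile(): it
# invokes the MIC cross compiler directly ("${MIC_CC} -mmic -c ... <flag> src.c") and
# matches its output against the usual unknown-option diagnostics. Example
# (illustrative): probe the -vec flag that FindMIC.cmake later puts into MIC_CFLAGS:
#   check_mic_c_compiler_flag("-vec" mic_has_vec)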
#============================================================================= macro(check_mic_c_compiler_flag _FLAG _RESULT) if(NOT DEFINED "${_RESULT}") set(_tmpdir "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp") if(${ARGC} GREATER 2) file(WRITE "${_tmpdir}/src.c" "${ARGV2}") else() file(WRITE "${_tmpdir}/src.c" "int main() { return 0; }") endif() execute_process( COMMAND "${MIC_CC}" -mmic -c -o "${_tmpdir}/src.o" "${_FLAG}" "${_tmpdir}/src.c" WORKING_DIRECTORY ${CMAKE_BINARY_DIR} RESULT_VARIABLE ${_RESULT} OUTPUT_VARIABLE OUTPUT ERROR_VARIABLE OUTPUT ) if(${_RESULT} EQUAL 0) foreach(_fail_regex "error: bad value (.*) for .* switch" # GNU "argument unused during compilation" # clang "is valid for .* but not for C" # GNU "unrecognized .*option" # GNU "ignored for target" # GNU "ignoring unknown option" # MSVC "[Uu]nknown option" # HP "[Ww]arning: [Oo]ption" # SunPro "command option .* is not recognized" # XL "WARNING: unknown flag:" # Open64 "command line error" # ICC "command line warning" # ICC "#10236:" # ICC: File not found ) if("${OUTPUT}" MATCHES "${_fail_regex}") set(${_RESULT} 1) endif() endforeach() endif() if(${_RESULT} EQUAL 0) set(${_RESULT} 1 CACHE INTERNAL "Test ${_FLAG}") message(STATUS "Performing Test Check MIC C Compiler flag ${_FLAG} - Success") file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeOutput.log "Performing MIC C Compiler Flag Test ${_FLAG} succeded with the following output:\n" "${OUTPUT}\n" "COMMAND: ${MIC_CC} -mmic -c -o ${_tmpdir}/src.o ${_FLAG} ${_tmpdir}/src.cpp\n" ) else() message(STATUS "Performing Test Check MIC C Compiler flag ${_FLAG} - Failed") set(${_RESULT} "" CACHE INTERNAL "Test ${_FLAG}") file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeError.log "Performing MIC C Compiler Flag Test ${_FLAG} failed with the following output:\n" "${OUTPUT}\n" "COMMAND: ${MIC_CC} -mmic -c -o ${_tmpdir}/src.o ${_FLAG} ${_tmpdir}/src.cpp\n" ) endif() endif() endmacro() Vc-1.3.3/cmake/CheckMicCXXCompilerFlag.cmake000066400000000000000000000114061320703111200204560ustar00rootroot00000000000000# - Check whether the MIC CXX compiler supports a given flag. # CHECK_MIC_CXX_COMPILER_FLAG( ) # - the compiler flag # - variable to store the result # This internally calls the check_cxx_source_compiles macro. See help # for CheckCXXSourceCompiles for a listing of variables that can # modify the build. #============================================================================= # Copyright 2006-2009 Kitware, Inc. # Copyright 2006 Alexander Neundorf # Copyright 2011-2013 Matthias Kretz # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # # * The names of Kitware, Inc., the Insight Consortium, or the names of # any consortium members, or of any contributors, may not be used to # endorse or promote products derived from this software without # specific prior written permission. 
# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #============================================================================= macro(check_mic_cxx_compiler_flag _FLAG _RESULT) if(NOT DEFINED "${_RESULT}") set(_tmpdir "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp") if(${ARGC} GREATER 2) file(WRITE "${_tmpdir}/src.cpp" "${ARGV2}") else() file(WRITE "${_tmpdir}/src.cpp" "int main() { return 0; }") endif() execute_process( COMMAND "${MIC_CXX}" -mmic -c -o "${_tmpdir}/src.o" "${_FLAG}" "${_tmpdir}/src.cpp" WORKING_DIRECTORY ${CMAKE_BINARY_DIR} RESULT_VARIABLE ${_RESULT} OUTPUT_VARIABLE OUTPUT ERROR_VARIABLE OUTPUT ) if(${_RESULT} EQUAL 0) foreach(_fail_regex "error: bad value (.*) for .* switch" # GNU "argument unused during compilation" # clang "is valid for .* but not for C\\\\+\\\\+" # GNU "unrecognized .*option" # GNU "ignored for target" # GNU "ignoring unknown option" # MSVC "[Uu]nknown option" # HP "[Ww]arning: [Oo]ption" # SunPro "command option .* is not recognized" # XL "WARNING: unknown flag:" # Open64 "command line error" # ICC "command line warning" # ICC "#10236:" # ICC: File not found ) if("${OUTPUT}" MATCHES "${_fail_regex}") set(${_RESULT} 1) endif() endforeach() endif() if(${_RESULT} EQUAL 0) set(${_RESULT} 1 CACHE INTERNAL "Test ${_FLAG}") message(STATUS "Performing Test Check MIC C++ Compiler flag ${_FLAG} - Success") file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeOutput.log "Performing MIC C++ Compiler Flag Test ${_FLAG} succeded with the following output:\n" "${OUTPUT}\n" "COMMAND: ${MIC_CXX} -mmic -c -o ${_tmpdir}/src.o ${_FLAG} ${_tmpdir}/src.cpp\n" ) else() message(STATUS "Performing Test Check MIC C++ Compiler flag ${_FLAG} - Failed") set(${_RESULT} "" CACHE INTERNAL "Test ${_FLAG}") file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeError.log "Performing MIC C++ Compiler Flag Test ${_FLAG} failed with the following output:\n" "${OUTPUT}\n" "COMMAND: ${MIC_CXX} -mmic -c -o ${_tmpdir}/src.o ${_FLAG} ${_tmpdir}/src.cpp\n" ) endif() endif() endmacro() Vc-1.3.3/cmake/FindMIC.cmake000066400000000000000000000443441320703111200153600ustar00rootroot00000000000000#============================================================================= # Copyright © 2010-2015 Matthias Kretz # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. 
# * Neither the names of contributing organizations nor the # names of its contributors may be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #============================================================================= # # This check will search for a MIC compiler and check whether the C and C++ # compilers are able to offload via offload pragma and target(mic) attribute. # The project may choose to either build native MIC binaries, or offload # binaries (hybrid code), or both. In the case where only native MIC binaries # are built, the compiler does not need to support offloading # # MIC_NATIVE_FOUND is true if native MIC binaries can be built # MIC_OFFLOAD_FOUND is true if hybrid host/MIC binaries via offload can be built # MIC_FOUND is true if either MIC_NATIVE_FOUND or MIC_OFFLOAD_FOUND is true # # When MIC_NATIVE_FOUND is true you can use the macros # mic_add_definitions # mic_include_directories # mic_set_link_libraries # mic_add_library # mic_add_executable # for building native libraries/executables # # When MIC_OFFLOAD_FOUND is true you use the standard cmake macros to build # libraries and executables but have to make sure manually that the necessary # offload compiler switches are present. 
You might want to add something like: # if(MIC_OFFLOAD_FOUND) # AddCompilerFlag("-offload-build") # AddCompilerFlag("-offload-copts=-vec-report=3 -H") # AddCompilerFlag("-offload-ldopts=-lmylib") # AddCompilerFlag("-opt-report-phase=offload") # endif() set(MIC_FOUND false) set(MIC_NATIVE_FOUND false) set(MIC_OFFLOAD_FOUND false) if(CMAKE_GENERATOR MATCHES "Makefile") option(ENABLE_MIC "Enable native builds for the MIC architecture (Intel Knights Corner)" ON) else() message(STATUS "MIC builds are only supported with a Makefile generator") set(ENABLE_MIC false) endif() if(ENABLE_MIC) file(GLOB _intel_dirs "/opt/intel/compilers_and_libraries_*/linux") if ("${_intel_dirs}" STREQUAL "") file(GLOB _intel_dirs "/opt/intel/composer_xe_*") endif() list(SORT _intel_dirs) list(REVERSE _intel_dirs) find_path(MIC_SDK_DIR bin/intel64_mic/icpc PATHS "$ENV{MIC_SDK_DIR}" ${_intel_dirs} ) mark_as_advanced(MIC_SDK_DIR) ############################################################################## # First check whether offload works # For now offload is not supported so skip it # if(NOT DEFINED c_compiler_can_offload OR NOT DEFINED cxx_compiler_can_offload) # set(c_compiler_can_offload FALSE) # set(cxx_compiler_can_offload FALSE) # # include(CheckCSourceCompiles) # include(CheckCXXSourceCompiles) # # #find_library(MIC_HOST_IMF_LIBRARY imf HINTS ENV LIBRARY_PATH) # #find_library(MIC_HOST_SVML_LIBRARY svml HINTS ENV LIBRARY_PATH) # #find_library(MIC_HOST_INTLC_LIBRARY intlc HINTS ENV LIBRARY_PATH) # # #set(MIC_HOST_LIBS ${MIC_HOST_IMF_LIBRARY} ${MIC_HOST_SVML_LIBRARY} ${MIC_HOST_INTLC_LIBRARY}) # # set(_mic_offload_test_source " ##ifdef __MIC__ ##include ##endif #__attribute__((target(mic))) void test() #{ ##ifdef __MIC__ # __m512 v = _mm512_setzero_ps(); # (void)v; ##endif #} # #int main() #{ ##pragma offload target(mic) # test(); # return 0; #} #") # set(CMAKE_REQUIRED_FLAGS "-offload-build") # check_c_source_compiles("${_mic_offload_test_source}" c_compiler_can_offload) # check_cxx_source_compiles("${_mic_offload_test_source}" cxx_compiler_can_offload) # set(CMAKE_REQUIRED_FLAGS) # endif() # # if(c_compiler_can_offload AND cxx_compiler_can_offload) # message(STATUS "C/C++ Compiler can offload to MIC.") # set(MIC_OFFLOAD_FOUND true) # else() # message(STATUS "C/C++ Compiler can NOT offload to MIC.") # endif() ############################################################################## # Next check whether everything required for native builds is available find_path(MIC_TARGET_TOOLS_DIR bin/x86_64-k1om-linux-ar HINTS "$ENV{MIC_TARGET_TOOLS_DIR}" "${MIC_SDK_DIR}/target" "/usr/linux-k1om-4.7" ) find_program(MIC_AR x86_64-k1om-linux-ar PATHS "${MIC_TARGET_TOOLS_DIR}/bin") find_program(MIC_RANLIB x86_64-k1om-linux-ranlib PATHS "${MIC_TARGET_TOOLS_DIR}/bin") find_program(MIC_OBJCOPY x86_64-k1om-linux-objcopy PATHS "${MIC_TARGET_TOOLS_DIR}/bin") find_program(MIC_NATIVELOAD micnativeloadex PATHS ENV PATH) mark_as_advanced(MIC_TARGET_TOOLS_DIR MIC_AR MIC_RANLIB MIC_NATIVELOAD MIC_OBJCOPY) if(MIC_SDK_DIR AND MIC_AR AND MIC_RANLIB) find_program(MIC_CC icc HINTS "${MIC_SDK_DIR}/bin" "${MIC_SDK_DIR}/bin/intel64") find_program(MIC_CXX icpc HINTS "${MIC_SDK_DIR}/bin" "${MIC_SDK_DIR}/bin/intel64") find_library(MIC_IMF_LIBRARY imf HINTS "${MIC_SDK_DIR}/compiler/lib/mic") find_library(MIC_SVML_LIBRARY svml HINTS "${MIC_SDK_DIR}/compiler/lib/mic") find_library(MIC_INTLC_LIBRARY intlc HINTS "${MIC_SDK_DIR}/compiler/lib/mic") mark_as_advanced(MIC_CC MIC_CXX MIC_IMF_LIBRARY MIC_SVML_LIBRARY MIC_INTLC_LIBRARY) set(MIC_LIBS 
${MIC_IMF_LIBRARY} ${MIC_SVML_LIBRARY} ${MIC_INTLC_LIBRARY}) set(MIC_CFLAGS "-O2 -vec") exec_program(${MIC_CXX} ARGS -V OUTPUT_VARIABLE _mic_icc_version_string RETURN_VALUE _mic_icc_ok) if(0 EQUAL _mic_icc_ok) string(REGEX MATCH "Version (Mainline)?[0-9. a-zA-Z]+" Vc_MIC_ICC_VERSION "${_mic_icc_version_string}") string(SUBSTRING "${Vc_MIC_ICC_VERSION}" 8 -1 Vc_MIC_ICC_VERSION) message(STATUS "MIC ICC Version: \"${Vc_MIC_ICC_VERSION}\"") if(MIC_CC AND MIC_CXX AND MIC_IMF_LIBRARY AND MIC_SVML_LIBRARY AND MIC_INTLC_LIBRARY) set(MIC_NATIVE_FOUND true) endif() else() message(STATUS "MIC ICC found, but not usable.") endif() endif() endif(ENABLE_MIC) if(MIC_NATIVE_FOUND OR MIC_OFFLOAD_FOUND) set(MIC_FOUND true) list(APPEND CMAKE_MIC_CXX_FLAGS "-diag-disable 2338") # this switch statement does not have a default clause list(APPEND CMAKE_MIC_CXX_FLAGS "-diag-disable 193") # zero used for undefined preprocessing identifier "Vc_GCC" list(APPEND CMAKE_MIC_CXX_FLAGS "-diag-disable 61") # warning #61: integer operation result is out of range list(APPEND CMAKE_MIC_CXX_FLAGS "-diag-disable 173") # warning #173: floating-point value does not fit in required integral type list(APPEND CMAKE_MIC_CXX_FLAGS "-diag-disable 264") # warning #264: floating-point value does not fit in required floating-point type list(APPEND CMAKE_MIC_CXX_FLAGS "-fp-model source") # fix IEEE FP comliance set(Vc_MIC_CXX_FLAGS "") macro(mic_add_definitions) add_definitions(${ARGN}) foreach(_def ${ARGN}) set(_mic_cflags ${_mic_cflags} "${_def}") endforeach() endmacro() macro(mic_include_directories) foreach(_dir ${ARGN}) set(_mic_cflags ${_mic_cflags} "-I${_dir}") endforeach() include_directories(${ARGN}) endmacro() if(NOT DEFINED MIC_C_FLAGS) set(MIC_C_FLAGS) endif() if(NOT DEFINED MIC_CXX_FLAGS) set(MIC_CXX_FLAGS) endif() else() message(STATUS "MIC SDK was not found!") endif() if(MIC_NATIVE_FOUND) macro(_mic_add_object _target _source _output) get_property(_deps SOURCE "${_source}" PROPERTY OBJECT_DEPENDS) get_filename_component(_abs "${_source}" ABSOLUTE) get_filename_component(_ext "${_source}" EXT) get_filename_component(_tmp "${_source}" NAME_WE) set(${_output} "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${_target}.dir/${_tmp}${_ext}.mic.o") set(_lang CXX) set(_compiler "${MIC_CXX}") if(_ext STREQUAL "c") set(_lang C) set(_compiler "${MIC_CC}") endif() string(TOUPPER "${CMAKE_BUILD_TYPE}" _tmp) string(STRIP "${CMAKE_MIC_${_lang}_FLAGS} ${CMAKE_${_lang}_FLAGS_${_tmp}} ${_mic_cflags} ${Vc_MIC_CXX_FLAGS}" _flags) string(REPLACE " " ";" _flags "${_flags} ${ARGN}") get_directory_property(_inc INCLUDE_DIRECTORIES) foreach(_i ${_inc}) list(APPEND _flags "-I${_i}") endforeach() get_property(_launch_rule GLOBAL PROPERTY RULE_LAUNCH_COMPILE) string(REPLACE "\"" "" _launch_rule "${_launch_rule}") string(REPLACE " " ";" _launch_rule "${_launch_rule}") string(REPLACE "" "${_target}" _launch_rule "${_launch_rule}") string(REPLACE "" "${CMAKE_CURRENT_BINARY_DIR}" _launch_rule "${_launch_rule}") string(REPLACE "" "${${_output}}" _launch_rule "${_launch_rule}") string(REPLACE "" "${_abs}" _launch_rule "${_launch_rule}") string(REPLACE "" "C++" _launch_rule "${_launch_rule}") add_custom_command(OUTPUT "${${_output}}" COMMAND ${_launch_rule} "${_compiler}" -mmic -DVc_IMPL=MIC ${_flags} -c -o "${${_output}}" "${_abs}" DEPENDS "${_abs}" ${_deps} IMPLICIT_DEPENDS ${_lang} "${_abs}" WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} COMMENT "Compiling (MIC) ${${_output}}" VERBATIM ) endmacro() macro(mic_set_link_libraries) set(_mic_lflags) foreach(_lib ${ARGN}) 
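        # The native-MIC helper macros in this block are typically used together, e.g.
        # (illustrative only):
        #   mic_add_library(mymath_mic STATIC SOURCES math.cpp)
        #   mic_add_executable(mytest_mic SOURCES test.cpp LINK_LIBRARIES mymath_mic)
        # mic_set_link_libraries() collects additional -L/-l flags that every MIC
        # executable linked afterwards will receive.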
get_filename_component(_lpath "${_lib}" PATH) get_filename_component(_lname "${_lib}" NAME) set(_mic_lflags ${_mic_lflags} "-L${_lpath}" "-l${_lname}") endforeach() endmacro() macro(mic_add_library _target) set(_state 0) if(BUILD_SHARED_LIBS) set(_type SHARED) else() set(_type STATIC) endif() set(_all ALL) set(_srcs) set(_cflags) set(_libs) foreach(_arg ${ARGN}) if(_arg MATCHES "^(STATIC|SHARED|MODULE)$") set(_type ${_arg}) elseif(_arg STREQUAL "EXCLUDE_FROM_ALL") set(_all) elseif(_arg STREQUAL "COMPILE_FLAGS" OR _arg STREQUAL "COMPILE_OPTIONS") set(_state 1) elseif(_arg STREQUAL "LINK_LIBRARIES") set(_state 2) elseif(_arg STREQUAL "SOURCES") set(_state 0) elseif(_state EQUAL 0) # SOURCES set(_srcs ${_srcs} "${_arg}") elseif(_state EQUAL 1) # COMPILE_FLAGS list(APPEND _cflags ${_arg}) elseif(_state EQUAL 2) # LINK_LIBRARIES get_filename_component(_lpath "${_arg}" PATH) get_filename_component(_lname "${_arg}" NAME) set(_libs ${_libs} "-L${_lpath}" "-l${_lname}") endif() endforeach() set(_objects) set(_objectsStr) foreach(_src ${_srcs}) _mic_add_object("${_target}" "${_src}" _obj ${_cflags}) list(APPEND _objects "${_obj}") set(_objectsStr "${_objectsStr} \"${_obj}\"") endforeach() set(_outdir "${CMAKE_CURRENT_BINARY_DIR}/x86_64-k1om-linux") file(MAKE_DIRECTORY "${_outdir}") #TODO: handle STATIC/SHARED/MODULE differently set(_output "lib${_target}.a") set(_linkscript "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${_target}.dir/link.txt") set(_cleanscript "CMakeFiles/${_target}.dir/cmake_clean_target.cmake") file(WRITE "${_linkscript}" "${MIC_AR} cr ${_output} ${_objectsStr} ${MIC_RANLIB} ${_output} ") file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/${_cleanscript}" "FILE(REMOVE_RECURSE \"${_output}\") ") add_custom_command(OUTPUT "${_outdir}/${_output}" COMMAND "${CMAKE_COMMAND}" -E cmake_link_script "${_linkscript}" --verbose=$(VERBOSE) DEPENDS ${_objects} WORKING_DIRECTORY "${_outdir}" COMMENT "Linking (MIC) ${_output}" VERBATIM ) add_custom_target("${_target}" ${_all} DEPENDS "${_outdir}/${_output}" COMMENT "" SOURCES ${_srcs} ) set_target_properties("${_target}" PROPERTIES OUTPUT_NAME "${_outdir}/${_output}" ) endmacro() macro(mic_add_executable _target) set(_state 0) set(_all ALL) set(_srcs) set(_cflags) set(_libs) set(_libTargets) set(_dump_asm false) set(_exec_output_name "${_target}") set(_objects) set(_objectsStr) foreach(_arg ${ARGN}) if(_arg STREQUAL "EXCLUDE_FROM_ALL") set(_all) elseif(_arg STREQUAL "COMPILE_FLAGS") set(_state 1) elseif(_arg STREQUAL "LINK_LIBRARIES") set(_state 2) elseif(_arg STREQUAL "OUTPUT_NAME") set(_state 3) elseif(_arg STREQUAL "SOURCES") set(_state 0) elseif(_arg STREQUAL "OBJECTS") set(_state 4) elseif(_arg STREQUAL "DUMP_ASM") set(_dump_asm true) elseif(_state EQUAL 0) # SOURCES set(_srcs ${_srcs} "${_arg}") elseif(_state EQUAL 1) # COMPILE_FLAGS set(_cflags ${_cflags} "${_arg}") elseif(_state EQUAL 2) # LINK_LIBRARIES if(TARGET ${_arg}) get_target_property(_tmp "${_arg}" OUTPUT_NAME) if(_tmp) set(_libs ${_libs} "${_tmp}") set(_libTargets ${_libTargets} "${_tmp}" "${_arg}") else() set(_libs ${_libs} "${_arg}") if(EXISTS "${_arg}") set(_libTargets ${_libTargets} "${_arg}") endif() endif() else() set(_libs ${_libs} "${_arg}") if(EXISTS "${_arg}") set(_libTargets ${_libTargets} "${_arg}") endif() endif() elseif(_state EQUAL 3) # OUTPUT_NAME set(_exec_output_name "${_arg}") elseif(_state EQUAL 4) # OBJECTS set(_objects ${_objects} "${_arg}") set(_objectsStr "${_objectsStr} \"${_arg}\"") endif() endforeach() foreach(_src ${_srcs}) _mic_add_object("${_target}" "${_src}" 
_obj ${_cflags}) set(_objects ${_objects} "${_obj}") set(_objectsStr "${_objectsStr} \"${_obj}\"") endforeach() set(_exec_output "${CMAKE_CURRENT_BINARY_DIR}/${_exec_output_name}") get_property(_launch_rule GLOBAL PROPERTY RULE_LAUNCH_LINK) string(REPLACE "\"" "" _launch_rule "${_launch_rule}") string(REPLACE " " ";" _launch_rule "${_launch_rule}") string(REPLACE "" "${_target}" _launch_rule "${_launch_rule}") string(REPLACE "" "${CMAKE_CURRENT_BINARY_DIR}" _launch_rule "${_launch_rule}") add_custom_command(OUTPUT "${_exec_output}" COMMAND ${_launch_rule} "${MIC_CXX}" -mmic "-L${MIC_SDK_DIR}/compiler/lib/mic/" ${_mic_lflags} ${_objects} -o "${_exec_output}" ${_libs} DEPENDS ${_objects} ${_libTargets} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} COMMENT "Linking (MIC) ${_exec_output}" VERBATIM ) set(_dump_asm_output) if(_dump_asm) foreach(_src ${_srcs}) get_filename_component(_abs "${_src}" ABSOLUTE) get_filename_component(_name "${_src}" NAME) add_custom_command(OUTPUT "${_name}.s" COMMAND "${MIC_CXX}" -mmic -DVc_IMPL=MIC ${_mic_cflags} ${_cflags} ${Vc_MIC_CXX_FLAGS} ${_abs} -S -fsource-asm -fno-verbose-asm -o "${_name}.x" COMMAND sh -c "grep -v ___tag_value '${_name}.x' | c++filt > '${_name}.s'" COMMAND rm "${_name}.x" DEPENDS ${_abs} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} COMMENT "Creating MIC Assembly ${_name}.s" VERBATIM ) set(_dump_asm_output ${_dump_asm_output} "${CMAKE_CURRENT_BINARY_DIR}/${_name}.s") endforeach() endif() add_custom_target("${_target}" ${_all} DEPENDS "${_exec_output}" ${_dump_asm_output} COMMENT "" SOURCES ${_srcs} ) set_target_properties("${_target}" PROPERTIES OUTPUT_NAME "${_exec_output_name}") endmacro() endif() if(MIC_OFFLOAD_FOUND) macro(mic_offload _target) set(_mic_debug) if(CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") set(_mic_debug "-g") endif() add_target_property(${_target} COMPILE_FLAGS "-offload-build -DCAN_OFFLOAD ${_mic_debug}") set(_offload_ldflags "${_mic_debug}") set(_libTargets) foreach(_lib ${ARGN}) get_target_property(_tmp "${_lib}" OUTPUT_NAME) if(_tmp) set(_offload_ldflags "${_offload_ldflags} ${_tmp}") set(_libTargets ${_libTargets} "${_arg}") else() get_filename_component(_lpath "${_arg}" PATH) get_filename_component(_lname "${_arg}" NAME) set(_offload_ldflags "${_offload_ldflags} -L${_lpath} -l${_lname}") endif() endforeach() add_target_property(${_target} LINK_FLAGS "-offload-build -offload-ldopts=\"${_offload_ldflags}\" ${_mic_debug}") if(_libTargets) add_dependencies(${_target} ${_libTargets}) endif() endmacro() endif() Vc-1.3.3/cmake/FindSSE.cmake000066400000000000000000000104071320703111200153730ustar00rootroot00000000000000# Check if SSE instructions are available on the machine where # the project is compiled. 
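# The results are exposed as the cache variables SSE2_FOUND, SSE3_FOUND, SSSE3_FOUND
# and SSE4_1_FOUND, so a project can do, for example (illustrative):
#   include(FindSSE)
#   if(SSE4_1_FOUND)
#     add_definitions(-DHAVE_SSE4_1)
#   endif()
# Note that this inspects the build *host* (via /proc/cpuinfo or sysctl), not the
# compilation target.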
IF(CMAKE_SYSTEM_NAME MATCHES "Linux") EXEC_PROGRAM(cat ARGS "/proc/cpuinfo" OUTPUT_VARIABLE CPUINFO) STRING(REGEX REPLACE "^.*(sse2).*$" "\\1" SSE_THERE ${CPUINFO}) STRING(COMPARE EQUAL "sse2" "${SSE_THERE}" SSE2_TRUE) IF (SSE2_TRUE) set(SSE2_FOUND true CACHE BOOL "SSE2 available on host") ELSE (SSE2_TRUE) set(SSE2_FOUND false CACHE BOOL "SSE2 available on host") ENDIF (SSE2_TRUE) # /proc/cpuinfo apparently omits sse3 :( STRING(REGEX REPLACE "^.*[^s](sse3).*$" "\\1" SSE_THERE ${CPUINFO}) STRING(COMPARE EQUAL "sse3" "${SSE_THERE}" SSE3_TRUE) IF (NOT SSE3_TRUE) STRING(REGEX REPLACE "^.*(T2300).*$" "\\1" SSE_THERE ${CPUINFO}) STRING(COMPARE EQUAL "T2300" "${SSE_THERE}" SSE3_TRUE) ENDIF (NOT SSE3_TRUE) STRING(REGEX REPLACE "^.*(ssse3).*$" "\\1" SSE_THERE ${CPUINFO}) STRING(COMPARE EQUAL "ssse3" "${SSE_THERE}" SSSE3_TRUE) IF (SSE3_TRUE OR SSSE3_TRUE) set(SSE3_FOUND true CACHE BOOL "SSE3 available on host") ELSE (SSE3_TRUE OR SSSE3_TRUE) set(SSE3_FOUND false CACHE BOOL "SSE3 available on host") ENDIF (SSE3_TRUE OR SSSE3_TRUE) IF (SSSE3_TRUE) set(SSSE3_FOUND true CACHE BOOL "SSSE3 available on host") ELSE (SSSE3_TRUE) set(SSSE3_FOUND false CACHE BOOL "SSSE3 available on host") ENDIF (SSSE3_TRUE) STRING(REGEX REPLACE "^.*(sse4_1).*$" "\\1" SSE_THERE ${CPUINFO}) STRING(COMPARE EQUAL "sse4_1" "${SSE_THERE}" SSE41_TRUE) IF (SSE41_TRUE) set(SSE4_1_FOUND true CACHE BOOL "SSE4.1 available on host") ELSE (SSE41_TRUE) set(SSE4_1_FOUND false CACHE BOOL "SSE4.1 available on host") ENDIF (SSE41_TRUE) ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Darwin") EXEC_PROGRAM("/usr/sbin/sysctl -n machdep.cpu.features" OUTPUT_VARIABLE CPUINFO) STRING(REGEX REPLACE "^.*[^S](SSE2).*$" "\\1" SSE_THERE ${CPUINFO}) STRING(COMPARE EQUAL "SSE2" "${SSE_THERE}" SSE2_TRUE) IF (SSE2_TRUE) set(SSE2_FOUND true CACHE BOOL "SSE2 available on host") ELSE (SSE2_TRUE) set(SSE2_FOUND false CACHE BOOL "SSE2 available on host") ENDIF (SSE2_TRUE) STRING(REGEX REPLACE "^.*[^S](SSE3).*$" "\\1" SSE_THERE ${CPUINFO}) STRING(COMPARE EQUAL "SSE3" "${SSE_THERE}" SSE3_TRUE) IF (SSE3_TRUE) set(SSE3_FOUND true CACHE BOOL "SSE3 available on host") ELSE (SSE3_TRUE) set(SSE3_FOUND false CACHE BOOL "SSE3 available on host") ENDIF (SSE3_TRUE) STRING(REGEX REPLACE "^.*(SSSE3).*$" "\\1" SSE_THERE ${CPUINFO}) STRING(COMPARE EQUAL "SSSE3" "${SSE_THERE}" SSSE3_TRUE) IF (SSSE3_TRUE) set(SSSE3_FOUND true CACHE BOOL "SSSE3 available on host") ELSE (SSSE3_TRUE) set(SSSE3_FOUND false CACHE BOOL "SSSE3 available on host") ENDIF (SSSE3_TRUE) STRING(REGEX REPLACE "^.*(SSE4.1).*$" "\\1" SSE_THERE ${CPUINFO}) STRING(COMPARE EQUAL "SSE4.1" "${SSE_THERE}" SSE41_TRUE) IF (SSE41_TRUE) set(SSE4_1_FOUND true CACHE BOOL "SSE4.1 available on host") ELSE (SSE41_TRUE) set(SSE4_1_FOUND false CACHE BOOL "SSE4.1 available on host") ENDIF (SSE41_TRUE) ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Windows") # TODO set(SSE2_FOUND true CACHE BOOL "SSE2 available on host") set(SSE3_FOUND false CACHE BOOL "SSE3 available on host") set(SSSE3_FOUND false CACHE BOOL "SSSE3 available on host") set(SSE4_1_FOUND false CACHE BOOL "SSE4.1 available on host") ELSE(CMAKE_SYSTEM_NAME MATCHES "Linux") set(SSE2_FOUND true CACHE BOOL "SSE2 available on host") set(SSE3_FOUND false CACHE BOOL "SSE3 available on host") set(SSSE3_FOUND false CACHE BOOL "SSSE3 available on host") set(SSE4_1_FOUND false CACHE BOOL "SSE4.1 available on host") ENDIF(CMAKE_SYSTEM_NAME MATCHES "Linux") if(NOT SSE2_FOUND) MESSAGE(STATUS "Could not find hardware support for SSE2 on this machine.") endif(NOT SSE2_FOUND) if(NOT SSE3_FOUND) MESSAGE(STATUS 
"Could not find hardware support for SSE3 on this machine.") endif(NOT SSE3_FOUND) if(NOT SSSE3_FOUND) MESSAGE(STATUS "Could not find hardware support for SSSE3 on this machine.") endif(NOT SSSE3_FOUND) if(NOT SSE4_1_FOUND) MESSAGE(STATUS "Could not find hardware support for SSE4.1 on this machine.") endif(NOT SSE4_1_FOUND) mark_as_advanced(SSE2_FOUND SSE3_FOUND SSSE3_FOUND SSE4_1_FOUND) Vc-1.3.3/cmake/FindVc.cmake000066400000000000000000000053251320703111200153140ustar00rootroot00000000000000# Locate the Vc template library. Vc can be found at https://github.com/VcDevel/Vc # # This file is meant to be copied into projects that want to use Vc. It will # search for VcConfig.cmake, which ships with Vc and will provide up-to-date # buildsystem changes. Thus there should not be any need to update FindVc.cmake # again after you integrated it into your project. # # This module defines the following variables: # Vc_FOUND # Vc_INCLUDE_DIR # Vc_LIBRARIES # Vc_DEFINITIONS # Vc_COMPILE_FLAGS # Vc_ARCHITECTURE_FLAGS # Vc_ALL_FLAGS (the union of the above three variables) # Vc_VERSION_MAJOR # Vc_VERSION_MINOR # Vc_VERSION_PATCH # Vc_VERSION # Vc_VERSION_STRING # Vc_INSTALL_DIR # Vc_LIB_DIR # Vc_CMAKE_MODULES_DIR # # The following two variables are set according to the compiler used. Feel free # to use them to skip whole compilation units. # Vc_SSE_INTRINSICS_BROKEN # Vc_AVX_INTRINSICS_BROKEN # #============================================================================= # Copyright 2009-2015 Matthias Kretz # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # * Neither the names of contributing organizations nor the # names of its contributors may be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #============================================================================= find_package(Vc ${Vc_FIND_VERSION} QUIET NO_MODULE PATHS $ENV{HOME} /opt/Vc) include(FindPackageHandleStandardArgs) find_package_handle_standard_args(Vc CONFIG_MODE) Vc-1.3.3/cmake/OptimizeForArchitecture.cmake000066400000000000000000000702741320703111200207620ustar00rootroot00000000000000# Determine the host CPU feature set and determine the best set of compiler # flags to enable all supported SIMD relevant features. 
Alternatively, the # target CPU can be explicitly selected (for generating more generic binaries # or for targeting a different system). # Compilers provide e.g. the -march=native flag to achieve a similar result. # This fails to address the need for building for a different microarchitecture # than the current host. # The script tries to deduce all settings from the model and family numbers of # the CPU instead of reading the CPUID flags from e.g. /proc/cpuinfo. This makes # the detection more independent from the CPUID code in the kernel (e.g. avx2 is # not listed on older kernels). # # Usage: # OptimizeForArchitecture() # If either of Vc_SSE_INTRINSICS_BROKEN, Vc_AVX_INTRINSICS_BROKEN, # Vc_AVX2_INTRINSICS_BROKEN is defined and set, the OptimizeForArchitecture # macro will consequently disable the relevant features via compiler flags. #============================================================================= # Copyright 2010-2016 Matthias Kretz # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # * Neither the names of contributing organizations nor the # names of its contributors may be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#============================================================================= get_filename_component(_currentDir "${CMAKE_CURRENT_LIST_FILE}" PATH) include("${_currentDir}/AddCompilerFlag.cmake") include(CheckIncludeFileCXX) macro(_my_find _list _value _ret) list(FIND ${_list} "${_value}" _found) if(_found EQUAL -1) set(${_ret} FALSE) else(_found EQUAL -1) set(${_ret} TRUE) endif(_found EQUAL -1) endmacro(_my_find) macro(AutodetectHostArchitecture) set(TARGET_ARCHITECTURE "generic") set(Vc_ARCHITECTURE_FLAGS) set(_vendor_id) set(_cpu_family) set(_cpu_model) if(CMAKE_SYSTEM_NAME STREQUAL "Linux") file(READ "/proc/cpuinfo" _cpuinfo) string(REGEX REPLACE ".*vendor_id[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _vendor_id "${_cpuinfo}") string(REGEX REPLACE ".*cpu family[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_family "${_cpuinfo}") string(REGEX REPLACE ".*model[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_model "${_cpuinfo}") string(REGEX REPLACE ".*flags[ \t]*:[ \t]+([^\n]+).*" "\\1" _cpu_flags "${_cpuinfo}") elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") exec_program("/usr/sbin/sysctl -n machdep.cpu.vendor" OUTPUT_VARIABLE _vendor_id) exec_program("/usr/sbin/sysctl -n machdep.cpu.model" OUTPUT_VARIABLE _cpu_model) exec_program("/usr/sbin/sysctl -n machdep.cpu.family" OUTPUT_VARIABLE _cpu_family) exec_program("/usr/sbin/sysctl -n machdep.cpu.features" OUTPUT_VARIABLE _cpu_flags) string(TOLOWER "${_cpu_flags}" _cpu_flags) string(REPLACE "." "_" _cpu_flags "${_cpu_flags}") elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows") get_filename_component(_vendor_id "[HKEY_LOCAL_MACHINE\\Hardware\\Description\\System\\CentralProcessor\\0;VendorIdentifier]" NAME CACHE) get_filename_component(_cpu_id "[HKEY_LOCAL_MACHINE\\Hardware\\Description\\System\\CentralProcessor\\0;Identifier]" NAME CACHE) mark_as_advanced(_vendor_id _cpu_id) string(REGEX REPLACE ".* Family ([0-9]+) .*" "\\1" _cpu_family "${_cpu_id}") string(REGEX REPLACE ".* Model ([0-9]+) .*" "\\1" _cpu_model "${_cpu_id}") endif(CMAKE_SYSTEM_NAME STREQUAL "Linux") if(_vendor_id STREQUAL "GenuineIntel") if(_cpu_family EQUAL 6) # taken from the Intel ORM # http://www.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html # CPUID Signature Values of Of Recent Intel Microarchitectures # 4E 5E | Skylake microarchitecture # 3D 47 56 | Broadwell microarchitecture # 3C 45 46 3F | Haswell microarchitecture # 3A 3E | Ivy Bridge microarchitecture # 2A 2D | Sandy Bridge microarchitecture # 25 2C 2F | Intel microarchitecture Westmere # 1A 1E 1F 2E | Intel microarchitecture Nehalem # 17 1D | Enhanced Intel Core microarchitecture # 0F | Intel Core microarchitecture # # Intel SDM Vol. 3C 35-1 / December 2016: # 57 | Xeon Phi 3200, 5200, 7200 [Knights Landing] # 85 | Future Xeon Phi # 8E 9E | 7th gen. Core [Kaby Lake] # 55 | Future Xeon [Skylake w/ AVX512] # 4E 5E | 6th gen. Core / E3 v5 [Skylake w/o AVX512] # 56 | Xeon D-1500 [Broadwell] # 4F | Xeon E5 v4, E7 v4, i7-69xx [Broadwell] # 47 | 5th gen. Core / Xeon E3 v4 [Broadwell] # 3D | M-5xxx / 5th gen. [Broadwell] # 3F | Xeon E5 v3, E7 v3, i7-59xx [Haswell-E] # 3C 45 46 | 4th gen. Core, Xeon E3 v3 [Haswell] # 3E | Xeon E5 v2, E7 v2, i7-49xx [Ivy Bridge-E] # 3A | 3rd gen. Core, Xeon E3 v2 [Ivy Bridge] # 2D | Xeon E5, i7-39xx [Sandy Bridge] # 2F | Xeon E7 # 2A | Xeon E3, 2nd gen. 
Core [Sandy Bridge] # 2E | Xeon 7500, 6500 series # 25 2C | Xeon 3600, 5600 series, Core i7, i5 and i3 # # Values from the Intel SDE: # 5C | Goldmont # 5A | Silvermont # 57 | Knights Landing # 66 | Cannonlake # 55 | Skylake Server # 4E | Skylake Client # 3C | Broadwell (likely a bug in the SDE) # 3C | Haswell if(_cpu_model EQUAL 87) # 57 set(TARGET_ARCHITECTURE "knl") # Knights Landing elseif(_cpu_model EQUAL 92) set(TARGET_ARCHITECTURE "goldmont") elseif(_cpu_model EQUAL 90 OR _cpu_model EQUAL 76) set(TARGET_ARCHITECTURE "silvermont") elseif(_cpu_model EQUAL 102) set(TARGET_ARCHITECTURE "cannonlake") elseif(_cpu_model EQUAL 142 OR _cpu_model EQUAL 158) # 8E, 9E set(TARGET_ARCHITECTURE "kaby-lake") elseif(_cpu_model EQUAL 85) # 55 set(TARGET_ARCHITECTURE "skylake-avx512") elseif(_cpu_model EQUAL 78 OR _cpu_model EQUAL 94) # 4E, 5E set(TARGET_ARCHITECTURE "skylake") elseif(_cpu_model EQUAL 61 OR _cpu_model EQUAL 71 OR _cpu_model EQUAL 79 OR _cpu_model EQUAL 86) # 3D, 47, 4F, 56 set(TARGET_ARCHITECTURE "broadwell") elseif(_cpu_model EQUAL 60 OR _cpu_model EQUAL 69 OR _cpu_model EQUAL 70 OR _cpu_model EQUAL 63) set(TARGET_ARCHITECTURE "haswell") elseif(_cpu_model EQUAL 58 OR _cpu_model EQUAL 62) set(TARGET_ARCHITECTURE "ivy-bridge") elseif(_cpu_model EQUAL 42 OR _cpu_model EQUAL 45) set(TARGET_ARCHITECTURE "sandy-bridge") elseif(_cpu_model EQUAL 37 OR _cpu_model EQUAL 44 OR _cpu_model EQUAL 47) set(TARGET_ARCHITECTURE "westmere") elseif(_cpu_model EQUAL 26 OR _cpu_model EQUAL 30 OR _cpu_model EQUAL 31 OR _cpu_model EQUAL 46) set(TARGET_ARCHITECTURE "nehalem") elseif(_cpu_model EQUAL 23 OR _cpu_model EQUAL 29) set(TARGET_ARCHITECTURE "penryn") elseif(_cpu_model EQUAL 15) set(TARGET_ARCHITECTURE "merom") elseif(_cpu_model EQUAL 28) set(TARGET_ARCHITECTURE "atom") elseif(_cpu_model EQUAL 14) set(TARGET_ARCHITECTURE "core") elseif(_cpu_model LESS 14) message(WARNING "Your CPU (family ${_cpu_family}, model ${_cpu_model}) is not known. Auto-detection of optimization flags failed and will use the generic CPU settings with SSE2.") set(TARGET_ARCHITECTURE "generic") else() message(WARNING "Your CPU (family ${_cpu_family}, model ${_cpu_model}) is not known. Auto-detection of optimization flags failed and will use the 65nm Core 2 CPU settings.") set(TARGET_ARCHITECTURE "merom") endif() elseif(_cpu_family EQUAL 7) # Itanium (not supported) message(WARNING "Your CPU (Itanium: family ${_cpu_family}, model ${_cpu_model}) is not supported by OptimizeForArchitecture.cmake.") elseif(_cpu_family EQUAL 15) # NetBurst list(APPEND _available_vector_units_list "sse" "sse2") if(_cpu_model GREATER 2) # Not sure whether this must be 3 or even 4 instead list(APPEND _available_vector_units_list "sse" "sse2" "sse3") endif(_cpu_model GREATER 2) endif(_cpu_family EQUAL 6) elseif(_vendor_id STREQUAL "AuthenticAMD") if(_cpu_family EQUAL 23) set(TARGET_ARCHITECTURE "zen") elseif(_cpu_family EQUAL 22) # 16h set(TARGET_ARCHITECTURE "AMD 16h") elseif(_cpu_family EQUAL 21) # 15h if(_cpu_model LESS 2) set(TARGET_ARCHITECTURE "bulldozer") else() set(TARGET_ARCHITECTURE "piledriver") endif() elseif(_cpu_family EQUAL 20) # 14h set(TARGET_ARCHITECTURE "AMD 14h") elseif(_cpu_family EQUAL 18) # 12h elseif(_cpu_family EQUAL 16) # 10h set(TARGET_ARCHITECTURE "barcelona") elseif(_cpu_family EQUAL 15) set(TARGET_ARCHITECTURE "k8") if(_cpu_model GREATER 64) # I don't know the right number to put here. 
This is just a guess from the hardware I have access to set(TARGET_ARCHITECTURE "k8-sse3") endif(_cpu_model GREATER 64) endif() endif(_vendor_id STREQUAL "GenuineIntel") endmacro() macro(OptimizeForArchitecture) if("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(x86|AMD64)") OptimizeForArchitectureX86() else() message(STATUS "No support for auto-detection of the target instruction set/extension") set(TARGET_ARCHITECTURE "unused" CACHE STRING "CPU architecture to optimize for. (unused)") endif() endmacro() macro(OptimizeForArchitectureX86) set(TARGET_ARCHITECTURE "auto" CACHE STRING "CPU architecture to optimize for. \ Using an incorrect setting here can result in crashes of the resulting binary because of invalid instructions used. \ Setting the value to \"auto\" will try to optimize for the architecture where cmake is called. \ Other supported values are: \"none\", \"generic\", \"core\", \"merom\" (65nm Core2), \ \"penryn\" (45nm Core2), \"nehalem\", \"westmere\", \"sandy-bridge\", \"ivy-bridge\", \ \"haswell\", \"broadwell\", \"skylake\", \"skylake-xeon\", \"kaby-lake\", \"cannonlake\", \"silvermont\", \ \"goldmont\", \"knl\" (Knights Landing), \"atom\", \"k8\", \"k8-sse3\", \"barcelona\", \ \"istanbul\", \"magny-cours\", \"bulldozer\", \"interlagos\", \"piledriver\", \ \"AMD 14h\", \"AMD 16h\", \"zen\".") set(_force) if(NOT _last_target_arch STREQUAL "${TARGET_ARCHITECTURE}") message(STATUS "target changed from \"${_last_target_arch}\" to \"${TARGET_ARCHITECTURE}\"") set(_force FORCE) endif() set(_last_target_arch "${TARGET_ARCHITECTURE}" CACHE STRING "" FORCE) mark_as_advanced(_last_target_arch) string(TOLOWER "${TARGET_ARCHITECTURE}" TARGET_ARCHITECTURE) set(_march_flag_list) set(_available_vector_units_list) if(TARGET_ARCHITECTURE STREQUAL "auto") AutodetectHostArchitecture() message(STATUS "Detected CPU: ${TARGET_ARCHITECTURE}") endif(TARGET_ARCHITECTURE STREQUAL "auto") macro(_nehalem) list(APPEND _march_flag_list "nehalem") list(APPEND _march_flag_list "corei7") list(APPEND _march_flag_list "core2") list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4.1" "sse4.2") endmacro() macro(_westmere) list(APPEND _march_flag_list "westmere") _nehalem() endmacro() macro(_sandybridge) list(APPEND _march_flag_list "sandybridge") list(APPEND _march_flag_list "corei7-avx") _westmere() list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4.1" "sse4.2" "avx") endmacro() macro(_ivybridge) list(APPEND _march_flag_list "ivybridge") list(APPEND _march_flag_list "core-avx-i") _sandybridge() list(APPEND _available_vector_units_list "rdrnd" "f16c") endmacro() macro(_haswell) list(APPEND _march_flag_list "haswell") list(APPEND _march_flag_list "core-avx2") _ivybridge() list(APPEND _available_vector_units_list "avx2" "fma" "bmi" "bmi2") endmacro() macro(_broadwell) list(APPEND _march_flag_list "broadwell") _haswell() endmacro() macro(_skylake) list(APPEND _march_flag_list "skylake") _broadwell() endmacro() macro(_skylake_avx512) list(APPEND _march_flag_list "skylake-avx512") _skylake() list(APPEND _available_vector_units_list "avx512f" "avx512cd" "avx512dq" "avx512bw" "avx512vl") endmacro() macro(_cannonlake) list(APPEND _march_flag_list "cannonlake") _skylake_avx512() list(APPEND _available_vector_units_list "avx512ifma" "avx512vbmi") endmacro() macro(_knightslanding) list(APPEND _march_flag_list "knl") _broadwell() list(APPEND _available_vector_units_list "avx512f" "avx512pf" "avx512er" "avx512cd") endmacro() macro(_silvermont) list(APPEND _march_flag_list "silvermont") 
_westmere() list(APPEND _available_vector_units_list "rdrnd") endmacro() macro(_goldmont) list(APPEND _march_flag_list "goldmont") _silvermont() endmacro() if(TARGET_ARCHITECTURE STREQUAL "core") list(APPEND _march_flag_list "core2") list(APPEND _available_vector_units_list "sse" "sse2" "sse3") elseif(TARGET_ARCHITECTURE STREQUAL "merom") list(APPEND _march_flag_list "merom") list(APPEND _march_flag_list "core2") list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3") elseif(TARGET_ARCHITECTURE STREQUAL "penryn") list(APPEND _march_flag_list "penryn") list(APPEND _march_flag_list "core2") list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3") message(STATUS "Sadly the Penryn architecture exists in variants with SSE4.1 and without SSE4.1.") if(_cpu_flags MATCHES "sse4_1") message(STATUS "SSE4.1: enabled (auto-detected from this computer's CPU flags)") list(APPEND _available_vector_units_list "sse4.1") else() message(STATUS "SSE4.1: disabled (auto-detected from this computer's CPU flags)") endif() elseif(TARGET_ARCHITECTURE STREQUAL "knl") _knightslanding() elseif(TARGET_ARCHITECTURE STREQUAL "cannonlake") _cannonlake() elseif(TARGET_ARCHITECTURE STREQUAL "kaby-lake") _skylake() elseif(TARGET_ARCHITECTURE STREQUAL "skylake-xeon" OR TARGET_ARCHITECTURE STREQUAL "skylake-avx512") _skylake_avx512() elseif(TARGET_ARCHITECTURE STREQUAL "skylake") _skylake() elseif(TARGET_ARCHITECTURE STREQUAL "broadwell") _broadwell() elseif(TARGET_ARCHITECTURE STREQUAL "haswell") _haswell() elseif(TARGET_ARCHITECTURE STREQUAL "ivy-bridge") _ivybridge() elseif(TARGET_ARCHITECTURE STREQUAL "sandy-bridge") _sandybridge() elseif(TARGET_ARCHITECTURE STREQUAL "westmere") _westmere() elseif(TARGET_ARCHITECTURE STREQUAL "nehalem") _nehalem() elseif(TARGET_ARCHITECTURE STREQUAL "goldmont") _goldmont() elseif(TARGET_ARCHITECTURE STREQUAL "silvermont") _silvermont() elseif(TARGET_ARCHITECTURE STREQUAL "atom") list(APPEND _march_flag_list "atom") list(APPEND _march_flag_list "core2") list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3") elseif(TARGET_ARCHITECTURE STREQUAL "k8") list(APPEND _march_flag_list "k8") list(APPEND _available_vector_units_list "sse" "sse2") elseif(TARGET_ARCHITECTURE STREQUAL "k8-sse3") list(APPEND _march_flag_list "k8-sse3") list(APPEND _march_flag_list "k8") list(APPEND _available_vector_units_list "sse" "sse2" "sse3") elseif(TARGET_ARCHITECTURE STREQUAL "AMD 16h") list(APPEND _march_flag_list "btver2") list(APPEND _march_flag_list "btver1") list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "f16c") elseif(TARGET_ARCHITECTURE STREQUAL "AMD 14h") list(APPEND _march_flag_list "btver1") list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4a") elseif(TARGET_ARCHITECTURE STREQUAL "zen") list(APPEND _march_flag_list "znver1") _skylake() list(APPEND _available_vector_units_list "sse4a") elseif(TARGET_ARCHITECTURE STREQUAL "piledriver") list(APPEND _march_flag_list "bdver2") list(APPEND _march_flag_list "bdver1") list(APPEND _march_flag_list "bulldozer") list(APPEND _march_flag_list "barcelona") list(APPEND _march_flag_list "core2") list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4" "fma" "f16c") elseif(TARGET_ARCHITECTURE STREQUAL "interlagos") list(APPEND _march_flag_list "bdver1") list(APPEND _march_flag_list "bulldozer") list(APPEND _march_flag_list "barcelona") list(APPEND _march_flag_list "core2") list(APPEND 
_available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4") elseif(TARGET_ARCHITECTURE STREQUAL "bulldozer") list(APPEND _march_flag_list "bdver1") list(APPEND _march_flag_list "bulldozer") list(APPEND _march_flag_list "barcelona") list(APPEND _march_flag_list "core2") list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4") elseif(TARGET_ARCHITECTURE STREQUAL "barcelona") list(APPEND _march_flag_list "barcelona") list(APPEND _march_flag_list "core2") list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "sse4a") elseif(TARGET_ARCHITECTURE STREQUAL "istanbul") list(APPEND _march_flag_list "barcelona") list(APPEND _march_flag_list "core2") list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "sse4a") elseif(TARGET_ARCHITECTURE STREQUAL "magny-cours") list(APPEND _march_flag_list "barcelona") list(APPEND _march_flag_list "core2") list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "sse4a") elseif(TARGET_ARCHITECTURE STREQUAL "generic") list(APPEND _march_flag_list "generic") elseif(TARGET_ARCHITECTURE STREQUAL "none") # add this clause to remove it from the else clause else(TARGET_ARCHITECTURE STREQUAL "core") message(FATAL_ERROR "Unknown target architecture: \"${TARGET_ARCHITECTURE}\". Please set TARGET_ARCHITECTURE to a supported value.") endif(TARGET_ARCHITECTURE STREQUAL "core") if(NOT TARGET_ARCHITECTURE STREQUAL "none") set(_disable_vector_unit_list) set(_enable_vector_unit_list) if(DEFINED Vc_AVX_INTRINSICS_BROKEN AND Vc_AVX_INTRINSICS_BROKEN) UserWarning("AVX disabled per default because of old/broken toolchain") set(_avx_broken true) set(_avx2_broken true) set(_fma4_broken true) set(_xop_broken true) else() set(_avx_broken false) if(DEFINED Vc_FMA4_INTRINSICS_BROKEN AND Vc_FMA4_INTRINSICS_BROKEN) UserWarning("FMA4 disabled per default because of old/broken toolchain") set(_fma4_broken true) else() set(_fma4_broken false) endif() if(DEFINED Vc_XOP_INTRINSICS_BROKEN AND Vc_XOP_INTRINSICS_BROKEN) UserWarning("XOP disabled per default because of old/broken toolchain") set(_xop_broken true) else() set(_xop_broken false) endif() if(DEFINED Vc_AVX2_INTRINSICS_BROKEN AND Vc_AVX2_INTRINSICS_BROKEN) UserWarning("AVX2 disabled per default because of old/broken toolchain") set(_avx2_broken true) else() set(_avx2_broken false) endif() endif() macro(_enable_or_disable _name _flag _documentation _broken) if(_broken) set(_found false) else() _my_find(_available_vector_units_list "${_flag}" _found) endif() set(USE_${_name} ${_found} CACHE BOOL "${documentation}" ${_force}) mark_as_advanced(USE_${_name}) if(USE_${_name}) list(APPEND _enable_vector_unit_list "${_flag}") else() list(APPEND _disable_vector_unit_list "${_flag}") endif() endmacro() _enable_or_disable(SSE2 "sse2" "Use SSE2. If SSE2 instructions are not enabled the SSE implementation will be disabled." false) _enable_or_disable(SSE3 "sse3" "Use SSE3. If SSE3 instructions are not enabled they will be emulated." false) _enable_or_disable(SSSE3 "ssse3" "Use SSSE3. If SSSE3 instructions are not enabled they will be emulated." false) _enable_or_disable(SSE4_1 "sse4.1" "Use SSE4.1. If SSE4.1 instructions are not enabled they will be emulated." false) _enable_or_disable(SSE4_2 "sse4.2" "Use SSE4.2. If SSE4.2 instructions are not enabled they will be emulated." false) _enable_or_disable(SSE4a "sse4a" "Use SSE4a. If SSE4a instructions are not enabled they will be emulated." false) _enable_or_disable(AVX "avx" "Use AVX. 
This will double all floating-point vector sizes relative to SSE." _avx_broken) _enable_or_disable(FMA "fma" "Use FMA." _avx_broken) _enable_or_disable(BMI2 "bmi2" "Use BMI2." _avx_broken) _enable_or_disable(AVX2 "avx2" "Use AVX2. This will double all of the vector sizes relative to SSE." _avx2_broken) _enable_or_disable(XOP "xop" "Use XOP." _xop_broken) _enable_or_disable(FMA4 "fma4" "Use FMA4." _fma4_broken) _enable_or_disable(AVX512F "avx512f" "Use AVX512F. This will double all floating-point vector sizes relative to AVX2." false) _enable_or_disable(AVX512VL "avx512vl" "Use AVX512VL. This enables 128- and 256-bit vector length instructions with EVEX coding (improved write-masking & more vector registers)." _avx2_broken) _enable_or_disable(AVX512PF "avx512pf" "Use AVX512PF. This enables prefetch instructions for gathers and scatters." false) _enable_or_disable(AVX512ER "avx512er" "Use AVX512ER. This enables exponential and reciprocal instructions." false) _enable_or_disable(AVX512CD "avx512cd" "Use AVX512CD." false) _enable_or_disable(AVX512DQ "avx512dq" "Use AVX512DQ." false) _enable_or_disable(AVX512BW "avx512bw" "Use AVX512BW." false) _enable_or_disable(AVX512IFMA "avx512ifma" "Use AVX512IFMA." false) _enable_or_disable(AVX512VBMI "avx512vbmi" "Use AVX512VBMI." false) if(MSVC) # MSVC on 32 bit can select /arch:SSE2 (since 2010 also /arch:AVX) # MSVC on 64 bit cannot select anything (should have changed with MSVC 2010) _my_find(_enable_vector_unit_list "avx2" _found) if(_found) AddCompilerFlag("/arch:AVX2" CXX_FLAGS Vc_ARCHITECTURE_FLAGS CXX_RESULT _found) endif() if(NOT _found) _my_find(_enable_vector_unit_list "avx" _found) if(_found) AddCompilerFlag("/arch:AVX" CXX_FLAGS Vc_ARCHITECTURE_FLAGS CXX_RESULT _found) endif() endif() if(NOT _found) _my_find(_enable_vector_unit_list "sse2" _found) if(_found) AddCompilerFlag("/arch:SSE2" CXX_FLAGS Vc_ARCHITECTURE_FLAGS) endif() endif() foreach(_flag ${_enable_vector_unit_list}) string(TOUPPER "${_flag}" _flag) string(REPLACE "." "_" _flag "__${_flag}__") add_definitions("-D${_flag}") endforeach(_flag) elseif(CMAKE_CXX_COMPILER MATCHES "/(icpc|icc)$") # ICC (on Linux) set(OFA_map_knl "-xMIC-AVX512") set(OFA_map_cannonlake "-xCORE-AVX512") set(OFA_map_skylake-avx512 "-xCORE-AVX512") set(OFA_map_skylake "-xCORE-AVX2") set(OFA_map_broadwell "-xCORE-AVX2") set(OFA_map_haswell "-xCORE-AVX2") set(OFA_map_ivybridge "-xCORE-AVX-I") set(OFA_map_sandybridge "-xAVX") set(OFA_map_westmere "-xSSE4.2") set(OFA_map_nehalem "-xSSE4.2") set(OFA_map_penryn "-xSSSE3") set(OFA_map_merom "-xSSSE3") set(OFA_map_core2 "-xSSE3") set(_ok FALSE) foreach(arch ${_march_flag_list}) if(DEFINED OFA_map_${arch}) AddCompilerFlag(${OFA_map_${arch}} CXX_FLAGS Vc_ARCHITECTURE_FLAGS CXX_RESULT _ok) if(_ok) break() endif() endif() endforeach() if(NOT _ok) # This is the Intel compiler, so SSE2 is a very reasonable baseline. 
message(STATUS "Did not recognize the requested architecture flag, falling back to SSE2") AddCompilerFlag("-xSSE2" CXX_FLAGS Vc_ARCHITECTURE_FLAGS) endif() else() # not MSVC and not ICC => GCC, Clang, Open64 foreach(_flag ${_march_flag_list}) AddCompilerFlag("-march=${_flag}" CXX_RESULT _good CXX_FLAGS Vc_ARCHITECTURE_FLAGS) if(_good) break() endif(_good) endforeach(_flag) foreach(_flag ${_enable_vector_unit_list}) AddCompilerFlag("-m${_flag}" CXX_RESULT _result) if(_result) set(_header FALSE) if(_flag STREQUAL "sse3") set(_header "pmmintrin.h") elseif(_flag STREQUAL "ssse3") set(_header "tmmintrin.h") elseif(_flag STREQUAL "sse4.1") set(_header "smmintrin.h") elseif(_flag STREQUAL "sse4.2") set(_header "smmintrin.h") elseif(_flag STREQUAL "sse4a") set(_header "ammintrin.h") elseif(_flag STREQUAL "avx") set(_header "immintrin.h") elseif(_flag STREQUAL "avx2") set(_header "immintrin.h") elseif(_flag STREQUAL "fma4") set(_header "x86intrin.h") elseif(_flag STREQUAL "xop") set(_header "x86intrin.h") endif() set(_resultVar "HAVE_${_header}") string(REPLACE "." "_" _resultVar "${_resultVar}") if(_header) CHECK_INCLUDE_FILE_CXX("${_header}" ${_resultVar} "-m${_flag}") if(NOT ${_resultVar}) set(_useVar "USE_${_flag}") string(TOUPPER "${_useVar}" _useVar) string(REPLACE "." "_" _useVar "${_useVar}") message(STATUS "disabling ${_useVar} because ${_header} is missing") set(${_useVar} FALSE) list(APPEND _disable_vector_unit_list "${_flag}") endif() endif() if(NOT _header OR ${_resultVar}) list(APPEND Vc_ARCHITECTURE_FLAGS "-m${_flag}") endif() endif() endforeach(_flag) foreach(_flag ${_disable_vector_unit_list}) AddCompilerFlag("-mno-${_flag}" CXX_FLAGS Vc_ARCHITECTURE_FLAGS) endforeach(_flag) endif() endif() endmacro() Vc-1.3.3/cmake/UserWarning.cmake000066400000000000000000000003561320703111200164060ustar00rootroot00000000000000macro(UserWarning _msg) if("$ENV{DASHBOARD_TEST_FROM_CTEST}" STREQUAL "") # developer (non-dashboard) build message(WARNING "${_msg}") else() # dashboard build message(STATUS "${_msg}") endif() endmacro() Vc-1.3.3/cmake/VcConfig.cmake.in000066400000000000000000000023771320703111200162520ustar00rootroot00000000000000set(Vc_VERSION_MAJOR @Vc_VERSION_MAJOR@) set(Vc_VERSION_MINOR @Vc_VERSION_MINOR@) set(Vc_VERSION_PATCH @Vc_VERSION_PATCH@) set(Vc_VERSION @Vc_VERSION_MAJOR@.@Vc_VERSION_MINOR@.@Vc_VERSION_PATCH@) set(Vc_VERSION_STRING "@Vc_VERSION_MAJOR@.@Vc_VERSION_MINOR@.@Vc_VERSION_PATCH@") set(Vc_INSTALL_DIR "@CMAKE_INSTALL_PREFIX@") set(Vc_LIB_DIR "@CMAKE_INSTALL_PREFIX@/lib@LIB_SUFFIX@") find_path(Vc_INCLUDE_DIR Vc/global.h HINTS "@CMAKE_INSTALL_PREFIX@/include") find_path(Vc_CMAKE_MODULES_DIR AddCompilerFlag.cmake HINTS "${Vc_LIB_DIR}/cmake/Vc") list(APPEND CMAKE_MODULE_PATH "${Vc_CMAKE_MODULES_DIR}") find_library(Vc_LIBRARIES Vc PATHS "${Vc_LIB_DIR}" NO_DEFAULT_PATH) find_library(Vc_MIC_LIBRARIES Vc_MIC PATHS "${Vc_LIB_DIR}" NO_DEFAULT_PATH) include("${Vc_CMAKE_MODULES_DIR}/VcMacros.cmake") set(Vc_DEFINITIONS) set(Vc_COMPILE_FLAGS) set(Vc_ARCHITECTURE_FLAGS) vc_set_preferred_compiler_flags() separate_arguments(Vc_ALL_FLAGS UNIX_COMMAND "${Vc_DEFINITIONS}") list(APPEND Vc_ALL_FLAGS ${Vc_COMPILE_FLAGS}) list(APPEND Vc_ALL_FLAGS ${Vc_ARCHITECTURE_FLAGS}) include(FindPackageHandleStandardArgs) find_package_handle_standard_args(Vc FOUND_VAR Vc_FOUND REQUIRED_VARS Vc_LIBRARIES Vc_INCLUDE_DIR Vc_CMAKE_MODULES_DIR VERSION_VAR Vc_VERSION ) Vc-1.3.3/cmake/VcConfigVersion.cmake.in000066400000000000000000000005401320703111200176060ustar00rootroot00000000000000set(PACKAGE_VERSION 
@Vc_VERSION_MAJOR@.@Vc_VERSION_MINOR@.@Vc_VERSION_PATCH@) if("${PACKAGE_VERSION}" VERSION_LESS "${PACKAGE_FIND_VERSION}") set(PACKAGE_VERSION_COMPATIBLE FALSE) else() set(PACKAGE_VERSION_COMPATIBLE TRUE) if("${PACKAGE_FIND_VERSION}" STREQUAL "${PACKAGE_VERSION}") set(PACKAGE_VERSION_EXACT TRUE) endif() endif() Vc-1.3.3/cmake/VcMacros.cmake000066400000000000000000000622621320703111200156630ustar00rootroot00000000000000# Macros for use with the Vc library. Vc can be found at http://code.compeng.uni-frankfurt.de/projects/vc # # The following macros are provided: # vc_determine_compiler # vc_set_preferred_compiler_flags # #============================================================================= # Copyright 2009-2015 Matthias Kretz # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # * Neither the names of contributing organizations nor the # names of its contributors may be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #============================================================================= cmake_minimum_required(VERSION 2.8.3) get_filename_component(_currentDir "${CMAKE_CURRENT_LIST_FILE}" PATH) include ("${_currentDir}/UserWarning.cmake") include ("${_currentDir}/AddCompilerFlag.cmake") include ("${_currentDir}/OptimizeForArchitecture.cmake") macro(vc_determine_compiler) if(NOT DEFINED Vc_COMPILER_IS_INTEL) execute_process(COMMAND "${CMAKE_CXX_COMPILER}" "--version" OUTPUT_VARIABLE _cxx_compiler_version ERROR_VARIABLE _cxx_compiler_version) set(Vc_COMPILER_IS_INTEL false) set(Vc_COMPILER_IS_OPEN64 false) set(Vc_COMPILER_IS_CLANG false) set(Vc_COMPILER_IS_MSVC false) set(Vc_COMPILER_IS_GCC false) if(CMAKE_CXX_COMPILER MATCHES "/(icpc|icc)$") set(Vc_COMPILER_IS_INTEL true) exec_program(${CMAKE_CXX_COMPILER} ARGS -dumpversion OUTPUT_VARIABLE Vc_ICC_VERSION) message(STATUS "Detected Compiler: Intel ${Vc_ICC_VERSION}") # break build with too old clang as early as possible. if(Vc_ICC_VERSION VERSION_LESS 15.0.3) message(FATAL_ERROR "Vc 1.x requires C++11 support. 
This requires at least ICC 15.0.3") endif() elseif(CMAKE_CXX_COMPILER MATCHES "(opencc|openCC)$") set(Vc_COMPILER_IS_OPEN64 true) message(STATUS "Detected Compiler: Open64") elseif(CMAKE_CXX_COMPILER MATCHES "clang\\+\\+$" OR "${_cxx_compiler_version}" MATCHES "clang") set(Vc_COMPILER_IS_CLANG true) exec_program(${CMAKE_CXX_COMPILER} ARGS --version OUTPUT_VARIABLE Vc_CLANG_VERSION) string(REGEX MATCH "[0-9]+\\.[0-9]+(\\.[0-9]+)?" Vc_CLANG_VERSION "${Vc_CLANG_VERSION}") message(STATUS "Detected Compiler: Clang ${Vc_CLANG_VERSION}") # break build with too old clang as early as possible. if(Vc_CLANG_VERSION VERSION_LESS 3.4) message(FATAL_ERROR "Vc 1.x requires C++11 support. This requires at least clang 3.4") endif() elseif(MSVC) set(Vc_COMPILER_IS_MSVC true) execute_process(COMMAND ${CMAKE_CXX_COMPILER} /nologo -EP "${_currentDir}/msvc_version.c" OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE Vc_MSVC_VERSION) string(STRIP "${Vc_MSVC_VERSION}" Vc_MSVC_VERSION) string(REPLACE "MSVC " "" Vc_MSVC_VERSION "${Vc_MSVC_VERSION}") message(STATUS "Detected Compiler: MSVC ${Vc_MSVC_VERSION}") elseif(CMAKE_COMPILER_IS_GNUCXX) set(Vc_COMPILER_IS_GCC true) exec_program(${CMAKE_CXX_COMPILER} ARGS -dumpversion OUTPUT_VARIABLE Vc_GCC_VERSION) message(STATUS "Detected Compiler: GCC ${Vc_GCC_VERSION}") # some distributions patch their GCC to return nothing or only major and minor version on -dumpversion. # In that case we must extract the version number from --version. if(NOT Vc_GCC_VERSION OR Vc_GCC_VERSION MATCHES "^[0-9]\\.[0-9]+$") exec_program(${CMAKE_CXX_COMPILER} ARGS --version OUTPUT_VARIABLE Vc_GCC_VERSION) string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" Vc_GCC_VERSION "${Vc_GCC_VERSION}") message(STATUS "GCC Version from --version: ${Vc_GCC_VERSION}") endif() # some distributions patch their GCC to be API incompatible to what the FSF released. In # those cases we require a macro to identify the distribution version find_program(Vc_lsb_release lsb_release) mark_as_advanced(Vc_lsb_release) if(Vc_lsb_release) if(NOT Vc_distributor_id) execute_process(COMMAND ${Vc_lsb_release} -is OUTPUT_VARIABLE Vc_distributor_id OUTPUT_STRIP_TRAILING_WHITESPACE) string(TOUPPER "${Vc_distributor_id}" Vc_distributor_id) set(Vc_distributor_id "${Vc_distributor_id}" CACHE STRING "lsb distribution id") execute_process(COMMAND ${Vc_lsb_release} -rs OUTPUT_VARIABLE Vc_distributor_release OUTPUT_STRIP_TRAILING_WHITESPACE) set(Vc_distributor_release "${Vc_distributor_release}" CACHE STRING "lsb release id") endif() if(Vc_distributor_id STREQUAL "UBUNTU") execute_process(COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE _gcc_version) string(REGEX MATCH "\\(.* ${Vc_GCC_VERSION}-([0-9]+).*\\)" _tmp "${_gcc_version}") if(_tmp) set(_patch ${CMAKE_MATCH_1}) string(REGEX MATCH "^([0-9]+)\\.([0-9]+)$" _tmp "${Vc_distributor_release}") execute_process(COMMAND printf 0x%x%02x%02x ${CMAKE_MATCH_1} ${CMAKE_MATCH_2} ${_patch} OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE _tmp) set(Vc_DEFINITIONS "${Vc_DEFINITIONS} -D__GNUC_UBUNTU_VERSION__=${_tmp}") endif() endif() endif() # break build with too old GCC as early as possible. if(Vc_GCC_VERSION VERSION_LESS 4.8.1) message(FATAL_ERROR "Vc 1.x requires C++11 support. 
This requires at least GCC 4.8.1") endif() else() message(WARNING "Untested/-supported Compiler (${CMAKE_CXX_COMPILER}) for use with Vc.\nPlease fill out the missing parts in the CMake scripts and submit a patch to http://code.compeng.uni-frankfurt.de/projects/vc") endif() endif() endmacro() macro(vc_set_gnu_buildtype_flags) set(CMAKE_CXX_FLAGS_DEBUG "-g3" CACHE STRING "Flags used by the compiler during debug builds." FORCE) set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG" CACHE STRING "Flags used by the compiler during release minsize builds." FORCE) set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG" CACHE STRING "Flags used by the compiler during release builds (/MD /Ob1 /Oi /Ot /Oy /Gs will produce slightly less optimized but smaller files)." FORCE) set(CMAKE_CXX_FLAGS_RELWITHDEBUG "-O3" CACHE STRING "Flags used by the compiler during release builds containing runtime checks." FORCE) set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBUG} -g" CACHE STRING "Flags used by the compiler during Release with Debug Info builds." FORCE) set(CMAKE_C_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}" CACHE STRING "Flags used by the compiler during debug builds." FORCE) set(CMAKE_C_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL}" CACHE STRING "Flags used by the compiler during release minsize builds." FORCE) set(CMAKE_C_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}" CACHE STRING "Flags used by the compiler during release builds (/MD /Ob1 /Oi /Ot /Oy /Gs will produce slightly less optimized but smaller files)." FORCE) set(CMAKE_C_FLAGS_RELWITHDEBUG "${CMAKE_CXX_FLAGS_RELWITHDEBUG}" CACHE STRING "Flags used by the compiler during release builds containing runtime checks." FORCE) set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}" CACHE STRING "Flags used by the compiler during Release with Debug Info builds." FORCE) if(CMAKE_BUILD_TYPE STREQUAL "Release" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebug") set(ENABLE_STRICT_ALIASING true CACHE BOOL "Enables strict aliasing rules for more aggressive optimizations") if(NOT ENABLE_STRICT_ALIASING) AddCompilerFlag(-fno-strict-aliasing) endif(NOT ENABLE_STRICT_ALIASING) endif() mark_as_advanced(CMAKE_CXX_FLAGS_RELWITHDEBUG CMAKE_C_FLAGS_RELWITHDEBUG) endmacro() macro(vc_add_compiler_flag VAR _flag) AddCompilerFlag("${_flag}" CXX_FLAGS ${VAR}) endmacro() macro(vc_check_assembler) exec_program(${CMAKE_CXX_COMPILER} ARGS -print-prog-name=as OUTPUT_VARIABLE _as) mark_as_advanced(_as) if(NOT _as) message(WARNING "Could not find 'as', the assembler used by GCC. Hoping everything will work out...") else() exec_program(${_as} ARGS --version OUTPUT_VARIABLE _as_version) string(REGEX REPLACE "\\([^\\)]*\\)" "" _as_version "${_as_version}") string(REGEX MATCH "[1-9]\\.[0-9]+(\\.[0-9]+)?" _as_version "${_as_version}") if(_as_version VERSION_LESS "2.18.93") UserWarning("Your binutils is too old (${_as_version}). Some optimizations of Vc will be disabled.") set(Vc_DEFINITIONS "${Vc_DEFINITIONS} -DVc_NO_XGETBV") # old assembler doesn't know the xgetbv instruction set(Vc_AVX_INTRINSICS_BROKEN true) set(Vc_XOP_INTRINSICS_BROKEN true) set(Vc_FMA4_INTRINSICS_BROKEN true) elseif(_as_version VERSION_LESS "2.21.0") UserWarning("Your binutils is too old (${_as_version}) for XOP and AVX2 instructions. 
They will therefore not be provided in libVc.") set(Vc_XOP_INTRINSICS_BROKEN true) set(Vc_AVX2_INTRINSICS_BROKEN true) endif() endif() endmacro() macro(vc_set_preferred_compiler_flags) vc_determine_compiler() set(_add_warning_flags false) set(_add_buildtype_flags false) foreach(_arg ${ARGN}) if(_arg STREQUAL "WARNING_FLAGS") set(_add_warning_flags true) elseif(_arg STREQUAL "BUILDTYPE_FLAGS") set(_add_buildtype_flags true) endif() endforeach() set(Vc_SSE_INTRINSICS_BROKEN false) set(Vc_AVX_INTRINSICS_BROKEN false) set(Vc_AVX2_INTRINSICS_BROKEN false) set(Vc_XOP_INTRINSICS_BROKEN false) set(Vc_FMA4_INTRINSICS_BROKEN false) if(Vc_COMPILER_IS_OPEN64) ################################################################################################## # Open64 # ################################################################################################## if(_add_warning_flags) AddCompilerFlag("-W") AddCompilerFlag("-Wall") AddCompilerFlag("-Wimplicit") AddCompilerFlag("-Wswitch") AddCompilerFlag("-Wformat") AddCompilerFlag("-Wchar-subscripts") AddCompilerFlag("-Wparentheses") AddCompilerFlag("-Wmultichar") AddCompilerFlag("-Wtrigraphs") AddCompilerFlag("-Wpointer-arith") AddCompilerFlag("-Wcast-align") AddCompilerFlag("-Wreturn-type") AddCompilerFlag("-pedantic") AddCompilerFlag("-Wno-long-long") AddCompilerFlag("-Wshadow") AddCompilerFlag("-Wold-style-cast") AddCompilerFlag("-Wno-variadic-macros") endif() if(_add_buildtype_flags) vc_set_gnu_buildtype_flags() endif() vc_check_assembler() # Open64 4.5.1 still doesn't ship immintrin.h set(Vc_AVX_INTRINSICS_BROKEN true) set(Vc_AVX2_INTRINSICS_BROKEN true) elseif(Vc_COMPILER_IS_GCC) ################################################################################################## # GCC # ################################################################################################## if(_add_warning_flags) foreach(_f -W -Wall -Wswitch -Wformat -Wchar-subscripts -Wparentheses -Wmultichar -Wtrigraphs -Wpointer-arith -Wcast-align -Wreturn-type -pedantic -Wshadow -Wundef) AddCompilerFlag("${_f}") endforeach() foreach(_f -Wold-style-cast) AddCompilerFlag("${_f}" CXX_FLAGS CMAKE_CXX_FLAGS) endforeach() endif() vc_add_compiler_flag(Vc_COMPILE_FLAGS "-Wabi") vc_add_compiler_flag(Vc_COMPILE_FLAGS "-fabi-version=0") # ABI version 4 is required to make __m128 and __m256 appear as different types. 0 should give us the latest version. vc_add_compiler_flag(Vc_COMPILE_FLAGS "-fabi-compat-version=0") # GCC 5 introduced this switch # and defaults it to 2 if -fabi-version is 0. But in that case the bug -fabi-version=0 is # supposed to fix resurfaces. For now just make sure that it compiles and links. # Bug report pending. if(_add_buildtype_flags) vc_set_gnu_buildtype_flags() endif() if(APPLE) # The GNU assembler (as) on Mac OS X is hopelessly outdated. The -q flag # to as tells it to use the clang assembler, though, which is fine. # -Wa,-q tells GCC to pass -q to as. vc_add_compiler_flag(Vc_COMPILE_FLAGS "-Wa,-q") # Apparently the MacOS clang assember doesn't understand XOP instructions. 
set(Vc_XOP_INTRINSICS_BROKEN true) else() vc_check_assembler() endif() elseif(Vc_COMPILER_IS_INTEL) ################################################################################################## # Intel Compiler # ################################################################################################## if(_add_buildtype_flags) set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -DNDEBUG -O3") set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O3") set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -DNDEBUG -O3") endif() vc_add_compiler_flag(Vc_COMPILE_FLAGS "-diag-disable 913") # Disable warning #13211 "Immediate parameter to intrinsic call too large". (sse/vector.tcc rotated(int)) vc_add_compiler_flag(Vc_COMPILE_FLAGS "-diag-disable 13211") vc_add_compiler_flag(Vc_COMPILE_FLAGS "-diag-disable 61") # warning #61: integer operation result is out of range vc_add_compiler_flag(Vc_COMPILE_FLAGS "-diag-disable 173") # warning #173: floating-point value does not fit in required integral type vc_add_compiler_flag(Vc_COMPILE_FLAGS "-diag-disable 264") # warning #264: floating-point value does not fit in required floating-point type if(CMAKE_BUILD_TYPE STREQUAL "Release" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") set(ENABLE_STRICT_ALIASING true CACHE BOOL "Enables strict aliasing rules for more aggressive optimizations") if(ENABLE_STRICT_ALIASING) AddCompilerFlag(-ansi-alias CXX_FLAGS Vc_COMPILE_FLAGS) else() AddCompilerFlag(-no-ansi-alias CXX_FLAGS Vc_COMPILE_FLAGS) endif() endif() if(NOT "$ENV{DASHBOARD_TEST_FROM_CTEST}" STREQUAL "") # disable warning #2928: the __GXX_EXPERIMENTAL_CXX0X__ macro is disabled when using GNU version 4.6 with the c++0x option # this warning just adds noise about problems in the compiler - but I'm only interested in seeing problems in Vc vc_add_compiler_flag(Vc_COMPILE_FLAGS "-diag-disable 2928") endif() # Intel doesn't implement the XOP or FMA4 intrinsics set(Vc_XOP_INTRINSICS_BROKEN true) set(Vc_FMA4_INTRINSICS_BROKEN true) elseif(Vc_COMPILER_IS_MSVC) ################################################################################################## # Microsoft Visual Studio # ################################################################################################## if(_add_warning_flags) AddCompilerFlag("/wd4800") # Disable warning "forcing value to bool" AddCompilerFlag("/wd4996") # Disable warning about strdup vs. 
_strdup AddCompilerFlag("/wd4244") # Disable warning "conversion from 'unsigned int' to 'float', possible loss of data" AddCompilerFlag("/wd4146") # Disable warning "unary minus operator applied to unsigned type, result still unsigned" AddCompilerFlag("/wd4227") # Disable warning "anachronism used : qualifiers on reference are ignored" (this is about 'restrict' usage on references, stupid MSVC) AddCompilerFlag("/wd4722") # Disable warning "destructor never returns, potential memory leak" (warns about ~_UnitTest_Global_Object which we don't care about) AddCompilerFlag("/wd4748") # Disable warning "/GS can not protect parameters and local variables from local buffer overrun because optimizations are disabled in function" (I don't get it) add_definitions(-D_CRT_SECURE_NO_WARNINGS) endif() vc_add_compiler_flag(Vc_COMPILE_FLAGS "/Gv") # default to __vectorcall if(MSVC_VERSION LESS 1900) UserWarning("MSVC before 2015 does not support enough of C++11") endif() elseif(Vc_COMPILER_IS_CLANG) ################################################################################################## # Clang # ################################################################################################## if(Vc_CLANG_VERSION VERSION_GREATER "3.5.99" AND Vc_CLANG_VERSION VERSION_LESS 3.7.0) UserWarning("Clang 3.6 has serious issues with AVX code generation, frequently losing 50% of the data. AVX is therefore disabled.\nPlease update to a more recent clang version.\n") set(Vc_AVX_INTRINSICS_BROKEN true) set(Vc_AVX2_INTRINSICS_BROKEN true) endif() # disable these warnings because clang shows them for function overloads that were discarded via SFINAE vc_add_compiler_flag(Vc_COMPILE_FLAGS "-Wno-local-type-template-args") vc_add_compiler_flag(Vc_COMPILE_FLAGS "-Wno-unnamed-type-template-args") endif() if(NOT Vc_COMPILER_IS_MSVC) vc_add_compiler_flag(Vc_COMPILE_FLAGS "-ffp-contract=fast") endif() OptimizeForArchitecture() set(Vc_IMPL "auto" CACHE STRING "Force the Vc implementation globally to the selected instruction set. \"auto\" lets Vc use the best available instructions.") if(NOT Vc_IMPL STREQUAL "auto") set(Vc_DEFINITIONS "${Vc_DEFINITIONS} -DVc_IMPL=${Vc_IMPL}") if(NOT Vc_IMPL STREQUAL "Scalar") set(_use_var "USE_${Vc_IMPL}") if(Vc_IMPL STREQUAL "SSE") set(_use_var "USE_SSE2") endif() if(NOT ${_use_var}) message(WARNING "The selected value for Vc_IMPL (${Vc_IMPL}) will not work because the relevant instructions are not enabled via compiler flags.") endif() endif() endif() endmacro() # helper macro for vc_compile_for_all_implementations macro(_vc_compile_one_implementation _srcs _impl) list(FIND _disabled_targets "${_impl}" _disabled_index) list(FIND _only_targets "${_impl}" _only_index) if(${_disabled_index} GREATER -1) if(${_only_index} GREATER -1) # disabled and enabled -> error message(FATAL_ERROR "vc_compile_for_all_implementations lists ${_impl} in both the ONLY and EXCLUDE lists. 
Please remove one.") endif() list(REMOVE_AT _disabled_targets ${_disabled_index}) # skip the rest and return elseif(NOT _only_targets OR ${_only_index} GREATER -1) if(${_only_index} GREATER -1) list(REMOVE_AT _only_targets ${_only_index}) endif() set(_extra_flags) set(_ok FALSE) foreach(_flags_it ${ARGN}) if(_flags_it STREQUAL "NO_FLAG") set(_ok TRUE) break() endif() string(REPLACE " " ";" _flag_list "${_flags_it}") foreach(_f ${_flag_list}) AddCompilerFlag(${_f} CXX_RESULT _ok) if(NOT _ok) break() endif() endforeach() if(_ok) set(_extra_flags ${_flags_it}) break() endif() endforeach() if(Vc_COMPILER_IS_MSVC) # MSVC for 64bit does not recognize /arch:SSE2 anymore. Therefore we set override _ok if _impl # says SSE if("${_impl}" MATCHES "SSE") set(_ok TRUE) endif() endif() if(_ok) get_filename_component(_out "${_vc_compile_src}" NAME_WE) get_filename_component(_ext "${_vc_compile_src}" EXT) set(_out "${CMAKE_CURRENT_BINARY_DIR}/${_out}_${_impl}${_ext}") add_custom_command(OUTPUT "${_out}" COMMAND ${CMAKE_COMMAND} -E copy "${_vc_compile_src}" "${_out}" DEPENDS "${_vc_compile_src}" COMMENT "Copy to ${_out}" WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" VERBATIM) set_source_files_properties( "${_out}" PROPERTIES COMPILE_DEFINITIONS "Vc_IMPL=${_impl}" COMPILE_FLAGS "${_flags} ${_extra_flags}" ) list(APPEND ${_srcs} "${_out}") endif() endif() endmacro() # Generate compile rules for the given C++ source file for all available implementations and return # the resulting list of object files in _obj # all remaining arguments are additional flags # Example: # vc_compile_for_all_implementations(_objs src/trigonometric.cpp FLAGS -DCOMPILE_BLAH EXCLUDE Scalar) # add_executable(executable main.cpp ${_objs}) macro(vc_compile_for_all_implementations _srcs _src) set(_flags) unset(_disabled_targets) unset(_only_targets) set(_state 0) foreach(_arg ${ARGN}) if(_arg STREQUAL "FLAGS") set(_state 1) elseif(_arg STREQUAL "EXCLUDE") set(_state 2) elseif(_arg STREQUAL "ONLY") set(_state 3) elseif(_state EQUAL 1) set(_flags "${_flags} ${_arg}") elseif(_state EQUAL 2) list(APPEND _disabled_targets "${_arg}") elseif(_state EQUAL 3) list(APPEND _only_targets "${_arg}") else() message(FATAL_ERROR "incorrect argument to vc_compile_for_all_implementations") endif() endforeach() set(_vc_compile_src "${_src}") _vc_compile_one_implementation(${_srcs} Scalar NO_FLAG) if(NOT Vc_SSE_INTRINSICS_BROKEN) _vc_compile_one_implementation(${_srcs} SSE2 "-xSSE2" "-msse2" "/arch:SSE2") _vc_compile_one_implementation(${_srcs} SSE3 "-xSSE3" "-msse3" "/arch:SSE2") _vc_compile_one_implementation(${_srcs} SSSE3 "-xSSSE3" "-mssse3" "/arch:SSE2") _vc_compile_one_implementation(${_srcs} SSE4_1 "-xSSE4.1" "-msse4.1" "/arch:SSE2") _vc_compile_one_implementation(${_srcs} SSE4_2 "-xSSE4.2" "-msse4.2" "/arch:SSE2") _vc_compile_one_implementation(${_srcs} SSE3+SSE4a "-msse4a") endif() if(NOT Vc_AVX_INTRINSICS_BROKEN) _vc_compile_one_implementation(${_srcs} AVX "-xAVX" "-mavx" "/arch:AVX") if(NOT Vc_XOP_INTRINSICS_BROKEN) if(NOT Vc_FMA4_INTRINSICS_BROKEN) _vc_compile_one_implementation(${_srcs} SSE+XOP+FMA4 "-mxop -mfma4" "" "") _vc_compile_one_implementation(${_srcs} AVX+XOP+FMA4 "-mavx -mxop -mfma4" "" "") endif() _vc_compile_one_implementation(${_srcs} SSE+XOP+FMA "-mxop -mfma" "" "") _vc_compile_one_implementation(${_srcs} AVX+XOP+FMA "-mavx -mxop -mfma" "" "") endif() _vc_compile_one_implementation(${_srcs} AVX+FMA "-mavx -mfma" "" "") endif() if(NOT Vc_AVX2_INTRINSICS_BROKEN) # The necessary list is not clear to me yet. 
At this point I'll only consider Intel CPUs, in # which case AVX2 implies the availability of FMA and BMI2 #_vc_compile_one_implementation(${_srcs} AVX2 "-mavx2") #_vc_compile_one_implementation(${_srcs} AVX2+BMI2 "-mavx2 -mbmi2") _vc_compile_one_implementation(${_srcs} AVX2+FMA+BMI2 "-xCORE-AVX2" "-mavx2 -mfma -mbmi2" "/arch:AVX2") #_vc_compile_one_implementation(${_srcs} AVX2+FMA "-mavx2 -mfma") endif() list(LENGTH _only_targets _len) if(_len GREATER 0) message(WARNING "The following unknown targets where listed in the ONLY list of vc_compile_for_all_implementations: '${_only_targets}'") endif() list(LENGTH _disabled_targets _len) if(_len GREATER 0) message(WARNING "The following unknown targets where listed in the EXCLUDE list of vc_compile_for_all_implementations: '${_disabled_targets}'") endif() endmacro() Vc-1.3.3/cmake/msvc_version.c000066400000000000000000000000231320703111200160100ustar00rootroot00000000000000MSVC _MSC_FULL_VER Vc-1.3.3/cmake/toolchain-arm-linux-gnueabi-gcc.cmake000066400000000000000000000000751320703111200221740ustar00rootroot00000000000000SET(CMAKE_SYSTEM_NAME Linux) SET(CMAKE_SYSTEM_PROCESSOR arm) Vc-1.3.3/cmake/toolchain-arm-linux.cmake000066400000000000000000000000751320703111200200320ustar00rootroot00000000000000SET(CMAKE_SYSTEM_NAME Linux) SET(CMAKE_SYSTEM_PROCESSOR arm) Vc-1.3.3/common/000077500000000000000000000000001320703111200133445ustar00rootroot00000000000000Vc-1.3.3/common/algorithms.h000066400000000000000000000201551320703111200156710ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2013-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ #ifndef VC_COMMON_ALGORITHMS_H_ #define VC_COMMON_ALGORITHMS_H_ #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { /** * \ingroup Utilities * * \name Boolean Reductions */ //@{ /** \ingroup Utilities * Returns whether all entries in the mask \p m are \c true. */ template constexpr bool all_of(const Mask &m) { return m.isFull(); } /** \ingroup Utilities * Returns \p b */ constexpr bool all_of(bool b) { return b; } /** \ingroup Utilities * Returns whether at least one entry in the mask \p m is \c true. 
*/ template constexpr bool any_of(const Mask &m) { return m.isNotEmpty(); } /** \ingroup Utilities * Returns \p b */ constexpr bool any_of(bool b) { return b; } /** \ingroup Utilities * Returns whether all entries in the mask \p m are \c false. */ template constexpr bool none_of(const Mask &m) { return m.isEmpty(); } /** \ingroup Utilities * Returns \p !b */ constexpr bool none_of(bool b) { return !b; } /** \ingroup Utilities * Returns whether at least one entry in \p m is \c true and at least one entry in \p m is \c * false. */ template constexpr bool some_of(const Mask &m) { return m.isMix(); } /** \ingroup Utilities * Returns \c false */ constexpr bool some_of(bool) { return false; } //@} template inline enable_if< std::is_arithmetic::value_type>::value && Traits::is_functor_argument_immutable< UnaryFunction, Vector::value_type>>::value, UnaryFunction> simd_for_each(InputIt first, InputIt last, UnaryFunction f) { typedef Vector::value_type> V; typedef Scalar::Vector::value_type> V1; for (; reinterpret_cast(std::addressof(*first)) & (V::MemoryAlignment - 1) && first != last; ++first) { f(V1(std::addressof(*first), Vc::Aligned)); } const auto lastV = last - V::Size + 1; for (; first < lastV; first += V::Size) { f(V(std::addressof(*first), Vc::Aligned)); } for (; first != last; ++first) { f(V1(std::addressof(*first), Vc::Aligned)); } return std::move(f); } template inline enable_if< std::is_arithmetic::value_type>::value && !Traits::is_functor_argument_immutable< UnaryFunction, Vector::value_type>>::value, UnaryFunction> simd_for_each(InputIt first, InputIt last, UnaryFunction f) { typedef Vector::value_type> V; typedef Scalar::Vector::value_type> V1; for (; reinterpret_cast(std::addressof(*first)) & (V::MemoryAlignment - 1) && first != last; ++first) { V1 tmp(std::addressof(*first), Vc::Aligned); f(tmp); tmp.store(std::addressof(*first), Vc::Aligned); } const auto lastV = last - V::Size + 1; for (; first < lastV; first += V::Size) { V tmp(std::addressof(*first), Vc::Aligned); f(tmp); tmp.store(std::addressof(*first), Vc::Aligned); } for (; first != last; ++first) { V1 tmp(std::addressof(*first), Vc::Aligned); f(tmp); tmp.store(std::addressof(*first), Vc::Aligned); } return std::move(f); } template inline enable_if< !std::is_arithmetic::value_type>::value, UnaryFunction> simd_for_each(InputIt first, InputIt last, UnaryFunction f) { return std::for_each(first, last, std::move(f)); } /////////////////////////////////////////////////////////////////////////////// template inline enable_if< std::is_arithmetic::value_type>::value && Traits::is_functor_argument_immutable< UnaryFunction, Vector::value_type>>::value, UnaryFunction> simd_for_each_n(InputIt first, std::size_t count, UnaryFunction f) { typename std::make_signed::type len = count; typedef Vector::value_type> V; typedef Scalar::Vector::value_type> V1; for (; reinterpret_cast(std::addressof(*first)) & (V::MemoryAlignment - 1) && len != 0; --len, ++first) { f(V1(std::addressof(*first), Vc::Aligned)); } for (; len >= int(V::Size); len -= V::Size, first += V::Size) { f(V(std::addressof(*first), Vc::Aligned)); } for (; len != 0; --len, ++first) { f(V1(std::addressof(*first), Vc::Aligned)); } return std::move(f); } template inline enable_if< std::is_arithmetic::value_type>::value && !Traits::is_functor_argument_immutable< UnaryFunction, Vector::value_type>>::value, UnaryFunction> simd_for_each_n(InputIt first, std::size_t count, UnaryFunction f) { typename std::make_signed::type len = count; typedef Vector::value_type> V; typedef 
Scalar::Vector::value_type> V1; for (; reinterpret_cast(std::addressof(*first)) & (V::MemoryAlignment - 1) && len != 0; --len, ++first) { V1 tmp(std::addressof(*first), Vc::Aligned); f(tmp); tmp.store(std::addressof(*first), Vc::Aligned); } for (; len >= int(V::Size); len -= V::Size, first += V::Size) { V tmp(std::addressof(*first), Vc::Aligned); f(tmp); tmp.store(std::addressof(*first), Vc::Aligned); } for (; len != 0; --len, ++first) { V1 tmp(std::addressof(*first), Vc::Aligned); f(tmp); tmp.store(std::addressof(*first), Vc::Aligned); } return std::move(f); } #ifdef Vc_CXX17 template inline enable_if< !std::is_arithmetic::value_type>::value, UnaryFunction> simd_for_each_n(InputIt first, std::size_t count, UnaryFunction f) { return std::for_each_n(first, count, std::move(f)); } #endif } // namespace Vc #endif // VC_COMMON_ALGORITHMS_H_ Vc-1.3.3/common/aliasingentryhelper.h000066400000000000000000000160771320703111200176010ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2010-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
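A usage sketch for the simd_for_each algorithm defined in common/algorithms.h
above (illustrative; assumes contiguous float storage and a C++14 generic
lambda; taking the argument by mutable reference selects the
load/modify/store overload):

    std::vector<float> data(1000, 1.f);
    Vc::simd_for_each(data.begin(), data.end(),
                      [](auto &v) { v *= 2.f; }); // v is float_v or the scalar Vector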
}}}*/ #ifndef VC_COMMON_ALIASINGENTRYHELPER_H_ #define VC_COMMON_ALIASINGENTRYHELPER_H_ #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { namespace Common { template class AliasingEntryHelper { private: typedef typename StorageType::EntryType T; #ifdef Vc_ICC StorageType *const m_storage; const int m_index; public: Vc_ALWAYS_INLINE AliasingEntryHelper(StorageType *d, int index) : m_storage(d), m_index(index) {} Vc_ALWAYS_INLINE AliasingEntryHelper(const AliasingEntryHelper &) = default; Vc_ALWAYS_INLINE AliasingEntryHelper(AliasingEntryHelper &&) = default; Vc_ALWAYS_INLINE AliasingEntryHelper &operator=(const AliasingEntryHelper &rhs) { m_storage->assign(m_index, rhs); return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator =(T x) { m_storage->assign(m_index, x); return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator +=(T x) { m_storage->assign(m_index, m_storage->m(m_index) + x); return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator -=(T x) { m_storage->assign(m_index, m_storage->m(m_index) - x); return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator /=(T x) { m_storage->assign(m_index, m_storage->m(m_index) / x); return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator *=(T x) { m_storage->assign(m_index, m_storage->m(m_index) * x); return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator |=(T x) { m_storage->assign(m_index, m_storage->m(m_index) | x); return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator &=(T x) { m_storage->assign(m_index, m_storage->m(m_index) & x); return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator ^=(T x) { m_storage->assign(m_index, m_storage->m(m_index) ^ x); return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator %=(T x) { m_storage->assign(m_index, m_storage->m(m_index) % x); return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator<<=(T x) { m_storage->assign(m_index, m_storage->m(m_index)<< x); return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator>>=(T x) { m_storage->assign(m_index, m_storage->m(m_index)>> x); return *this; } #define m_data m_storage->read(m_index) #else typedef T A Vc_MAY_ALIAS; A &m_data; public: template Vc_ALWAYS_INLINE AliasingEntryHelper(T2 &d) : m_data(reinterpret_cast(d)) {} Vc_ALWAYS_INLINE AliasingEntryHelper(A &d) : m_data(d) {} Vc_ALWAYS_INLINE AliasingEntryHelper &operator=(const AliasingEntryHelper &rhs) { m_data = rhs.m_data; return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator =(T x) { m_data = x; return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator+=(T x) { m_data += x; return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator-=(T x) { m_data -= x; return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator/=(T x) { m_data /= x; return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator*=(T x) { m_data *= x; return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator|=(T x) { m_data |= x; return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator&=(T x) { m_data &= x; return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator^=(T x) { m_data ^= x; return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator%=(T x) { m_data %= x; return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator<<=(T x) { m_data <<= x; return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator>>=(T x) { m_data >>= x; return *this; } #endif Vc_ALWAYS_INLINE Vc_PURE operator const T() const { return m_data; } Vc_ALWAYS_INLINE Vc_PURE bool operator==(T x) const { return static_cast(m_data) == x; } 
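    // The remaining comparison and arithmetic operators below follow the same
    // pattern: read the aliased entry once, convert it to the EntryType, and
    // apply the plain scalar operation.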
Vc_ALWAYS_INLINE Vc_PURE bool operator!=(T x) const { return static_cast(m_data) != x; } Vc_ALWAYS_INLINE Vc_PURE bool operator<=(T x) const { return static_cast(m_data) <= x; } Vc_ALWAYS_INLINE Vc_PURE bool operator>=(T x) const { return static_cast(m_data) >= x; } Vc_ALWAYS_INLINE Vc_PURE bool operator< (T x) const { return static_cast(m_data) < x; } Vc_ALWAYS_INLINE Vc_PURE bool operator> (T x) const { return static_cast(m_data) > x; } Vc_ALWAYS_INLINE Vc_PURE T operator-() const { return -static_cast(m_data); } Vc_ALWAYS_INLINE Vc_PURE T operator~() const { return ~static_cast(m_data); } Vc_ALWAYS_INLINE Vc_PURE T operator+(T x) const { return static_cast(m_data) + x; } Vc_ALWAYS_INLINE Vc_PURE T operator-(T x) const { return static_cast(m_data) - x; } Vc_ALWAYS_INLINE Vc_PURE T operator/(T x) const { return static_cast(m_data) / x; } Vc_ALWAYS_INLINE Vc_PURE T operator*(T x) const { return static_cast(m_data) * x; } Vc_ALWAYS_INLINE Vc_PURE T operator|(T x) const { return static_cast(m_data) | x; } Vc_ALWAYS_INLINE Vc_PURE T operator&(T x) const { return static_cast(m_data) & x; } Vc_ALWAYS_INLINE Vc_PURE T operator^(T x) const { return static_cast(m_data) ^ x; } Vc_ALWAYS_INLINE Vc_PURE T operator%(T x) const { return static_cast(m_data) % x; } //T operator<<(T x) const { return static_cast(m_data) << x; } //T operator>>(T x) const { return static_cast(m_data) >> x; } #ifdef m_data #undef m_data #endif }; } // namespace Common } // namespace Vc #endif // VC_COMMON_ALIASINGENTRYHELPER_H_ Vc-1.3.3/common/alignedbase.h000066400000000000000000000120441320703111200157540ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ #ifndef VC_COMMON_ALIGNEDBASE_H_ #define VC_COMMON_ALIGNEDBASE_H_ #include "types.h" #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { namespace Detail { /**\internal * Break the recursion of the function below. */ template constexpr T max(T a) { return a; } /**\internal * \returns the maximum of all specified arguments. */ template constexpr T max(T a, T b, Ts... rest) { return a > b ? max(a, rest...) 
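A usage sketch for the alignment helpers defined below (illustrative; assumes
the default float_v ABI):

    struct Particle : public Vc::VectorAlignedBase {
        Vc::float_v x, y, z;
    };
    Particle *p = new Particle; // heap storage aligned for Vc::float_v
    delete p;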
: max(b, rest...); } } // namespace Detail namespace Common { template Vc_INTRINSIC void *aligned_malloc(std::size_t); Vc_ALWAYS_INLINE void free(void *); } // namespace Common /** * \ingroup Utilities * * Helper class to ensure a given alignment. * * This class reimplements the \c new and \c delete operators to align objects allocated * on the heap suitably with the specified alignment \c Alignment. * * \see Vc::VectorAlignedBase * \see Vc::MemoryAlignedBase */ template struct alignas(Alignment) AlignedBase { Vc_FREE_STORE_OPERATORS_ALIGNED(Alignment); }; /** * \ingroup Utilities * * Helper type to ensure suitable alignment for any Vc::Vector type (using the default * VectorAbi). * * This class reimplements the \c new and \c delete operators to align objects allocated * on the heap suitably for objects of Vc::Vector type. This is necessary since the * standard \c new operator does not adhere to the alignment requirements of the type. * * \see Vc::VectorAlignedBaseT * \see Vc::MemoryAlignedBase * \see Vc::AlignedBase */ using VectorAlignedBase = AlignedBase< Detail::max(alignof(Vector), alignof(Vector), alignof(Vector), alignof(Vector), alignof(Vector), alignof(Vector), alignof(Vector), alignof(Vector), alignof(Vector), alignof(Vector), alignof(Vector), alignof(Vector))>; /** * \ingroup Utilities * Variant of the above type ensuring suitable alignment only for the specified vector * type \p V. * * \see Vc::VectorAlignedBase * \see Vc::MemoryAlignedBaseT */ template using VectorAlignedBaseT = AlignedBase; /** * \ingroup Utilities * * Helper class to ensure suitable alignment for arrays of scalar objects for any * Vc::Vector type (using the default VectorAbi). * * This class reimplements the \c new and \c delete operators to align objects allocated * on the heap suitably for arrays of type \p Vc::Vector::EntryType. Subsequent load * and store operations are safe to use the aligned variant. * * \see Vc::MemoryAlignedBaseT * \see Vc::VectorAlignedBase * \see Vc::AlignedBase */ using MemoryAlignedBase = AlignedBase< Detail::max(Vector::MemoryAlignment, Vector::MemoryAlignment, Vector::MemoryAlignment, Vector::MemoryAlignment, Vector::MemoryAlignment, Vector::MemoryAlignment, Vector::MemoryAlignment, Vector::MemoryAlignment, Vector::MemoryAlignment, Vector::MemoryAlignment, Vector::MemoryAlignment, Vector::MemoryAlignment)>; /** * \ingroup Utilities * Variant of the above type ensuring suitable alignment only for the specified vector * type \p V. * * \see Vc::MemoryAlignedBase * \see Vc::VectorAlignedBaseT */ template using MemoryAlignedBaseT = AlignedBase; } #endif // VC_COMMON_ALIGNEDBASE_H_ // vim: foldmethod=marker Vc-1.3.3/common/bitscanintrinsics.h000066400000000000000000000052331320703111200172510ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2011-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ #ifndef VC_COMMON_BITSCANINTRINSICS_H_ #define VC_COMMON_BITSCANINTRINSICS_H_ #if defined(Vc_GCC) || defined(Vc_CLANG) || defined(Vc_APPLECLANG) # if Vc_GCC >= 0x40500 // GCC 4.5.0 introduced _bit_scan_forward / _bit_scan_reverse # include # else // GCC <= 4.4 and clang have x86intrin.h, but not the required functions # define _bit_scan_forward(x) __builtin_ctz(x) #include "macros.h" static Vc_ALWAYS_INLINE Vc_CONST int _Vc_bit_scan_reverse_asm(unsigned int x) { int r; __asm__("bsr %1,%0" : "=r"(r) : "X"(x)); return r; } # define _bit_scan_reverse(x) _Vc_bit_scan_reverse_asm(x) # endif #elif defined(_WIN32) #include static inline __forceinline unsigned long _bit_scan_forward(unsigned long x) { unsigned long index; _BitScanForward(&index, x); return index; } static inline __forceinline unsigned long _bit_scan_reverse(unsigned long x) { unsigned long index; _BitScanReverse(&index, x); return index; } #elif defined(Vc_ICC) // for all I know ICC supports the _bit_scan_* intrinsics #else // just assume the compiler can do it #endif #endif // VC_COMMON_BITSCANINTRINSICS_H_ Vc-1.3.3/common/const.h000066400000000000000000000071461320703111200146530ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2013-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
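A worked instance of the compile-time constant helpers defined below
(illustrative; assumes IEEE-754 binary64 and the Vc::Detail namespace):

    // pi: significand 0x1.921fb54442d18 scaled by 2^1
    constexpr double pi = Vc::Detail::doubleConstant<1, 0x921fb54442d18ull, 1>();
    // = ((0x921fb54442d18 | 0x0010000000000000) / 0x0010000000000000) * 2 * 1
    // = 1.5707963267948966 * 2 = 3.141592653589793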
}}}*/ #ifndef VC_COMMON_CONST_H_ #define VC_COMMON_CONST_H_ #include #include namespace Vc_VERSIONED_NAMESPACE { namespace Detail { template constexpr double exponentToFloat(std::integral_constant); template constexpr double exponentToFloat(std::integral_constant); template <> constexpr double exponentToFloat<0>(std::integral_constant) { return 1.; } template <> constexpr double exponentToFloat<0>(std::integral_constant) { return 1.; } template <> constexpr double exponentToFloat<-32>(std::integral_constant) { return 1. / (65536. * 65536.); } template <> constexpr double exponentToFloat<32>(std::integral_constant) { return 65536. * 65536.; } template <> constexpr double exponentToFloat<-64>(std::integral_constant) { return 1. / (65536. * 65536. * 65536. * 65536.); } template <> constexpr double exponentToFloat<64>(std::integral_constant) { return 65536. * 65536. * 65536. * 65536.; } template constexpr double exponentToFloat(std::integral_constant negative) { return exponentToFloat(negative) * 2.0; } template constexpr double exponentToFloat(std::integral_constant negative) { return exponentToFloat(negative) * 0.5; } template constexpr double doubleConstant() { return (static_cast((mantissa & 0x000fffffffffffffull) | 0x0010000000000000ull) / 0x0010000000000000ull) * exponentToFloat(std::integral_constant()) * sign; } template constexpr float floatConstant() { return (static_cast((mantissa & 0x007fffffu) | 0x00800000u) / 0x00800000u) * static_cast( exponentToFloat(std::integral_constant())) * sign; } } // namespace Detail } // namespace Vc #endif // VC_COMMON_CONST_H_ Vc-1.3.3/common/data.h000066400000000000000000000035311320703111200144300ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2013-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
}}}*/ #ifndef VC_COMMON_CONST_DATA_H_ #define VC_COMMON_CONST_DATA_H_ #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { namespace Common { alignas(64) extern unsigned int RandomState[]; alignas(32) extern const unsigned int AllBitsSet[8]; } // namespace Common } // namespace Vc #endif // VC_COMMON_CONST_DATA_H_ Vc-1.3.3/common/deinterleave.h000066400000000000000000000072621320703111200161730ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2010-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ #ifndef VC_COMMON_DEINTERLEAVE_H_ #define VC_COMMON_DEINTERLEAVE_H_ #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { /** * \ingroup Vectors * * \deprecated Turn to InterleavedMemoryWrapper for a more flexible and complete solution. * * Loads two vectors of values from an interleaved array. * * \param a, b The vectors to load the values from memory into. * \param memory The memory location where to read the next 2 * V::Size values from * \param align Either pass Vc::Aligned or Vc::Unaligned. It defaults to Vc::Aligned if nothing is * specified. * * If you store your data as * \code * struct { float x, y; } m[1000]; * \endcode * then the deinterleave function allows you to read \p Size concurrent x and y values like this: * \code * Vc::float_v x, y; * Vc::deinterleave(&x, &y, &m[10], Vc::Unaligned); * \endcode * This code will load m[10], m[12], m[14], ... into \p x and m[11], m[13], m[15], ... into \p y. 
* * The deinterleave function supports the following type combinations: \verbatim V \ M | float | double | ushort | short | uint | int =========|=======|========|========|=======|======|===== float_v | X | | X | X | | ---------|-------|--------|--------|-------|------|----- double_v | | X | | | | ---------|-------|--------|--------|-------|------|----- int_v | | | | X | | X ---------|-------|--------|--------|-------|------|----- uint_v | | | X | | X | ---------|-------|--------|--------|-------|------|----- short_v | | | | X | | ---------|-------|--------|--------|-------|------|----- ushort_v | | | X | | | \endverbatim */ template Vc_ALWAYS_INLINE void deinterleave(V *a, V *b, const M *memory, A align) { Detail::deinterleave(*a, *b, memory, align); } // documented as default for align above template Vc_ALWAYS_INLINE void deinterleave(V *a, V *b, const M *memory) { Detail::deinterleave(*a, *b, memory, Aligned); } } // namespace Vc #endif // VC_COMMON_DEINTERLEAVE_H_ Vc-1.3.3/common/elementreference.h000066400000000000000000000153751320703111200170400ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2016 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ #ifndef VC_COMMON_ELEMENTREFERENCE_H_ #define VC_COMMON_ELEMENTREFERENCE_H_ #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { namespace Detail { template class ElementReference { using value_type = typename U::value_type; friend U; friend Accessor; Vc_INTRINSIC ElementReference(U &o, int i) noexcept : index(i), obj(o) {} static constexpr bool get_noexcept = noexcept(Accessor::get(std::declval(), int())); template static constexpr bool set_noexcept() { return noexcept(Accessor::set(std::declval(), int(), std::declval())); } public: Vc_INTRINSIC ElementReference(const ElementReference &) = delete; /** * Move Constructor * * this is the only way to constructor an ElementReference in user code * * \note * Please be aware that this class models the concept of a reference * and as such it can have the same lifetime issue as a standard C++ * reference. 
* * \note * C++ 17 support copy-elision, which in turn allows to * the ElementReference obtained via operator[] from a function * and avoid copying. C++11 and C++14 don't offer this, thus we add * the move constructor, to allow them to move the data and thus avoid * copying (which was prohibited by the deleted constructor above */ Vc_INTRINSIC ElementReference(ElementReference &&) = default; Vc_INTRINSIC operator value_type() const noexcept(get_noexcept) { return Accessor::get(obj, index); } template Vc_INTRINSIC ElementReference &operator=(T &&x) && noexcept(noexcept(Accessor::set(std::declval(), int(), std::declval()))) { Accessor::set(obj, index, std::forward(x)); return *this; } // TODO: improve with operator.() #define Vc_OP_(op_) \ template () \ op_ std::declval())> \ Vc_INTRINSIC ElementReference &operator op_##=(T &&x) && \ noexcept(get_noexcept && noexcept(Accessor::set(std::declval(), int(), \ std::declval()))) \ { \ const value_type &lhs = Accessor::get(obj, index); \ Accessor::set(obj, index, lhs op_ std::forward(x)); \ return *this; \ } Vc_ALL_ARITHMETICS(Vc_OP_); Vc_ALL_SHIFTS(Vc_OP_); Vc_ALL_BINARY(Vc_OP_); #undef Vc_OP_ template Vc_INTRINSIC ElementReference &operator++() && noexcept(noexcept(std::declval() = Accessor::get(std::declval(), int())) && set_noexcept())>()) { value_type x = Accessor::get(obj, index); Accessor::set(obj, index, ++x); return *this; } template Vc_INTRINSIC value_type operator++(int) && noexcept(noexcept(std::declval() = Accessor::get(std::declval(), int())) && set_noexcept()++)>()) { const value_type r = Accessor::get(obj, index); value_type x = r; Accessor::set(obj, index, ++x); return r; } template Vc_INTRINSIC ElementReference &operator--() && noexcept(noexcept(std::declval() = Accessor::get(std::declval(), int())) && set_noexcept())>()) { value_type x = Accessor::get(obj, index); Accessor::set(obj, index, --x); return *this; } template Vc_INTRINSIC value_type operator--(int) && noexcept(noexcept(std::declval() = Accessor::get(std::declval(), int())) && set_noexcept()--)>()) { const value_type r = Accessor::get(obj, index); value_type x = r; Accessor::set(obj, index, --x); return r; } friend void swap(ElementReference &&a, ElementReference &&b) { value_type tmp(a); static_cast(a) = static_cast(b); static_cast(b) = tmp; } friend void swap(value_type &a, ElementReference &&b) { value_type tmp(a); a = static_cast(b); static_cast(b) = tmp; } friend void swap(ElementReference &&a, value_type &b) { value_type tmp(a); static_cast(a) = b; b = tmp; } private: int index; U &obj; }; } // namespace Detail } // namespace Vc #endif // VC_COMMON_ELEMENTREFERENCE_H_ // vim: foldmethod=marker Vc-1.3.3/common/exponential.h000066400000000000000000000070361320703111200160510ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2012-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------- The exp implementation is derived from Cephes, which carries the following Copyright notice: Cephes Math Library Release 2.2: June, 1992 Copyright 1984, 1987, 1989 by Stephen L. Moshier Direct inquiries to 30 Frost Street, Cambridge, MA 02140 }}}*/ #ifdef Vc_COMMON_MATH_H_INTERNAL constexpr float log2_e = 1.44269504088896341f; constexpr float MAXLOGF = 88.72283905206835f; constexpr float MINLOGF = -103.278929903431851103f; /* log(2^-149) */ constexpr float MAXNUMF = 3.4028234663852885981170418348451692544e38f; template ::value || std::is_same::value>> inline Vector exp(Vector x) { using V = Vector; typedef typename V::Mask M; typedef Detail::Const C; const M overflow = x > MAXLOGF; const M underflow = x < MINLOGF; // log₂(eˣ) = x * log₂(e) * log₂(2) // = log₂(2^(x * log₂(e))) // => eˣ = 2^(x * log₂(e)) // => n = ⌊x * log₂(e) + ½⌋ // => y = x - n * ln(2) | recall that: ln(2) * log₂(e) == 1 // <=> eˣ = 2ⁿ * eʸ V z = floor(C::log2_e() * x + 0.5f); const auto n = static_cast>(z); x -= z * C::ln2_large(); x -= z * C::ln2_small(); /* Theoretical peak relative error in [-0.5, +0.5] is 4.2e-9. */ z = ((((( 1.9875691500E-4f * x + 1.3981999507E-3f) * x + 8.3334519073E-3f) * x + 4.1665795894E-2f) * x + 1.6666665459E-1f) * x + 5.0000001201E-1f) * (x * x) + x + 1.0f; x = ldexp(z, n); // == z * 2ⁿ x(overflow) = std::numeric_limits::infinity(); x.setZero(underflow); return x; } #endif // Vc_COMMON_MATH_H_INTERNAL Vc-1.3.3/common/fix_clang_emmintrin.h000066400000000000000000000061121320703111200175310ustar00rootroot00000000000000/*{{{ Copyright (C) 2013-2015 Matthias Kretz Permission to use, copy, modify, and distribute this software and its documentation for any purpose and without fee is hereby granted, provided that the above copyright notice appear in all copies and that both that the copyright notice and this permission notice and warranty disclaimer appear in supporting documentation, and that the name of the author not be used in advertising or publicity pertaining to distribution of the software without specific, written prior permission. The author disclaim all warranties with regard to this software, including all implied warranties of merchantability and fitness. In no event shall the author be liable for any special, indirect or consequential damages or any damages whatsoever resulting from loss of use, data or profits, whether in an action of contract, negligence or other tortious action, arising out of or in connection with the use or performance of this software. 
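A numeric check of the range reduction used by exp() in common/exponential.h
above (illustrative):

    x = 10:  n = floor(10 * log2(e) + 0.5) = floor(14.93) = 14
             y = 10 - 14 * ln(2)           ~ 0.29594
             e^y                           ~ 1.34439
             2^14 * e^y                    ~ 22026.5 ~ e^10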
}}}*/ #ifndef VC_COMMON_FIX_CLANG_EMMINTRIN_H_ #define VC_COMMON_FIX_CLANG_EMMINTRIN_H_ #include #if (defined Vc_CLANG && Vc_CLANG < 0x30700) || (defined Vc_APPLECLANG && Vc_APPLECLANG < 0x70000) #ifdef _mm_slli_si128 #undef _mm_slli_si128 #define _mm_slli_si128(a, count) __extension__ ({ \ (__m128i)__builtin_ia32_pslldqi128((__m128i)(a), (count)*8); }) #endif #ifdef _mm_srli_si128 #undef _mm_srli_si128 #define _mm_srli_si128(a, count) __extension__ ({ \ (__m128i)__builtin_ia32_psrldqi128((__m128i)(a), (count)*8); }) #endif #ifdef _mm_shuffle_epi32 #undef _mm_shuffle_epi32 #define _mm_shuffle_epi32(a, imm) __extension__ ({ \ (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), (__v4si) _mm_set1_epi32(0), \ (imm) & 0x3, ((imm) & 0xc) >> 2, \ ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); }) #endif #ifdef _mm_shufflelo_epi16 #undef _mm_shufflelo_epi16 #define _mm_shufflelo_epi16(a, imm) __extension__ ({ \ (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), (__v8hi) _mm_set1_epi16(0), \ (imm) & 0x3, ((imm) & 0xc) >> 2, \ ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \ 4, 5, 6, 7); }) #endif #ifdef _mm_shufflehi_epi16 #undef _mm_shufflehi_epi16 #define _mm_shufflehi_epi16(a, imm) __extension__ ({ \ (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), (__v8hi) _mm_set1_epi16(0), \ 0, 1, 2, 3, \ 4 + (((imm) & 0x03) >> 0), \ 4 + (((imm) & 0x0c) >> 2), \ 4 + (((imm) & 0x30) >> 4), \ 4 + (((imm) & 0xc0) >> 6)); }) #endif #ifdef _mm_shuffle_pd #undef _mm_shuffle_pd #define _mm_shuffle_pd(a, b, i) __extension__ ({ \ __builtin_shufflevector((__m128d)(a), (__m128d)(b), (i) & 1, (((i) & 2) >> 1) + 2); }) #endif #endif // Vc_CLANG || Vc_APPLECLANG #endif // VC_COMMON_FIX_CLANG_EMMINTRIN_H_ Vc-1.3.3/common/gatherimplementation.h000066400000000000000000000225261320703111200177440ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2014-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
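A note on the bit manipulation used by the masked gather loops below
(illustrative): _bit_scan_forward yields the index of the lowest set mask bit
and "bits &= bits - 1" clears it, so each iteration handles exactly one active
lane:

    bits = 0b10110 -> bsf = 1 -> bits &= bits - 1 -> 0b10100
    bits = 0b10100 -> bsf = 2 -> bits &= bits - 1 -> 0b10000
    bits = 0b10000 -> bsf = 4 -> bits &= bits - 1 -> 0b00000 (done)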
}}}*/ #ifndef VC_COMMON_GATHERIMPLEMENTATION_H_ #define VC_COMMON_GATHERIMPLEMENTATION_H_ #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { namespace Common { enum class GatherScatterImplementation : int { SimpleLoop, SetIndexZero, BitScanLoop, PopcntSwitch }; using SimpleLoopT = std::integral_constant; using SetIndexZeroT = std::integral_constant; using BitScanLoopT = std::integral_constant; using PopcntSwitchT = std::integral_constant; template Vc_ALWAYS_INLINE void executeGather(SetIndexZeroT, V &v, const MT *mem, IT &&indexes_, typename V::MaskArgument mask) { auto indexes = std::forward(indexes_); indexes.setZeroInverted(static_cast(mask)); const V tmp(mem, indexes); where(mask) | v = tmp; } template Vc_ALWAYS_INLINE void executeGather(SimpleLoopT, V &v, const MT *mem, const IT &indexes, typename V::MaskArgument mask) { if (Vc_IS_UNLIKELY(mask.isEmpty())) { return; } Common::unrolled_loop([&](std::size_t i) { if (mask[i]) v[i] = mem[indexes[i]]; }); } template Vc_ALWAYS_INLINE void executeGather(BitScanLoopT, V &v, const MT *mem, const IT &indexes, typename V::MaskArgument mask) { #ifdef Vc_GNU_ASM size_t bits = mask.toInt(); while (Vc_IS_LIKELY(bits > 0)) { size_t i, j; asm("bsf %[bits],%[i]\n\t" "bsr %[bits],%[j]\n\t" "btr %[i],%[bits]\n\t" "btr %[j],%[bits]\n\t" : [i] "=r"(i), [j] "=r"(j), [bits] "+r"(bits)); v[i] = mem[indexes[i]]; v[j] = mem[indexes[j]]; } #else // Alternative from Vc::SSE (0.7) int bits = mask.toInt(); while (bits) { const int i = _bit_scan_forward(bits); bits &= bits - 1; v[i] = mem[indexes[i]]; } #endif // Vc_GNU_ASM } template Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT, V &v, const MT *mem, const IT &indexes, typename V::MaskArgument mask, enable_if = nullarg) { unsigned int bits = mask.toInt(); unsigned int low, high = 0; switch (Vc::Detail::popcnt16(bits)) { case 16: v.gather(mem, indexes); break; case 15: low = _bit_scan_forward(bits); bits ^= 1 << low; v[low] = mem[indexes[low]]; case 14: high = _bit_scan_reverse(bits); v[high] = mem[indexes[high]]; high = (1 << high); case 13: low = _bit_scan_forward(bits); bits ^= high | (1 << low); v[low] = mem[indexes[low]]; case 12: high = _bit_scan_reverse(bits); v[high] = mem[indexes[high]]; high = (1 << high); case 11: low = _bit_scan_forward(bits); bits ^= high | (1 << low); v[low] = mem[indexes[low]]; case 10: high = _bit_scan_reverse(bits); v[high] = mem[indexes[high]]; high = (1 << high); case 9: low = _bit_scan_forward(bits); bits ^= high | (1 << low); v[low] = mem[indexes[low]]; case 8: high = _bit_scan_reverse(bits); v[high] = mem[indexes[high]]; high = (1 << high); case 7: low = _bit_scan_forward(bits); bits ^= high | (1 << low); v[low] = mem[indexes[low]]; case 6: high = _bit_scan_reverse(bits); v[high] = mem[indexes[high]]; high = (1 << high); case 5: low = _bit_scan_forward(bits); bits ^= high | (1 << low); v[low] = mem[indexes[low]]; case 4: high = _bit_scan_reverse(bits); v[high] = mem[indexes[high]]; high = (1 << high); case 3: low = _bit_scan_forward(bits); bits ^= high | (1 << low); v[low] = mem[indexes[low]]; case 2: high = _bit_scan_reverse(bits); v[high] = mem[indexes[high]]; case 1: low = _bit_scan_forward(bits); v[low] = mem[indexes[low]]; case 0: break; } } template Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT, V &v, const MT *mem, const IT &indexes, typename V::MaskArgument mask, enable_if = nullarg) { unsigned int bits = mask.toInt(); unsigned int low, high = 0; switch (Vc::Detail::popcnt8(bits)) { case 8: v.gather(mem, indexes); break; case 7: low = 
_bit_scan_forward(bits); bits ^= 1 << low; v[low] = mem[indexes[low]]; case 6: high = _bit_scan_reverse(bits); v[high] = mem[indexes[high]]; high = (1 << high); case 5: low = _bit_scan_forward(bits); bits ^= high | (1 << low); v[low] = mem[indexes[low]]; case 4: high = _bit_scan_reverse(bits); v[high] = mem[indexes[high]]; high = (1 << high); case 3: low = _bit_scan_forward(bits); bits ^= high | (1 << low); v[low] = mem[indexes[low]]; case 2: high = _bit_scan_reverse(bits); v[high] = mem[indexes[high]]; case 1: low = _bit_scan_forward(bits); v[low] = mem[indexes[low]]; case 0: break; } } template Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT, V &v, const MT *mem, const IT &indexes, typename V::MaskArgument mask, enable_if = nullarg) { unsigned int bits = mask.toInt(); unsigned int low, high = 0; switch (Vc::Detail::popcnt4(bits)) { case 4: v.gather(mem, indexes); break; case 3: low = _bit_scan_forward(bits); bits ^= 1 << low; v[low] = mem[indexes[low]]; case 2: high = _bit_scan_reverse(bits); v[high] = mem[indexes[high]]; case 1: low = _bit_scan_forward(bits); v[low] = mem[indexes[low]]; case 0: break; } } template Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT, V &v, const MT *mem, const IT &indexes, typename V::MaskArgument mask, enable_if = nullarg) { unsigned int bits = mask.toInt(); unsigned int low; switch (Vc::Detail::popcnt4(bits)) { case 2: v.gather(mem, indexes); break; case 1: low = _bit_scan_forward(bits); v[low] = mem[indexes[low]]; case 0: break; } } } // namespace Common } // namespace Vc #endif // VC_COMMON_GATHERIMPLEMENTATION_H_ Vc-1.3.3/common/gatherinterface.h000066400000000000000000000665461320703111200166710ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2014-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ #ifndef Vc_CURRENT_CLASS_NAME #error "incorrect use of common/gatherinterface.h: Vc_CURRENT_CLASS_NAME must be defined to the current class name for declaring constructors." #endif /////////////////////////////////////////////////////////////////////////////////////////// // gathers // A gather takes the following arguments: // 1. 
A const pointer to memory of any type that can convert to EntryType // 2. An indexes “vector”. The requirement is that the type implements the subscript operator, // stores «Size» valid index values, and each offset to the pointer above yields a valid // memory location for reading. // 3. Optionally the third argument may be a mask. The mask disables several memory reads and // thus removes the requirements in (2.) for the disabled entries. private: /**\internal * This function implements a gather given a pointer to memory \p mem and some * container object storing the gather \p indexes. * * \param mem This pointer must be aligned correctly for the type \p MT. This is the * natural behavior of C++, so this is typically the case. * \param indexes This object contains at least \VSize{T} indexes that denote the * offset in \p mem where the components for the current vector should be copied from. * The offset is not in Bytes, but in multiples of `sizeof(MT)`. */ // enable_if::value && // has_subscript_operator::value> template inline void gatherImplementation(const MT *mem, const IT &indexes); /**\internal * This overload of the above function adds a \p mask argument to disable memory * accesses at the \p indexes offsets where \p mask is \c false. */ template inline void gatherImplementation(const MT *mem, const IT &indexes, MaskArgument mask); /**\internal * Overload for the case of C-arrays or %Vc vector objects. * * In this case the \p indexes parameter is usable without adjustment. * * \param indexes An object to be used for gather or scatter. * \returns Forwards the \p indexes parameter. */ template ::value || Traits::is_simd_vector::value>> static Vc_INTRINSIC const IT &adjustIndexParameter(const IT &indexes) { return indexes; } /**\internal * Overload for the case of a container that returns an lvalue reference from its * subscript operator. * * In this case the container is assumed to use contiguous storage and therefore the * \p indexes object is converted to a C-array interface. * * \param indexes An object to be used for gather or scatter. * \returns A pointer to the first object in the \p indexes container. */ template < typename IT, typename = enable_if< !std::is_pointer::value && !Traits::is_simd_vector::value && std::is_lvalue_reference()[0])>::value>> static Vc_INTRINSIC decltype(std::addressof(std::declval()[0])) adjustIndexParameter(const IT &i) { return std::addressof(i[0]); } /**\internal * Overload for the case of a container that returns an rvalue from its * subscript operator. * * \param indexes An object to be used for gather or scatter. * \returns Forwards the \p indexes parameter. 
*/ template static Vc_INTRINSIC enable_if< !std::is_pointer::value && !Traits::is_simd_vector::value && !std::is_lvalue_reference()[0])>::value, IT> adjustIndexParameter(const IT &i) { return i; } public: #define Vc_ASSERT_GATHER_PARAMETER_TYPES_ \ static_assert( \ std::is_convertible::value, \ "The memory pointer needs to point to a type that can be converted to the " \ "EntryType of this SIMD vector type."); \ static_assert( \ Vc::Traits::has_subscript_operator::value, \ "The indexes argument must be a type that implements the subscript operator."); \ static_assert( \ !Traits::is_simd_vector::value || \ Traits::simd_vector_size::value >= Size, \ "If you use a SIMD vector for the indexes parameter, the index vector must " \ "have at least as many entries as this SIMD vector."); \ static_assert( \ !std::is_array::value || \ (std::rank::value == 1 && \ (std::extent::value == 0 || std::extent::value >= Size)), \ "If you use a simple array for the indexes parameter, the array must have " \ "at least as many entries as this SIMD vector.") /** * \name Gather constructors and member functions * * Constructs or loads a vector from the objects at `mem[indexes[0]]`, * `mem[indexes[1]]`, `mem[indexes[2]]`, ... * * All gather functions optionally take a mask as last argument. In that case only the * entries that are selected in the mask are accessed in memory and copied to the * vector. This enables invalid indexes in the \p indexes vector if those are masked * off in \p mask. * * Gathers from structured data (AoS: arrays of struct) are possible via a special * subscript operator of the container (array). You can use \ref Vc::array and \ref * Vc::vector as drop-in replacements for \c std::array and \c std::vector. These * container classes contain the necessary subscript operator overload. Example: * \code * Vc::vector data(100); * std::iota(data.begin(), data.end(), 0.f); // fill with values 0, 1, 2, ... * auto indexes = float_v::IndexType::IndexesFromZero(); * float_v gathered = data[indexes]; // gathered == [0, 1, 2, ...] * \endcode * * Alternatively, you can use Vc::Common::AdaptSubscriptOperator to extend a given * container class with the necessary subscript operator. Example: * \code * template > * using my_vector = Vc::Common::AdaptSubscriptOperator>; * \endcode * * \param mem A pointer to memory which contains objects of type \p MT at the offsets * given by \p indexes. * \param indexes A container/vector of offsets into \p mem. * The type of \p indexes (\p IT) may either be a pointer to integers * (C-array) or a vector of integers (preferrably IndexType). * \param mask If a mask is given, only the active entries will be copied from memory. * * \note If you use a masked gather constructor the masked-off entries of the vector * are zero-initilized. 
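*
* A masked-gather sketch continuing the example above (illustrative; reuses the
* `data` and `indexes` objects):
* \code
* float_v gathered = data[indexes];         // as above
* float_m mask = gathered > 2.f;            // keep only some lanes
* float_v partial(&data[0], indexes, mask); // masked-off lanes are zero
* \endcode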
*/ ///@{ /// Gather constructor template ::value>> Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation(mem, adjustIndexParameter(indexes)); } /// Masked gather constructor template ::value>> Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes, MaskArgument mask) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation(mem, adjustIndexParameter(indexes), mask); } /// Gather function template ::value>> Vc_INTRINSIC void gather(const MT *mem, const IT &indexes) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation(mem, adjustIndexParameter(indexes)); } /// Masked gather function template ::value>> Vc_INTRINSIC void gather(const MT *mem, const IT &indexes, MaskArgument mask) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation(mem, adjustIndexParameter(indexes), mask); } ///@} /// \name Deprecated Members ///@{ /** * \deprecated Use Vc::array or Vc::vector subscripting instead. * * \param array A pointer into memory (without alignment restrictions). * \param member1 If \p array points to a struct, \p member1 determines the member in the struct to * be read. Thus the offsets in \p indexes are relative to the \p array and not to * the size of the gathered type (i.e. array[i].*member1 is accessed instead of * (&(array->*member1))[i]) * \param indexes Determines the offsets into \p array where the values are gathered from/scattered * to. The type of indexes can either be an integer vector or a type that supports * operator[] access. */ template Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector " "instead.") inline Vc_CURRENT_CLASS_NAME(const S1 *array, const EntryType S1::*member1, IT indexes) { gather(Common::SubscriptOperation, true>( array, indexes)[member1] .gatherArguments()); } /** * \deprecated Use Vc::array or Vc::vector subscripting instead. * * \param array A pointer into memory (without alignment restrictions). * \param member1 If \p array points to a struct, \p member1 determines the member in the struct to * be read. Thus the offsets in \p indexes are relative to the \p array and not to * the size of the gathered type (i.e. array[i].*member1 is accessed instead of * (&(array->*member1))[i]) * \param indexes Determines the offsets into \p array where the values are gathered from/scattered * to. The type of indexes can either be an integer vector or a type that supports * operator[] access. * \param mask If a mask is given only the active entries will be gathered/scattered. */ template Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector " "instead.") inline Vc_CURRENT_CLASS_NAME(const S1 *array, const EntryType S1::*member1, IT indexes, MaskArgument mask) { gather(Common::SubscriptOperation, true>( array, indexes)[member1] .gatherArguments(), mask); } /** * \deprecated Use Vc::array or Vc::vector subscripting instead. * * \param array A pointer into memory (without alignment restrictions). * \param member1 If \p array points to a struct, \p member1 determines the member in the struct to * be read. Thus the offsets in \p indexes are relative to the \p array and not to * the size of the gathered type (i.e. array[i].*member1 is accessed instead of * (&(array->*member1))[i]) * \param member2 If \p member1 is a struct then \p member2 selects the member to be read from that * struct (i.e. array[i].*member1.*member2 is read). * \param indexes Determines the offsets into \p array where the values are gathered from/scattered * to. 
The type of indexes can either be an integer vector or a type that supports * operator[] access. */ template Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector " "instead.") inline Vc_CURRENT_CLASS_NAME(const S1 *array, const S2 S1::*member1, const EntryType S2::*member2, IT indexes) { gather(Common::SubscriptOperation, true>( array, indexes)[member1][member2] .gatherArguments()); } /** * \deprecated Use Vc::array or Vc::vector subscripting instead. * * \param array A pointer into memory (without alignment restrictions). * \param member1 If \p array points to a struct, \p member1 determines the member in the struct to * be read. Thus the offsets in \p indexes are relative to the \p array and not to * the size of the gathered type (i.e. array[i].*member1 is accessed instead of * (&(array->*member1))[i]) * \param member2 If \p member1 is a struct then \p member2 selects the member to be read from that * struct (i.e. array[i].*member1.*member2 is read). * \param indexes Determines the offsets into \p array where the values are gathered from/scattered * to. The type of indexes can either be an integer vector or a type that supports * operator[] access. * \param mask If a mask is given only the active entries will be gathered/scattered. */ template Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector " "instead.") inline Vc_CURRENT_CLASS_NAME(const S1 *array, const S2 S1::*member1, const EntryType S2::*member2, IT indexes, MaskArgument mask) { gather(Common::SubscriptOperation, true>( array, indexes)[member1][member2] .gatherArguments(), mask); } /** * \deprecated Use Vc::array or Vc::vector subscripting instead. * * \param array A pointer into memory (without alignment restrictions). * \param ptrMember1 If \p array points to a struct, \p member1 determines the member in the struct to * be read. Thus the offsets in \p indexes are relative to the \p array and not to * the size of the gathered type (i.e. array[i].*member1 is accessed instead of * (&(array->*member1))[i]) * \param outerIndexes * \param innerIndexes */ template Vc_DEPRECATED( "use the subscript operator to Vc::array or Vc::vector " "instead.") inline Vc_CURRENT_CLASS_NAME(const S1 *array, const EntryType *const S1::*ptrMember1, IT1 outerIndexes, IT2 innerIndexes) { gather(Common::SubscriptOperation, true>( array, outerIndexes)[ptrMember1][innerIndexes] .gatherArguments()); } /** * \deprecated Use Vc::array or Vc::vector subscripting instead. * * \param array A pointer into memory (without alignment restrictions). * \param ptrMember1 If \p array points to a struct, \p member1 determines the member in the struct to * be read. Thus the offsets in \p indexes are relative to the \p array and not to * the size of the gathered type (i.e. array[i].*member1 is accessed instead of * (&(array->*member1))[i]) * \param outerIndexes * \param innerIndexes * \param mask If a mask is given only the active entries will be gathered/scattered. */ template Vc_DEPRECATED( "use the subscript operator to Vc::array or Vc::vector " "instead.") inline Vc_CURRENT_CLASS_NAME(const S1 *array, const EntryType *const S1::*ptrMember1, IT1 outerIndexes, IT2 innerIndexes, MaskArgument mask) { gather(Common::SubscriptOperation, true>( array, outerIndexes)[ptrMember1][innerIndexes] .gatherArguments(), mask); } /** * \deprecated Use Vc::array or Vc::vector subscripting instead. * * \param array A pointer into memory (without alignment restrictions). 
* \param member1 If \p array points to a struct, \p member1 determines the member in the struct to * be read. Thus the offsets in \p indexes are relative to the \p array and not to * the size of the gathered type (i.e. array[i].*member1 is accessed instead of * (&(array->*member1))[i]) * \param indexes Determines the offsets into \p array where the values are gathered from/scattered * to. The type of indexes can either be an integer vector or a type that supports * operator[] access. */ template Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector " "instead.") inline void gather(const S1 *array, const EntryType S1::*member1, IT indexes) { gather(Common::SubscriptOperation, true>( array, indexes)[member1] .gatherArguments()); } /** * \deprecated Use Vc::array or Vc::vector subscripting instead. * * \param array A pointer into memory (without alignment restrictions). * \param member1 If \p array points to a struct, \p member1 determines the member in the struct to * be read. Thus the offsets in \p indexes are relative to the \p array and not to * the size of the gathered type (i.e. array[i].*member1 is accessed instead of * (&(array->*member1))[i]) * \param indexes Determines the offsets into \p array where the values are gathered from/scattered * to. The type of indexes can either be an integer vector or a type that supports * operator[] access. * \param mask If a mask is given only the active entries will be gathered/scattered. */ template Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector " "instead.") inline void gather(const S1 *array, const EntryType S1::*member1, IT indexes, MaskArgument mask) { gather(Common::SubscriptOperation, true>( array, indexes)[member1] .gatherArguments(), mask); } /** * \deprecated Use Vc::array or Vc::vector subscripting instead. * * \param array A pointer into memory (without alignment restrictions). * \param member1 If \p array points to a struct, \p member1 determines the member in the struct to * be read. Thus the offsets in \p indexes are relative to the \p array and not to * the size of the gathered type (i.e. array[i].*member1 is accessed instead of * (&(array->*member1))[i]) * \param member2 If \p member1 is a struct then \p member2 selects the member to be read from that * struct (i.e. array[i].*member1.*member2 is read). * \param indexes Determines the offsets into \p array where the values are gathered from/scattered * to. The type of indexes can either be an integer vector or a type that supports * operator[] access. */ template Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector " "instead.") inline void gather(const S1 *array, const S2 S1::*member1, const EntryType S2::*member2, IT indexes) { gather(Common::SubscriptOperation, true>( array, indexes)[member1][member2] .gatherArguments()); } /** * \deprecated Use Vc::array or Vc::vector subscripting instead. * * \param array A pointer into memory (without alignment restrictions). * \param member1 If \p array points to a struct, \p member1 determines the member in the struct to * be read. Thus the offsets in \p indexes are relative to the \p array and not to * the size of the gathered type (i.e. array[i].*member1 is accessed instead of * (&(array->*member1))[i]) * \param member2 If \p member1 is a struct then \p member2 selects the member to be read from that * struct (i.e. array[i].*member1.*member2 is read). * \param indexes Determines the offsets into \p array where the values are gathered from/scattered * to. 
The type of indexes can either be an integer vector or a type that supports * operator[] access. * \param mask If a mask is given only the active entries will be gathered/scattered. */ template Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector " "instead.") inline void gather(const S1 *array, const S2 S1::*member1, const EntryType S2::*member2, IT indexes, MaskArgument mask) { gather(Common::SubscriptOperation, true>( array, indexes)[member1][member2] .gatherArguments(), mask); } /** * \deprecated Use Vc::array or Vc::vector subscripting instead. * * \param array A pointer into memory (without alignment restrictions). * \param ptrMember1 If \p array points to a struct, \p member1 determines the member in the struct to * be read. Thus the offsets in \p indexes are relative to the \p array and not to * the size of the gathered type (i.e. array[i].*member1 is accessed instead of * (&(array->*member1))[i]) * \param outerIndexes * \param innerIndexes */ template Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector " "instead.") inline void gather(const S1 *array, const EntryType *const S1::*ptrMember1, IT1 outerIndexes, IT2 innerIndexes) { gather(Common::SubscriptOperation, true>( array, outerIndexes)[ptrMember1][innerIndexes] .gatherArguments()); } /** * \deprecated Use Vc::array or Vc::vector subscripting instead. * * \param array A pointer into memory (without alignment restrictions). * \param ptrMember1 If \p array points to a struct, \p member1 determines the member in the struct to * be read. Thus the offsets in \p indexes are relative to the \p array and not to * the size of the gathered type (i.e. array[i].*member1 is accessed instead of * (&(array->*member1))[i]) * \param outerIndexes * \param innerIndexes * \param mask If a mask is given only the active entries will be gathered/scattered. */ template Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector " "instead.") inline void gather(const S1 *array, const EntryType *const S1::*ptrMember1, IT1 outerIndexes, IT2 innerIndexes, MaskArgument mask) { gather(Common::SubscriptOperation, true>( array, outerIndexes)[ptrMember1][innerIndexes] .gatherArguments(), mask); } ///@} /**\internal * \name Gather function to use from Vc::Common::subscript_operator * * \param args * \param mask */ ///@{ template Vc_INTRINSIC void gather(const Common::GatherArguments &args) { gather(args.address, adjustIndexParameter(args.indexes)); } template Vc_INTRINSIC void gather(const Common::GatherArguments &args, MaskArgument mask) { gather(args.address, adjustIndexParameter(args.indexes), mask); } ///@} #undef Vc_ASSERT_GATHER_PARAMETER_TYPES_ Vc-1.3.3/common/generalinterface.h000066400000000000000000000050411320703111200170130ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2014-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ public: /////////////////////////////////////////////////////////////////////////// // init to zero Vector() = default; /////////////////////////////////////////////////////////////////////////// // types /////////////////////////////////////////////////////////////////////////// // constants static constexpr std::size_t size() { return Size; } /////////////////////////////////////////////////////////////////////////// // constant Vectors explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerZero) Vc_INTRINSIC_R; explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerOne) Vc_INTRINSIC_R; explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerIndexesFromZero) Vc_INTRINSIC_R; static Vc_INTRINSIC Vc_CONST Vector Zero() { return Vector(Vc::Zero); } static Vc_INTRINSIC Vc_CONST Vector One() { return Vector(Vc::One); } static Vc_INTRINSIC Vc_CONST Vector IndexesFromZero() { return Vector(Vc::IndexesFromZero); } // vim: foldmethod=marker Vc-1.3.3/common/iif.h000066400000000000000000000075121320703111200142710ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2012-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ #ifndef VC_COMMON_IIF_H_ #define VC_COMMON_IIF_H_ #include #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { /** * \ingroup Utilities * * Function to mimic the ternary operator '?:' (inline-if). * * \param condition Determines which values are returned. 
This is analog to the first argument to * the ternary operator. * \param trueValue The values to return where \p condition is \c true. * \param falseValue The values to return where \p condition is \c false. * \return A combination of entries from \p trueValue and \p falseValue, according to \p condition. * * So instead of the scalar variant * \code * float x = a > 1.f ? b : b + c; * \endcode * you'd write * \code * float_v x = Vc::iif (a > 1.f, b, b + c); * \endcode * * Assuming \c a has the values [0, 3, 5, 1], \c b is [1, 1, 1, 1], and \c c is [1, 2, 3, 4], then x * will be [2, 2, 3, 5]. */ template Vc_ALWAYS_INLINE enable_if::value && is_simd_vector::value, T> iif( const Mask &condition, const T &trueValue, const T &falseValue) { T result(falseValue); Vc::where(condition) | result = trueValue; return result; } /**\internal * The following declaration makes it explicit that `iif (Mask, non-vector, non-vector)` * is not supposed to work. Doing the same thing with \c static_assert would break SFINAE. */ template enable_if::value && !is_simd_vector::value, T> iif( const Mask &, const T &, const T &) = delete; /** * \ingroup Utilities * * Overload of the above for boolean conditions. * * This typically results in direct use of the ternary operator. This function makes it easier to * switch from a Vc type to a builtin type. * * \param condition Determines which value is returned. This is analog to the first argument to * the ternary operator. * \param trueValue The value to return if \p condition is \c true. * \param falseValue The value to return if \p condition is \c false. * \return Either \p trueValue or \p falseValue, depending on \p condition. */ template constexpr T iif (bool condition, const T &trueValue, const T &falseValue) { return condition ? trueValue : falseValue; } } // namespace Vc #endif // VC_COMMON_IIF_H_ Vc-1.3.3/common/indexsequence.h000066400000000000000000000062631320703111200163640ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2014-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
}}}*/ #ifndef VC_COMMON_INDEXSEQUENCE_H_ #define VC_COMMON_INDEXSEQUENCE_H_ #include namespace Vc_VERSIONED_NAMESPACE { /** \internal * Helper class for a sequence of size_t values from 0 to N. This type will be included in * C++14. */ template struct index_sequence { static constexpr std::size_t size() noexcept { return sizeof...(I); } }; /** \internal * This struct builds an index_sequence type from a given upper bound \p N. * It does so recursively via concatenation of to index sequences of length N/2. */ template struct make_index_sequence_impl { template static index_sequence join(std::false_type, index_sequence); template static index_sequence join( std::true_type, index_sequence); using is_odd = std::integral_constant; using half = typename make_index_sequence_impl::type; using type = decltype(join<(N + 1) / 2>(is_odd(), half())); }; template <> struct make_index_sequence_impl<0> { using type = index_sequence<>; }; template <> struct make_index_sequence_impl<1> { using type = index_sequence<0>; }; template <> struct make_index_sequence_impl<2> { using type = index_sequence<0, 1>; }; /** \internal * Creates an index_sequence type for the upper bound \p N. */ template using make_index_sequence = typename make_index_sequence_impl::type; } #endif // VC_COMMON_INDEXSEQUENCE_H_ // vim: foldmethod=marker Vc-1.3.3/common/interleave.h000066400000000000000000000050331320703111200156540ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2014-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ #ifndef VC_COMMON_INTERLEAVE_H_ #define VC_COMMON_INTERLEAVE_H_ #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { /** \ingroup Utilities Interleaves the entries from \p a and \p b into two vectors of the same type. The order in the returned vector contains the elements `a[0], b[0], a[1], b[1], a[2], b[2], a[3], b[3], ...`. 
Example: \code Vc::SimdArray a = { 1, 2, 3, 4 }; Vc::SimdArray b = { 9, 8, 7, 6 }; std::tie(a, b) = Vc::interleave(a, b); std::cout << a << b; // prints: // <1 9 2 8><3 7 4 6> \endcode \param a input vector whose data will appear at even indexes in the output \param b input vector whose data will appear at odd indexes in the output \return two vectors with data from \p a and \p b interleaved */ template ::value>> std::pair interleave(const V &a, const V &b) { return {a.interleaveLow(b), a.interleaveHigh(b)}; } } // namespace Vc #endif // VC_COMMON_INTERLEAVE_H_ // vim: foldmethod=marker Vc-1.3.3/common/interleavedmemory.h000066400000000000000000000276121320703111200172600ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2012-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ #ifndef VC_COMMON_INTERLEAVEDMEMORY_H_ #define VC_COMMON_INTERLEAVEDMEMORY_H_ #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { namespace Common { /** * \internal */ template struct InterleavedMemoryAccessBase { // Partial specialization doesn't work for functions without partial specialization of the whole // class. Therefore we capture the contents of InterleavedMemoryAccessBase in a macro to easily // copy it into its specializations. typedef typename std::conditional< Readonly, typename std::add_const::type, typename V::EntryType>::type T; typedef typename V::AsArg VArg; typedef T Ta Vc_MAY_ALIAS; const I m_indexes; Ta *const m_data; Vc_ALWAYS_INLINE InterleavedMemoryAccessBase(typename I::AsArg indexes, Ta *data) : m_indexes(indexes), m_data(data) { } // implementations of the following are in {scalar,sse,avx}/detail.h template Vc_INTRINSIC void deinterleave(Vs &&... 
vs) const { Impl::deinterleave(m_data, m_indexes, std::forward(vs)...); } protected: using Impl = Vc::Detail::InterleaveImpl; template Vc_INTRINSIC void callInterleave(T &&a, index_sequence) { Impl::interleave(m_data, m_indexes, a[Indexes]...); } }; /** * \internal */ // delay execution of the deinterleaving gather until operator= template struct InterleavedMemoryReadAccess : public InterleavedMemoryAccessBase { typedef InterleavedMemoryAccessBase Base; typedef typename Base::Ta Ta; Vc_ALWAYS_INLINE InterleavedMemoryReadAccess(Ta *data, typename I::AsArg indexes) : Base(StructSize == 1u ? indexes : StructSize == 2u ? indexes << 1 : StructSize == 4u ? indexes << 2 : StructSize == 8u ? indexes << 3 : StructSize == 16u ? indexes << 4 : indexes * I(int(StructSize)), data) { } template Vc_ALWAYS_INLINE T deinterleave_unpack(index_sequence) const { T r; Base::Impl::deinterleave(this->m_data, this->m_indexes, std::get(r)...); return r; } template ::value && std::is_same( std::declval()))>>::value)>> Vc_ALWAYS_INLINE operator T() const { return deinterleave_unpack(make_index_sequence::value>()); } }; ///\internal Runtime check (NDEBUG) for asserting unique indexes. template struct CheckIndexesUnique { #ifdef NDEBUG static Vc_INTRINSIC void test(const I &) {} #else static void test(const I &indexes) { const I test = indexes.sorted(); Vc_ASSERT(I::Size == 1 || (test == test.rotated(1)).isEmpty()) } #endif }; ///\internal For SuccessiveEntries there can never be a problem. template struct CheckIndexesUnique > { static Vc_INTRINSIC void test(const SuccessiveEntries &) {} }; /** * \internal */ template struct InterleavedMemoryAccess : public InterleavedMemoryReadAccess { typedef InterleavedMemoryAccessBase Base; typedef typename Base::Ta Ta; Vc_ALWAYS_INLINE InterleavedMemoryAccess(Ta *data, typename I::AsArg indexes) : InterleavedMemoryReadAccess(data, indexes) { CheckIndexesUnique::test(indexes); } template Vc_ALWAYS_INLINE void operator=(VectorReferenceArray &&rhs) { static_assert(N <= StructSize, "You_are_trying_to_scatter_more_data_into_the_struct_than_it_has"); this->callInterleave(std::move(rhs), make_index_sequence()); } template Vc_ALWAYS_INLINE void operator=(VectorReferenceArray &&rhs) { static_assert(N <= StructSize, "You_are_trying_to_scatter_more_data_into_the_struct_than_it_has"); this->callInterleave(std::move(rhs), make_index_sequence()); } }; /** * Wraps a pointer to memory with convenience functions to access it via vectors. * * \param S The type of the struct. * \param V The type of the vector to be returned when read. This should reflect the type of the * members inside the struct. * * \see operator[] * \ingroup Utilities * \headerfile interleavedmemory.h */ template class InterleavedMemoryWrapper { typedef typename std::conditional::value, const typename V::EntryType, typename V::EntryType>::type T; typedef typename V::IndexType I; typedef typename V::AsArg VArg; typedef const I &IndexType; static constexpr std::size_t StructSize = sizeof(S) / sizeof(T); typedef InterleavedMemoryAccess Access; typedef InterleavedMemoryReadAccess ReadAccess; typedef InterleavedMemoryAccess > AccessSuccessiveEntries; typedef InterleavedMemoryReadAccess > ReadSuccessiveEntries; typedef T Ta Vc_MAY_ALIAS; Ta *const m_data; static_assert(StructSize * sizeof(T) == sizeof(S), "InterleavedMemoryAccess_does_not_support_packed_structs"); public: /** * Constructs the wrapper object. * * \param s A pointer to a C-array. 
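*
* A minimal construction sketch (the struct and variable names here are invented
* purely for illustration):
* \code
* struct Point { float x, y, z; };
* Point points[1024];
* Vc::InterleavedMemoryWrapper<Point, Vc::float_v> wrapper(&points[0]);
* // wrapper[indexes] can now gather/scatter the Point members as float_v vectors
* \endcode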
*/ Vc_ALWAYS_INLINE InterleavedMemoryWrapper(S *s) : m_data(reinterpret_cast(s)) { } /** * Interleaved scatter/gather access. * * Assuming you have a struct of floats and a vector of \p indexes into the array, this function * can be used to access the struct entries as vectors using the minimal number of store or load * instructions. * * \param indexes Vector of indexes that determine the gather locations. * * \return A special (magic) object that executes the loads and deinterleave on assignment to a * vector tuple. * * Example: * \code * struct Foo { * float x, y, z; * }; * * void fillWithBar(Foo *_data, uint_v indexes) * { * Vc::InterleavedMemoryWrapper data(_data); * const float_v x = bar(1); * const float_v y = bar(2); * const float_v z = bar(3); * data[indexes] = (x, y, z); * // it's also possible to just store a subset at the front of the struct: * data[indexes] = (x, y); * // if you want to store a single entry, use scatter: * z.scatter(_data, &Foo::x, indexes); * } * * float_v normalizeStuff(Foo *_data, uint_v indexes) * { * Vc::InterleavedMemoryWrapper data(_data); * float_v x, y, z; * (x, y, z) = data[indexes]; * // it is also possible to just load a subset from the front of the struct: * // (x, y) = data[indexes]; * return Vc::sqrt(x * x + y * y + z * z); * } * \endcode * * You may think of the gather operation (or scatter as the inverse) like this: \verbatim Memory: {x0 y0 z0 x1 y1 z1 x2 y2 z2 x3 y3 z3 x4 y4 z4 x5 y5 z5 x6 y6 z6 x7 y7 z7 x8 y8 z8} indexes: [5, 0, 1, 7] Result in (x, y, z): ({x5 x0 x1 x7}, {y5 y0 y1 y7}, {z5 z0 z1 z7}) \endverbatim * * \warning If \p indexes contains non-unique entries on scatter, the result is undefined. If * \c NDEBUG is not defined the implementation will assert that the \p indexes entries are unique. */ template Vc_ALWAYS_INLINE enable_if< std::is_convertible::value && !std::is_const::value, Access> operator[](IT indexes) { return Access(m_data, indexes); } /// const overload (gathers only) of the above function Vc_ALWAYS_INLINE ReadAccess operator[](IndexType indexes) const { return ReadAccess(m_data, indexes); } /// alias of the above function Vc_ALWAYS_INLINE ReadAccess gather(IndexType indexes) const { return operator[](indexes); } /** * Interleaved access. * * This function is an optimization of the function above, for cases where the index vector * contains consecutive values. It will load \p V::Size consecutive entries from memory and * deinterleave them into Vc vectors. * * \param first The first of \p V::Size indizes to be accessed. * * \return A special (magic) object that executes the loads and deinterleave on assignment to a * vector tuple. * * Example: * \code * struct Foo { * float x, y, z; * }; * * void foo(Foo *_data) * { * Vc::InterleavedMemoryWrapper data(_data); * for (size_t i = 0; i < 32U; i += float_v::Size) { * float_v x, y, z; * (x, y, z) = data[i]; * // now: * // x = { _data[i].x, _data[i + 1].x, _data[i + 2].x, ... } * // y = { _data[i].y, _data[i + 1].y, _data[i + 2].y, ... } * // z = { _data[i].z, _data[i + 1].z, _data[i + 2].z, ... } * ... 
* } * } * \endcode */ Vc_ALWAYS_INLINE ReadSuccessiveEntries operator[](size_t first) const { return ReadSuccessiveEntries(m_data, first); } Vc_ALWAYS_INLINE AccessSuccessiveEntries operator[](size_t first) { return AccessSuccessiveEntries(m_data, first); } //Vc_ALWAYS_INLINE Access scatter(I indexes, VArg v0, VArg v1); }; } // namespace Common using Common::InterleavedMemoryWrapper; template inline Common::InterleavedMemoryWrapper make_interleave_wrapper(S *s) { return Common::InterleavedMemoryWrapper(s); } } // namespace Vc #endif // VC_COMMON_INTERLEAVEDMEMORY_H_ Vc-1.3.3/common/iterators.h000066400000000000000000000253261320703111200155410ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2013-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
}}}*/ #ifndef VC_COMMON_ITERATORS_H_ #define VC_COMMON_ITERATORS_H_ #include #include #ifdef Vc_MSVC #include // for _BitScanForward #endif // Vc_MSVC #include "where.h" #include "elementreference.h" #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { namespace Common { template class MemoryVector; template class MemoryVectorIterator; template class Iterator; template class IteratorBase; template class IteratorBase { public: using iterator_category = std::input_iterator_tag; using value_type = typename V::value_type; using difference_type = int; using reference = value_type; Vc_ALWAYS_INLINE reference operator*() const { return v()[i()]; } Vc_ALWAYS_INLINE reference operator[](difference_type i2) const { return v()[i2]; } private: Vc_INTRINSIC V &v() const { return *static_cast *>(this)->v; } Vc_INTRINSIC difference_type i() const { return static_cast *>(this)->i; } }; template class IteratorBase { public: using iterator_category = std::input_iterator_tag; using value_type = typename V::value_type; using difference_type = int; using reference = Vc::Detail::ElementReference; Vc_ALWAYS_INLINE reference operator*() const { return {*v(), i()}; } Vc_ALWAYS_INLINE reference operator[](difference_type i2) const { return {*v(), i2}; } private: Vc_INTRINSIC V *v() const { return static_cast *>(this)->v; } Vc_INTRINSIC difference_type i() const { return static_cast *>(this)->i; } friend reference; static Vc_INTRINSIC value_type get(const V &o, int i) { return o[i]; } template static Vc_INTRINSIC void set(V &o, int i, T &&v) { o[i] = std::forward(v); } }; // class Iterator {{{ template class Iterator : public IteratorBase::value> { using Base = IteratorBase::value>; friend Base; public: using typename Base::iterator_category; using typename Base::value_type; using typename Base::difference_type; using pointer = const Iterator *; using typename Base::reference; constexpr Iterator() = default; constexpr Iterator(V &_v, difference_type _i) : v(&_v), i(_i) {} // rely on implicit copy constructor/assignment Vc_ALWAYS_INLINE pointer operator->() const { return this; } using Base::operator*; Vc_ALWAYS_INLINE Iterator &operator++() { ++i; return *this; } Vc_ALWAYS_INLINE Iterator operator++(int) { Iterator tmp = *this; ++i; return tmp; } // bidirectional iteration is supported Vc_ALWAYS_INLINE Iterator &operator--() { --i; return *this; } Vc_ALWAYS_INLINE Iterator operator--(int) { Iterator tmp = *this; --i; return tmp; } // RandomAccessIterator: using Base::operator[]; Vc_ALWAYS_INLINE Iterator &operator+=(difference_type d) { i += d; return *this; } Vc_ALWAYS_INLINE Iterator &operator-=(difference_type d) { i -= d; return *this; } Vc_ALWAYS_INLINE Iterator operator+(difference_type d) const { return {*v, i + d}; } Vc_ALWAYS_INLINE Iterator operator-(difference_type d) const { return {*v, i - d}; } Vc_ALWAYS_INLINE difference_type operator-(const Iterator &rhs) const { return i - rhs.i; } friend Vc_ALWAYS_INLINE Iterator operator+(difference_type d, const Iterator &rhs) { return {*rhs.v, rhs.i + d}; } // InputIterator would not need to test v == rhs.v, but except for `reference` this // class implements a complete RandomAccessIterator Vc_ALWAYS_INLINE bool operator==(const Iterator &rhs) const { return v == rhs.v && i == rhs.i; } Vc_ALWAYS_INLINE bool operator!=(const Iterator &rhs) const { return v == rhs.v && i != rhs.i; } Vc_ALWAYS_INLINE bool operator< (const Iterator &rhs) const { return v == rhs.v && i < rhs.i; } Vc_ALWAYS_INLINE bool operator<=(const Iterator &rhs) const { return v == rhs.v && i 
<= rhs.i; } Vc_ALWAYS_INLINE bool operator> (const Iterator &rhs) const { return v == rhs.v && i > rhs.i; } Vc_ALWAYS_INLINE bool operator>=(const Iterator &rhs) const { return v == rhs.v && i >= rhs.i; } private: V *v = nullptr; difference_type i = 0; };/*}}}*/ template using ConstIterator = Iterator; #ifdef Vc_IMPL_MIC class BitmaskIterator/*{{{*/ { const int mask; int bit; public: Vc_ALWAYS_INLINE BitmaskIterator(int m) : mask(m), bit(_mm_tzcnt_32(mask)) {} Vc_ALWAYS_INLINE BitmaskIterator(const BitmaskIterator &) = default; Vc_ALWAYS_INLINE BitmaskIterator(BitmaskIterator &&) = default; Vc_ALWAYS_INLINE size_t operator->() const { return bit; } Vc_ALWAYS_INLINE size_t operator*() const { return bit; } Vc_ALWAYS_INLINE BitmaskIterator &operator++() { bit = _mm_tzcnti_32(bit, mask); return *this; } Vc_ALWAYS_INLINE BitmaskIterator operator++(int) { BitmaskIterator tmp = *this; bit = _mm_tzcnti_32(bit, mask); return tmp; } Vc_ALWAYS_INLINE bool operator==(const BitmaskIterator &rhs) const { return bit == rhs.bit; } Vc_ALWAYS_INLINE bool operator!=(const BitmaskIterator &rhs) const { return bit != rhs.bit; } };/*}}}*/ #else class BitmaskIterator/*{{{*/ { #ifdef Vc_MSVC unsigned long mask; unsigned long bit; #else size_t mask; size_t bit; #endif void nextBit() { #ifdef Vc_GNU_ASM bit = __builtin_ctzl(mask); #elif defined(Vc_MSVC) _BitScanForward(&bit, mask); #else #error "Not implemented yet. Please contact vc-devel@compeng.uni-frankfurt.de" #endif } void resetLsb() { // 01100100 - 1 = 01100011 mask &= (mask - 1); /* #ifdef Vc_GNU_ASM __asm__("btr %1,%0" : "+r"(mask) : "r"(bit)); #elif defined(_WIN64) _bittestandreset64(&mask, bit); #elif defined(_WIN32) _bittestandreset(&mask, bit); #else #error "Not implemented yet. Please contact vc-devel@compeng.uni-frankfurt.de" #endif */ } public: BitmaskIterator(decltype(mask) m) : mask(m) { nextBit(); } BitmaskIterator(const BitmaskIterator &) = default; BitmaskIterator(BitmaskIterator &&) = default; Vc_ALWAYS_INLINE size_t operator->() const { return bit; } Vc_ALWAYS_INLINE size_t operator*() const { return bit; } Vc_ALWAYS_INLINE BitmaskIterator &operator++() { resetLsb(); nextBit(); return *this; } Vc_ALWAYS_INLINE BitmaskIterator operator++(int) { BitmaskIterator tmp = *this; resetLsb(); nextBit(); return tmp; } Vc_ALWAYS_INLINE bool operator==(const BitmaskIterator &rhs) const { return mask == rhs.mask; } Vc_ALWAYS_INLINE bool operator!=(const BitmaskIterator &rhs) const { return mask != rhs.mask; } };/*}}}*/ #endif template Vc_ALWAYS_INLINE enable_if::value || Traits::is_simd_mask::value, Iterator::type>> begin(T &&x) { return {std::forward(x), 0}; } template Vc_ALWAYS_INLINE enable_if::value || Traits::is_simd_mask::value, Iterator::type>> end(T &&x) { using TT = typename std::decay::type; return {std::forward(x), int(TT::size())}; } template Vc_ALWAYS_INLINE enable_if< Traits::is_simd_mask::value || Traits::is_simd_vector::value, ConstIterator> cbegin(const T &v) { return {v, 0}; } template Vc_ALWAYS_INLINE enable_if< Traits::is_simd_mask::value || Traits::is_simd_vector::value, ConstIterator> cend(const T &v) { return {v, int(T::size())}; } template Vc_ALWAYS_INLINE BitmaskIterator begin(const WhereImpl::WhereMask &w) { return w.mask.toInt(); } template Vc_ALWAYS_INLINE BitmaskIterator end(const WhereImpl::WhereMask &) { return 0; } template Vc_ALWAYS_INLINE MemoryVectorIterator makeIterator(T *mem, Flags) { return new(mem) MemoryVector; } template Vc_ALWAYS_INLINE MemoryVectorIterator makeIterator(const T *mem, Flags) { return 
new(const_cast(mem)) MemoryVector; } template Vc_ALWAYS_INLINE MemoryVectorIterator makeIterator(MemoryVector &mv, Flags) { return new(&mv) MemoryVector; } template Vc_ALWAYS_INLINE MemoryVectorIterator makeIterator(MemoryVector &mv, Flags) { return new(&mv) MemoryVector; } } // namespace Common using Common::begin; using Common::end; using Common::cbegin; using Common::cend; using Common::makeIterator; } // namespace Vc #endif // VC_COMMON_ITERATORS_H_ // vim: foldmethod=marker Vc-1.3.3/common/loadinterface.h000066400000000000000000000103251320703111200163160ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2014-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ // load ctors{{{1 /** * Construct a vector from loading its entries from the array at \p mem. * * \param mem A pointer to data. The pointer must not be aligned on a * MemoryAlignment boundary unless you add the Vc::Aligned flag as a second * argument. */ explicit Vc_INTRINSIC Vector(const EntryType *mem) { load(mem); } /** * Construct a vector from loading its entries from the array at \p mem. * * \param mem A pointer to data. If \p flags contains the Vc::Aligned flag, the pointer * must be aligned on a MemoryAlignment boundary. * \param flags A (combination of) flag object(s), such as Vc::Aligned, Vc::Streaming, * Vc::Unaligned, and/or Vc::PrefetchDefault. */ template ::value>> explicit Vc_INTRINSIC Vector(const EntryType *mem, Flags flags) { load(mem, flags); } template ::value || !std::is_integral::value || sizeof(EntryType) >= sizeof(U)) && std::is_arithmetic::value &&Traits::is_load_store_flag::value>> explicit Vc_INTRINSIC Vector(const U *x, Flags flags = Flags()) { load(x, flags); } // load member functions{{{1 /** * Load the vector entries from \p mem, overwriting the previous values. * * \param mem * A pointer to data. The pointer must not be aligned on a MemoryAlignment boundary unless * you add the Vc::Aligned flag as a second argument. */ Vc_INTRINSIC void load(const EntryType *mem) { load(mem, DefaultLoadTag()); } /** * Load the vector entries from \p mem, overwriting the previous values. * * \param mem * A pointer to data. 
If \p flags contains the Vc::Aligned flag, the pointer must be * aligned on a MemoryAlignment boundary. * \param flags * A (combination of) flag object(s), such as Vc::Aligned, Vc::Streaming, Vc::Unaligned, * and/or Vc::PrefetchDefault. */ template Vc_INTRINSIC enable_if::value, void> load(const EntryType *mem, Flags flags) { load(mem, flags); } private: template struct load_concept : public std::enable_if< (!std::is_integral::value || !std::is_integral::value || sizeof(EntryType) >= sizeof(U)) && std::is_arithmetic::value && Traits::is_load_store_flag::value, void> {}; public: template Vc_INTRINSIC_L typename load_concept::type load(const U *mem, Flags = Flags()) Vc_INTRINSIC_R; //}}}1 // vim: foldmethod=marker Vc-1.3.3/common/loadstoreflags.h000066400000000000000000000236501320703111200165340ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2013-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ #ifndef VC_COMMON_LOADSTOREFLAGS_H_ #define VC_COMMON_LOADSTOREFLAGS_H_ #include "../traits/type_traits.h" namespace Vc_VERSIONED_NAMESPACE { /** * Hint for \ref Prefetch to select prefetches that mark the memory as exclusive. * * This hint may optimize the prefetch if the memory will subsequently be written to. */ struct Exclusive {}; /** * Hint for \ref Prefetch to select prefetches that mark the memory as shared. 
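*
* Usage sketch (the stride values 64 and 128 are illustrative only; the actual
* defaults are supplied by the \ref Prefetch template further below):
* \code
* float_v x(mem, Vc::Aligned | Vc::Prefetch<64, 128, Vc::Shared>());
* \endcode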
*/ struct Shared {}; namespace LoadStoreFlags { struct StreamingFlag {}; struct UnalignedFlag {}; struct PrefetchFlagBase {}; #ifdef Vc_IMPL_MIC template struct PrefetchFlag : public PrefetchFlagBase { typedef ExclusiveOrShared_ ExclusiveOrShared; static constexpr size_t L1Stride = L1; static constexpr size_t L2Stride = L2; static constexpr bool IsExclusive = std::is_same::value; static constexpr bool IsShared = std::is_same::value; }; template struct ExtractType { typedef Default type; }; template struct ExtractType { typedef typename std::conditional::value, T, typename ExtractType::type>::type type; }; // ICC warns about the constexpr members in LoadStoreFlags: member "LoadStoreFlags::IsAligned" was declared but never referenced // who needs that warning, especially if it was referenced... // The warning cannot be reenabled because it gets emitted whenever the LoadStoreFlags is instantiated // somewhere, so it could be anywhere. #ifdef Vc_ICC #pragma warning(disable: 177) #endif /**\internal * Implementation of the load/store flags mechanism. This is internal API. Only some * concrete aliases are API-relevant types. */ template struct LoadStoreFlags { private: // ICC doesn't grok this line: //template using TestFlag = std::is_same::type, void>; typedef typename ExtractType, Flags...>::type Prefetch; public: constexpr LoadStoreFlags() {} static constexpr bool IsStreaming = !std::is_same::type, void>::value; static constexpr bool IsUnaligned = !std::is_same::type, void>::value; static constexpr bool IsAligned = !IsUnaligned; static constexpr bool IsPrefetch = !std::is_same::type, void>::value; static constexpr bool IsExclusivePrefetch = Prefetch::IsExclusive; static constexpr bool IsSharedPrefetch = Prefetch::IsShared; static constexpr size_t L1Stride = Prefetch::L1Stride; static constexpr size_t L2Stride = Prefetch::L2Stride; typedef LoadStoreFlags::value, void, Flags>::type...> UnalignedRemoved; // The following EnableIf* convenience types cannot use enable_if because then no LoadStoreFlags type // could ever be instantiated. Instead these types are defined either as void* or void. The // function that does SFINAE then assigns "= nullptr" to this type. Thus, the ones with just // void result in substitution failure. typedef typename std::conditional::type EnableIfAligned; typedef typename std::conditional::type EnableIfStreaming; typedef typename std::conditional::type EnableIfUnalignedNotStreaming; typedef typename std::conditional::type EnableIfUnalignedAndStreaming; typedef typename std::conditional::type EnableIfUnaligned; typedef typename std::conditional::type EnableIfNotUnaligned; typedef typename std::conditional::type EnableIfPrefetch; typedef typename std::conditional::type EnableIfNotPrefetch; }; /**\internal * Specialization for no flags (i.e aligned, non-streaming, no prefetching) */ template<> struct LoadStoreFlags<> { constexpr LoadStoreFlags() {} static constexpr bool IsStreaming = false; static constexpr bool IsUnaligned = false; static constexpr bool IsAligned = !IsUnaligned; static constexpr bool IsPrefetch = false; static constexpr bool IsExclusivePrefetch = false; static constexpr bool IsSharedPrefetch = false; static constexpr size_t L1Stride = 0; static constexpr size_t L2Stride = 0; typedef void* EnableIfAligned; typedef void* EnableIfNotUnaligned; typedef void* EnableIfNotPrefetch; }; /** * Operator for concatenation of LoadStoreFlags. 
* * Example: * \code * float_v x(mem, Vc::Aligned | Vc::Streaming); * \endcode */ template constexpr LoadStoreFlags operator|(LoadStoreFlags, LoadStoreFlags) { return LoadStoreFlags(); } } // LoadStoreFlags namespace using LoadStoreFlags::PrefetchFlag; typedef LoadStoreFlags::LoadStoreFlags<> AlignedTag; typedef LoadStoreFlags::LoadStoreFlags StreamingTag; typedef LoadStoreFlags::LoadStoreFlags UnalignedTag; /// The default load tag type uses unaligned (non-streaming) loads. typedef UnalignedTag DefaultLoadTag; /// The default store tag type uses unaligned (non-streaming) stores. typedef UnalignedTag DefaultStoreTag; /**\addtogroup Utilities * @{ */ /** * Use this object for a \p flags parameter to request aligned loads and stores. * * It specifies that a load/store can expect a memory address that is aligned on * the correct boundary. (i.e. \p MemoryAlignment) * * \warning * If you specify Aligned, but the memory address is not aligned the program * will most likely crash. */ constexpr AlignedTag Aligned; /** * Use this object for a \p flags parameter to request unaligned loads and stores. * * It specifies that a load/store can \em not expect a memory address that is * aligned on the correct boundary. (i.e. alignment is less than * \p MemoryAlignment) * * \note * If you specify Unaligned, but the memory address is aligned the load/store * will execute slightly slower than necessary. */ constexpr UnalignedTag Unaligned; /** * Use this object for a \p flags parameter to request streaming loads and stores. * * It specifies that the cache should be bypassed for the given load/store. * Whether this will actually be done depends on the target system's capabilities. * * Streaming stores can be interesting when the code calculates values that, after being * written to memory, will not be used for a long time or used by a different thread. * * \note * Expect that most target systems do not support unaligned streaming loads or stores. * Therefore, make sure that you also specify Aligned. */ constexpr StreamingTag Streaming; /** * Use this object for a \p flags parameter to request default software prefetches to be * emitted. */ constexpr LoadStoreFlags::LoadStoreFlags> PrefetchDefault; ///@} /** * \tparam L1 * \tparam L2 * \tparam ExclusiveOrShared */ template ::L1Stride, size_t L2 = PrefetchFlag<>::L2Stride, typename ExclusiveOrShared = PrefetchFlag<>::ExclusiveOrShared> struct Prefetch : public LoadStoreFlags::LoadStoreFlags> { }; namespace Traits { ///\internal partial specialization for detecting LoadStoreFlags types template struct is_loadstoreflag_internal> : public std::true_type { }; ///\internal partial specialization for detecting the derived Prefetch type as a /// load/store flag. template struct is_loadstoreflag_internal> : public std::true_type { }; } // namespace Traits } // namespace Vc #endif // VC_COMMON_LOADSTOREFLAGS_H_ Vc-1.3.3/common/logarithm.h000066400000000000000000000260231320703111200155060ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2009-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
* Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ /* The log implementations are based on code from Julien Pommier which carries the following copyright information: */ /* Inspired by Intel Approximate Math library, and based on the corresponding algorithms of the cephes math library */ /* Copyright (C) 2007 Julien Pommier This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software. Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions: 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 3. This notice may not be removed or altered from any source distribution. (this is the zlib license) */ #ifdef Vc_COMMON_MATH_H_INTERNAL enum LogarithmBase { BaseE, Base10, Base2 }; namespace Detail { template using Const = typename std::conditional::value, AVX::Const, SSE::Const>::type; template struct LogImpl { template static Vc_ALWAYS_INLINE void log_series(Vector &Vc_RESTRICT x, typename Vector::AsArg exponent) { typedef Vector V; typedef Detail::Const C; // Taylor series around x = 2^exponent // f(x) = ln(x) → exponent * ln(2) → C::ln2_small + C::ln2_large // f'(x) = x⁻¹ → x → 1 // f''(x) = - x⁻² → -x² / 2 → C::_1_2() // = 2!x⁻³ → x³ / 3 → C::P(8) // = -3!x⁻⁴ → -x⁴ / 4 → C::P(7) // = 4!x⁻⁵ → x⁵ / 5 → C::P(6) // ... // The high order coefficients are adjusted to reduce the error that occurs from ommission // of higher order terms. 
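// (Overview: writing m for the reduced argument x handled here, the polynomial below
//  contributes P(8)·m³ + P(7)·m⁴ + … + P(0)·m¹¹; the m and -m²/2 terms plus
//  exponent·ln(2) (split into ln2_small + ln2_large for precision) are added in the
//  switch at the end, so the result approximates ln(1+m) + exponent·ln(2), i.e. ln of the input.)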
// P(0) is the smallest term and |x| < 1 ⇒ |xⁿ| > |xⁿ⁺¹| // The order of additions must go from smallest to largest terms const V x2 = x * x; // 0 → 4 #ifdef Vc_LOG_ILP V y2 = (C::P(6) * /*4 → 8*/ x2 + /* 8 → 11*/ C::P(7) * /*1 → 5*/ x) + /*11 → 14*/ C::P(8); V y0 = (C::P(0) * /*5 → 9*/ x2 + /* 9 → 12*/ C::P(1) * /*2 → 6*/ x) + /*12 → 15*/ C::P(2); V y1 = (C::P(3) * /*6 → 10*/ x2 + /*10 → 13*/ C::P(4) * /*3 → 7*/ x) + /*13 → 16*/ C::P(5); const V x3 = x2 * x; // 7 → 11 const V x6 = x3 * x3; // 11 → 15 const V x9 = x6 * x3; // 15 → 19 V y = (y0 * /*19 → 23*/ x9 + /*23 → 26*/ y1 * /*16 → 20*/ x6) + /*26 → 29*/ y2 * /*14 → 18*/ x3; #elif defined Vc_LOG_ILP2 /* * name start done * movaps %xmm0, %xmm1 ; x 0 1 * movaps %xmm0, %xmm2 ; x 0 1 * mulps %xmm1, %xmm1 ; x2 1 5 *xmm1 * movaps , %xmm15 ; y8 1 2 * mulps %xmm1, %xmm2 ; x3 5 9 *xmm2 * movaps %xmm1, %xmm3 ; x2 5 6 * movaps %xmm1, %xmm4 ; x2 5 6 * mulps %xmm3, %xmm3 ; x4 6 10 *xmm3 * movaps %xmm2, %xmm5 ; x3 9 10 * movaps %xmm2, %xmm6 ; x3 9 10 * mulps %xmm2, %xmm4 ; x5 9 13 *xmm4 * movaps %xmm3, %xmm7 ; x4 10 11 * movaps %xmm3, %xmm8 ; x4 10 11 * movaps %xmm3, %xmm9 ; x4 10 11 * mulps %xmm5, %xmm5 ; x6 10 14 *xmm5 * mulps %xmm3, %xmm6 ; x7 11 15 *xmm6 * mulps %xmm7, %xmm7 ; x8 12 16 *xmm7 * movaps %xmm4, %xmm10 ; x5 13 14 * mulps %xmm4, %xmm8 ; x9 13 17 *xmm8 * mulps %xmm5, %xmm10 ; x11 14 18 *xmm10 * mulps %xmm5, %xmm9 ; x10 15 19 *xmm9 * mulps , %xmm10 ; y0 18 22 * mulps , %xmm9 ; y1 19 23 * mulps , %xmm8 ; y2 20 24 * mulps , %xmm7 ; y3 21 25 * addps %xmm10, %xmm9 ; y 23 26 * addps %xmm9, %xmm8 ; y 26 29 * addps %xmm8, %xmm7 ; y 29 32 */ const V x3 = x2 * x; // 4 → 8 const V x4 = x2 * x2; // 5 → 9 const V x5 = x2 * x3; // 8 → 12 const V x6 = x3 * x3; // 9 → 13 const V x7 = x4 * x3; // const V x8 = x4 * x4; const V x9 = x5 * x4; const V x10 = x5 * x5; const V x11 = x5 * x6; // 13 → 17 V y = C::P(0) * x11 + C::P(1) * x10 + C::P(2) * x9 + C::P(3) * x8 + C::P(4) * x7 + C::P(5) * x6 + C::P(6) * x5 + C::P(7) * x4 + C::P(8) * x3; #else V y = C::P(0); Vc::Common::unrolled_loop([&](int i) { y = y * x + C::P(i); }); y *= x * x2; #endif switch (Base) { case BaseE: // ln(2) is split in two parts to increase precision (i.e. ln2_small + ln2_large = ln(2)) y += exponent * C::ln2_small(); y -= x2 * C::_1_2(); // [0, 0.25[ x += y; x += exponent * C::ln2_large(); break; case Base10: y += exponent * C::ln2_small(); y -= x2 * C::_1_2(); // [0, 0.25[ x += y; x += exponent * C::ln2_large(); x *= C::log10_e(); break; case Base2: { const V x_ = x; x *= C::log2_e(); y *= C::log2_e(); y -= x_ * x * C::_1_2(); // [0, 0.25[ x += y; x += exponent; break; } } } template static Vc_ALWAYS_INLINE void log_series(Vector &Vc_RESTRICT x, typename Vector::AsArg exponent) { typedef Vector V; typedef Detail::Const C; const V x2 = x * x; V y = C::P(0); V y2 = C::Q(0) + x; Vc::Common::unrolled_loop([&](int i) { y = y * x + C::P(i); y2 = y2 * x + C::Q(i); }); y2 = x / y2; y = y * x + C::P(5); y = x2 * y * y2; // TODO: refactor the following with the float implementation: switch (Base) { case BaseE: // ln(2) is split in two parts to increase precision (i.e. 
ln2_small + ln2_large = ln(2)) y += exponent * C::ln2_small(); y -= x2 * C::_1_2(); // [0, 0.25[ x += y; x += exponent * C::ln2_large(); break; case Base10: y += exponent * C::ln2_small(); y -= x2 * C::_1_2(); // [0, 0.25[ x += y; x += exponent * C::ln2_large(); x *= C::log10_e(); break; case Base2: { const V x_ = x; x *= C::log2_e(); y *= C::log2_e(); y -= x_ * x * C::_1_2(); // [0, 0.25[ x += y; x += exponent; break; } } } template > static inline Vector calc(V _x) { typedef typename V::Mask M; typedef Detail::Const C; V x(_x); const M invalidMask = x < V::Zero(); const M infinityMask = x == V::Zero(); const M denormal = x <= C::min(); x(denormal) *= V(Vc::Detail::doubleConstant<1, 0, 54>()); // 2²⁵ V exponent = Detail::exponent(x.data()); // = ⎣log₂(x)⎦ exponent(denormal) -= 54; x.setZero(C::exponentMask()); // keep only the fractional part ⇒ x ∈ [1, 2[ x = Detail::operator|(x, C::_1_2()); // and set the exponent to 2⁻¹ ⇒ x ∈ [½, 1[ // split calculation in two cases: // A: x ∈ [½, √½[ // B: x ∈ [√½, 1[ // √½ defines the point where Δe(x) := log₂(x) - ⎣log₂(x)⎦ = ½, i.e. // log₂(√½) - ⎣log₂(√½)⎦ = ½ * -1 - ⎣½ * -1⎦ = -½ + 1 = ½ const M smallX = x < C::_1_sqrt2(); x(smallX) += x; // => x ∈ [√½, 1[ ∪ [1.5, 1 + √½[ x -= V::One(); // => x ∈ [√½ - 1, 0[ ∪ [0.5, √½[ exponent(!smallX) += V::One(); log_series(x, exponent); // A: (ˣ⁄₂ᵉ - 1, e) B: (ˣ⁄₂ᵉ⁺¹ - 1, e + 1) x.setQnan(invalidMask); // x < 0 → NaN x(infinityMask) = C::neginf(); // x = 0 → -∞ return x; } }; } // namespace Detail template Vc_INTRINSIC Vc_CONST Vector log(const Vector &x) { return Detail::LogImpl::calc(x); } template Vc_INTRINSIC Vc_CONST Vector log10(const Vector &x) { return Detail::LogImpl::calc(x); } template Vc_INTRINSIC Vc_CONST Vector log2(const Vector &x) { return Detail::LogImpl::calc(x); } #endif // Vc_COMMON_MATH_H_INTERNAL Vc-1.3.3/common/macros.h000066400000000000000000000330531320703111200150050ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2010-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
}}}*/ #ifndef VC_COMMON_MACROS_H_ #define VC_COMMON_MACROS_H_ #include #ifdef Vc_MSVC #define Vc_ALIGNED_TYPEDEF(n_, type_, new_type_) \ typedef __declspec(align(n_)) type_ new_type_ #elif __GNUC__ #define Vc_ALIGNED_TYPEDEF(n_, type_, new_type_) \ typedef type_ new_type_[[gnu::aligned(n_)]] #else // the following is actually ill-formed according to C++1[14] #define Vc_ALIGNED_TYPEDEF(n_, type_, new_type_) \ using new_type_ alignas(sizeof(n_)) = type_ #endif // On Windows (WIN32) we might see macros called min and max. Just undefine them and hope // noone (re)defines them (NOMINMAX should help). #ifdef WIN32 #define NOMINMAX 1 #if defined min #undef min #endif #if defined max #undef max #endif #endif // WIN32 #if defined Vc_GCC && Vc_GCC >= 0x60000 // GCC 6 drops all attributes on types passed as template arguments. This is important // if a may_alias gets lost and therefore needs to be readded in the implementation of // the class template. #define Vc_TEMPLATES_DROP_ATTRIBUTES 1 #endif #if Vc_IS_VERSION_2 || (defined Vc_GCC && Vc_GCC >= 0x60000) // GCC 6 optimizes the RowMemory::fromRawData hack away (common/memorybase.h). Therefore // the 2D Memory class is implemented recursively using 1D Memory members. Since this is // an ABI break this is only enabled for GCC 6. With Vc 2.x all implementations should do // this. #define Vc_RECURSIVE_MEMORY 1 #endif #if defined Vc_CLANG || defined Vc_APPLECLANG # define Vc_UNREACHABLE __builtin_unreachable # define Vc_NEVER_INLINE [[gnu::noinline]] # define Vc_INTRINSIC_L inline # define Vc_INTRINSIC_R __attribute__((always_inline)) # define Vc_INTRINSIC Vc_INTRINSIC_L Vc_INTRINSIC_R # define Vc_FLATTEN # define Vc_CONST __attribute__((const)) # define Vc_CONST_L # define Vc_CONST_R Vc_CONST # define Vc_PURE __attribute__((pure)) # define Vc_PURE_L # define Vc_PURE_R Vc_PURE # define Vc_MAY_ALIAS __attribute__((may_alias)) # define Vc_ALWAYS_INLINE_L inline # define Vc_ALWAYS_INLINE_R __attribute__((always_inline)) # define Vc_ALWAYS_INLINE Vc_ALWAYS_INLINE_L Vc_ALWAYS_INLINE_R # define Vc_IS_UNLIKELY(x) __builtin_expect(x, 0) # define Vc_IS_LIKELY(x) __builtin_expect(x, 1) # define Vc_RESTRICT __restrict__ # define Vc_DEPRECATED(msg) # define Vc_DEPRECATED_ALIAS(msg) # define Vc_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__)) #elif defined(__GNUC__) # define Vc_UNREACHABLE __builtin_unreachable # if defined Vc_GCC && !defined __OPTIMIZE__ # define Vc_MAY_ALIAS # else # define Vc_MAY_ALIAS __attribute__((__may_alias__)) # endif # define Vc_INTRINSIC_R __attribute__((__always_inline__, __artificial__)) # define Vc_INTRINSIC_L inline # define Vc_INTRINSIC Vc_INTRINSIC_L Vc_INTRINSIC_R # define Vc_FLATTEN __attribute__((__flatten__)) # define Vc_ALWAYS_INLINE_L inline # define Vc_ALWAYS_INLINE_R __attribute__((__always_inline__)) # define Vc_ALWAYS_INLINE Vc_ALWAYS_INLINE_L Vc_ALWAYS_INLINE_R # ifdef Vc_ICC // ICC miscompiles if there are functions marked as pure or const # define Vc_PURE # define Vc_CONST # define Vc_NEVER_INLINE # else # define Vc_NEVER_INLINE [[gnu::noinline]] # define Vc_PURE __attribute__((__pure__)) # define Vc_CONST __attribute__((__const__)) # endif # define Vc_CONST_L # define Vc_CONST_R Vc_CONST # define Vc_PURE_L # define Vc_PURE_R Vc_PURE # define Vc_IS_UNLIKELY(x) __builtin_expect(x, 0) # define Vc_IS_LIKELY(x) __builtin_expect(x, 1) # define Vc_RESTRICT __restrict__ # ifdef Vc_ICC # define Vc_DEPRECATED(msg) # define Vc_DEPRECATED_ALIAS(msg) # else # define Vc_DEPRECATED(msg) __attribute__((__deprecated__(msg))) 
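// The attribute macros defined here are how Vc annotates its small wrapper functions.
// A user-side sketch of how they compose (hypothetical function and typedef names; the
// exact expansion depends on which compiler branch above was selected):
//
//   // force inlining and promise that the result depends only on the arguments
//   Vc_INTRINSIC Vc_CONST int round_up_to_16(int x) { return (x + 15) & ~15; }
//
//   // declare a typedef with a stricter alignment requirement
//   Vc_ALIGNED_TYPEDEF(16, float, aligned_float);
//
// Vc_PURE is the weaker sibling of Vc_CONST (reads of global memory remain allowed),
// Vc_RESTRICT maps to the compiler-specific spelling of C99's restrict, and the *_L/*_R
// pairs split an attribute into the part placed before and the part placed after a
// declaration (the Mask constructors further below use Vc_INTRINSIC_L ... Vc_INTRINSIC_R
// in exactly this way).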
# define Vc_DEPRECATED_ALIAS(msg) __attribute__((__deprecated__(msg))) # endif # define Vc_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__)) #else # define Vc_NEVER_INLINE # define Vc_FLATTEN # ifdef Vc_PURE # undef Vc_PURE # endif # define Vc_MAY_ALIAS # ifdef Vc_MSVC # define Vc_ALWAYS_INLINE inline __forceinline # define Vc_ALWAYS_INLINE_L Vc_ALWAYS_INLINE # define Vc_ALWAYS_INLINE_R # define Vc_CONST __declspec(noalias) # define Vc_CONST_L Vc_CONST # define Vc_CONST_R # define Vc_PURE /*Vc_CONST*/ # define Vc_PURE_L Vc_PURE # define Vc_PURE_R # define Vc_INTRINSIC inline __forceinline # define Vc_INTRINSIC_L Vc_INTRINSIC # define Vc_INTRINSIC_R namespace Vc_VERSIONED_NAMESPACE { namespace detail { static Vc_INTRINSIC void unreachable() { __assume(0); } } // namespace detail } # define Vc_UNREACHABLE Vc::detail::unreachable # else # define Vc_ALWAYS_INLINE # define Vc_ALWAYS_INLINE_L # define Vc_ALWAYS_INLINE_R # define Vc_CONST # define Vc_CONST_L # define Vc_CONST_R # define Vc_PURE # define Vc_PURE_L # define Vc_PURE_R # define Vc_INTRINSIC # define Vc_INTRINSIC_L # define Vc_INTRINSIC_R # define Vc_UNREACHABLE std::abort # endif # define Vc_IS_UNLIKELY(x) x # define Vc_IS_LIKELY(x) x # define Vc_RESTRICT __restrict # define Vc_DEPRECATED(msg) __declspec(deprecated(msg)) # define Vc_DEPRECATED_ALIAS(msg) # define Vc_WARN_UNUSED_RESULT #endif #ifdef Vc_CXX14 #undef Vc_DEPRECATED #define Vc_DEPRECATED(msg_) [[deprecated(msg_)]] #endif #define Vc_NOTHING_EXPECTING_SEMICOLON static_assert(true, "") #define Vc_FREE_STORE_OPERATORS_ALIGNED(align_) \ /**\name new/delete overloads for correct alignment */ \ /**@{*/ \ /*!\brief Allocates correctly aligned memory */ \ Vc_ALWAYS_INLINE void *operator new(size_t size) \ { \ return Vc::Common::aligned_malloc(size); \ } \ /*!\brief Returns \p p. */ \ Vc_ALWAYS_INLINE void *operator new(size_t, void *p) { return p; } \ /*!\brief Allocates correctly aligned memory */ \ Vc_ALWAYS_INLINE void *operator new[](size_t size) \ { \ return Vc::Common::aligned_malloc(size); \ } \ /*!\brief Returns \p p. */ \ Vc_ALWAYS_INLINE void *operator new[](size_t, void *p) { return p; } \ /*!\brief Frees aligned memory. */ \ Vc_ALWAYS_INLINE void operator delete(void *ptr, size_t) { Vc::Common::free(ptr); } \ /*!\brief Does nothing. */ \ Vc_ALWAYS_INLINE void operator delete(void *, void *) {} \ /*!\brief Frees aligned memory. */ \ Vc_ALWAYS_INLINE void operator delete[](void *ptr, size_t) \ { \ Vc::Common::free(ptr); \ } \ /*!\brief Does nothing. 
*/ \ Vc_ALWAYS_INLINE void operator delete[](void *, void *) {} \ /**@}*/ \ Vc_NOTHING_EXPECTING_SEMICOLON #ifdef Vc_ASSERT #define Vc_EXTERNAL_ASSERT 1 #else #ifdef NDEBUG #define Vc_ASSERT(x) #else #include #define Vc_ASSERT(x) assert(x); #endif #endif #if defined Vc_CLANG || defined Vc_APPLECLANG #define Vc_HAS_BUILTIN(x) __has_builtin(x) #else #define Vc_HAS_BUILTIN(x) 0 #endif #define Vc_CAT_HELPER_(a, b, c, d) a##b##c##d #define Vc_CAT(a, b, c, d) Vc_CAT_HELPER_(a, b, c, d) #define Vc_CAT_IMPL(a, b) a##b #define Vc_CAT2(a, b) Vc_CAT_IMPL(a, b) #define Vc_APPLY_IMPL_1_(macro, a, b, c, d, e) macro(a) #define Vc_APPLY_IMPL_2_(macro, a, b, c, d, e) macro(a, b) #define Vc_APPLY_IMPL_3_(macro, a, b, c, d, e) macro(a, b, c) #define Vc_APPLY_IMPL_4_(macro, a, b, c, d, e) macro(a, b, c, d) #define Vc_APPLY_IMPL_5_(macro, a, b, c, d, e) macro(a, b, c, d, e) #define Vc_LIST_FLOAT_VECTOR_TYPES(size, macro, a, b, c, d) \ size(macro, double_v, a, b, c, d) \ size(macro, float_v, a, b, c, d) #define Vc_LIST_INT_VECTOR_TYPES(size, macro, a, b, c, d) \ size(macro, int_v, a, b, c, d) \ size(macro, uint_v, a, b, c, d) \ size(macro, short_v, a, b, c, d) \ size(macro, ushort_v, a, b, c, d) #define Vc_LIST_VECTOR_TYPES(size, macro, a, b, c, d) \ Vc_LIST_FLOAT_VECTOR_TYPES(size, macro, a, b, c, d) \ Vc_LIST_INT_VECTOR_TYPES(size, macro, a, b, c, d) #define Vc_LIST_COMPARES(size, macro, a, b, c, d) \ size(macro, ==, a, b, c, d) \ size(macro, !=, a, b, c, d) \ size(macro, <=, a, b, c, d) \ size(macro, >=, a, b, c, d) \ size(macro, < , a, b, c, d) \ size(macro, > , a, b, c, d) #define Vc_LIST_LOGICAL(size, macro, a, b, c, d) \ size(macro, &&, a, b, c, d) \ size(macro, ||, a, b, c, d) #define Vc_LIST_BINARY(size, macro, a, b, c, d) \ size(macro, |, a, b, c, d) \ size(macro, &, a, b, c, d) \ size(macro, ^, a, b, c, d) #define Vc_LIST_SHIFTS(size, macro, a, b, c, d) \ size(macro, <<, a, b, c, d) \ size(macro, >>, a, b, c, d) #define Vc_LIST_ARITHMETICS(size, macro, a, b, c, d) \ size(macro, +, a, b, c, d) \ size(macro, -, a, b, c, d) \ size(macro, *, a, b, c, d) \ size(macro, /, a, b, c, d) \ size(macro, %, a, b, c, d) #define Vc_APPLY_0(_list, macro) _list(Vc_APPLY_IMPL_1_, macro, 0, 0, 0, 0) Vc_NOTHING_EXPECTING_SEMICOLON #define Vc_APPLY_1(_list, macro, a) _list(Vc_APPLY_IMPL_2_, macro, a, 0, 0, 0) Vc_NOTHING_EXPECTING_SEMICOLON #define Vc_APPLY_2(_list, macro, a, b) _list(Vc_APPLY_IMPL_3_, macro, a, b, 0, 0) Vc_NOTHING_EXPECTING_SEMICOLON #define Vc_APPLY_3(_list, macro, a, b, c) _list(Vc_APPLY_IMPL_4_, macro, a, b, c, 0) Vc_NOTHING_EXPECTING_SEMICOLON #define Vc_APPLY_4(_list, macro, a, b, c, d) _list(Vc_APPLY_IMPL_5_, macro, a, b, c, d) Vc_NOTHING_EXPECTING_SEMICOLON #define Vc_ALL_COMPARES(macro) Vc_APPLY_0(Vc_LIST_COMPARES, macro) #define Vc_ALL_LOGICAL(macro) Vc_APPLY_0(Vc_LIST_LOGICAL, macro) #define Vc_ALL_BINARY(macro) Vc_APPLY_0(Vc_LIST_BINARY, macro) #define Vc_ALL_SHIFTS(macro) Vc_APPLY_0(Vc_LIST_SHIFTS, macro) #define Vc_ALL_ARITHMETICS(macro) Vc_APPLY_0(Vc_LIST_ARITHMETICS, macro) #define Vc_ALL_FLOAT_VECTOR_TYPES(macro) Vc_APPLY_0(Vc_LIST_FLOAT_VECTOR_TYPES, macro) #define Vc_ALL_VECTOR_TYPES(macro) Vc_APPLY_0(Vc_LIST_VECTOR_TYPES, macro) #define Vc_EXACT_TYPE(_test, _reference, _type) \ typename std::enable_if::value, _type>::type #define Vc_make_unique(name) Vc_CAT(Vc_,name,_,__LINE__) #if defined(Vc_ICC) || defined(Vc_CLANG) || defined Vc_APPLECLANG #define Vc_OFFSETOF(Type, member) (reinterpret_cast(&reinterpret_cast(0)->member) - reinterpret_cast(0)) #else #define Vc_OFFSETOF(Type, member) 
offsetof(Type, member) #endif #if defined(Vc_NO_NOEXCEPT) #define Vc_NOEXCEPT throw() #else #define Vc_NOEXCEPT noexcept #endif #ifdef Vc_NO_ALWAYS_INLINE #undef Vc_ALWAYS_INLINE #undef Vc_ALWAYS_INLINE_L #undef Vc_ALWAYS_INLINE_R #define Vc_ALWAYS_INLINE inline #define Vc_ALWAYS_INLINE_L inline #define Vc_ALWAYS_INLINE_R #undef Vc_INTRINSIC #undef Vc_INTRINSIC_L #undef Vc_INTRINSIC_R #define Vc_INTRINSIC inline #define Vc_INTRINSIC_L inline #define Vc_INTRINSIC_R #endif #endif // VC_COMMON_MACROS_H_ Vc-1.3.3/common/makeContainer.h000066400000000000000000000152531320703111200163030ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2013-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ #ifndef VC_COMMON_MAKECONTAINER_H_ #define VC_COMMON_MAKECONTAINER_H_ #include #include #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { namespace { template struct make_container_helper { static constexpr Container help(std::initializer_list list) { return { list }; } }; template class Container> struct make_container_helper, Alloc>, typename Vector::EntryType> { typedef Vector V; typedef typename V::EntryType T; typedef Container C; static inline C help(std::initializer_list list) { const std::size_t size = (list.size() + (V::Size - 1)) / V::Size; C v(size); auto containerIt = v.begin(); auto init = std::begin(list); const auto initEnd = std::end(list); for (std::size_t i = 0; i < size - 1; ++i) { *containerIt++ = V(init, Vc::Unaligned); init += V::Size; } Vc_ASSERT(all_of(*containerIt == V::Zero())); int j = 0; while (init != initEnd) { (*containerIt)[j++] = *init++; } return v; } }; template class Container> struct make_container_helper, N>, typename Vector::EntryType> { typedef Vector V; typedef typename V::EntryType T; static constexpr std::size_t size = (N + (V::Size - 1)) / V::Size; typedef Container< V, #if defined Vc_CLANG && Vc_CLANG < 0x30700 // TODO: when did Vc_APPLECLANG fix it? // clang before 3.7.0 has a bug when returning std::array<__m256x, 1>. So // increase it to std::array<__m256x, 2> and fill it with zeros. Better // than returning garbage. (size == 1 && std::is_same::value) ? 
2 : #endif size> C; static inline C help(std::initializer_list list) { Vc_ASSERT(N == list.size()) Vc_ASSERT(size == (list.size() + (V::Size - 1)) / V::Size) C v; auto containerIt = v.begin(); auto init = std::begin(list); const auto initEnd = std::end(list); for (std::size_t i = 0; i < size - 1; ++i) { *containerIt++ = V(init, Vc::Unaligned); init += V::Size; } Vc_ASSERT(all_of(*containerIt == V::Zero())); int j = 0; while (init != initEnd) { (*containerIt)[j++] = *init++; } return v; } }; } // anonymous namespace /** * \ingroup Utilities * \headerfile Utils * * Construct a container of Vc vectors from a std::initializer_list of scalar entries. * * \tparam Container The container type to construct. * \tparam T The scalar type to use for the initializer_list. * * \param list An initializer list of arbitrary size. The type of the entries is important! * If you pass a list of integers you will get a container filled with Vc::int_v objects. * If, instead, you want to have a container of Vc::float_v objects, be sure the include a * period (.) and the 'f' postfix in the literals. Alternatively, you can pass the * type as second template argument to makeContainer. * * \return Returns a container of the requested class filled with the minimum number of SIMD * vectors to hold the values in the initializer list. * If the number of values in \p list does not match the number of values in the * returned container object, the remaining values in the returned object will be * zero-initialized. * * Example: * \code * auto data = Vc::makeContainer>({ 1.f, 2.f, 3.f, 4.f, 5.f }); * // data.size() == 5 if float_v::Size == 1 (i.e. Vc_IMPL=Scalar) * // data.size() == 2 if float_v::Size == 4 (i.e. Vc_IMPL=SSE) * // data.size() == 1 if float_v::Size == 8 (i.e. Vc_IMPL=AVX) * \endcode */ template constexpr auto makeContainer(std::initializer_list list) -> decltype(make_container_helper::help(list)) { return make_container_helper::help(list); } template constexpr auto make_container(std::initializer_list list) -> decltype(makeContainer(list)) { return makeContainer(list); } } // namespace Vc #endif // VC_COMMON_MAKECONTAINER_H_ Vc-1.3.3/common/make_unique.h000066400000000000000000000042071320703111200160230ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2013-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
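// To complement the float_v example in the makeContainer() documentation above: the
// entry type of the initializer_list decides the vector type stored in the returned
// container, so integer literals produce int_v elements (a sketch, default Vc ABI):
//
//   auto f = Vc::makeContainer<std::vector<Vc::float_v>>({1.f, 2.f, 3.f, 4.f, 5.f});
//   auto i = Vc::makeContainer<std::vector<Vc::int_v>>({1, 2, 3, 4, 5});
//   // Any padding entries needed to fill the last vector are zero-initialized.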
IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ #ifndef VC_COMMON_MAKE_UNIQUE_H_ #define VC_COMMON_MAKE_UNIQUE_H_ #include #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { namespace Common { template struct Deleter { Vc_ALWAYS_INLINE void operator()(T *ptr) { ptr->~T(); Vc::free(ptr); } }; template inline std::unique_ptr> make_unique(Args&&... args) { return std::unique_ptr>(new(Vc::malloc(1)) T(std::forward(args)...)); } } // namespace Common } // namespace Vc #endif // VC_COMMON_MAKE_UNIQUE_H_ Vc-1.3.3/common/malloc.h000066400000000000000000000065041320703111200147710ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2013-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ #ifndef VC_COMMON_MALLOC_H_ #define VC_COMMON_MALLOC_H_ #ifndef Vc_VECTOR_DECLARED_ #error "Incorrect inclusion order. This header must be included from Vc/vector.h only." #endif #if defined _WIN32 || defined _WIN64 #include #else #include #endif #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { namespace Common { template static constexpr size_t nextMultipleOf(size_t value) { return (value % X) > 0 ? value + X - (value % X) : value; } template Vc_INTRINSIC void *aligned_malloc(std::size_t n) { #ifdef __MIC__ return _mm_malloc(nextMultipleOf(n), alignment); #elif defined(_WIN32) # ifdef __GNUC__ return __mingw_aligned_malloc(nextMultipleOf(n), alignment); # else return _aligned_malloc(nextMultipleOf(n), alignment); # endif #else void *ptr = nullptr; if (0 == posix_memalign(&ptr, alignment < sizeof(void *) ? 
sizeof(void *) : alignment, nextMultipleOf(n))) { return ptr; } return ptr; #endif } template Vc_ALWAYS_INLINE void *malloc(size_t n) { switch (A) { case Vc::AlignOnVector: return aligned_malloc(n); case Vc::AlignOnCacheline: // TODO: hardcoding 64 is not such a great idea return aligned_malloc<64>(n); case Vc::AlignOnPage: // TODO: hardcoding 4096 is not such a great idea return aligned_malloc<4096>(n); } return nullptr; } Vc_ALWAYS_INLINE void free(void *p) { #ifdef __MIC__ _mm_free(p); #elif defined(_WIN32) # ifdef __GNUC__ return __mingw_aligned_free(p); # else return _aligned_free(p); # endif #else std::free(p); #endif } } // namespace Common } // namespace Vc #endif // VC_COMMON_MALLOC_H_ Vc-1.3.3/common/mask.h000066400000000000000000000335651320703111200144640ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ #ifndef VC_COMMON_MASK_H_ #define VC_COMMON_MASK_H_ #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { /** * \class Mask mask.h * \ingroup Masks * * The main SIMD mask class. */ template > class Mask { public: /** * Returns the number of boolean components (\VSize{T}) in a mask of this type. * * The size of the mask. I.e. the number of boolean entries in the mask. Do not * make any assumptions about the size of masks. * * In addition, you can easily use if clauses that compare sizes. The compiler can * statically evaluate and fully optimize dead code away (very much like \#ifdef, but * with syntax checking). * * \returns The number of components (i.e. \VSize{T}) objects of this mask type store * and manipulate. */ static constexpr size_t size() { return VectorTraits::size(); } ///\copydoc size ///\deprecated Use Vc::Mask::size instead. static constexpr size_t Size = VectorTraits::size(); /** * Specifies the alignment requirement for aligned load and store calls for objects of * this mask type. */ static constexpr size_t MemoryAlignment = VectorTraits::maskMemoryAlignment(); /// The ABI tag type of the current template instantiation. using abi = Abi; /** * The \c EntryType of masks is always \c bool, independent of \c T. 
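// Usage sketch for the aligned allocation helpers above. The user-facing entry points
// are Vc::malloc / Vc::free (defined in common/memory.h further below, forwarding to
// Common::malloc and Common::free here) and Vc::Common::make_unique. Hedged example:
// Vc::malloc does not run constructors, and SomeType with its arguments is a placeholder.
//
//   // raw, vector-aligned storage for 1000 floats, released with Vc::free
//   float *buf = Vc::malloc<float, Vc::AlignOnVector>(1000);
//   // ... use buf ...
//   Vc::free(buf);
//
//   // a single object in aligned storage with automatic cleanup through Common::Deleter
//   auto p = Vc::Common::make_unique<SomeType, Vc::AlignOnVector>(arg1, arg2);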
*/ using EntryType = bool; /// \copydoc EntryType using value_type = EntryType; /// The reference wrapper type used for accessing individual mask components. using EntryReference = typename VectorTraits::EntryReference; /// \copydoc EntryReference using value_reference = EntryReference; /** * The \c VectorEntryType, in contrast to \c EntryType, reveals information about the SIMD * implementation. * This type is useful for the \c sizeof operator in generic functions. */ using VectorEntryType = typename VectorTraits::VectorEntryType; /**\internal * The \c VectorType reveals the implementation-specific internal type used for the SIMD type. */ using VectorType = typename VectorTraits::VectorType; /**\internal * \copydoc VectorType */ using vector_type = VectorType; /* * The associated Vector type. */ //using Vector = Vector; /// \name Generators ///@{ /** * Creates a new mask object initialized to zero/\c false. * * \returns A mask object with zero-initialized components. */ Vc_INTRINSIC static Mask Zero(); /** * Creates a mask object initialized to one/\c true. * * \returns A mask object with components initialized to \c true. */ Vc_INTRINSIC static Mask One(); /// Generate a mask object from booleans returned from the function \p gen. template static Vc_INTRINSIC Mask generate(G &&gen); ///@} /// \name Compile-Time Constant Initialization ///@{ /** * Construct a zero-initialized vector object. * * This constructor follows the behavior of the underlying \c bool type in that the * expression `bool()` zero-initializes the object (to \c false). On the other hand * the variable \c x in `bool x;` is uninitialized. * Since, for class types, both expressions call the default constructor `Mask x` * must zero-initialize \c x as well. */ Vc_INTRINSIC Mask() = default; /// Zero-initialize the new mask object (\c false). /// \see Vc::Zero, Zero() Vc_INTRINSIC explicit Mask(VectorSpecialInitializerZero); /// Initialize the new mask object to one (\c true). /// \see Vc::One, One() Vc_INTRINSIC explicit Mask(VectorSpecialInitializerOne); ///@} /// \name Conversion/Broadcast Constructors ///@{ /** * Broadcast constructor. * * Set all components of the new mask object to \p b. * * \param b Determines the initial state of the mask. */ Vc_INTRINSIC explicit Mask(bool b); /** * Implicit conversion from a compatible (equal \VSize{T} on every platform) mask * object. * * \param otherMask The mask to be converted. */ template Vc_INTRINSIC Mask(U &&otherMask, Common::enable_if_mask_converts_implicitly = nullarg); #if Vc_IS_VERSION_1 /** * Explicit conversion (static_cast) from a mask object that potentially has a * different \VSize{T}. * * \param otherMask The mask to be converted. * * \internal This is implemented via simd_cast in scalar/simd_cast_caller.h */ template Vc_DEPRECATED( "use simd_cast instead of explicit type casting to convert between mask types") Vc_INTRINSIC_L explicit Mask(U &&otherMask, Common::enable_if_mask_converts_explicitly = nullarg) Vc_INTRINSIC_R; ///@} #endif /** * \name Loads & Stores */ ///@{ /** * Load constructor from an array of \c bool. * * This constructor implements an explicit conversion from an array of booleans to a * mask object. It corresponds to a Vector load constructor. * * \param mem A pointer to the start of the array of booleans. * \see Mask(const bool *, Flags), load(const bool *) */ Vc_ALWAYS_INLINE explicit Mask(const bool *mem); /** * Overload of the above with a load/store flag argument. * * \param mem A pointer to the start of the array of booleans. 
* \param flags Choose a combination of flags such as Vc::Aligned, Vc::Streaming, * Vc::Unaligned, Vc::PrefetchDefault, ... * \see load(const bool *, Flags) */ template Vc_ALWAYS_INLINE explicit Mask(const bool *mem, Flags flags); /** * Load the components of the mask from an array of \c bool. * * \param mem A pointer to the start of the array of booleans. * \see load(const bool *, Flags), Mask(const bool *) */ Vc_ALWAYS_INLINE void load(const bool *mem); /** * Overload of the above with a load/store flag argument. * * \param mem A pointer to the start of the array of booleans. * \param flags Choose a combination of flags such as Vc::Aligned, Vc::Streaming, * Vc::Unaligned, Vc::PrefetchDefault, ... * \see Mask(const bool *, Flags) */ template Vc_ALWAYS_INLINE void load(const bool *mem, Flags flags); /** * Store the values of the mask to an array of \c bool. * * \param mem A pointer to the start of the array of booleans. * \see store(bool *, Flags) */ Vc_ALWAYS_INLINE void store(bool *mem) const; /** * Overload of the above with a load/store flag argument. * * \param mem A pointer to the start of the array of booleans. * \param flags Choose a combination of flags such as Vc::Aligned, Vc::Streaming, * Vc::Unaligned, Vc::PrefetchDefault, ... */ template Vc_ALWAYS_INLINE void store(bool *mem, Flags flags) const; ///@} /// \name Comparison Operators ///@{ /** * Returns whether the two masks are equal in all components. * * \param mask The other mask to compare against. * \returns A scalar boolean value that says whether all components of the two masks * are equal. * * \note If you expected a behavior similar to the compare operator of Vc::Vector, * consider that the bitwise operators already implement such functionality. There is * little use, typically, in having `a == b` return the same as `a ^ b`. In general, * it is more useful to query `all_of(a ^ b)` which is the same as this equality * operator. */ Vc_ALWAYS_INLINE bool operator==(const Mask &mask) const; /** * Returns whether the two masks are different in at least one component. * * \param mask The other mask to compare against. * \returns A scalar boolean value that says whether at least one component of the two masks is different. * * \note `(a == b) == !(a != b)` holds * \see Mask::operator==(const Mask &) */ Vc_ALWAYS_INLINE bool operator!=(const Mask &mask) const; ///@} /** * \name Logical and Binary Operators * * \brief Component-wise logical/binary operations on mask objects. * * The effect of logical and binary \c AND and \c OR is equivalent for mask types (as * it is for \c bool). */ ///@{ /// Returns the component-wise application of a logical \c AND to \p mask. Vc_ALWAYS_INLINE Mask operator&&(const Mask &mask) const; /// Returns the component-wise application of a binary \c AND to \p mask. Vc_ALWAYS_INLINE Mask operator&(const Mask &mask) const; /// Returns the component-wise application of a logical \c OR to \p mask. Vc_ALWAYS_INLINE Mask operator||(const Mask &mask) const; /// Returns the component-wise application of a binary \c OR to \p mask. Vc_ALWAYS_INLINE Mask operator|(const Mask &mask) const; /// Returns the component-wise application of a binary \c XOR to \p mask. Vc_ALWAYS_INLINE Mask operator^(const Mask &mask) const; /// Returns a mask with inverted components. Vc_ALWAYS_INLINE Mask operator!() const; /// Modifies the mask using an \c AND operation with \p mask. Vc_ALWAYS_INLINE Mask &operator&=(const Mask &mask); /// Modifies the mask using an \c OR operation with \p mask. 
Vc_ALWAYS_INLINE Mask &operator|=(const Mask &mask); /// Modifies the mask using an \c XOR operation with \p mask. Vc_ALWAYS_INLINE Mask &operator^=(const Mask &mask); ///@} /** * \name Reductions * * \see any_of, all_of, none_of, some_of */ ///@{ /// Returns a logical \c AND of all components. Vc_ALWAYS_INLINE bool isFull() const; /// Returns a logical \c OR of all components. Vc_ALWAYS_INLINE bool isNotEmpty() const; /// Returns \c true if components are \c false, \c false otherwise. Vc_ALWAYS_INLINE bool isEmpty() const; /// Returns `!isFull() && !isEmpty()`. Vc_ALWAYS_INLINE bool isMix() const; ///@} /**\internal * \name Internal Data Access */ ///@{ Vc_ALWAYS_INLINE bool data() const; Vc_ALWAYS_INLINE bool dataI() const; Vc_ALWAYS_INLINE bool dataD() const; ///@} /// \name Scalar Subscript Operators ///@{ /** * Lvalue-reference-like access to mask entries. * * \param index Determines the boolean to be accessed. * \return a temporary proxy object referencing the \p index th entry of the mask. * * \warning This operator does not return an lvalue reference (to \c bool), but rather * a temporary (rvalue) object that mimics an lvalue reference (as much as is possible * with C++11/14). */ Vc_ALWAYS_INLINE EntryReference operator[](size_t index); /** * Read-only access to mask entries. * * \param index Determines the boolean to be accessed. * \return The \p index th entry of the mask as a \c bool (rvalue). * * \warning This operator does not return an lvalue reference (to `const bool`), but * rather a temporary (rvalue) \c bool. */ Vc_ALWAYS_INLINE EntryType operator[](size_t index) const; ///@} /// Returns how many components of the mask are \c true. Vc_ALWAYS_INLINE int count() const; /** * Returns the index of the first one in the mask. * * \returns the index of the first component that is \c true. * * \warning The return value is undefined if the mask is empty. * * Thus, unless `none_of(mask)`, `mask[mask.firstOne()] == true` holds and `mask[i] == * false` for all `i < mask.firstOne()`. */ Vc_ALWAYS_INLINE int firstOne() const; /** * Convert the boolean components of the mask into bits of an integer. * * \return An \c int where each bit corresponds to the boolean value in the mask. * * For example, the mask `[true, false, false, true]` results in a `9` (in binary: `1001`). */ Vc_ALWAYS_INLINE int toInt() const; /// Returns a mask with components shifted by \p amount places. Vc_INTRINSIC Vc_PURE Mask shifted(int amount) const; Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(Mask)); private: VectorType d; }; } // namespace Vc #endif // VC_COMMON_MASK_H_ // vim: foldmethod=marker Vc-1.3.3/common/maskbool.h000066400000000000000000000102421320703111200153230ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2013-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
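// Usage sketch for the Vc::Mask interface documented above in common/mask.h (a sketch:
// float_m is the mask type matching float_v in the default ABI):
//
//   Vc::float_v x = Vc::float_v::IndexesFromZero();  // [0, 1, 2, ...]
//   Vc::float_m m = x > 1.5f;                        // component-wise comparison
//   if (Vc::any_of(m)) {
//       int n    = m.count();     // number of true components
//       int bits = m.toInt();     // bit i is set iff component i is true
//       int i    = m.firstOne();  // index of the first true component (mask not empty)
//       x(m) = 0.f;               // write-masked assignment on the vector
//   }
//
// any_of/all_of/none_of/some_of are the free-function counterparts of the isNotEmpty()/
// isFull()/isEmpty()/isMix() member reductions documented above.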
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ #ifndef VC_COMMON_MASKENTRY_H_ #define VC_COMMON_MASKENTRY_H_ #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { namespace Common { namespace { template struct MaskBoolStorage; // the following for typedefs must use std::intN_t and NOT! Vc::intN_t. The latter // segfaults ICC 15.0.3. template<> struct MaskBoolStorage<1> { typedef std::int8_t type; }; template<> struct MaskBoolStorage<2> { typedef std::int16_t type; }; template<> struct MaskBoolStorage<4> { typedef std::int32_t type; }; template<> struct MaskBoolStorage<8> { typedef std::int64_t type; }; } // anonymous namespace template class MaskBool { typedef typename MaskBoolStorage::type storage_type Vc_MAY_ALIAS; storage_type data; public: constexpr MaskBool(bool x) noexcept : data(x ? -1 : 0) {} Vc_ALWAYS_INLINE MaskBool &operator=(bool x) noexcept { data = x ? -1 : 0; return *this; } template ::value && std::is_fundamental::value)>> Vc_ALWAYS_INLINE MaskBool &operator=(T x) noexcept { data = reinterpret_cast(x); return *this; } Vc_ALWAYS_INLINE MaskBool(const MaskBool &) noexcept = default; Vc_ALWAYS_INLINE MaskBool &operator=(const MaskBool &) noexcept = default; template ::value || (std::is_fundamental::value && sizeof(storage_type) == sizeof(T)))>> constexpr operator T() const noexcept { return std::is_same::value ? T((data & 1) != 0) : reinterpret_cast &>(data); } } Vc_MAY_ALIAS; template ::value &&std::is_convertible::value, int>::type = 0> constexpr bool operator==(A &&a, B &&b) { return static_cast(a) == static_cast(b); } template ::value &&std::is_convertible::value, int>::type = 0> constexpr bool operator!=(A &&a, B &&b) { return static_cast(a) != static_cast(b); } static_assert(true == MaskBool<4>(true), "true == MaskBool<4>(true)"); static_assert(true != MaskBool<4>(false), "true != MaskBool<4>(false)"); } // namespace Common } // namespace Vc #endif // VC_COMMON_MASKENTRY_H_ Vc-1.3.3/common/math.h000066400000000000000000000123511320703111200144500ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2013-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
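// MaskBool<Bytes>, defined above in common/maskbool.h, represents a single boolean using
// the bit pattern of a SIMD mask element: all-zero bits for false, all-one bits for true,
// stored in a signed integer of the requested byte width, and it converts back to bool by
// testing the low bit. The static_asserts at the end of that header already demonstrate
// the intended semantics; a small sketch:
//
//   Vc::Common::MaskBool<4> b(true);  // 32-bit storage holding all-one bits
//   bool f = b;                       // converts back to true (tests the low bit)
//   b = false;                        // storage becomes 0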
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ #ifndef VC_COMMON_MATH_H_ #define VC_COMMON_MATH_H_ #define Vc_COMMON_MATH_H_INTERNAL 1 #include "trigonometric.h" #include "const.h" #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { #ifdef Vc_IMPL_SSE // for SSE, AVX, and AVX2 #include "logarithm.h" #include "exponential.h" #ifdef Vc_IMPL_AVX inline AVX::double_v exp(AVX::double_v _x) { AVX::Vector x = _x; typedef AVX::Vector V; typedef V::Mask M; typedef AVX::Const C; const M overflow = x > Vc::Detail::doubleConstant< 1, 0x0006232bdd7abcd2ull, 9>(); // max log const M underflow = x < Vc::Detail::doubleConstant<-1, 0x0006232bdd7abcd2ull, 9>(); // min log V px = floor(C::log2_e() * x + 0.5); __m128i tmp = _mm256_cvttpd_epi32(px.data()); const SimdArray n = SSE::int_v{tmp}; x -= px * C::ln2_large(); //Vc::Detail::doubleConstant<1, 0x00062e4000000000ull, -1>(); // ln2 x -= px * C::ln2_small(); //Vc::Detail::doubleConstant<1, 0x0007f7d1cf79abcaull, -20>(); // ln2 const double P[] = { Vc::Detail::doubleConstant<1, 0x000089cdd5e44be8ull, -13>(), Vc::Detail::doubleConstant<1, 0x000f06d10cca2c7eull, -6>(), Vc::Detail::doubleConstant<1, 0x0000000000000000ull, 0>() }; const double Q[] = { Vc::Detail::doubleConstant<1, 0x00092eb6bc365fa0ull, -19>(), Vc::Detail::doubleConstant<1, 0x0004ae39b508b6c0ull, -9>(), Vc::Detail::doubleConstant<1, 0x000d17099887e074ull, -3>(), Vc::Detail::doubleConstant<1, 0x0000000000000000ull, 1>() }; const V x2 = x * x; px = x * ((P[0] * x2 + P[1]) * x2 + P[2]); x = px / ((((Q[0] * x2 + Q[1]) * x2 + Q[2]) * x2 + Q[3]) - px); x = V::One() + 2.0 * x; x = ldexp(x, n); // == x * 2ⁿ x(overflow) = std::numeric_limits::infinity(); x.setZero(underflow); return x; } #endif // Vc_IMPL_AVX inline SSE::double_v exp(SSE::double_v::AsArg _x) { SSE::Vector x = _x; typedef SSE::Vector V; typedef V::Mask M; typedef SSE::Const C; const M overflow = x > Vc::Detail::doubleConstant< 1, 0x0006232bdd7abcd2ull, 9>(); // max log const M underflow = x < Vc::Detail::doubleConstant<-1, 0x0006232bdd7abcd2ull, 9>(); // min log V px = floor(C::log2_e() * x + 0.5); SimdArray n; _mm_storel_epi64(reinterpret_cast<__m128i *>(&n), _mm_cvttpd_epi32(px.data())); x -= px * C::ln2_large(); //Vc::Detail::doubleConstant<1, 0x00062e4000000000ull, -1>(); // ln2 x -= px * C::ln2_small(); //Vc::Detail::doubleConstant<1, 0x0007f7d1cf79abcaull, -20>(); // ln2 const double P[] = { Vc::Detail::doubleConstant<1, 0x000089cdd5e44be8ull, -13>(), Vc::Detail::doubleConstant<1, 0x000f06d10cca2c7eull, -6>(), Vc::Detail::doubleConstant<1, 0x0000000000000000ull, 0>() }; const double Q[] = { Vc::Detail::doubleConstant<1, 0x00092eb6bc365fa0ull, -19>(), Vc::Detail::doubleConstant<1, 0x0004ae39b508b6c0ull, -9>(), Vc::Detail::doubleConstant<1, 0x000d17099887e074ull, -3>(), Vc::Detail::doubleConstant<1, 0x0000000000000000ull, 1>() }; const V x2 = x * 
x; px = x * ((P[0] * x2 + P[1]) * x2 + P[2]); x = px / ((((Q[0] * x2 + Q[1]) * x2 + Q[2]) * x2 + Q[3]) - px); x = V::One() + 2.0 * x; x = ldexp(x, n); // == x * 2ⁿ x(overflow) = std::numeric_limits::infinity(); x.setZero(underflow); return x; } #endif } // namespace Vc #undef Vc_COMMON_MATH_H_INTERNAL #endif // VC_COMMON_MATH_H_ Vc-1.3.3/common/memory.h000066400000000000000000000616511320703111200150360ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2009-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ #ifndef VC_COMMON_MEMORY_H_ #define VC_COMMON_MEMORY_H_ #include "memorybase.h" #include #include #include #include #include #include "memoryfwd.h" #include "malloc.h" #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { /** * Allocates memory on the Heap with alignment and padding suitable for vectorized access. * * Memory that was allocated with this function must be released with Vc::free! Other methods might * work but are not portable. * * \param n Specifies the number of objects the allocated memory must be able to store. * \tparam T The type of the allocated memory. Note, that the constructor is not called. * \tparam A Determines the alignment of the memory. See \ref Vc::MallocAlignment. * * \return Pointer to memory of the requested type, or 0 on error. The allocated memory is padded at * the end to be a multiple of the requested alignment \p A. Thus if you request memory for 21 * int objects, aligned via Vc::AlignOnCacheline, you can safely read a full cacheline until the * end of the array, without generating an out-of-bounds access. For a cacheline size of 64 Bytes * and an int size of 4 Bytes you would thus get an array of 128 Bytes to work with. * * \warning * \li The standard malloc function specifies the number of Bytes to allocate whereas this * function specifies the number of values, thus differing in a factor of sizeof(T). * \li This function is mainly meant for use with builtin types. If you use a custom * type with a sizeof that is not a multiple of 2 the results might not be what you expect. * \li The constructor of T is not called. 
You can make up for this: * \code * SomeType *array = new(Vc::malloc(N)) SomeType[N]; * \endcode * * \see Vc::free * * \ingroup Utilities * \headerfile memory.h */ template Vc_ALWAYS_INLINE T *malloc(size_t n) { return static_cast(Common::malloc(n * sizeof(T))); } /** * Frees memory that was allocated with Vc::malloc. * * \param p The pointer to the memory to be freed. * * \tparam T The type of the allocated memory. * * \warning The destructor of T is not called. If needed, you can call the destructor before calling * free: * \code * for (int i = 0; i < N; ++i) { * p[i].~T(); * } * Vc::free(p); * \endcode * * \ingroup Utilities * \headerfile memory.h * * \see Vc::malloc */ template Vc_ALWAYS_INLINE void free(T *p) { Common::free(p); } namespace Common { template struct _MemorySizeCalculation { enum AlignmentCalculations { Alignment = V::Size, AlignmentMask = Alignment - 1, MaskedSize = Size & AlignmentMask, Padding = Alignment - MaskedSize, PaddedSize = MaskedSize == 0 ? Size : Size + Padding }; }; /** * \ingroup Utilities * \headerfile memory.h * * A helper class for fixed-size two-dimensional arrays. * * \param V The vector type you want to operate on. (e.g. float_v or uint_v) * \param Size1 Number of rows * \param Size2 Number of columns */ template #ifdef Vc_RECURSIVE_MEMORY class Memory : public MemoryBase, 2, Memory> #else class Memory : public AlignedBase, public MemoryBase, 2, Memory> #endif { public: typedef typename V::EntryType EntryType; private: #ifdef Vc_RECURSIVE_MEMORY using RowMemory = Memory; #else using RowMemory = Memory; #endif typedef MemoryBase, 2, RowMemory> Base; friend class MemoryBase, 2, RowMemory>; friend class MemoryDimensionBase, 2, RowMemory>; enum : size_t { Alignment = V::MemoryAlignment, PaddedSize2 = _MemorySizeCalculation::PaddedSize }; alignas(static_cast(Alignment)) // GCC complains about 'is not an // integer constant' unless the // static_cast is present #ifdef Vc_RECURSIVE_MEMORY RowMemory m_mem[Size1]; #else EntryType m_mem[Size1][PaddedSize2]; #endif public: using Base::vector; enum Constants { RowCount = Size1, VectorsCount = PaddedSize2 / V::Size }; #ifdef Vc_RECURSIVE_MEMORY Memory() = default; #else Memory() { if (InitPadding) { if (Size1 > 32) for (size_t i = 0; i < Size1; ++i) { V::Zero().store(&m_mem[i][PaddedSize2 - V::Size], Vc::Streaming); } } } #endif /** * \return the number of rows in the array. * * \note This function can be eliminated by an optimizing compiler. */ static constexpr size_t rowsCount() { return RowCount; } /** * \return the number of scalar entries in the whole array. * * \warning Do not use this function for scalar iteration over the array since there will be * padding between rows if \c Size2 is not divisible by \c V::Size. * * \note This function can be optimized into a compile-time constant. */ static constexpr size_t entriesCount() { return Size1 * Size2; } /** * \return the number of vectors in the whole array. * * \note This function can be optimized into a compile-time constant. */ static constexpr size_t vectorsCount() { return VectorsCount * Size1; } /** * Copies the data from a different object. * * \param rhs The object to copy the data from. * * \return reference to the modified Memory object. * * \note Both objects must have the exact same vectorsCount(). 
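 *
 * A usage sketch for the two-dimensional Memory class and this assignment (a sketch;
 * assumes the default float_v ABI):
 * \code
 * Vc::Memory<Vc::float_v, 8, 8> a, b;
 * b = Vc::float_v::One();                 // fill every vector of b with 1.f
 * a = b;                                  // copy all vectors of b into a
 * a[3][2] = 1.5f;                         // scalar access: row 3, column 2
 * a[3].vector(0) += Vc::float_v::One();   // vector access into row 3
 * \endcode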
*/ template Vc_ALWAYS_INLINE Memory &operator=(const MemoryBase &rhs) { assert(vectorsCount() == rhs.vectorsCount()); Detail::copyVectors(*this, rhs); return *this; } Vc_ALWAYS_INLINE Memory &operator=(const Memory &rhs) { Detail::copyVectors(*this, rhs); return *this; } /** * Initialize all data with the given vector. * * \param v This vector will be used to initialize the memory. * * \return reference to the modified Memory object. */ inline Memory &operator=(const V &v) { for (size_t i = 0; i < vectorsCount(); ++i) { vector(i) = v; } return *this; } }; /** * A helper class to simplify usage of correctly aligned and padded memory, allowing both vector and * scalar access. * * Example: * \code Vc::Memory array; // scalar access: for (size_t i = 0; i < array.entriesCount(); ++i) { int x = array[i]; // read array[i] = x; // write } // more explicit alternative: for (size_t i = 0; i < array.entriesCount(); ++i) { int x = array.scalar(i); // read array.scalar(i) = x; // write } // vector access: for (size_t i = 0; i < array.vectorsCount(); ++i) { int_v x = array.vector(i); // read array.vector(i) = x; // write } * \endcode * This code allocates a small array and implements three equivalent loops (that do nothing useful). * The loops show how scalar and vector read/write access is best implemented. * * Since the size of 11 is not a multiple of int_v::Size (unless you use the * scalar Vc implementation) the last write access of the vector loop would normally be out of * bounds. But the Memory class automatically pads the memory such that the whole array can be * accessed with correctly aligned memory addresses. * * \param V The vector type you want to operate on. (e.g. float_v or uint_v) * \param Size The number of entries of the scalar base type the memory should hold. This * is thus the same number as you would use for a normal C array (e.g. float mem[11] becomes * Memory mem). * * \see Memory * * \ingroup Utilities * \headerfile memory.h */ template class Memory : #ifndef Vc_RECURSIVE_MEMORY public AlignedBase, #endif public MemoryBase, 1, void> { public: typedef typename V::EntryType EntryType; private: typedef MemoryBase, 1, void> Base; friend class MemoryBase, 1, void>; friend class MemoryDimensionBase, 1, void>; enum : size_t { Alignment = V::MemoryAlignment, // in Bytes MaskedSize = Size & (V::Size - 1), // the fraction of Size that exceeds // an integral multiple of V::Size Padding = V::Size - MaskedSize, PaddedSize = MaskedSize == 0 ? Size : Size + Padding }; alignas(static_cast(Alignment)) // GCC complains about 'is not an // integer constant' unless the // static_cast is present EntryType m_mem[PaddedSize]; public: using Base::vector; enum Constants { EntriesCount = Size, VectorsCount = PaddedSize / V::Size }; Memory() { if (InitPadding) { Base::lastVector() = V::Zero(); } } Memory(std::initializer_list init) { Vc_ASSERT(init.size() <= Size); Base::lastVector() = V::Zero(); std::copy(init.begin(), init.end(), &m_mem[0]); } /** * Wrap existing data with the Memory convenience class. * * This function returns a \em reference to a Memory object that you must * capture to avoid a copy of the whole data: * \code * Memory &m = Memory::fromRawData(someAlignedPointerToFloat) * \endcode * * \param ptr An aligned pointer to memory of type \p V::EntryType (e.g. \c float for * Vc::float_v). * \return A Memory object placed at the given location in memory. * * \warning The pointer \p ptr passed to this function must be aligned according to the * alignment restrictions of \p V. 
* \warning The size of the accessible memory must match \p Size. This includes the * required padding at the end to allow the last entries to be accessed via vectors. If * you know what you are doing you might violate this constraint. * \warning It is your responsibility to ensure that the memory is released correctly * (not too early/not leaked). This function simply adds convenience functions to \em * access the memory. */ static Vc_ALWAYS_INLINE Vc_CONST Memory &fromRawData(EntryType *ptr) { // DANGER! This placement new has to use the right address. If the compiler decides // RowMemory requires padding before the actual data then the address has to be adjusted // accordingly char *addr = reinterpret_cast(ptr); typedef Memory MM; addr -= Vc_OFFSETOF(MM, m_mem); return *new(addr) MM; } /** * \return the number of scalar entries in the whole array. * * \note This function can be optimized into a compile-time constant. */ static constexpr size_t entriesCount() { return EntriesCount; } /** * \return the number of vectors in the whole array. * * \note This function can be optimized into a compile-time constant. */ static constexpr size_t vectorsCount() { return VectorsCount; } inline Memory(const Memory &rhs) { Detail::copyVectors(*this, rhs); } template inline Memory(const Memory &rhs) { assert(vectorsCount() == rhs.vectorsCount()); Detail::copyVectors(*this, rhs); } inline Memory &operator=(const Memory &rhs) { Detail::copyVectors(*this, rhs); return *this; } template inline Memory &operator=(const Memory &rhs) { assert(vectorsCount() == rhs.vectorsCount()); Detail::copyVectors(*this, rhs); return *this; } Vc_ALWAYS_INLINE Memory &operator=(const EntryType *rhs) { std::memcpy(m_mem, rhs, entriesCount() * sizeof(EntryType)); return *this; } inline Memory &operator=(const V &v) { for (size_t i = 0; i < vectorsCount(); ++i) { vector(i) = v; } return *this; } }; /** * A helper class that is very similar to Memory but with dynamically allocated memory and * thus dynamic size. * * Example: * \code size_t size = 11; Vc::Memory array(size); // scalar access: for (size_t i = 0; i < array.entriesCount(); ++i) { array[i] = i; } // vector access: for (size_t i = 0; i < array.vectorsCount(); ++i) { array.vector(i) = int_v::IndexesFromZero() + i * int_v::Size; } * \endcode * This code allocates a small array with 11 scalar entries * and implements two equivalent loops that initialize the memory. * The scalar loop writes each individual int. The vectorized loop writes int_v::Size values to * memory per iteration. Since the size of 11 is not a multiple of int_v::Size (unless you use the * scalar Vc implementation) the last write access of the vector loop would normally be out of * bounds. But the Memory class automatically pads the memory such that the whole array can be * accessed with correctly aligned memory addresses. * (Note: the scalar loop can be auto-vectorized, except for the last three assignments.) * * \note The internal data pointer is not declared with the \c __restrict__ keyword. Therefore * modifying memory of V::EntryType will require the compiler to assume aliasing. If you want to use * the \c __restrict__ keyword you need to use a standard pointer to memory and do the vector * address calculation and loads and stores manually. * * \param V The vector type you want to operate on. (e.g. 
float_v or uint_v) * * \see Memory * * \ingroup Utilities * \headerfile memory.h */ template class Memory : public MemoryBase, 1, void> { public: typedef typename V::EntryType EntryType; private: typedef MemoryBase, 1, void> Base; friend class MemoryBase, 1, void>; friend class MemoryDimensionBase, 1, void>; enum InternalConstants { Alignment = V::Size, AlignmentMask = Alignment - 1 }; size_t m_entriesCount; size_t m_vectorsCount; EntryType *m_mem; size_t calcPaddedEntriesCount(size_t x) { size_t masked = x & AlignmentMask; return (masked == 0 ? x : x + (Alignment - masked)); } public: using Base::vector; /** * Allocate enough memory to access \p size values of type \p V::EntryType. * * The allocated memory is aligned and padded correctly for fully vectorized access. * * \param size Determines how many scalar values will fit into the allocated memory. */ Vc_ALWAYS_INLINE Memory(size_t size) : m_entriesCount(size), m_vectorsCount(calcPaddedEntriesCount(m_entriesCount)), m_mem(Vc::malloc(m_vectorsCount)) { m_vectorsCount /= V::Size; Base::lastVector() = V::Zero(); } /** * Copy the memory into a new memory area. * * The allocated memory is aligned and padded correctly for fully vectorized access. * * \param rhs The Memory object to copy from. */ template Vc_ALWAYS_INLINE Memory(const MemoryBase &rhs) : m_entriesCount(rhs.entriesCount()), m_vectorsCount(rhs.vectorsCount()), m_mem(Vc::malloc(m_vectorsCount * V::Size)) { Detail::copyVectors(*this, rhs); } /** * Overload of the above function. * * (Because C++ would otherwise not use the templated cctor and use a default-constructed cctor instead.) * * \param rhs The Memory object to copy from. */ Vc_ALWAYS_INLINE Memory(const Memory &rhs) : m_entriesCount(rhs.entriesCount()), m_vectorsCount(rhs.vectorsCount()), m_mem(Vc::malloc(m_vectorsCount * V::Size)) { Detail::copyVectors(*this, rhs); } /** * Frees the memory which was allocated in the constructor. */ Vc_ALWAYS_INLINE ~Memory() { Vc::free(m_mem); } /** * Swap the contents and size information of two Memory objects. * * \param rhs The other Memory object to swap. */ inline void swap(Memory &rhs) { std::swap(m_mem, rhs.m_mem); std::swap(m_entriesCount, rhs.m_entriesCount); std::swap(m_vectorsCount, rhs.m_vectorsCount); } /** * \return the number of scalar entries in the whole array. */ Vc_ALWAYS_INLINE Vc_PURE size_t entriesCount() const { return m_entriesCount; } /** * \return the number of vectors in the whole array. */ Vc_ALWAYS_INLINE Vc_PURE size_t vectorsCount() const { return m_vectorsCount; } /** * Overwrite all entries with the values stored in \p rhs. * * \param rhs The object to copy the data from. * * \return reference to the modified Memory object. * * \note this function requires the vectorsCount() of both Memory objects to be equal. */ template Vc_ALWAYS_INLINE Memory &operator=(const MemoryBase &rhs) { assert(vectorsCount() == rhs.vectorsCount()); Detail::copyVectors(*this, rhs); return *this; } Vc_ALWAYS_INLINE Memory &operator=(const Memory &rhs) { assert(vectorsCount() == rhs.vectorsCount()); Detail::copyVectors(*this, rhs); return *this; } /** * Overwrite all entries with the values stored in the memory at \p rhs. * * \param rhs The array to copy the data from. * * \return reference to the modified Memory object. * * \note this function requires that there are entriesCount() many values accessible from \p rhs. 
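 *
 * A usage sketch (assumes float entries, i.e. \p V = float_v):
 * \code
 * std::vector<float> plain(100, 1.f);
 * Vc::Memory<Vc::float_v> mem(plain.size());
 * mem = plain.data();   // copies entriesCount() scalars from the plain array
 * \endcode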
*/ Vc_ALWAYS_INLINE Memory &operator=(const EntryType *rhs) { std::memcpy(m_mem, rhs, entriesCount() * sizeof(EntryType)); return *this; } }; /** * Prefetch the cacheline containing \p addr for a single read access. * * This prefetch completely bypasses the cache, not evicting any other data. * * \param addr The cacheline containing \p addr will be prefetched. * * \ingroup Utilities * \headerfile memory.h */ Vc_ALWAYS_INLINE void prefetchForOneRead(const void *addr) { Vc::Detail::prefetchForOneRead(addr, VectorAbi::Best()); } /** * Prefetch the cacheline containing \p addr for modification. * * This prefetch evicts data from the cache. So use it only for data you really will use. When the * target system supports it the cacheline will be marked as modified while prefetching, saving work * later on. * * \param addr The cacheline containing \p addr will be prefetched. * * \ingroup Utilities * \headerfile memory.h */ Vc_ALWAYS_INLINE void prefetchForModify(const void *addr) { Vc::Detail::prefetchForModify(addr, VectorAbi::Best()); } /** * Prefetch the cacheline containing \p addr to L1 cache. * * This prefetch evicts data from the cache. So use it only for data you really will use. * * \param addr The cacheline containing \p addr will be prefetched. * * \ingroup Utilities * \headerfile memory.h */ Vc_ALWAYS_INLINE void prefetchClose(const void *addr) { Vc::Detail::prefetchClose(addr, VectorAbi::Best()); } /** * Prefetch the cacheline containing \p addr to L2 cache. * * This prefetch evicts data from the cache. So use it only for data you really will use. * * \param addr The cacheline containing \p addr will be prefetched. * * \ingroup Utilities * \headerfile memory.h */ Vc_ALWAYS_INLINE void prefetchMid(const void *addr) { Vc::Detail::prefetchMid(addr, VectorAbi::Best()); } /** * Prefetch the cacheline containing \p addr to L3 cache. * * This prefetch evicts data from the cache. So use it only for data you really will use. * * \param addr The cacheline containing \p addr will be prefetched. * * \ingroup Utilities * \headerfile memory.h */ Vc_ALWAYS_INLINE void prefetchFar(const void *addr) { Vc::Detail::prefetchFar(addr, VectorAbi::Best()); } } // namespace Common using Common::Memory; using Common::prefetchForOneRead; using Common::prefetchForModify; using Common::prefetchClose; using Common::prefetchMid; using Common::prefetchFar; } // namespace Vc namespace std { template Vc_ALWAYS_INLINE void swap(Vc::Memory &a, Vc::Memory &b) { a.swap(b); } } // namespace std #endif // VC_COMMON_MEMORY_H_ Vc-1.3.3/common/memorybase.h000066400000000000000000001010661320703111200156640ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2009-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ #ifndef VC_COMMON_MEMORYBASE_H_ #define VC_COMMON_MEMORYBASE_H_ #include #include #include #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { namespace Common { #define Vc_MEM_OPERATOR_EQ(op) \ template \ Vc_ALWAYS_INLINE enable_if_mutable operator op##=(const T &x) { \ const V v = value() op x; \ v.store(&m_data[0], Flags()); \ return *this; \ } /*dox{{{*/ /** * Helper class for the Memory::vector(size_t) class of functions. * * You will never need to directly make use of this class. It is an implementation detail of the * Memory API. * * \headerfile memorybase.h *//*}}}*/ template class MemoryVector/*{{{*/ { typedef typename std::remove_cv<_V>::type V; template using enable_if_mutable = typename std::enable_if::value && !std::is_const<_V>::value, R>::type; typedef typename V::EntryType EntryType; typedef typename V::Mask Mask; EntryType m_data[V::Size]; public: // It is important that neither initialization nor cleanup is done as MemoryVector aliases // other memory Vc_ALWAYS_INLINE MemoryVector() {} // disable copies because this type is supposed to alias the data in a Memory object, // nothing else MemoryVector(const MemoryVector &) = delete; MemoryVector(MemoryVector &&) = delete; // Do not disable MemoryVector &operator=(const MemoryVector &) = delete; because it is // covered nicely by the operator= below. //! \internal Vc_ALWAYS_INLINE Vc_PURE V value() const { return V(&m_data[0], Flags()); } /** * Cast to \p V operator. * * This function allows to assign this object to any object of type \p V. 
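 *
 * For example (a sketch; \c mem is assumed to be some Vc::Memory<float_v>
 * object defined elsewhere):
 * \code
 * float_v x = mem.vector(0); // the returned MemoryVector converts to float_v
 * x += 1.f;                  // x is an independent register value; mem is unchanged
 * \endcode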
*/ Vc_ALWAYS_INLINE Vc_PURE operator V() const { return value(); } template Vc_ALWAYS_INLINE enable_if_mutable operator=(const T &x) { V v; v = x; v.store(&m_data[0], Flags()); return *this; } Vc_ALL_BINARY(Vc_MEM_OPERATOR_EQ); Vc_ALL_ARITHMETICS(Vc_MEM_OPERATOR_EQ); }; template class MemoryVectorIterator { typedef typename std::remove_cv<_V>::type V; template using enable_if_mutable = typename std::enable_if::value && !std::is_const<_V>::value, R>::type; using iterator_traits = std::iterator_traits *>; MemoryVector<_V, Flags> *d; public: typedef typename iterator_traits::difference_type difference_type; typedef typename iterator_traits::value_type value_type; typedef typename iterator_traits::pointer pointer; typedef typename iterator_traits::reference reference; typedef typename iterator_traits::iterator_category iterator_category; constexpr MemoryVectorIterator(MemoryVector<_V, Flags> *dd) : d(dd) {} constexpr MemoryVectorIterator(const MemoryVectorIterator &) = default; constexpr MemoryVectorIterator(MemoryVectorIterator &&) = default; Vc_ALWAYS_INLINE MemoryVectorIterator &operator=(const MemoryVectorIterator &) = default; Vc_ALWAYS_INLINE void *orderBy() const { return d; } Vc_ALWAYS_INLINE difference_type operator-(const MemoryVectorIterator &rhs) const { return d - rhs.d; } Vc_ALWAYS_INLINE reference operator[](size_t i) const { return d[i]; } Vc_ALWAYS_INLINE reference operator*() const { return *d; } Vc_ALWAYS_INLINE pointer operator->() const { return d; } Vc_ALWAYS_INLINE MemoryVectorIterator &operator++() { ++d; return *this; } Vc_ALWAYS_INLINE MemoryVectorIterator operator++(int) { MemoryVectorIterator r(*this); ++d; return r; } Vc_ALWAYS_INLINE MemoryVectorIterator &operator--() { --d; return *this; } Vc_ALWAYS_INLINE MemoryVectorIterator operator--(int) { MemoryVectorIterator r(*this); --d; return r; } Vc_ALWAYS_INLINE MemoryVectorIterator &operator+=(size_t n) { d += n; return *this; } Vc_ALWAYS_INLINE MemoryVectorIterator &operator-=(size_t n) { d -= n; return *this; } Vc_ALWAYS_INLINE MemoryVectorIterator operator+(size_t n) const { return MemoryVectorIterator(d + n); } Vc_ALWAYS_INLINE MemoryVectorIterator operator-(size_t n) const { return MemoryVectorIterator(d - n); } }; template Vc_ALWAYS_INLINE bool operator==(const MemoryVectorIterator &l, const MemoryVectorIterator &r) { return l.orderBy() == r.orderBy(); } template Vc_ALWAYS_INLINE bool operator!=(const MemoryVectorIterator &l, const MemoryVectorIterator &r) { return l.orderBy() != r.orderBy(); } template Vc_ALWAYS_INLINE bool operator>=(const MemoryVectorIterator &l, const MemoryVectorIterator &r) { return l.orderBy() >= r.orderBy(); } template Vc_ALWAYS_INLINE bool operator<=(const MemoryVectorIterator &l, const MemoryVectorIterator &r) { return l.orderBy() <= r.orderBy(); } template Vc_ALWAYS_INLINE bool operator> (const MemoryVectorIterator &l, const MemoryVectorIterator &r) { return l.orderBy() > r.orderBy(); } template Vc_ALWAYS_INLINE bool operator< (const MemoryVectorIterator &l, const MemoryVectorIterator &r) { return l.orderBy() < r.orderBy(); } /*}}}*/ #undef Vc_MEM_OPERATOR_EQ #define Vc_VPH_OPERATOR(op) \ template \ decltype(std::declval() op std::declval()) operator op( \ const MemoryVector &x, const MemoryVector &y) \ { \ return x.value() op y.value(); \ } Vc_ALL_ARITHMETICS(Vc_VPH_OPERATOR); Vc_ALL_BINARY (Vc_VPH_OPERATOR); Vc_ALL_COMPARES (Vc_VPH_OPERATOR); #undef Vc_VPH_OPERATOR template> class MemoryRange/*{{{*/ { Parent *m_parent; size_t m_first; size_t m_last; public: MemoryRange(Parent *p, 
size_t firstIndex, size_t lastIndex) : m_parent(p), m_first(firstIndex), m_last(lastIndex) {} MemoryVectorIterator begin() const { return &m_parent->vector(m_first , Flags()); } MemoryVectorIterator end() const { return &m_parent->vector(m_last + 1, Flags()); } };/*}}}*/ template class MemoryDimensionBase; template class MemoryDimensionBase // {{{1 { private: Parent *p() { return static_cast(this); } const Parent *p() const { return static_cast(this); } public: /** * The type of the scalar entries in the array. */ typedef typename V::EntryType EntryType; /** * Returns a pointer to the start of the allocated memory. */ Vc_ALWAYS_INLINE Vc_PURE EntryType *entries() { return &p()->m_mem[0]; } /// Const overload of the above function. Vc_ALWAYS_INLINE Vc_PURE const EntryType *entries() const { return &p()->m_mem[0]; } /** * Returns the \p i-th scalar value in the memory. */ Vc_ALWAYS_INLINE Vc_PURE EntryType &scalar(size_t i) { return entries()[i]; } /// Const overload of the above function. Vc_ALWAYS_INLINE Vc_PURE const EntryType scalar(size_t i) const { return entries()[i]; } #ifdef DOXYGEN /** * Cast operator to the scalar type. This allows to use the object very much like a standard * C array. */ Vc_ALWAYS_INLINE Vc_PURE operator EntryType*() { return entries(); } /// Const overload of the above function. Vc_ALWAYS_INLINE Vc_PURE operator const EntryType*() const { return entries(); } #else // The above conversion operator allows implicit conversion to bool. To prohibit this // conversion we use SFINAE to allow only conversion to EntryType* and void*. template ::type, EntryType *>::value || std::is_same::type, void *>::value, int>::type = 0> Vc_ALWAYS_INLINE Vc_PURE operator T() { return entries(); } template ::value || std::is_same::value, int>::type = 0> Vc_ALWAYS_INLINE Vc_PURE operator T() const { return entries(); } #endif /** * */ template Vc_ALWAYS_INLINE MemoryRange range(size_t firstIndex, size_t lastIndex, Flags) { return MemoryRange(p(), firstIndex, lastIndex); } Vc_ALWAYS_INLINE MemoryRange range(size_t firstIndex, size_t lastIndex) { return MemoryRange(p(), firstIndex, lastIndex); } template Vc_ALWAYS_INLINE MemoryRange range(size_t firstIndex, size_t lastIndex, Flags) const { return MemoryRange(p(), firstIndex, lastIndex); } Vc_ALWAYS_INLINE MemoryRange range(size_t firstIndex, size_t lastIndex) const { return MemoryRange(p(), firstIndex, lastIndex); } /** * Returns the \p i-th scalar value in the memory. */ Vc_ALWAYS_INLINE EntryType &operator[](size_t i) { return entries()[i]; } /// Const overload of the above function. Vc_ALWAYS_INLINE const EntryType &operator[](size_t i) const { return entries()[i]; } /** * Uses a vector gather to combine the entries at the indexes in \p i into the returned * vector object. * * \param i An integer vector. It determines the entries to be gathered. * \returns A vector object. Modification of this object will not modify the values in * memory. * * \warning The API of this function might change in future versions of Vc to additionally * support scatters. */ template Vc_ALWAYS_INLINE Vc_PURE V operator[](Vector i) const { return V(entries(), i); } }; template class MemoryDimensionBase // {{{1 { private: Parent *p() { return static_cast(this); } const Parent *p() const { return static_cast(this); } public: /** * The type of the scalar entries in the array. */ typedef typename V::EntryType EntryType; static constexpr size_t rowCount() { return Parent::RowCount; } /** * Returns a pointer to the start of the allocated memory. 
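 *
 * \param x Row index; the returned pointer refers to the first entry of row
 *          \p x (row 0 by default, as the definition below shows).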
*/ Vc_ALWAYS_INLINE Vc_PURE EntryType *entries(size_t x = 0) { return &p()->m_mem[x][0]; } /// Const overload of the above function. Vc_ALWAYS_INLINE Vc_PURE const EntryType *entries(size_t x = 0) const { return &p()->m_mem[x][0]; } /** * Returns the \p i,j-th scalar value in the memory. */ Vc_ALWAYS_INLINE Vc_PURE EntryType &scalar(size_t i, size_t j) { return entries(i)[j]; } /// Const overload of the above function. Vc_ALWAYS_INLINE Vc_PURE const EntryType scalar(size_t i, size_t j) const { return entries(i)[j]; } /** * Returns the \p i-th row in the memory. */ Vc_ALWAYS_INLINE Vc_PURE RowMemory &operator[](size_t i) { #ifdef Vc_RECURSIVE_MEMORY return p()->m_mem[i]; #else return RowMemory::fromRawData(entries(i)); #endif } /// Const overload of the above function. Vc_ALWAYS_INLINE Vc_PURE const RowMemory &operator[](size_t i) const { #ifdef Vc_RECURSIVE_MEMORY return p()->m_mem[i]; #else return RowMemory::fromRawData(const_cast(entries(i))); #endif } /** * \return the number of rows in the array. * * \note This function can be eliminated by an optimizing compiler. */ Vc_ALWAYS_INLINE Vc_PURE size_t rowsCount() const { return p()->rowsCount(); } }; //dox{{{1 /** * \headerfile memorybase.h * * Common interface to all Memory classes, independent of allocation on the stack or heap. * * \param V The vector type you want to operate on. (e.g. float_v or uint_v) * \param Parent This type is the complete type of the class that derives from MemoryBase. * \param Dimension The number of dimensions the implementation provides. * \param RowMemory Class to be used to work on a single row. */ template class MemoryBase : public MemoryDimensionBase //{{{1 { static_assert((V::size() * sizeof(typename V::EntryType)) % V::MemoryAlignment == 0, "Vc::Memory can only be used for data-parallel types storing a number " "of values that's a multiple of the memory alignment."); private: Parent *p() { return static_cast(this); } const Parent *p() const { return static_cast(this); } public: /** * The type of the scalar entries in the array. */ typedef typename V::EntryType EntryType; /** * \return the number of scalar entries in the array. This function is optimized away * if a constant size array is used. */ Vc_ALWAYS_INLINE Vc_PURE size_t entriesCount() const { return p()->entriesCount(); } /** * \return the number of vector entries that span the array. This function is optimized away * if a constant size array is used. */ Vc_ALWAYS_INLINE Vc_PURE size_t vectorsCount() const { return p()->vectorsCount(); } using MemoryDimensionBase::entries; using MemoryDimensionBase::scalar; /** * Return a (vectorized) iterator to the start of this memory object. */ template Vc_ALWAYS_INLINE MemoryVectorIterator< V, Flags> begin(Flags flags = Flags()) { return &firstVector(flags); } //! const overload of the above template Vc_ALWAYS_INLINE MemoryVectorIterator begin(Flags flags = Flags()) const { return &firstVector(flags); } /** * Return a (vectorized) iterator to the end of this memory object. */ template Vc_ALWAYS_INLINE MemoryVectorIterator< V, Flags> end(Flags flags = Flags()) { return &lastVector(flags) + 1; } //! const overload of the above template Vc_ALWAYS_INLINE MemoryVectorIterator end(Flags flags = Flags()) const { return &lastVector(flags) + 1; } /** * \param i Selects the offset, where the vector should be read. * * \return a smart object to wrap the \p i-th vector in the memory. * * The return value can be used as any other vector object. I.e. 
you can substitute * something like * \code * float_v a = ..., b = ...; * a += b; * \endcode * with * \code * mem.vector(i) += b; * \endcode * * This function ensures that only \em aligned loads and stores are used. Thus it only allows to * access memory at fixed strides. If access to known offsets from the aligned vectors is * needed the vector(size_t, int) function can be used. */ template Vc_ALWAYS_INLINE Vc_PURE typename std::enable_if::value, MemoryVector>::type &vector(size_t i, Flags = Flags()) { return *new(&entries()[i * V::Size]) MemoryVector; } /** \brief Const overload of the above function * * \param i Selects the offset, where the vector should be read. * * \return a smart object to wrap the \p i-th vector in the memory. */ template Vc_ALWAYS_INLINE Vc_PURE typename std::enable_if::value, MemoryVector>::type &vector(size_t i, Flags = Flags()) const { return *new(const_cast(&entries()[i * V::Size])) MemoryVector; } /** * \return a smart object to wrap the vector starting from the \p i-th scalar entry in the memory. * * Example: * \code * Memory mem; * mem.setZero(); * for (int i = 0; i < mem.entriesCount(); i += float_v::Size) { * mem.vectorAt(i) += b; * } * \endcode * * \param i Specifies the scalar entry from where the vector will be loaded/stored. I.e. the * values scalar(i), scalar(i + 1), ..., scalar(i + V::Size - 1) will be read/overwritten. * * \param flags You must take care to determine whether an unaligned load/store is * required. Per default an unaligned load/store is used. If \p i is a multiple of \c V::Size * you may want to pass Vc::Aligned here. */ template Vc_ALWAYS_INLINE Vc_PURE MemoryVector &vectorAt(size_t i, Flags flags = Flags()) { return *new(&entries()[i]) MemoryVector; } /** \brief Const overload of the above function * * \return a smart object to wrap the vector starting from the \p i-th scalar entry in the memory. * * \param i Specifies the scalar entry from where the vector will be loaded/stored. I.e. the * values scalar(i), scalar(i + 1), ..., scalar(i + V::Size - 1) will be read/overwritten. * * \param flags You must take care to determine whether an unaligned load/store is * required. Per default an unaligned load/store is used. If \p i is a multiple of \c V::Size * you may want to pass Vc::Aligned here. */ template Vc_ALWAYS_INLINE Vc_PURE MemoryVector &vectorAt(size_t i, Flags flags = Flags()) const { return *new(const_cast(&entries()[i])) MemoryVector; } /** * \return a smart object to wrap the \p i-th vector + \p shift in the memory. * * This function ensures that only \em unaligned loads and stores are used. * It allows to access memory at any location aligned to the entry type. * * \param i Selects the memory location of the i-th vector. Thus if \p V::Size == 4 and * \p i is set to 3 the base address for the load/store will be the 12th entry * (same as \p &mem[12]). * \param shift Shifts the base address determined by parameter \p i by \p shift many * entries. Thus \p vector(3, 1) for \p V::Size == 4 will load/store the * 13th - 16th entries (same as \p &mem[13]). * * \note Any shift value is allowed as long as you make sure it stays within bounds of the * allocated memory. Shift values that are a multiple of \p V::Size will \em not result in * aligned loads. You have to use the above vector(size_t) function for aligned loads * instead. 
* * \note Thus a simple way to access vectors randomly is to set \p i to 0 and use \p shift as the * parameter to select the memory address: * \code * // don't use: * mem.vector(i / V::Size, i % V::Size) += 1; * // instead use: * mem.vector(0, i) += 1; * \endcode */ template Vc_ALWAYS_INLINE Vc_PURE typename std::enable_if< std::is_convertible::value, MemoryVector() | Unaligned)>>::type & vector(size_t i, ShiftT shift, Flags = Flags()) { return *new (&entries()[i * V::Size + shift]) MemoryVector() | Unaligned)>; } /// Const overload of the above function. template Vc_ALWAYS_INLINE Vc_PURE typename std::enable_if< std::is_convertible::value, MemoryVector() | Unaligned)>>::type & vector(size_t i, ShiftT shift, Flags = Flags()) const { return *new (const_cast(&entries()[i * V::Size + shift])) MemoryVector() | Unaligned)>; } /** * \return the first vector in the allocated memory. * * This function is simply a shorthand for vector(0). */ template Vc_ALWAYS_INLINE Vc_PURE MemoryVector &firstVector(Flags = Flags()) { return *new(entries()) MemoryVector; } /// Const overload of the above function. template Vc_ALWAYS_INLINE Vc_PURE MemoryVector &firstVector(Flags = Flags()) const { return *new(const_cast(entries())) MemoryVector; } /** * \return the last vector in the allocated memory. * * This function is simply a shorthand for vector(vectorsCount() - 1). */ template Vc_ALWAYS_INLINE Vc_PURE MemoryVector &lastVector(Flags = Flags()) { return *new(&entries()[vectorsCount() * V::Size - V::Size]) MemoryVector; } /// Const overload of the above function. template Vc_ALWAYS_INLINE Vc_PURE MemoryVector &lastVector(Flags = Flags()) const { return *new(const_cast(&entries()[vectorsCount() * V::Size - V::Size])) MemoryVector; } Vc_ALWAYS_INLINE Vc_PURE V gather(const unsigned char *indexes) const { return V(entries(), indexes); } Vc_ALWAYS_INLINE Vc_PURE V gather(const unsigned short *indexes) const { return V(entries(), indexes); } Vc_ALWAYS_INLINE Vc_PURE V gather(const unsigned int *indexes) const { return V(entries(), indexes); } Vc_ALWAYS_INLINE Vc_PURE V gather(const unsigned long *indexes) const { return V(entries(), indexes); } /** * Zero the whole memory area. */ Vc_ALWAYS_INLINE void setZero() { V zero(Vc::Zero); for (size_t i = 0; i < vectorsCount(); ++i) { vector(i) = zero; } } /** * Assign a value to all vectors in the array. */ template Vc_ALWAYS_INLINE Parent &operator=(U &&x) { for (size_t i = 0; i < vectorsCount(); ++i) { vector(i) = std::forward(x); } } /** * (Inefficient) shorthand to add up two arrays. */ template inline Parent &operator+=(const MemoryBase &rhs) { assert(vectorsCount() == rhs.vectorsCount()); for (size_t i = 0; i < vectorsCount(); ++i) { vector(i) += rhs.vector(i); } return static_cast(*this); } /** * (Inefficient) shorthand to subtract two arrays. */ template inline Parent &operator-=(const MemoryBase &rhs) { assert(vectorsCount() == rhs.vectorsCount()); for (size_t i = 0; i < vectorsCount(); ++i) { vector(i) -= rhs.vector(i); } return static_cast(*this); } /** * (Inefficient) shorthand to multiply two arrays. */ template inline Parent &operator*=(const MemoryBase &rhs) { assert(vectorsCount() == rhs.vectorsCount()); for (size_t i = 0; i < vectorsCount(); ++i) { vector(i) *= rhs.vector(i); } return static_cast(*this); } /** * (Inefficient) shorthand to divide two arrays. 
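 *
 * A sketch of how these compound-assignment shorthands are meant to be used
 * (\c a and \c b are assumed to be Memory objects of the same vector type and
 * equal vectorsCount()):
 * \code
 * a += b; // adds b to a, one vector register at a time
 * a /= b; // element-wise division over the whole array
 * \endcode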
*/ template inline Parent &operator/=(const MemoryBase &rhs) { assert(vectorsCount() == rhs.vectorsCount()); for (size_t i = 0; i < vectorsCount(); ++i) { vector(i) /= rhs.vector(i); } return static_cast(*this); } /** * (Inefficient) shorthand to add a value to an array. */ inline Parent &operator+=(EntryType rhs) { V v(rhs); for (size_t i = 0; i < vectorsCount(); ++i) { vector(i) += v; } return static_cast(*this); } /** * (Inefficient) shorthand to subtract a value from an array. */ inline Parent &operator-=(EntryType rhs) { V v(rhs); for (size_t i = 0; i < vectorsCount(); ++i) { vector(i) -= v; } return static_cast(*this); } /** * (Inefficient) shorthand to multiply a value to an array. */ inline Parent &operator*=(EntryType rhs) { V v(rhs); for (size_t i = 0; i < vectorsCount(); ++i) { vector(i) *= v; } return static_cast(*this); } /** * (Inefficient) shorthand to divide an array with a value. */ inline Parent &operator/=(EntryType rhs) { V v(rhs); for (size_t i = 0; i < vectorsCount(); ++i) { vector(i) /= v; } return static_cast(*this); } /** * (Inefficient) shorthand compare equality of two arrays. */ template inline bool operator==(const MemoryBase &rhs) const { assert(vectorsCount() == rhs.vectorsCount()); for (size_t i = 0; i < vectorsCount(); ++i) { if (!(V(vector(i)) == V(rhs.vector(i))).isFull()) { return false; } } return true; } /** * (Inefficient) shorthand compare two arrays. */ template inline bool operator!=(const MemoryBase &rhs) const { assert(vectorsCount() == rhs.vectorsCount()); for (size_t i = 0; i < vectorsCount(); ++i) { if (!(V(vector(i)) == V(rhs.vector(i))).isEmpty()) { return false; } } return true; } /** * (Inefficient) shorthand compare two arrays. */ template inline bool operator<(const MemoryBase &rhs) const { assert(vectorsCount() == rhs.vectorsCount()); for (size_t i = 0; i < vectorsCount(); ++i) { if (!(V(vector(i)) < V(rhs.vector(i))).isFull()) { return false; } } return true; } /** * (Inefficient) shorthand compare two arrays. */ template inline bool operator<=(const MemoryBase &rhs) const { assert(vectorsCount() == rhs.vectorsCount()); for (size_t i = 0; i < vectorsCount(); ++i) { if (!(V(vector(i)) <= V(rhs.vector(i))).isFull()) { return false; } } return true; } /** * (Inefficient) shorthand compare two arrays. */ template inline bool operator>(const MemoryBase &rhs) const { assert(vectorsCount() == rhs.vectorsCount()); for (size_t i = 0; i < vectorsCount(); ++i) { if (!(V(vector(i)) > V(rhs.vector(i))).isFull()) { return false; } } return true; } /** * (Inefficient) shorthand compare two arrays. 
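 *
 * \note Like the other comparison shorthands above, this returns \c true only
 * if the relation holds for \em every entry of the two arrays, e.g. (sketch):
 * \code
 * if (a >= b) { ... } // taken only when a[i] >= b[i] for all i
 * \endcode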
*/ template inline bool operator>=(const MemoryBase &rhs) const { assert(vectorsCount() == rhs.vectorsCount()); for (size_t i = 0; i < vectorsCount(); ++i) { if (!(V(vector(i)) >= V(rhs.vector(i))).isFull()) { return false; } } return true; } }; namespace Detail { template inline void copyVectors(MemoryBase &dst, const MemoryBase &src) { const size_t vectorsCount = dst.vectorsCount(); size_t i = 3; for (; i < vectorsCount; i += 4) { const V tmp3 = src.vector(i - 3); const V tmp2 = src.vector(i - 2); const V tmp1 = src.vector(i - 1); const V tmp0 = src.vector(i - 0); dst.vector(i - 3) = tmp3; dst.vector(i - 2) = tmp2; dst.vector(i - 1) = tmp1; dst.vector(i - 0) = tmp0; } for (i -= 3; i < vectorsCount; ++i) { dst.vector(i) = src.vector(i); } } } // namespace Detail } // namespace Common } // namespace Vc #endif // VC_COMMON_MEMORYBASE_H_ // vim: foldmethod=marker Vc-1.3.3/common/memoryfwd.h000066400000000000000000000037051320703111200155330ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2011-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ #ifndef VC_COMMON_MEMORYFWD_H_ #define VC_COMMON_MEMORYFWD_H_ namespace Vc_VERSIONED_NAMESPACE { namespace Common { template class Memory; template class MemoryBase; } // namespace Common using Common::Memory; } // namespace Vc #endif // VC_COMMON_MEMORYFWD_H_ Vc-1.3.3/common/operators.h000066400000000000000000000366151320703111200155460ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2012-2016 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
* Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ #ifndef COMMON_OPERATORS_H_ #define COMMON_OPERATORS_H_ #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { namespace Detail { template enable_if::value, U> is_convertible_to_any_vector(Vector); template T is_convertible_to_any_vector(Vector); template void is_convertible_to_any_vector(...); template ::value, bool = std::is_integral::value> struct FundamentalReturnType; template struct FundamentalReturnType { using type = typename std::conditional< std::is_arithmetic::value, typename std::conditional<(sizeof(T) < sizeof(U)), U, T>::type, // U is not arithmetic, e.g. an enum or a type with e.g. operator int() T>::type; }; template struct FundamentalReturnType { using type = typename std::conditional< std::is_arithmetic::value, U, // U is not arithmetic, e.g. an enum or a type with e.g. operator int() T>::type; }; template struct FundamentalReturnType { using type = T; }; template struct my_make_signed : public std::make_signed { }; template <> struct my_make_signed { using type = bool; }; template struct higher_conversion_rank { template using fix_sign = typename std::conditional<(std::is_unsigned::value || std::is_unsigned::value), typename std::make_unsigned::type, A>::type; using T = typename my_make_signed::type; using U = typename my_make_signed::type; template using c = typename std::conditional::value || std::is_same::value, Test, Otherwise>::type; using type = fix_sign>>>>>; }; template struct FundamentalReturnType { template using c = typename std::conditional::type; using type = c<(sizeof(T) > sizeof(U)), T, c<(sizeof(T) < sizeof(U)), U, typename higher_conversion_rank::type>>; }; static_assert(std::is_same::type>::value, ""); template struct ReturnTypeImpl { // no type => SFINAE }; template struct ReturnTypeImpl, Vector, false, Deduced, false> { using type = Vc::Vector::type, Abi>; }; template struct ReturnTypeImpl, int, true, T, true> { using type = Vc::Vector; }; template struct ReturnTypeImpl, unsigned int, true, T, true> { using type = Vc::Vector::type, Abi>; }; template struct ReturnTypeImpl, U, true, T, Integral> { using type = Vc::Vector::type, Abi>; }; template struct ReturnTypeImpl, U, false, void, Integral> { // no type => SFINAE }; template struct ReturnTypeImpl, U, false, V, Integral> { using type = Vc::Vector::type, Abi>; }; template using ReturnType = ReturnTypeImpl< V, T, std::is_arithmetic::value || std::is_convertible::value, decltype(is_convertible_to_any_vector( std::declval())), std::is_integral::value>; template struct is_a_type : public std::true_type { }; #ifdef Vc_ENABLE_FLOAT_BIT_OPERATORS #define 
Vc_TEST_FOR_BUILTIN_OPERATOR(op_) true #else #define Vc_TEST_FOR_BUILTIN_OPERATOR(op_) \ Detail::is_a_type, U>::type::EntryType>() \ op_ std::declval, \ U>::type::EntryType>())>::value #endif } // namespace Detail #define Vc_GENERIC_OPERATOR(op_) \ template \ Vc_ALWAYS_INLINE enable_if< \ Vc_TEST_FOR_BUILTIN_OPERATOR(op_) && \ std::is_convertible, typename Detail::ReturnType< \ Vector, U>::type>::value && \ std::is_convertible< \ U, typename Detail::ReturnType, U>::type>::value, \ typename Detail::ReturnType, U>::type> \ operator op_(Vector x, const U &y) \ { \ using V = typename Detail::ReturnType, U>::type; \ return Detail::operator op_(V(x), V(y)); \ } \ template \ Vc_ALWAYS_INLINE enable_if< \ Vc_TEST_FOR_BUILTIN_OPERATOR(op_) && \ !Traits::is_simd_vector_internal::value && \ std::is_convertible, typename Detail::ReturnType< \ Vector, U>::type>::value && \ std::is_convertible< \ U, typename Detail::ReturnType, U>::type>::value, \ typename Detail::ReturnType, U>::type> \ operator op_(const U &x, Vector y) \ { \ using V = typename Detail::ReturnType, U>::type; \ return Detail::operator op_(V(x), V(y)); \ } \ template \ Vc_ALWAYS_INLINE enable_if< \ Vc_TEST_FOR_BUILTIN_OPERATOR(op_) && \ std::is_convertible, typename Detail::ReturnType< \ Vector, U>::type>::value && \ std::is_convertible< \ U, typename Detail::ReturnType, U>::type>::value, \ Vector &> \ operator op_##=(Vector &x, const U &y) \ { \ using V = typename Detail::ReturnType, U>::type; \ x = Detail::operator op_(V(x), V(y)); \ return x; \ } #define Vc_LOGICAL_OPERATOR(op_) \ template \ Vc_ALWAYS_INLINE typename Vector::Mask operator op_(Vector x, \ Vector y) \ { \ return !!x op_ !!y; \ } \ template \ Vc_ALWAYS_INLINE enable_if< \ std::is_convertible, Vector>::value && \ std::is_convertible, Vector>::value, \ typename Detail::ReturnType, Vector>::type::Mask> \ operator op_(Vector x, Vector y) \ { \ return !!x op_ !!y; \ } \ template \ Vc_ALWAYS_INLINE \ enable_if())>::value, \ typename Vector::Mask> \ operator op_(Vector x, const U &y) \ { \ using M = typename Vector::Mask; \ return !!x op_ M(!!y); \ } \ template \ Vc_ALWAYS_INLINE \ enable_if())>::value, \ typename Vector::Mask> \ operator op_(const U &x, Vector y) \ { \ using M = typename Vector::Mask; \ return M(!!x) op_ !!y; \ } #define Vc_COMPARE_OPERATOR(op_) \ template \ Vc_ALWAYS_INLINE enable_if< \ std::is_convertible, typename Detail::ReturnType< \ Vector, U>::type>::value && \ std::is_convertible< \ U, typename Detail::ReturnType, U>::type>::value, \ typename Detail::ReturnType, U>::type::Mask> \ operator op_(Vector x, const U &y) \ { \ using V = typename Detail::ReturnType, U>::type; \ return Detail::operator op_(V(x), V(y)); \ } \ template \ Vc_ALWAYS_INLINE enable_if< \ !Traits::is_simd_vector_internal::value && \ std::is_convertible, typename Detail::ReturnType< \ Vector, U>::type>::value && \ std::is_convertible< \ U, typename Detail::ReturnType, U>::type>::value, \ typename Detail::ReturnType, U>::type::Mask> \ operator op_(const U &x, Vector y) \ { \ using V = typename Detail::ReturnType, U>::type; \ return Detail::operator op_(V(x), V(y)); \ } Vc_ALL_LOGICAL (Vc_LOGICAL_OPERATOR); Vc_ALL_BINARY (Vc_GENERIC_OPERATOR); Vc_ALL_ARITHMETICS(Vc_GENERIC_OPERATOR); Vc_ALL_COMPARES (Vc_COMPARE_OPERATOR); #undef Vc_LOGICAL_OPERATOR #undef Vc_GENERIC_OPERATOR #undef Vc_COMPARE_OPERATOR #undef Vc_INVALID_OPERATOR } // namespace Vc #endif // COMMON_OPERATORS_H_ Vc-1.3.3/common/permutation.h000066400000000000000000000035001320703111200160620ustar00rootroot00000000000000/* 
This file is part of the Vc library. {{{ Copyright © 2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ #ifndef VC_COMMON_PERMUTATION_H_ #define VC_COMMON_PERMUTATION_H_ #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { namespace Permutation { struct ReversedTag {}; constexpr ReversedTag Reversed{}; } // namespace Permutation } #endif // VC_COMMON_PERMUTATION_H_ // vim: foldmethod=marker Vc-1.3.3/common/scatterimplementation.h000066400000000000000000000213301320703111200201270ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2014-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
}}}*/ #ifndef VC_COMMON_SCATTERIMPLEMENTATION_H_ #define VC_COMMON_SCATTERIMPLEMENTATION_H_ #include "gatherimplementation.h" #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { namespace Common { template Vc_ALWAYS_INLINE void executeScatter(SetIndexZeroT, V &v, MT *mem, IT indexes, typename V::MaskArgument mask) { indexes.setZeroInverted(static_cast(mask)); // Huh? const V tmp(mem, indexes); where(mask) | v = tmp; } template Vc_ALWAYS_INLINE void executeScatter(SimpleLoopT, V &v, MT *mem, const IT &indexes, typename V::MaskArgument mask) { if (Vc_IS_UNLIKELY(mask.isEmpty())) { return; } Common::unrolled_loop([&](std::size_t i) { if (mask[i]) mem[indexes[i]] = v[i]; }); } template Vc_ALWAYS_INLINE void executeScatter(BitScanLoopT, V &v, MT *mem, const IT &indexes, typename V::MaskArgument mask) { size_t bits = mask.toInt(); while (Vc_IS_LIKELY(bits > 0)) { size_t i, j; asm("bsf %[bits],%[i]\n\t" "bsr %[bits],%[j]\n\t" "btr %[i],%[bits]\n\t" "btr %[j],%[bits]\n\t" : [i] "=r"(i), [j] "=r"(j), [bits] "+r"(bits)); mem[indexes[i]] = v[i]; mem[indexes[j]] = v[j]; } /* Alternative from Vc::SSE (0.7) int bits = mask.toInt(); while (bits) { const int i = _bit_scan_forward(bits); bits ^= (1 << i); // btr? mem[indexes[i]] = v[i]; } */ } template Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT, V &v, MT *mem, const IT &indexes, typename V::MaskArgument mask, enable_if = nullarg) { unsigned int bits = mask.toInt(); unsigned int low, high = 0; switch (Vc::Detail::popcnt16(bits)) { case 16: v.scatter(mem, indexes); break; case 15: low = _bit_scan_forward(bits); bits ^= 1 << low; mem[indexes[low]] = v[low]; case 14: high = _bit_scan_reverse(bits); mem[indexes[high]] = v[high]; high = (1 << high); case 13: low = _bit_scan_forward(bits); bits ^= high | (1 << low); mem[indexes[low]] = v[low]; case 12: high = _bit_scan_reverse(bits); mem[indexes[high]] = v[high]; high = (1 << high); case 11: low = _bit_scan_forward(bits); bits ^= high | (1 << low); mem[indexes[low]] = v[low]; case 10: high = _bit_scan_reverse(bits); mem[indexes[high]] = v[high]; high = (1 << high); case 9: low = _bit_scan_forward(bits); bits ^= high | (1 << low); mem[indexes[low]] = v[low]; case 8: high = _bit_scan_reverse(bits); mem[indexes[high]] = v[high]; high = (1 << high); case 7: low = _bit_scan_forward(bits); bits ^= high | (1 << low); mem[indexes[low]] = v[low]; case 6: high = _bit_scan_reverse(bits); mem[indexes[high]] = v[high]; high = (1 << high); case 5: low = _bit_scan_forward(bits); bits ^= high | (1 << low); mem[indexes[low]] = v[low]; case 4: high = _bit_scan_reverse(bits); mem[indexes[high]] = v[high]; high = (1 << high); case 3: low = _bit_scan_forward(bits); bits ^= high | (1 << low); mem[indexes[low]] = v[low]; case 2: high = _bit_scan_reverse(bits); mem[indexes[high]] = v[high]; case 1: low = _bit_scan_forward(bits); mem[indexes[low]] = v[low]; case 0: break; } } template Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT, V &v, MT *mem, const IT &indexes, typename V::MaskArgument mask, enable_if = nullarg) { unsigned int bits = mask.toInt(); unsigned int low, high = 0; switch (Vc::Detail::popcnt8(bits)) { case 8: v.scatter(mem, indexes); break; case 7: low = _bit_scan_forward(bits); bits ^= 1 << low; mem[indexes[low]] = v[low]; case 6: high = _bit_scan_reverse(bits); mem[indexes[high]] = v[high]; high = (1 << high); case 5: low = _bit_scan_forward(bits); bits ^= high | (1 << low); mem[indexes[low]] = v[low]; case 4: high = _bit_scan_reverse(bits); mem[indexes[high]] = v[high]; high = (1 << high); case 3: low = 
_bit_scan_forward(bits); bits ^= high | (1 << low); mem[indexes[low]] = v[low]; case 2: high = _bit_scan_reverse(bits); mem[indexes[high]] = v[high]; case 1: low = _bit_scan_forward(bits); mem[indexes[low]] = v[low]; case 0: break; } } template Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT, V &v, MT *mem, const IT &indexes, typename V::MaskArgument mask, enable_if = nullarg) { unsigned int bits = mask.toInt(); unsigned int low, high = 0; switch (Vc::Detail::popcnt4(bits)) { case 4: v.scatter(mem, indexes); break; case 3: low = _bit_scan_forward(bits); bits ^= 1 << low; mem[indexes[low]] = v[low]; case 2: high = _bit_scan_reverse(bits); mem[indexes[high]] = v[high]; case 1: low = _bit_scan_forward(bits); mem[indexes[low]] = v[low]; case 0: break; } } template Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT, V &v, MT *mem, const IT &indexes, typename V::MaskArgument mask, enable_if = nullarg) { unsigned int bits = mask.toInt(); unsigned int low; switch (Vc::Detail::popcnt4(bits)) { case 2: v.scatter(mem, indexes); break; case 1: low = _bit_scan_forward(bits); mem[indexes[low]] = v[low]; case 0: break; } } } // namespace Common } // namespace Vc #endif // VC_COMMON_SCATTERIMPLEMENTATION_H_ Vc-1.3.3/common/scatterinterface.h000066400000000000000000000346311320703111200170520ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2014-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ /////////////////////////////////////////////////////////////////////////////////////////// // scatters // A scatter takes the following arguments: // 1. A pointer to memory of any type that EntryType can convert to. // 2. An indexes “vector”. The requirement is that the type implements the subscript operator, // stores «Size» valid index values, and each offset to the pointer above yields a valid // memory location for reading. // 3. Optionally the third argument may be a mask. The mask disables several memory stores and // thus removes the requirements in (2.) for the disabled entries. 
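//
// A minimal usage sketch (hedged: the names v, mem and idx below are
// illustrative assumptions, not part of this interface):
//
//   float_v v = float_v::Random();
//   float mem[100] = {};
//   float_v::IndexType idx = float_v::IndexType::IndexesFromZero() * 3;
//   v.scatter(mem, idx);          // mem[idx[i]] = v[i] for every lane i
//   v.scatter(mem, idx, v > 0.f); // masked: only lanes where v > 0 are stored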
private: /**\internal * This function implements a scatter given a pointer to memory \p mem and some * container object storing the scatter \p indexes. * * \param mem This pointer must be aligned correctly for the type \p MT. This is the * natural behavior of C++, so this is typically the case. * \param indexes This object contains at least \VSize{T} indexes that denote the * offset in \p mem where the components for the current vector should be copied to. * The offset is not in Bytes, but in multiples of `sizeof(MT)`. */ // enable_if::value && has_subscript_operator::value> template inline void scatterImplementation(MT *mem, IT &&indexes) const; /**\internal * This overload of the above function adds a \p mask argument to disable memory * accesses at the \p indexes offsets where \p mask is \c false. */ template inline void scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const; public: #define Vc_ASSERT_SCATTER_PARAMETER_TYPES_ \ static_assert( \ std::is_convertible::value, \ "The memory pointer needs to point to a type that the EntryType of this " \ "SIMD vector type can be converted to."); \ static_assert( \ Vc::Traits::has_subscript_operator::value, \ "The indexes argument must be a type that implements the subscript operator."); \ static_assert( \ !Traits::is_simd_vector::value || \ Traits::simd_vector_size::value >= Size, \ "If you use a SIMD vector for the indexes parameter, the index vector must " \ "have at least as many entries as this SIMD vector."); \ static_assert( \ !std::is_array::value || \ (std::rank::value == 1 && \ (std::extent::value == 0 || std::extent::value >= Size)), \ "If you use a simple array for the indexes parameter, the array must have " \ "at least as many entries as this SIMD vector.") /** * \name Scatter functions * * Stores a vector to the objects at `mem[indexes[0]]`, `mem[indexes[1]]`, * `mem[indexes[2]]`, ... * * \param mem A pointer to memory which contains objects of type \p MT at the offsets * given by \p indexes. * \param indexes * \param mask */ ///@{ /// Scatter function template ::value>> Vc_INTRINSIC void scatter(MT *mem, IT &&indexes) const { Vc_ASSERT_SCATTER_PARAMETER_TYPES_; scatterImplementation(mem, std::forward(indexes)); } /// Masked scatter function template ::value>> Vc_INTRINSIC void scatter(MT *mem, IT &&indexes, MaskArgument mask) const { Vc_ASSERT_SCATTER_PARAMETER_TYPES_; scatterImplementation(mem, std::forward(indexes), mask); } ///@} /// \name Deprecated Members ///@{ /** * \deprecated Use Vc::array or Vc::vector subscripting instead. * * \param array A pointer into memory (without alignment restrictions). * \param member1 If \p array points to a struct, \p member1 determines the member in the struct to * be read. Thus the offsets in \p indexes are relative to the \p array and not to * the size of the gathered type (i.e. array[i].*member1 is accessed instead of * (&(array->*member1))[i]) * \param indexes Determines the offsets into \p array where the values are gathered from/scattered * to. The type of indexes can either be an integer vector or a type that supports * operator[] access. */ template Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector " "instead.") inline void scatter(S1 *array, EntryType S1::*member1, IT indexes) const { scatter(Common::SubscriptOperation, true>( array, indexes)[member1] .scatterArguments()); } /** * \deprecated Use Vc::array or Vc::vector subscripting instead. * * \param array A pointer into memory (without alignment restrictions). 
* \param member1 If \p array points to a struct, \p member1 determines the member in the struct to * be read. Thus the offsets in \p indexes are relative to the \p array and not to * the size of the gathered type (i.e. array[i].*member1 is accessed instead of * (&(array->*member1))[i]) * \param indexes Determines the offsets into \p array where the values are gathered from/scattered * to. The type of indexes can either be an integer vector or a type that supports * operator[] access. * \param mask If a mask is given only the active entries will be gathered/scattered. */ template Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector " "instead.") inline void scatter(S1 *array, EntryType S1::*member1, IT indexes, MaskArgument mask) const { scatter(Common::SubscriptOperation, true>( array, indexes)[member1] .scatterArguments(), mask); } /** * \deprecated Use Vc::array or Vc::vector subscripting instead. * * \param array A pointer into memory (without alignment restrictions). * \param member1 If \p array points to a struct, \p member1 determines the member in the struct to * be read. Thus the offsets in \p indexes are relative to the \p array and not to * the size of the gathered type (i.e. array[i].*member1 is accessed instead of * (&(array->*member1))[i]) * \param member2 If \p member1 is a struct then \p member2 selects the member to be read from that * struct (i.e. array[i].*member1.*member2 is read). * \param indexes Determines the offsets into \p array where the values are gathered from/scattered * to. The type of indexes can either be an integer vector or a type that supports * operator[] access. */ template Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector " "instead.") inline void scatter(S1 *array, S2 S1::*member1, EntryType S2::*member2, IT indexes) const { scatter(Common::SubscriptOperation, true>( array, indexes)[member1][member2] .scatterArguments()); } /** * \deprecated Use Vc::array or Vc::vector subscripting instead. * * \param array A pointer into memory (without alignment restrictions). * \param member1 If \p array points to a struct, \p member1 determines the member in the struct to * be read. Thus the offsets in \p indexes are relative to the \p array and not to * the size of the gathered type (i.e. array[i].*member1 is accessed instead of * (&(array->*member1))[i]) * \param member2 If \p member1 is a struct then \p member2 selects the member to be read from that * struct (i.e. array[i].*member1.*member2 is read). * \param indexes Determines the offsets into \p array where the values are gathered from/scattered * to. The type of indexes can either be an integer vector or a type that supports * operator[] access. * \param mask If a mask is given only the active entries will be gathered/scattered. */ template Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector " "instead.") inline void scatter(S1 *array, S2 S1::*member1, EntryType S2::*member2, IT indexes, MaskArgument mask) const { scatter(Common::SubscriptOperation, true>( array, indexes)[member1][member2] .scatterArguments(), mask); } /** * \deprecated Use Vc::array or Vc::vector subscripting instead. * * \param array A pointer into memory (without alignment restrictions). * \param ptrMember1 If \p array points to a struct, \p member1 determines the member in the struct to * be read. Thus the offsets in \p indexes are relative to the \p array and not to * the size of the gathered type (i.e. 
array[i].*member1 is accessed instead of * (&(array->*member1))[i]) * \param outerIndexes * \param innerIndexes */ template Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector " "instead.") inline void scatter(S1 *array, EntryType *S1::*ptrMember1, IT1 outerIndexes, IT2 innerIndexes) const { scatter(Common::SubscriptOperation, true>( array, outerIndexes)[ptrMember1][innerIndexes] .scatterArguments()); } /** * \deprecated Use Vc::array or Vc::vector subscripting instead. * * \param array A pointer into memory (without alignment restrictions). * \param ptrMember1 If \p array points to a struct, \p member1 determines the member in the struct to * be read. Thus the offsets in \p indexes are relative to the \p array and not to * the size of the gathered type (i.e. array[i].*member1 is accessed instead of * (&(array->*member1))[i]) * \param outerIndexes * \param innerIndexes * \param mask If a mask is given only the active entries will be gathered/scattered. */ template Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector " "instead.") inline void scatter(S1 *array, EntryType *S1::*ptrMember1, IT1 outerIndexes, IT2 innerIndexes, MaskArgument mask) const { scatter(Common::SubscriptOperation, true>( array, outerIndexes)[ptrMember1][innerIndexes] .scatterArguments(), mask); } ///@} /**\internal * \name Scatter function to use from Vc::Common::subscript_operator * * \param args * \param mask */ ///@{ template Vc_INTRINSIC void scatter(const Common::ScatterArguments &args) const { scatter(args.address, args.indexes); } template Vc_INTRINSIC void scatter(const Common::ScatterArguments &args, MaskArgument mask) const { scatter(args.address, args.indexes, mask); } ///@} #undef Vc_ASSERT_SCATTER_PARAMETER_TYPES_ Vc-1.3.3/common/set.h000066400000000000000000000104741320703111200143160ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2013-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
}}}*/ #ifndef VC_COMMON_SET_H_ #define VC_COMMON_SET_H_ #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { namespace { static Vc_INTRINSIC Vc_CONST __m128i set(unsigned short x0, unsigned short x1, unsigned short x2, unsigned short x3, unsigned short x4, unsigned short x5, unsigned short x6, unsigned short x7) { #if defined(Vc_GNU_ASM) #if 0 // defined(__x86_64__) // it appears that the 32bit variant is always faster __m128i r; unsigned long long tmp0 = x3; tmp0 = (tmp0 << 16) | x2; unsigned long long tmp1 = x1; tmp1 = (tmp1 << 16) | x0; asm("vmovq %1,%0" : "=x"(r) : "r"((tmp0 << 32) | tmp1)); unsigned long long tmp2 = x7; tmp2 = (tmp2 << 16) | x6; unsigned long long tmp3 = x5; tmp3 = (tmp3 << 16) | x4; asm("vpinsrq $1,%1,%0,%0" : "+x"(r) : "r"((tmp2 << 32) | tmp3)); return r; #elif defined(Vc_USE_VEX_CODING) __m128i r0, r1; unsigned int tmp0 = x1; tmp0 = (tmp0 << 16) | x0; unsigned int tmp1 = x3; tmp1 = (tmp1 << 16) | x2; unsigned int tmp2 = x5; tmp2 = (tmp2 << 16) | x4; unsigned int tmp3 = x7; tmp3 = (tmp3 << 16) | x6; asm("vmovd %1,%0" : "=x"(r0) : "r"(tmp0)); asm("vpinsrd $1,%1,%0,%0" : "+x"(r0) : "r"(tmp1)); asm("vmovd %1,%0" : "=x"(r1) : "r"(tmp2)); asm("vpinsrd $1,%1,%0,%0" : "+x"(r1) : "r"(tmp3)); asm("vpunpcklqdq %1,%0,%0" : "+x"(r0) : "x"(r1)); return r0; #else __m128i r0, r1; unsigned int tmp0 = x1; tmp0 = (tmp0 << 16) | x0; unsigned int tmp1 = x3; tmp1 = (tmp1 << 16) | x2; unsigned int tmp2 = x5; tmp2 = (tmp2 << 16) | x4; unsigned int tmp3 = x7; tmp3 = (tmp3 << 16) | x6; asm("movd %1,%0" : "=x"(r0) : "r"(tmp0)); asm("pinsrd $1,%1,%0" : "+x"(r0) : "r"(tmp1)); asm("movd %1,%0" : "=x"(r1) : "r"(tmp2)); asm("pinsrd $1,%1,%0" : "+x"(r1) : "r"(tmp3)); asm("punpcklqdq %1,%0" : "+x"(r0) : "x"(r1)); return r0; #endif #else unsigned int tmp0 = x1; tmp0 = (tmp0 << 16) | x0; unsigned int tmp1 = x3; tmp1 = (tmp1 << 16) | x2; unsigned int tmp2 = x5; tmp2 = (tmp2 << 16) | x4; unsigned int tmp3 = x7; tmp3 = (tmp3 << 16) | x6; return _mm_setr_epi32(tmp0, tmp1, tmp2, tmp3); #endif } static Vc_INTRINSIC Vc_CONST __m128i set(short x0, short x1, short x2, short x3, short x4, short x5, short x6, short x7) { return set(static_cast(x0), static_cast(x1), static_cast(x2), static_cast(x3), static_cast(x4), static_cast(x5), static_cast(x6), static_cast(x7)); } } // anonymous namespace } // namespace Vc #endif // VC_COMMON_SET_H_ Vc-1.3.3/common/simd_cast.h000066400000000000000000000054161320703111200154710ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2014-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ #ifndef VC_COMMON_SIMD_CAST_H_ #define VC_COMMON_SIMD_CAST_H_ #include #include "macros.h" // declare a bogus simd_cast function template in the global namespace to enable ADL for // simd_cast template void simd_cast(); namespace Vc_VERSIONED_NAMESPACE { /** * Casts the argument \p x from type \p From to type \p To. * * This function implements the trivial case where \p To and \p From are the same type. * * \param x The object of type \p From to be converted to type \p To. * \returns An object of type \p To with all vector components converted according to * standard conversion behavior as mandated by the C++ standard for the * underlying arithmetic types. */ template Vc_INTRINSIC Vc_CONST To simd_cast(From &&x, enable_if>::value> = nullarg) { return std::forward(x); } /** * A cast from nothing results in default-initialization of \p To. * * This function can be useful in generic code where a parameter pack expands to nothing. * * \returns A zero-initialized object of type \p To. */ template Vc_INTRINSIC Vc_CONST To simd_cast() { return To(); } } // namespace Vc #endif // VC_COMMON_SIMD_CAST_H_ Vc-1.3.3/common/simd_cast_caller.tcc000066400000000000000000000072671320703111200173430ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2014-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
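// Illustrative usage sketch, not from the Vc sources: the simd_cast overloads
// above cover the trivial To == From case and the nullary "cast from nothing";
// the width-changing overloads follow in simdarray.h.  Names are example
// choices only.
#include <Vc/Vc>

inline void simd_cast_basic_sketch()
{
    Vc::float_v f = Vc::float_v::IndexesFromZero();   // 0, 1, 2, ...

    // Same-type cast: resolves to the trivial overload above.
    Vc::float_v same = Vc::simd_cast<Vc::float_v>(f);

    // Component-wise conversion into a SimdArray with one entry per lane.
    using IntArray = Vc::SimdArray<int, Vc::float_v::size()>;
    IntArray i = Vc::simd_cast<IntArray>(f);

    (void)same;
    (void)i;
}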
}}}*/ #ifndef VC_COMMON_SIMD_CAST_CALLER_TCC_ #define VC_COMMON_SIMD_CAST_CALLER_TCC_ #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { template template Vc_INTRINSIC SimdMaskArray::SimdMaskArray( const SimdMaskArray &x, enable_if) : data(simd_cast(internal_data(x))) { } template template Vc_INTRINSIC SimdMaskArray::SimdMaskArray( const SimdMaskArray &x, enable_if<(N > V::Size && N <= 2 * V::Size)>) : data(simd_cast(internal_data(internal_data0(x)), internal_data(internal_data1(x)))) { } template template Vc_INTRINSIC SimdMaskArray::SimdMaskArray( const SimdMaskArray &x, enable_if<(N > 2 * V::Size && N <= 4 * V::Size)>) : data(simd_cast(internal_data(internal_data0(internal_data0(x))), internal_data(internal_data1(internal_data0(x))), internal_data(internal_data0(internal_data1(x))), internal_data(internal_data1(internal_data1(x))))) { } // conversion from any Segment object (could be SimdMaskArray or Mask) template template Vc_INTRINSIC SimdMaskArray::SimdMaskArray( Common::Segment &&x, enable_if::value == Size * Pieces>) : data(simd_cast(x.data)) { } // conversion from Mask template template Vc_INTRINSIC SimdMaskArray::SimdMaskArray( M k, enable_if<(Traits::is_simd_mask::value && !Traits::isSimdMaskArray::value && Traits::simd_vector_size::value == Size)>) : data(simd_cast(k)) { } } #endif // VC_COMMON_SIMD_CAST_CALLER_TCC_ // vim: foldmethod=marker Vc-1.3.3/common/simdarray.h000066400000000000000000003555271320703111200155310ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2013-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
}}}*/ #ifndef VC_COMMON_SIMDARRAY_H_ #define VC_COMMON_SIMDARRAY_H_ //#define Vc_DEBUG_SIMD_CAST 1 //#define Vc_DEBUG_SORTED 1 #if defined Vc_DEBUG_SIMD_CAST || defined Vc_DEBUG_SORTED #include #endif #include #include "writemaskedvector.h" #include "simdarrayhelper.h" #include "simdmaskarray.h" #include "utility.h" #include "interleave.h" #include "indexsequence.h" #include "transpose.h" #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { // internal namespace (product & sum helper) {{{1 namespace internal { template T Vc_INTRINSIC Vc_PURE product_helper_(const T &l, const T &r) { return l * r; } template T Vc_INTRINSIC Vc_PURE sum_helper_(const T &l, const T &r) { return l + r; } } // namespace internal // min & max declarations {{{1 template inline SimdArray min(const SimdArray &x, const SimdArray &y); template inline SimdArray max(const SimdArray &x, const SimdArray &y); // SimdArray class {{{1 /// \addtogroup SimdArray /// @{ // atomic SimdArray {{{1 #define Vc_CURRENT_CLASS_NAME SimdArray /**\internal * Specialization of `SimdArray` for the case where `N == * VectorSize`. * * This is specialized for implementation purposes: Since the general implementation uses * two SimdArray data members it recurses over different SimdArray instantiations. The * recursion is ended by this specialization, which has a single \p VectorType_ data * member to which all functions are forwarded more or less directly. */ template class SimdArray { static_assert(std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value, "SimdArray may only be used with T = { double, float, int32_t, uint32_t, " "int16_t, uint16_t }"); public: using VectorType = VectorType_; using vector_type = VectorType; using storage_type = vector_type; using vectorentry_type = typename vector_type::VectorEntryType; using value_type = T; using mask_type = SimdMaskArray; using index_type = SimdArray; static constexpr std::size_t size() { return N; } using Mask = mask_type; using MaskType = Mask; using MaskArgument = const MaskType &; using VectorEntryType = vectorentry_type; using EntryType = value_type; using IndexType = index_type; using AsArg = const SimdArray &; using reference = Detail::ElementReference; static constexpr std::size_t Size = size(); static constexpr std::size_t MemoryAlignment = storage_type::MemoryAlignment; // zero init #ifndef Vc_MSVC // bogus error C2580 Vc_INTRINSIC SimdArray() = default; #endif // default copy ctor/operator Vc_INTRINSIC SimdArray(const SimdArray &) = default; Vc_INTRINSIC SimdArray(SimdArray &&) = default; Vc_INTRINSIC SimdArray &operator=(const SimdArray &) = default; // broadcast Vc_INTRINSIC SimdArray(const value_type &a) : data(a) {} Vc_INTRINSIC SimdArray(value_type &a) : data(a) {} Vc_INTRINSIC SimdArray(value_type &&a) : data(a) {} template < typename U, typename = enable_if::value && !std::is_same::value>> Vc_INTRINSIC SimdArray(U a) : SimdArray(static_cast(a)) { } // implicit casts template Vc_INTRINSIC SimdArray(const SimdArray &x, enable_if = nullarg) : data(simd_cast(internal_data(x))) { } template Vc_INTRINSIC SimdArray(const SimdArray &x, enable_if<(N > V::Size && N <= 2 * V::Size)> = nullarg) : data(simd_cast(internal_data(internal_data0(x)), internal_data(internal_data1(x)))) { } template Vc_INTRINSIC SimdArray(const SimdArray &x, enable_if<(N > 2 * V::Size && N <= 4 * V::Size)> = nullarg) : data(simd_cast(internal_data(internal_data0(internal_data0(x))), internal_data(internal_data1(internal_data0(x))), 
internal_data(internal_data0(internal_data1(x))), internal_data(internal_data1(internal_data1(x))))) { } template Vc_INTRINSIC SimdArray(Common::Segment &&x) : data(simd_cast(x.data)) { } Vc_INTRINSIC SimdArray(const std::initializer_list &init) : data(init.begin(), Vc::Unaligned) { #if defined Vc_CXX14 && 0 // doesn't compile yet static_assert(init.size() == size(), "The initializer_list argument to " "SimdArray must contain exactly N " "values."); #else Vc_ASSERT(init.size() == size()); #endif } // implicit conversion from underlying vector_type template < typename V, typename = enable_if::value && !Traits::isSimdArray::value>> explicit Vc_INTRINSIC SimdArray(const V &x) : data(simd_cast(x)) { } // implicit conversion to Vector for if Vector::size() == N and // T implicitly convertible to U template < typename U, typename A, typename = enable_if::value && Vector::Size == N>> Vc_INTRINSIC operator Vector() const { return simd_cast>(data); } #include "gatherinterface.h" #include "scatterinterface.h" // forward all remaining ctors template ::value && !Traits::is_gather_signature::value && !Traits::is_initializer_list::value>> explicit Vc_INTRINSIC SimdArray(Args &&... args) : data(std::forward(args)...) { } template explicit Vc_INTRINSIC SimdArray( Common::AddOffset) : data(Vc::IndexesFromZero) { data += value_type(Offset); } Vc_INTRINSIC void setZero() { data.setZero(); } Vc_INTRINSIC void setZero(mask_type k) { data.setZero(internal_data(k)); } Vc_INTRINSIC void setZeroInverted() { data.setZeroInverted(); } Vc_INTRINSIC void setZeroInverted(mask_type k) { data.setZeroInverted(internal_data(k)); } Vc_INTRINSIC void setQnan() { data.setQnan(); } Vc_INTRINSIC void setQnan(mask_type m) { data.setQnan(internal_data(m)); } // internal: execute specified Operation template static Vc_INTRINSIC SimdArray fromOperation(Op op, Args &&... args) { SimdArray r; Common::unpackArgumentsAuto(op, r.data, std::forward(args)...); return r; } template static Vc_INTRINSIC void callOperation(Op op, Args &&... args) { Common::unpackArgumentsAuto(op, nullptr, std::forward(args)...); } static Vc_INTRINSIC SimdArray Zero() { return SimdArray(Vc::Zero); } static Vc_INTRINSIC SimdArray One() { return SimdArray(Vc::One); } static Vc_INTRINSIC SimdArray IndexesFromZero() { return SimdArray(Vc::IndexesFromZero); } static Vc_INTRINSIC SimdArray Random() { return fromOperation(Common::Operations::random()); } template Vc_INTRINSIC void load(Args &&... args) { data.load(std::forward(args)...); } template Vc_INTRINSIC void store(Args &&... 
args) const { data.store(std::forward(args)...); } Vc_INTRINSIC mask_type operator!() const { return {!data}; } Vc_INTRINSIC SimdArray operator-() const { return {-data}; } /// Returns a copy of itself Vc_INTRINSIC SimdArray operator+() const { return *this; } Vc_INTRINSIC SimdArray operator~() const { return {~data}; } template ::value && std::is_integral::value>> Vc_INTRINSIC Vc_CONST SimdArray operator<<(U x) const { return {data << x}; } template ::value && std::is_integral::value>> Vc_INTRINSIC SimdArray &operator<<=(U x) { data <<= x; return *this; } template ::value && std::is_integral::value>> Vc_INTRINSIC Vc_CONST SimdArray operator>>(U x) const { return {data >> x}; } template ::value && std::is_integral::value>> Vc_INTRINSIC SimdArray &operator>>=(U x) { data >>= x; return *this; } #define Vc_BINARY_OPERATOR_(op) \ Vc_INTRINSIC Vc_CONST SimdArray operator op(const SimdArray &rhs) const \ { \ return {data op rhs.data}; \ } \ Vc_INTRINSIC SimdArray &operator op##=(const SimdArray &rhs) \ { \ data op## = rhs.data; \ return *this; \ } Vc_ALL_ARITHMETICS(Vc_BINARY_OPERATOR_); Vc_ALL_BINARY(Vc_BINARY_OPERATOR_); Vc_ALL_SHIFTS(Vc_BINARY_OPERATOR_); #undef Vc_BINARY_OPERATOR_ #define Vc_COMPARES(op) \ Vc_INTRINSIC mask_type operator op(const SimdArray &rhs) const \ { \ return {data op rhs.data}; \ } Vc_ALL_COMPARES(Vc_COMPARES); #undef Vc_COMPARES /// \copydoc Vector::isNegative Vc_DEPRECATED("use isnegative(x) instead") Vc_INTRINSIC MaskType isNegative() const { return {isnegative(data)}; } private: friend reference; Vc_INTRINSIC static value_type get(const SimdArray &o, int i) noexcept { return o.data[i]; } template Vc_INTRINSIC static void set(SimdArray &o, int i, U &&v) noexcept( noexcept(std::declval() = v)) { o.data[i] = v; } public: /** * \note the returned object models the concept of a reference and * as such it can exist longer than the data it is referencing. * \note to avoid lifetime issues, we strongly advice not to store * any reference objects. 
*/ Vc_INTRINSIC reference operator[](size_t i) noexcept { static_assert(noexcept(reference{std::declval(), int()}), ""); return {*this, int(i)}; } Vc_INTRINSIC value_type operator[](size_t i) const noexcept { return get(*this, int(i)); } Vc_INTRINSIC Common::WriteMaskedVector operator()(const mask_type &k) { return {*this, k}; } Vc_INTRINSIC void assign(const SimdArray &v, const mask_type &k) { data.assign(v.data, internal_data(k)); } // reductions //////////////////////////////////////////////////////// #define Vc_REDUCTION_FUNCTION_(name_) \ Vc_INTRINSIC Vc_PURE value_type name_() const { return data.name_(); } \ Vc_INTRINSIC Vc_PURE value_type name_(mask_type mask) const \ { \ return data.name_(internal_data(mask)); \ } \ Vc_NOTHING_EXPECTING_SEMICOLON Vc_REDUCTION_FUNCTION_(min); Vc_REDUCTION_FUNCTION_(max); Vc_REDUCTION_FUNCTION_(product); Vc_REDUCTION_FUNCTION_(sum); #undef Vc_REDUCTION_FUNCTION_ Vc_INTRINSIC Vc_PURE SimdArray partialSum() const { return data.partialSum(); } template Vc_INTRINSIC SimdArray apply(F &&f) const { return {data.apply(std::forward(f))}; } template Vc_INTRINSIC SimdArray apply(F &&f, const mask_type &k) const { return {data.apply(std::forward(f), k)}; } Vc_INTRINSIC SimdArray shifted(int amount) const { return {data.shifted(amount)}; } template Vc_INTRINSIC SimdArray shifted(int amount, const SimdArray &shiftIn) const { return {data.shifted(amount, simd_cast(shiftIn))}; } Vc_INTRINSIC SimdArray rotated(int amount) const { return {data.rotated(amount)}; } /// \copydoc Vector::exponent Vc_DEPRECATED("use exponent(x) instead") Vc_INTRINSIC SimdArray exponent() const { return {exponent(data)}; } Vc_INTRINSIC SimdArray interleaveLow(SimdArray x) const { return {data.interleaveLow(x.data)}; } Vc_INTRINSIC SimdArray interleaveHigh(SimdArray x) const { return {data.interleaveHigh(x.data)}; } Vc_INTRINSIC SimdArray reversed() const { return {data.reversed()}; } Vc_INTRINSIC SimdArray sorted() const { return {data.sorted()}; } template static Vc_INTRINSIC SimdArray generate(const G &gen) { return {VectorType::generate(gen)}; } Vc_DEPRECATED("use copysign(x, y) instead") Vc_INTRINSIC SimdArray copySign(const SimdArray &reference) const { return {Vc::copysign(data, reference.data)}; } friend VectorType &internal_data<>(SimdArray &x); friend const VectorType &internal_data<>(const SimdArray &x); /// \internal Vc_INTRINSIC SimdArray(VectorType &&x) : data(std::move(x)) {} Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(storage_type)); private: // The alignas attribute attached to the class declaration above is ignored by ICC // 17.0.0 (at least). So just move the alignas attribute down here where it works for // all compilers. 
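// Illustrative usage sketch, not from the Vc sources: element access, masked
// reductions and apply() as declared above, written against the public
// SimdArray interface.  The width 8 and the function name are arbitrary
// example choices.
#include <Vc/Vc>
#include <cassert>

inline void simdarray_reduction_sketch()
{
    using V = Vc::SimdArray<float, 8>;
    V v = V::IndexesFromZero();             // 0, 1, 2, ..., 7

    assert(v[3] == 3.f);                    // scalar element access
    assert(v.min() == 0.f && v.max() == 7.f);
    assert(v.sum() == 28.f);

    const auto upper = v > 3.f;             // SimdMaskArray<float, 8>
    assert(v.sum(upper) == 4.f + 5.f + 6.f + 7.f);   // only masked lanes contribute

    const V doubled = v.apply([](float x) { return 2.f * x; });
    assert(doubled[7] == 14.f);
}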
alignas(static_cast( Common::BoundedAlignment::value * sizeof(VectorType_) / VectorType_::size()>::value)) storage_type data; }; template constexpr std::size_t SimdArray::Size; template constexpr std::size_t SimdArray::MemoryAlignment; template #ifndef Vc_MSVC Vc_INTRINSIC #endif VectorType &internal_data(SimdArray &x) { return x.data; } template #ifndef Vc_MSVC Vc_INTRINSIC #endif const VectorType &internal_data(const SimdArray &x) { return x.data; } // unpackIfSegment {{{2 template T unpackIfSegment(T &&x) { return std::forward(x); } template auto unpackIfSegment(Common::Segment &&x) -> decltype(x.asSimdArray()) { return x.asSimdArray(); } // gatherImplementation {{{2 template template inline void SimdArray::gatherImplementation(const MT *mem, const IT &indexes) { data.gather(mem, unpackIfSegment(indexes)); } template template inline void SimdArray::gatherImplementation(const MT *mem, const IT &indexes, MaskArgument mask) { data.gather(mem, unpackIfSegment(indexes), mask); } // scatterImplementation {{{2 template template inline void SimdArray::scatterImplementation(MT *mem, IT &&indexes) const { data.scatter(mem, unpackIfSegment(std::forward(indexes))); } template template inline void SimdArray::scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const { data.scatter(mem, unpackIfSegment(std::forward(indexes)), mask); } // generic SimdArray {{{1 /** * Data-parallel arithmetic type with user-defined number of elements. * * \tparam T The type of the vector's elements. The supported types currently are limited * to the types supported by Vc::Vector. * * \tparam N The number of elements to store and process concurrently. You can choose an * arbitrary number, though not every number is a good idea. * Generally, a power of two value or the sum of two power of two values might * work efficiently, though this depends a lot on the target system. * * \tparam V Don't change the default value unless you really know what you are doing. * This type is set to the underlying native Vc::Vector type used in the * implementation of the type. * Having it as part of the type name guards against some cases of ODR * violations (i.e. linking incompatible translation units / libraries). * * \tparam Wt Don't ever change the default value. * This parameter is an unfortunate implementation detail shining through. * * \warning Choosing \p N too large (what “too large” means depends on the target) will * result in excessive compilation times and high (or too high) register * pressure, thus potentially negating the improvement from concurrent execution. * As a rule of thumb, keep \p N less or equal to `2 * float_v::size()`. * * \warning A special portability concern arises from a current limitation in the MIC * implementation (Intel Knights Corner), where SimdArray types with \p T = \p * (u)short require an \p N either less than short_v::size() or a multiple of * short_v::size(). * * \headerfile simdarray.h */ template class SimdArray { static_assert(std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value, "SimdArray may only be used with T = { double, float, int32_t, uint32_t, int16_t, uint16_t }"); static_assert( // either the EntryType and VectorEntryType of the main V are equal std::is_same::value || // or N is a multiple of V::size() (N % V::size() == 0), "SimdArray<(un)signed short, N> on MIC only works correctly for N = k * " "MIC::(u)short_v::size(), i.e. 
k * 16."); using my_traits = SimdArrayTraits; static constexpr std::size_t N0 = my_traits::N0; static constexpr std::size_t N1 = my_traits::N1; using Split = Common::Split; template using CArray = U[K]; public: using storage_type0 = typename my_traits::storage_type0; using storage_type1 = typename my_traits::storage_type1; static_assert(storage_type0::size() == N0, ""); /**\internal * This type reveals the implementation-specific type used for the data member. */ using vector_type = V; using vectorentry_type = typename storage_type0::vectorentry_type; typedef vectorentry_type alias_type Vc_MAY_ALIAS; /// The type of the elements (i.e.\ \p T) using value_type = T; /// The type of the mask used for masked operations and returned from comparisons. using mask_type = SimdMaskArray; /// The type of the vector used for indexes in gather and scatter operations. using index_type = SimdArray; /** * Returns \p N, the number of scalar components in an object of this type. * * The size of the SimdArray, i.e. the number of scalar elements in the vector. In * contrast to Vector::size() you have control over this value via the \p N template * parameter of the SimdArray class template. * * \returns The number of scalar values stored and manipulated concurrently by objects * of this type. */ static constexpr std::size_t size() { return N; } /// \copydoc mask_type using Mask = mask_type; /// \copydoc mask_type using MaskType = Mask; using MaskArgument = const MaskType &; using VectorEntryType = vectorentry_type; /// \copydoc value_type using EntryType = value_type; /// \copydoc index_type using IndexType = index_type; using AsArg = const SimdArray &; using reference = Detail::ElementReference; ///\copydoc Vector::MemoryAlignment static constexpr std::size_t MemoryAlignment = storage_type0::MemoryAlignment > storage_type1::MemoryAlignment ? storage_type0::MemoryAlignment : storage_type1::MemoryAlignment; /// \name Generators ///@{ ///\copybrief Vector::Zero static Vc_INTRINSIC SimdArray Zero() { return SimdArray(Vc::Zero); } ///\copybrief Vector::One static Vc_INTRINSIC SimdArray One() { return SimdArray(Vc::One); } ///\copybrief Vector::IndexesFromZero static Vc_INTRINSIC SimdArray IndexesFromZero() { return SimdArray(Vc::IndexesFromZero); } ///\copydoc Vector::Random static Vc_INTRINSIC SimdArray Random() { return fromOperation(Common::Operations::random()); } ///\copybrief Vector::generate template static Vc_INTRINSIC SimdArray generate(const G &gen) // {{{2 { auto tmp = storage_type0::generate(gen); // GCC bug: the order of evaluation in // an initializer list is well-defined // (front to back), but GCC 4.8 doesn't // implement this correctly. Therefore // we enforce correct order. 
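// Illustrative usage sketch, not from the Vc sources: the generator functions
// declared above (Zero, One, IndexesFromZero, Random, generate) fill a
// SimdArray without touching memory.  Width 4 and the names are arbitrary
// example choices.
#include <Vc/Vc>
#include <cstddef>
#include <cassert>

inline void simdarray_generator_sketch()
{
    using V = Vc::SimdArray<float, 4>;

    const V zeros = V::Zero();                        // 0, 0, 0, 0
    const V iota  = V::IndexesFromZero();             // 0, 1, 2, 3
    const V sq    = V::generate([](std::size_t i) {   // gen(i) is invoked once per lane
        return float(i * i);
    });

    assert(zeros[2] == 0.f);
    assert(iota[3] == 3.f);
    assert(sq[3] == 9.f);
}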
return {std::move(tmp), storage_type1::generate([&](std::size_t i) { return gen(i + N0); })}; } ///@} /// \name Compile-Time Constant Initialization ///@{ ///\copydoc Vector::Vector() #ifndef Vc_MSVC // bogus error C2580 SimdArray() = default; #endif ///@} /// \name Conversion/Broadcast Constructors ///@{ ///\copydoc Vector::Vector(EntryType) Vc_INTRINSIC SimdArray(value_type a) : data0(a), data1(a) {} template < typename U, typename = enable_if::value && !std::is_same::value>> SimdArray(U a) : SimdArray(static_cast(a)) { } ///@} // default copy ctor/operator SimdArray(const SimdArray &) = default; SimdArray(SimdArray &&) = default; SimdArray &operator=(const SimdArray &) = default; // load ctor template ::value>> explicit Vc_INTRINSIC SimdArray(const U *mem, Flags f = Flags()) : data0(mem, f), data1(mem + storage_type0::size(), f) { } // MSVC does overload resolution differently and takes the const U *mem overload (I hope) #ifndef Vc_MSVC /**\internal * Load from a C-array. This is basically the same function as the load constructor * above, except that the forwarding reference overload would steal the deal and the * constructor above doesn't get called. This overload is required to enable loads * from C-arrays. */ template ::value>> explicit Vc_INTRINSIC SimdArray(CArray &mem, Flags f = Flags()) : data0(&mem[0], f), data1(&mem[storage_type0::size()], f) { } /**\internal * Const overload of the above. */ template ::value>> explicit Vc_INTRINSIC SimdArray(const CArray &mem, Flags f = Flags()) : data0(&mem[0], f), data1(&mem[storage_type0::size()], f) { } #endif // initializer list Vc_INTRINSIC SimdArray(const std::initializer_list &init) : data0(init.begin(), Vc::Unaligned) , data1(init.begin() + storage_type0::size(), Vc::Unaligned) { #if defined Vc_CXX14 && 0 // doesn't compile yet static_assert(init.size() == size(), "The initializer_list argument to " "SimdArray must contain exactly N " "values."); #else Vc_ASSERT(init.size() == size()); #endif } #include "gatherinterface.h" #include "scatterinterface.h" // forward all remaining ctors template ::value && !Traits::is_initializer_list::value && !Traits::is_gather_signature::value && !Traits::is_load_arguments::value>> explicit Vc_INTRINSIC SimdArray(Args &&... args) : data0(Split::lo(args)...) // no forward here - it could move and thus // break the next line , data1(Split::hi(std::forward(args))...) 
{ } // explicit casts template Vc_INTRINSIC explicit SimdArray( W &&x, enable_if<(Traits::is_simd_vector::value && Traits::simd_vector_size::value == N && !(std::is_convertible, T>::value && Traits::isSimdArray::value))> = nullarg) : data0(Split::lo(x)), data1(Split::hi(x)) { } // implicit casts template Vc_INTRINSIC SimdArray( W &&x, enable_if<(Traits::isSimdArray::value && Traits::simd_vector_size::value == N && std::is_convertible, T>::value)> = nullarg) : data0(Split::lo(x)), data1(Split::hi(x)) { } // implicit conversion to Vector for if Vector::size() == N and // T implicitly convertible to U template < typename U, typename A, typename = enable_if::value && Vector::Size == N>> operator Vector() const { return simd_cast>(data0, data1); } //////////////////// other functions /////////////// Vc_INTRINSIC void setZero() { data0.setZero(); data1.setZero(); } Vc_INTRINSIC void setZero(const mask_type &k) { data0.setZero(Split::lo(k)); data1.setZero(Split::hi(k)); } Vc_INTRINSIC void setZeroInverted() { data0.setZeroInverted(); data1.setZeroInverted(); } Vc_INTRINSIC void setZeroInverted(const mask_type &k) { data0.setZeroInverted(Split::lo(k)); data1.setZeroInverted(Split::hi(k)); } Vc_INTRINSIC void setQnan() { data0.setQnan(); data1.setQnan(); } Vc_INTRINSIC void setQnan(const mask_type &m) { data0.setQnan(Split::lo(m)); data1.setQnan(Split::hi(m)); } ///\internal execute specified Operation template static Vc_INTRINSIC SimdArray fromOperation(Op op, Args &&... args) { SimdArray r = { storage_type0::fromOperation(op, Split::lo(args)...), // no forward here - it // could move and thus // break the next line storage_type1::fromOperation(op, Split::hi(std::forward(args))...)}; return r; } ///\internal template static Vc_INTRINSIC void callOperation(Op op, Args &&... args) { storage_type0::callOperation(op, Split::lo(args)...); storage_type1::callOperation(op, Split::hi(std::forward(args))...); } template Vc_INTRINSIC void load(const U *mem, Args &&... args) { data0.load(mem, Split::lo(args)...); // no forward here - it could move and thus // break the next line data1.load(mem + storage_type0::size(), Split::hi(std::forward(args))...); } template Vc_INTRINSIC void store(U *mem, Args &&... 
args) const { data0.store(mem, Split::lo(args)...); // no forward here - it could move and thus // break the next line data1.store(mem + storage_type0::size(), Split::hi(std::forward(args))...); } Vc_INTRINSIC mask_type operator!() const { return {!data0, !data1}; } Vc_INTRINSIC SimdArray operator-() const { return {-data0, -data1}; } /// Returns a copy of itself Vc_INTRINSIC SimdArray operator+() const { return *this; } Vc_INTRINSIC SimdArray operator~() const { return {~data0, ~data1}; } // left/right shift operators {{{2 template ::value && std::is_integral::value>> Vc_INTRINSIC Vc_CONST SimdArray operator<<(U x) const { return {data0 << x, data1 << x}; } template ::value && std::is_integral::value>> Vc_INTRINSIC SimdArray &operator<<=(U x) { data0 <<= x; data1 <<= x; return *this; } template ::value && std::is_integral::value>> Vc_INTRINSIC Vc_CONST SimdArray operator>>(U x) const { return {data0 >> x, data1 >> x}; } template ::value && std::is_integral::value>> Vc_INTRINSIC SimdArray &operator>>=(U x) { data0 >>= x; data1 >>= x; return *this; } // binary operators {{{2 #define Vc_BINARY_OPERATOR_(op) \ Vc_INTRINSIC Vc_CONST SimdArray operator op(const SimdArray &rhs) const \ { \ return {data0 op rhs.data0, data1 op rhs.data1}; \ } \ Vc_INTRINSIC SimdArray &operator op##=(const SimdArray &rhs) \ { \ data0 op## = rhs.data0; \ data1 op## = rhs.data1; \ return *this; \ } Vc_ALL_ARITHMETICS(Vc_BINARY_OPERATOR_); Vc_ALL_BINARY(Vc_BINARY_OPERATOR_); Vc_ALL_SHIFTS(Vc_BINARY_OPERATOR_); #undef Vc_BINARY_OPERATOR_ #define Vc_COMPARES(op) \ Vc_INTRINSIC mask_type operator op(const SimdArray &rhs) const \ { \ return {data0 op rhs.data0, data1 op rhs.data1}; \ } Vc_ALL_COMPARES(Vc_COMPARES); #undef Vc_COMPARES // operator[] {{{2 /// \name Scalar Subscript Operators ///@{ private: friend reference; Vc_INTRINSIC static value_type get(const SimdArray &o, int i) noexcept { return reinterpret_cast(&o)[i]; } template Vc_INTRINSIC static void set(SimdArray &o, int i, U &&v) noexcept( noexcept(std::declval() = v)) { reinterpret_cast(&o)[i] = v; } public: ///\copydoc Vector::operator[](size_t) /** * \note the returned object models the concept of a reference and * as such it can exist longer than the data it is referencing. * \note to avoid lifetime issues, we strongly advice not to store * any reference objects. 
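// Illustrative usage sketch, not from the Vc sources: the compare operators
// above return a SimdMaskArray, and together with operator()(mask) defined
// just below they enable write-masked assignment.  Width 8 and the names are
// arbitrary example choices.
#include <Vc/Vc>
#include <cassert>

inline void simdarray_write_mask_sketch()
{
    using V = Vc::SimdArray<float, 8>;
    V v = V::IndexesFromZero() - 3.f;   // -3, -2, -1, 0, 1, 2, 3, 4

    const auto negative = v < 0.f;      // SimdMaskArray<float, 8>
    v(negative) = V::Zero();            // assign only to the negative lanes

    assert(v[0] == 0.f && v[2] == 0.f); // clamped
    assert(v[7] == 4.f);                // untouched
}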
*/ Vc_INTRINSIC reference operator[](size_t i) noexcept { static_assert(noexcept(reference{std::declval(), int()}), ""); return {*this, int(i)}; } ///\copydoc Vector::operator[](size_t) const Vc_INTRINSIC value_type operator[](size_t index) const noexcept { return get(*this, int(index)); } ///@} // operator(){{{2 ///\copydoc Vector::operator()(MaskType) Vc_INTRINSIC Common::WriteMaskedVector operator()( const mask_type &mask) { return {*this, mask}; } ///\internal Vc_INTRINSIC void assign(const SimdArray &v, const mask_type &k) //{{{2 { data0.assign(v.data0, internal_data0(k)); data1.assign(v.data1, internal_data1(k)); } // reductions {{{2 #define Vc_REDUCTION_FUNCTION_(name_, binary_fun_, scalar_fun_) \ private: \ template \ Vc_INTRINSIC enable_if::value && \ storage_type0::Size == storage_type1::Size, \ value_type> name_##_impl() const \ { \ return binary_fun_(data0, data1).name_(); \ } \ \ template \ Vc_INTRINSIC enable_if::value && \ storage_type0::Size != storage_type1::Size, \ value_type> name_##_impl() const \ { \ return scalar_fun_(data0.name_(), data1.name_()); \ } \ \ public: \ /**\copybrief Vector::##name_ */ \ Vc_INTRINSIC value_type name_() const { return name_##_impl(); } \ /**\copybrief Vector::##name_ */ \ Vc_INTRINSIC value_type name_(const mask_type &mask) const \ { \ if (Vc_IS_UNLIKELY(Split::lo(mask).isEmpty())) { \ return data1.name_(Split::hi(mask)); \ } else if (Vc_IS_UNLIKELY(Split::hi(mask).isEmpty())) { \ return data0.name_(Split::lo(mask)); \ } else { \ return scalar_fun_(data0.name_(Split::lo(mask)), \ data1.name_(Split::hi(mask))); \ } \ } \ Vc_NOTHING_EXPECTING_SEMICOLON Vc_REDUCTION_FUNCTION_(min, Vc::min, std::min); Vc_REDUCTION_FUNCTION_(max, Vc::max, std::max); Vc_REDUCTION_FUNCTION_(product, internal::product_helper_, internal::product_helper_); Vc_REDUCTION_FUNCTION_(sum, internal::sum_helper_, internal::sum_helper_); #undef Vc_REDUCTION_FUNCTION_ ///\copybrief Vector::partialSum Vc_INTRINSIC Vc_PURE SimdArray partialSum() const //{{{2 { auto ps0 = data0.partialSum(); auto tmp = data1; tmp[0] += ps0[data0.size() - 1]; return {std::move(ps0), tmp.partialSum()}; } // apply {{{2 ///\copybrief Vector::apply(F &&) const template inline SimdArray apply(F &&f) const { return {data0.apply(f), data1.apply(f)}; } ///\copybrief Vector::apply(F &&, MaskType) const template inline SimdArray apply(F &&f, const mask_type &k) const { return {data0.apply(f, Split::lo(k)), data1.apply(f, Split::hi(k))}; } // shifted {{{2 ///\copybrief Vector::shifted(int) const inline SimdArray shifted(int amount) const { constexpr int SSize = Size; constexpr int SSize0 = storage_type0::Size; constexpr int SSize1 = storage_type1::Size; if (amount == 0) { return *this; } if (amount < 0) { if (amount > -SSize0) { return {data0.shifted(amount), data1.shifted(amount, data0)}; } if (amount == -SSize0) { return {storage_type0::Zero(), simd_cast(data0)}; } if (amount < -SSize0) { return {storage_type0::Zero(), simd_cast(data0.shifted( amount + SSize0))}; } return Zero(); } else { if (amount >= SSize) { return Zero(); } else if (amount >= SSize0) { return { simd_cast(data1).shifted(amount - SSize0), storage_type1::Zero()}; } else if (amount >= SSize1) { return {data0.shifted(amount, data1), storage_type1::Zero()}; } else { return {data0.shifted(amount, data1), data1.shifted(amount)}; } } } template inline enable_if< !(std::is_same::value && // not bisectable N == NN), SimdArray> shifted(int amount, const SimdArray &shiftIn) const { constexpr int SSize = Size; if (amount < 0) { return 
SimdArray::generate([&](int i) -> value_type { i += amount; if (i >= 0) { return operator[](i); } else if (i >= -SSize) { return shiftIn[i + SSize]; } return 0; }); } return SimdArray::generate([&](int i) -> value_type { i += amount; if (i < SSize) { return operator[](i); } else if (i < 2 * SSize) { return shiftIn[i - SSize]; } return 0; }); } private: // workaround for MSVC not understanding the simpler and shorter expression of the boolean // expression directly in the enable_if below template struct bisectable_shift : public std::integral_constant::value && // bisectable N == NN> { }; public: template inline SimdArray shifted(enable_if::value, int> amount, const SimdArray &shiftIn) const { constexpr int SSize = Size; if (amount < 0) { if (amount > -static_cast(storage_type0::Size)) { return {data0.shifted(amount, internal_data1(shiftIn)), data1.shifted(amount, data0)}; } if (amount == -static_cast(storage_type0::Size)) { return {storage_type0(internal_data1(shiftIn)), storage_type1(data0)}; } if (amount > -SSize) { return { internal_data1(shiftIn) .shifted(amount + static_cast(storage_type0::Size), internal_data0(shiftIn)), data0.shifted(amount + static_cast(storage_type0::Size), internal_data1(shiftIn))}; } if (amount == -SSize) { return shiftIn; } if (amount > -2 * SSize) { return shiftIn.shifted(amount + SSize); } } if (amount == 0) { return *this; } if (amount < static_cast(storage_type0::Size)) { return {data0.shifted(amount, data1), data1.shifted(amount, internal_data0(shiftIn))}; } if (amount == static_cast(storage_type0::Size)) { return {storage_type0(data1), storage_type1(internal_data0(shiftIn))}; } if (amount < SSize) { return {data1.shifted(amount - static_cast(storage_type0::Size), internal_data0(shiftIn)), internal_data0(shiftIn) .shifted(amount - static_cast(storage_type0::Size), internal_data1(shiftIn))}; } if (amount == SSize) { return shiftIn; } if (amount < 2 * SSize) { return shiftIn.shifted(amount - SSize); } return Zero(); } // rotated {{{2 ///\copybrief Vector::rotated Vc_INTRINSIC SimdArray rotated(int amount) const { amount %= int(size()); if (amount == 0) { return *this; } else if (amount < 0) { amount += size(); } #ifdef Vc_MSVC // MSVC fails to find a SimdArray::shifted function with 2 arguments. So use store // -> // load to implement the function instead. 
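// Illustrative usage sketch, not from the Vc sources: semantics of the
// shifted() and rotated() members implemented above.  shifted() moves values
// towards lane 0 for positive amounts and fills vacated lanes with zero (or
// from shiftIn, if given); rotated() wraps them around.  Width 4 and the names
// are arbitrary example choices.
#include <Vc/Vc>
#include <cassert>

inline void simdarray_shift_rotate_sketch()
{
    using V = Vc::SimdArray<int, 4>;
    const V v = V::IndexesFromZero() + 1;   // 1, 2, 3, 4

    const V s = v.shifted(1);               // 2, 3, 4, 0
    assert(s[0] == 2 && s[3] == 0);

    const V r = v.rotated(1);               // 2, 3, 4, 1
    assert(r[0] == 2 && r[3] == 1);

    const V in = V::IndexesFromZero() + 5;  // 5, 6, 7, 8
    const V t = v.shifted(1, in);           // 2, 3, 4, 5: vacated lane taken from 'in'
    assert(t[3] == 5);
}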
alignas(MemoryAlignment) T tmp[N + data0.size()]; data0.store(&tmp[0], Vc::Aligned); data1.store(&tmp[data0.size()], Vc::Aligned); data0.store(&tmp[N], Vc::Unaligned); SimdArray r; r.data0.load(&tmp[amount], Vc::Unaligned); r.data1.load(&tmp[(amount + data0.size()) % size()], Vc::Unaligned); return r; #else auto &&d0cvtd = simd_cast(data0); auto &&d1cvtd = simd_cast(data1); constexpr int size0 = storage_type0::size(); constexpr int size1 = storage_type1::size(); if (amount == size0 && std::is_same::value) { return {std::move(d1cvtd), std::move(d0cvtd)}; } else if (amount < size1) { return {data0.shifted(amount, d1cvtd), data1.shifted(amount, d0cvtd)}; } else if (amount == size1) { return {data0.shifted(amount, d1cvtd), std::move(d0cvtd)}; } else if (int(size()) - amount < size1) { return {data0.shifted(amount - int(size()), d1cvtd.shifted(size1 - size0)), data1.shifted(amount - int(size()), data0.shifted(size0 - size1))}; } else if (int(size()) - amount == size1) { return {data0.shifted(-size1, d1cvtd.shifted(size1 - size0)), simd_cast(data0.shifted(size0 - size1))}; } else if (amount <= size0) { return {data0.shifted(size1, d1cvtd).shifted(amount - size1, data0), simd_cast(data0.shifted(amount - size1))}; } else { return {data0.shifted(size1, d1cvtd).shifted(amount - size1, data0), simd_cast(data0.shifted(amount - size1, d1cvtd))}; } return *this; #endif } // interleaveLow/-High {{{2 ///\internal \copydoc Vector::interleaveLow Vc_INTRINSIC SimdArray interleaveLow(const SimdArray &x) const { // return data0[0], x.data0[0], data0[1], x.data0[1], ... return {data0.interleaveLow(x.data0), simd_cast(data0.interleaveHigh(x.data0))}; } ///\internal \copydoc Vector::interleaveHigh Vc_INTRINSIC SimdArray interleaveHigh(const SimdArray &x) const { return interleaveHighImpl( x, std::integral_constant()); } private: ///\internal Vc_INTRINSIC SimdArray interleaveHighImpl(const SimdArray &x, std::true_type) const { return {data1.interleaveLow(x.data1), data1.interleaveHigh(x.data1)}; } ///\internal inline SimdArray interleaveHighImpl(const SimdArray &x, std::false_type) const { return {data0.interleaveHigh(x.data0) .shifted(storage_type1::Size, simd_cast(data1.interleaveLow(x.data1))), data1.interleaveHigh(x.data1)}; } public: ///\copybrief Vector::reversed inline SimdArray reversed() const //{{{2 { if (std::is_same::value) { return {simd_cast(data1).reversed(), simd_cast(data0).reversed()}; } else { #ifdef Vc_MSVC // MSVC fails to find a SimdArray::shifted function with 2 arguments. So use // store // -> load to implement the function instead. 
alignas(MemoryAlignment) T tmp[N]; data1.reversed().store(&tmp[0], Vc::Aligned); data0.reversed().store(&tmp[data1.size()], Vc::Unaligned); return SimdArray{&tmp[0], Vc::Aligned}; #else return {data0.shifted(storage_type1::Size, data1).reversed(), simd_cast(data0.reversed().shifted( storage_type0::Size - storage_type1::Size))}; #endif } } ///\copydoc Vector::sorted inline SimdArray sorted() const //{{{2 { return sortedImpl( std::integral_constant()); } ///\internal Vc_INTRINSIC SimdArray sortedImpl(std::true_type) const { #ifdef Vc_DEBUG_SORTED std::cerr << "-- " << data0 << data1 << '\n'; #endif const auto a = data0.sorted(); const auto b = data1.sorted().reversed(); const auto lo = Vc::min(a, b); const auto hi = Vc::max(a, b); return {lo.sorted(), hi.sorted()}; } ///\internal Vc_INTRINSIC SimdArray sortedImpl(std::false_type) const { using SortableArray = SimdArray::value>; auto sortable = simd_cast(*this); for (std::size_t i = Size; i < SortableArray::Size; ++i) { using limits = std::numeric_limits; if (limits::has_infinity) { sortable[i] = limits::infinity(); } else { sortable[i] = std::numeric_limits::max(); } } return simd_cast(sortable.sorted()); /* The following implementation appears to be less efficient. But this may need further * work. const auto a = data0.sorted(); const auto b = data1.sorted(); #ifdef Vc_DEBUG_SORTED std::cerr << "== " << a << b << '\n'; #endif auto aIt = Vc::begin(a); auto bIt = Vc::begin(b); const auto aEnd = Vc::end(a); const auto bEnd = Vc::end(b); return SimdArray::generate([&](std::size_t) { if (aIt == aEnd) { return *(bIt++); } if (bIt == bEnd) { return *(aIt++); } if (*aIt < *bIt) { return *(aIt++); } else { return *(bIt++); } }); */ } /// \name Deprecated Members ///@{ ///\copydoc size ///\deprecated Use size() instead. static constexpr std::size_t Size = size(); /// \copydoc Vector::exponent Vc_DEPRECATED("use exponent(x) instead") Vc_INTRINSIC SimdArray exponent() const { return {exponent(data0), exponent(data1)}; } /// \copydoc Vector::isNegative Vc_DEPRECATED("use isnegative(x) instead") Vc_INTRINSIC MaskType isNegative() const { return {isnegative(data0), isnegative(data1)}; } ///\copydoc Vector::copySign Vc_DEPRECATED("use copysign(x, y) instead") Vc_INTRINSIC SimdArray copySign(const SimdArray &reference) const { return {Vc::copysign(data0, reference.data0), Vc::copysign(data1, reference.data1)}; } ///@} // internal_data0/1 {{{2 friend storage_type0 &internal_data0<>(SimdArray &x); friend storage_type1 &internal_data1<>(SimdArray &x); friend const storage_type0 &internal_data0<>(const SimdArray &x); friend const storage_type1 &internal_data1<>(const SimdArray &x); /// \internal Vc_INTRINSIC SimdArray(storage_type0 &&x, storage_type1 &&y) //{{{2 : data0(std::move(x)), data1(std::move(y)) { } Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(storage_type0)); private: //{{{2 // The alignas attribute attached to the class declaration above is ignored by ICC // 17.0.0 (at least). So just move the alignas attribute down here where it works for // all compilers. 
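// Illustrative usage sketch, not from the Vc sources: reversed() and sorted()
// as implemented above.  For a SimdArray that is not bisectable the sort pads
// to a larger width with +infinity (or the type's maximum) so that the padding
// never appears in the result.  Width 4 and the names are arbitrary example
// choices.
#include <Vc/Vc>
#include <cassert>

inline void simdarray_sort_sketch()
{
    using V = Vc::SimdArray<float, 4>;
    V v;
    v[0] = 3.f; v[1] = 1.f; v[2] = 4.f; v[3] = 2.f;

    const V rev = v.reversed();   // 2, 4, 1, 3
    assert(rev[0] == 2.f && rev[3] == 3.f);

    const V srt = v.sorted();     // 1, 2, 3, 4 (ascending)
    assert(srt[0] == 1.f && srt[3] == 4.f);
}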
alignas(static_cast( Common::BoundedAlignment::value * sizeof(V) / V::size()>::value)) storage_type0 data0; storage_type1 data1; }; #undef Vc_CURRENT_CLASS_NAME template constexpr std::size_t SimdArray::Size; template constexpr std::size_t SimdArray::MemoryAlignment; // gatherImplementation {{{2 template template inline void SimdArray::gatherImplementation(const MT *mem, const IT &indexes) { data0.gather(mem, Split::lo(Common::Operations::gather(), indexes)); data1.gather(mem, Split::hi(Common::Operations::gather(), indexes)); } template template inline void SimdArray::gatherImplementation(const MT *mem, const IT &indexes, MaskArgument mask) { data0.gather(mem, Split::lo(Common::Operations::gather(), indexes), Split::lo(mask)); data1.gather(mem, Split::hi(Common::Operations::gather(), indexes), Split::hi(mask)); } // scatterImplementation {{{2 template template inline void SimdArray::scatterImplementation(MT *mem, IT &&indexes) const { data0.scatter(mem, Split::lo(Common::Operations::gather(), indexes)); // don't forward indexes - it could move and // thus break the next line data1.scatter(mem, Split::hi(Common::Operations::gather(), std::forward(indexes))); } template template inline void SimdArray::scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const { data0.scatter(mem, Split::lo(Common::Operations::gather(), indexes), Split::lo(mask)); // don't forward indexes - it could move and // thus break the next line data1.scatter(mem, Split::hi(Common::Operations::gather(), std::forward(indexes)), Split::hi(mask)); } // internal_data0/1 (SimdArray) {{{1 ///\internal Returns the first data member of a generic SimdArray template #ifndef Vc_MSVC Vc_INTRINSIC #endif typename SimdArrayTraits::storage_type0 &internal_data0( SimdArray &x) { return x.data0; } ///\internal Returns the second data member of a generic SimdArray template #ifndef Vc_MSVC Vc_INTRINSIC #endif typename SimdArrayTraits::storage_type1 &internal_data1( SimdArray &x) { return x.data1; } ///\internal Returns the first data member of a generic SimdArray (const overload) template #ifndef Vc_MSVC Vc_INTRINSIC #endif const typename SimdArrayTraits::storage_type0 &internal_data0( const SimdArray &x) { return x.data0; } ///\internal Returns the second data member of a generic SimdArray (const overload) template #ifndef Vc_MSVC Vc_INTRINSIC #endif const typename SimdArrayTraits::storage_type1 &internal_data1( const SimdArray &x) { return x.data1; } // MSVC workaround for SimdArray(storage_type0, storage_type1) ctor{{{1 // MSVC sometimes stores x to data1. By first broadcasting 0 and then assigning y // in the body the bug is supressed. #if defined Vc_MSVC && defined Vc_IMPL_SSE template <> Vc_INTRINSIC SimdArray, 2>::SimdArray( SimdArray &&x, SimdArray &&y) : data0(x), data1(0) { data1 = y; } #endif // binary operators {{{1 namespace result_vector_type_internal { template using type = typename std::remove_cv::type>::type; template using is_integer_larger_than_int = std::integral_constant< bool, std::is_integral::value &&(sizeof(T) > sizeof(int) || std::is_same::value || std::is_same::value)>; template < typename L, typename R, std::size_t N = Traits::isSimdArray::value ? 
Traits::simd_vector_size::value : Traits::simd_vector_size::value, bool = (Traits::isSimdArray::value || Traits::isSimdArray::value) // one of the operands must be a SimdArray && !std::is_same, type>::value // if the operands are of the same type // use the member function && ((std::is_arithmetic>::value && !is_integer_larger_than_int>::value) || (std::is_arithmetic>::value && !is_integer_larger_than_int>::value) // one of the operands is a scalar // type || ( // or one of the operands is Vector with Vector::size() == // SimdArray::size() Traits::simd_vector_size::value == Traits::simd_vector_size::value && ((Traits::is_simd_vector::value && !Traits::isSimdArray::value) || (Traits::is_simd_vector::value && !Traits::isSimdArray::value))))> struct evaluate; template struct evaluate { private: using LScalar = Traits::entry_type_of; using RScalar = Traits::entry_type_of; template using conditional = typename std::conditional::type; public: // In principle we want the exact same rules for SimdArray ⨉ SimdArray as the standard // defines for T ⨉ U. BUT: short ⨉ short returns int (because all integral types smaller than // int are promoted to int before any operation). This would imply that SIMD types with integral // types smaller than int are more or less useless - and you could use SimdArray from the // start. Therefore we special-case those operations where the scalar type of both operands is // integral and smaller than int. // In addition to that there is no generic support for 64-bit int SIMD types. Therefore // promotion to a 64-bit integral type (including `long` because it can potentially have 64 // bits) also is not done. But if one of the operands is a scalar type that is larger than int // then the operator is disabled altogether. We do not want an implicit demotion. using type = SimdArray< conditional<(std::is_integral::value &&std::is_integral::value && sizeof(LScalar) < sizeof(int) && sizeof(RScalar) < sizeof(int)), conditional<(sizeof(LScalar) == sizeof(RScalar)), conditional::value, LScalar, RScalar>, conditional<(sizeof(LScalar) > sizeof(RScalar)), LScalar, RScalar>>, decltype(std::declval() + std::declval())>, N>; }; } // namespace result_vector_type_internal template using result_vector_type = typename result_vector_type_internal::evaluate::type; static_assert( std::is_same>, Vc::SimdArray>::value, "result_vector_type does not work"); #define Vc_BINARY_OPERATORS_(op_) \ /*!\brief Applies op_ component-wise and concurrently. */ \ template \ Vc_INTRINSIC result_vector_type operator op_(L &&lhs, R &&rhs) \ { \ using Return = result_vector_type; \ return Return(std::forward(lhs)) op_ Return(std::forward(rhs)); \ } /** * \name Arithmetic and Bitwise Operators * * Applies the operator component-wise and concurrently on \p lhs and \p rhs and returns * a new SimdArray object containing the result values. * * This operator only participates in overload resolution if: * \li At least one of the template parameters \p L or \p R is a SimdArray type. * \li Either \p L or \p R is a fundamental arithmetic type but not an integral type * larger than \c int \n * or \n * \p L or \p R is a Vc::Vector type with equal number of elements (Vector::size() == * SimdArray::size()). * * The return type of the operator is a SimdArray type using the more precise EntryType of * \p L or \p R and the same number of elements as the SimdArray argument(s). 
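// Illustrative usage sketch, not from the Vc sources: the non-member operators
// documented above and generated just below accept mixed SimdArray/scalar (and
// SimdArray/Vector) operands and deduce a common SimdArray result type; the
// compare operators yield the matching SimdMaskArray.  Width 8 and the names
// are arbitrary example choices.
#include <Vc/Vc>
#include <type_traits>

inline void simdarray_operator_sketch()
{
    using V = Vc::SimdArray<float, 8>;
    const V x = V::IndexesFromZero();

    auto sum = x + 1.f;              // the scalar operand is broadcast
    static_assert(std::is_same<decltype(sum), V>::value,
                  "SimdArray<float, 8> + float yields SimdArray<float, 8>");

    auto mask = x < 4.f;             // compares return the mask type
    static_assert(std::is_same<decltype(mask), V::mask_type>::value,
                  "SimdArray<float, 8> < float yields SimdMaskArray<float, 8>");

    (void)sum;
    (void)mask;
}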
*/ ///@{ Vc_ALL_ARITHMETICS(Vc_BINARY_OPERATORS_); Vc_ALL_BINARY(Vc_BINARY_OPERATORS_); ///@} #undef Vc_BINARY_OPERATORS_ #define Vc_BINARY_OPERATORS_(op_) \ /*!\brief Applies op_ component-wise and concurrently. */ \ template \ Vc_INTRINSIC typename result_vector_type::mask_type operator op_(L &&lhs, \ R &&rhs) \ { \ using Promote = result_vector_type; \ return Promote(std::forward(lhs)) op_ Promote(std::forward(rhs)); \ } /** * \name Compare Operators * * Applies the operator component-wise and concurrently on \p lhs and \p rhs and returns * a new SimdMaskArray object containing the result values. * * This operator only participates in overload resolution if (same rules as above): * \li At least one of the template parameters \p L or \p R is a SimdArray type. * \li Either \p L or \p R is a fundamental arithmetic type but not an integral type * larger than \c int \n * or \n * \p L or \p R is a Vc::Vector type with equal number of elements (Vector::size() == * SimdArray::size()). * * The return type of the operator is a SimdMaskArray type using the more precise EntryType of * \p L or \p R and the same number of elements as the SimdArray argument(s). */ ///@{ Vc_ALL_COMPARES(Vc_BINARY_OPERATORS_); ///@} #undef Vc_BINARY_OPERATORS_ // math functions {{{1 #define Vc_FORWARD_UNARY_OPERATOR(name_) \ /*!\brief Applies the std::name_ function component-wise and concurrently. */ \ template \ inline SimdArray name_(const SimdArray &x) \ { \ return SimdArray::fromOperation( \ Common::Operations::Forward_##name_(), x); \ } \ Vc_NOTHING_EXPECTING_SEMICOLON #define Vc_FORWARD_UNARY_BOOL_OPERATOR(name_) \ /*!\brief Applies the std::name_ function component-wise and concurrently. */ \ template \ inline SimdMaskArray name_(const SimdArray &x) \ { \ return SimdMaskArray::fromOperation( \ Common::Operations::Forward_##name_(), x); \ } \ Vc_NOTHING_EXPECTING_SEMICOLON #define Vc_FORWARD_BINARY_OPERATOR(name_) \ /*!\brief Applies the std::name_ function component-wise and concurrently. */ \ template \ inline SimdArray name_(const SimdArray &x, \ const SimdArray &y) \ { \ return SimdArray::fromOperation( \ Common::Operations::Forward_##name_(), x, y); \ } \ Vc_NOTHING_EXPECTING_SEMICOLON /** * \name Math functions * These functions evaluate the */ ///@{ Vc_FORWARD_UNARY_OPERATOR(abs); Vc_FORWARD_UNARY_OPERATOR(asin); Vc_FORWARD_UNARY_OPERATOR(atan); Vc_FORWARD_BINARY_OPERATOR(atan2); Vc_FORWARD_UNARY_OPERATOR(ceil); Vc_FORWARD_BINARY_OPERATOR(copysign); Vc_FORWARD_UNARY_OPERATOR(cos); Vc_FORWARD_UNARY_OPERATOR(exp); Vc_FORWARD_UNARY_OPERATOR(exponent); Vc_FORWARD_UNARY_OPERATOR(floor); /// Applies the std::fma function component-wise and concurrently. 
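// Illustrative usage sketch, not from the Vc sources: the component-wise math
// functions generated above (and the fma/frexp/ldexp/sincos overloads that
// follow) mirror their <cmath> counterparts but operate on every lane at once.
// Width 4 and the names are arbitrary example choices.
#include <Vc/Vc>
#include <cassert>

inline void simdarray_math_sketch()
{
    using V = Vc::SimdArray<float, 4>;
    const V x = V::IndexesFromZero() - 1.f;   // -1, 0, 1, 2

    const V a = Vc::abs(x);                   // 1, 0, 1, 2
    assert(a[0] == 1.f && a[3] == 2.f);

    const V f = Vc::floor(x + 0.5f);          // -1, 0, 1, 2
    assert(f[0] == -1.f && f[2] == 1.f);

    V s, c;
    Vc::sincos(x, &s, &c);                    // sine and cosine of all lanes in one call
    assert(s[1] > -1e-6f && s[1] < 1e-6f);    // sin(0) == 0
    assert(c[1] > 0.999f);                    // cos(0) == 1
}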
template inline SimdArray fma(const SimdArray &a, const SimdArray &b, const SimdArray &c) { return SimdArray::fromOperation(Common::Operations::Forward_fma(), a, b, c); } Vc_FORWARD_UNARY_BOOL_OPERATOR(isfinite); Vc_FORWARD_UNARY_BOOL_OPERATOR(isinf); Vc_FORWARD_UNARY_BOOL_OPERATOR(isnan); #if defined Vc_MSVC && defined Vc_IMPL_SSE inline SimdMaskArray, 2> isnan( const SimdArray, 2> &x) { using V = SSE::Vector; const SimdArray &x0 = internal_data0(x); const SimdArray &x1 = internal_data1(x); SimdMaskArray r0; SimdMaskArray r1; internal_data(internal_data0(r0)) = isnan(internal_data(internal_data0(x0))); internal_data(internal_data1(r0)) = isnan(internal_data(internal_data1(x0))); internal_data(internal_data0(r1)) = isnan(internal_data(internal_data0(x1))); internal_data(internal_data1(r1)) = isnan(internal_data(internal_data1(x1))); return {std::move(r0), std::move(r1)}; } #endif Vc_FORWARD_UNARY_BOOL_OPERATOR(isnegative); /// Applies the std::frexp function component-wise and concurrently. template inline SimdArray frexp(const SimdArray &x, SimdArray *e) { return SimdArray::fromOperation(Common::Operations::Forward_frexp(), x, e); } /// Applies the std::ldexp function component-wise and concurrently. template inline SimdArray ldexp(const SimdArray &x, const SimdArray &e) { return SimdArray::fromOperation(Common::Operations::Forward_ldexp(), x, e); } Vc_FORWARD_UNARY_OPERATOR(log); Vc_FORWARD_UNARY_OPERATOR(log10); Vc_FORWARD_UNARY_OPERATOR(log2); Vc_FORWARD_UNARY_OPERATOR(reciprocal); Vc_FORWARD_UNARY_OPERATOR(round); Vc_FORWARD_UNARY_OPERATOR(rsqrt); Vc_FORWARD_UNARY_OPERATOR(sin); /// Determines sine and cosine concurrently and component-wise on \p x. template void sincos(const SimdArray &x, SimdArray *sin, SimdArray *cos) { SimdArray::callOperation(Common::Operations::Forward_sincos(), x, sin, cos); } Vc_FORWARD_UNARY_OPERATOR(sqrt); Vc_FORWARD_UNARY_OPERATOR(trunc); Vc_FORWARD_BINARY_OPERATOR(min); Vc_FORWARD_BINARY_OPERATOR(max); ///@} #undef Vc_FORWARD_UNARY_OPERATOR #undef Vc_FORWARD_UNARY_BOOL_OPERATOR #undef Vc_FORWARD_BINARY_OPERATOR // simd_cast {{{1 #ifdef Vc_MSVC #define Vc_DUMMY_ARG0 , int = 0 #define Vc_DUMMY_ARG1 , long = 0 #define Vc_DUMMY_ARG2 , short = 0 #define Vc_DUMMY_ARG3 , char = '0' #define Vc_DUMMY_ARG4 , unsigned = 0u #define Vc_DUMMY_ARG5 , unsigned short = 0u #else #define Vc_DUMMY_ARG0 #define Vc_DUMMY_ARG1 #define Vc_DUMMY_ARG2 #define Vc_DUMMY_ARG3 #define Vc_DUMMY_ARG4 #define Vc_DUMMY_ARG5 #endif // Vc_MSVC // simd_cast_impl_smaller_input {{{2 // The following function can be implemented without the sizeof...(From) overload. // However, ICC has a bug (Premier Issue #6000116338) which leads to an ICE. Splitting the // function in two works around the issue. template Vc_INTRINSIC Vc_CONST enable_if simd_cast_impl_smaller_input(const From &... xs, const T &last) { Return r = simd_cast(xs...); for (size_t i = 0; i < N; ++i) { r[i + N * sizeof...(From)] = static_cast(last[i]); } return r; } template Vc_INTRINSIC Vc_CONST Return simd_cast_impl_smaller_input(const T &last) { Return r = Return(); for (size_t i = 0; i < N; ++i) { r[i] = static_cast(last[i]); } return r; } template Vc_INTRINSIC Vc_CONST enable_if simd_cast_impl_larger_input( const From &... 
xs, const T &last) { Return r = simd_cast(xs...); for (size_t i = N * sizeof...(From); i < Return::Size; ++i) { r[i] = static_cast(last[i - N * sizeof...(From)]); } return r; } template Vc_INTRINSIC Vc_CONST Return simd_cast_impl_larger_input(const T &last) { Return r = Return(); for (size_t i = 0; i < Return::size(); ++i) { r[i] = static_cast(last[i]); } return r; } // simd_cast_without_last (declaration) {{{2 template Vc_INTRINSIC_L Vc_CONST_L Return simd_cast_without_last(const From &... xs, const T &) Vc_INTRINSIC_R Vc_CONST_R; // are_all_types_equal {{{2 template struct are_all_types_equal; template struct are_all_types_equal : public std::integral_constant { }; template struct are_all_types_equal : public std::integral_constant< bool, std::is_same::value && are_all_types_equal::value> { }; // simd_cast_interleaved_argument_order (declarations) {{{2 /*! \internal The need for simd_cast_interleaved_argument_order stems from a shortcoming in pack expansion of variadic templates in C++. For a simd_cast with SimdArray arguments that are bisectable (i.e. \c storage_type0 and \c storage_type1 are equal) the generic implementation needs to forward to a simd_cast of the \c internal_data0 and \c internal_data1 of the arguments. But the required order of arguments is `internal_data0(arg0), internal_data1(arg0), internal_data0(arg1), ...`. This is impossible to achieve with pack expansion. It is only possible to write `internal_data0(args)..., internal_data1(args)...` and thus have the argument order mixed up. The simd_cast_interleaved_argument_order “simply” calls simd_cast with the arguments correctly reordered (i.e. interleaved). The implementation of simd_cast_interleaved_argument_order is done generically, so that it supports any number of arguments. The central idea of the implementation is an `extract` function which returns one value of an argument pack determined via an index passed as template argument. This index is generated via an index_sequence. The `extract` function uses two argument packs (of equal size) to easily return values from the front and middle of the argument pack (for doing the deinterleave). */ template Vc_INTRINSIC Vc_CONST Return simd_cast_interleaved_argument_order(const Ts &... a, const Ts &... b); // simd_cast_with_offset (declarations and one impl) {{{2 // offset == 0 {{{3 template Vc_INTRINSIC Vc_CONST enable_if<(are_all_types_equal::value && offset == 0), Return> simd_cast_with_offset(const From &x, const Froms &... 
xs); // offset > 0 && offset divisible by Return::Size {{{3 template Vc_INTRINSIC Vc_CONST enable_if<(From::Size > offset && offset > 0 && offset % Return::Size == 0), Return> simd_cast_with_offset(const From &x); // offset > 0 && offset NOT divisible && Return is non-atomic simd(mask)array {{{3 template Vc_INTRINSIC Vc_CONST enable_if<(From::Size > offset && offset > 0 && offset % Return::Size != 0 && ((Traits::isSimdArray::value && !Traits::isAtomicSimdArray::value) || (Traits::isSimdMaskArray::value && !Traits::isAtomicSimdMaskArray::value))), Return> simd_cast_with_offset(const From &x); // offset > 0 && offset NOT divisible && Return is atomic simd(mask)array {{{3 template Vc_INTRINSIC Vc_CONST enable_if<(From::Size > offset && offset > 0 && offset % Return::Size != 0 && ((Traits::isSimdArray::value && Traits::isAtomicSimdArray::value) || (Traits::isSimdMaskArray::value && Traits::isAtomicSimdMaskArray::value))), Return> simd_cast_with_offset(const From &x); // offset > first argument (drops first arg) {{{3 template Vc_INTRINSIC Vc_CONST enable_if< (are_all_types_equal::value && From::Size <= offset), Return> simd_cast_with_offset(const From &, const Froms &... xs) { return simd_cast_with_offset(xs...); } // offset > first and only argument (returns Zero) {{{3 template Vc_INTRINSIC Vc_CONST enable_if<(From::Size <= offset), Return> simd_cast_with_offset( const From &) { return Return::Zero(); } // first_type_of {{{2 template struct first_type_of_impl { using type = T; }; template using first_type_of = typename first_type_of_impl::type; // simd_cast_drop_arguments (declarations) {{{2 template Vc_INTRINSIC Vc_CONST Return simd_cast_drop_arguments(From x); template Vc_INTRINSIC Vc_CONST enable_if<(are_all_types_equal::value && sizeof...(Froms) * first_type_of::Size < Return::Size), Return> simd_cast_drop_arguments(Froms... xs, first_type_of x); // The following function can be implemented without the sizeof...(From) overload. // However, ICC has a bug (Premier Issue #6000116338) which leads to an ICE. Splitting the // function in two works around the issue. template Vc_INTRINSIC Vc_CONST enable_if< (are_all_types_equal::value && (1 + sizeof...(Froms)) * From::Size >= Return::Size && sizeof...(Froms) != 0), Return> simd_cast_drop_arguments(Froms... xs, From x, From); template Vc_INTRINSIC Vc_CONST enable_if<(are_all_types_equal::value && From::Size >= Return::Size), Return> simd_cast_drop_arguments(From x, From); namespace { #ifdef Vc_DEBUG_SIMD_CAST void debugDoNothing(const std::initializer_list &) {} template inline void vc_debug_(const char *prefix, const char *suffix, const T0 &arg0, const Ts &... args) { std::cerr << prefix << arg0; debugDoNothing({&(std::cerr << ", " << args)...}); std::cerr << suffix; } #else template Vc_INTRINSIC void vc_debug_(const char *, const char *, const T0 &, const Ts &...) { } #endif } // unnamed namespace // is_less trait{{{2 template struct is_less : public std::integral_constant { }; // is_power_of_2 trait{{{2 template struct is_power_of_2 : public std::integral_constant { }; // simd_cast(xs...) to SimdArray/-mask {{{2 #define Vc_SIMDARRAY_CASTS(SimdArrayType_, NativeType_) \ template \ Vc_INTRINSIC Vc_CONST enable_if< \ (Traits::isAtomic##SimdArrayType_::value && \ is_less::Size * sizeof...(Froms), Return::Size>::value && \ are_all_types_equal, Froms...>::value), \ Return> \ simd_cast(NativeType_ x, Froms... 
xs) \ { \ vc_debug_("simd_cast{1}(", ")\n", x, xs...); \ return {simd_cast(x, xs...)}; \ } \ template \ Vc_INTRINSIC Vc_CONST enable_if< \ (Traits::isAtomic##SimdArrayType_::value && \ !is_less::Size * sizeof...(Froms), Return::Size>::value && \ are_all_types_equal, Froms...>::value), \ Return> \ simd_cast(NativeType_ x, Froms... xs) \ { \ vc_debug_("simd_cast{2}(", ")\n", x, xs...); \ return {simd_cast_without_last, Froms...>(x, xs...)}; \ } \ template \ Vc_INTRINSIC Vc_CONST \ enable_if<(Traits::is##SimdArrayType_::value && \ !Traits::isAtomic##SimdArrayType_::value && \ is_less(), \ NativeType_::Size *(1 + sizeof...(Froms))>::value && \ are_all_types_equal, Froms...>::value), \ Return> \ simd_cast(NativeType_ x, Froms... xs) \ { \ vc_debug_("simd_cast{3}(", ")\n", x, xs...); \ using R0 = typename Return::storage_type0; \ using R1 = typename Return::storage_type1; \ return {simd_cast_drop_arguments(x, xs...), \ simd_cast_with_offset(x, xs...)}; \ } \ template \ Vc_INTRINSIC Vc_CONST \ enable_if<(Traits::is##SimdArrayType_::value && \ !Traits::isAtomic##SimdArrayType_::value && \ !is_less(), \ NativeType_::Size *(1 + sizeof...(Froms))>::value && \ are_all_types_equal, Froms...>::value), \ Return> \ simd_cast(NativeType_ x, Froms... xs) \ { \ vc_debug_("simd_cast{4}(", ")\n", x, xs...); \ using R0 = typename Return::storage_type0; \ using R1 = typename Return::storage_type1; \ return {simd_cast(x, xs...), R1::Zero()}; \ } \ Vc_NOTHING_EXPECTING_SEMICOLON Vc_SIMDARRAY_CASTS(SimdArray, Vc::Vector); Vc_SIMDARRAY_CASTS(SimdMaskArray, Vc::Mask); #undef Vc_SIMDARRAY_CASTS // simd_cast(V) {{{2 #define Vc_SIMDARRAY_CASTS(SimdArrayType_, NativeType_) \ /* SIMD Vector/Mask to atomic SimdArray/simdmaskarray */ \ template \ Vc_INTRINSIC Vc_CONST \ enable_if::value, Return> \ simd_cast(NativeType_ x Vc_DUMMY_ARG0) \ { \ vc_debug_("simd_cast{offset, atomic}(", ")\n", offset, x); \ return {simd_cast(x)}; \ } \ /* both halves of Return array are extracted from argument */ \ template \ Vc_INTRINSIC Vc_CONST \ enable_if<(Traits::is##SimdArrayType_::value && \ !Traits::isAtomic##SimdArrayType_::value && \ Return::Size * offset + Common::left_size() < \ NativeType_::Size), \ Return> \ simd_cast(NativeType_ x Vc_DUMMY_ARG1) \ { \ vc_debug_("simd_cast{offset, split Return}(", ")\n", offset, x); \ using R0 = typename Return::storage_type0; \ constexpr int entries_offset = offset * Return::Size; \ constexpr int entries_offset_right = entries_offset + R0::Size; \ return { \ simd_cast_with_offset(x), \ simd_cast_with_offset( \ x)}; \ } \ /* SIMD Vector/Mask to non-atomic SimdArray/simdmaskarray */ \ /* right half of Return array is zero */ \ template \ Vc_INTRINSIC Vc_CONST \ enable_if<(Traits::is##SimdArrayType_::value && \ !Traits::isAtomic##SimdArrayType_::value && \ Return::Size * offset + Common::left_size() >= \ NativeType_::Size), \ Return> \ simd_cast(NativeType_ x Vc_DUMMY_ARG2) \ { \ vc_debug_("simd_cast{offset, R1::Zero}(", ")\n", offset, x); \ using R0 = typename Return::storage_type0; \ using R1 = typename Return::storage_type1; \ constexpr int entries_offset = offset * Return::Size; \ return {simd_cast_with_offset(x), R1::Zero()}; \ } \ Vc_NOTHING_EXPECTING_SEMICOLON Vc_SIMDARRAY_CASTS(SimdArray, Vc::Vector); Vc_SIMDARRAY_CASTS(SimdMaskArray, Vc::Mask); #undef Vc_SIMDARRAY_CASTS // simd_cast(xs...) 
from SimdArray/-mask {{{2 #define Vc_SIMDARRAY_CASTS(SimdArrayType_) \ /* indivisible SimdArrayType_ */ \ template \ Vc_INTRINSIC Vc_CONST \ enable_if<(are_all_types_equal, From...>::value && \ (sizeof...(From) == 0 || N * sizeof...(From) < Return::Size) && \ !std::is_same>::value), \ Return> \ simd_cast(const SimdArrayType_ &x0, const From &... xs) \ { \ vc_debug_("simd_cast{indivisible}(", ")\n", x0, xs...); \ return simd_cast(internal_data(x0), internal_data(xs)...); \ } \ /* indivisible SimdArrayType_ && can drop arguments from the end */ \ template \ Vc_INTRINSIC Vc_CONST \ enable_if<(are_all_types_equal, From...>::value && \ (sizeof...(From) > 0 && (N * sizeof...(From) >= Return::Size)) && \ !std::is_same>::value), \ Return> \ simd_cast(const SimdArrayType_ &x0, const From &... xs) \ { \ vc_debug_("simd_cast{indivisible2}(", ")\n", x0, xs...); \ return simd_cast_without_last::storage_type, \ typename From::storage_type...>( \ internal_data(x0), internal_data(xs)...); \ } \ /* bisectable SimdArrayType_ (N = 2^n) && never too large */ \ template \ Vc_INTRINSIC Vc_CONST enable_if< \ (N != M && are_all_types_equal, From...>::value && \ !std::is_same>::value && \ is_less::value && is_power_of_2::value), \ Return> \ simd_cast(const SimdArrayType_ &x0, const From &... xs) \ { \ vc_debug_("simd_cast{bisectable}(", ")\n", x0, xs...); \ return simd_cast_interleaved_argument_order< \ Return, typename SimdArrayType_::storage_type0, \ typename From::storage_type0...>(internal_data0(x0), internal_data0(xs)..., \ internal_data1(x0), internal_data1(xs)...); \ } \ /* bisectable SimdArrayType_ (N = 2^n) && input so large that at least the last \ * input can be dropped */ \ template \ Vc_INTRINSIC Vc_CONST enable_if< \ (N != M && are_all_types_equal, From...>::value && \ !is_less::value && is_power_of_2::value), \ Return> \ simd_cast(const SimdArrayType_ &x0, const From &... xs) \ { \ vc_debug_("simd_cast{bisectable2}(", ")\n", x0, xs...); \ return simd_cast_without_last, From...>( \ x0, xs...); \ } \ /* remaining SimdArrayType_ input never larger (N != 2^n) */ \ template \ Vc_INTRINSIC Vc_CONST enable_if< \ (N != M && are_all_types_equal, From...>::value && \ N * (1 + sizeof...(From)) <= Return::Size && !is_power_of_2::value), \ Return> \ simd_cast(const SimdArrayType_ &x0, const From &... xs) \ { \ vc_debug_("simd_cast{remaining}(", ")\n", x0, xs...); \ return simd_cast_impl_smaller_input, \ From...>(x0, xs...); \ } \ /* remaining SimdArrayType_ input larger (N != 2^n) */ \ template \ Vc_INTRINSIC Vc_CONST enable_if< \ (N != M && are_all_types_equal, From...>::value && \ N * (1 + sizeof...(From)) > Return::Size && !is_power_of_2::value), \ Return> \ simd_cast(const SimdArrayType_ &x0, const From &... 
xs) \ { \ vc_debug_("simd_cast{remaining2}(", ")\n", x0, xs...); \ return simd_cast_impl_larger_input, \ From...>(x0, xs...); \ } \ /* a single bisectable SimdArrayType_ (N = 2^n) too large */ \ template \ Vc_INTRINSIC Vc_CONST \ enable_if<(N != M && N >= 2 * Return::Size && is_power_of_2::value), Return> \ simd_cast(const SimdArrayType_ &x) \ { \ vc_debug_("simd_cast{single bisectable}(", ")\n", x); \ return simd_cast(internal_data0(x)); \ } \ template \ Vc_INTRINSIC Vc_CONST enable_if<(N != M && N > Return::Size && \ N < 2 * Return::Size && is_power_of_2::value), \ Return> \ simd_cast(const SimdArrayType_ &x) \ { \ vc_debug_("simd_cast{single bisectable2}(", ")\n", x); \ return simd_cast(internal_data0(x), internal_data1(x)); \ } \ Vc_NOTHING_EXPECTING_SEMICOLON Vc_SIMDARRAY_CASTS(SimdArray); Vc_SIMDARRAY_CASTS(SimdMaskArray); #undef Vc_SIMDARRAY_CASTS // simd_cast(SimdArray/-mask) {{{2 #define Vc_SIMDARRAY_CASTS(SimdArrayType_) \ /* offset == 0 is like without offset */ \ template \ Vc_INTRINSIC Vc_CONST enable_if<(offset == 0), Return> simd_cast( \ const SimdArrayType_ &x Vc_DUMMY_ARG0) \ { \ vc_debug_("simd_cast{offset == 0}(", ")\n", offset, x); \ return simd_cast(x); \ } \ /* forward to V */ \ template \ Vc_INTRINSIC Vc_CONST enable_if<(offset != 0), Return> simd_cast( \ const SimdArrayType_ &x Vc_DUMMY_ARG1) \ { \ vc_debug_("simd_cast{offset, forward}(", ")\n", offset, x); \ return simd_cast(internal_data(x)); \ } \ /* convert from right member of SimdArray */ \ template \ Vc_INTRINSIC Vc_CONST \ enable_if<(N != M && offset * Return::Size >= Common::left_size() && \ offset != 0 && Common::left_size() % Return::Size == 0), \ Return> \ simd_cast(const SimdArrayType_ &x Vc_DUMMY_ARG2) \ { \ vc_debug_("simd_cast{offset, right}(", ")\n", offset, x); \ return simd_cast() / Return::Size>( \ internal_data1(x)); \ } \ /* same as above except for odd cases where offset * Return::Size doesn't fit the \ * left side of the SimdArray */ \ template \ Vc_INTRINSIC Vc_CONST \ enable_if<(N != M && offset * Return::Size >= Common::left_size() && \ offset != 0 && Common::left_size() % Return::Size != 0), \ Return> \ simd_cast(const SimdArrayType_ &x Vc_DUMMY_ARG3) \ { \ vc_debug_("simd_cast{offset, right, nofit}(", ")\n", offset, x); \ return simd_cast_with_offset()>( \ internal_data1(x)); \ } \ /* convert from left member of SimdArray */ \ template \ Vc_INTRINSIC Vc_CONST enable_if< \ (N != M && /*offset * Return::Size < Common::left_size() &&*/ \ offset != 0 && (offset + 1) * Return::Size <= Common::left_size()), \ Return> \ simd_cast(const SimdArrayType_ &x Vc_DUMMY_ARG4) \ { \ vc_debug_("simd_cast{offset, left}(", ")\n", offset, x); \ return simd_cast(internal_data0(x)); \ } \ /* fallback to copying scalars */ \ template \ Vc_INTRINSIC Vc_CONST \ enable_if<(N != M && (offset * Return::Size < Common::left_size()) && \ offset != 0 && (offset + 1) * Return::Size > Common::left_size()), \ Return> \ simd_cast(const SimdArrayType_ &x Vc_DUMMY_ARG5) \ { \ vc_debug_("simd_cast{offset, copy scalars}(", ")\n", offset, x); \ using R = typename Return::EntryType; \ Return r = Return::Zero(); \ for (std::size_t i = offset * Return::Size; \ i < std::min(N, (offset + 1) * Return::Size); ++i) { \ r[i - offset * Return::Size] = static_cast(x[i]); \ } \ return r; \ } \ Vc_NOTHING_EXPECTING_SEMICOLON Vc_SIMDARRAY_CASTS(SimdArray); Vc_SIMDARRAY_CASTS(SimdMaskArray); #undef Vc_SIMDARRAY_CASTS // simd_cast_drop_arguments (definitions) {{{2 template Vc_INTRINSIC Vc_CONST Return simd_cast_drop_arguments(From x) { return 
simd_cast(x); } template Vc_INTRINSIC Vc_CONST enable_if<(are_all_types_equal::value && sizeof...(Froms) * first_type_of::Size < Return::Size), Return> simd_cast_drop_arguments(Froms... xs, first_type_of x) { return simd_cast(xs..., x); } // The following function can be implemented without the sizeof...(From) overload. // However, ICC has a bug (Premier Issue #6000116338) which leads to an ICE. Splitting the // function in two works around the issue. template Vc_INTRINSIC Vc_CONST enable_if< (are_all_types_equal::value && (1 + sizeof...(Froms)) * From::Size >= Return::Size && sizeof...(Froms) != 0), Return> simd_cast_drop_arguments(Froms... xs, From x, From) { return simd_cast_drop_arguments(xs..., x); } template Vc_INTRINSIC Vc_CONST enable_if<(are_all_types_equal::value && From::Size >= Return::Size), Return> simd_cast_drop_arguments(From x, From) { return simd_cast_drop_arguments(x); } // simd_cast_with_offset (definitions) {{{2 template Vc_INTRINSIC Vc_CONST enable_if<(From::Size > offset && offset > 0 && offset % Return::Size == 0), Return> simd_cast_with_offset(const From &x) { return simd_cast(x); } template Vc_INTRINSIC Vc_CONST enable_if<(From::Size > offset && offset > 0 && offset % Return::Size != 0 && ((Traits::isSimdArray::value && !Traits::isAtomicSimdArray::value) || (Traits::isSimdMaskArray::value && !Traits::isAtomicSimdMaskArray::value))), Return> simd_cast_with_offset(const From &x) { using R0 = typename Return::storage_type0; using R1 = typename Return::storage_type1; return {simd_cast_with_offset(x), simd_cast_with_offset(x)}; } template Vc_INTRINSIC Vc_CONST enable_if<(From::Size > offset && offset > 0 && offset % Return::Size != 0 && ((Traits::isSimdArray::value && Traits::isAtomicSimdArray::value) || (Traits::isSimdMaskArray::value && Traits::isAtomicSimdMaskArray::value))), Return> simd_cast_with_offset(const From &x) { return simd_cast(x.shifted(offset % Return::Size)); } template Vc_INTRINSIC Vc_CONST enable_if<(are_all_types_equal::value && offset == 0), Return> simd_cast_with_offset(const From &x, const Froms &... xs) { return simd_cast(x, xs...); } // simd_cast_without_last (definition) {{{2 template Vc_INTRINSIC Vc_CONST Return simd_cast_without_last(const From &... xs, const T &) { return simd_cast(xs...); } // simd_cast_interleaved_argument_order (definitions) {{{2 #ifdef Vc_MSVC // MSVC doesn't see that the Ts pack below can be empty and thus complains when extract_interleaved // is called with only 2 arguments. These overloads here are *INCORRECT standard C++*, but they make // MSVC do the right thing. template Vc_INTRINSIC Vc_CONST enable_if<(I == 0), T0> extract_interleaved(const T0 &a0, const T0 &) { return a0; } template Vc_INTRINSIC Vc_CONST enable_if<(I == 1), T0> extract_interleaved(const T0 &, const T0 &b0) { return b0; } #endif // Vc_MSVC /// \internal returns the first argument template Vc_INTRINSIC Vc_CONST enable_if<(I == 0), T0> extract_interleaved(const T0 &a0, const Ts &..., const T0 &, const Ts &...) { return a0; } /// \internal returns the center argument template Vc_INTRINSIC Vc_CONST enable_if<(I == 1), T0> extract_interleaved(const T0 &, const Ts &..., const T0 &b0, const Ts &...) { return b0; } /// \internal drops the first and center arguments and recurses template Vc_INTRINSIC Vc_CONST enable_if<(I > 1), T0> extract_interleaved(const T0 &, const Ts &... a, const T0 &, const Ts &... 
b) { return extract_interleaved(a..., b...); } /// \internal calls simd_cast with correct argument order thanks to extract_interleaved template Vc_INTRINSIC Vc_CONST Return simd_cast_interleaved_argument_order_1(index_sequence, const Ts &... a, const Ts &... b) { return simd_cast(extract_interleaved(a..., b...)...); } /// \internal constructs the necessary index_sequence to pass it to /// simd_cast_interleaved_argument_order_1 template Vc_INTRINSIC Vc_CONST Return simd_cast_interleaved_argument_order(const Ts &... a, const Ts &... b) { using seq = make_index_sequence; return simd_cast_interleaved_argument_order_1(seq(), a..., b...); } // conditional_assign {{{1 #define Vc_CONDITIONAL_ASSIGN(name_, op_) \ template \ Vc_INTRINSIC enable_if conditional_assign( \ SimdArray &lhs, M &&mask, U &&rhs) \ { \ lhs(mask) op_ rhs; \ } \ Vc_NOTHING_EXPECTING_SEMICOLON Vc_CONDITIONAL_ASSIGN( Assign, =); Vc_CONDITIONAL_ASSIGN( PlusAssign, +=); Vc_CONDITIONAL_ASSIGN( MinusAssign, -=); Vc_CONDITIONAL_ASSIGN( MultiplyAssign, *=); Vc_CONDITIONAL_ASSIGN( DivideAssign, /=); Vc_CONDITIONAL_ASSIGN( RemainderAssign, %=); Vc_CONDITIONAL_ASSIGN( XorAssign, ^=); Vc_CONDITIONAL_ASSIGN( AndAssign, &=); Vc_CONDITIONAL_ASSIGN( OrAssign, |=); Vc_CONDITIONAL_ASSIGN( LeftShiftAssign,<<=); Vc_CONDITIONAL_ASSIGN(RightShiftAssign,>>=); #undef Vc_CONDITIONAL_ASSIGN #define Vc_CONDITIONAL_ASSIGN(name_, expr_) \ template \ Vc_INTRINSIC enable_if> \ conditional_assign(SimdArray &lhs, M &&mask) \ { \ return expr_; \ } \ Vc_NOTHING_EXPECTING_SEMICOLON Vc_CONDITIONAL_ASSIGN(PostIncrement, lhs(mask)++); Vc_CONDITIONAL_ASSIGN( PreIncrement, ++lhs(mask)); Vc_CONDITIONAL_ASSIGN(PostDecrement, lhs(mask)--); Vc_CONDITIONAL_ASSIGN( PreDecrement, --lhs(mask)); #undef Vc_CONDITIONAL_ASSIGN // transpose_impl {{{1 namespace Common { template inline void transpose_impl( TransposeTag<4, 4>, SimdArray *Vc_RESTRICT r[], const TransposeProxy, SimdArray, SimdArray, SimdArray> &proxy) { V *Vc_RESTRICT r2[4] = {&internal_data(*r[0]), &internal_data(*r[1]), &internal_data(*r[2]), &internal_data(*r[3])}; transpose_impl(TransposeTag<4, 4>(), &r2[0], TransposeProxy{internal_data(std::get<0>(proxy.in)), internal_data(std::get<1>(proxy.in)), internal_data(std::get<2>(proxy.in)), internal_data(std::get<3>(proxy.in))}); } template inline void transpose_impl( TransposeTag<2, 4>, SimdArray *Vc_RESTRICT r[], const TransposeProxy, SimdArray, SimdArray, SimdArray> &proxy) { auto &lo = *r[0]; auto &hi = *r[1]; internal_data0(internal_data0(lo)) = internal_data0(std::get<0>(proxy.in)); internal_data1(internal_data0(lo)) = internal_data0(std::get<1>(proxy.in)); internal_data0(internal_data1(lo)) = internal_data0(std::get<2>(proxy.in)); internal_data1(internal_data1(lo)) = internal_data0(std::get<3>(proxy.in)); internal_data0(internal_data0(hi)) = internal_data1(std::get<0>(proxy.in)); internal_data1(internal_data0(hi)) = internal_data1(std::get<1>(proxy.in)); internal_data0(internal_data1(hi)) = internal_data1(std::get<2>(proxy.in)); internal_data1(internal_data1(hi)) = internal_data1(std::get<3>(proxy.in)); } template inline void transpose_impl( TransposeTag<4, 4>, SimdArray *Vc_RESTRICT r[], const TransposeProxy, SimdArray, SimdArray, SimdArray> &proxy) { V *Vc_RESTRICT r2[4] = {&internal_data(*r[0]), &internal_data(*r[1]), &internal_data(*r[2]), &internal_data(*r[3])}; transpose_impl(TransposeTag<4, 4>(), &r2[0], TransposeProxy{internal_data(std::get<0>(proxy.in)), internal_data(std::get<1>(proxy.in)), internal_data(std::get<2>(proxy.in)), 
internal_data(std::get<3>(proxy.in))}); } template inline void transpose_impl( TransposeTag<4, 4>, SimdArray *Vc_RESTRICT r[], const TransposeProxy, SimdArray, SimdArray, SimdArray> &proxy) { SimdArray *Vc_RESTRICT r0[4 / 2] = {r[0], r[1]}; SimdArray *Vc_RESTRICT r1[4 / 2] = {r[2], r[3]}; using H = SimdArray; transpose_impl(TransposeTag<2, 4>(), &r0[0], TransposeProxy{internal_data0(std::get<0>(proxy.in)), internal_data0(std::get<1>(proxy.in)), internal_data0(std::get<2>(proxy.in)), internal_data0(std::get<3>(proxy.in))}); transpose_impl(TransposeTag<2, 4>(), &r1[0], TransposeProxy{internal_data1(std::get<0>(proxy.in)), internal_data1(std::get<1>(proxy.in)), internal_data1(std::get<2>(proxy.in)), internal_data1(std::get<3>(proxy.in))}); } /* TODO: template inline enable_if<(N > VSize), void> transpose_impl( std::array * Vc_RESTRICT, 4> & r, const TransposeProxy, SimdArray, SimdArray, SimdArray> &proxy) { typedef SimdArray SA; std::array r0 = { {&internal_data0(*r[0]), &internal_data0(*r[1]), &internal_data0(*r[2]), &internal_data0(*r[3])}}; transpose_impl( r0, TransposeProxy{ internal_data0(std::get<0>(proxy.in)), internal_data0(std::get<1>(proxy.in)), internal_data0(std::get<2>(proxy.in)), internal_data0(std::get<3>(proxy.in))}); std::array r1 = { {&internal_data1(*r[0]), &internal_data1(*r[1]), &internal_data1(*r[2]), &internal_data1(*r[3])}}; transpose_impl( r1, TransposeProxy{ internal_data1(std::get<0>(proxy.in)), internal_data1(std::get<1>(proxy.in)), internal_data1(std::get<2>(proxy.in)), internal_data1(std::get<3>(proxy.in))}); } */ } // namespace Common // Traits static assertions {{{1 static_assert(Traits::has_no_allocated_data &>::value, ""); static_assert(Traits::has_no_allocated_data>::value, ""); static_assert(Traits::has_no_allocated_data &>::value, ""); static_assert(Traits::has_no_allocated_data>::value, ""); static_assert(Traits::has_no_allocated_data &>::value, ""); static_assert(Traits::has_no_allocated_data>::value, ""); static_assert(Traits::has_no_allocated_data>::value, ""); static_assert(Traits::has_no_allocated_data &&>::value, ""); // }}}1 /// @} } // namespace Vc_VERSIONED_NAMESPACE // numeric_limits {{{1 namespace std { template struct numeric_limits> : public numeric_limits { private: using R = Vc::SimdArray; public: static Vc_ALWAYS_INLINE Vc_CONST R max() noexcept { return numeric_limits::max(); } static Vc_ALWAYS_INLINE Vc_CONST R min() noexcept { return numeric_limits::min(); } static Vc_ALWAYS_INLINE Vc_CONST R lowest() noexcept { return numeric_limits::lowest(); } static Vc_ALWAYS_INLINE Vc_CONST R epsilon() noexcept { return numeric_limits::epsilon(); } static Vc_ALWAYS_INLINE Vc_CONST R round_error() noexcept { return numeric_limits::round_error(); } static Vc_ALWAYS_INLINE Vc_CONST R infinity() noexcept { return numeric_limits::infinity(); } static Vc_ALWAYS_INLINE Vc_CONST R quiet_NaN() noexcept { return numeric_limits::quiet_NaN(); } static Vc_ALWAYS_INLINE Vc_CONST R signaling_NaN() noexcept { return numeric_limits::signaling_NaN(); } static Vc_ALWAYS_INLINE Vc_CONST R denorm_min() noexcept { return numeric_limits::denorm_min(); } }; } // namespace std //}}}1 #endif // VC_COMMON_SIMDARRAY_H_ // vim: foldmethod=marker Vc-1.3.3/common/simdarrayfwd.h000066400000000000000000000206571320703111200162230ustar00rootroot00000000000000/* This file is part of the Vc library. 
{{{ Copyright © 2014-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ #ifndef VC_COMMON_SIMDARRAYFWD_H_ #define VC_COMMON_SIMDARRAYFWD_H_ #include "../scalar/types.h" #include "../sse/types.h" #include "../avx/types.h" #include "../mic/types.h" #include "utility.h" #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { namespace Common { /// \addtogroup SimdArray /// @{ /*select_best_vector_type{{{*/ /** * \internal * Selects the best SIMD type out of a typelist to store N scalar values. */ template struct select_best_vector_type_impl; template struct select_best_vector_type_impl { using type = T; }; template struct select_best_vector_type_impl { using type = typename std::conditional< (N < T::Size), typename select_best_vector_type_impl::type, T>::type; }; template using select_best_vector_type = typename select_best_vector_type_impl, Vc::SSE::Vector, Vc::Scalar::Vector #elif defined(Vc_IMPL_AVX) Vc::AVX::Vector, Vc::SSE::Vector, Vc::Scalar::Vector #elif defined(Vc_IMPL_Scalar) Vc::Scalar::Vector #elif defined(Vc_IMPL_SSE) Vc::SSE::Vector, Vc::Scalar::Vector #elif defined(Vc_IMPL_MIC) Vc::MIC::Vector, Vc::Scalar::Vector #endif >::type; //}}} /// @} } // namespace Common // === having SimdArray in the Vc namespace leads to a ABI bug === // // SimdArray can be { double[4] }, { __m128d[2] }, or { __m256d } even though the type // is the same. // The question is, what should SimdArray focus on? // a) A type that makes interfacing between different implementations possible? // b) Or a type that makes fixed size SIMD easier and efficient? // // a) can be achieved by using a union with T[N] as one member. But this may have more serious // performance implications than only less efficient parameter passing (because compilers have a // much harder time wrt. aliasing issues). Also alignment would need to be set to the sizeof in // order to be compatible with targets with larger alignment requirements. // But, the in-memory representation of masks is not portable. Thus, at the latest with AVX-512, // there would be a problem with requiring SimdMaskArray to be an ABI compatible type. 
// AVX-512 uses one bit per boolean, whereas SSE/AVX use sizeof(T) Bytes per boolean. Conversion // between the two representations is not a trivial operation. Therefore choosing one or the other // representation will have a considerable impact for the targets that do not use this // representation. Since the future probably belongs to one bit per boolean representation, I would // go with that choice. // // b) requires that SimdArray != SimdArray if // SimdArray::vector_type != SimdArray::vector_type // // Therefore use SimdArray, where V follows from the above. template , size_t Wt = V::Size // this last parameter is only used for specialization of N // == VectorSize > class SimdArray; template , size_t Wt = V::Size // this last parameter is only used for specialization of N // == VectorSize > class SimdMaskArray; /** \internal * Simple traits for SimdArray to easily access internal types of non-atomic SimdArray * types. */ template struct SimdArrayTraits { static constexpr std::size_t N0 = Common::left_size(); static constexpr std::size_t N1 = Common::right_size(); using storage_type0 = SimdArray; using storage_type1 = SimdArray; }; template Vc_INTRINSIC_L typename SimdArrayTraits::storage_type0 &internal_data0( SimdArray &x) Vc_INTRINSIC_R; template Vc_INTRINSIC_L typename SimdArrayTraits::storage_type1 &internal_data1( SimdArray &x) Vc_INTRINSIC_R; template Vc_INTRINSIC_L const typename SimdArrayTraits::storage_type0 &internal_data0( const SimdArray &x) Vc_INTRINSIC_R; template Vc_INTRINSIC_L const typename SimdArrayTraits::storage_type1 &internal_data1( const SimdArray &x) Vc_INTRINSIC_R; template Vc_INTRINSIC_L V &internal_data(SimdArray &x) Vc_INTRINSIC_R; template Vc_INTRINSIC_L const V &internal_data(const SimdArray &x) Vc_INTRINSIC_R; namespace Traits { template struct is_atomic_simdarray_internal> : public std::true_type {}; template struct is_atomic_simd_mask_array_internal> : public std::true_type {}; template struct is_simdarray_internal> : public std::true_type {}; template struct is_simd_mask_array_internal> : public std::true_type {}; template struct is_integral_internal , false> : public std::is_integral {}; template struct is_floating_point_internal, false> : public std::is_floating_point {}; template struct is_signed_internal , false> : public std::is_signed {}; template struct is_unsigned_internal , false> : public std::is_unsigned {}; template struct has_no_allocated_data_impl> : public std::true_type {}; } // namespace Traits } // namespace Vc #endif // VC_COMMON_SIMDARRAYFWD_H_ // vim: foldmethod=marker Vc-1.3.3/common/simdarrayhelper.h000066400000000000000000000533141320703111200167160ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2013-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ #ifndef VC_COMMON_SIMDARRAYHELPER_H_ #define VC_COMMON_SIMDARRAYHELPER_H_ #include "macros.h" namespace Vc_VERSIONED_NAMESPACE { namespace Common { /// \addtogroup SimdArray /// @{ namespace Operations/*{{{*/ { struct tag {}; #define Vc_DEFINE_OPERATION(name_) \ struct name_ : public tag { \ template \ Vc_INTRINSIC void operator()(V &v, Args &&... args) \ { \ v.name_(std::forward(args)...); \ } \ } Vc_DEFINE_OPERATION(gather); Vc_DEFINE_OPERATION(scatter); Vc_DEFINE_OPERATION(load); Vc_DEFINE_OPERATION(store); Vc_DEFINE_OPERATION(setZero); Vc_DEFINE_OPERATION(setZeroInverted); Vc_DEFINE_OPERATION(assign); #undef Vc_DEFINE_OPERATION #define Vc_DEFINE_OPERATION(name_, code_) \ struct name_ : public tag { \ template Vc_INTRINSIC void operator()(V &v) { code_; } \ } Vc_DEFINE_OPERATION(increment, ++(v)); Vc_DEFINE_OPERATION(decrement, --(v)); Vc_DEFINE_OPERATION(random, v = V::Random()); #undef Vc_DEFINE_OPERATION #define Vc_DEFINE_OPERATION_FORWARD(name_) \ struct Forward_##name_ : public tag \ { \ template ()...))> \ Vc_INTRINSIC void operator()(decltype(name_(std::declval()...)) &v, \ Args &&... args) \ { \ v = name_(std::forward(args)...); \ } \ template ()...))> \ Vc_INTRINSIC void operator()(std::nullptr_t, Args && ... args) \ { \ name_(std::forward(args)...); \ } \ } Vc_DEFINE_OPERATION_FORWARD(abs); Vc_DEFINE_OPERATION_FORWARD(asin); Vc_DEFINE_OPERATION_FORWARD(atan); Vc_DEFINE_OPERATION_FORWARD(atan2); Vc_DEFINE_OPERATION_FORWARD(cos); Vc_DEFINE_OPERATION_FORWARD(ceil); Vc_DEFINE_OPERATION_FORWARD(copysign); Vc_DEFINE_OPERATION_FORWARD(exp); Vc_DEFINE_OPERATION_FORWARD(exponent); Vc_DEFINE_OPERATION_FORWARD(fma); Vc_DEFINE_OPERATION_FORWARD(floor); Vc_DEFINE_OPERATION_FORWARD(frexp); Vc_DEFINE_OPERATION_FORWARD(isfinite); Vc_DEFINE_OPERATION_FORWARD(isinf); Vc_DEFINE_OPERATION_FORWARD(isnan); Vc_DEFINE_OPERATION_FORWARD(isnegative); Vc_DEFINE_OPERATION_FORWARD(ldexp); Vc_DEFINE_OPERATION_FORWARD(log); Vc_DEFINE_OPERATION_FORWARD(log10); Vc_DEFINE_OPERATION_FORWARD(log2); Vc_DEFINE_OPERATION_FORWARD(reciprocal); Vc_DEFINE_OPERATION_FORWARD(round); Vc_DEFINE_OPERATION_FORWARD(rsqrt); Vc_DEFINE_OPERATION_FORWARD(sin); Vc_DEFINE_OPERATION_FORWARD(sincos); Vc_DEFINE_OPERATION_FORWARD(sqrt); Vc_DEFINE_OPERATION_FORWARD(trunc); Vc_DEFINE_OPERATION_FORWARD(min); Vc_DEFINE_OPERATION_FORWARD(max); #undef Vc_DEFINE_OPERATION_FORWARD template using is_operation = std::is_base_of; } // namespace Operations }}} /** * \internal * Helper type to statically communicate segmentation of one vector register into 2^n parts * (Pieces). */ template struct Segment/*{{{*/ { static_assert(Index_ < Pieces_, "You found a bug in Vc. 
Please report."); using type = T_; using type_decayed = typename std::decay::type; static constexpr std::size_t Pieces = Pieces_; static constexpr std::size_t Index = Index_; using simd_array_type = SimdArray< typename std::conditional::value, typename type_decayed::EntryType, float>::type, type_decayed::Size / Pieces>; type data; static constexpr std::size_t EntryOffset = Index * type_decayed::Size / Pieces; // no non-const operator[] needed decltype(std::declval()[0]) operator[](size_t i) const { return data[i + EntryOffset]; } simd_array_type asSimdArray() const { return simd_cast(data); } };/*}}}*/ //Segment specialization {{{ template struct Segment { static_assert(Index_ < Pieces_, "You found a bug in Vc. Please report."); using type = T_ *; using type_decayed = typename std::decay::type; static constexpr size_t Pieces = Pieces_; static constexpr size_t Index = Index_; using simd_array_type = SimdArray< typename std::conditional::value, typename type_decayed::VectorEntryType, float>::type, type_decayed::Size / Pieces> *; type data; static constexpr std::size_t EntryOffset = Index * type_decayed::size() / Pieces; simd_array_type asSimdArray() const { return reinterpret_cast< #ifdef Vc_GCC // GCC might ICE if this type is declared with may_alias. If it doesn't // ICE it warns about ignoring the attribute. typename std::remove_pointer::type #else MayAlias::type> #endif *>(data) + Index; } //decltype(std::declval()[0]) operator[](size_t i) { return data[i + EntryOffset]; } //decltype(std::declval()[0]) operator[](size_t i) const { return data[i + EntryOffset]; } };/*}}}*/ /** \internal Template class that is used to attach an offset value to an existing type. It is used for IndexesFromZero construction in SimdArray. The \c data1 constructor needs to know that the IndexesFromZero constructor requires an offset so that the whole data is constructed as a correct sequence from `0` to `Size - 1`. \tparam T The original type that needs the offset attached. \tparam Offset An integral value that determines the offset in the complete SimdArray. */ template struct AddOffset { constexpr AddOffset() = default; }; // class Split {{{1 /** \internal Helper type with static functions to generically adjust arguments for the \c data0 and \c data1 members of SimdArray and SimdMaskArray. \tparam secondOffset The offset in number of elements that \c data1 has in the SimdArray / SimdMaskArray. This is essentially equal to the number of elements in \c data0. 
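
    The following sketch (illustrative only, not part of the interface; the exact
    template arguments are elided here) shows how the \c lo / \c hi helpers below map
    one argument intended for the full SimdArray onto the argument that is passed on
    to \c data0 / \c data1:

    \code
    // x is a composite SimdArray:  lo(x) -> internal_data0(x),  hi(x) -> internal_data1(x)
    // x is an atomic SimdArray:    lo(x) -> Segment<..., 2, 0>, hi(x) -> Segment<..., 2, 1>
    // IndexesFromZero tag:         hi(...) -> AddOffset<..., secondOffset>, so that data1
    //                              continues the index sequence started in data0
    \endcode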
*/ template class Split { static Vc_INTRINSIC AddOffset hiImpl(VectorSpecialInitializerIndexesFromZero) { return {}; } template static Vc_INTRINSIC AddOffset hiImpl(AddOffset) { return {}; } // split composite SimdArray template > static Vc_INTRINSIC auto loImpl(const SimdArray &x) -> decltype(internal_data0(x)) { return internal_data0(x); } template > static Vc_INTRINSIC auto hiImpl(const SimdArray &x) -> decltype(internal_data1(x)) { return internal_data1(x); } template > static Vc_INTRINSIC auto loImpl(SimdArray *x) -> decltype(&internal_data0(*x)) { return &internal_data0(*x); } template > static Vc_INTRINSIC auto hiImpl(SimdArray *x) -> decltype(&internal_data1(*x)) { return &internal_data1(*x); } // split atomic SimdArray template static Vc_INTRINSIC Segment loImpl(const SimdArray &x) { return {internal_data(x)}; } template static Vc_INTRINSIC Segment hiImpl(const SimdArray &x) { return {internal_data(x)}; } template static Vc_INTRINSIC Segment loImpl(SimdArray *x) { return {&internal_data(*x)}; } template static Vc_INTRINSIC Segment hiImpl(SimdArray *x) { return {&internal_data(*x)}; } // split composite SimdMaskArray template static Vc_INTRINSIC auto loImpl(const SimdMaskArray &x) -> decltype(internal_data0(x)) { return internal_data0(x); } template static Vc_INTRINSIC auto hiImpl(const SimdMaskArray &x) -> decltype(internal_data1(x)) { return internal_data1(x); } template static Vc_INTRINSIC Segment::mask_type, 2, 0> loImpl( const SimdMaskArray &x) { return {internal_data(x)}; } template static Vc_INTRINSIC Segment::mask_type, 2, 1> hiImpl( const SimdMaskArray &x) { return {internal_data(x)}; } // split Vector and Mask template static constexpr bool is_vector_or_mask(){ return (Traits::is_simd_vector::value && !Traits::isSimdArray::value) || (Traits::is_simd_mask::value && !Traits::isSimdMaskArray::value); } template static Vc_INTRINSIC Segment loImpl(V &&x, enable_if()> = nullarg) { return {std::forward(x)}; } template static Vc_INTRINSIC Segment hiImpl(V &&x, enable_if()> = nullarg) { return {std::forward(x)}; } // generically split Segments template static Vc_INTRINSIC Segment loImpl( const Segment &x) { return {x.data}; } template static Vc_INTRINSIC Segment hiImpl( const Segment &x) { return {x.data}; } /** \internal * \name Checks for existence of \c loImpl / \c hiImpl */ //@{ template ()))> static std::true_type have_lo_impl(int); template static std::false_type have_lo_impl(float); template static constexpr bool have_lo_impl() { return decltype(have_lo_impl(1))::value; } template ()))> static std::true_type have_hi_impl(int); template static std::false_type have_hi_impl(float); template static constexpr bool have_hi_impl() { return decltype(have_hi_impl(1))::value; } //@} public: /** \internal * \name with Operations tag * * These functions don't overload on the data parameter. The first parameter (the tag) clearly * identifies the intended function. 
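     *
     * For example (an illustrative sketch, not additional API), the pointer overloads
     * tagged with Operations::gather / Operations::scatter merely offset the pointer
     * for the second half:
     *
     * \code
     * // lo(Operations::gather(), ptr) returns ptr                  (used for data0)
     * // hi(Operations::gather(), ptr) returns ptr + secondOffset   (used for data1)
     * \endcode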
*/ //@{ template static Vc_INTRINSIC const U *lo(Operations::gather, const U *ptr) { return ptr; } template static Vc_INTRINSIC const U *hi(Operations::gather, const U *ptr) { return ptr + secondOffset; } template ::value>> static Vc_ALWAYS_INLINE decltype(loImpl(std::declval())) lo(Operations::gather, U &&x) { return loImpl(std::forward(x)); } template ::value>> static Vc_ALWAYS_INLINE decltype(hiImpl(std::declval())) hi(Operations::gather, U &&x) { return hiImpl(std::forward(x)); } template static Vc_INTRINSIC const U *lo(Operations::scatter, const U *ptr) { return ptr; } template static Vc_INTRINSIC const U *hi(Operations::scatter, const U *ptr) { return ptr + secondOffset; } //@} /** \internal \name without Operations tag These functions are not clearly tagged as to where they are used and therefore behave differently depending on the type of the parameter. Different behavior is implemented via overloads of \c loImpl and \c hiImpl. They are not overloads of \c lo and \c hi directly because it's hard to compete against a universal reference (i.e. an overload for `int` requires overloads for `int &`, `const int &`, and `int &&`. If one of them were missing `U &&` would win in overload resolution). */ //@{ template static Vc_ALWAYS_INLINE decltype(loImpl(std::declval())) lo(U &&x) { return loImpl(std::forward(x)); } template static Vc_ALWAYS_INLINE decltype(hiImpl(std::declval())) hi(U &&x) { return hiImpl(std::forward(x)); } template static Vc_ALWAYS_INLINE enable_if(), U> lo(U &&x) { return std::forward(x); } template static Vc_ALWAYS_INLINE enable_if(), U> hi(U &&x) { return std::forward(x); } //@} }; // actual_value {{{1 template static Vc_INTRINSIC const V &actual_value(Op, const SimdArray &x) { return internal_data(x); } template static Vc_INTRINSIC V *actual_value(Op, SimdArray *x) { return &internal_data(*x); } template static Vc_INTRINSIC typename Segment::simd_array_type actual_value( Op, Segment &&seg) { return seg.asSimdArray(); } template static Vc_INTRINSIC const typename V::Mask &actual_value(Op, const SimdMaskArray &x) { return internal_data(x); } template static Vc_INTRINSIC typename V::Mask *actual_value(Op, SimdMaskArray *x) { return &internal_data(*x); } // unpackArgumentsAuto {{{1 /**\internal * \name unpackArgumentsAuto * * Search for the right amount of SimdArray "unpacking" (via actual_value) to match the * interface of the function to be called. * * The compiler can figure this out for us thanks to SFINAE. The approach is to have a * number \c I that determines the indexes of the arguments to be transformed via * actual_value. Each bit of \c I identifies an argument. unpackArgumentsAuto starts the * recursion with `I = 0`, i.e. no actual_value transformations. If the overload calling * \c op is unavailable due to a substitution failure \c I is incremented and the function * recurses. Otherwise there are two unpackArgumentsAutoImpl functions in the overload * set. The first argument (\c int / \c float) leads to a preference of the function * calling \c op, thus ending the recursion. 
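 *
 * As a sketch of the search order (illustrative only), for two arguments the bits of
 * \c I select which arguments are transformed via actual_value:
 *
 * \code
 * // I = 0b00: op(r, arg0,               arg1              )
 * // I = 0b01: op(r, actual_value(arg0), arg1              )
 * // I = 0b10: op(r, arg0,               actual_value(arg1))
 * // I = 0b11: op(r, actual_value(arg0), actual_value(arg1))
 * // The first combination for which the call to op is well-formed ends the recursion.
 * \endcode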
*/ ///@{ ///\internal transforms \p arg via actual_value template Vc_INTRINSIC decltype(actual_value(std::declval(), std::declval())) conditionalUnpack(std::true_type, Op op, Arg &&arg) { return actual_value(op, std::forward(arg)); } ///\internal forwards \p arg to its return value template Vc_INTRINSIC Arg conditionalUnpack(std::false_type, Op, Arg &&arg) { return std::forward(arg); } ///\internal true-/false_type that selects whether the argument with index B should be unpacked template struct selectorType : public std::integral_constant { }; ///\internal ends the recursion, transforms arguments, and calls \p op template Vc_INTRINSIC decltype(std::declval()(std::declval(), conditionalUnpack(selectorType(), std::declval(), std::declval())...)) unpackArgumentsAutoImpl(int, index_sequence, Op op, R &&r, Args &&... args) { op(std::forward(r), conditionalUnpack(selectorType(), op, std::forward(args))...); } ///\internal the current actual_value calls don't work: recurse to I + 1 template Vc_INTRINSIC enable_if<(I <= (size_t(1) << sizeof...(Args))), void> unpackArgumentsAutoImpl( float, index_sequence is, Op op, R &&r, Args &&... args) { // if R is nullptr_t then the return type cannot enforce that actually any unwrapping // of the SimdArray types happens. Thus, you could get an endless loop of the // SimdArray function overload calling itself, if the index goes up to (1 << // sizeof...(Args)) - 1 (which means no argument transformations via actual_value). static_assert( I < (1 << sizeof...(Args)) - (std::is_same::value ? 1 : 0), "Vc or compiler bug. Please report. Failed to find a combination of " "actual_value(arg) transformations that allows calling Op."); unpackArgumentsAutoImpl(int(), is, op, std::forward(r), std::forward(args)...); } #ifdef Vc_ICC template struct IccWorkaround { using type = void; }; template struct IccWorkaround<2, Ts...> { using type = typename std::remove_pointer>::type>::type>::type; }; #endif ///\internal The interface to start the machinery. template Vc_INTRINSIC void unpackArgumentsAuto(Op op, R &&r, Args &&... args) { #ifdef Vc_ICC // ugly hacky workaround for ICC: // The compiler fails to do SFINAE right on recursion. We have to hit the right // recursionStart number from the start. const int recursionStart = Traits::isSimdArray< typename IccWorkaround::type>::value && (std::is_same::value || std::is_same::value) ? 2 : 0; #else const int recursionStart = 0; #endif unpackArgumentsAutoImpl( int(), make_index_sequence(), op, std::forward(r), std::forward(args)...); } ///@} //}}}1 ///@} } // namespace Common } // namespace Vc #endif // VC_COMMON_SIMDARRAYHELPER_H_ // vim: foldmethod=marker Vc-1.3.3/common/simdize.h000066400000000000000000002224511320703111200151670ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright © 2014-2015 Matthias Kretz Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the names of contributing organizations nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ #ifndef VC_COMMON_SIMDIZE_H_ #define VC_COMMON_SIMDIZE_H_ #include #include #include "macros.h" /*! \addtogroup Simdize Automatic type vectorization. The simdize expression transforms the type \c T to a vectorized variant. This requires the type \c T to be a class template instance. Example: First, we declare a class template for a three-dimensional point. The template parameter \c T determines the type of the members and is \c float in the scalar (classical) case. \code template struct PointTemplate { T x, y, z; // Declares tuple_size and makes the members accessible via get(point), allowing // the simdize implementation to convert between Point and PointV (see below). Vc_SIMDIZE_INTERFACE((x, y, z)); PointTemplate(T xx, T yy, T zz) : x{xx}, y{yy}, z{zz} {}; // The following function will automatically be vectorized in the PointV type. T distance_to_origin() const { return std::sqrt(x * x + y * y + z * z); } }; \endcode In the following we create a type alias for the scalar type, which simply means instantiating \c PointTemplate with \c float. The resulting type can then be transformed with \ref simdize. \code using Point = PointTemplate; // A simple struct with three floats and two functions. using PointV = Vc::simdize; // The vectorization of Point stores three float_v and thus // float_v::size() Points. \endcode The following shows a code example using the above \c Point and \c PointV types. \code PointV pv = Point{0.f, 1.f, 2.f}; // Constructs a PointV containing PointV::size() // copies of Point{0, 1, 2}. for (int i = 1; i < int(pv.size()); ++i) { assign(pv, i, {i + 0.f, i + 1.f, i + 2.f}); } const Vc::float_v l = pv.distance_to_origin(); std::cout << l << '\n'; // prints [2.23607, 3.74166, 5.38516, 7.07107, 8.77496, 10.4881, 12.2066, 13.9284] with // float_v::size() == 8 const Point most_distant = extract(pv, (l.max() == l).firstOne()); std::cout << '(' << most_distant.x << ", " << most_distant.y << ", " << most_distant.z << ")\n"; // prints (7, 8, 9) with float_v::size() == 8 \endcode */ namespace Vc_VERSIONED_NAMESPACE { /**\internal * \ingroup Simdize * This namespace contains all the required code for implementing simdize. None of this * code should be directly accessed by users, though the unit test for simdize * certainly may look into some of the details if necessary. */ namespace SimdizeDetail { /** * \addtogroup Simdize * @{ */ using std::is_same; using std::is_base_of; using std::false_type; using std::true_type; using std::iterator_traits; using std::conditional; using std::size_t; template using conditional_t = typename conditional::type; /**\internal * Typelist is a simple helper class for supporting multiple parameter packs in one class * template. 
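 *
 * Illustrative sketch only: wrapping each pack in a Typelist lets a single template
 * carry more than one independent parameter pack, e.g.
 *
 * \code
 * // template <typename... As, typename... Bs>
 * // void f(Typelist<As...>, Typelist<Bs...>);  // both packs remain deducible
 * \endcode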
*/ template struct Typelist; /**\internal * The Category identifies how the type argument to simdize has to be transformed. */ enum class Category { ///\internal No transformation None, ///\internal simple Vector transformation ArithmeticVectorizable, ///\internal transform an input iterator to return vectorized entries InputIterator, ///\internal transform a forward iterator to return vectorized entries OutputIterator, ///\internal transform an output iterator to return vectorized entries ForwardIterator, ///\internal transform a bidirectional iterator to return vectorized entries BidirectionalIterator, ///\internal transform a random access iterator to return vectorized entries RandomAccessIterator, ///\internal transform a class template recursively ClassTemplate }; /**\internal * iteratorCategories(int()) returns whether iterator_traits::iterator_category is a * valid type and whether it is derived from RandomAccessIterator or ForwardIterator. */ template constexpr Category iteratorCategories(int, ItCat * = nullptr) { return is_base_of::value ? Category::RandomAccessIterator : is_base_of::value ? Category::BidirectionalIterator : is_base_of::value ? Category::ForwardIterator : is_base_of::value ? Category::OutputIterator : is_base_of::value ? Category::InputIterator : Category::None; } /**\internal * This overload is selected for pointer types => RandomAccessIterator. */ template constexpr enable_if::value, Category> iteratorCategories(float) { return Category::RandomAccessIterator; } /**\internal * This overload is selected if T does not work with iterator_traits. */ template constexpr Category iteratorCategories(...) { return Category::None; } /**\internal * Simple trait to identify whether a type T is a class template or not. */ template struct is_class_template : public false_type { }; template