pax_global_header00006660000000000000000000000064123351234600014511gustar00rootroot0000000000000052 comment=6464c9fd059d36ef7c1a9dadd29bce114e95e92e Vc-0.7.4/000077500000000000000000000000001233512346000120715ustar00rootroot00000000000000Vc-0.7.4/.clang-format000066400000000000000000000142561233512346000144540ustar00rootroot00000000000000BasedOnStyle: Google # The extra indent or outdent of access modifiers, e.g. public:. AccessModifierOffset: -4 # If true, aligns escaped newlines as far left as possible. Otherwise puts them into the right-most column. AlignEscapedNewlinesLeft: false # If true, aligns trailing comments. AlignTrailingComments: true # Allow putting all parameters of a function declaration onto the next line even if BinPackParameters is false. AllowAllParametersOfDeclarationOnNextLine: false # If true, if (a) return; can be put on a single line. AllowShortIfStatementsOnASingleLine: false # If true, while (true) continue; can be put on a single line. AllowShortLoopsOnASingleLine: false # If true, always break before multiline string literals. AlwaysBreakBeforeMultilineStrings: false # If true, always break after the template<...> of a template declaration. AlwaysBreakTemplateDeclarations: false # If false, a function call’s or function definition’s parameters will either all be on the same line or will have one line each. BinPackParameters: false # If true, binary operators will be placed after line breaks. BreakBeforeBinaryOperators: false # The brace breaking style to use. # Possible values: # BS_Attach (in configuration: Attach) Always attach braces to surrounding context. # BS_Linux (in configuration: Linux) Like Attach, but break before braces on function, namespace and class definitions. # BS_Stroustrup (in configuration: Stroustrup) Like Attach, but break before function definitions. # BS_Allman (in configuration: Allman) Always break before braces. BreakBeforeBraces: Linux # Always break constructor initializers before commas and align the commas with the colon. BreakConstructorInitializersBeforeComma: true # The column limit. # A column limit of 0 means that there is no column limit. In this case, clang-format will respect the input’s line breaking decisions within statements. ColumnLimit: 100 # If the constructor initializers don’t fit on a line, put each initializer on its own line. #ConstructorInitializerAllOnOneLineOrOnePerLine (bool) # The number of characters to use for indentation of constructor initializer lists. #ConstructorInitializerIndentWidth (unsigned) # If true, format braced lists as best suited for C++11 braced lists. # Important differences: - No spaces inside the braced list. - No line break before the closing brace. - Indentation with the continuation indent, not with the block indent. # Fundamentally, C++11 braced lists are formatted exactly like function calls would be formatted in their place. If the braced list follows a name (e.g. a type or variable name), clang-format formats as if the {} were the parentheses of a function call with that name. If there is no name, a zero-length name is assumed. Cpp11BracedListStyle: true # If true, analyze the formatted file for the most common binding. #DerivePointerBinding (bool) # If true, clang-format detects whether function calls and definitions are formatted with one parameter per line. # Each call can be bin-packed, one-per-line or inconclusive. If it is inconclusive, e.g. 
completely on one line, but a decision needs to be made, clang-format analyzes whether there are other bin-packed cases in the input file and act accordingly. # NOTE: This is an experimental flag, that might go away or be renamed. Do not use this in config files, etc. Use at your own risk. #ExperimentalAutoDetectBinPacking (bool) # Indent case labels one level from the switch statement. # When false, use the same indentation level as for the switch statement. Switch statement body is always indented one level more than case labels. IndentCaseLabels: false # If true, indent when breaking function declarations which are not also definitions after the type. #IndentFunctionDeclarationAfterType (bool) # The number of characters to use for indentation. IndentWidth: 4 # The maximum number of consecutive empty lines to keep. MaxEmptyLinesToKeep: 1 # The indentation used for namespaces. # Possible values: # NI_None (in configuration: None) Don’t indent in namespaces. # NI_Inner (in configuration: Inner) Indent only in inner namespaces (nested in other namespaces). # NI_All (in configuration: All) Indent in all namespaces. NamespaceIndentation: None # Add a space in front of an Objective-C protocol list, i.e. use Foo instead of Foo. #ObjCSpaceBeforeProtocolList (bool) # The penalty for each line break introduced inside a comment. #PenaltyBreakComment (unsigned) # The penalty for breaking before the first <<. #PenaltyBreakFirstLessLess (unsigned) # The penalty for each line break introduced inside a string literal. #PenaltyBreakString (unsigned) # The penalty for each character outside of the column limit. #PenaltyExcessCharacter (unsigned) # Penalty for putting the return type of a function onto its own line. #PenaltyReturnTypeOnItsOwnLine (unsigned) # Set whether & and * bind to the type as opposed to the variable. #PointerBindsToType: false # If true, spaces will be inserted between ‘for’/’if’/’while’/... and ‘(‘. #SpaceAfterControlStatementKeyword: true # If false, spaces will be removed before ‘=’, ‘+=’, etc. #SpaceBeforeAssignmentOperators: true # If false, spaces may be inserted into ‘()’. #SpaceInEmptyParentheses: false # The number of spaces to before trailing line comments. #SpacesBeforeTrailingComments (unsigned) # If false, spaces may be inserted into C style casts. #SpacesInCStyleCastParentheses (bool) # If true, spaces will be inserted after every ‘(‘ and before every ‘)’. SpacesInParentheses: false # Format compatible with this standard, e.g. use A > instead of A> for LS_Cpp03. # Possible values: # LS_Cpp03 (in configuration: Cpp03) Use C++03-compatible syntax. # LS_Cpp11 (in configuration: Cpp11) Use features of C++11 (e.g. A> instead of A >). # LS_Auto (in configuration: Auto) Automatic detection based on the input. Standard: Cpp11 # If true, IndentWidth consecutive spaces will be replaced with tab characters. 
UseTab: false # vim: ft=yaml Vc-0.7.4/.gitignore000066400000000000000000000000621233512346000140570ustar00rootroot00000000000000doc/html doc/latex doc/man vc-benchmarks *.swp *~ Vc-0.7.4/CMakeLists.txt000066400000000000000000000262061233512346000146370ustar00rootroot00000000000000cmake_minimum_required(VERSION 2.8.3) if(CMAKE_SOURCE_DIR STREQUAL CMAKE_BINARY_DIR) message(FATAL_ERROR "You don't want to configure in the source directory!") endif() project(Vc) set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") set(ROOT_RELEASE FALSE CACHE BOOL "Set up for creating a Vc copy inside ROOT/AliRoot.") mark_as_advanced(ROOT_RELEASE) set(disabled_targets) include (VcMacros) include (AddTargetProperty) include (OptimizeForArchitecture) vc_determine_compiler() if(ROOT_RELEASE) if(EXISTS "${CMAKE_INSTALL_PREFIX}/Module.mk") file(READ "${CMAKE_INSTALL_PREFIX}/Module.mk" ROOT_MODULE_MK) if(NOT "${ROOT_MODULE_MK}" MATCHES "\nMODNAME *:= *vc *\n") message(FATAL_ERROR "CMAKE_INSTALL_PREFIX is incorrect. It must point to the Vc subdirectory inside ROOT/AliRoot") endif() set(_extra_namespace "ROOT") endif() if(EXISTS "${CMAKE_INSTALL_PREFIX}/Vc.cmake") file(READ "${CMAKE_INSTALL_PREFIX}/Vc.cmake" ALIROOT_VC_CMAKE) if(NOT "${ALIROOT_VC_CMAKE}" MATCHES "\nmacro\\(ALICE_UseVc\\)\n") message(FATAL_ERROR "CMAKE_INSTALL_PREFIX is incorrect. It must point to the Vc subdirectory inside ROOT/AliRoot") endif() set(_extra_namespace "AliRoot") endif() else() if(Vc_COMPILER_IS_GCC) if(Vc_GCC_VERSION STREQUAL "4.6.0") UserWarning("GCC 4.6.0 is broken. The following tests are therefore disabled: gather_avx, gather_sse, gather_VC_USE_SET_GATHERS_avx, gather_VC_USE_SET_GATHERS_sse, gather_sse_LOOP, scatter_avx, and scatter_sse") list(APPEND disabled_targets gather_avx gather_sse gather_VC_USE_SET_GATHERS_avx gather_VC_USE_SET_GATHERS_sse scatter_avx scatter_sse c++11_gather_avx c++11_gather_sse c++11_gather_VC_USE_SET_GATHERS_avx c++11_gather_VC_USE_SET_GATHERS_sse c++11_scatter_avx c++11_scatter_sse ) elseif(Vc_GCC_VERSION STREQUAL "4.5.0" OR Vc_GCC_VERSION STREQUAL "4.5.1") UserWarning("GCC 4.5.[12] are known to generate an internal compiler error on the memory unit test. (http://gcc.gnu.org/bugzilla/show_bug.cgi?id=46723) The test will therefore not be compiled and executed.") list(APPEND disabled_targets memory_scalar memory_sse memory_avx c++11_memory_scalar c++11_memory_sse c++11_memory_avx ) elseif(Vc_GCC_VERSION STREQUAL "4.5.2") UserWarning("GCC 4.5.2 generates an internal compiler error on the memory_scalar unit test. The test will not be compiled and executed.") list(APPEND disabled_targets memory_scalar c++11_memory_scalar ) endif() elseif(Vc_COMPILER_IS_CLANG) if(Vc_CLANG_VERSION VERSION_EQUAL "3.0") UserWarning("Clang 3.0 generates an internal compiler error on the finitediff example. The example will not be compiled.") list(APPEND disabled_targets example_finitediff ) endif() elseif(Vc_COMPILER_IS_MSVC) if(MSVC_VERSION LESS 1700) # MSVC before 2012 has a broken std::vector::resize implementation. STL + Vc code will probably not compile. 
# UserWarning in VcMacros.cmake list(APPEND disabled_targets stlcontainer_sse stlcontainer_avx c++11_stlcontainer_sse c++11_stlcontainer_avx ) endif() # Disable warning "C++ exception specification ignored except to indicate a function is not __declspec(nothrow)" # MSVC emits the warning for the _UnitTest_Compare desctructor which needs the throw declaration so that it doesn't std::terminate AddCompilerFlag("/wd4290") endif() endif() if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebug RelWithDebInfo MinSizeRel." FORCE) endif(NOT CMAKE_BUILD_TYPE) vc_set_preferred_compiler_flags(WARNING_FLAGS BUILDTYPE_FLAGS) add_definitions("${Vc_DEFINITIONS}") if(Vc_COMPILER_IS_GCC AND Vc_GCC_VERSION VERSION_LESS 4.3.0) add_definitions(-DVC_DONT_WARN_OLD_GCC) # this warning is only interesting for external users of Vc endif() if(Vc_COMPILER_IS_INTEL) # per default icc is not IEEE compliant, but we need that for verification set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fp-model source") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fp-model source") endif() if(CMAKE_BUILD_TYPE STREQUAL "" AND NOT CMAKE_CXX_FLAGS MATCHES "-O[123]") message(STATUS "WARNING! It seems you are compiling without optimization. Please set CMAKE_BUILD_TYPE.") endif(CMAKE_BUILD_TYPE STREQUAL "" AND NOT CMAKE_CXX_FLAGS MATCHES "-O[123]") include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/include) if(NOT ROOT_RELEASE) add_custom_target(other VERBATIM) add_custom_target(Scalar COMMENT "build Scalar code" VERBATIM) add_custom_target(SSE COMMENT "build SSE code" VERBATIM) add_custom_target(AVX COMMENT "build AVX code" VERBATIM) set(libvc_compile_flags "-DVC_COMPILE_LIB") AddCompilerFlag("-fPIC" CXX_FLAGS libvc_compile_flags) vc_compile_for_all_implementations(_objs src/trigonometric.cpp FLAGS ${libvc_compile_flags} ONLY SSE2 SSE3 SSSE3 SSE4_1 AVX SSE+XOP+FMA4 AVX+XOP+FMA4 AVX+XOP+FMA AVX+FMA) set(_srcs src/const.cpp src/cpuid.cpp src/support.cpp ${_objs}) vc_compile_for_all_implementations(_objs src/avx_sorthelper.cpp FLAGS ${libvc_compile_flags} ONLY AVX) set(_srcs ${_srcs} ${_objs}) add_library(Vc STATIC ${_srcs}) add_target_property(Vc COMPILE_FLAGS ${libvc_compile_flags}) add_target_property(Vc LABELS "other") add_dependencies(other Vc) install(TARGETS Vc DESTINATION lib${LIB_SUFFIX}) install(DIRECTORY include/Vc/ DESTINATION include/Vc) install(DIRECTORY scalar sse avx common DESTINATION include/Vc FILES_MATCHING REGEX "/*.(h|tcc|def)$") else() # libVc should be compiled in the ROOT/AliRoot tree, so we need to install the sources # # Sadly there are messed up systems where putting include/Vc in the include paths will # break the standard library (e.g. MacOS X Lion with case insensitive filesystem). # Thus, we modify the includes such that include/Vc never needs to be in the path. 
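# Illustration of the rewrite performed by the sed invocations below (the input
# lines are hypothetical examples, not taken from this tree):
#   #include "common/memory.h"   ->  #include "Vc/common/memory.h"
#   ::Vc::float_v                ->  ::ROOT::Vc::float_v   (or ::AliRoot::Vc::float_v)
#   /*OUTER_NAMESPACE_BEGIN*/    ->  namespace ROOT {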
file(GLOB_RECURSE _srcs RELATIVE "${CMAKE_SOURCE_DIR}" src/*.cpp examples/*.cpp examples/*.h tests/*.cpp tests/*.h) foreach(_src ${_srcs}) message(STATUS "Processing ${CMAKE_SOURCE_DIR}/${_src} -> ${CMAKE_BINARY_DIR}/${_src}") get_filename_component(_path "${CMAKE_BINARY_DIR}/${_src}" PATH) file(MAKE_DIRECTORY "${_path}") execute_process( COMMAND sed -e "s,#include \\(.\\)\\(common\\|avx\\|sse\\|scalar\\)/,#include \\1Vc/\\2/," -e "s,::Vc::,::${_extra_namespace}::Vc::,g" -e "s,/\\*OUTER_NAMESPACE_BEGIN\\*/,namespace ${_extra_namespace} {," -e "s,/\\*OUTER_NAMESPACE_END\\*/,} // namespace ${_extra_namespace}," -e "s,/\\*NAMESPACE_ALIAS\\*/,namespace Vc = ${_extra_namespace}::Vc;," INPUT_FILE ${CMAKE_SOURCE_DIR}/${_src} OUTPUT_FILE ${CMAKE_BINARY_DIR}/${_src} ) endforeach() set(includes) macro(copy_and_set_outer_namespace dst) foreach(_name ${ARGN}) set(_dst "${dst}${_name}") set(_src "${CMAKE_SOURCE_DIR}/${_name}") get_filename_component(_dir "${_dst}" PATH) add_custom_command(OUTPUT "${_dst}" COMMAND mkdir -p "${_dir}" COMMAND cp "${_src}" "${_dst}" COMMAND sed -e "s,::Vc::,::${_extra_namespace}::Vc::,g" -e "s,/\\*OUTER_NAMESPACE_BEGIN\\*/,namespace ${_extra_namespace} {," -e "s,/\\*OUTER_NAMESPACE_END\\*/,} // namespace ${_extra_namespace}," -e "s,/\\*NAMESPACE_ALIAS\\*/,namespace Vc = ${_extra_namespace}::Vc;," -i "${_dst}" MAIN_DEPENDENCY "${_src}" COMMENT "Rewrite ${_dst}" WORKING_DIRECTORY "${CMAKE_BINARY_DIR}" VERBATIM) list(APPEND includes "${_dst}") endforeach() endmacro() file(GLOB_RECURSE _srcs RELATIVE "${CMAKE_SOURCE_DIR}" include/*.h include/*.tcc include/*.def) file(GLOB _src2 RELATIVE "${CMAKE_SOURCE_DIR}" include/Vc/*) list(APPEND _srcs ${_src2}) list(REMOVE_DUPLICATES _srcs) copy_and_set_outer_namespace("" "${_srcs}") foreach(_dir in scalar sse avx common) file(GLOB_RECURSE _srcs RELATIVE "${CMAKE_SOURCE_DIR}" ${_dir}/*.h ${_dir}/*.tcc ${_dir}/*.def) copy_and_set_outer_namespace("include/Vc/" "${_srcs}") endforeach() add_custom_target(rewrite ALL DEPENDS ${includes}) endif() # read version parts from version.h to be put into VcConfig.cmake file(STRINGS ${CMAKE_CURRENT_SOURCE_DIR}/include/Vc/version.h _version_lines REGEX "^#define VC_VERSION_STRING ") string(REGEX MATCH "([0-9]+)\\.([0-9]+)\\.([0-9]+)" _version_matches "${_version_lines}") set(Vc_VERSION_MAJOR ${CMAKE_MATCH_1}) set(Vc_VERSION_MINOR ${CMAKE_MATCH_2}) set(Vc_VERSION_PATCH ${CMAKE_MATCH_3}) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/VcConfig.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/cmake/VcConfig.cmake @ONLY) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/VcConfigVersion.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/cmake/VcConfigVersion.cmake @ONLY) set(cmake_install_files cmake/UserWarning.cmake cmake/VcMacros.cmake cmake/AddCompilerFlag.cmake cmake/CheckCCompilerFlag.cmake cmake/CheckCXXCompilerFlag.cmake ) if(ROOT_RELEASE) execute_process( COMMAND sed "s, \"auto\" CACHE, \"none\" CACHE," INPUT_FILE ${CMAKE_SOURCE_DIR}/cmake/OptimizeForArchitecture.cmake OUTPUT_FILE ${CMAKE_BINARY_DIR}/cmake/OptimizeForArchitecture.cmake ) install(FILES ${cmake_install_files} cmake/AddTargetProperty.cmake ${CMAKE_BINARY_DIR}/cmake/OptimizeForArchitecture.cmake DESTINATION cmake ) install(DIRECTORY ${CMAKE_BINARY_DIR}/examples/ DESTINATION examples) install(DIRECTORY ${CMAKE_BINARY_DIR}/tests/ DESTINATION tests) install(FILES tests/CMakeLists.txt tests/download.cmake DESTINATION tests) install(DIRECTORY ${CMAKE_BINARY_DIR}/src/ DESTINATION src) install(DIRECTORY ${CMAKE_BINARY_DIR}/include/Vc/ DESTINATION include/Vc) 
install(DIRECTORY examples/ DESTINATION examples FILES_MATCHING PATTERN CMakeLists.txt) else() install(FILES ${cmake_install_files} ${CMAKE_CURRENT_BINARY_DIR}/cmake/VcConfig.cmake ${CMAKE_CURRENT_BINARY_DIR}/cmake/VcConfigVersion.cmake cmake/OptimizeForArchitecture.cmake cmake/FindVc.cmake DESTINATION lib/cmake/Vc ) endif() if(NOT ROOT_RELEASE) include (CTest) configure_file(${CMAKE_SOURCE_DIR}/CTestCustom.cmake ${CMAKE_BINARY_DIR}/CTestCustom.cmake COPYONLY) if(BUILD_TESTING) add_custom_target(build_tests VERBATIM) add_subdirectory(tests) endif(BUILD_TESTING) set(BUILD_EXAMPLES FALSE CACHE BOOL "Build examples.") if(BUILD_EXAMPLES) add_subdirectory(examples) endif(BUILD_EXAMPLES) endif() # Hide VC_IMPL as it is only meant for users of Vc mark_as_advanced(VC_IMPL) Vc-0.7.4/CTestConfig.cmake000066400000000000000000000006041233512346000152430ustar00rootroot00000000000000set(CTEST_PROJECT_NAME "Vc") set(CTEST_NIGHTLY_START_TIME "00:00:00 CEST") set(CTEST_DROP_METHOD "http") set(CTEST_DROP_SITE "code.compeng.uni-frankfurt.de") set(CTEST_DROP_LOCATION "/dashboard/submit.php?project=Vc-0.7") set(CTEST_DROP_SITE_CDASH TRUE) set(CTEST_UPDATE_TYPE "git") find_program(GITCOMMAND git) set(CTEST_UPDATE_COMMAND "${GITCOMMAND}") mark_as_advanced(GITCOMMAND) Vc-0.7.4/CTestCustom.cmake000066400000000000000000000037561233512346000153230ustar00rootroot00000000000000set(CTEST_CUSTOM_WARNING_EXCEPTION ${CTEST_CUSTOM_WARNING_EXCEPTION} " C4723: " # MSVC 2012 can't suppress this warning " C4756: " # MSVC 2012 can't suppress this warning "used uninitialized in this function" "Skipping compilation of tests gatherStruct and gather2dim because of clang bug" # Not a helpful warning for the dashboard "GCC < 4.3 does not have full support for SSE2 intrinsics." # Ignore self-made warning, though what I really want is a message when the warning is absent "call to .*Vc::Warnings::_operator_bracket_warning.* declared with attribute warning" "warning is a GCC extension" "^-- " # Ignore output from cmake "GCC 4.6.0 is broken. The following tests are therefore disabled" # This warning is meant for users not the dashboard "Your GCC is older than 4.4.6. This is known to cause problems/bugs" # This warning is meant for users not the dashboard "GCC 4.4.x shows false positives for -Wparentheses, thus we rather disable" # This warning is meant for users not the dashboard "AVX disabled per default because of old/broken compiler" # This warning is meant for users not the dashboard "GCC 4.7.0 miscompiles at -O3, adding -fno-predictive-commoning to the" # This warning is meant for users not the dashboard "warning: the mangled name of .*typename Vc::{anonymous}::Decltype.* will change in a future version of GCC" "^\\*\\*\\* WARNING non-zero return value in ctest from: make" # Ignore output from ctest "ipo: warning #11010:" # Ignore warning about incompatible libraries with ICC -m32 on 64-bit system "include/qt4" # -Wuninitialized in QWeakPointer(X *ptr) "implicit_type_conversion.cpp.*(double|float).*argument 1 to.*TestImplicitCast" # ignore GCC 4.1 warning "Vc::Warnings::_operator_bracket_warning" # GCC 4.3 is supposed to throw this warning like crazy ) set(CTEST_CUSTOM_ERROR_EXCEPTION ${CTEST_CUSTOM_ERROR_EXCEPTION} "^make\\[[1-9]\\]: " "^collect2: ld returned . 
exit status" "^make: \\*\\*\\* \\[all\\] Error ") Vc-0.7.4/INSTALL000066400000000000000000000006311233512346000131220ustar00rootroot00000000000000Build Requirements ================== cmake >= 2.8.3 C++11 Compiler: * GCC >= 4.6 * clang >= 3.2 * ICC >= 13 * Visual Studio >= 2012 Building and Installing Vc ========================== * Create a build directory: $ mkdir build $ cd build * Call cmake with the relevant options: $ cmake -DCMAKE_INSTALL_PREFIX=/opt/Vc -DBUILD_TESTING=OFF * Build and install: $ make -j16 $ make install Vc-0.7.4/LGPL000066400000000000000000000167251233512346000125650ustar00rootroot00000000000000 GNU LESSER GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. This version of the GNU Lesser General Public License incorporates the terms and conditions of version 3 of the GNU General Public License, supplemented by the additional permissions listed below. 0. Additional Definitions. As used herein, "this License" refers to version 3 of the GNU Lesser General Public License, and the "GNU GPL" refers to version 3 of the GNU General Public License. "The Library" refers to a covered work governed by this License, other than an Application or a Combined Work as defined below. An "Application" is any work that makes use of an interface provided by the Library, but which is not otherwise based on the Library. Defining a subclass of a class defined by the Library is deemed a mode of using an interface provided by the Library. A "Combined Work" is a work produced by combining or linking an Application with the Library. The particular version of the Library with which the Combined Work was made is also called the "Linked Version". The "Minimal Corresponding Source" for a Combined Work means the Corresponding Source for the Combined Work, excluding any source code for portions of the Combined Work that, considered in isolation, are based on the Application, and not on the Linked Version. The "Corresponding Application Code" for a Combined Work means the object code and/or source code for the Application, including any data and utility programs needed for reproducing the Combined Work from the Application, but excluding the System Libraries of the Combined Work. 1. Exception to Section 3 of the GNU GPL. You may convey a covered work under sections 3 and 4 of this License without being bound by section 3 of the GNU GPL. 2. Conveying Modified Versions. If you modify a copy of the Library, and, in your modifications, a facility refers to a function or data to be supplied by an Application that uses the facility (other than as an argument passed when the facility is invoked), then you may convey a copy of the modified version: a) under this License, provided that you make a good faith effort to ensure that, in the event an Application does not supply the function or data, the facility still operates, and performs whatever part of its purpose remains meaningful, or b) under the GNU GPL, with none of the additional permissions of this License applicable to that copy. 3. Object Code Incorporating Material from Library Header Files. The object code form of an Application may incorporate material from a header file that is part of the Library. 
You may convey such object code under terms of your choice, provided that, if the incorporated material is not limited to numerical parameters, data structure layouts and accessors, or small macros, inline functions and templates (ten or fewer lines in length), you do both of the following: a) Give prominent notice with each copy of the object code that the Library is used in it and that the Library and its use are covered by this License. b) Accompany the object code with a copy of the GNU GPL and this license document. 4. Combined Works. You may convey a Combined Work under terms of your choice that, taken together, effectively do not restrict modification of the portions of the Library contained in the Combined Work and reverse engineering for debugging such modifications, if you also do each of the following: a) Give prominent notice with each copy of the Combined Work that the Library is used in it and that the Library and its use are covered by this License. b) Accompany the Combined Work with a copy of the GNU GPL and this license document. c) For a Combined Work that displays copyright notices during execution, include the copyright notice for the Library among these notices, as well as a reference directing the user to the copies of the GNU GPL and this license document. d) Do one of the following: 0) Convey the Minimal Corresponding Source under the terms of this License, and the Corresponding Application Code in a form suitable for, and under terms that permit, the user to recombine or relink the Application with a modified version of the Linked Version to produce a modified Combined Work, in the manner specified by section 6 of the GNU GPL for conveying Corresponding Source. 1) Use a suitable shared library mechanism for linking with the Library. A suitable mechanism is one that (a) uses at run time a copy of the Library already present on the user's computer system, and (b) will operate properly with a modified version of the Library that is interface-compatible with the Linked Version. e) Provide Installation Information, but only if you would otherwise be required to provide such information under section 6 of the GNU GPL, and only to the extent that such information is necessary to install and execute a modified version of the Combined Work produced by recombining or relinking the Application with a modified version of the Linked Version. (If you use option 4d0, the Installation Information must accompany the Minimal Corresponding Source and Corresponding Application Code. If you use option 4d1, you must provide the Installation Information in the manner specified by section 6 of the GNU GPL for conveying Corresponding Source.) 5. Combined Libraries. You may place library facilities that are a work based on the Library side by side in a single library together with other library facilities that are not Applications and are not covered by this License, and convey such a combined library under terms of your choice, if you do both of the following: a) Accompany the combined library with a copy of the same work based on the Library, uncombined with any other library facilities, conveyed under the terms of this License. b) Give prominent notice with the combined library that part of it is a work based on the Library, and explaining where to find the accompanying uncombined form of the same work. 6. Revised Versions of the GNU Lesser General Public License. 
The Free Software Foundation may publish revised and/or new versions of the GNU Lesser General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Library as you received it specifies that a certain numbered version of the GNU Lesser General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that published version or of any later version published by the Free Software Foundation. If the Library as you received it does not specify a version number of the GNU Lesser General Public License, you may choose any version of the GNU Lesser General Public License ever published by the Free Software Foundation. If the Library as you received it specifies that a proxy can decide whether future versions of the GNU Lesser General Public License shall apply, that proxy's public statement of acceptance of any version is permanent authorization for you to choose that version for the Library. Vc-0.7.4/Test_all_compilers.sh000077500000000000000000000032701233512346000162560ustar00rootroot00000000000000#!/bin/sh export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games" export LANG="en_US.UTF-8" export LANGUAGE="en_US.UTF-8" export LC_CTYPE="en_US.UTF-8" export LC_NUMERIC="en_US.UTF-8" export LC_TIME="en_US.UTF-8" export LC_MESSAGES="en_US.UTF-8" unset CFLAGS CXXFLAGS cd "`dirname "$0"`" runTest() { CFLAGS="$1" CXXFLAGS="$1" ./Test_vc.sh Experimental } supports32Bit() { test `uname -m` = "x86_64" || return 1 CXX=${CXX:-c++} cat > /tmp/m32test.cpp < int main() { std::cout << "Hello World!\n"; return 0; } END $CXX -m32 -o /tmp/m32test /tmp/m32test.cpp >/dev/null 2>&1 || return 1 rm /tmp/m32test* return 0 } cxxlist="`find /usr/bin/ /usr/local/bin/ -name 'g++-*'`" if test -z "$cxxlist"; then cxxlist="`find /usr/bin/ /usr/local/bin/ -name 'g++'`" fi if test -z "$cxxlist"; then # default compiler runTest & supports32Bit && runTest -m32 & wait else for CXX in $cxxlist; do CC=`echo "$CXX"|sed 's/g++/gcc/'` if test -x "$CC" -a -x "$CXX"; then ( export CC export CXX runTest & supports32Bit && runTest -m32 & wait ) fi done fi for VcEnv in `find /opt/ -mindepth 2 -maxdepth 2 -name Vc.env`; do ( . "$VcEnv" case "$VcEnv" in *-snapshot/Vc.env) ( cd $HOME/src/gcc-build && ./update.sh "`dirname "$VcEnv"`" ) ;; esac runTest & supports32Bit && runTest -m32 & wait ) done export CC=icc export CXX=icpc icclist="`find /opt/ -name 'iccvars.sh'`" case x86_64 in x86_64) arch=intel64 ;; i[345678]86) arch=ia32 ;; esac test -n "$icclist" && for IccEnv in $icclist; do ( . $IccEnv $arch runTest & supports32Bit && runTest -m32 & wait ) done Vc-0.7.4/Test_vc.sh000077500000000000000000000011261233512346000140370ustar00rootroot00000000000000#!/bin/bash case "$1" in Experimental|Nightly|Continuous) export dashboard_model=$1 case "$2" in None|Debug|Release|RelWithDebug|RelWithDebInfo|MinSizeRel) export build_type=$2 ;; esac ;; *) echo "Usage: $0 []" echo echo "Possible arguments for model are Nightly, Continuous, or Experimental." echo "Build type may be one of: None Debug Release RelWithDebug RelWithDebInfo MinSizeRel." 
echo exit 1 ;; esac ctest -S "`dirname $0`/test.cmake" 2>&1 | grep -v 'Error in read script:' Vc-0.7.4/avx/000077500000000000000000000000001233512346000126675ustar00rootroot00000000000000Vc-0.7.4/avx/README000066400000000000000000000035551233512346000135570ustar00rootroot00000000000000########################################### ################# AVX ################# ########################################### 1. Floating Point =========================================== Uses full 256bit vectors for all operations. 128bit vectors are never used. 2. Integer =========================================== Integer support in AVX is minimal. The 256bit integer vectors are just intended as a supporting type of float operations. Any arithmetic, logical, or comparison operations must be implemented using 128bit operations. int_v/uint_v could be implemented either as 128 or 256 types. I.e. either int_v::Size == 4 or 8. 2.1. 256bit int vectors =========================================== 2.1.1. Implementation Details: This requires the SSE operations to not zero the high bits of the registers. Since the YMM registers are aliased on the XMM registers you need to use SSE ops that are not using the VEX prefix (IIUC). Or you have to use two XMM registers most of the time. Perfect would be the use of union M256I { __m256i ymm; __m128i xmm[2]; }; But as far as I know GCC, this will result in lots of unnecessary loads and stores. (It seems this is due to GCC expecting aliasing, thus making sure the modified values are always up-to-date in memory - like if it were declared volatile.) 2.1.2. Upsides: int_v::Size == float_v::Size 2.1.3. Downsides: Register pressure is increased. 2.2. 128bit int vectors =========================================== 2.2.1. Implementation Details: 2.2.2. Upsides: 2.2.3. Downsides: - Use of int_v for float_v operations involving __m256i arguments require an extra type. This will be hard to generalize 2.3. Mixed approach =========================================== int_v/uint_v are implemented as 256bit while short_v/ushort_v are implemented as 128bit. Thus int_v::Size == short_v::Size (which is the case on LRBni, too). Vc-0.7.4/avx/casts.h000066400000000000000000000375261233512346000141720ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
*/ #ifndef AVX_CASTS_H #define AVX_CASTS_H #include "intrinsics.h" #include "types.h" #include "macros.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace AVX { template static Vc_INTRINSIC_L T avx_cast(param128 v) Vc_INTRINSIC_R; template static Vc_INTRINSIC_L T avx_cast(param128i v) Vc_INTRINSIC_R; template static Vc_INTRINSIC_L T avx_cast(param128d v) Vc_INTRINSIC_R; template static Vc_INTRINSIC_L T avx_cast(param256 v) Vc_INTRINSIC_R; template static Vc_INTRINSIC_L T avx_cast(param256i v) Vc_INTRINSIC_R; template static Vc_INTRINSIC_L T avx_cast(param256d v) Vc_INTRINSIC_R; #ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS template static Vc_INTRINSIC T avx_cast(__m128 v) { return avx_cast(param128 (v)); } template static Vc_INTRINSIC T avx_cast(__m128i v) { return avx_cast(param128i(v)); } template static Vc_INTRINSIC T avx_cast(__m128d v) { return avx_cast(param128d(v)); } template static Vc_INTRINSIC T avx_cast(__m256 v) { return avx_cast(param256 (v)); } template static Vc_INTRINSIC T avx_cast(__m256i v) { return avx_cast(param256i(v)); } template static Vc_INTRINSIC T avx_cast(__m256d v) { return avx_cast(param256d(v)); } #endif // 128 -> 128 template<> Vc_INTRINSIC m128 avx_cast(param128 v) { return v; } template<> Vc_INTRINSIC m128 avx_cast(param128i v) { return _mm_castsi128_ps(v); } template<> Vc_INTRINSIC m128 avx_cast(param128d v) { return _mm_castpd_ps(v); } template<> Vc_INTRINSIC m128i avx_cast(param128 v) { return _mm_castps_si128(v); } template<> Vc_INTRINSIC m128i avx_cast(param128i v) { return v; } template<> Vc_INTRINSIC m128i avx_cast(param128d v) { return _mm_castpd_si128(v); } template<> Vc_INTRINSIC m128d avx_cast(param128 v) { return _mm_castps_pd(v); } template<> Vc_INTRINSIC m128d avx_cast(param128i v) { return _mm_castsi128_pd(v); } template<> Vc_INTRINSIC m128d avx_cast(param128d v) { return v; } // 128 -> 256 // FIXME: the following casts leave the upper 128bits undefined. With GCC and ICC I've never // seen the cast not do what I want though: after a VEX-coded SSE instruction the register's // upper 128bits are zero. Thus using the same register as AVX register will have the upper // 128bits zeroed. MSVC, though, implements _mm256_castxx128_xx256 with a 128bit move to memory // + 256bit load. Thus the upper 128bits are really undefined. But there is no intrinsic to do // what I want (i.e. alias the register, disallowing the move to memory in-between). I'm stuck, // do we really want to rely on specific compiler behavior here? 
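// Illustration of the concern above (added sketch, not from the original source):
// with GCC/ICC a VEX-encoded 128-bit operation already clears bits 255:128 of the
// aliased YMM register, so e.g.
//     __m128 x = _mm_add_ps(a, b);            // vaddps xmm: bits 255:128 zeroed
//     __m256 y = _mm256_castps128_ps256(x);   // upper half happens to stay zero
// works as intended, whereas MSVC may lower the cast to a 128-bit store plus a
// 256-bit load, leaving bits 255:128 undefined. That is why the zeroExtend()
// overloads below fall back to _mm256_permute2f128_ps/pd/si256 when VC_MSVC is defined.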
template<> Vc_INTRINSIC m256 avx_cast(param128 v) { return _mm256_castps128_ps256(v); } template<> Vc_INTRINSIC m256 avx_cast(param128i v) { return _mm256_castps128_ps256(_mm_castsi128_ps(v)); } template<> Vc_INTRINSIC m256 avx_cast(param128d v) { return _mm256_castps128_ps256(_mm_castpd_ps(v)); } template<> Vc_INTRINSIC m256i avx_cast(param128 v) { return _mm256_castsi128_si256(_mm_castps_si128(v)); } template<> Vc_INTRINSIC m256i avx_cast(param128i v) { return _mm256_castsi128_si256(v); } template<> Vc_INTRINSIC m256i avx_cast(param128d v) { return _mm256_castsi128_si256(_mm_castpd_si128(v)); } template<> Vc_INTRINSIC m256d avx_cast(param128 v) { return _mm256_castpd128_pd256(_mm_castps_pd(v)); } template<> Vc_INTRINSIC m256d avx_cast(param128i v) { return _mm256_castpd128_pd256(_mm_castsi128_pd(v)); } template<> Vc_INTRINSIC m256d avx_cast(param128d v) { return _mm256_castpd128_pd256(v); } #ifdef VC_MSVC static Vc_INTRINSIC Vc_CONST m256 zeroExtend(param128 v) { return _mm256_permute2f128_ps (_mm256_castps128_ps256(v), _mm256_castps128_ps256(v), 0x80); } static Vc_INTRINSIC Vc_CONST m256i zeroExtend(param128i v) { return _mm256_permute2f128_si256(_mm256_castsi128_si256(v), _mm256_castsi128_si256(v), 0x80); } static Vc_INTRINSIC Vc_CONST m256d zeroExtend(param128d v) { return _mm256_permute2f128_pd (_mm256_castpd128_pd256(v), _mm256_castpd128_pd256(v), 0x80); } #else static Vc_INTRINSIC Vc_CONST m256 zeroExtend(param128 v) { return _mm256_castps128_ps256(v); } static Vc_INTRINSIC Vc_CONST m256i zeroExtend(param128i v) { return _mm256_castsi128_si256(v); } static Vc_INTRINSIC Vc_CONST m256d zeroExtend(param128d v) { return _mm256_castpd128_pd256(v); } #ifdef VC_ICC static Vc_INTRINSIC Vc_CONST m256 zeroExtend(__m128 v) { return _mm256_castps128_ps256(v); } static Vc_INTRINSIC Vc_CONST m256i zeroExtend(__m128i v) { return _mm256_castsi128_si256(v); } static Vc_INTRINSIC Vc_CONST m256d zeroExtend(__m128d v) { return _mm256_castpd128_pd256(v); } #endif #endif // 256 -> 128 template<> Vc_INTRINSIC m128 avx_cast(param256 v) { return _mm256_castps256_ps128(v); } template<> Vc_INTRINSIC m128 avx_cast(param256i v) { return _mm256_castps256_ps128(_mm256_castsi256_ps(v)); } template<> Vc_INTRINSIC m128 avx_cast(param256d v) { return _mm256_castps256_ps128(_mm256_castpd_ps(v)); } template<> Vc_INTRINSIC m128i avx_cast(param256 v) { return _mm256_castsi256_si128(_mm256_castps_si256(v)); } template<> Vc_INTRINSIC m128i avx_cast(param256i v) { return _mm256_castsi256_si128(v); } template<> Vc_INTRINSIC m128i avx_cast(param256d v) { return _mm256_castsi256_si128(_mm256_castpd_si256(v)); } template<> Vc_INTRINSIC m128d avx_cast(param256 v) { return _mm256_castpd256_pd128(_mm256_castps_pd(v)); } template<> Vc_INTRINSIC m128d avx_cast(param256i v) { return _mm256_castpd256_pd128(_mm256_castsi256_pd(v)); } template<> Vc_INTRINSIC m128d avx_cast(param256d v) { return _mm256_castpd256_pd128(v); } // 256 -> 256 template<> Vc_INTRINSIC m256 avx_cast(param256 v) { return v; } template<> Vc_INTRINSIC m256 avx_cast(param256i v) { return _mm256_castsi256_ps(v); } template<> Vc_INTRINSIC m256 avx_cast(param256d v) { return _mm256_castpd_ps(v); } template<> Vc_INTRINSIC m256i avx_cast(param256 v) { return _mm256_castps_si256(v); } template<> Vc_INTRINSIC m256i avx_cast(param256i v) { return v; } template<> Vc_INTRINSIC m256i avx_cast(param256d v) { return _mm256_castpd_si256(v); } template<> Vc_INTRINSIC m256d avx_cast(param256 v) { return _mm256_castps_pd(v); } template<> Vc_INTRINSIC m256d avx_cast(param256i v) { 
return _mm256_castsi256_pd(v); } template<> Vc_INTRINSIC m256d avx_cast(param256d v) { return v; } // simplify splitting 256-bit registers in 128-bit registers Vc_INTRINSIC Vc_CONST m128 lo128(param256 v) { return avx_cast(v); } Vc_INTRINSIC Vc_CONST m128d lo128(param256d v) { return avx_cast(v); } Vc_INTRINSIC Vc_CONST m128i lo128(param256i v) { return avx_cast(v); } Vc_INTRINSIC Vc_CONST m128 hi128(param256 v) { return _mm256_extractf128_ps(v, 1); } Vc_INTRINSIC Vc_CONST m128d hi128(param256d v) { return _mm256_extractf128_pd(v, 1); } Vc_INTRINSIC Vc_CONST m128i hi128(param256i v) { return _mm256_extractf128_si256(v, 1); } // simplify combining 128-bit registers in 256-bit registers Vc_INTRINSIC Vc_CONST m256 concat(param128 a, param128 b) { return _mm256_insertf128_ps (avx_cast(a), b, 1); } Vc_INTRINSIC Vc_CONST m256d concat(param128d a, param128d b) { return _mm256_insertf128_pd (avx_cast(a), b, 1); } Vc_INTRINSIC Vc_CONST m256i concat(param128i a, param128i b) { return _mm256_insertf128_si256(avx_cast(a), b, 1); } #ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS Vc_INTRINSIC Vc_CONST m256 concat(__m128 a, param128 b) { return _mm256_insertf128_ps (avx_cast(a), b, 1); } Vc_INTRINSIC Vc_CONST m256d concat(__m128d a, param128d b) { return _mm256_insertf128_pd (avx_cast(a), b, 1); } Vc_INTRINSIC Vc_CONST m256i concat(__m128i a, param128i b) { return _mm256_insertf128_si256(avx_cast(a), b, 1); } Vc_INTRINSIC Vc_CONST m256 concat(param128 a, __m128 b) { return _mm256_insertf128_ps (avx_cast(a), b, 1); } Vc_INTRINSIC Vc_CONST m256d concat(param128d a, __m128d b) { return _mm256_insertf128_pd (avx_cast(a), b, 1); } Vc_INTRINSIC Vc_CONST m256i concat(param128i a, __m128i b) { return _mm256_insertf128_si256(avx_cast(a), b, 1); } Vc_INTRINSIC Vc_CONST m256 concat(__m128 a, __m128 b) { return _mm256_insertf128_ps (avx_cast(a), b, 1); } Vc_INTRINSIC Vc_CONST m256d concat(__m128d a, __m128d b) { return _mm256_insertf128_pd (avx_cast(a), b, 1); } Vc_INTRINSIC Vc_CONST m256i concat(__m128i a, __m128i b) { return _mm256_insertf128_si256(avx_cast(a), b, 1); } #endif template struct StaticCastHelper {}; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256 v) { return _mm256_cvttps_epi32(v); } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256d v) { return avx_cast(_mm256_cvttpd_epi32(v)); } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256i v) { return v; } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256i v) { return v; } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param128i v) { return concat(_mm_srai_epi32(_mm_unpacklo_epi16(v, v), 16), _mm_srai_epi32(_mm_unpackhi_epi16(v, v), 16)); } }; template<> struct StaticCastHelper { static inline Vc_CONST m256i cast(param256 v) { return _mm256_castps_si256(_mm256_blendv_ps( _mm256_castsi256_ps(_mm256_cvttps_epi32(v)), _mm256_castsi256_ps(_mm256_add_epi32(m256i(_mm256_cvttps_epi32(_mm256_sub_ps(v, _mm256_set2power31_ps()))), _mm256_set2power31_epu32())), _mm256_cmpge_ps(v, _mm256_set2power31_ps()) )); } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256d v) { return avx_cast(_mm256_cvttpd_epi32(v)); } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256i v) { return v; } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256i v) { return v; } }; 
template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param128i v) { return concat(_mm_srli_epi32(_mm_unpacklo_epi16(v, v), 16), _mm_srli_epi32(_mm_unpackhi_epi16(v, v), 16)); } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param256 v) { return v; } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param256d v) { return avx_cast(_mm256_cvtpd_ps(v)); } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param256i v) { return _mm256_cvtepi32_ps(v); } }; template<> struct StaticCastHelper { static inline Vc_CONST m256 cast(param256i v) { return _mm256_blendv_ps( _mm256_cvtepi32_ps(v), _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_sub_epi32(v, _mm256_set2power31_epu32())), _mm256_set2power31_ps()), _mm256_castsi256_ps(_mm256_cmplt_epi32(v, _mm256_setzero_si256())) ); } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param128i v) { return _mm256_cvtepi32_ps(StaticCastHelper::cast(v)); } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param128i v) { return _mm256_cvtepi32_ps(StaticCastHelper::cast(v)); } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256d cast(param256 v) { return _mm256_cvtps_pd(avx_cast(v)); } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256d cast(param256d v) { return v; } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256d cast(param256i v) { return _mm256_cvtepi32_pd(avx_cast(v)); } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256d cast(param256i v) { return _mm256_cvtepi32_pd(avx_cast(v)); } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256i v) { return _mm_packs_epi32(lo128(v), hi128(v)); } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256 v) { return StaticCastHelper::cast(StaticCastHelper::cast(v)); } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param128i v) { return v; } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param128i v) { return v; } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256i v) { return _mm_packus_epi32(lo128(v), hi128(v)); } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256 v) { return StaticCastHelper::cast(StaticCastHelper::cast(v)); } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param128i v) { return v; } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param128i v) { return v; } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256 v) { return StaticCastHelper::cast(StaticCastHelper::cast(v)); } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256 v) { return StaticCastHelper::cast(StaticCastHelper::cast(v)); } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param128i v) { return _mm256_cvtepi32_ps(StaticCastHelper::cast(v)); } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param128i v) { return _mm256_cvtepi32_ps(StaticCastHelper::cast(v)); } }; } // namespace AVX } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "undomacros.h" #endif // AVX_CASTS_H 
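For reference, a brief usage sketch of the 128/256-bit helpers defined in casts.h above (illustrative only; the swapLanes function and its calling code are assumptions, not part of the original sources):

    // Hypothetical helper built on lo128/hi128/concat from casts.h:
    // exchange the two 128-bit lanes of a 256-bit float vector.
    static inline Vc::AVX::m256 swapLanes(Vc::AVX::param256 v)
    {
        using namespace Vc::AVX;
        const m128 lo = lo128(v);  // bits 127:0
        const m128 hi = hi128(v);  // bits 255:128 (via _mm256_extractf128_ps)
        return concat(hi, lo);     // reassemble with the lanes exchanged
    }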
Vc-0.7.4/avx/const.h000066400000000000000000000144131233512346000141710ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef VC_AVX_CONST_H #define VC_AVX_CONST_H #include #include "const_data.h" #include "macros.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace AVX { template class Vector; template struct IndexesFromZeroData; template<> struct IndexesFromZeroData { static Vc_ALWAYS_INLINE Vc_CONST const int *address() { return reinterpret_cast(&_IndexesFromZero32[0]); } }; template<> struct IndexesFromZeroData { static Vc_ALWAYS_INLINE Vc_CONST const unsigned int *address() { return &_IndexesFromZero32[0]; } }; template<> struct IndexesFromZeroData { static Vc_ALWAYS_INLINE Vc_CONST const short *address() { return reinterpret_cast(&_IndexesFromZero16[0]); } }; template<> struct IndexesFromZeroData { static Vc_ALWAYS_INLINE Vc_CONST const unsigned short *address() { return &_IndexesFromZero16[0]; } }; template<> struct IndexesFromZeroData { static Vc_ALWAYS_INLINE Vc_CONST const signed char *address() { return reinterpret_cast(&_IndexesFromZero8[0]); } }; template<> struct IndexesFromZeroData { static Vc_ALWAYS_INLINE Vc_CONST const char *address() { return reinterpret_cast(&_IndexesFromZero8[0]); } }; template<> struct IndexesFromZeroData { static Vc_ALWAYS_INLINE Vc_CONST const unsigned char *address() { return &_IndexesFromZero8[0]; } }; template struct Const { typedef Vector<_T> V; typedef typename V::EntryType T; typedef typename V::Mask M; static Vc_ALWAYS_INLINE Vc_CONST V _pi_4() { return V(c_trig::data[0]); } static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_hi() { return V(c_trig::data[1]); } static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_rem1() { return V(c_trig::data[2]); } static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_rem2() { return V(c_trig::data[3]); } static Vc_ALWAYS_INLINE Vc_CONST V _1_16() { return V(c_trig::data[4]); } static Vc_ALWAYS_INLINE Vc_CONST V _16() { return V(c_trig::data[5]); } static Vc_ALWAYS_INLINE Vc_CONST V cosCoeff(int i) { return V(c_trig::data[( 8 + i)]); } static Vc_ALWAYS_INLINE Vc_CONST V sinCoeff(int i) { return V(c_trig::data[(14 + i)]); } static Vc_ALWAYS_INLINE Vc_CONST V atanP(int i) { return V(c_trig::data[(24 + i)]); } static Vc_ALWAYS_INLINE Vc_CONST V atanQ(int i) { return V(c_trig::data[(29 + i)]); } static Vc_ALWAYS_INLINE Vc_CONST V atanThrsHi() { return V(c_trig::data[34]); } static Vc_ALWAYS_INLINE Vc_CONST V atanThrsLo() { return V(c_trig::data[35]); } static Vc_ALWAYS_INLINE Vc_CONST V _pi_2_rem() { return V(c_trig::data[36]); } static Vc_ALWAYS_INLINE Vc_CONST V lossThreshold() { return V(c_trig::data[20]); } static Vc_ALWAYS_INLINE Vc_CONST V _4_pi() { return V(c_trig::data[21]); } static Vc_ALWAYS_INLINE Vc_CONST V _pi_2() { return V(c_trig::data[22]); } static Vc_ALWAYS_INLINE Vc_CONST V _pi() { return V(c_trig::data[23]); } static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff0(int i) { return V(c_trig::data[(40 + i)]); } 
static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff1(int i) { return V(c_trig::data[(45 + i)]); } static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff2(int i) { return V(c_trig::data[(49 + i)]); } static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff3(int i) { return V(c_trig::data[(55 + i)]); } static Vc_ALWAYS_INLINE Vc_CONST V smallAsinInput() { return V(c_trig::data[37]); } static Vc_ALWAYS_INLINE Vc_CONST V largeAsinInput() { return V(c_trig::data[38]); } static Vc_ALWAYS_INLINE Vc_CONST M exponentMask() { return M(V(c_log::d(1)).data()); } static Vc_ALWAYS_INLINE Vc_CONST V _1_2() { return V(c_log::d(18)); } static Vc_ALWAYS_INLINE Vc_CONST V _1_sqrt2() { return V(c_log::d(15)); } static Vc_ALWAYS_INLINE Vc_CONST V P(int i) { return V(c_log::d(2 + i)); } static Vc_ALWAYS_INLINE Vc_CONST V Q(int i) { return V(c_log::d(8 + i)); } static Vc_ALWAYS_INLINE Vc_CONST V min() { return V(c_log::d(14)); } static Vc_ALWAYS_INLINE Vc_CONST V ln2_small() { return V(c_log::d(17)); } static Vc_ALWAYS_INLINE Vc_CONST V ln2_large() { return V(c_log::d(16)); } static Vc_ALWAYS_INLINE Vc_CONST V neginf() { return V(c_log::d(13)); } static Vc_ALWAYS_INLINE Vc_CONST V log10_e() { return V(c_log::d(19)); } static Vc_ALWAYS_INLINE Vc_CONST V log2_e() { return V(c_log::d(20)); } static Vc_ALWAYS_INLINE_L Vc_CONST_L V highMask() Vc_ALWAYS_INLINE_R Vc_CONST_R; }; template<> Vc_ALWAYS_INLINE Vc_CONST Vector Const::highMask() { return _mm256_broadcast_ss(reinterpret_cast(&c_general::highMaskFloat)); } template<> Vc_ALWAYS_INLINE Vc_CONST Vector Const::highMask() { return _mm256_broadcast_sd(reinterpret_cast(&c_general::highMaskDouble)); } template<> Vc_ALWAYS_INLINE Vc_CONST Vector Const::highMask() { return _mm256_broadcast_ss(reinterpret_cast(&c_general::highMaskFloat)); } } // namespace AVX } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "undomacros.h" #endif // VC_AVX_CONST_H Vc-0.7.4/avx/const_data.h000066400000000000000000000043511233512346000151620ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright (C) 2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
}}}*/ #ifndef VC_AVX_CONST_DATA_H #define VC_AVX_CONST_DATA_H #include "macros.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace AVX { ALIGN(64) extern const unsigned int _IndexesFromZero32[8]; ALIGN(16) extern const unsigned short _IndexesFromZero16[8]; ALIGN(16) extern const unsigned char _IndexesFromZero8[16]; struct STRUCT_ALIGN1(64) c_general { static const float oneFloat; static const unsigned int absMaskFloat[2]; static const unsigned int signMaskFloat[2]; static const unsigned int highMaskFloat; static const unsigned short minShort[2]; static const unsigned short one16[2]; static const float _2power31; static const double oneDouble; static const unsigned long long frexpMask; static const unsigned long long highMaskDouble; } STRUCT_ALIGN2(64); template struct c_trig { ALIGN(64) static const T data[]; }; template struct c_log { typedef float floatAlias Vc_MAY_ALIAS; static Vc_ALWAYS_INLINE float d(int i) { return *reinterpret_cast(&data[i]); } ALIGN(64) static const unsigned int data[]; }; template<> struct c_log { enum VectorSize { Size = 16 / sizeof(double) }; typedef double doubleAlias Vc_MAY_ALIAS; static Vc_ALWAYS_INLINE double d(int i) { return *reinterpret_cast(&data[i]); } ALIGN(64) static const unsigned long long data[]; }; } // namespace AVX } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "undomacros.h" #endif // VC_AVX_CONST_DATA_H Vc-0.7.4/avx/debug.h000066400000000000000000000055461233512346000141400ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2011-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
*/ #ifndef VC_AVX_DEBUG_H #define VC_AVX_DEBUG_H #ifndef NDEBUG #include "vectorbase.h" #include #include #endif /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace AVX { #ifdef NDEBUG class DebugStream { public: DebugStream(const char *, const char *, int) {} template inline DebugStream &operator<<(const T &) { return *this; } }; #else class DebugStream { private: template static void printVector(V _x) { enum { Size = sizeof(V) / sizeof(T) }; union { V v; T m[Size]; } x = { _x }; std::cerr << '[' << std::setprecision(24) << x.m[0]; for (int i = 1; i < Size; ++i) { std::cerr << ", " << std::setprecision(24) << x.m[i]; } std::cerr << ']'; } public: DebugStream(const char *func, const char *file, int line) { std::cerr << "\033[1;40;33mDEBUG: " << file << ':' << line << ' ' << func << ' '; } template DebugStream &operator<<(const T &x) { std::cerr << x; return *this; } DebugStream &operator<<(__m128 x) { printVector(x); return *this; } DebugStream &operator<<(__m256 x) { printVector(x); return *this; } DebugStream &operator<<(__m128d x) { printVector(x); return *this; } DebugStream &operator<<(__m256d x) { printVector(x); return *this; } DebugStream &operator<<(__m128i x) { printVector(x); return *this; } DebugStream &operator<<(__m256i x) { printVector(x); return *this; } ~DebugStream() { std::cerr << "\033[0m" << std::endl; } }; #endif #define VC_DEBUG ::Vc::AVX::DebugStream(__PRETTY_FUNCTION__, __FILE__, __LINE__) } // namespace AVX } // namespace Vc /*OUTER_NAMESPACE_END*/ #endif // VC_AVX_DEBUG_H Vc-0.7.4/avx/deinterleave.tcc000066400000000000000000000273671233512346000160500ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2010-2011 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
*/ /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace AVX { inline void deinterleave(double_v &VC_RESTRICT a, double_v &VC_RESTRICT b, double_v &VC_RESTRICT c) { // estimated latency (AVX): 4.5 cycles const m256d tmp0 = Mem::shuffle128(a.data(), b.data()); const m256d tmp1 = Mem::shuffle128(a.data(), c.data()); const m256d tmp2 = Mem::shuffle128(b.data(), c.data()); a.data() = Mem::shuffle(tmp0, tmp1); b.data() = Mem::shuffle(tmp0, tmp2); c.data() = Mem::shuffle(tmp1, tmp2); } inline void deinterleave(float_v &VC_RESTRICT a, float_v &VC_RESTRICT b, float_v &VC_RESTRICT c) { // abc abc abc // a = [a0 b0 c0 a1 b1 c1 a2 b2] 332 = 211+121 // b = [c2 a3 b3 c3 a4 b4 c4 a5] 323 = 112+211 // c = [b5 c5 a6 b6 c6 a7 b7 c7] 233 = 121+112 const m256 ac0 = Mem::shuffle128(a.data(), c.data()); // a0 b0 c0 a1 b5 c5 a6 b6 const m256 ac1 = Mem::shuffle128(a.data(), c.data()); // b1 c1 a2 b2 c6 a7 b7 c7 m256 tmp0 = Mem::blend( ac0, b.data()); tmp0 = Mem::blend(tmp0, ac1); // a0 a3 a2 a1 a4 a7 a6 a5 m256 tmp1 = Mem::blend( ac0, b.data()); tmp1 = Mem::blend(tmp1, ac1); // b1 b0 b3 b2 b5 b4 b7 b6 m256 tmp2 = Mem::blend( ac0, b.data()); tmp2 = Mem::blend(tmp2, ac1); // c2 c1 c0 c3 c6 c5 c4 c7 a.data() = Mem::permute(tmp0); b.data() = Mem::permute(tmp1); c.data() = Mem::permute(tmp2); } inline void deinterleave(int_v &VC_RESTRICT a, int_v &VC_RESTRICT b, int_v &VC_RESTRICT c) { deinterleave(reinterpret_cast(a), reinterpret_cast(b), reinterpret_cast(c)); } inline void deinterleave(uint_v &VC_RESTRICT a, uint_v &VC_RESTRICT b, uint_v &VC_RESTRICT c) { deinterleave(reinterpret_cast(a), reinterpret_cast(b), reinterpret_cast(c)); } inline void deinterleave(Vector &VC_RESTRICT a, Vector &VC_RESTRICT b, Vector &VC_RESTRICT c) { // abc abc abc // a = [a0 b0 c0 a1 b1 c1 a2 b2] 332 = 211+121 // b = [c2 a3 b3 c3 a4 b4 c4 a5] 323 = 112+211 // c = [b5 c5 a6 b6 c6 a7 b7 c7] 233 = 121+112 m128i ac0 = _mm_unpacklo_epi64(a.data(), c.data()); // a0 b0 c0 a1 b5 c5 a6 b6 m128i ac1 = _mm_unpackhi_epi64(a.data(), c.data()); // b1 c1 a2 b2 c6 a7 b7 c7 m128i tmp0 = Mem::blend( ac0, b.data()); tmp0 = Mem::blend(tmp0, ac1); // a0 a3 a2 a1 a4 a7 a6 a5 m128i tmp1 = Mem::blend( ac0, b.data()); tmp1 = Mem::blend(tmp1, ac1); // b1 b0 b3 b2 b5 b4 b7 b6 m128i tmp2 = Mem::blend( ac0, b.data()); tmp2 = Mem::blend(tmp2, ac1); // c2 c1 c0 c3 c6 c5 c4 c7 a.data() = Mem::permuteHi(Mem::permuteLo(tmp0)); b.data() = Mem::permuteHi(Mem::permuteLo(tmp1)); c.data() = Mem::permuteHi(Mem::permuteLo(tmp2)); } inline void deinterleave(Vector &VC_RESTRICT a, Vector &VC_RESTRICT b, Vector &VC_RESTRICT c) { deinterleave(reinterpret_cast &>(a), reinterpret_cast &>(b), reinterpret_cast &>(c)); } inline void deinterleave(Vector &a, Vector &b) { // a7 a6 a5 a4 a3 a2 a1 a0 // b7 b6 b5 b4 b3 b2 b1 b0 const m256 tmp0 = Reg::permute128(a.data(), b.data()); // b3 b2 b1 b0 a3 a2 a1 a0 const m256 tmp1 = Reg::permute128(a.data(), b.data()); // b7 b6 b5 b4 a7 a6 a5 a4 const m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1); // b5 b1 b4 b0 a5 a1 a4 a0 const m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1); // b7 b3 b6 b2 a7 a3 a6 a2 a.data() = _mm256_unpacklo_ps(tmp2, tmp3); // b6 b4 b2 b0 a6 a4 a2 a0 b.data() = _mm256_unpackhi_ps(tmp2, tmp3); // b7 b5 b3 b1 a7 a5 a3 a1 } inline void deinterleave(Vector &a, Vector &b) { m128i tmp0 = _mm_unpacklo_epi16(a.data(), b.data()); // a0 a4 b0 b4 a1 a5 b1 b5 m128i tmp1 = _mm_unpackhi_epi16(a.data(), b.data()); // a2 a6 b2 b6 a3 a7 b3 b7 m128i tmp2 = _mm_unpacklo_epi16(tmp0, tmp1); // a0 a2 a4 a6 b0 b2 b4 b6 m128i tmp3 = _mm_unpackhi_epi16(tmp0, 
tmp1); // a1 a3 a5 a7 b1 b3 b5 b7 a.data() = _mm_unpacklo_epi16(tmp2, tmp3); b.data() = _mm_unpackhi_epi16(tmp2, tmp3); } inline void deinterleave(Vector &a, Vector &b) { m128i tmp0 = _mm_unpacklo_epi16(a.data(), b.data()); // a0 a4 b0 b4 a1 a5 b1 b5 m128i tmp1 = _mm_unpackhi_epi16(a.data(), b.data()); // a2 a6 b2 b6 a3 a7 b3 b7 m128i tmp2 = _mm_unpacklo_epi16(tmp0, tmp1); // a0 a2 a4 a6 b0 b2 b4 b6 m128i tmp3 = _mm_unpackhi_epi16(tmp0, tmp1); // a1 a3 a5 a7 b1 b3 b5 b7 a.data() = _mm_unpacklo_epi16(tmp2, tmp3); b.data() = _mm_unpackhi_epi16(tmp2, tmp3); } } // namespace AVX namespace Internal { template inline void HelperImpl::deinterleave( float_v &a, float_v &b, const float *m, A align) { a.load(m, align); b.load(m + float_v::Size, align); Vc::AVX::deinterleave(a, b); } template inline void HelperImpl::deinterleave( float_v &a, float_v &b, const short *m, A align) { using Vc::AVX::m256i; const m256i tmp = Vc::AVX::VectorHelper::load(m, align); a.data() = _mm256_cvtepi32_ps(Vc::AVX::concat( _mm_srai_epi32(_mm_slli_epi32(AVX::lo128(tmp), 16), 16), _mm_srai_epi32(_mm_slli_epi32(AVX::hi128(tmp), 16), 16))); b.data() = _mm256_cvtepi32_ps(Vc::AVX::concat( _mm_srai_epi32(AVX::lo128(tmp), 16), _mm_srai_epi32(AVX::hi128(tmp), 16))); } template inline void HelperImpl::deinterleave( float_v &a, float_v &b, const unsigned short *m, A align) { using Vc::AVX::m256i; const m256i tmp = Vc::AVX::VectorHelper::load(m, align); a.data() = _mm256_cvtepi32_ps(Vc::AVX::concat( _mm_blend_epi16(AVX::lo128(tmp), _mm_setzero_si128(), 0xaa), _mm_blend_epi16(AVX::hi128(tmp), _mm_setzero_si128(), 0xaa))); b.data() = _mm256_cvtepi32_ps(Vc::AVX::concat( _mm_srli_epi32(AVX::lo128(tmp), 16), _mm_srli_epi32(AVX::hi128(tmp), 16))); } template inline void HelperImpl::deinterleave( sfloat_v &_a, sfloat_v &_b, const MemT *m, A align) { float_v &a = reinterpret_cast(_a); float_v &b = reinterpret_cast(_b); HelperImpl::deinterleave(a, b, m, align); } template inline void HelperImpl::deinterleave( double_v &a, double_v &b, const double *m, A align) { a.load(m, align); b.load(m + double_v::Size, align); m256d tmp0 = Mem::shuffle128(a.data(), b.data()); // b1 b0 a1 a0 m256d tmp1 = Mem::shuffle128(a.data(), b.data()); // b3 b2 a3 a2 a.data() = _mm256_unpacklo_pd(tmp0, tmp1); // b2 b0 a2 a0 b.data() = _mm256_unpackhi_pd(tmp0, tmp1); // b3 b1 a3 a1 } template inline void HelperImpl::deinterleave( int_v &a, int_v &b, const int *m, A align) { using Vc::AVX::m256; a.load(m, align); b.load(m + int_v::Size, align); const m256 tmp0 = AVX::avx_cast(Mem::shuffle128(a.data(), b.data())); const m256 tmp1 = AVX::avx_cast(Mem::shuffle128(a.data(), b.data())); const m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1); // b5 b1 b4 b0 a5 a1 a4 a0 const m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1); // b7 b3 b6 b2 a7 a3 a6 a2 a.data() = AVX::avx_cast(_mm256_unpacklo_ps(tmp2, tmp3)); // b6 b4 b2 b0 a6 a4 a2 a0 b.data() = AVX::avx_cast(_mm256_unpackhi_ps(tmp2, tmp3)); // b7 b5 b3 b1 a7 a5 a3 a1 } template inline void HelperImpl::deinterleave( int_v &a, int_v &b, const short *m, A align) { using Vc::AVX::m256i; const m256i tmp = Vc::AVX::VectorHelper::load(m, align); a.data() = Vc::AVX::concat( _mm_srai_epi32(_mm_slli_epi32(AVX::lo128(tmp), 16), 16), _mm_srai_epi32(_mm_slli_epi32(AVX::hi128(tmp), 16), 16)); b.data() = Vc::AVX::concat( _mm_srai_epi32(AVX::lo128(tmp), 16), _mm_srai_epi32(AVX::hi128(tmp), 16)); } template inline void HelperImpl::deinterleave( uint_v &a, uint_v &b, const unsigned int *m, A align) { using Vc::AVX::m256; a.load(m, align); b.load(m + 
uint_v::Size, align); const m256 tmp0 = AVX::avx_cast(Mem::shuffle128(a.data(), b.data())); const m256 tmp1 = AVX::avx_cast(Mem::shuffle128(a.data(), b.data())); const m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1); // b5 b1 b4 b0 a5 a1 a4 a0 const m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1); // b7 b3 b6 b2 a7 a3 a6 a2 a.data() = AVX::avx_cast(_mm256_unpacklo_ps(tmp2, tmp3)); // b6 b4 b2 b0 a6 a4 a2 a0 b.data() = AVX::avx_cast(_mm256_unpackhi_ps(tmp2, tmp3)); // b7 b5 b3 b1 a7 a5 a3 a1 } template inline void HelperImpl::deinterleave( uint_v &a, uint_v &b, const unsigned short *m, A align) { using Vc::AVX::m256i; const m256i tmp = Vc::AVX::VectorHelper::load(m, align); a.data() = Vc::AVX::concat( _mm_srli_epi32(_mm_slli_epi32(AVX::lo128(tmp), 16), 16), _mm_srli_epi32(_mm_slli_epi32(AVX::hi128(tmp), 16), 16)); b.data() = Vc::AVX::concat( _mm_srli_epi32(AVX::lo128(tmp), 16), _mm_srli_epi32(AVX::hi128(tmp), 16)); } template inline void HelperImpl::deinterleave( short_v &a, short_v &b, const short *m, A align) { a.load(m, align); b.load(m + short_v::Size, align); Vc::AVX::deinterleave(a, b); } template inline void HelperImpl::deinterleave( ushort_v &a, ushort_v &b, const unsigned short *m, A align) { a.load(m, align); b.load(m + ushort_v::Size, align); Vc::AVX::deinterleave(a, b); } // only support M == V::EntryType -> no specialization template inline Vc_FLATTEN void HelperImpl::deinterleave(V &VC_RESTRICT a, V &VC_RESTRICT b, V &VC_RESTRICT c, const M *VC_RESTRICT memory, A align) { a.load(&memory[0 * V::Size], align); b.load(&memory[1 * V::Size], align); c.load(&memory[2 * V::Size], align); Vc::AVX::deinterleave(a, b, c); } } // namespace Internal } // namespace Vc /*OUTER_NAMESPACE_END*/ Vc-0.7.4/avx/forceToRegisters.tcc000066400000000000000000000220271233512346000166560ustar00rootroot00000000000000#ifdef __GNUC__ template static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x1) { __asm__ __volatile__(""::"x"(x1.data())); } template static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x1) { __asm__ __volatile__("":"+x"(x1.data())); } template static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x2, const Vector &x1) { __asm__ __volatile__(""::"x"(x2.data()), "x"(x1.data())); } template static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x2, Vector &x1) { __asm__ __volatile__("":"+x"(x2.data()), "+x"(x1.data())); } template static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x3, const Vector &x2, const Vector &x1) { __asm__ __volatile__(""::"x"(x3.data()), "x"(x2.data()), "x"(x1.data())); } template static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x3, Vector &x2, Vector &x1) { __asm__ __volatile__("":"+x"(x3.data()), "+x"(x2.data()), "+x"(x1.data())); } template static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x4, const Vector &x3, const Vector &x2, const Vector &x1) { __asm__ __volatile__(""::"x"(x4.data()), "x"(x3.data()), "x"(x2.data()), "x"(x1.data())); } template static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x4, Vector &x3, Vector &x2, Vector &x1) { __asm__ __volatile__("":"+x"(x4.data()), "+x"(x3.data()), "+x"(x2.data()), "+x"(x1.data())); } template static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x5, const Vector &x4, const Vector &x3, const Vector &x2, const Vector &x1) { __asm__ __volatile__(""::"x"(x5.data()), "x"(x4.data()), "x"(x3.data()), "x"(x2.data()), "x"(x1.data())); } template static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x5, Vector &x4, Vector &x3, Vector &x2, Vector &x1) { __asm__ 
__volatile__("":"+x"(x5.data()), "+x"(x4.data()), "+x"(x3.data()), "+x"(x2.data()), "+x"(x1.data())); } template static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x6, const Vector &x5, const Vector &x4, const Vector &x3, const Vector &x2, const Vector &x1) { __asm__ __volatile__(""::"x"(x6.data()), "x"(x5.data()), "x"(x4.data()), "x"(x3.data()), "x"(x2.data()), "x"(x1.data())); } template static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x6, Vector &x5, Vector &x4, Vector &x3, Vector &x2, Vector &x1) { __asm__ __volatile__("":"+x"(x6.data()), "+x"(x5.data()), "+x"(x4.data()), "+x"(x3.data()), "+x"(x2.data()), "+x"(x1.data())); } template static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x7, const Vector &x6, const Vector &x5, const Vector &x4, const Vector &x3, const Vector &x2, const Vector &x1) { __asm__ __volatile__(""::"x"(x7.data()), "x"(x6.data()), "x"(x5.data()), "x"(x4.data()), "x"(x3.data()), "x"(x2.data()), "x"(x1.data())); } template static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x7, Vector &x6, Vector &x5, Vector &x4, Vector &x3, Vector &x2, Vector &x1) { __asm__ __volatile__("":"+x"(x7.data()), "+x"(x6.data()), "+x"(x5.data()), "+x"(x4.data()), "+x"(x3.data()), "+x"(x2.data()), "+x"(x1.data())); } template static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x8, const Vector &x7, const Vector &x6, const Vector &x5, const Vector &x4, const Vector &x3, const Vector &x2, const Vector &x1) { __asm__ __volatile__(""::"x"(x8.data()), "x"(x7.data()), "x"(x6.data()), "x"(x5.data()), "x"(x4.data()), "x"(x3.data()), "x"(x2.data()), "x"(x1.data())); } template static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x8, Vector &x7, Vector &x6, Vector &x5, Vector &x4, Vector &x3, Vector &x2, Vector &x1) { __asm__ __volatile__("":"+x"(x8.data()), "+x"(x7.data()), "+x"(x6.data()), "+x"(x5.data()), "+x"(x4.data()), "+x"(x3.data()), "+x"(x2.data()), "+x"(x1.data())); } #elif defined(VC_MSVC) #pragma optimize("g", off) template static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x1*/) { } #pragma optimize("g", off) template static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x1*/) { } #pragma optimize("g", on) #pragma optimize("g", off) template static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x2*/, const Vector &/*x1*/) { } #pragma optimize("g", off) template static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x2*/, Vector &/*x1*/) { } #pragma optimize("g", on) #pragma optimize("g", off) template static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x3*/, const Vector &/*x2*/, const Vector &/*x1*/) { } #pragma optimize("g", off) template static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x3*/, Vector &/*x2*/, Vector &/*x1*/) { } #pragma optimize("g", on) #pragma optimize("g", off) template static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x4*/, const Vector &/*x3*/, const Vector &/*x2*/, const Vector &/*x1*/) { } #pragma optimize("g", off) template static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x4*/, Vector &/*x3*/, Vector &/*x2*/, Vector &/*x1*/) { } #pragma optimize("g", on) #pragma optimize("g", off) template static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x5*/, const Vector &/*x4*/, const Vector &/*x3*/, const Vector &/*x2*/, const Vector &/*x1*/) { } #pragma optimize("g", off) template static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x5*/, Vector &/*x4*/, Vector &/*x3*/, Vector &/*x2*/, Vector &/*x1*/) { } #pragma optimize("g", 
on) #pragma optimize("g", off) template static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x6*/, const Vector &/*x5*/, const Vector &/*x4*/, const Vector &/*x3*/, const Vector &/*x2*/, const Vector &/*x1*/) { } #pragma optimize("g", off) template static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x6*/, Vector &/*x5*/, Vector &/*x4*/, Vector &/*x3*/, Vector &/*x2*/, Vector &/*x1*/) { } #pragma optimize("g", on) #pragma optimize("g", off) template static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x7*/, const Vector &/*x6*/, const Vector &/*x5*/, const Vector &/*x4*/, const Vector &/*x3*/, const Vector &/*x2*/, const Vector &/*x1*/) { } #pragma optimize("g", off) template static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x7*/, Vector &/*x6*/, Vector &/*x5*/, Vector &/*x4*/, Vector &/*x3*/, Vector &/*x2*/, Vector &/*x1*/) { } #pragma optimize("g", on) #pragma optimize("g", off) template static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x8*/, const Vector &/*x7*/, const Vector &/*x6*/, const Vector &/*x5*/, const Vector &/*x4*/, const Vector &/*x3*/, const Vector &/*x2*/, const Vector &/*x1*/) { } #pragma optimize("g", off) template static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x8*/, Vector &/*x7*/, Vector &/*x6*/, Vector &/*x5*/, Vector &/*x4*/, Vector &/*x3*/, Vector &/*x2*/, Vector &/*x1*/) { } #pragma optimize("g", on) #else #error "forceToRegisters unsupported on this compiler" #endif Vc-0.7.4/avx/helperimpl.h000066400000000000000000000106661233512346000152120ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2011-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
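
    A note on the forceToRegisters / forceToRegistersDirty helpers in the preceding
    forceToRegisters.tcc: on GCC-compatible compilers they consist of an empty
    inline-assembly statement whose only effect is its operand list. The "x"
    constraint forces each value into an SSE/AVX register and acts as an optimization
    barrier; the "+x" (read-write) variant additionally tells the compiler the value
    may have changed, so cached copies must be reloaded. A minimal stand-alone sketch
    of the idiom (hypothetical helper, for illustration only):

        #include <immintrin.h>

        static inline void force_to_register(__m256 v)
        {
            // no instructions are emitted; the operand list alone keeps v
            // live in a vector register at this point
            __asm__ __volatile__("" :: "x"(v));
        }

    MSVC provides no inline assembly on x86-64, which is why the fallback there merely
    brackets empty functions with #pragma optimize("g", off) and ("g", on).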
*/ #ifndef VC_AVX_HELPERIMPL_H #define VC_AVX_HELPERIMPL_H #include "macros.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace Internal { template<> struct HelperImpl { typedef AVX::Vector float_v; typedef AVX::Vector sfloat_v; typedef AVX::Vector double_v; typedef AVX::Vector int_v; typedef AVX::Vector uint_v; typedef AVX::Vector short_v; typedef AVX::Vector ushort_v; template static void deinterleave(float_v &, float_v &, const float *, A); template static void deinterleave(float_v &, float_v &, const short *, A); template static void deinterleave(float_v &, float_v &, const unsigned short *, A); template static void deinterleave(sfloat_v &, sfloat_v &, const MemT *, A); template static void deinterleave(double_v &, double_v &, const double *, A); template static void deinterleave(int_v &, int_v &, const int *, A); template static void deinterleave(int_v &, int_v &, const short *, A); template static void deinterleave(uint_v &, uint_v &, const unsigned int *, A); template static void deinterleave(uint_v &, uint_v &, const unsigned short *, A); template static void deinterleave(short_v &, short_v &, const short *, A); template static void deinterleave(ushort_v &, ushort_v &, const unsigned short *, A); template static Vc_ALWAYS_INLINE_L void deinterleave(V &VC_RESTRICT a, V &VC_RESTRICT b, V &VC_RESTRICT c, const M *VC_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R; template static Vc_ALWAYS_INLINE_L void deinterleave(V &VC_RESTRICT a, V &VC_RESTRICT b, V &VC_RESTRICT c, V &VC_RESTRICT d, const M *VC_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R; template static Vc_ALWAYS_INLINE_L void deinterleave(V &VC_RESTRICT a, V &VC_RESTRICT b, V &VC_RESTRICT c, V &VC_RESTRICT d, V &VC_RESTRICT e, const M *VC_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R; template static Vc_ALWAYS_INLINE_L void deinterleave(V &VC_RESTRICT a, V &VC_RESTRICT b, V &VC_RESTRICT c, V &VC_RESTRICT d, V &VC_RESTRICT e, V &VC_RESTRICT f, const M *VC_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R; template static Vc_ALWAYS_INLINE_L void deinterleave(V &VC_RESTRICT a, V &VC_RESTRICT b, V &VC_RESTRICT c, V &VC_RESTRICT d, V &VC_RESTRICT e, V &VC_RESTRICT f, V &VC_RESTRICT g, V &VC_RESTRICT h, const M *VC_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void prefetchForOneRead(const void *addr) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void prefetchForModify(const void *addr) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void prefetchClose(const void *addr) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void prefetchMid(const void *addr) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void prefetchFar(const void *addr) Vc_ALWAYS_INLINE_R; template static Vc_ALWAYS_INLINE_L void *malloc(size_t n) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void free(void *p) Vc_ALWAYS_INLINE_R; }; } // namespace Internal } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "deinterleave.tcc" #include "prefetches.tcc" #include "helperimpl.tcc" #include "undomacros.h" #endif // VC_AVX_HELPERIMPL_H Vc-0.7.4/avx/helperimpl.tcc000066400000000000000000000034761233512346000155350ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2011-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef VC_AVX_HELPERIMPL_TCC #define VC_AVX_HELPERIMPL_TCC /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace Internal { template static _VC_CONSTEXPR size_t nextMultipleOf(size_t value) { return (value % X) > 0 ? value + X - (value % X) : value; } template Vc_ALWAYS_INLINE void *HelperImpl::malloc(size_t n) { switch (A) { case Vc::AlignOnVector: return _mm_malloc(nextMultipleOf(n), Vc::AVX::VectorAlignment); case Vc::AlignOnCacheline: // TODO: hardcoding 64 is not such a great idea return _mm_malloc(nextMultipleOf<64>(n), 64); case Vc::AlignOnPage: // TODO: hardcoding 4096 is not such a great idea return _mm_malloc(nextMultipleOf<4096>(n), 4096); default: #ifndef NDEBUG abort(); #endif return _mm_malloc(n, 8); } } Vc_ALWAYS_INLINE void HelperImpl::free(void *p) { _mm_free(p); } } // namespace Internal } // namespace Vc /*OUTER_NAMESPACE_END*/ #endif // VC_AVX_HELPERIMPL_TCC Vc-0.7.4/avx/interleavedmemory.tcc000066400000000000000000001515431233512346000171260ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright (C) 2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
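
    Overview: this file implements Vc's interleaved (array-of-structs) memory access
    for AVX. Given a vector of record indexes i, interleave() stores the k-th lane of
    each argument to consecutive members of record i[k], and deinterleave() gathers
    them back, using unpack/blend shuffles (and masked stores where available) instead
    of one scalar access per member and lane. A scalar sketch of the two-member case
    (illustration only; V stands for a Vc vector type, I for its index vector, and the
    helper name is made up for this sketch):

        template<typename V, typename I>
        void interleave_reference(typename V::EntryType *data, const I &i,
                                  const V &v0, const V &v1)
        {
            for (size_t k = 0; k < V::Size; ++k) {
                data[i[k] + 0] = v0[k];  // member 0 of record i[k]
                data[i[k] + 1] = v1[k];  // member 1 of record i[k]
            }
        }

    deinterleave() is the inverse, v0[k] = data[i[k] + 0] and v1[k] = data[i[k] + 1],
    and likewise for the overloads taking up to eight vectors.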
}}}*/ #ifndef VC_AVX_INTERLEAVEDMEMORY_TCC #define VC_AVX_INTERLEAVEDMEMORY_TCC #include "macros.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace Common { namespace { template struct InterleaveImpl; template struct InterleaveImpl { static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ const typename V::AsArg v0, const typename V::AsArg v1) { const m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v1.data()); const m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v1.data()); #ifdef __x86_64__ const long long tmp00 = _mm_cvtsi128_si64(tmp0); const long long tmp01 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(tmp0, tmp0)); const long long tmp10 = _mm_cvtsi128_si64(tmp1); const long long tmp11 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(tmp1, tmp1)); *reinterpret_cast(&data[i[0]]) = tmp00; *reinterpret_cast(&data[i[1]]) = tmp00 >> 32; *reinterpret_cast(&data[i[2]]) = tmp01; *reinterpret_cast(&data[i[3]]) = tmp01 >> 32; *reinterpret_cast(&data[i[4]]) = tmp10; *reinterpret_cast(&data[i[5]]) = tmp10 >> 32; *reinterpret_cast(&data[i[6]]) = tmp11; *reinterpret_cast(&data[i[7]]) = tmp11 >> 32; #else *reinterpret_cast(&data[i[0]]) = _mm_cvtsi128_si32(tmp0); *reinterpret_cast(&data[i[1]]) = _mm_extract_epi32(tmp0, 1); *reinterpret_cast(&data[i[2]]) = _mm_extract_epi32(tmp0, 2); *reinterpret_cast(&data[i[3]]) = _mm_extract_epi32(tmp0, 3); *reinterpret_cast(&data[i[4]]) = _mm_cvtsi128_si32(tmp1); *reinterpret_cast(&data[i[5]]) = _mm_extract_epi32(tmp1, 1); *reinterpret_cast(&data[i[6]]) = _mm_extract_epi32(tmp1, 2); *reinterpret_cast(&data[i[7]]) = _mm_extract_epi32(tmp1, 3); #endif }/*}}}*/ static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2) { #ifdef VC_USE_MASKMOV_SCATTER const m128i maskLo = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); const m128i maskHi = _mm_set_epi16(0, -1, -1, -1, 0, 0, 0, 0); typename V::EntryType *const dataHi = data - 4; const m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v2.data()); const m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v2.data()); const m128i tmp2 = _mm_unpacklo_epi16(v1.data(), v1.data()); const m128i tmp3 = _mm_unpackhi_epi16(v1.data(), v1.data()); const m128i tmp4 = _mm_unpacklo_epi16(tmp0, tmp2); const m128i tmp5 = _mm_unpackhi_epi16(tmp0, tmp2); const m128i tmp6 = _mm_unpacklo_epi16(tmp1, tmp3); const m128i tmp7 = _mm_unpackhi_epi16(tmp1, tmp3); _mm_maskmoveu_si128(tmp4, maskLo, reinterpret_cast(&data[i[0]])); _mm_maskmoveu_si128(tmp4, maskHi, reinterpret_cast(&dataHi[i[1]])); _mm_maskmoveu_si128(tmp5, maskLo, reinterpret_cast(&data[i[2]])); _mm_maskmoveu_si128(tmp5, maskHi, reinterpret_cast(&dataHi[i[3]])); _mm_maskmoveu_si128(tmp6, maskLo, reinterpret_cast(&data[i[4]])); _mm_maskmoveu_si128(tmp6, maskHi, reinterpret_cast(&dataHi[i[5]])); _mm_maskmoveu_si128(tmp7, maskLo, reinterpret_cast(&data[i[6]])); _mm_maskmoveu_si128(tmp7, maskHi, reinterpret_cast(&dataHi[i[7]])); #else interleave(data, i, v0, v1); v2.scatter(data + 2, i); #endif }/*}}}*/ static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3) { const m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v2.data()); const m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v2.data()); const m128i tmp2 = _mm_unpacklo_epi16(v1.data(), v3.data()); const m128i tmp3 = _mm_unpackhi_epi16(v1.data(), v3.data()); 
const m128i tmp4 = _mm_unpacklo_epi16(tmp0, tmp2); const m128i tmp5 = _mm_unpackhi_epi16(tmp0, tmp2); const m128i tmp6 = _mm_unpacklo_epi16(tmp1, tmp3); const m128i tmp7 = _mm_unpackhi_epi16(tmp1, tmp3); _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[0]]), tmp4); _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[2]]), tmp5); _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[4]]), tmp6); _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[6]]), tmp7); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), _mm_castsi128_ps(tmp4)); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), _mm_castsi128_ps(tmp5)); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[5]]), _mm_castsi128_ps(tmp6)); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[7]]), _mm_castsi128_ps(tmp7)); }/*}}}*/ }; template struct InterleaveImpl { static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ const typename V::AsArg v0, const typename V::AsArg v1) { using namespace Vc::AVX; // [0a 1a 0b 1b 0e 1e 0f 1f]: const m256 tmp0 = _mm256_unpacklo_ps(AVX::avx_cast(v0.data()), AVX::avx_cast(v1.data())); // [0c 1c 0d 1d 0g 1g 0h 1h]: const m256 tmp1 = _mm256_unpackhi_ps(AVX::avx_cast(v0.data()), AVX::avx_cast(v1.data())); _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), lo128(tmp0)); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), lo128(tmp0)); _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), lo128(tmp1)); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), lo128(tmp1)); _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[4]]), hi128(tmp0)); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[5]]), hi128(tmp0)); _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[6]]), hi128(tmp1)); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[7]]), hi128(tmp1)); }/*}}}*/ static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2) { using namespace Vc::AVX; #ifdef VC_USE_MASKMOV_SCATTER // [0a 2a 0b 2b 0e 2e 0f 2f]: const m256 tmp0 = _mm256_unpacklo_ps(AVX::avx_cast(v0.data()), AVX::avx_cast(v2.data())); // [0c 2c 0d 2d 0g 2g 0h 2h]: const m256 tmp1 = _mm256_unpackhi_ps(AVX::avx_cast(v0.data()), AVX::avx_cast(v2.data())); // [1a __ 1b __ 1e __ 1f __]: const m256 tmp2 = _mm256_unpacklo_ps(AVX::avx_cast(v1.data()), AVX::avx_cast(v1.data())); // [1c __ 1d __ 1g __ 1h __]: const m256 tmp3 = _mm256_unpackhi_ps(AVX::avx_cast(v1.data()), AVX::avx_cast(v1.data())); const m256 tmp4 = _mm256_unpacklo_ps(tmp0, tmp2); const m256 tmp5 = _mm256_unpackhi_ps(tmp0, tmp2); const m256 tmp6 = _mm256_unpacklo_ps(tmp1, tmp3); const m256 tmp7 = _mm256_unpackhi_ps(tmp1, tmp3); const m128i mask = _mm_set_epi32(0, -1, -1, -1); _mm_maskstore_ps(reinterpret_cast(&data[i[0]]), mask, lo128(tmp4)); _mm_maskstore_ps(reinterpret_cast(&data[i[1]]), mask, lo128(tmp5)); _mm_maskstore_ps(reinterpret_cast(&data[i[2]]), mask, lo128(tmp6)); _mm_maskstore_ps(reinterpret_cast(&data[i[3]]), mask, lo128(tmp7)); _mm_maskstore_ps(reinterpret_cast(&data[i[4]]), mask, hi128(tmp4)); _mm_maskstore_ps(reinterpret_cast(&data[i[5]]), mask, hi128(tmp5)); _mm_maskstore_ps(reinterpret_cast(&data[i[6]]), mask, hi128(tmp6)); _mm_maskstore_ps(reinterpret_cast(&data[i[7]]), mask, hi128(tmp7)); #else interleave(data, i, v0, v1); v2.scatter(data + 2, i); #endif }/*}}}*/ static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ const typename V::AsArg v0, const typename 
V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3) { using namespace Vc::AVX; const m256 tmp0 = _mm256_unpacklo_ps(AVX::avx_cast(v0.data()), AVX::avx_cast(v2.data())); const m256 tmp1 = _mm256_unpackhi_ps(AVX::avx_cast(v0.data()), AVX::avx_cast(v2.data())); const m256 tmp2 = _mm256_unpacklo_ps(AVX::avx_cast(v1.data()), AVX::avx_cast(v3.data())); const m256 tmp3 = _mm256_unpackhi_ps(AVX::avx_cast(v1.data()), AVX::avx_cast(v3.data())); const m256 tmp4 = _mm256_unpacklo_ps(tmp0, tmp2); const m256 tmp5 = _mm256_unpackhi_ps(tmp0, tmp2); const m256 tmp6 = _mm256_unpacklo_ps(tmp1, tmp3); const m256 tmp7 = _mm256_unpackhi_ps(tmp1, tmp3); _mm_storeu_ps(reinterpret_cast(&data[i[0]]), lo128(tmp4)); _mm_storeu_ps(reinterpret_cast(&data[i[1]]), lo128(tmp5)); _mm_storeu_ps(reinterpret_cast(&data[i[2]]), lo128(tmp6)); _mm_storeu_ps(reinterpret_cast(&data[i[3]]), lo128(tmp7)); _mm_storeu_ps(reinterpret_cast(&data[i[4]]), hi128(tmp4)); _mm_storeu_ps(reinterpret_cast(&data[i[5]]), hi128(tmp5)); _mm_storeu_ps(reinterpret_cast(&data[i[6]]), hi128(tmp6)); _mm_storeu_ps(reinterpret_cast(&data[i[7]]), hi128(tmp7)); }/*}}}*/ }; template struct InterleaveImpl { static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ const typename V::AsArg v0, const typename V::AsArg v1) { using namespace Vc::AVX; const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data()); const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data()); _mm_storeu_pd(&data[i[0]], lo128(tmp0)); _mm_storeu_pd(&data[i[1]], lo128(tmp1)); _mm_storeu_pd(&data[i[2]], hi128(tmp0)); _mm_storeu_pd(&data[i[3]], hi128(tmp1)); }/*}}}*/ static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2) { using namespace Vc::AVX; #ifdef VC_USE_MASKMOV_SCATTER const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data()); const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data()); const m256d tmp2 = _mm256_unpacklo_pd(v2.data(), v2.data()); const m256d tmp3 = _mm256_unpackhi_pd(v2.data(), v2.data()); #if defined(VC_MSVC) && (VC_MSVC < 170000000 || !defined(_WIN64)) // MSVC needs to be at Version 2012 before _mm256_set_epi64x works const m256i mask = AVX::concat(_mm_setallone_si128(), _mm_set_epi32(0, 0, -1, -1)); #else const m256i mask = _mm256_set_epi64x(0, -1, -1, -1); #endif _mm256_maskstore_pd(&data[i[0]], mask, Mem::shuffle128(tmp0, tmp2)); _mm256_maskstore_pd(&data[i[1]], mask, Mem::shuffle128(tmp1, tmp3)); _mm256_maskstore_pd(&data[i[2]], mask, Mem::shuffle128(tmp0, tmp2)); _mm256_maskstore_pd(&data[i[3]], mask, Mem::shuffle128(tmp1, tmp3)); #else interleave(data, i, v0, v1); v2.scatter(data + 2, i); #endif }/*}}}*/ static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3) { using namespace Vc::AVX; // 0a 1a 0c 1c: const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data()); // 0b 1b 0b 1b: const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data()); // 2a 3a 2c 3c: const m256d tmp2 = _mm256_unpacklo_pd(v2.data(), v3.data()); // 2b 3b 2b 3b: const m256d tmp3 = _mm256_unpackhi_pd(v2.data(), v3.data()); _mm256_storeu_pd(&data[i[0]], Mem::shuffle128(tmp0, tmp2)); _mm256_storeu_pd(&data[i[1]], Mem::shuffle128(tmp1, tmp3)); _mm256_storeu_pd(&data[i[2]], Mem::shuffle128(tmp0, tmp2)); _mm256_storeu_pd(&data[i[3]], 
Mem::shuffle128(tmp1, tmp3)); }/*}}}*/ }; } // anonymous namespace template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ const typename V::AsArg v1) { InterleaveImpl::interleave(m_data, m_indexes, v0, v1); }/*}}}*/ template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ const typename V::AsArg v1, const typename V::AsArg v2) { InterleaveImpl::interleave(m_data, m_indexes, v0, v1, v2); }/*}}}*/ template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3) { InterleaveImpl::interleave(m_data, m_indexes, v0, v1, v2, v3); }/*}}}*/ template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4) { InterleaveImpl::interleave(m_data, m_indexes, v0, v1, v2, v3); v4.scatter(m_data + 4, m_indexes); }/*}}}*/ template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5) { InterleaveImpl::interleave(m_data , m_indexes, v0, v1, v2, v3); InterleaveImpl::interleave(m_data + 4, m_indexes, v4, v5); }/*}}}*/ template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5, const typename V::AsArg v6) { InterleaveImpl::interleave(m_data + 0, m_indexes, v0, v1, v2, v3); InterleaveImpl::interleave(m_data + 4, m_indexes, v4, v5, v6); }/*}}}*/ template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5, const typename V::AsArg v6, const typename V::AsArg v7) { InterleaveImpl::interleave(m_data + 0, m_indexes, v0, v1, v2, v3); InterleaveImpl::interleave(m_data + 4, m_indexes, v4, v5, v6, v7); }/*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(float_v &v0, float_v &v1) const/*{{{*/ { const m128 il0 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[0]])); // a0 b0 const m128 il2 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[2]])); // a2 b2 const m128 il4 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[4]])); // a4 b4 const m128 il6 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[6]])); // a6 b6 const m128 il01 = _mm_loadh_pi( il0, reinterpret_cast<__m64 const *>(&m_data[m_indexes[1]])); // a0 b0 a1 b1 const m128 il23 = _mm_loadh_pi( il2, reinterpret_cast<__m64 const *>(&m_data[m_indexes[3]])); // a2 b2 a3 b3 const m128 il45 = _mm_loadh_pi( il4, reinterpret_cast<__m64 const *>(&m_data[m_indexes[5]])); // a4 b4 a5 b5 const m128 il67 = _mm_loadh_pi( il6, reinterpret_cast<__m64 const *>(&m_data[m_indexes[7]])); // a6 b6 a7 b7 const m256 tmp2 = AVX::concat(il01, il45); const m256 tmp3 = AVX::concat(il23, il67); const m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3); const m256 tmp1 = _mm256_unpackhi_ps(tmp2, tmp3); v0.data() = _mm256_unpacklo_ps(tmp0, 
tmp1); v1.data() = _mm256_unpackhi_ps(tmp0, tmp1); }/*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(float_v &v0, float_v &v1, float_v &v2) const/*{{{*/ { const m128 il0 = _mm_loadu_ps(&m_data[m_indexes[0]]); // a0 b0 c0 d0 const m128 il1 = _mm_loadu_ps(&m_data[m_indexes[1]]); // a1 b1 c1 d1 const m128 il2 = _mm_loadu_ps(&m_data[m_indexes[2]]); // a2 b2 c2 d2 const m128 il3 = _mm_loadu_ps(&m_data[m_indexes[3]]); // a3 b3 c3 d3 const m128 il4 = _mm_loadu_ps(&m_data[m_indexes[4]]); // a4 b4 c4 d4 const m128 il5 = _mm_loadu_ps(&m_data[m_indexes[5]]); // a5 b5 c5 d5 const m128 il6 = _mm_loadu_ps(&m_data[m_indexes[6]]); // a6 b6 c6 d6 const m128 il7 = _mm_loadu_ps(&m_data[m_indexes[7]]); // a7 b7 c7 d7 const m256 il04 = AVX::concat(il0, il4); const m256 il15 = AVX::concat(il1, il5); const m256 il26 = AVX::concat(il2, il6); const m256 il37 = AVX::concat(il3, il7); const m256 ab0246 = _mm256_unpacklo_ps(il04, il26); const m256 ab1357 = _mm256_unpacklo_ps(il15, il37); const m256 cd0246 = _mm256_unpackhi_ps(il04, il26); const m256 cd1357 = _mm256_unpackhi_ps(il15, il37); v0.data() = _mm256_unpacklo_ps(ab0246, ab1357); v1.data() = _mm256_unpackhi_ps(ab0246, ab1357); v2.data() = _mm256_unpacklo_ps(cd0246, cd1357); }/*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(float_v &v0, float_v &v1, float_v &v2, float_v &v3) const/*{{{*/ { const m128 il0 = _mm_loadu_ps(&m_data[m_indexes[0]]); // a0 b0 c0 d0 const m128 il1 = _mm_loadu_ps(&m_data[m_indexes[1]]); // a1 b1 c1 d1 const m128 il2 = _mm_loadu_ps(&m_data[m_indexes[2]]); // a2 b2 c2 d2 const m128 il3 = _mm_loadu_ps(&m_data[m_indexes[3]]); // a3 b3 c3 d3 const m128 il4 = _mm_loadu_ps(&m_data[m_indexes[4]]); // a4 b4 c4 d4 const m128 il5 = _mm_loadu_ps(&m_data[m_indexes[5]]); // a5 b5 c5 d5 const m128 il6 = _mm_loadu_ps(&m_data[m_indexes[6]]); // a6 b6 c6 d6 const m128 il7 = _mm_loadu_ps(&m_data[m_indexes[7]]); // a7 b7 c7 d7 const m256 il04 = AVX::concat(il0, il4); const m256 il15 = AVX::concat(il1, il5); const m256 il26 = AVX::concat(il2, il6); const m256 il37 = AVX::concat(il3, il7); const m256 ab0246 = _mm256_unpacklo_ps(il04, il26); const m256 ab1357 = _mm256_unpacklo_ps(il15, il37); const m256 cd0246 = _mm256_unpackhi_ps(il04, il26); const m256 cd1357 = _mm256_unpackhi_ps(il15, il37); v0.data() = _mm256_unpacklo_ps(ab0246, ab1357); v1.data() = _mm256_unpackhi_ps(ab0246, ab1357); v2.data() = _mm256_unpacklo_ps(cd0246, cd1357); v3.data() = _mm256_unpackhi_ps(cd0246, cd1357); }/*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(float_v &v0, float_v &v1, float_v &v2, float_v &v3, float_v &v4) const/*{{{*/ { v4.gather(m_data, m_indexes + I(4)); deinterleave(v0, v1, v2, v3); }/*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(float_v &v0, float_v &v1, float_v &v2, float_v &v3, float_v &v4, float_v &v5) const/*{{{*/ { deinterleave(v0, v1, v2, v3); const m128 il0 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[0] + 4])); // a0 b0 const m128 il2 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[2] + 4])); // a2 b2 const m128 il4 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[4] + 4])); // a4 b4 const m128 il6 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[6] + 4])); // a6 b6 const m128 il01 = _mm_loadh_pi( il0, reinterpret_cast<__m64 const *>(&m_data[m_indexes[1] + 4])); // a0 b0 a1 b1 const m128 il23 = _mm_loadh_pi( il2, 
reinterpret_cast<__m64 const *>(&m_data[m_indexes[3] + 4])); // a2 b2 a3 b3 const m128 il45 = _mm_loadh_pi( il4, reinterpret_cast<__m64 const *>(&m_data[m_indexes[5] + 4])); // a4 b4 a5 b5 const m128 il67 = _mm_loadh_pi( il6, reinterpret_cast<__m64 const *>(&m_data[m_indexes[7] + 4])); // a6 b6 a7 b7 const m256 tmp2 = AVX::concat(il01, il45); const m256 tmp3 = AVX::concat(il23, il67); const m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3); const m256 tmp1 = _mm256_unpackhi_ps(tmp2, tmp3); v4.data() = _mm256_unpacklo_ps(tmp0, tmp1); v5.data() = _mm256_unpackhi_ps(tmp0, tmp1); }/*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(float_v &v0, float_v &v1, float_v &v2, float_v &v3, float_v &v4, float_v &v5, float_v &v6) const/*{{{*/ { deinterleave(v0, v1, v2, v3); const m128 il0 = _mm_loadu_ps(&m_data[m_indexes[0] + 4]); // a0 b0 c0 d0 const m128 il1 = _mm_loadu_ps(&m_data[m_indexes[1] + 4]); // a1 b1 c1 d1 const m128 il2 = _mm_loadu_ps(&m_data[m_indexes[2] + 4]); // a2 b2 c2 d2 const m128 il3 = _mm_loadu_ps(&m_data[m_indexes[3] + 4]); // a3 b3 c3 d3 const m128 il4 = _mm_loadu_ps(&m_data[m_indexes[4] + 4]); // a4 b4 c4 d4 const m128 il5 = _mm_loadu_ps(&m_data[m_indexes[5] + 4]); // a5 b5 c5 d5 const m128 il6 = _mm_loadu_ps(&m_data[m_indexes[6] + 4]); // a6 b6 c6 d6 const m128 il7 = _mm_loadu_ps(&m_data[m_indexes[7] + 4]); // a7 b7 c7 d7 const m256 il04 = AVX::concat(il0, il4); const m256 il15 = AVX::concat(il1, il5); const m256 il26 = AVX::concat(il2, il6); const m256 il37 = AVX::concat(il3, il7); const m256 ab0246 = _mm256_unpacklo_ps(il04, il26); const m256 ab1357 = _mm256_unpacklo_ps(il15, il37); const m256 cd0246 = _mm256_unpackhi_ps(il04, il26); const m256 cd1357 = _mm256_unpackhi_ps(il15, il37); v4.data() = _mm256_unpacklo_ps(ab0246, ab1357); v5.data() = _mm256_unpackhi_ps(ab0246, ab1357); v6.data() = _mm256_unpacklo_ps(cd0246, cd1357); }/*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(float_v &v0, float_v &v1, float_v &v2, float_v &v3, float_v &v4, float_v &v5, float_v &v6, float_v &v7) const/*{{{*/ { deinterleave(v0, v1, v2, v3); const m128 il0 = _mm_loadu_ps(&m_data[m_indexes[0] + 4]); // a0 b0 c0 d0 const m128 il1 = _mm_loadu_ps(&m_data[m_indexes[1] + 4]); // a1 b1 c1 d1 const m128 il2 = _mm_loadu_ps(&m_data[m_indexes[2] + 4]); // a2 b2 c2 d2 const m128 il3 = _mm_loadu_ps(&m_data[m_indexes[3] + 4]); // a3 b3 c3 d3 const m128 il4 = _mm_loadu_ps(&m_data[m_indexes[4] + 4]); // a4 b4 c4 d4 const m128 il5 = _mm_loadu_ps(&m_data[m_indexes[5] + 4]); // a5 b5 c5 d5 const m128 il6 = _mm_loadu_ps(&m_data[m_indexes[6] + 4]); // a6 b6 c6 d6 const m128 il7 = _mm_loadu_ps(&m_data[m_indexes[7] + 4]); // a7 b7 c7 d7 const m256 il04 = AVX::concat(il0, il4); const m256 il15 = AVX::concat(il1, il5); const m256 il26 = AVX::concat(il2, il6); const m256 il37 = AVX::concat(il3, il7); const m256 ab0246 = _mm256_unpacklo_ps(il04, il26); const m256 ab1357 = _mm256_unpacklo_ps(il15, il37); const m256 cd0246 = _mm256_unpackhi_ps(il04, il26); const m256 cd1357 = _mm256_unpackhi_ps(il15, il37); v4.data() = _mm256_unpacklo_ps(ab0246, ab1357); v5.data() = _mm256_unpackhi_ps(ab0246, ab1357); v6.data() = _mm256_unpacklo_ps(cd0246, cd1357); v7.data() = _mm256_unpackhi_ps(cd0246, cd1357); }/*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(sfloat_v &v0, sfloat_v &v1) const/*{{{*/ { const m128 il0 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[0]])); // a0 b0 const m128 il2 = _mm_loadl_pi(_mm_setzero_ps(), 
reinterpret_cast<__m64 const *>(&m_data[m_indexes[2]])); // a2 b2 const m128 il4 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[4]])); // a4 b4 const m128 il6 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[6]])); // a6 b6 const m128 il01 = _mm_loadh_pi( il0, reinterpret_cast<__m64 const *>(&m_data[m_indexes[1]])); // a0 b0 a1 b1 const m128 il23 = _mm_loadh_pi( il2, reinterpret_cast<__m64 const *>(&m_data[m_indexes[3]])); // a2 b2 a3 b3 const m128 il45 = _mm_loadh_pi( il4, reinterpret_cast<__m64 const *>(&m_data[m_indexes[5]])); // a4 b4 a5 b5 const m128 il67 = _mm_loadh_pi( il6, reinterpret_cast<__m64 const *>(&m_data[m_indexes[7]])); // a6 b6 a7 b7 const m256 tmp2 = AVX::concat(il01, il45); const m256 tmp3 = AVX::concat(il23, il67); const m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3); const m256 tmp1 = _mm256_unpackhi_ps(tmp2, tmp3); v0.data() = _mm256_unpacklo_ps(tmp0, tmp1); v1.data() = _mm256_unpackhi_ps(tmp0, tmp1); }/*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2) const/*{{{*/ { const m128 il0 = _mm_loadu_ps(&m_data[m_indexes[0]]); // a0 b0 c0 d0 const m128 il1 = _mm_loadu_ps(&m_data[m_indexes[1]]); // a1 b1 c1 d1 const m128 il2 = _mm_loadu_ps(&m_data[m_indexes[2]]); // a2 b2 c2 d2 const m128 il3 = _mm_loadu_ps(&m_data[m_indexes[3]]); // a3 b3 c3 d3 const m128 il4 = _mm_loadu_ps(&m_data[m_indexes[4]]); // a4 b4 c4 d4 const m128 il5 = _mm_loadu_ps(&m_data[m_indexes[5]]); // a5 b5 c5 d5 const m128 il6 = _mm_loadu_ps(&m_data[m_indexes[6]]); // a6 b6 c6 d6 const m128 il7 = _mm_loadu_ps(&m_data[m_indexes[7]]); // a7 b7 c7 d7 const m256 il04 = AVX::concat(il0, il4); const m256 il15 = AVX::concat(il1, il5); const m256 il26 = AVX::concat(il2, il6); const m256 il37 = AVX::concat(il3, il7); const m256 ab0246 = _mm256_unpacklo_ps(il04, il26); const m256 ab1357 = _mm256_unpacklo_ps(il15, il37); const m256 cd0246 = _mm256_unpackhi_ps(il04, il26); const m256 cd1357 = _mm256_unpackhi_ps(il15, il37); v0.data() = _mm256_unpacklo_ps(ab0246, ab1357); v1.data() = _mm256_unpackhi_ps(ab0246, ab1357); v2.data() = _mm256_unpacklo_ps(cd0246, cd1357); }/*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2, sfloat_v &v3) const/*{{{*/ { const m128 il0 = _mm_loadu_ps(&m_data[m_indexes[0]]); // a0 b0 c0 d0 const m128 il1 = _mm_loadu_ps(&m_data[m_indexes[1]]); // a1 b1 c1 d1 const m128 il2 = _mm_loadu_ps(&m_data[m_indexes[2]]); // a2 b2 c2 d2 const m128 il3 = _mm_loadu_ps(&m_data[m_indexes[3]]); // a3 b3 c3 d3 const m128 il4 = _mm_loadu_ps(&m_data[m_indexes[4]]); // a4 b4 c4 d4 const m128 il5 = _mm_loadu_ps(&m_data[m_indexes[5]]); // a5 b5 c5 d5 const m128 il6 = _mm_loadu_ps(&m_data[m_indexes[6]]); // a6 b6 c6 d6 const m128 il7 = _mm_loadu_ps(&m_data[m_indexes[7]]); // a7 b7 c7 d7 const m256 il04 = AVX::concat(il0, il4); const m256 il15 = AVX::concat(il1, il5); const m256 il26 = AVX::concat(il2, il6); const m256 il37 = AVX::concat(il3, il7); const m256 ab0246 = _mm256_unpacklo_ps(il04, il26); const m256 ab1357 = _mm256_unpacklo_ps(il15, il37); const m256 cd0246 = _mm256_unpackhi_ps(il04, il26); const m256 cd1357 = _mm256_unpackhi_ps(il15, il37); v0.data() = _mm256_unpacklo_ps(ab0246, ab1357); v1.data() = _mm256_unpackhi_ps(ab0246, ab1357); v2.data() = _mm256_unpacklo_ps(cd0246, cd1357); v3.data() = _mm256_unpackhi_ps(cd0246, cd1357); }/*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(sfloat_v &v0, 
sfloat_v &v1, sfloat_v &v2, sfloat_v &v3, sfloat_v &v4) const/*{{{*/ { v4.gather(m_data, m_indexes + I(4)); deinterleave(v0, v1, v2, v3); }/*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2, sfloat_v &v3, sfloat_v &v4, sfloat_v &v5) const/*{{{*/ { deinterleave(v0, v1, v2, v3); const m128 il0 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[0] + 4])); // a0 b0 const m128 il2 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[2] + 4])); // a2 b2 const m128 il4 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[4] + 4])); // a4 b4 const m128 il6 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[6] + 4])); // a6 b6 const m128 il01 = _mm_loadh_pi( il0, reinterpret_cast<__m64 const *>(&m_data[m_indexes[1] + 4])); // a0 b0 a1 b1 const m128 il23 = _mm_loadh_pi( il2, reinterpret_cast<__m64 const *>(&m_data[m_indexes[3] + 4])); // a2 b2 a3 b3 const m128 il45 = _mm_loadh_pi( il4, reinterpret_cast<__m64 const *>(&m_data[m_indexes[5] + 4])); // a4 b4 a5 b5 const m128 il67 = _mm_loadh_pi( il6, reinterpret_cast<__m64 const *>(&m_data[m_indexes[7] + 4])); // a6 b6 a7 b7 const m256 tmp2 = AVX::concat(il01, il45); const m256 tmp3 = AVX::concat(il23, il67); const m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3); const m256 tmp1 = _mm256_unpackhi_ps(tmp2, tmp3); v4.data() = _mm256_unpacklo_ps(tmp0, tmp1); v5.data() = _mm256_unpackhi_ps(tmp0, tmp1); }/*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2, sfloat_v &v3, sfloat_v &v4, sfloat_v &v5, sfloat_v &v6) const/*{{{*/ { deinterleave(v0, v1, v2, v3); const m128 il0 = _mm_loadu_ps(&m_data[m_indexes[0] + 4]); // a0 b0 c0 d0 const m128 il1 = _mm_loadu_ps(&m_data[m_indexes[1] + 4]); // a1 b1 c1 d1 const m128 il2 = _mm_loadu_ps(&m_data[m_indexes[2] + 4]); // a2 b2 c2 d2 const m128 il3 = _mm_loadu_ps(&m_data[m_indexes[3] + 4]); // a3 b3 c3 d3 const m128 il4 = _mm_loadu_ps(&m_data[m_indexes[4] + 4]); // a4 b4 c4 d4 const m128 il5 = _mm_loadu_ps(&m_data[m_indexes[5] + 4]); // a5 b5 c5 d5 const m128 il6 = _mm_loadu_ps(&m_data[m_indexes[6] + 4]); // a6 b6 c6 d6 const m128 il7 = _mm_loadu_ps(&m_data[m_indexes[7] + 4]); // a7 b7 c7 d7 const m256 il04 = AVX::concat(il0, il4); const m256 il15 = AVX::concat(il1, il5); const m256 il26 = AVX::concat(il2, il6); const m256 il37 = AVX::concat(il3, il7); const m256 ab0246 = _mm256_unpacklo_ps(il04, il26); const m256 ab1357 = _mm256_unpacklo_ps(il15, il37); const m256 cd0246 = _mm256_unpackhi_ps(il04, il26); const m256 cd1357 = _mm256_unpackhi_ps(il15, il37); v4.data() = _mm256_unpacklo_ps(ab0246, ab1357); v5.data() = _mm256_unpackhi_ps(ab0246, ab1357); v6.data() = _mm256_unpacklo_ps(cd0246, cd1357); }/*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2, sfloat_v &v3, sfloat_v &v4, sfloat_v &v5, sfloat_v &v6, sfloat_v &v7) const/*{{{*/ { deinterleave(v0, v1, v2, v3); const m128 il0 = _mm_loadu_ps(&m_data[m_indexes[0] + 4]); // a0 b0 c0 d0 const m128 il1 = _mm_loadu_ps(&m_data[m_indexes[1] + 4]); // a1 b1 c1 d1 const m128 il2 = _mm_loadu_ps(&m_data[m_indexes[2] + 4]); // a2 b2 c2 d2 const m128 il3 = _mm_loadu_ps(&m_data[m_indexes[3] + 4]); // a3 b3 c3 d3 const m128 il4 = _mm_loadu_ps(&m_data[m_indexes[4] + 4]); // a4 b4 c4 d4 const m128 il5 = _mm_loadu_ps(&m_data[m_indexes[5] + 4]); // a5 b5 c5 d5 const m128 il6 = 
_mm_loadu_ps(&m_data[m_indexes[6] + 4]); // a6 b6 c6 d6 const m128 il7 = _mm_loadu_ps(&m_data[m_indexes[7] + 4]); // a7 b7 c7 d7 const m256 il04 = AVX::concat(il0, il4); const m256 il15 = AVX::concat(il1, il5); const m256 il26 = AVX::concat(il2, il6); const m256 il37 = AVX::concat(il3, il7); const m256 ab0246 = _mm256_unpacklo_ps(il04, il26); const m256 ab1357 = _mm256_unpacklo_ps(il15, il37); const m256 cd0246 = _mm256_unpackhi_ps(il04, il26); const m256 cd1357 = _mm256_unpackhi_ps(il15, il37); v4.data() = _mm256_unpacklo_ps(ab0246, ab1357); v5.data() = _mm256_unpackhi_ps(ab0246, ab1357); v6.data() = _mm256_unpacklo_ps(cd0246, cd1357); v7.data() = _mm256_unpackhi_ps(cd0246, cd1357); }/*}}}*/ static Vc_ALWAYS_INLINE void _avx_deinterleave_double(const double *VC_RESTRICT data, const uint_v &indexes, double_v &v0, double_v &v1)/*{{{*/ { const m256d ab02 = AVX::concat(_mm_loadu_pd(&data[indexes[0]]), _mm_loadu_pd(&data[indexes[2]])); const m256d ab13 = AVX::concat(_mm_loadu_pd(&data[indexes[1]]), _mm_loadu_pd(&data[indexes[3]])); v0.data() = _mm256_unpacklo_pd(ab02, ab13); v1.data() = _mm256_unpackhi_pd(ab02, ab13); }/*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(double_v &v0, double_v &v1) const/*{{{*/ { _avx_deinterleave_double(m_data , m_indexes, v0, v1); }/*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(double_v &v0, double_v &v1, double_v &v2) const/*{{{*/ { _avx_deinterleave_double(m_data , m_indexes, v0, v1); v2.gather(m_data + 2, m_indexes); }/*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(double_v &v0, double_v &v1, double_v &v2, double_v &v3) const/*{{{*/ { _avx_deinterleave_double(m_data , m_indexes, v0, v1); _avx_deinterleave_double(m_data + 2, m_indexes, v2, v3); }/*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(double_v &v0, double_v &v1, double_v &v2, double_v &v3, double_v &v4) const/*{{{*/ { _avx_deinterleave_double(m_data , m_indexes, v0, v1); _avx_deinterleave_double(m_data + 2, m_indexes, v2, v3); v4.gather(m_data + 4, m_indexes); }/*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(double_v &v0, double_v &v1, double_v &v2, double_v &v3, double_v &v4, double_v &v5) const/*{{{*/ { _avx_deinterleave_double(m_data , m_indexes, v0, v1); _avx_deinterleave_double(m_data + 2, m_indexes, v2, v3); _avx_deinterleave_double(m_data + 4, m_indexes, v4, v5); }/*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(double_v &v0, double_v &v1, double_v &v2, double_v &v3, double_v &v4, double_v &v5, double_v &v6) const/*{{{*/ { _avx_deinterleave_double(m_data , m_indexes, v0, v1); _avx_deinterleave_double(m_data + 2, m_indexes, v2, v3); _avx_deinterleave_double(m_data + 4, m_indexes, v4, v5); v6.gather(m_data + 6, m_indexes); }/*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(double_v &v0, double_v &v1, double_v &v2, double_v &v3, double_v &v4, double_v &v5, double_v &v6, double_v &v7) const/*{{{*/ { _avx_deinterleave_double(m_data , m_indexes, v0, v1); _avx_deinterleave_double(m_data + 2, m_indexes, v2, v3); _avx_deinterleave_double(m_data + 4, m_indexes, v4, v5); _avx_deinterleave_double(m_data + 6, m_indexes, v6, v7); }/*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(short_v &v0, short_v &v1) const {/*{{{*/ const m128i a = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[0]])); const m128i b = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[1]])); const m128i c = 
_mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[2]])); const m128i d = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[3]])); const m128i e = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[4]])); const m128i f = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[5]])); const m128i g = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[6]])); const m128i h = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[7]])); const m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 const m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 const m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 const m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 const m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 const m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); } /*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(short_v &v0, short_v &v1,/*{{{*/ short_v &v2) const { const m128i a = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[0]])); const m128i b = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[1]])); const m128i c = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[2]])); const m128i d = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[3]])); const m128i e = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[4]])); const m128i f = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[5]])); const m128i g = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[6]])); const m128i h = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[7]])); const m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 const m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 const m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 const m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 const m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 const m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 const m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 const m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); }/*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(short_v &v0, short_v &v1,/*{{{*/ short_v &v2, short_v &v3) const { const m128i a = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[0]])); const m128i b = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[1]])); const m128i c = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[2]])); const m128i d = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[3]])); const m128i e = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[4]])); const m128i f = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[5]])); const m128i g = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[6]])); const m128i h = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[7]])); const m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 const m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 const m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 const m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 const m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 const m128i tmp1 = 
_mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 const m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 const m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); v3.data() = _mm_unpackhi_epi16(tmp6, tmp7); }/*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(short_v &v0, short_v &v1,/*{{{*/ short_v &v2, short_v &v3, short_v &v4) const { const m128i a = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[0]])); const m128i b = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[1]])); const m128i c = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[2]])); const m128i d = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[3]])); const m128i e = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[4]])); const m128i f = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[5]])); const m128i g = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[6]])); const m128i h = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[7]])); const m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 const m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 const m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 const m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 const m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4 const m128i tmp11 = _mm_unpackhi_epi16(c, g); // e1 e5 f1 f5 g1 g5 h1 h5 const m128i tmp12 = _mm_unpackhi_epi16(b, f); // e2 e6 f2 f6 g2 g6 h2 h6 const m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7 const m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 const m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 const m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 const m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 const m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6 const m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7 v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); v3.data() = _mm_unpackhi_epi16(tmp6, tmp7); v4.data() = _mm_unpacklo_epi16(tmp8, tmp9); }/*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(short_v &v0, short_v &v1,/*{{{*/ short_v &v2, short_v &v3, short_v &v4, short_v &v5) const { const m128i a = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[0]])); const m128i b = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[1]])); const m128i c = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[2]])); const m128i d = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[3]])); const m128i e = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[4]])); const m128i f = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[5]])); const m128i g = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[6]])); const m128i h = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[7]])); const m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 const m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 const m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 const m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 const m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4 const m128i tmp11 = 
_mm_unpackhi_epi16(c, g); // e1 e5 f1 f5 g1 g5 h1 h5 const m128i tmp12 = _mm_unpackhi_epi16(b, f); // e2 e6 f2 f6 g2 g6 h2 h6 const m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7 const m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 const m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 const m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 const m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 const m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6 const m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7 v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); v3.data() = _mm_unpackhi_epi16(tmp6, tmp7); v4.data() = _mm_unpacklo_epi16(tmp8, tmp9); v5.data() = _mm_unpackhi_epi16(tmp8, tmp9); }/*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(short_v &v0, short_v &v1,/*{{{*/ short_v &v2, short_v &v3, short_v &v4, short_v &v5, short_v &v6) const { const m128i a = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[0]])); const m128i b = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[1]])); const m128i c = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[2]])); const m128i d = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[3]])); const m128i e = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[4]])); const m128i f = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[5]])); const m128i g = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[6]])); const m128i h = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[7]])); const m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 const m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 const m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 const m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 const m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4 const m128i tmp11 = _mm_unpackhi_epi16(c, g); // e1 e5 f1 f5 g1 g5 h1 h5 const m128i tmp12 = _mm_unpackhi_epi16(b, f); // e2 e6 f2 f6 g2 g6 h2 h6 const m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7 const m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 const m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 const m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 const m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 const m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6 const m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7 const m128i tmp14 = _mm_unpackhi_epi16(tmp10, tmp11); // g0 g2 g4 g6 h0 h2 h4 h6 const m128i tmp15 = _mm_unpackhi_epi16(tmp12, tmp13); // g1 g3 g5 g7 h1 h3 h5 h7 v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); v3.data() = _mm_unpackhi_epi16(tmp6, tmp7); v4.data() = _mm_unpacklo_epi16(tmp8, tmp9); v5.data() = _mm_unpackhi_epi16(tmp8, tmp9); v6.data() = _mm_unpacklo_epi16(tmp14, tmp15); }/*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(short_v &v0, short_v &v1,/*{{{*/ short_v &v2, short_v &v3, short_v &v4, short_v &v5, short_v &v6, short_v &v7) const { const m128i a = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[0]])); const m128i b = 
_mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[1]])); const m128i c = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[2]])); const m128i d = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[3]])); const m128i e = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[4]])); const m128i f = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[5]])); const m128i g = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[6]])); const m128i h = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[7]])); const m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 const m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 const m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 const m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 const m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4 const m128i tmp11 = _mm_unpackhi_epi16(c, g); // e1 e5 f1 f5 g1 g5 h1 h5 const m128i tmp12 = _mm_unpackhi_epi16(b, f); // e2 e6 f2 f6 g2 g6 h2 h6 const m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7 const m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 const m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 const m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 const m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 const m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6 const m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7 const m128i tmp14 = _mm_unpackhi_epi16(tmp10, tmp11); // g0 g2 g4 g6 h0 h2 h4 h6 const m128i tmp15 = _mm_unpackhi_epi16(tmp12, tmp13); // g1 g3 g5 g7 h1 h3 h5 h7 v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); v3.data() = _mm_unpackhi_epi16(tmp6, tmp7); v4.data() = _mm_unpacklo_epi16(tmp8, tmp9); v5.data() = _mm_unpackhi_epi16(tmp8, tmp9); v6.data() = _mm_unpacklo_epi16(tmp14, tmp15); v7.data() = _mm_unpackhi_epi16(tmp14, tmp15); }/*}}}*/ // forward types of equal size - ugly, but it works/*{{{*/ #define _forward(V, V2) \ template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1) const { \ reinterpret_cast *>(this)->deinterleave(reinterpret_cast(v0), reinterpret_cast(v1)); \ } \ template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2) const { \ reinterpret_cast *>(this)->deinterleave(reinterpret_cast(v0), reinterpret_cast(v1), \ reinterpret_cast(v2)); \ } \ template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2, V &v3) const { \ reinterpret_cast *>(this)->deinterleave(reinterpret_cast(v0), reinterpret_cast(v1), \ reinterpret_cast(v2), reinterpret_cast(v3)); \ } \ template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2, V &v3, \ V &v4) const { \ reinterpret_cast *>(this)->deinterleave(reinterpret_cast(v0), reinterpret_cast(v1), \ reinterpret_cast(v2), reinterpret_cast(v3), reinterpret_cast(v4)); \ } \ template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2, V &v3, \ V &v4, V &v5) const { \ reinterpret_cast *>(this)->deinterleave(reinterpret_cast(v0), reinterpret_cast(v1), \ reinterpret_cast(v2), reinterpret_cast(v3), reinterpret_cast(v4), \ reinterpret_cast(v5)); \ } \ template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2, V &v3, \ V &v4, V &v5, V &v6) const { 
\ reinterpret_cast *>(this)->deinterleave(reinterpret_cast(v0), reinterpret_cast(v1), \ reinterpret_cast(v2), reinterpret_cast(v3), reinterpret_cast(v4), \ reinterpret_cast(v5), reinterpret_cast(v6)); \ } \ template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2, V &v3, \ V &v4, V &v5, V &v6, V &v7) const { \ reinterpret_cast *>(this)->deinterleave(reinterpret_cast(v0), reinterpret_cast(v1), \ reinterpret_cast(v2), reinterpret_cast(v3), reinterpret_cast(v4), \ reinterpret_cast(v5), reinterpret_cast(v6), reinterpret_cast(v7)); \ } _forward( int_v, float_v) _forward(uint_v, float_v) _forward(ushort_v, short_v) #undef _forward/*}}}*/ } // namespace Common } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "undomacros.h" #endif // VC_AVX_INTERLEAVEDMEMORY_TCC // vim: foldmethod=marker Vc-0.7.4/avx/intrinsics.h000066400000000000000000001075441233512346000152400ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef VC_AVX_INTRINSICS_H #define VC_AVX_INTRINSICS_H #include "../common/windows_fix_intrin.h" #include // see comment in sse/intrinsics.h extern "C" { // AVX #include #if (defined(VC_IMPL_XOP) || defined(VC_IMPL_FMA4)) && !defined(VC_MSVC) #include #endif } #include "../common/fix_clang_emmintrin.h" #if defined(VC_CLANG) && VC_CLANG < 0x30100 // _mm_permute_ps is broken: http://llvm.org/bugs/show_bug.cgi?id=12401 #undef _mm_permute_ps #define _mm_permute_ps(A, C) __extension__ ({ \ m128 __A = (A); \ (m128)__builtin_shufflevector((__v4sf)__A, (__v4sf) _mm_setzero_ps(), \ (C) & 0x3, ((C) & 0xc) >> 2, \ ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6); }) #endif #include "const_data.h" #include "macros.h" #include #if defined(VC_CLANG) || defined(VC_MSVC) || (defined(VC_GCC) && !defined(__OPTIMIZE__)) #define VC_REQUIRES_MACRO_FOR_IMMEDIATE_ARGUMENT #endif #if defined(VC_CLANG) && VC_CLANG <= 0x30000 // _mm_alignr_epi8 doesn't specify its return type, thus breaking overload resolution #undef _mm_alignr_epi8 #define _mm_alignr_epi8(a, b, n) ((m128i)__builtin_ia32_palignr128((a), (b), (n))) #endif /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace AVX { /* super evil hacking around C++ features: * consider * void fun(int); * namespace X { void fun(int); } * namespace X { void bar() { fun(0); } } // this will be a call to X::fun(int) * * void fun(m256); * namespace X { void fun(m256); } * namespace X { void bar() { fun(0); } } // this will be ambiguous because m256 is a * non-fundamental type in the global namespace, thus * adding ::fun(m256) to the candidates * * To make my own overloads of the intrinsics distinct I have to use a type that is inside the * Vc::AVX namespace. To reduce porting effort and increase generality I want to use the same * function names as used in the global namespace. The type name may not be the same, though * because identifiers starting with two underscores are reserved by the standard. 
Thus using * those would mean to depend on undefined behavior. * Sadly a typedef is not enough. * Public inheritance also does not work, because at least ICC considers the __m??? types to be * some sort of fundamental types. * Thus composition is the only solution. */ #ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS template struct Alias { typedef T Base; T _d; Vc_ALWAYS_INLINE operator T &() { return _d; } Vc_ALWAYS_INLINE operator const T &() const { return _d; } Vc_ALWAYS_INLINE Alias() {} Vc_ALWAYS_INLINE Alias(T x) : _d(x) {} Vc_ALWAYS_INLINE Alias(const Alias &x) : _d(x._d) {} Vc_ALWAYS_INLINE Alias &operator=(T x) { _d = x; return *this; } Vc_ALWAYS_INLINE Alias &operator=(const Alias &x) { _d = x._d; return *this; } }; typedef Alias<__m128 > m128 ; typedef Alias<__m128d> m128d; typedef Alias<__m128i> m128i; typedef Alias<__m256 > m256 ; typedef Alias<__m256d> m256d; typedef Alias<__m256i> m256i; #else typedef __m128 m128 ; typedef __m128d m128d; typedef __m128i m128i; typedef __m256 m256 ; typedef __m256d m256d; typedef __m256i m256i; #endif #if defined(VC_UNCONDITIONAL_AVX2_INTRINSICS) && defined(VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN) typedef const m128 & param128 ; typedef const m128d & param128d; typedef const m128i & param128i; typedef const m256 & param256 ; typedef const m256d & param256d; typedef const m256i & param256i; #else typedef const m128 param128 ; typedef const m128d param128d; typedef const m128i param128i; typedef const m256 param256 ; typedef const m256d param256d; typedef const m256i param256i; #endif #ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS // Make use of cast intrinsics easier. But if param256 == const __m256 then these would lead to // ambiguities. static Vc_INTRINSIC m256i Vc_CONST _mm256_castps_si256(param256 a) { return ::_mm256_castps_si256(a); } static Vc_INTRINSIC m256d Vc_CONST _mm256_castps_pd (param256 a) { return ::_mm256_castps_pd (a); } static Vc_INTRINSIC m256i Vc_CONST _mm256_castpd_si256(param256d a) { return ::_mm256_castpd_si256(a); } static Vc_INTRINSIC m256 Vc_CONST _mm256_castpd_ps (param256d a) { return ::_mm256_castpd_ps (a); } static Vc_INTRINSIC m256 Vc_CONST _mm256_castsi256_ps(param256i a) { return ::_mm256_castsi256_ps(a); } static Vc_INTRINSIC m256d Vc_CONST _mm256_castsi256_pd(param256i a) { return ::_mm256_castsi256_pd(a); } #endif #ifdef VC_GCC // Redefine the mul/add/sub intrinsics to use GCC-specific operators instead of builtin // functions. This way the fp-contraction optimization step kicks in and creates FMAs! 
:) static Vc_INTRINSIC Vc_CONST m256d _mm256_mul_pd(m256d a, m256d b) { return static_cast(static_cast<__v4df>(a) * static_cast<__v4df>(b)); } static Vc_INTRINSIC Vc_CONST m256d _mm256_add_pd(m256d a, m256d b) { return static_cast(static_cast<__v4df>(a) + static_cast<__v4df>(b)); } static Vc_INTRINSIC Vc_CONST m256d _mm256_sub_pd(m256d a, m256d b) { return static_cast(static_cast<__v4df>(a) - static_cast<__v4df>(b)); } static Vc_INTRINSIC Vc_CONST m256 _mm256_mul_ps(m256 a, m256 b) { return static_cast(static_cast<__v8sf>(a) * static_cast<__v8sf>(b)); } static Vc_INTRINSIC Vc_CONST m256 _mm256_add_ps(m256 a, m256 b) { return static_cast(static_cast<__v8sf>(a) + static_cast<__v8sf>(b)); } static Vc_INTRINSIC Vc_CONST m256 _mm256_sub_ps(m256 a, m256 b) { return static_cast(static_cast<__v8sf>(a) - static_cast<__v8sf>(b)); } #endif static Vc_INTRINSIC m256 Vc_CONST _mm256_set1_ps (float a) { return ::_mm256_set1_ps (a); } static Vc_INTRINSIC m256d Vc_CONST _mm256_set1_pd (double a) { return ::_mm256_set1_pd (a); } static Vc_INTRINSIC m256i Vc_CONST _mm256_set1_epi32(int a) { return ::_mm256_set1_epi32(a); } //static Vc_INTRINSIC m256i Vc_CONST _mm256_set1_epu32(unsigned int a) { return ::_mm256_set1_epu32(a); } #if defined(VC_GNU_ASM) && !defined(NVALGRIND) static Vc_INTRINSIC m128i Vc_CONST _mm_setallone() { m128i r; __asm__("pcmpeqb %0,%0":"=x"(r)); return r; } #else static Vc_INTRINSIC m128i Vc_CONST _mm_setallone() { m128i r = _mm_setzero_si128(); return _mm_cmpeq_epi8(r, r); } #endif static Vc_INTRINSIC m128i Vc_CONST _mm_setallone_si128() { return _mm_setallone(); } static Vc_INTRINSIC m128d Vc_CONST _mm_setallone_pd() { return _mm_castsi128_pd(_mm_setallone()); } static Vc_INTRINSIC m128 Vc_CONST _mm_setallone_ps() { return _mm_castsi128_ps(_mm_setallone()); } static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epi8 () { return _mm_set1_epi8(1); } static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epu8 () { return _mm_setone_epi8(); } static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epi16() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast(c_general::one16))); } static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epu16() { return _mm_setone_epi16(); } static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epi32() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast(&_IndexesFromZero32[1]))); } #if defined(VC_GNU_ASM) && !defined(NVALGRIND) static Vc_INTRINSIC m256 Vc_CONST _mm256_setallone() { __m256 r; __asm__("vcmpps $8,%0,%0,%0":"=x"(r)); return r; } #elif defined(VC_MSVC) // MSVC puts temporaries of this value on the stack, but sometimes at misaligned addresses, try // some other generator instead... 
static Vc_INTRINSIC m256 Vc_CONST _mm256_setallone() { return _mm256_castsi256_ps(_mm256_set1_epi32(-1)); } #else static Vc_INTRINSIC m256 Vc_CONST _mm256_setallone() { m256 r = _mm256_setzero_ps(); return _mm256_cmp_ps(r, r, _CMP_EQ_UQ); } #endif static Vc_INTRINSIC m256i Vc_CONST _mm256_setallone_si256() { return _mm256_castps_si256(_mm256_setallone()); } static Vc_INTRINSIC m256d Vc_CONST _mm256_setallone_pd() { return _mm256_castps_pd(_mm256_setallone()); } static Vc_INTRINSIC m256 Vc_CONST _mm256_setallone_ps() { return _mm256_setallone(); } static Vc_INTRINSIC m256i Vc_CONST _mm256_setone_epi8 () { return _mm256_set1_epi8(1); } static Vc_INTRINSIC m256i Vc_CONST _mm256_setone_epu8 () { return _mm256_setone_epi8(); } static Vc_INTRINSIC m256i Vc_CONST _mm256_setone_epi16() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast(c_general::one16))); } static Vc_INTRINSIC m256i Vc_CONST _mm256_setone_epu16() { return _mm256_setone_epi16(); } static Vc_INTRINSIC m256i Vc_CONST _mm256_setone_epi32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast(&_IndexesFromZero32[1]))); } static Vc_INTRINSIC m256i Vc_CONST _mm256_setone_epu32() { return _mm256_setone_epi32(); } static Vc_INTRINSIC m256 Vc_CONST _mm256_setone_ps() { return _mm256_broadcast_ss(&c_general::oneFloat); } static Vc_INTRINSIC m256d Vc_CONST _mm256_setone_pd() { return _mm256_broadcast_sd(&c_general::oneDouble); } static Vc_INTRINSIC m256d Vc_CONST _mm256_setabsmask_pd() { return _mm256_broadcast_sd(reinterpret_cast(&c_general::absMaskFloat[0])); } static Vc_INTRINSIC m256 Vc_CONST _mm256_setabsmask_ps() { return _mm256_broadcast_ss(reinterpret_cast(&c_general::absMaskFloat[1])); } static Vc_INTRINSIC m256d Vc_CONST _mm256_setsignmask_pd(){ return _mm256_broadcast_sd(reinterpret_cast(&c_general::signMaskFloat[0])); } static Vc_INTRINSIC m256 Vc_CONST _mm256_setsignmask_ps(){ return _mm256_broadcast_ss(reinterpret_cast(&c_general::signMaskFloat[1])); } static Vc_INTRINSIC m256 Vc_CONST _mm256_set2power31_ps() { return _mm256_broadcast_ss(&c_general::_2power31); } static Vc_INTRINSIC m256i Vc_CONST _mm256_set2power31_epu32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast(&c_general::signMaskFloat[1]))); } //X static Vc_INTRINSIC m256i Vc_CONST _mm256_setmin_epi8 () { return _mm256_slli_epi8 (_mm256_setallone_si256(), 7); } static Vc_INTRINSIC m128i Vc_CONST _mm_setmin_epi16() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast(c_general::minShort))); } static Vc_INTRINSIC m128i Vc_CONST _mm_setmin_epi32() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast(&c_general::signMaskFloat[1]))); } static Vc_INTRINSIC m256i Vc_CONST _mm256_setmin_epi16() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast(c_general::minShort))); } static Vc_INTRINSIC m256i Vc_CONST _mm256_setmin_epi32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast(&c_general::signMaskFloat[1]))); } #ifdef VC_REQUIRES_MACRO_FOR_IMMEDIATE_ARGUMENT #define _mm_extract_epu8 (x, i) (static_cast (_mm_extract_epi8 ((x), (i)))) #define _mm_extract_epu16(x, i) (static_cast(_mm_extract_epi16((x), (i)))) #define _mm_extract_epu32(x, i) (static_cast (_mm_extract_epi32((x), (i)))) #else static Vc_INTRINSIC unsigned char Vc_CONST _mm_extract_epu8(param128i x, const int i) { return _mm_extract_epi8(x, i); } static Vc_INTRINSIC unsigned short Vc_CONST _mm_extract_epu16(param128i x, const int i) { return _mm_extract_epi16(x, i); } static Vc_INTRINSIC unsigned int Vc_CONST 
_mm_extract_epu32(param128i x, const int i) { return _mm_extract_epi32(x, i); } #endif /////////////////////// COMPARE OPS /////////////////////// static Vc_INTRINSIC m256d Vc_CONST _mm256_cmpeq_pd (param256d a, param256d b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ); } static Vc_INTRINSIC m256d Vc_CONST _mm256_cmpneq_pd (param256d a, param256d b) { return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ); } static Vc_INTRINSIC m256d Vc_CONST _mm256_cmplt_pd (param256d a, param256d b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS); } static Vc_INTRINSIC m256d Vc_CONST _mm256_cmpnlt_pd (param256d a, param256d b) { return _mm256_cmp_pd(a, b, _CMP_NLT_US); } static Vc_INTRINSIC m256d Vc_CONST _mm256_cmple_pd (param256d a, param256d b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS); } static Vc_INTRINSIC m256d Vc_CONST _mm256_cmpnle_pd (param256d a, param256d b) { return _mm256_cmp_pd(a, b, _CMP_NLE_US); } static Vc_INTRINSIC m256d Vc_CONST _mm256_cmpord_pd (param256d a, param256d b) { return _mm256_cmp_pd(a, b, _CMP_ORD_Q); } static Vc_INTRINSIC m256d Vc_CONST _mm256_cmpunord_pd(param256d a, param256d b) { return _mm256_cmp_pd(a, b, _CMP_UNORD_Q); } static Vc_INTRINSIC m256 Vc_CONST _mm256_cmpeq_ps (param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); } static Vc_INTRINSIC m256 Vc_CONST _mm256_cmpneq_ps (param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); } static Vc_INTRINSIC m256 Vc_CONST _mm256_cmplt_ps (param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS); } static Vc_INTRINSIC m256 Vc_CONST _mm256_cmpnlt_ps (param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); } static Vc_INTRINSIC m256 Vc_CONST _mm256_cmpge_ps (param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); } static Vc_INTRINSIC m256 Vc_CONST _mm256_cmple_ps (param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_LE_OS); } static Vc_INTRINSIC m256 Vc_CONST _mm256_cmpnle_ps (param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); } static Vc_INTRINSIC m256 Vc_CONST _mm256_cmpgt_ps (param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); } static Vc_INTRINSIC m256 Vc_CONST _mm256_cmpord_ps (param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_ORD_Q); } static Vc_INTRINSIC m256 Vc_CONST _mm256_cmpunord_ps(param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_UNORD_Q); } static Vc_INTRINSIC m128i _mm_cmplt_epu16(param128i a, param128i b) { return _mm_cmplt_epi16(_mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16())); } static Vc_INTRINSIC m128i _mm_cmpgt_epu16(param128i a, param128i b) { return _mm_cmpgt_epi16(_mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16())); } /////////////////////// INTEGER OPS /////////////////////// #define AVX_TO_SSE_2(name) \ static Vc_INTRINSIC m256i Vc_CONST _mm256_##name(param256i a0, param256i b0) { \ m128i a1 = _mm256_extractf128_si256(a0, 1); \ m128i b1 = _mm256_extractf128_si256(b0, 1); \ m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0)); \ m128i r1 = _mm_##name(a1, b1); \ return _mm256_insertf128_si256(_mm256_castsi128_si256(r0), r1, 1); \ } #define AVX_TO_SSE_2_si128_si256(name) \ static Vc_INTRINSIC m256i Vc_CONST _mm256_##name##_si256(param256i a0, param256i b0) { \ m128i a1 = _mm256_extractf128_si256(a0, 1); \ m128i b1 = _mm256_extractf128_si256(b0, 1); \ m128i r0 = _mm_##name##_si128(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0)); \ m128i r1 = _mm_##name##_si128(a1, b1); \ return _mm256_insertf128_si256(_mm256_castsi128_si256(r0), r1, 1); \ } 
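/* [Illustrative sketch added for exposition - not part of the original Vc sources.]
 * Every AVX_TO_SSE_* macro defined above and below expands to the same fallback pattern:
 * plain AVX (without AVX2) provides no 256-bit integer instructions, so a 256-bit integer
 * operation is emulated by splitting each operand into its two 128-bit lanes, running the
 * corresponding SSE instruction on each lane, and recombining the results.  Written out by
 * hand for one example operation (the function name avx1_add_epi16_example is made up; only
 * the intrinsics are real), the generated code looks like this:
 */
static inline __m256i avx1_add_epi16_example(__m256i a0, __m256i b0)
{
    const __m128i aLo = _mm256_castsi256_si128(a0);       // low lane: a pure cast, emits no instruction
    const __m128i bLo = _mm256_castsi256_si128(b0);
    const __m128i aHi = _mm256_extractf128_si256(a0, 1);  // high lane has to be extracted explicitly
    const __m128i bHi = _mm256_extractf128_si256(b0, 1);
    const __m128i rLo = _mm_add_epi16(aLo, bLo);          // two independent SSE2 additions
    const __m128i rHi = _mm_add_epi16(aHi, bHi);
    // widen the low result back to 256 bit and insert the high result into lane 1
    return _mm256_insertf128_si256(_mm256_castsi128_si256(rLo), rHi, 1);
}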
#define AVX_TO_SSE_1(name) \ static Vc_INTRINSIC m256i Vc_CONST _mm256_##name(param256i a0) { \ m128i a1 = _mm256_extractf128_si256(a0, 1); \ m128i r0 = _mm_##name(_mm256_castsi256_si128(a0)); \ m128i r1 = _mm_##name(a1); \ return _mm256_insertf128_si256(_mm256_castsi128_si256(r0), r1, 1); \ } #define AVX_TO_SSE_1i(name) \ static Vc_INTRINSIC m256i Vc_CONST _mm256_##name(param256i a0, const int i) { \ m128i a1 = _mm256_extractf128_si256(a0, 1); \ m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), i); \ m128i r1 = _mm_##name(a1, i); \ return _mm256_insertf128_si256(_mm256_castsi128_si256(r0), r1, 1); \ } AVX_TO_SSE_2(cmplt_epi8) AVX_TO_SSE_2(cmplt_epi16) AVX_TO_SSE_2(cmplt_epi32) AVX_TO_SSE_2(cmpeq_epi8) AVX_TO_SSE_2(cmpeq_epi16) AVX_TO_SSE_2(cmpeq_epi32) AVX_TO_SSE_2(cmpgt_epi8) AVX_TO_SSE_2(cmpgt_epi16) AVX_TO_SSE_2(cmpgt_epi32) // This code is AVX only (without AVX2). We never asked for AVX2 intrinsics. So go away... :) #if defined _mm256_srli_si256 #undef _mm256_srli_si256 #endif #if defined _mm256_slli_si256 #undef _mm256_slli_si256 #endif #if defined _mm256_blend_epi16 #undef _mm256_blend_epi16 #endif static Vc_INTRINSIC m256i Vc_CONST _mm256_srli_si256(param256i a0, const int i) { const m128i vLo = _mm256_castsi256_si128(a0); const m128i vHi = _mm256_extractf128_si256(a0, 1); switch (i) { case 0: return a0; case 1: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 1)), _mm_srli_si128(vHi, 1), 1); case 2: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 2)), _mm_srli_si128(vHi, 2), 1); case 3: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 3)), _mm_srli_si128(vHi, 3), 1); case 4: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 4)), _mm_srli_si128(vHi, 4), 1); case 5: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 5)), _mm_srli_si128(vHi, 5), 1); case 6: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 6)), _mm_srli_si128(vHi, 6), 1); case 7: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 7)), _mm_srli_si128(vHi, 7), 1); case 8: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 8)), _mm_srli_si128(vHi, 8), 1); case 9: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 9)), _mm_srli_si128(vHi, 9), 1); case 10: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 10)), _mm_srli_si128(vHi, 10), 1); case 11: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 11)), _mm_srli_si128(vHi, 11), 1); case 12: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 12)), _mm_srli_si128(vHi, 12), 1); case 13: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 13)), _mm_srli_si128(vHi, 13), 1); case 14: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 14)), _mm_srli_si128(vHi, 14), 1); case 15: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 15)), _mm_srli_si128(vHi, 15), 1); case 16: return _mm256_permute2f128_si256(a0, a0, 0x81); case 17: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 1)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 1)), 0x80); case 18: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 2)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 2)), 0x80); case 19: return 
_mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 3)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 3)), 0x80); case 20: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 4)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 4)), 0x80); case 21: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 5)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 5)), 0x80); case 22: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 6)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 6)), 0x80); case 23: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 7)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 7)), 0x80); case 24: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 8)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 8)), 0x80); case 25: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 9)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 9)), 0x80); case 26: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 10)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 10)), 0x80); case 27: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 11)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 11)), 0x80); case 28: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 12)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 12)), 0x80); case 29: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 13)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 13)), 0x80); case 30: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 14)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 14)), 0x80); case 31: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 15)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 15)), 0x80); } return _mm256_setzero_si256(); } static Vc_INTRINSIC m256i Vc_CONST _mm256_slli_si256(param256i a0, const int i) { const m128i vLo = _mm256_castsi256_si128(a0); const m128i vHi = _mm256_extractf128_si256(a0, 1); switch (i) { case 0: return a0; case 1: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 1)), _mm_alignr_epi8(vHi, vLo, 15), 1); case 2: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 2)), _mm_alignr_epi8(vHi, vLo, 14), 1); case 3: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 3)), _mm_alignr_epi8(vHi, vLo, 13), 1); case 4: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 4)), _mm_alignr_epi8(vHi, vLo, 12), 1); case 5: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 5)), _mm_alignr_epi8(vHi, vLo, 11), 1); case 6: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 6)), _mm_alignr_epi8(vHi, vLo, 10), 1); case 7: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 7)), _mm_alignr_epi8(vHi, vLo, 9), 1); case 8: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 8)), _mm_alignr_epi8(vHi, vLo, 8), 1); case 9: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 9)), _mm_alignr_epi8(vHi, vLo, 7), 1); case 10: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 10)), _mm_alignr_epi8(vHi, vLo, 6), 1); case 11: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 11)), _mm_alignr_epi8(vHi, vLo, 5), 1); case 12: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 
12)), _mm_alignr_epi8(vHi, vLo, 4), 1); case 13: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 13)), _mm_alignr_epi8(vHi, vLo, 3), 1); case 14: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 14)), _mm_alignr_epi8(vHi, vLo, 2), 1); case 15: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 15)), _mm_alignr_epi8(vHi, vLo, 1), 1); case 16: return _mm256_permute2f128_si256(a0, a0, 0x8); case 17: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 1)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 1)), 0x8); case 18: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 2)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 2)), 0x8); case 19: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 3)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 3)), 0x8); case 20: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 4)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 4)), 0x8); case 21: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 5)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 5)), 0x8); case 22: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 6)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 6)), 0x8); case 23: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 7)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 7)), 0x8); case 24: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 8)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 8)), 0x8); case 25: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 9)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 9)), 0x8); case 26: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 10)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 10)), 0x8); case 27: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 11)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 11)), 0x8); case 28: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 12)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 12)), 0x8); case 29: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 13)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 13)), 0x8); case 30: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 14)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 14)), 0x8); case 31: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 15)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 15)), 0x8); } return _mm256_setzero_si256(); } static Vc_INTRINSIC m256i Vc_CONST _mm256_and_si256(param256i x, param256i y) { return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y))); } static Vc_INTRINSIC m256i Vc_CONST _mm256_andnot_si256(param256i x, param256i y) { return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y))); } static Vc_INTRINSIC m256i Vc_CONST _mm256_or_si256(param256i x, param256i y) { return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y))); } static Vc_INTRINSIC m256i Vc_CONST _mm256_xor_si256(param256i x, param256i y) { return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y))); } AVX_TO_SSE_2(packs_epi16) AVX_TO_SSE_2(packs_epi32) AVX_TO_SSE_2(packus_epi16) AVX_TO_SSE_2(unpackhi_epi8) AVX_TO_SSE_2(unpackhi_epi16) AVX_TO_SSE_2(unpackhi_epi32) 
AVX_TO_SSE_2(unpackhi_epi64) AVX_TO_SSE_2(unpacklo_epi8) AVX_TO_SSE_2(unpacklo_epi16) AVX_TO_SSE_2(unpacklo_epi32) AVX_TO_SSE_2(unpacklo_epi64) AVX_TO_SSE_2(add_epi8) AVX_TO_SSE_2(add_epi16) AVX_TO_SSE_2(add_epi32) AVX_TO_SSE_2(add_epi64) AVX_TO_SSE_2(adds_epi8) AVX_TO_SSE_2(adds_epi16) AVX_TO_SSE_2(adds_epu8) AVX_TO_SSE_2(adds_epu16) AVX_TO_SSE_2(sub_epi8) AVX_TO_SSE_2(sub_epi16) AVX_TO_SSE_2(sub_epi32) AVX_TO_SSE_2(sub_epi64) AVX_TO_SSE_2(subs_epi8) AVX_TO_SSE_2(subs_epi16) AVX_TO_SSE_2(subs_epu8) AVX_TO_SSE_2(subs_epu16) AVX_TO_SSE_2(madd_epi16) AVX_TO_SSE_2(mulhi_epi16) AVX_TO_SSE_2(mullo_epi16) AVX_TO_SSE_2(mul_epu32) AVX_TO_SSE_1i(slli_epi16) AVX_TO_SSE_1i(slli_epi32) AVX_TO_SSE_1i(slli_epi64) AVX_TO_SSE_1i(srai_epi16) AVX_TO_SSE_1i(srai_epi32) AVX_TO_SSE_1i(srli_epi16) AVX_TO_SSE_1i(srli_epi32) AVX_TO_SSE_1i(srli_epi64) AVX_TO_SSE_2(sll_epi16) AVX_TO_SSE_2(sll_epi32) AVX_TO_SSE_2(sll_epi64) AVX_TO_SSE_2(sra_epi16) AVX_TO_SSE_2(sra_epi32) AVX_TO_SSE_2(srl_epi16) AVX_TO_SSE_2(srl_epi32) AVX_TO_SSE_2(srl_epi64) AVX_TO_SSE_2(max_epi16) AVX_TO_SSE_2(max_epu8) AVX_TO_SSE_2(min_epi16) AVX_TO_SSE_2(min_epu8) Vc_INTRINSIC int Vc_CONST _mm256_movemask_epi8(param256i a0) { m128i a1 = _mm256_extractf128_si256(a0, 1); return (_mm_movemask_epi8(a1) << 16) | _mm_movemask_epi8(_mm256_castsi256_si128(a0)); } AVX_TO_SSE_2(mulhi_epu16) // shufflehi_epi16 // shufflelo_epi16 (param128i __A, const int __mask) // shuffle_epi32 (param128i __A, const int __mask) // maskmoveu_si128 (param128i __A, param128i __B, char *__C) AVX_TO_SSE_2(avg_epu8) AVX_TO_SSE_2(avg_epu16) AVX_TO_SSE_2(sad_epu8) // stream_si32 (int *__A, int __B) // stream_si128 (param128i *__A, param128i __B) // cvtsi32_si128 (int __A) // cvtsi64_si128 (long long __A) // cvtsi64x_si128 (long long __A) AVX_TO_SSE_2(hadd_epi16) AVX_TO_SSE_2(hadd_epi32) AVX_TO_SSE_2(hadds_epi16) AVX_TO_SSE_2(hsub_epi16) AVX_TO_SSE_2(hsub_epi32) AVX_TO_SSE_2(hsubs_epi16) AVX_TO_SSE_2(maddubs_epi16) AVX_TO_SSE_2(mulhrs_epi16) AVX_TO_SSE_2(shuffle_epi8) AVX_TO_SSE_2(sign_epi8) AVX_TO_SSE_2(sign_epi16) AVX_TO_SSE_2(sign_epi32) // alignr_epi8(param128i __X, param128i __Y, const int __N) AVX_TO_SSE_1(abs_epi8) AVX_TO_SSE_1(abs_epi16) AVX_TO_SSE_1(abs_epi32) #if !defined(VC_REQUIRES_MACRO_FOR_IMMEDIATE_ARGUMENT) m256i Vc_INTRINSIC Vc_CONST _mm256_blend_epi16(param256i a0, param256i b0, const int m) { m128i a1 = _mm256_extractf128_si256(a0, 1); m128i b1 = _mm256_extractf128_si256(b0, 1); m128i r0 = _mm_blend_epi16(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0), m & 0xff); m128i r1 = _mm_blend_epi16(a1, b1, m >> 8); return _mm256_insertf128_si256(_mm256_castsi128_si256(r0), r1, 1); } #else # define _mm256_blend_epi16(a0, b0, m) \ _mm256_insertf128_si256( \ _mm256_castsi128_si256( \ _mm_blend_epi16( \ _mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0), m & 0xff)), \ _mm_blend_epi16(_mm256_extractf128_si256(a0, 1), _mm256_extractf128_si256(b0, 1), m >> 8);, 1) #endif Vc_INTRINSIC m256i Vc_CONST _mm256_blendv_epi8(param256i a0, param256i b0, param256i m0) { m128i a1 = _mm256_extractf128_si256(a0, 1); m128i b1 = _mm256_extractf128_si256(b0, 1); m128i m1 = _mm256_extractf128_si256(m0, 1); m128i r0 = _mm_blendv_epi8(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0), _mm256_castsi256_si128(m0)); m128i r1 = _mm_blendv_epi8(a1, b1, m1); return _mm256_insertf128_si256(_mm256_castsi128_si256(r0), r1, 1); } AVX_TO_SSE_2(cmpeq_epi64) AVX_TO_SSE_2(min_epi8) AVX_TO_SSE_2(max_epi8) AVX_TO_SSE_2(min_epu16) AVX_TO_SSE_2(max_epu16) AVX_TO_SSE_2(min_epi32) 
AVX_TO_SSE_2(max_epi32) AVX_TO_SSE_2(min_epu32) AVX_TO_SSE_2(max_epu32) AVX_TO_SSE_2(mullo_epi32) AVX_TO_SSE_2(mul_epi32) #if !defined(VC_CLANG) || VC_CLANG > 0x30100 // clang is missing _mm_minpos_epu16 from smmintrin.h // http://llvm.org/bugs/show_bug.cgi?id=12399 AVX_TO_SSE_1(minpos_epu16) #endif AVX_TO_SSE_1(cvtepi8_epi32) AVX_TO_SSE_1(cvtepi16_epi32) AVX_TO_SSE_1(cvtepi8_epi64) AVX_TO_SSE_1(cvtepi32_epi64) AVX_TO_SSE_1(cvtepi16_epi64) AVX_TO_SSE_1(cvtepi8_epi16) AVX_TO_SSE_1(cvtepu8_epi32) AVX_TO_SSE_1(cvtepu16_epi32) AVX_TO_SSE_1(cvtepu8_epi64) AVX_TO_SSE_1(cvtepu32_epi64) AVX_TO_SSE_1(cvtepu16_epi64) AVX_TO_SSE_1(cvtepu8_epi16) AVX_TO_SSE_2(packus_epi32) // mpsadbw_epu8 (param128i __X, param128i __Y, const int __M) // stream_load_si128 (param128i *__X) AVX_TO_SSE_2(cmpgt_epi64) //X static Vc_INTRINSIC m256i _mm256_cmplt_epu8 (param256i a, param256i b) { return _mm256_cmplt_epi8 ( //X _mm256_xor_si256(a, _mm256_setmin_epi8 ()), _mm256_xor_si256(b, _mm256_setmin_epi8 ())); } //X static Vc_INTRINSIC m256i _mm256_cmpgt_epu8 (param256i a, param256i b) { return _mm256_cmpgt_epi8 ( //X _mm256_xor_si256(a, _mm256_setmin_epi8 ()), _mm256_xor_si256(b, _mm256_setmin_epi8 ())); } static Vc_INTRINSIC m256i Vc_CONST _mm256_cmplt_epu32(param256i _a, param256i _b) { m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(_mm256_setmin_epi32()))); m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(_mm256_setmin_epi32()))); return _mm256_insertf128_si256(_mm256_castsi128_si256( _mm_cmplt_epi32(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b))), _mm_cmplt_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1)), 1); } static Vc_INTRINSIC m256i Vc_CONST _mm256_cmpgt_epu32(param256i _a, param256i _b) { m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(_mm256_setmin_epi32()))); m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(_mm256_setmin_epi32()))); return _mm256_insertf128_si256(_mm256_castsi128_si256( _mm_cmpgt_epi32(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b))), _mm_cmpgt_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1)), 1); } static Vc_INTRINSIC void _mm256_maskstore(float *mem, const param256 mask, const param256 v) { #ifndef VC_MM256_MASKSTORE_WRONG_MASK_TYPE _mm256_maskstore_ps(mem, _mm256_castps_si256(mask), v); #else _mm256_maskstore_ps(mem, mask, v); #endif } static Vc_INTRINSIC void _mm256_maskstore(double *mem, const param256d mask, const param256d v) { #ifndef VC_MM256_MASKSTORE_WRONG_MASK_TYPE _mm256_maskstore_pd(mem, _mm256_castpd_si256(mask), v); #else _mm256_maskstore_pd(mem, mask, v); #endif } static Vc_INTRINSIC void _mm256_maskstore(int *mem, const param256i mask, const param256i v) { #ifndef VC_MM256_MASKSTORE_WRONG_MASK_TYPE _mm256_maskstore_ps(reinterpret_cast(mem), mask, _mm256_castsi256_ps(v)); #else _mm256_maskstore_ps(reinterpret_cast(mem), _mm256_castsi256_ps(mask), _mm256_castsi256_ps(v)); #endif } static Vc_INTRINSIC void _mm256_maskstore(unsigned int *mem, const param256i mask, const param256i v) { _mm256_maskstore(reinterpret_cast(mem), mask, v); } #if defined(VC_IMPL_FMA4) && defined(VC_CLANG) && VC_CLANG < 0x30300 // clang miscompiles _mm256_macc_ps: http://llvm.org/bugs/show_bug.cgi?id=15040 static Vc_INTRINSIC __m256 my256_macc_ps(__m256 a, __m256 b, __m256 c) { __m256 r; // avoid loading c from memory as that would trigger the bug asm("vfmaddps %[c], %[b], %[a], %[r]" : 
[r]"=x"(r) : [a]"x"(a), [b]"x"(b), [c]"x"(c)); return r; } #ifdef _mm256_macc_ps #undef _mm256_macc_ps #endif #define _mm256_macc_ps(a, b, c) Vc::AVX::my256_macc_ps(a, b, c) static Vc_INTRINSIC __m256d my256_macc_pd(__m256d a, __m256d b, __m256d c) { __m256d r; // avoid loading c from memory as that would trigger the bug asm("vfmaddpd %[c], %[b], %[a], %[r]" : [r]"=x"(r) : [a]"x"(a), [b]"x"(b), [c]"x"(c)); return r; } #ifdef _mm256_macc_pd #undef _mm256_macc_pd #endif #define _mm256_macc_pd(a, b, c) Vc::AVX::my256_macc_pd(a, b, c) #endif } // namespace AVX } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "undomacros.h" #include "shuffle.h" #endif // VC_AVX_INTRINSICS_H Vc-0.7.4/avx/limits.h000066400000000000000000000050611233512346000143430ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2011 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef VC_AVX_LIMITS_H #define VC_AVX_LIMITS_H #include "intrinsics.h" #include "types.h" namespace std { #define _VC_NUM_LIM(T, _max, _min) \ template<> struct numeric_limits< ::Vc::AVX::Vector > : public numeric_limits \ { \ static Vc_INTRINSIC Vc_CONST ::Vc::AVX::Vector max() _VC_NOEXCEPT { return _max; } \ static Vc_INTRINSIC Vc_CONST ::Vc::AVX::Vector min() _VC_NOEXCEPT { return _min; } \ static Vc_INTRINSIC Vc_CONST ::Vc::AVX::Vector lowest() _VC_NOEXCEPT { return min(); } \ static Vc_INTRINSIC Vc_CONST ::Vc::AVX::Vector epsilon() _VC_NOEXCEPT { return ::Vc::AVX::Vector::Zero(); } \ static Vc_INTRINSIC Vc_CONST ::Vc::AVX::Vector round_error() _VC_NOEXCEPT { return ::Vc::AVX::Vector::Zero(); } \ static Vc_INTRINSIC Vc_CONST ::Vc::AVX::Vector infinity() _VC_NOEXCEPT { return ::Vc::AVX::Vector::Zero(); } \ static Vc_INTRINSIC Vc_CONST ::Vc::AVX::Vector quiet_NaN() _VC_NOEXCEPT { return ::Vc::AVX::Vector::Zero(); } \ static Vc_INTRINSIC Vc_CONST ::Vc::AVX::Vector signaling_NaN() _VC_NOEXCEPT { return ::Vc::AVX::Vector::Zero(); } \ static Vc_INTRINSIC Vc_CONST ::Vc::AVX::Vector denorm_min() _VC_NOEXCEPT { return ::Vc::AVX::Vector::Zero(); } \ } #ifndef VC_IMPL_AVX2 namespace { using ::Vc::AVX::_mm256_srli_epi32; } #endif _VC_NUM_LIM(unsigned short, ::Vc::AVX::_mm_setallone_si128(), _mm_setzero_si128()); _VC_NUM_LIM( short, _mm_srli_epi16(::Vc::AVX::_mm_setallone_si128(), 1), ::Vc::AVX::_mm_setmin_epi16()); _VC_NUM_LIM( unsigned int, ::Vc::AVX::_mm256_setallone_si256(), _mm256_setzero_si256()); _VC_NUM_LIM( int, _mm256_srli_epi32(::Vc::AVX::_mm256_setallone_si256(), 1), ::Vc::AVX::_mm256_setmin_epi32()); #undef _VC_NUM_LIM } // namespace std #endif // VC_AVX_LIMITS_H Vc-0.7.4/avx/macros.h000066400000000000000000000015631233512346000143310ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2011 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #include "../common/macros.h" #ifndef VC_AVX_MACROS_H #define VC_AVX_MACROS_H #undef VC_AVX_UNDOMACROS_H #endif // VC_AVX_MACROS_H Vc-0.7.4/avx/mask.h000066400000000000000000000310641233512346000137770ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef VC_AVX_MASK_H #define VC_AVX_MASK_H #include "intrinsics.h" #include "../common/bitscanintrinsics.h" #include "macros.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace AVX { template class Mask { friend class Mask<4u, 32u>; // double_v friend class Mask<8u, 32u>; // float_v, (u)int_v friend class Mask<8u, 16u>; // (u)short_v friend class Mask<16u, 16u>; // (u)char_v public: FREE_STORE_OPERATORS_ALIGNED(32) // abstracts the way Masks are passed to functions, it can easily be changed to const ref here #if defined VC_MSVC && defined _WIN32 typedef const Mask &AsArg; #else typedef Mask AsArg; #endif Vc_ALWAYS_INLINE Mask() {} Vc_ALWAYS_INLINE Mask(param256 x) : k(x) {} Vc_ALWAYS_INLINE Mask(param256d x) : k(_mm256_castpd_ps(x)) {} Vc_ALWAYS_INLINE Mask(param256i x) : k(_mm256_castsi256_ps(x)) {} #ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS Vc_ALWAYS_INLINE Mask(__m256 x) : k(x) {} Vc_ALWAYS_INLINE Mask(__m256d x) : k(_mm256_castpd_ps(x)) {} Vc_ALWAYS_INLINE Mask(__m256i x) : k(_mm256_castsi256_ps(x)) {} #endif Vc_ALWAYS_INLINE explicit Mask(VectorSpecialInitializerZero::ZEnum) : k(_mm256_setzero_ps()) {} Vc_ALWAYS_INLINE explicit Mask(VectorSpecialInitializerOne::OEnum) : k(_mm256_setallone_ps()) {} Vc_ALWAYS_INLINE explicit Mask(bool b) : k(b ? 
_mm256_setallone_ps() : m256(_mm256_setzero_ps())) {} Vc_ALWAYS_INLINE Mask(const Mask &rhs) : k(rhs.k) {} Vc_ALWAYS_INLINE Mask(const Mask &rhs) : k(avx_cast(concat( _mm_unpacklo_epi16(rhs.dataI(), rhs.dataI()), _mm_unpackhi_epi16(rhs.dataI(), rhs.dataI())))) {} Vc_ALWAYS_INLINE_L Mask(const Mask &m) Vc_ALWAYS_INLINE_R; Vc_ALWAYS_INLINE_L Mask(const Mask &m) Vc_ALWAYS_INLINE_R; Vc_ALWAYS_INLINE bool operator==(const Mask &rhs) const { return 0 != _mm256_testc_ps(k, rhs.k); } Vc_ALWAYS_INLINE bool operator!=(const Mask &rhs) const { return 0 == _mm256_testc_ps(k, rhs.k); } Vc_ALWAYS_INLINE Mask operator!() const { return _mm256_andnot_ps(data(), _mm256_setallone_ps()); } Vc_ALWAYS_INLINE Mask &operator&=(const Mask &rhs) { k = _mm256_and_ps(k, rhs.k); return *this; } Vc_ALWAYS_INLINE Mask &operator|=(const Mask &rhs) { k = _mm256_or_ps (k, rhs.k); return *this; } Vc_ALWAYS_INLINE Mask &operator^=(const Mask &rhs) { k = _mm256_xor_ps(k, rhs.k); return *this; } // no need for expression template optimizations because cmp(n)eq for floats are not bitwise // compares Vc_ALWAYS_INLINE bool isFull () const { return 0 != _mm256_testc_ps(k, _mm256_setallone_ps()); } Vc_ALWAYS_INLINE bool isEmpty() const { return 0 != _mm256_testz_ps(k, k); } Vc_ALWAYS_INLINE bool isMix () const { return 0 != _mm256_testnzc_ps(k, _mm256_setallone_ps()); } #ifndef VC_NO_AUTOMATIC_BOOL_FROM_MASK Vc_ALWAYS_INLINE operator bool() const { return isFull(); } #endif Vc_ALWAYS_INLINE_L Vc_PURE_L int shiftMask() const Vc_ALWAYS_INLINE_R Vc_PURE_R; Vc_ALWAYS_INLINE_L Vc_PURE_L int toInt() const Vc_ALWAYS_INLINE_R Vc_PURE_R; Vc_ALWAYS_INLINE m256 data () const { return k; } Vc_ALWAYS_INLINE m256i dataI() const { return _mm256_castps_si256(k); } Vc_ALWAYS_INLINE m256d dataD() const { return _mm256_castps_pd(k); } Vc_ALWAYS_INLINE_L Vc_PURE_L bool operator[](int index) const Vc_ALWAYS_INLINE_R Vc_PURE_R; Vc_ALWAYS_INLINE_L Vc_PURE_L int count() const Vc_ALWAYS_INLINE_R Vc_PURE_R; Vc_ALWAYS_INLINE_L Vc_PURE_L int firstOne() const Vc_ALWAYS_INLINE_R Vc_PURE_R; private: #ifdef VC_COMPILE_BENCHMARKS public: #endif m256 k; }; template class Mask { friend class Mask<4u, 32u>; // double_v friend class Mask<8u, 32u>; // float_v, (u)int_v friend class Mask<8u, 16u>; // (u)short_v friend class Mask<16u, 16u>; // (u)char_v public: FREE_STORE_OPERATORS_ALIGNED(16) // abstracts the way Masks are passed to functions, it can easily be changed to const ref here #if defined VC_MSVC && defined _WIN32 typedef const Mask &AsArg; #else typedef Mask AsArg; #endif Vc_ALWAYS_INLINE Mask() {} Vc_ALWAYS_INLINE Mask(param128 x) : k(x) {} Vc_ALWAYS_INLINE Mask(param128d x) : k(_mm_castpd_ps(x)) {} Vc_ALWAYS_INLINE Mask(param128i x) : k(_mm_castsi128_ps(x)) {} #ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS Vc_ALWAYS_INLINE Mask(__m128 x) : k(x) {} Vc_ALWAYS_INLINE Mask(__m128d x) : k(_mm_castpd_ps(x)) {} Vc_ALWAYS_INLINE Mask(__m128i x) : k(_mm_castsi128_ps(x)) {} #endif Vc_ALWAYS_INLINE explicit Mask(VectorSpecialInitializerZero::ZEnum) : k(_mm_setzero_ps()) {} Vc_ALWAYS_INLINE explicit Mask(VectorSpecialInitializerOne::OEnum) : k(_mm_setallone_ps()) {} Vc_ALWAYS_INLINE explicit Mask(bool b) : k(b ? 
_mm_setallone_ps() : m128(_mm_setzero_ps())) {} Vc_ALWAYS_INLINE Mask(const Mask &rhs) : k(rhs.k) {} Vc_ALWAYS_INLINE Mask(const Mask &rhs) : k(avx_cast( _mm_packs_epi32(avx_cast(rhs.data()), _mm256_extractf128_si256(rhs.dataI(), 1)))) {} Vc_ALWAYS_INLINE Mask(const Mask *a) : k(avx_cast( _mm_packs_epi16(a[0].dataI(), a[1].dataI()))) {} Vc_ALWAYS_INLINE bool operator==(const Mask &rhs) const { return 0 != _mm_testc_si128(dataI(), rhs.dataI()); } Vc_ALWAYS_INLINE bool operator!=(const Mask &rhs) const { return 0 == _mm_testc_si128(dataI(), rhs.dataI()); } Vc_ALWAYS_INLINE Mask operator!() const { return _mm_andnot_ps(data(), _mm_setallone_ps()); } Vc_ALWAYS_INLINE Mask &operator&=(const Mask &rhs) { k = _mm_and_ps(k, rhs.k); return *this; } Vc_ALWAYS_INLINE Mask &operator|=(const Mask &rhs) { k = _mm_or_ps (k, rhs.k); return *this; } Vc_ALWAYS_INLINE Mask &operator^=(const Mask &rhs) { k = _mm_xor_ps(k, rhs.k); return *this; } // TODO: use expression templates to optimize (v1 == v2).isFull() and friends Vc_ALWAYS_INLINE bool isFull () const { return 0 != _mm_testc_si128(dataI(), _mm_setallone_si128()); } Vc_ALWAYS_INLINE bool isEmpty() const { return 0 != _mm_testz_si128(dataI(), dataI()); } Vc_ALWAYS_INLINE bool isMix () const { return 0 != _mm_testnzc_si128(dataI(), _mm_setallone_si128()); } #ifndef VC_NO_AUTOMATIC_BOOL_FROM_MASK Vc_ALWAYS_INLINE operator bool() const { return isFull(); } #endif Vc_ALWAYS_INLINE_L Vc_PURE_L int shiftMask() const Vc_ALWAYS_INLINE_R Vc_PURE_R; Vc_ALWAYS_INLINE_L Vc_PURE_L int toInt() const Vc_ALWAYS_INLINE_R Vc_PURE_R; Vc_ALWAYS_INLINE m128 data () const { return k; } Vc_ALWAYS_INLINE m128i dataI() const { return avx_cast(k); } Vc_ALWAYS_INLINE m128d dataD() const { return avx_cast(k); } Vc_ALWAYS_INLINE_L Vc_PURE_L bool operator[](int index) const Vc_ALWAYS_INLINE_R Vc_PURE_R; Vc_ALWAYS_INLINE_L Vc_PURE_L int count() const Vc_ALWAYS_INLINE_R Vc_PURE_R; Vc_ALWAYS_INLINE_L Vc_PURE_L int firstOne() const Vc_ALWAYS_INLINE_R Vc_PURE_R; private: #ifdef VC_COMPILE_BENCHMARKS public: #endif m128 k; }; struct ForeachHelper { size_t mask; bool brk; bool outerBreak; Vc_ALWAYS_INLINE ForeachHelper(size_t _mask) : mask(_mask), brk(false), outerBreak(false) {} Vc_ALWAYS_INLINE bool outer() const { return mask != 0 && !outerBreak; } Vc_ALWAYS_INLINE bool inner() { return (brk = !brk); } Vc_ALWAYS_INLINE void noBreak() { outerBreak = false; } Vc_ALWAYS_INLINE size_t next() { outerBreak = true; #ifdef VC_GNU_ASM const size_t bit = __builtin_ctzl(mask); __asm__("btr %1,%0" : "+r"(mask) : "r"(bit)); #else #ifdef VC_MSVC #pragma warning(suppress : 4267) // conversion from 'size_t' to 'unsigned long', possible loss of data #endif const size_t bit = _bit_scan_forward(mask); mask &= ~(1 << bit); #endif return bit; } }; #define Vc_foreach_bit(_it_, _mask_) \ for (Vc::AVX::ForeachHelper Vc__make_unique(foreach_bit_obj)((_mask_).toInt()); Vc__make_unique(foreach_bit_obj).outer(); ) \ for (_it_ = Vc__make_unique(foreach_bit_obj).next(); Vc__make_unique(foreach_bit_obj).inner(); Vc__make_unique(foreach_bit_obj).noBreak()) // Operators namespace Intrinsics { static Vc_ALWAYS_INLINE Vc_PURE m256 and_(param256 a, param256 b) { return _mm256_and_ps(a, b); } static Vc_ALWAYS_INLINE Vc_PURE m256 or_(param256 a, param256 b) { return _mm256_or_ps(a, b); } static Vc_ALWAYS_INLINE Vc_PURE m256 xor_(param256 a, param256 b) { return _mm256_xor_ps(a, b); } static Vc_ALWAYS_INLINE Vc_PURE m128 and_(param128 a, param128 b) { return _mm_and_ps(a, b); } static Vc_ALWAYS_INLINE Vc_PURE m128 
or_(param128 a, param128 b) { return _mm_or_ps(a, b); } static Vc_ALWAYS_INLINE Vc_PURE m128 xor_(param128 a, param128 b) { return _mm_xor_ps(a, b); } } // namespace Intrinsics // binary and/or/xor cannot work with one operand larger than the other template void operator&(const Mask &l, const Mask &r); template void operator|(const Mask &l, const Mask &r); template void operator^(const Mask &l, const Mask &r); // let binary and/or/xor work for any combination of masks (as long as they have the same sizeof) template Vc_ALWAYS_INLINE Vc_PURE Mask operator&(const Mask &l, const Mask &r) { return Intrinsics::and_(l.data(), r.data()); } template Vc_ALWAYS_INLINE Vc_PURE Mask operator|(const Mask &l, const Mask &r) { return Intrinsics:: or_(l.data(), r.data()); } template Vc_ALWAYS_INLINE Vc_PURE Mask operator^(const Mask &l, const Mask &r) { return Intrinsics::xor_(l.data(), r.data()); } // disable logical and/or for incompatible masks template void operator&&(const Mask &lhs, const Mask &rhs); template void operator||(const Mask &lhs, const Mask &rhs); // logical and/or for compatible masks template Vc_ALWAYS_INLINE Vc_PURE Mask operator&&(const Mask &lhs, const Mask &rhs) { return lhs && static_cast >(rhs); } template Vc_ALWAYS_INLINE Vc_PURE Mask operator||(const Mask &lhs, const Mask &rhs) { return lhs || static_cast >(rhs); } template Vc_ALWAYS_INLINE Vc_PURE Mask operator&&(const Mask &lhs, const Mask &rhs) { return Intrinsics::and_(lhs.data(), rhs.data()); } template Vc_ALWAYS_INLINE Vc_PURE Mask operator||(const Mask &lhs, const Mask &rhs) { return Intrinsics::or_ (lhs.data(), rhs.data()); } } // namespace AVX } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "mask.tcc" #include "undomacros.h" #endif // VC_AVX_MASK_H Vc-0.7.4/avx/mask.tcc000066400000000000000000000065411233512346000143230ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2011-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
*/ /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace AVX { template<> Vc_ALWAYS_INLINE Mask<4, 32>::Mask(const Mask<8, 32> &m) : k(concat(_mm_unpacklo_ps(lo128(m.data()), lo128(m.data())), _mm_unpackhi_ps(lo128(m.data()), lo128(m.data())))) { } template<> Vc_ALWAYS_INLINE Mask<8, 32>::Mask(const Mask<4, 32> &m) // aabb ccdd -> abcd 0000 : k(concat(Mem::shuffle(lo128(m.data()), hi128(m.data())), _mm_setzero_ps())) { } template Vc_ALWAYS_INLINE Vc_PURE int Mask::shiftMask() const { return _mm256_movemask_epi8(dataI()); } template Vc_ALWAYS_INLINE Vc_PURE int Mask::shiftMask() const { return _mm_movemask_epi8(dataI()); } template<> Vc_ALWAYS_INLINE Vc_PURE int Mask< 4, 32>::toInt() const { return _mm256_movemask_pd(dataD()); } template<> Vc_ALWAYS_INLINE Vc_PURE int Mask< 8, 32>::toInt() const { return _mm256_movemask_ps(data ()); } template<> Vc_ALWAYS_INLINE Vc_PURE int Mask< 8, 16>::toInt() const { return _mm_movemask_epi8(_mm_packs_epi16(dataI(), _mm_setzero_si128())); } template<> Vc_ALWAYS_INLINE Vc_PURE int Mask<16, 16>::toInt() const { return _mm_movemask_epi8(dataI()); } template<> Vc_ALWAYS_INLINE Vc_PURE bool Mask< 4, 32>::operator[](int index) const { return toInt() & (1 << index); } template<> Vc_ALWAYS_INLINE Vc_PURE bool Mask< 8, 32>::operator[](int index) const { return toInt() & (1 << index); } template<> Vc_ALWAYS_INLINE Vc_PURE bool Mask< 8, 16>::operator[](int index) const { return shiftMask() & (1 << 2 * index); } template<> Vc_ALWAYS_INLINE Vc_PURE bool Mask<16, 16>::operator[](int index) const { return toInt() & (1 << index); } #ifndef VC_IMPL_POPCNT static Vc_ALWAYS_INLINE Vc_CONST unsigned int _mm_popcnt_u32(unsigned int n) { n = (n & 0x55555555U) + ((n >> 1) & 0x55555555U); n = (n & 0x33333333U) + ((n >> 2) & 0x33333333U); n = (n & 0x0f0f0f0fU) + ((n >> 4) & 0x0f0f0f0fU); //n = (n & 0x00ff00ffU) + ((n >> 8) & 0x00ff00ffU); //n = (n & 0x0000ffffU) + ((n >>16) & 0x0000ffffU); return n; } #endif template Vc_ALWAYS_INLINE Vc_PURE int Mask::count() const { return _mm_popcnt_u32(toInt()); } template Vc_ALWAYS_INLINE Vc_PURE int Mask::count() const { return _mm_popcnt_u32(toInt()); } template Vc_ALWAYS_INLINE Vc_PURE int Mask::firstOne() const { return _bit_scan_forward(toInt()); } template Vc_ALWAYS_INLINE Vc_PURE int Mask::firstOne() const { return _bit_scan_forward(toInt()); } } // namespace AVX } // namespace Vc /*OUTER_NAMESPACE_END*/ Vc-0.7.4/avx/math.h000066400000000000000000000126261233512346000140000ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
*/ #ifndef VC_AVX_MATH_H #define VC_AVX_MATH_H #include "const.h" #include "limits.h" #include "macros.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace AVX { /** * splits \p v into exponent and mantissa, the sign is kept with the mantissa * * The return value will be in the range [0.5, 1.0[ * The \p e value will be an integer defining the power-of-two exponent */ inline double_v frexp(double_v::AsArg v, int_v *e) { const m256d exponentBits = Const::exponentMask().dataD(); const m256d exponentPart = _mm256_and_pd(v.data(), exponentBits); e->data() = _mm256_sub_epi32(_mm256_srli_epi64(avx_cast(exponentPart), 52), _mm256_set1_epi32(0x3fe)); const m256d exponentMaximized = _mm256_or_pd(v.data(), exponentBits); double_v ret = _mm256_and_pd(exponentMaximized, _mm256_broadcast_sd(reinterpret_cast(&c_general::frexpMask))); double_m zeroMask = v == double_v::Zero(); ret(isnan(v) || !isfinite(v) || zeroMask) = v; e->setZero(zeroMask.data()); return ret; } inline float_v frexp(float_v::AsArg v, int_v *e) { const m256 exponentBits = Const::exponentMask().data(); const m256 exponentPart = _mm256_and_ps(v.data(), exponentBits); e->data() = _mm256_sub_epi32(_mm256_srli_epi32(avx_cast(exponentPart), 23), _mm256_set1_epi32(0x7e)); const m256 exponentMaximized = _mm256_or_ps(v.data(), exponentBits); float_v ret = _mm256_and_ps(exponentMaximized, avx_cast(_mm256_set1_epi32(0xbf7fffffu))); ret(isnan(v) || !isfinite(v) || v == float_v::Zero()) = v; e->setZero(v == float_v::Zero()); return ret; } inline sfloat_v frexp(sfloat_v::AsArg v, short_v *e) { const m256 exponentBits = Const::exponentMask().data(); const m256 exponentPart = _mm256_and_ps(v.data(), exponentBits); e->data() = _mm_sub_epi16(_mm_packs_epi32(_mm_srli_epi32(avx_cast(exponentPart), 23), _mm_srli_epi32(avx_cast(hi128(exponentPart)), 23)), _mm_set1_epi16(0x7e)); const m256 exponentMaximized = _mm256_or_ps(v.data(), exponentBits); sfloat_v ret = _mm256_and_ps(exponentMaximized, avx_cast(_mm256_set1_epi32(0xbf7fffffu))); ret(isnan(v) || !isfinite(v) || v == sfloat_v::Zero()) = v; e->setZero(v == sfloat_v::Zero()); return ret; } /* -> x * 2^e * x == NaN -> NaN * x == (-)inf -> (-)inf */ inline double_v ldexp(double_v::AsArg v, int_v::AsArg _e) { int_v e = _e; e.setZero((v == double_v::Zero()).dataI()); const m256i exponentBits = _mm256_slli_epi64(e.data(), 52); return avx_cast(_mm256_add_epi64(avx_cast(v.data()), exponentBits)); } inline float_v ldexp(float_v::AsArg v, int_v::AsArg _e) { int_v e = _e; e.setZero(static_cast(v == float_v::Zero())); return (v.reinterpretCast() + (e << 23)).reinterpretCast(); } inline sfloat_v ldexp(sfloat_v::AsArg v, short_v::AsArg _e) { short_v e = _e; e.setZero(static_cast(v == sfloat_v::Zero())); e = e << (23 - 16); const m256i exponentBits = concat(_mm_unpacklo_epi16(_mm_setzero_si128(), e.data()), _mm_unpackhi_epi16(_mm_setzero_si128(), e.data())); return (v.reinterpretCast() + int_v(exponentBits)).reinterpretCast(); } static Vc_ALWAYS_INLINE float_v trunc( float_v::AsArg v) { return _mm256_round_ps(v.data(), 0x3); } static Vc_ALWAYS_INLINE sfloat_v trunc(sfloat_v::AsArg v) { return _mm256_round_ps(v.data(), 0x3); } static Vc_ALWAYS_INLINE double_v trunc(double_v::AsArg v) { return _mm256_round_pd(v.data(), 0x3); } static Vc_ALWAYS_INLINE float_v floor(float_v::AsArg v) { return _mm256_floor_ps(v.data()); } static Vc_ALWAYS_INLINE sfloat_v floor(sfloat_v::AsArg v) { return _mm256_floor_ps(v.data()); } static Vc_ALWAYS_INLINE double_v floor(double_v::AsArg v) { return _mm256_floor_pd(v.data()); } static 
Vc_ALWAYS_INLINE float_v ceil(float_v::AsArg v) { return _mm256_ceil_ps(v.data()); } static Vc_ALWAYS_INLINE sfloat_v ceil(sfloat_v::AsArg v) { return _mm256_ceil_ps(v.data()); } static Vc_ALWAYS_INLINE double_v ceil(double_v::AsArg v) { return _mm256_ceil_pd(v.data()); } } // namespace AVX } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "undomacros.h" #define VC__USE_NAMESPACE AVX #include "../common/trigonometric.h" #define VC__USE_NAMESPACE AVX #include "../common/logarithm.h" #define VC__USE_NAMESPACE AVX #include "../common/exponential.h" #undef VC__USE_NAMESPACE #endif // VC_AVX_MATH_H Vc-0.7.4/avx/prefetches.tcc000066400000000000000000000035111233512346000155120ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2010, 2011-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef VC_AVX_PREFETCHES_TCC #define VC_AVX_PREFETCHES_TCC /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace Internal { Vc_ALWAYS_INLINE void HelperImpl::prefetchForOneRead(const void *addr) { _mm_prefetch(static_cast(const_cast(addr)), _MM_HINT_NTA); } Vc_ALWAYS_INLINE void HelperImpl::prefetchClose(const void *addr) { _mm_prefetch(static_cast(const_cast(addr)), _MM_HINT_T0); } Vc_ALWAYS_INLINE void HelperImpl::prefetchMid(const void *addr) { _mm_prefetch(static_cast(const_cast(addr)), _MM_HINT_T1); } Vc_ALWAYS_INLINE void HelperImpl::prefetchFar(const void *addr) { _mm_prefetch(static_cast(const_cast(addr)), _MM_HINT_T2); } Vc_ALWAYS_INLINE void HelperImpl::prefetchForModify(const void *addr) { #ifdef __3dNOW__ _m_prefetchw(const_cast(addr)); #else _mm_prefetch(static_cast(const_cast(addr)), _MM_HINT_T0); #endif } } // namespace Internal } // namespace Vc /*OUTER_NAMESPACE_END*/ #endif // VC_AVX_PREFETCHES_TCC Vc-0.7.4/avx/shuffle.h000066400000000000000000000352721233512346000145050ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2011-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
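math.h above documents frexp() as splitting a vector into mantissas in [0.5, 1.0) and integer exponents, with ldexp() recombining them. A round-trip sketch (illustrative; it assumes the <Vc/Vc> header and that the AVX implementation is the one exported into namespace Vc):

#include <Vc/Vc>
#include <cstdio>

int main()
{
    using Vc::double_v;
    using Vc::int_v;

    const double_v x(6.25);                  // broadcast: 0.78125 * 2^3
    int_v e;
    const double_v m = Vc::frexp(x, &e);     // mantissa in [0.5, 1.0)
    const double_v y = Vc::ldexp(m, e);      // m * 2^e == x again

    std::printf("mantissa %g, exponent %d, recombined %g\n",
                m[0], int(e[0]), y[0]);
    return 0;
}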
*/ #ifndef VC_AVX_SHUFFLE_H #define VC_AVX_SHUFFLE_H #include "../sse/shuffle.h" #include "macros.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { using AVX::m128; using AVX::m128d; using AVX::m128i; using AVX::m256; using AVX::m256d; using AVX::m256i; using AVX::param128; using AVX::param128d; using AVX::param128i; using AVX::param256; using AVX::param256d; using AVX::param256i; namespace Mem { template static Vc_ALWAYS_INLINE m256 Vc_CONST permute128(param256 x) { VC_STATIC_ASSERT(L >= X0 && L <= X1, Incorrect_Range); VC_STATIC_ASSERT(H >= X0 && H <= X1, Incorrect_Range); return _mm256_permute2f128_ps(x, x, L + H * (1 << 4)); } template static Vc_ALWAYS_INLINE m256d Vc_CONST permute128(param256d x) { VC_STATIC_ASSERT(L >= X0 && L <= X1, Incorrect_Range); VC_STATIC_ASSERT(H >= X0 && H <= X1, Incorrect_Range); return _mm256_permute2f128_pd(x, x, L + H * (1 << 4)); } template static Vc_ALWAYS_INLINE m256i Vc_CONST permute128(param256i x) { VC_STATIC_ASSERT(L >= X0 && L <= X1, Incorrect_Range); VC_STATIC_ASSERT(H >= X0 && H <= X1, Incorrect_Range); return _mm256_permute2f128_si256(x, x, L + H * (1 << 4)); } template static Vc_ALWAYS_INLINE m256 Vc_CONST shuffle128(param256 x, param256 y) { VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range); VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range); return _mm256_permute2f128_ps(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4)); } template static Vc_ALWAYS_INLINE m256i Vc_CONST shuffle128(param256i x, param256i y) { VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range); VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range); return _mm256_permute2f128_si256(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4)); } template static Vc_ALWAYS_INLINE m256d Vc_CONST shuffle128(param256d x, param256d y) { VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range); VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range); return _mm256_permute2f128_pd(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? 
H : H - Y0 + 2) * (1 << 4)); } template static Vc_ALWAYS_INLINE m256d Vc_CONST permute(param256d x) { VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X2 && Dst3 >= X2, Incorrect_Range); VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= X1 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range); return _mm256_permute_pd(x, Dst0 + Dst1 * 2 + (Dst2 - X2) * 4 + (Dst3 - X2) * 8); } template static Vc_ALWAYS_INLINE m256 Vc_CONST permute(param256 x) { VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range); VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range); return _mm256_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); } template static Vc_ALWAYS_INLINE m256i Vc_CONST permute(param256i x) { return _mm256_castps_si256(permute(_mm256_castsi256_ps(x))); } template static Vc_ALWAYS_INLINE m256d Vc_CONST shuffle(param256d x, param256d y) { VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= Y0 && Dst2 >= X2 && Dst3 >= Y2, Incorrect_Range); VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= Y1 && Dst2 <= X3 && Dst3 <= Y3, Incorrect_Range); return _mm256_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2 + (Dst2 - X2) * 4 + (Dst3 - Y2) * 8); } template static Vc_ALWAYS_INLINE m256 Vc_CONST shuffle(param256 x, param256 y) { VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, Incorrect_Range); VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, Incorrect_Range); return _mm256_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64); } template static Vc_ALWAYS_INLINE m256 Vc_CONST blend(param256 x, param256 y) { VC_STATIC_ASSERT(Dst0 == X0 || Dst0 == Y0, Incorrect_Range); VC_STATIC_ASSERT(Dst1 == X1 || Dst1 == Y1, Incorrect_Range); VC_STATIC_ASSERT(Dst2 == X2 || Dst2 == Y2, Incorrect_Range); VC_STATIC_ASSERT(Dst3 == X3 || Dst3 == Y3, Incorrect_Range); VC_STATIC_ASSERT(Dst4 == X4 || Dst4 == Y4, Incorrect_Range); VC_STATIC_ASSERT(Dst5 == X5 || Dst5 == Y5, Incorrect_Range); VC_STATIC_ASSERT(Dst6 == X6 || Dst6 == Y6, Incorrect_Range); VC_STATIC_ASSERT(Dst7 == X7 || Dst7 == Y7, Incorrect_Range); return _mm256_blend_ps(x, y, (Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 + (Dst2 / Y2) * 4 + (Dst3 / Y3) * 8 + (Dst4 / Y4) * 16 + (Dst5 / Y5) * 32 + (Dst6 / Y6) * 64 + (Dst7 / Y7) *128 ); } template static Vc_ALWAYS_INLINE m256i Vc_CONST blend(param256i x, param256i y) { return _mm256_castps_si256(blend(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y))); } template struct ScaleForBlend { enum { Value = Dst >= X4 ? 
Dst - X4 + Y0 : Dst }; }; template static Vc_ALWAYS_INLINE m256 Vc_CONST permute(param256 x) { VC_STATIC_ASSERT(Dst0 >= X0 && Dst0 <= X7, Incorrect_Range); VC_STATIC_ASSERT(Dst1 >= X0 && Dst1 <= X7, Incorrect_Range); VC_STATIC_ASSERT(Dst2 >= X0 && Dst2 <= X7, Incorrect_Range); VC_STATIC_ASSERT(Dst3 >= X0 && Dst3 <= X7, Incorrect_Range); VC_STATIC_ASSERT(Dst4 >= X0 && Dst4 <= X7, Incorrect_Range); VC_STATIC_ASSERT(Dst5 >= X0 && Dst5 <= X7, Incorrect_Range); VC_STATIC_ASSERT(Dst6 >= X0 && Dst6 <= X7, Incorrect_Range); VC_STATIC_ASSERT(Dst7 >= X0 && Dst7 <= X7, Incorrect_Range); if (Dst0 + X4 == Dst4 && Dst1 + X4 == Dst5 && Dst2 + X4 == Dst6 && Dst3 + X4 == Dst7) { return permute(x); } const m128 loIn = _mm256_castps256_ps128(x); const m128 hiIn = _mm256_extractf128_ps(x, 1); m128 lo, hi; if (Dst0 < X4 && Dst1 < X4 && Dst2 < X4 && Dst3 < X4) { lo = _mm_permute_ps(loIn, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); } else if (Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4) { lo = _mm_permute_ps(hiIn, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); } else if (Dst0 < X4 && Dst1 < X4 && Dst2 >= X4 && Dst3 >= X4) { lo = shuffle(loIn, hiIn); } else if (Dst0 >= X4 && Dst1 >= X4 && Dst2 < X4 && Dst3 < X4) { lo = shuffle(hiIn, loIn); } else if (Dst0 == X0 && Dst1 == X4 && Dst2 == X1 && Dst3 == X5) { lo = _mm_unpacklo_ps(loIn, hiIn); } else if (Dst0 == X4 && Dst1 == X0 && Dst2 == X5 && Dst3 == X1) { lo = _mm_unpacklo_ps(hiIn, loIn); } else if (Dst0 == X2 && Dst1 == X6 && Dst2 == X3 && Dst3 == X7) { lo = _mm_unpackhi_ps(loIn, hiIn); } else if (Dst0 == X6 && Dst1 == X2 && Dst2 == X7 && Dst3 == X3) { lo = _mm_unpackhi_ps(hiIn, loIn); } else if (Dst0 % X4 == 0 && Dst1 % X4 == 1 && Dst2 % X4 == 2 && Dst3 % X4 == 3) { lo = blend::Value, ScaleForBlend::Value, ScaleForBlend::Value, ScaleForBlend::Value>(loIn, hiIn); } if (Dst4 >= X4 && Dst5 >= X4 && Dst6 >= X4 && Dst7 >= X4) { hi = _mm_permute_ps(hiIn, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64); } else if (Dst4 < X4 && Dst5 < X4 && Dst6 < X4 && Dst7 < X4) { hi = _mm_permute_ps(loIn, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64); } else if (Dst4 < X4 && Dst5 < X4 && Dst6 >= X4 && Dst7 >= X4) { hi = shuffle(loIn, hiIn); } else if (Dst4 >= X4 && Dst5 >= X4 && Dst6 < X4 && Dst7 < X4) { hi = shuffle(hiIn, loIn); } else if (Dst4 == X0 && Dst5 == X4 && Dst6 == X1 && Dst7 == X5) { hi = _mm_unpacklo_ps(loIn, hiIn); } else if (Dst4 == X4 && Dst5 == X0 && Dst6 == X5 && Dst7 == X1) { hi = _mm_unpacklo_ps(hiIn, loIn); } else if (Dst4 == X2 && Dst5 == X6 && Dst6 == X3 && Dst7 == X7) { hi = _mm_unpackhi_ps(loIn, hiIn); } else if (Dst4 == X6 && Dst5 == X2 && Dst6 == X7 && Dst7 == X3) { hi = _mm_unpackhi_ps(hiIn, loIn); } else if (Dst4 % X4 == 0 && Dst5 % X4 == 1 && Dst6 % X4 == 2 && Dst7 % X4 == 3) { hi = blend::Value, ScaleForBlend::Value, ScaleForBlend::Value, ScaleForBlend::Value>(loIn, hiIn); } return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1); } } // namespace Mem // little endian has the lo bits on the right and high bits on the left // with vectors this becomes greatly confusing: // Mem: abcd // Reg: dcba // // The shuffles and permutes above use memory ordering. The ones below use register ordering: namespace Reg { template static Vc_ALWAYS_INLINE m256 Vc_CONST permute128(param256 x, param256 y) { VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range); VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range); return _mm256_permute2f128_ps(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? 
H : H - Y0 + 2) * (1 << 4)); } template static Vc_ALWAYS_INLINE m256i Vc_CONST permute128(param256i x, param256i y) { VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range); VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range); return _mm256_permute2f128_si256(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4)); } template static Vc_ALWAYS_INLINE m256d Vc_CONST permute128(param256d x, param256d y) { VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range); VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range); return _mm256_permute2f128_pd(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4)); } template static Vc_ALWAYS_INLINE m256d Vc_CONST permute(param256d x) { VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X2 && Dst3 >= X2, Incorrect_Range); VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= X1 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range); return _mm256_permute_pd(x, Dst0 + Dst1 * 2 + (Dst2 - X2) * 4 + (Dst3 - X2) * 8); } template static Vc_ALWAYS_INLINE m256 Vc_CONST permute(param256 x) { VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range); VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range); return _mm256_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); } template static Vc_ALWAYS_INLINE m128d Vc_CONST permute(param128d x) { VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0, Incorrect_Range); VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= X1, Incorrect_Range); return _mm_permute_pd(x, Dst0 + Dst1 * 2); } template static Vc_ALWAYS_INLINE m128 Vc_CONST permute(param128 x) { VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range); VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range); return _mm_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); } template static Vc_ALWAYS_INLINE m256d Vc_CONST shuffle(param256d x, param256d y) { VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= Y0 && Dst2 >= X2 && Dst3 >= Y2, Incorrect_Range); VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= Y1 && Dst2 <= X3 && Dst3 <= Y3, Incorrect_Range); return _mm256_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2 + (Dst2 - X2) * 4 + (Dst3 - Y2) * 8); } template static Vc_ALWAYS_INLINE m256 Vc_CONST shuffle(param256 x, param256 y) { VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, Incorrect_Range); VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, Incorrect_Range); return _mm256_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64); } } // namespace Reg } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "undomacros.h" #endif // VC_AVX_SHUFFLE_H Vc-0.7.4/avx/sorthelper.h000066400000000000000000000024121233512346000152260ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2011 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
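shuffle.h above distinguishes memory element order (namespace Mem) from register element order (namespace Reg). The difference is easiest to see with the plain set/setr intrinsics, independent of Vc (requires an AVX-capable compiler, e.g. built with -mavx):

#include <immintrin.h>
#include <cstdio>

int main()
{
    // _mm256_setr_ps lists elements in memory order (first argument = element 0),
    // _mm256_set_ps lists them in register order (highest element first);
    // the two calls below therefore build the identical vector.
    const __m256 mem_order = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
    const __m256 reg_order = _mm256_set_ps (7, 6, 5, 4, 3, 2, 1, 0);

    float a[8], b[8];
    _mm256_storeu_ps(a, mem_order);
    _mm256_storeu_ps(b, reg_order);
    for (int i = 0; i < 8; ++i) {
        std::printf("%d: %g %g\n", i, a[i], b[i]);   // columns match
    }
    return 0;
}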
*/ #ifndef VC_AVX_SORTHELPER_H #define VC_AVX_SORTHELPER_H #include "types.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace AVX { template struct SortHelper { typedef typename VectorTypeHelper::Type VectorType; #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN typedef const VectorType & VTArg; #else typedef const VectorType VTArg; #endif static VectorType sort(VTArg); static void sort(VectorType &, VectorType &); }; } // namespace AVX } // namespace Vc /*OUTER_NAMESPACE_END*/ #endif // VC_AVX_SORTHELPER_H Vc-0.7.4/avx/types.h000066400000000000000000000112151233512346000142040ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef AVX_TYPES_H #define AVX_TYPES_H #include "intrinsics.h" #include "../common/storage.h" #include "macros.h" #define VC_DOUBLE_V_SIZE 4 #define VC_FLOAT_V_SIZE 8 #define VC_SFLOAT_V_SIZE 8 #define VC_INT_V_SIZE 8 #define VC_UINT_V_SIZE 8 #define VC_SHORT_V_SIZE 8 #define VC_USHORT_V_SIZE 8 #include "../common/types.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace AVX { template class Vector; template class Mask; template struct VectorHelper {}; template struct GatherHelper; template struct ScatterHelper; template struct IndexTypeHelper; template<> struct IndexTypeHelper< char > { typedef unsigned char Type; }; template<> struct IndexTypeHelper { typedef unsigned char Type; }; template<> struct IndexTypeHelper< short> { typedef unsigned short Type; }; template<> struct IndexTypeHelper { typedef unsigned short Type; }; template<> struct IndexTypeHelper< int > { typedef unsigned int Type; }; template<> struct IndexTypeHelper { typedef unsigned int Type; }; template<> struct IndexTypeHelper< float> { typedef unsigned int Type; }; template<> struct IndexTypeHelper< sfloat> { typedef unsigned short Type; }; template<> struct IndexTypeHelper< double> { typedef unsigned int Type; }; // _M128I based int32 would be nice template struct VectorTypeHelper; template<> struct VectorTypeHelper< char > { typedef m128i Type; }; template<> struct VectorTypeHelper { typedef m128i Type; }; template<> struct VectorTypeHelper< short> { typedef m128i Type; }; template<> struct VectorTypeHelper { typedef m128i Type; }; template<> struct VectorTypeHelper< int > { typedef m256i Type; }; template<> struct VectorTypeHelper { typedef m256i Type; }; template<> struct VectorTypeHelper< float> { typedef m256 Type; }; template<> struct VectorTypeHelper< sfloat> { typedef m256 Type; }; template<> struct VectorTypeHelper< double> { typedef m256d Type; }; template struct SseVectorType; template<> struct SseVectorType { typedef m128 Type; }; template<> struct SseVectorType { typedef m128i Type; }; template<> struct SseVectorType { typedef m128d Type; }; template<> struct SseVectorType { typedef m128 Type; }; template<> struct SseVectorType { typedef m128i Type; }; template<> struct SseVectorType { typedef m128d Type; }; template struct HasVectorDivisionHelper { enum { Value = 1 
}; }; //template<> struct HasVectorDivisionHelper { enum { Value = 0 }; }; template struct VectorHelperSize; #ifdef VC_MSVC // MSVC's __declspec(align(#)) only works with numbers, no enums or sizeof allowed ;( template class _VectorAlignedBaseHack; template<> class STRUCT_ALIGN1( 8) _VectorAlignedBaseHack< 8> {} STRUCT_ALIGN2( 8); template<> class STRUCT_ALIGN1(16) _VectorAlignedBaseHack<16> {} STRUCT_ALIGN2(16); template<> class STRUCT_ALIGN1(32) _VectorAlignedBaseHack<32> {} STRUCT_ALIGN2(32); template<> class STRUCT_ALIGN1(64) _VectorAlignedBaseHack<64> {} STRUCT_ALIGN2(64); template > class VectorAlignedBaseT : public _VectorAlignedBaseHack { public: FREE_STORE_OPERATORS_ALIGNED(sizeof(V)) }; #else template > class STRUCT_ALIGN1(sizeof(V)) VectorAlignedBaseT { public: FREE_STORE_OPERATORS_ALIGNED(sizeof(V)) } STRUCT_ALIGN2(sizeof(V)); #endif } // namespace AVX } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "undomacros.h" #endif // AVX_TYPES_H Vc-0.7.4/avx/undomacros.h000066400000000000000000000015771233512346000152240ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2011 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef VC_AVX_UNDOMACROS_H #define VC_AVX_UNDOMACROS_H #undef VC_AVX_MACROS_H #endif // VC_AVX_UNDOMACROS_H #include "../common/undomacros.h" Vc-0.7.4/avx/vector.h000066400000000000000000000621271233512346000143520ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
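vector.h below declares the full Vector interface: broadcast and load constructors, arithmetic and comparison operators, reductions, swizzles, and write-masking via operator()(mask). A small usage sketch of the parts exercised most often (illustrative; it assumes <Vc/Vc>, an AVX build, and the usual Vc write-masking idiom):

#include <Vc/Vc>
#include <cstdio>

int main()
{
    using Vc::float_v;

    const float_v idx = float_v::IndexesFromZero();   // 0, 1, 2, ...
    const float_v a   = float_v(10.f) - idx;          // 10, 9, 8, ...

    const float_v s = a.sorted();                     // dispatches to SortHelper
    std::printf("min %g  max %g  sum %g  smallest %g\n",
                a.min(), a.max(), a.sum(), s[0]);

    float_v b = a;
    b(a > float_v(5.f)) += float_v(1.f);              // masked update
    std::printf("sum after masked update: %g\n", b.sum());
    return 0;
}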
*/ #ifndef AVX_VECTOR_H #define AVX_VECTOR_H #include "intrinsics.h" #include "vectorhelper.h" #include "mask.h" #include "writemaskedvector.h" #include "sorthelper.h" #include #include #include "../common/aliasingentryhelper.h" #include "../common/memoryfwd.h" #include "macros.h" #ifdef isfinite #undef isfinite #endif #ifdef isnan #undef isnan #endif /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace AVX { enum VectorAlignmentEnum { VectorAlignment = 32 }; template class Vector { public: FREE_STORE_OPERATORS_ALIGNED(32) typedef typename VectorTypeHelper::Type VectorType; typedef typename DetermineEntryType::Type EntryType; enum Constants { Size = sizeof(VectorType) / sizeof(EntryType), HasVectorDivision = HasVectorDivisionHelper::Value }; typedef Vector::Type> IndexType; typedef typename Vc::AVX::Mask Mask; typedef typename Mask::AsArg MaskArg; typedef Vc::Memory, Size> Memory; #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN typedef const Vector &AsArg; typedef const VectorType &VectorTypeArg; #else typedef Vector AsArg; typedef VectorType VectorTypeArg; #endif protected: // helper that specializes on VectorType typedef VectorHelper HV; // helper that specializes on T typedef VectorHelper HT; // cast any m256/m128 to VectorType static Vc_INTRINSIC VectorType _cast(param128 v) { return avx_cast(v); } static Vc_INTRINSIC VectorType _cast(param128i v) { return avx_cast(v); } static Vc_INTRINSIC VectorType _cast(param128d v) { return avx_cast(v); } static Vc_INTRINSIC VectorType _cast(param256 v) { return avx_cast(v); } static Vc_INTRINSIC VectorType _cast(param256i v) { return avx_cast(v); } static Vc_INTRINSIC VectorType _cast(param256d v) { return avx_cast(v); } #ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS typedef Common::VectorMemoryUnion StorageType; #else typedef Common::VectorMemoryUnion StorageType; #endif StorageType d; public: /////////////////////////////////////////////////////////////////////////////////////////// // uninitialized Vc_ALWAYS_INLINE Vector() {} /////////////////////////////////////////////////////////////////////////////////////////// // constants explicit Vc_ALWAYS_INLINE_L Vector(VectorSpecialInitializerZero::ZEnum) Vc_ALWAYS_INLINE_R; explicit Vc_ALWAYS_INLINE_L Vector(VectorSpecialInitializerOne::OEnum) Vc_ALWAYS_INLINE_R; explicit Vc_ALWAYS_INLINE_L Vector(VectorSpecialInitializerIndexesFromZero::IEnum) Vc_ALWAYS_INLINE_R; static Vc_INTRINSIC_L Vc_CONST_L Vector Zero() Vc_INTRINSIC_R Vc_CONST_R; static Vc_INTRINSIC_L Vc_CONST_L Vector One() Vc_INTRINSIC_R Vc_CONST_R; static Vc_INTRINSIC_L Vc_CONST_L Vector IndexesFromZero() Vc_INTRINSIC_R Vc_CONST_R; static Vc_ALWAYS_INLINE_L Vector Random() Vc_ALWAYS_INLINE_R; /////////////////////////////////////////////////////////////////////////////////////////// // internal: required to enable returning objects of VectorType Vc_ALWAYS_INLINE Vector(VectorTypeArg x) : d(x) {} #ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS Vc_ALWAYS_INLINE Vector(typename VectorType::Base x) : d(x) {} #endif /////////////////////////////////////////////////////////////////////////////////////////// // static_cast / copy ctor template explicit Vector(VC_ALIGNED_PARAMETER(Vector) x); // implicit cast template Vc_INTRINSIC_L Vector &operator=(const Vector &x) Vc_INTRINSIC_R; // copy assignment Vc_ALWAYS_INLINE Vector &operator=(AsArg v) { d.v() = v.d.v(); return *this; } /////////////////////////////////////////////////////////////////////////////////////////// // broadcast explicit Vc_ALWAYS_INLINE_L Vector(EntryType a) Vc_ALWAYS_INLINE_R; template 
Vc_INTRINSIC Vector(TT x, VC_EXACT_TYPE(TT, EntryType, void *) = 0) : d(HT::set(x)) {} Vc_ALWAYS_INLINE Vector &operator=(EntryType a) { d.v() = HT::set(a); return *this; } /////////////////////////////////////////////////////////////////////////////////////////// // load ctors explicit Vc_INTRINSIC_L Vector(const EntryType *x) Vc_INTRINSIC_R; template Vc_INTRINSIC_L Vector(const EntryType *x, Alignment align) Vc_INTRINSIC_R; template explicit Vc_INTRINSIC_L Vector(const OtherT *x) Vc_INTRINSIC_R; template Vc_INTRINSIC_L Vector(const OtherT *x, Alignment align) Vc_INTRINSIC_R; /////////////////////////////////////////////////////////////////////////////////////////// // load member functions Vc_INTRINSIC_L void load(const EntryType *mem) Vc_INTRINSIC_R; template Vc_INTRINSIC_L void load(const EntryType *mem, Alignment align) Vc_INTRINSIC_R; template Vc_INTRINSIC_L void load(const OtherT *mem) Vc_INTRINSIC_R; template Vc_INTRINSIC_L void load(const OtherT *mem, Alignment align) Vc_INTRINSIC_R; /////////////////////////////////////////////////////////////////////////////////////////// // expand/merge 1 float_v <=> 2 double_v XXX rationale? remove it for release? XXX explicit inline Vector(const Vector *a); inline void expand(Vector *x) const; /////////////////////////////////////////////////////////////////////////////////////////// // zeroing Vc_INTRINSIC_L void setZero() Vc_INTRINSIC_R; Vc_INTRINSIC_L void setZero(const Mask &k) Vc_INTRINSIC_R; Vc_INTRINSIC_L void setQnan() Vc_INTRINSIC_R; Vc_INTRINSIC_L void setQnan(MaskArg k) Vc_INTRINSIC_R; /////////////////////////////////////////////////////////////////////////////////////////// // stores Vc_INTRINSIC_L void store(EntryType *mem) const Vc_INTRINSIC_R; Vc_INTRINSIC_L void store(EntryType *mem, const Mask &mask) const Vc_INTRINSIC_R; template Vc_INTRINSIC_L void store(EntryType *mem, A align) const Vc_INTRINSIC_R; template Vc_INTRINSIC_L void store(EntryType *mem, const Mask &mask, A align) const Vc_INTRINSIC_R; /////////////////////////////////////////////////////////////////////////////////////////// // swizzles Vc_INTRINSIC_L Vc_PURE_L const Vector &abcd() const Vc_INTRINSIC_R Vc_PURE_R; Vc_INTRINSIC_L Vc_PURE_L const Vector cdab() const Vc_INTRINSIC_R Vc_PURE_R; Vc_INTRINSIC_L Vc_PURE_L const Vector badc() const Vc_INTRINSIC_R Vc_PURE_R; Vc_INTRINSIC_L Vc_PURE_L const Vector aaaa() const Vc_INTRINSIC_R Vc_PURE_R; Vc_INTRINSIC_L Vc_PURE_L const Vector bbbb() const Vc_INTRINSIC_R Vc_PURE_R; Vc_INTRINSIC_L Vc_PURE_L const Vector cccc() const Vc_INTRINSIC_R Vc_PURE_R; Vc_INTRINSIC_L Vc_PURE_L const Vector dddd() const Vc_INTRINSIC_R Vc_PURE_R; Vc_INTRINSIC_L Vc_PURE_L const Vector bcad() const Vc_INTRINSIC_R Vc_PURE_R; Vc_INTRINSIC_L Vc_PURE_L const Vector bcda() const Vc_INTRINSIC_R Vc_PURE_R; Vc_INTRINSIC_L Vc_PURE_L const Vector dabc() const Vc_INTRINSIC_R Vc_PURE_R; Vc_INTRINSIC_L Vc_PURE_L const Vector acbd() const Vc_INTRINSIC_R Vc_PURE_R; Vc_INTRINSIC_L Vc_PURE_L const Vector dbca() const Vc_INTRINSIC_R Vc_PURE_R; Vc_INTRINSIC_L Vc_PURE_L const Vector dcba() const Vc_INTRINSIC_R Vc_PURE_R; /////////////////////////////////////////////////////////////////////////////////////////// // gathers template Vector(const EntryType *mem, const IndexT *indexes); template Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes); template Vector(const EntryType *mem, const IndexT *indexes, MaskArg mask); template Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes, MaskArg mask); template Vector(const S1 *array, const 
EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes); template Vector(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask); template Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes); template Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask); template Vector(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes); template Vector(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask); template void gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes); template void gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask); #ifdef VC_USE_SET_GATHERS template void gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes, MaskArg mask); #endif template void gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes); template void gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask); template void gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes); template void gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask); template void gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes); template void gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask); /////////////////////////////////////////////////////////////////////////////////////////// // scatters template void scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) const; template void scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask) const; template void scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) const; template void scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const; template void scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) const; template void scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const; template void scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) const; template void scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask) const; /////////////////////////////////////////////////////////////////////////////////////////// //prefix Vc_ALWAYS_INLINE Vector &operator++() { data() = VectorHelper::add(data(), VectorHelper::one()); return *this; } Vc_ALWAYS_INLINE Vector &operator--() { data() = VectorHelper::sub(data(), VectorHelper::one()); return *this; } //postfix Vc_ALWAYS_INLINE Vector operator++(int) { const Vector r = *this; data() = VectorHelper::add(data(), VectorHelper::one()); return r; } Vc_ALWAYS_INLINE Vector operator--(int) { const Vector r = *this; data() = VectorHelper::sub(data(), VectorHelper::one()); return r; } 
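    // The non-const operator[] below returns an AliasingEntryHelper rather than a
    // plain EntryType reference: scalar element writes go through the
    // VectorMemoryUnion member access so they do not run into strict-aliasing
    // problems. The extra warning for GCC 4.3.x is presumably there because that
    // compiler handled this access pattern particularly badly.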
Vc_INTRINSIC Common::AliasingEntryHelper operator[](size_t index) { #if defined(VC_GCC) && VC_GCC >= 0x40300 && VC_GCC < 0x40400 ::Vc::Warnings::_operator_bracket_warning(); #endif return d.m(index); } Vc_ALWAYS_INLINE EntryType operator[](size_t index) const { return d.m(index); } Vc_ALWAYS_INLINE Vector operator~() const { return VectorHelper::andnot_(data(), VectorHelper::allone()); } Vc_ALWAYS_INLINE_L Vc_PURE_L Vector::Type> operator-() const Vc_ALWAYS_INLINE_R Vc_PURE_R; Vc_INTRINSIC Vc_PURE Vector operator+() const { return *this; } #define OP1(fun) \ Vc_ALWAYS_INLINE Vector fun() const { return Vector(VectorHelper::fun(data())); } \ Vc_ALWAYS_INLINE Vector &fun##_eq() { data() = VectorHelper::fun(data()); return *this; } OP1(sqrt) OP1(abs) #undef OP1 #define OP(symbol, fun) \ Vc_ALWAYS_INLINE Vector &operator symbol##=(const Vector &x) { data() = VectorHelper::fun(data(), x.data()); return *this; } \ Vc_ALWAYS_INLINE Vector &operator symbol##=(EntryType x) { return operator symbol##=(Vector(x)); } \ Vc_ALWAYS_INLINE Vector operator symbol(const Vector &x) const { return Vector(VectorHelper::fun(data(), x.data())); } \ template Vc_ALWAYS_INLINE VC_EXACT_TYPE(TT, EntryType, Vector) operator symbol(TT x) const { return operator symbol(Vector(x)); } OP(+, add) OP(-, sub) OP(*, mul) #undef OP inline Vector &operator/=(EntryType x); template inline Vc_PURE_L VC_EXACT_TYPE(TT, EntryType, Vector) operator/(TT x) const Vc_PURE_R; inline Vector &operator/=(const Vector &x); inline Vc_PURE_L Vector operator/ (const Vector &x) const Vc_PURE_R; // bitwise ops #define OP_VEC(op) \ Vc_ALWAYS_INLINE_L Vector &operator op##=(AsArg x) Vc_ALWAYS_INLINE_R; \ Vc_ALWAYS_INLINE_L Vc_PURE_L Vector operator op (AsArg x) const Vc_ALWAYS_INLINE_R Vc_PURE_R; #define OP_ENTRY(op) \ Vc_ALWAYS_INLINE Vector &operator op##=(EntryType x) { return operator op##=(Vector(x)); } \ template Vc_ALWAYS_INLINE Vc_PURE VC_EXACT_TYPE(TT, EntryType, Vector) operator op(TT x) const { return operator op(Vector(x)); } VC_ALL_BINARY(OP_VEC) VC_ALL_BINARY(OP_ENTRY) VC_ALL_SHIFTS(OP_VEC) #undef OP_VEC #undef OP_ENTRY Vc_ALWAYS_INLINE_L Vector &operator>>=(int x) Vc_ALWAYS_INLINE_R; Vc_ALWAYS_INLINE_L Vector &operator<<=(int x) Vc_ALWAYS_INLINE_R; Vc_ALWAYS_INLINE_L Vector operator>>(int x) const Vc_ALWAYS_INLINE_R; Vc_ALWAYS_INLINE_L Vector operator<<(int x) const Vc_ALWAYS_INLINE_R; #define OPcmp(symbol, fun) \ Vc_ALWAYS_INLINE Mask operator symbol(AsArg x) const { return VectorHelper::fun(data(), x.data()); } \ template Vc_ALWAYS_INLINE VC_EXACT_TYPE(TT, EntryType, Mask) operator symbol(TT x) const { return operator symbol(Vector(x)); } OPcmp(==, cmpeq) OPcmp(!=, cmpneq) OPcmp(>=, cmpnlt) OPcmp(>, cmpnle) OPcmp(<, cmplt) OPcmp(<=, cmple) #undef OPcmp Vc_INTRINSIC_L Vc_PURE_L Mask isNegative() const Vc_PURE_R Vc_INTRINSIC_R; Vc_ALWAYS_INLINE void fusedMultiplyAdd(const Vector &factor, const Vector &summand) { VectorHelper::fma(data(), factor.data(), summand.data()); } Vc_ALWAYS_INLINE void assign( const Vector &v, const Mask &mask ) { const VectorType k = avx_cast(mask.data()); data() = VectorHelper::blend(data(), v.data(), k); } template Vc_ALWAYS_INLINE V2 staticCast() const { return V2(*this); } template Vc_ALWAYS_INLINE V2 reinterpretCast() const { return avx_cast(data()); } Vc_ALWAYS_INLINE WriteMaskedVector operator()(const Mask &k) { return WriteMaskedVector(this, k); } /** * \return \p true This vector was completely filled. m2 might be 0 or != 0. You still have * to test this. 
* \p false This vector was not completely filled. m2 is all 0. */ //inline bool pack(Mask &m1, Vector &v2, Mask &m2) { //return VectorHelper::pack(data(), m1.data, v2.data(), m2.data); //} Vc_ALWAYS_INLINE VectorType &data() { return d.v(); } Vc_ALWAYS_INLINE const VectorType data() const { return d.v(); } Vc_ALWAYS_INLINE EntryType min() const { return VectorHelper::min(data()); } Vc_ALWAYS_INLINE EntryType max() const { return VectorHelper::max(data()); } Vc_ALWAYS_INLINE EntryType product() const { return VectorHelper::mul(data()); } Vc_ALWAYS_INLINE EntryType sum() const { return VectorHelper::add(data()); } Vc_ALWAYS_INLINE_L EntryType min(MaskArg m) const Vc_ALWAYS_INLINE_R; Vc_ALWAYS_INLINE_L EntryType max(MaskArg m) const Vc_ALWAYS_INLINE_R; Vc_ALWAYS_INLINE_L EntryType product(MaskArg m) const Vc_ALWAYS_INLINE_R; Vc_ALWAYS_INLINE_L EntryType sum(MaskArg m) const Vc_ALWAYS_INLINE_R; Vc_INTRINSIC_L Vector shifted(int amount) const Vc_INTRINSIC_R; Vc_INTRINSIC_L Vector rotated(int amount) const Vc_INTRINSIC_R; Vc_ALWAYS_INLINE Vector sorted() const { return SortHelper::sort(data()); } template void callWithValuesSorted(F &f) { EntryType value = d.m(0); f(value); for (int i = 1; i < Size; ++i) { if (d.m(i) != value) { value = d.m(i); f(value); } } } template Vc_INTRINSIC void call(const F &f) const { for_all_vector_entries(i, f(EntryType(d.m(i))); ); } template Vc_INTRINSIC void call(F &f) const { for_all_vector_entries(i, f(EntryType(d.m(i))); ); } template Vc_INTRINSIC void call(const F &f, const Mask &mask) const { Vc_foreach_bit(size_t i, mask) { f(EntryType(d.m(i))); } } template Vc_INTRINSIC void call(F &f, const Mask &mask) const { Vc_foreach_bit(size_t i, mask) { f(EntryType(d.m(i))); } } template Vc_INTRINSIC Vector apply(const F &f) const { Vector r; for_all_vector_entries(i, r.d.m(i) = f(EntryType(d.m(i))); ); return r; } template Vc_INTRINSIC Vector apply(F &f) const { Vector r; for_all_vector_entries(i, r.d.m(i) = f(EntryType(d.m(i))); ); return r; } template Vc_INTRINSIC Vector apply(const F &f, const Mask &mask) const { Vector r(*this); Vc_foreach_bit (size_t i, mask) { r.d.m(i) = f(EntryType(r.d.m(i))); } return r; } template Vc_INTRINSIC Vector apply(F &f, const Mask &mask) const { Vector r(*this); Vc_foreach_bit (size_t i, mask) { r.d.m(i) = f(EntryType(r.d.m(i))); } return r; } template Vc_INTRINSIC void fill(EntryType (&f)(IndexT)) { for_all_vector_entries(i, d.m(i) = f(i); ); } Vc_INTRINSIC void fill(EntryType (&f)()) { for_all_vector_entries(i, d.m(i) = f(); ); } Vc_INTRINSIC_L Vector copySign(AsArg reference) const Vc_INTRINSIC_R; Vc_INTRINSIC_L Vector exponent() const Vc_INTRINSIC_R; }; typedef Vector double_v; typedef Vector float_v; typedef Vector sfloat_v; typedef Vector int_v; typedef Vector uint_v; typedef Vector short_v; typedef Vector ushort_v; typedef double_v::Mask double_m; typedef float_v::Mask float_m; typedef sfloat_v::Mask sfloat_m; typedef int_v::Mask int_m; typedef uint_v::Mask uint_m; typedef short_v::Mask short_m; typedef ushort_v::Mask ushort_m; template class SwizzledVector : public Vector {}; static Vc_ALWAYS_INLINE int_v min(const int_v &x, const int_v &y) { return _mm256_min_epi32(x.data(), y.data()); } static Vc_ALWAYS_INLINE uint_v min(const uint_v &x, const uint_v &y) { return _mm256_min_epu32(x.data(), y.data()); } static Vc_ALWAYS_INLINE short_v min(const short_v &x, const short_v &y) { return _mm_min_epi16(x.data(), y.data()); } static Vc_ALWAYS_INLINE ushort_v min(const ushort_v &x, const ushort_v &y) { return 
_mm_min_epu16(x.data(), y.data()); } static Vc_ALWAYS_INLINE float_v min(const float_v &x, const float_v &y) { return _mm256_min_ps(x.data(), y.data()); } static Vc_ALWAYS_INLINE sfloat_v min(const sfloat_v &x, const sfloat_v &y) { return _mm256_min_ps(x.data(), y.data()); } static Vc_ALWAYS_INLINE double_v min(const double_v &x, const double_v &y) { return _mm256_min_pd(x.data(), y.data()); } static Vc_ALWAYS_INLINE int_v max(const int_v &x, const int_v &y) { return _mm256_max_epi32(x.data(), y.data()); } static Vc_ALWAYS_INLINE uint_v max(const uint_v &x, const uint_v &y) { return _mm256_max_epu32(x.data(), y.data()); } static Vc_ALWAYS_INLINE short_v max(const short_v &x, const short_v &y) { return _mm_max_epi16(x.data(), y.data()); } static Vc_ALWAYS_INLINE ushort_v max(const ushort_v &x, const ushort_v &y) { return _mm_max_epu16(x.data(), y.data()); } static Vc_ALWAYS_INLINE float_v max(const float_v &x, const float_v &y) { return _mm256_max_ps(x.data(), y.data()); } static Vc_ALWAYS_INLINE sfloat_v max(const sfloat_v &x, const sfloat_v &y) { return _mm256_max_ps(x.data(), y.data()); } static Vc_ALWAYS_INLINE double_v max(const double_v &x, const double_v &y) { return _mm256_max_pd(x.data(), y.data()); } template static Vc_ALWAYS_INLINE Vector sqrt (const Vector &x) { return VectorHelper::sqrt(x.data()); } template static Vc_ALWAYS_INLINE Vector rsqrt(const Vector &x) { return VectorHelper::rsqrt(x.data()); } template static Vc_ALWAYS_INLINE Vector abs (const Vector &x) { return VectorHelper::abs(x.data()); } template static Vc_ALWAYS_INLINE Vector reciprocal(const Vector &x) { return VectorHelper::reciprocal(x.data()); } template static Vc_ALWAYS_INLINE Vector round(const Vector &x) { return VectorHelper::round(x.data()); } template static Vc_ALWAYS_INLINE typename Vector::Mask isfinite(const Vector &x) { return VectorHelper::isFinite(x.data()); } template static Vc_ALWAYS_INLINE typename Vector::Mask isnan(const Vector &x) { return VectorHelper::isNaN(x.data()); } #include "forceToRegisters.tcc" } // namespace AVX } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "vector.tcc" #include "math.h" #include "undomacros.h" #endif // AVX_VECTOR_H Vc-0.7.4/avx/vector.tcc000066400000000000000000002140721233512346000146720ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2011-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
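The free functions at the end of vector.h above (min, max, sqrt, abs, isnan, ...) operate element-wise on whole vectors. A short sketch (illustrative; it assumes <Vc/Vc> and the exported Vc namespace):

#include <Vc/Vc>
#include <cstdio>

int main()
{
    using Vc::float_v;
    using Vc::float_m;

    const float_v x   = float_v::IndexesFromZero() - float_v(3.f);  // -3, -2, -1, 0, ...
    const float_v r   = Vc::sqrt(Vc::abs(x));    // element-wise |x|, then sqrt
    const float_m nan = Vc::isnan(Vc::sqrt(x));  // sqrt of the negative lanes is NaN

    std::printf("sum of sqrt(|x|) = %g, NaN lanes without abs(): %d\n",
                r.sum(), nan.count());
    return 0;
}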
*/ #include "limits.h" #include "const.h" #include "macros.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { ALIGN(64) extern unsigned int RandomState[16]; namespace AVX { /////////////////////////////////////////////////////////////////////////////////////////// // constants {{{1 template Vc_ALWAYS_INLINE Vector::Vector(VectorSpecialInitializerZero::ZEnum) : d(HT::zero()) {} template Vc_ALWAYS_INLINE Vector::Vector(VectorSpecialInitializerOne::OEnum) : d(HT::one()) {} template Vc_ALWAYS_INLINE Vector::Vector(VectorSpecialInitializerIndexesFromZero::IEnum) : d(HV::load(IndexesFromZeroData::address(), Aligned)) {} template Vc_INTRINSIC Vector Vc_CONST Vector::Zero() { return HT::zero(); } template Vc_INTRINSIC Vector Vc_CONST Vector::One() { return HT::one(); } template Vc_INTRINSIC Vector Vc_CONST Vector::IndexesFromZero() { return HV::load(IndexesFromZeroData::address(), Aligned); } template template Vc_ALWAYS_INLINE Vector::Vector(VC_ALIGNED_PARAMETER(Vector) x) : d(StaticCastHelper::cast(x.data())) {} template Vc_ALWAYS_INLINE Vector::Vector(EntryType x) : d(HT::set(x)) {} template<> Vc_ALWAYS_INLINE Vector::Vector(EntryType x) : d(_mm256_set1_pd(x)) {} /////////////////////////////////////////////////////////////////////////////////////////// // load ctors {{{1 template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *x) { load(x); } template template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *x, A a) { load(x, a); } template template Vc_ALWAYS_INLINE Vector::Vector(const OtherT *x) { load(x); } template template Vc_ALWAYS_INLINE Vector::Vector(const OtherT *x, A a) { load(x, a); } /////////////////////////////////////////////////////////////////////////////////////////// // load member functions {{{1 template Vc_INTRINSIC void Vector::load(const EntryType *mem) { load(mem, Aligned); } template template Vc_INTRINSIC void Vector::load(const EntryType *mem, A align) { d.v() = HV::load(mem, align); } template template Vc_INTRINSIC void Vector::load(const OtherT *mem) { load(mem, Aligned); } // LoadHelper {{{2 template struct LoadHelper; // float {{{2 template struct LoadHelper { static m256 load(const double *mem, Flags f) { return concat(_mm256_cvtpd_ps(VectorHelper::load(&mem[0], f)), _mm256_cvtpd_ps(VectorHelper::load(&mem[4], f))); } }; template struct LoadHelper { static m256 load(const unsigned int *mem, Flags f) { return StaticCastHelper::cast(VectorHelper::load(mem, f)); } }; template struct LoadHelper { static m256 load(const int *mem, Flags f) { return StaticCastHelper::cast(VectorHelper::load(mem, f)); } }; template struct LoadHelper { static m256 load(const unsigned short *mem, Flags f) { return StaticCastHelper::cast(VectorHelper::load(mem, f)); } }; template struct LoadHelper { static m256 load(const short *mem, Flags f) { return StaticCastHelper::cast(VectorHelper::load(mem, f)); } }; template struct LoadHelper { static m256 load(const unsigned char *mem, Flags f) { return StaticCastHelper::cast(LoadHelper::load(mem, f)); } }; template struct LoadHelper { static m256 load(const signed char *mem, Flags f) { return StaticCastHelper::cast(LoadHelper::load(mem, f)); } }; template struct LoadHelper : public LoadHelper {}; // int {{{2 template struct LoadHelper { static m256i load(const unsigned int *mem, Flags f) { return VectorHelper::load(mem, f); } }; template struct LoadHelper { static m256i load(const unsigned short *mem, Flags f) { return StaticCastHelper::cast(VectorHelper::load(mem, f)); } }; template struct LoadHelper { static m256i load(const short *mem, Flags f) { 
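        // presumably the int-from-short loader, following the pattern of the
        // neighbouring specializations: read 8 signed 16-bit values with the
        // SSE load helper and sign-extend them into a vector of eight 32-bit ints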
return StaticCastHelper::cast(VectorHelper::load(mem, f)); } }; template struct LoadHelper { static m256i load(const unsigned char *mem, Flags) { // the only available streaming load loads 16 bytes - twice as much as we need => can't use // it, or we risk an out-of-bounds read and an unaligned load exception const m128i epu8 = _mm_loadl_epi64(reinterpret_cast(mem)); const m128i epu16 = _mm_cvtepu8_epi16(epu8); return StaticCastHelper::cast(epu16); } }; template struct LoadHelper { static m256i load(const signed char *mem, Flags) { // the only available streaming load loads 16 bytes - twice as much as we need => can't use // it, or we risk an out-of-bounds read and an unaligned load exception const m128i epi8 = _mm_loadl_epi64(reinterpret_cast(mem)); const m128i epi16 = _mm_cvtepi8_epi16(epi8); return StaticCastHelper::cast(epi16); } }; // unsigned int {{{2 template struct LoadHelper { static m256i load(const unsigned short *mem, Flags f) { return StaticCastHelper::cast(VectorHelper::load(mem, f)); } }; template struct LoadHelper { static m256i load(const unsigned char *mem, Flags) { // the only available streaming load loads 16 bytes - twice as much as we need => can't use // it, or we risk an out-of-bounds read and an unaligned load exception const m128i epu8 = _mm_loadl_epi64(reinterpret_cast(mem)); const m128i epu16 = _mm_cvtepu8_epi16(epu8); return StaticCastHelper::cast(epu16); } }; // short {{{2 template struct LoadHelper { static m128i load(const unsigned short *mem, Flags f) { return StaticCastHelper::cast(VectorHelper::load(mem, f)); } }; template struct LoadHelper { static m128i load(const unsigned char *mem, Flags) { // the only available streaming load loads 16 bytes - twice as much as we need => can't use // it, or we risk an out-of-bounds read and an unaligned load exception const m128i epu8 = _mm_loadl_epi64(reinterpret_cast(mem)); return _mm_cvtepu8_epi16(epu8); } }; template struct LoadHelper { static m128i load(const signed char *mem, Flags) { // the only available streaming load loads 16 bytes - twice as much as we need => can't use // it, or we risk an out-of-bounds read and an unaligned load exception const m128i epi8 = _mm_loadl_epi64(reinterpret_cast(mem)); return _mm_cvtepi8_epi16(epi8); } }; // unsigned short {{{2 template struct LoadHelper { static m128i load(const unsigned char *mem, Flags) { // the only available streaming load loads 16 bytes - twice as much as we need => can't use // it, or we risk an out-of-bounds read and an unaligned load exception const m128i epu8 = _mm_loadl_epi64(reinterpret_cast(mem)); return _mm_cvtepu8_epi16(epu8); } }; // general load, implemented via LoadHelper {{{2 template template Vc_INTRINSIC void Vector::load(const SrcT *x, Flags f) { d.v() = LoadHelper::load(x, f); } /////////////////////////////////////////////////////////////////////////////////////////// // zeroing {{{1 template Vc_INTRINSIC void Vector::setZero() { data() = HV::zero(); } template Vc_INTRINSIC void Vector::setZero(const Mask &k) { data() = HV::andnot_(avx_cast(k.data()), data()); } template<> Vc_INTRINSIC void Vector::setQnan() { data() = _mm256_setallone_pd(); } template<> Vc_INTRINSIC void Vector::setQnan(MaskArg k) { data() = _mm256_or_pd(data(), k.dataD()); } template<> Vc_INTRINSIC void Vector::setQnan() { data() = _mm256_setallone_ps(); } template<> Vc_INTRINSIC void Vector::setQnan(MaskArg k) { data() = _mm256_or_ps(data(), k.data()); } template<> Vc_INTRINSIC void Vector::setQnan() { data() = _mm256_setallone_ps(); } template<> Vc_INTRINSIC void 
Vector::setQnan(MaskArg k) { data() = _mm256_or_ps(data(), k.data()); } /////////////////////////////////////////////////////////////////////////////////////////// // stores {{{1 template Vc_INTRINSIC void Vector::store(EntryType *mem) const { HV::store(mem, data(), Aligned); } template Vc_INTRINSIC void Vector::store(EntryType *mem, const Mask &mask) const { HV::store(mem, data(), avx_cast(mask.data()), Aligned); } template template Vc_INTRINSIC void Vector::store(EntryType *mem, A align) const { HV::store(mem, data(), align); } template template Vc_INTRINSIC void Vector::store(EntryType *mem, const Mask &mask, A align) const { HV::store(mem, data(), avx_cast(mask.data()), align); } /////////////////////////////////////////////////////////////////////////////////////////// // expand/merge 1 float_v <=> 2 double_v XXX rationale? remove it for release? XXX {{{1 template Vc_ALWAYS_INLINE Vc_FLATTEN Vector::Vector(const Vector *a) : d(a[0]) { } template<> Vc_ALWAYS_INLINE Vc_FLATTEN Vector::Vector(const Vector *a) : d(concat(_mm256_cvtpd_ps(a[0].data()), _mm256_cvtpd_ps(a[1].data()))) { } template<> Vc_ALWAYS_INLINE Vc_FLATTEN Vector::Vector(const Vector *a) : d(_mm_packs_epi32(lo128(a->data()), hi128(a->data()))) { } template<> Vc_ALWAYS_INLINE Vc_FLATTEN Vector::Vector(const Vector *a) : d(_mm_packus_epi32(lo128(a->data()), hi128(a->data()))) { } template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::expand(Vector *x) const { x[0] = *this; } template<> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::expand(Vector *x) const { x[0].data() = _mm256_cvtps_pd(lo128(d.v())); x[1].data() = _mm256_cvtps_pd(hi128(d.v())); } template<> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::expand(Vector *x) const { x[0].data() = concat(_mm_cvtepi16_epi32(d.v()), _mm_cvtepi16_epi32(_mm_unpackhi_epi64(d.v(), d.v()))); } template<> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::expand(Vector *x) const { x[0].data() = concat(_mm_cvtepu16_epi32(d.v()), _mm_cvtepu16_epi32(_mm_unpackhi_epi64(d.v(), d.v()))); } /////////////////////////////////////////////////////////////////////////////////////////// // swizzles {{{1 template Vc_INTRINSIC const Vector Vc_PURE &Vector::abcd() const { return *this; } template Vc_INTRINSIC const Vector Vc_PURE Vector::cdab() const { return Mem::permute(data()); } template Vc_INTRINSIC const Vector Vc_PURE Vector::badc() const { return Mem::permute(data()); } template Vc_INTRINSIC const Vector Vc_PURE Vector::aaaa() const { return Mem::permute(data()); } template Vc_INTRINSIC const Vector Vc_PURE Vector::bbbb() const { return Mem::permute(data()); } template Vc_INTRINSIC const Vector Vc_PURE Vector::cccc() const { return Mem::permute(data()); } template Vc_INTRINSIC const Vector Vc_PURE Vector::dddd() const { return Mem::permute(data()); } template Vc_INTRINSIC const Vector Vc_PURE Vector::bcad() const { return Mem::permute(data()); } template Vc_INTRINSIC const Vector Vc_PURE Vector::bcda() const { return Mem::permute(data()); } template Vc_INTRINSIC const Vector Vc_PURE Vector::dabc() const { return Mem::permute(data()); } template Vc_INTRINSIC const Vector Vc_PURE Vector::acbd() const { return Mem::permute(data()); } template Vc_INTRINSIC const Vector Vc_PURE Vector::dbca() const { return Mem::permute(data()); } template Vc_INTRINSIC const Vector Vc_PURE Vector::dcba() const { return Mem::permute(data()); } template<> Vc_INTRINSIC const double_v Vc_PURE Vector::cdab() const { return Mem::shuffle128(data(), data()); } template<> Vc_INTRINSIC const double_v Vc_PURE Vector::badc() const { return 
Mem::permute(data()); } template<> Vc_INTRINSIC const double_v Vc_PURE Vector::aaaa() const { const double &tmp = d.m(0); return _mm256_broadcast_sd(&tmp); } template<> Vc_INTRINSIC const double_v Vc_PURE Vector::bbbb() const { const double &tmp = d.m(1); return _mm256_broadcast_sd(&tmp); } template<> Vc_INTRINSIC const double_v Vc_PURE Vector::cccc() const { const double &tmp = d.m(2); return _mm256_broadcast_sd(&tmp); } template<> Vc_INTRINSIC const double_v Vc_PURE Vector::dddd() const { const double &tmp = d.m(3); return _mm256_broadcast_sd(&tmp); } template<> Vc_INTRINSIC const double_v Vc_PURE Vector::bcad() const { return Mem::shuffle(Mem::shuffle128(data(), data()), Mem::shuffle128(data(), data())); } template<> Vc_INTRINSIC const double_v Vc_PURE Vector::bcda() const { return Mem::shuffle(data(), Mem::shuffle128(data(), data())); } template<> Vc_INTRINSIC const double_v Vc_PURE Vector::dabc() const { return Mem::shuffle(Mem::shuffle128(data(), data()), data()); } template<> Vc_INTRINSIC const double_v Vc_PURE Vector::acbd() const { return Mem::shuffle(Mem::shuffle128(data(), data()), Mem::shuffle128(data(), data())); } template<> Vc_INTRINSIC const double_v Vc_PURE Vector::dbca() const { return Mem::shuffle(Mem::shuffle128(data(), data()), Mem::shuffle128(data(), data())); } template<> Vc_INTRINSIC const double_v Vc_PURE Vector::dcba() const { return cdab().badc(); } #define VC_SWIZZLES_16BIT_IMPL(T) \ template<> Vc_INTRINSIC const Vector Vc_PURE Vector::cdab() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC const Vector Vc_PURE Vector::badc() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC const Vector Vc_PURE Vector::aaaa() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC const Vector Vc_PURE Vector::bbbb() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC const Vector Vc_PURE Vector::cccc() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC const Vector Vc_PURE Vector::dddd() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC const Vector Vc_PURE Vector::bcad() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC const Vector Vc_PURE Vector::bcda() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC const Vector Vc_PURE Vector::dabc() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC const Vector Vc_PURE Vector::acbd() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC const Vector Vc_PURE Vector::dbca() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC const Vector Vc_PURE Vector::dcba() const { return Mem::permute(data()); } VC_SWIZZLES_16BIT_IMPL(short) VC_SWIZZLES_16BIT_IMPL(unsigned short) #undef VC_SWIZZLES_16BIT_IMPL /////////////////////////////////////////////////////////////////////////////////////////// // division {{{1 template inline Vector &Vector::operator/=(EntryType x) { if (HasVectorDivision) { return operator/=(Vector(x)); } for_all_vector_entries(i, d.m(i) /= x; ); return *this; } template template inline Vc_PURE VC_EXACT_TYPE(TT, typename DetermineEntryType::Type, Vector) Vector::operator/(TT x) const { if (HasVectorDivision) { return operator/(Vector(x)); } Vector r; for_all_vector_entries(i, r.d.m(i) = d.m(i) / x; ); return r; } // per default fall back to scalar division template inline Vector &Vector::operator/=(const Vector &x) { for_all_vector_entries(i, d.m(i) /= x.d.m(i); ); return *this; } template inline Vector Vc_PURE Vector::operator/(const Vector &x) const { Vector r; 
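    // generic fallback: divide lane by lane through scalar member access; the
    // specializations further down replace this with SIMD code (conversion to
    // double/float for the integer types, native division for float and double)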
for_all_vector_entries(i, r.d.m(i) = d.m(i) / x.d.m(i); ); return r; } // specialize division on type static Vc_INTRINSIC m256i Vc_CONST divInt(param256i a, param256i b) { const m256d lo1 = _mm256_cvtepi32_pd(lo128(a)); const m256d lo2 = _mm256_cvtepi32_pd(lo128(b)); const m256d hi1 = _mm256_cvtepi32_pd(hi128(a)); const m256d hi2 = _mm256_cvtepi32_pd(hi128(b)); return concat( _mm256_cvttpd_epi32(_mm256_div_pd(lo1, lo2)), _mm256_cvttpd_epi32(_mm256_div_pd(hi1, hi2)) ); } template<> inline Vector &Vector::operator/=(const Vector &x) { d.v() = divInt(d.v(), x.d.v()); return *this; } template<> inline Vector Vc_PURE Vector::operator/(const Vector &x) const { return divInt(d.v(), x.d.v()); } static inline m256i Vc_CONST divUInt(param256i a, param256i b) { m256d loa = _mm256_cvtepi32_pd(lo128(a)); m256d hia = _mm256_cvtepi32_pd(hi128(a)); m256d lob = _mm256_cvtepi32_pd(lo128(b)); m256d hib = _mm256_cvtepi32_pd(hi128(b)); // if a >= 2^31 then after conversion to double it will contain a negative number (i.e. a-2^32) // to get the right number back we have to add 2^32 where a >= 2^31 loa = _mm256_add_pd(loa, _mm256_and_pd(_mm256_cmp_pd(loa, _mm256_setzero_pd(), _CMP_LT_OS), _mm256_set1_pd(4294967296.))); hia = _mm256_add_pd(hia, _mm256_and_pd(_mm256_cmp_pd(hia, _mm256_setzero_pd(), _CMP_LT_OS), _mm256_set1_pd(4294967296.))); // we don't do the same for b because division by b >= 2^31 should be a seldom corner case and // we rather want the standard stuff fast // // there is one remaining problem: a >= 2^31 and b == 1 // in that case the return value would be 2^31 return avx_cast(_mm256_blendv_ps(avx_cast(concat( _mm256_cvttpd_epi32(_mm256_div_pd(loa, lob)), _mm256_cvttpd_epi32(_mm256_div_pd(hia, hib)) )), avx_cast(a), avx_cast(concat( _mm_cmpeq_epi32(lo128(b), _mm_setone_epi32()), _mm_cmpeq_epi32(hi128(b), _mm_setone_epi32()))))); } template<> Vc_ALWAYS_INLINE Vector &Vector::operator/=(const Vector &x) { d.v() = divUInt(d.v(), x.d.v()); return *this; } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vector::operator/(const Vector &x) const { return divUInt(d.v(), x.d.v()); } template static inline m128i Vc_CONST divShort(param128i a, param128i b) { const m256 r = _mm256_div_ps(StaticCastHelper::cast(a), StaticCastHelper::cast(b)); return StaticCastHelper::cast(r); } template<> Vc_ALWAYS_INLINE Vector &Vector::operator/=(const Vector &x) { d.v() = divShort(d.v(), x.d.v()); return *this; } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vector::operator/(const Vector &x) const { return divShort(d.v(), x.d.v()); } template<> Vc_ALWAYS_INLINE Vector &Vector::operator/=(const Vector &x) { d.v() = divShort(d.v(), x.d.v()); return *this; } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vector::operator/(const Vector &x) const { return divShort(d.v(), x.d.v()); } template<> Vc_INTRINSIC float_v &float_v::operator/=(const float_v &x) { d.v() = _mm256_div_ps(d.v(), x.d.v()); return *this; } template<> Vc_INTRINSIC float_v Vc_PURE float_v::operator/(const float_v &x) const { return _mm256_div_ps(d.v(), x.d.v()); } template<> Vc_INTRINSIC sfloat_v &sfloat_v::operator/=(const sfloat_v &x) { d.v() = _mm256_div_ps(d.v(), x.d.v()); return *this; } template<> Vc_INTRINSIC sfloat_v Vc_PURE sfloat_v::operator/(const sfloat_v &x) const { return _mm256_div_ps(d.v(), x.d.v()); } template<> Vc_INTRINSIC double_v &double_v::operator/=(const double_v &x) { d.v() = _mm256_div_pd(d.v(), x.d.v()); return *this; } template<> Vc_INTRINSIC double_v Vc_PURE double_v::operator/(const double_v &x) const { return _mm256_div_pd(d.v(), x.d.v()); 
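// The specializations above implement packed 32-bit division without an integer-divide
// instruction: each int32 lane is converted to double (exact, because every 32-bit value
// fits into the 53-bit mantissa), divided, and truncated back to int32.  A minimal scalar
// sketch of the same idea (illustration only, not part of the Vc sources; needs <cstdint>):
//     std::int32_t div_i32_via_double(std::int32_t a, std::int32_t b)
//     {
//         return static_cast<std::int32_t>(static_cast<double>(a) / static_cast<double>(b));
//     }
// For unsigned operands, _mm256_cvtepi32_pd treats the lanes as signed, so a value
// a >= 2^31 arrives as a - 2^32; divUInt therefore adds 2^32 back wherever the converted
// value is negative, and blends the original a into the result where b == 1, since a
// quotient >= 2^31 would not survive the truncating conversion back to int32.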
} /////////////////////////////////////////////////////////////////////////////////////////// // integer ops {{{1 #define OP_IMPL(T, symbol) \ template<> Vc_ALWAYS_INLINE Vector &Vector::operator symbol##=(AsArg x) \ { \ for_all_vector_entries(i, d.m(i) symbol##= x.d.m(i); ); \ return *this; \ } \ template<> Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator symbol(AsArg x) const \ { \ Vector r; \ for_all_vector_entries(i, r.d.m(i) = d.m(i) symbol x.d.m(i); ); \ return r; \ } OP_IMPL(int, <<) OP_IMPL(int, >>) OP_IMPL(unsigned int, <<) OP_IMPL(unsigned int, >>) OP_IMPL(short, <<) OP_IMPL(short, >>) OP_IMPL(unsigned short, <<) OP_IMPL(unsigned short, >>) #undef OP_IMPL template Vc_ALWAYS_INLINE Vector &Vector::operator>>=(int shift) { d.v() = VectorHelper::shiftRight(d.v(), shift); return *static_cast *>(this); } template Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator>>(int shift) const { return VectorHelper::shiftRight(d.v(), shift); } template Vc_ALWAYS_INLINE Vector &Vector::operator<<=(int shift) { d.v() = VectorHelper::shiftLeft(d.v(), shift); return *static_cast *>(this); } template Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator<<(int shift) const { return VectorHelper::shiftLeft(d.v(), shift); } #define OP_IMPL(T, symbol, fun) \ template<> Vc_ALWAYS_INLINE Vector &Vector::operator symbol##=(AsArg x) { d.v() = HV::fun(d.v(), x.d.v()); return *this; } \ template<> Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator symbol(AsArg x) const { return Vector(HV::fun(d.v(), x.d.v())); } OP_IMPL(int, &, and_) OP_IMPL(int, |, or_) OP_IMPL(int, ^, xor_) OP_IMPL(unsigned int, &, and_) OP_IMPL(unsigned int, |, or_) OP_IMPL(unsigned int, ^, xor_) OP_IMPL(short, &, and_) OP_IMPL(short, |, or_) OP_IMPL(short, ^, xor_) OP_IMPL(unsigned short, &, and_) OP_IMPL(unsigned short, |, or_) OP_IMPL(unsigned short, ^, xor_) OP_IMPL(float, &, and_) OP_IMPL(float, |, or_) OP_IMPL(float, ^, xor_) OP_IMPL(sfloat, &, and_) OP_IMPL(sfloat, |, or_) OP_IMPL(sfloat, ^, xor_) OP_IMPL(double, &, and_) OP_IMPL(double, |, or_) OP_IMPL(double, ^, xor_) #undef OP_IMPL // operators {{{1 #include "../common/operators.h" // isNegative {{{1 template<> Vc_INTRINSIC Vc_PURE float_m float_v::isNegative() const { return avx_cast(_mm256_srai_epi32(avx_cast(_mm256_and_ps(_mm256_setsignmask_ps(), d.v())), 31)); } template<> Vc_INTRINSIC Vc_PURE sfloat_m sfloat_v::isNegative() const { return avx_cast(_mm256_srai_epi32(avx_cast(_mm256_and_ps(_mm256_setsignmask_ps(), d.v())), 31)); } template<> Vc_INTRINSIC Vc_PURE double_m double_v::isNegative() const { return Mem::permute(avx_cast( _mm256_srai_epi32(avx_cast(_mm256_and_pd(_mm256_setsignmask_pd(), d.v())), 31) )); } // gathers {{{1 // Better implementation (hopefully) with _mm256_set_ //X template template Vector::Vector(const EntryType *mem, const Index *indexes) //X { //X for_all_vector_entries(int i, //X d.m(i) = mem[indexes[i]]; //X ); //X } template template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *mem, const IndexT *indexes) { gather(mem, indexes); } template template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes) { gather(mem, indexes); } template template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *mem, const IndexT *indexes, MaskArg mask) : d(HT::zero()) { gather(mem, indexes, mask); } template template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes, MaskArg mask) : d(HT::zero()) { gather(mem, indexes, mask); } template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, 
const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { gather(array, member1, indexes); } template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) : d(HT::zero()) { gather(array, member1, indexes, mask); } template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { gather(array, member1, member2, indexes); } template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) : d(HT::zero()) { gather(array, member1, member2, indexes, mask); } template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { gather(array, ptrMember1, outerIndexes, innerIndexes); } template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask) : d(HT::zero()) { gather(array, ptrMember1, outerIndexes, innerIndexes, mask); } template struct IndexSizeChecker { static void check() {} }; template struct IndexSizeChecker, Size> { static void check() { VC_STATIC_ASSERT(Vector::Size >= Size, IndexVector_must_have_greater_or_equal_number_of_entries); } }; template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_pd(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi16(mem[indexes[0]], 
mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); } #ifdef VC_USE_SET_GATHERS template template Vc_ALWAYS_INLINE void Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes, MaskArg mask) { IndexSizeChecker, Size>::check(); Vector indexesTmp = indexes; indexesTmp.setZero(!mask); (*this)(mask) = Vector(mem, indexesTmp); } #endif #ifdef VC_USE_BSF_GATHERS #define VC_MASKED_GATHER \ int bits = mask.toInt(); \ while (bits) { \ const int i = _bit_scan_forward(bits); \ bits &= ~(1 << i); /* btr? */ \ d.m(i) = ith_value(i); \ } #elif defined(VC_USE_POPCNT_BSF_GATHERS) #define VC_MASKED_GATHER \ unsigned int bits = mask.toInt(); \ unsigned int low, high = 0; \ switch (_mm_popcnt_u32(bits)) { \ case 8: \ high = _bit_scan_reverse(bits); \ d.m(high) = ith_value(high); \ high = (1 << high); \ case 7: \ low = _bit_scan_forward(bits); \ bits ^= high | (1 << low); \ d.m(low) = ith_value(low); \ case 6: \ high = _bit_scan_reverse(bits); \ d.m(high) = ith_value(high); \ high = (1 << high); \ case 5: \ low = _bit_scan_forward(bits); \ bits ^= high | (1 << low); \ d.m(low) = ith_value(low); \ case 4: \ high = _bit_scan_reverse(bits); \ d.m(high) = ith_value(high); \ high = (1 << high); \ case 3: \ low = _bit_scan_forward(bits); \ bits ^= high | (1 << low); \ d.m(low) = ith_value(low); \ case 2: \ high = _bit_scan_reverse(bits); \ d.m(high) = ith_value(high); \ case 1: \ low = _bit_scan_forward(bits); \ d.m(low) = ith_value(low); \ case 0: \ break; \ } #else #define VC_MASKED_GATHER \ if (mask.isEmpty()) { \ return; \ } \ for_all_vector_entries(i, \ if (mask[i]) d.m(i) = ith_value(i); \ ); #endif template template Vc_INTRINSIC void Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask) { IndexSizeChecker::check(); #define ith_value(_i_) (mem[indexes[_i_]]) VC_MASKED_GATHER #undef ith_value } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_pd(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), array[indexes[3]].*(member1)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_ps(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1), array[indexes[6]].*(member1), array[indexes[7]].*(member1)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_ps(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1), array[indexes[6]].*(member1), array[indexes[7]].*(member1)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1), array[indexes[6]].*(member1), 
array[indexes[7]].*(member1)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1), array[indexes[6]].*(member1), array[indexes[7]].*(member1)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi16(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1), array[indexes[6]].*(member1), array[indexes[7]].*(member1)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi16(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1), array[indexes[6]].*(member1), array[indexes[7]].*(member1)); } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) { IndexSizeChecker::check(); #define ith_value(_i_) (array[indexes[_i_]].*(member1)) VC_MASKED_GATHER #undef ith_value } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_pd(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_ps(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_ps(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2), 
array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi16(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi16(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) { IndexSizeChecker::check(); #define ith_value(_i_) (array[indexes[_i_]].*(member1).*(member2)) VC_MASKED_GATHER #undef ith_value } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { IndexSizeChecker::check(); IndexSizeChecker::check(); d.v() = _mm256_setr_pd((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { IndexSizeChecker::check(); IndexSizeChecker::check(); d.v() = _mm256_setr_ps((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]], (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, 
VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { IndexSizeChecker::check(); IndexSizeChecker::check(); d.v() = _mm256_setr_ps((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]], (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { IndexSizeChecker::check(); IndexSizeChecker::check(); d.v() = _mm256_setr_epi32((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]], (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { IndexSizeChecker::check(); IndexSizeChecker::check(); d.v() = _mm256_setr_epi32((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]], (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { IndexSizeChecker::check(); IndexSizeChecker::check(); d.v() = _mm_setr_epi16((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]], (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { IndexSizeChecker::check(); IndexSizeChecker::check(); d.v() = _mm_setr_epi16((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]], (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], 
(array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask) { IndexSizeChecker::check(); IndexSizeChecker::check(); #define ith_value(_i_) (array[outerIndexes[_i_]].*(ptrMember1))[innerIndexes[_i_]] VC_MASKED_GATHER #undef ith_value } #undef VC_MASKED_GATHER #ifdef VC_USE_BSF_SCATTERS #define VC_MASKED_SCATTER \ int bits = mask.toInt(); \ while (bits) { \ const int i = _bit_scan_forward(bits); \ bits ^= (1 << i); /* btr? */ \ ith_value(i) = d.m(i); \ } #elif defined(VC_USE_POPCNT_BSF_SCATTERS) #define VC_MASKED_SCATTER \ unsigned int bits = mask.toInt(); \ unsigned int low, high = 0; \ switch (_mm_popcnt_u32(bits)) { \ case 8: \ high = _bit_scan_reverse(bits); \ ith_value(high) = d.m(high); \ high = (1 << high); \ case 7: \ low = _bit_scan_forward(bits); \ bits ^= high | (1 << low); \ ith_value(low) = d.m(low); \ case 6: \ high = _bit_scan_reverse(bits); \ ith_value(high) = d.m(high); \ high = (1 << high); \ case 5: \ low = _bit_scan_forward(bits); \ bits ^= high | (1 << low); \ ith_value(low) = d.m(low); \ case 4: \ high = _bit_scan_reverse(bits); \ ith_value(high) = d.m(high); \ high = (1 << high); \ case 3: \ low = _bit_scan_forward(bits); \ bits ^= high | (1 << low); \ ith_value(low) = d.m(low); \ case 2: \ high = _bit_scan_reverse(bits); \ ith_value(high) = d.m(high); \ case 1: \ low = _bit_scan_forward(bits); \ ith_value(low) = d.m(low); \ case 0: \ break; \ } #else #define VC_MASKED_SCATTER \ if (mask.isEmpty()) { \ return; \ } \ for_all_vector_entries(i, \ if (mask[i]) ith_value(i) = d.m(i); \ ); #endif template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) const { for_all_vector_entries(i, mem[indexes[i]] = d.m(i); ); } #if defined(VC_MSVC) && VC_MSVC >= 170000000 // MSVC miscompiles the store mem[indexes[1]] = d.m(1) for T = (u)short template<> template Vc_ALWAYS_INLINE void short_v::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) const { const unsigned int tmp = d.v()._d.m128i_u32[0]; mem[indexes[0]] = tmp & 0xffff; mem[indexes[1]] = tmp >> 16; mem[indexes[2]] = _mm_extract_epi16(d.v(), 2); mem[indexes[3]] = _mm_extract_epi16(d.v(), 3); mem[indexes[4]] = _mm_extract_epi16(d.v(), 4); mem[indexes[5]] = _mm_extract_epi16(d.v(), 5); mem[indexes[6]] = _mm_extract_epi16(d.v(), 6); mem[indexes[7]] = _mm_extract_epi16(d.v(), 7); } template<> template Vc_ALWAYS_INLINE void ushort_v::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) const { const unsigned int tmp = d.v()._d.m128i_u32[0]; mem[indexes[0]] = tmp & 0xffff; mem[indexes[1]] = tmp >> 16; mem[indexes[2]] = _mm_extract_epi16(d.v(), 2); mem[indexes[3]] = _mm_extract_epi16(d.v(), 3); mem[indexes[4]] = _mm_extract_epi16(d.v(), 4); mem[indexes[5]] = _mm_extract_epi16(d.v(), 5); mem[indexes[6]] = _mm_extract_epi16(d.v(), 6); mem[indexes[7]] = _mm_extract_epi16(d.v(), 7); } #endif template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask) const { #define ith_value(_i_) mem[indexes[_i_]] VC_MASKED_SCATTER #undef ith_value } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) const { for_all_vector_entries(i, array[indexes[i]].*(member1) = d.m(i); ); } template template 
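// The VC_MASKED_GATHER / VC_MASKED_SCATTER variants above differ only in how they walk
// the set bits of the mask.  Hedged scalar sketch of the bit-scan strategy (illustration
// only; ith_value() stands for whatever per-lane memory access the macro is instantiated
// with):
//     int bits = mask.toInt();                   // one bit per vector lane
//     while (bits) {
//         const int i = _bit_scan_forward(bits); // index of the lowest set bit
//         bits &= ~(1 << i);                     // clear that bit
//         d.m(i) = ith_value(i);                 // gather; a scatter assigns the other way
//     }
// The default variant simply loops over all lanes and tests mask[i]; the popcnt variant
// switches on _mm_popcnt_u32(bits) and relies on case fall-through, alternating forward
// and reverse bit scans so the remaining mask never has to be re-tested.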
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const { #define ith_value(_i_) array[indexes[_i_]].*(member1) VC_MASKED_SCATTER #undef ith_value } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) const { for_all_vector_entries(i, array[indexes[i]].*(member1).*(member2) = d.m(i); ); } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const { #define ith_value(_i_) array[indexes[_i_]].*(member1).*(member2) VC_MASKED_SCATTER #undef ith_value } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) const { for_all_vector_entries(i, (array[innerIndexes[i]].*(ptrMember1))[outerIndexes[i]] = d.m(i); ); } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask) const { #define ith_value(_i_) (array[outerIndexes[_i_]].*(ptrMember1))[innerIndexes[_i_]] VC_MASKED_SCATTER #undef ith_value } /////////////////////////////////////////////////////////////////////////////////////////// // operator- {{{1 template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const { return _mm256_xor_pd(d.v(), _mm256_setsignmask_pd()); } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const { return _mm256_xor_ps(d.v(), _mm256_setsignmask_ps()); } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const { return _mm256_xor_ps(d.v(), _mm256_setsignmask_ps()); } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const { return _mm256_sign_epi32(d.v(), _mm256_setallone_si256()); } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const { return _mm256_sign_epi32(d.v(), _mm256_setallone_si256()); } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const { return _mm_sign_epi16(d.v(), _mm_setallone_si128()); } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const { return _mm_sign_epi16(d.v(), _mm_setallone_si128()); } /////////////////////////////////////////////////////////////////////////////////////////// // horizontal ops {{{1 template Vc_ALWAYS_INLINE typename Vector::EntryType Vector::min(MaskArg m) const { Vector tmp = std::numeric_limits >::max(); tmp(m) = *this; return tmp.min(); } template Vc_ALWAYS_INLINE typename Vector::EntryType Vector::max(MaskArg m) const { Vector tmp = std::numeric_limits >::min(); tmp(m) = *this; return tmp.max(); } template Vc_ALWAYS_INLINE typename Vector::EntryType Vector::product(MaskArg m) const { Vector tmp(VectorSpecialInitializerOne::One); tmp(m) = *this; return tmp.product(); } template Vc_ALWAYS_INLINE typename Vector::EntryType Vector::sum(MaskArg m) const { Vector tmp(VectorSpecialInitializerZero::Zero); tmp(m) = *this; return tmp.sum(); }//}}} // copySign {{{1 template<> Vc_INTRINSIC Vector Vector::copySign(Vector::AsArg reference) const { return _mm256_or_ps( _mm256_and_ps(reference.d.v(), _mm256_setsignmask_ps()), _mm256_and_ps(d.v(), _mm256_setabsmask_ps()) ); } template<> Vc_INTRINSIC Vector Vector::copySign(Vector::AsArg reference) const { 
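// copySign keeps this vector's magnitude and takes the sign bit from 'reference':
// (reference & signmask) | (*this & absmask).  Scalar sketch of the same bit logic
// (illustration only; needs <cstdint> and <cstring>):
//     float copysign_bits(float x, float ref)
//     {
//         std::uint32_t xi, ri;
//         std::memcpy(&xi, &x, 4);
//         std::memcpy(&ri, &ref, 4);
//         xi = (ri & 0x80000000u) | (xi & 0x7fffffffu); // sign from ref, magnitude from x
//         std::memcpy(&x, &xi, 4);
//         return x;
//     }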
return _mm256_or_ps( _mm256_and_ps(reference.d.v(), _mm256_setsignmask_ps()), _mm256_and_ps(d.v(), _mm256_setabsmask_ps()) ); } template<> Vc_INTRINSIC Vector Vector::copySign(Vector::AsArg reference) const { return _mm256_or_pd( _mm256_and_pd(reference.d.v(), _mm256_setsignmask_pd()), _mm256_and_pd(d.v(), _mm256_setabsmask_pd()) ); }//}}}1 // exponent {{{1 template<> Vc_INTRINSIC Vector Vector::exponent() const { VC_ASSERT((*this >= 0.f).isFull()); return Internal::exponent(d.v()); } template<> Vc_INTRINSIC Vector Vector::exponent() const { VC_ASSERT((*this >= 0.f).isFull()); return Internal::exponent(d.v()); } template<> Vc_INTRINSIC Vector Vector::exponent() const { VC_ASSERT((*this >= 0.).isFull()); return Internal::exponent(d.v()); } // }}}1 // Random {{{1 static Vc_ALWAYS_INLINE void _doRandomStep(Vector &state0, Vector &state1) { state0.load(&Vc::RandomState[0]); state1.load(&Vc::RandomState[uint_v::Size]); (state1 * 0xdeece66du + 11).store(&Vc::RandomState[uint_v::Size]); uint_v(_mm256_xor_si256((state0 * 0xdeece66du + 11).data(), _mm256_srli_epi32(state1.data(), 16))).store(&Vc::RandomState[0]); } template Vc_ALWAYS_INLINE Vector Vector::Random() { Vector state0, state1; _doRandomStep(state0, state1); return state0.reinterpretCast >(); } template<> Vc_ALWAYS_INLINE Vector Vector::Random() { Vector state0, state1; _doRandomStep(state0, state1); return HT::sub(HV::or_(_cast(_mm256_srli_epi32(state0.data(), 2)), HT::one()), HT::one()); } template<> Vc_ALWAYS_INLINE Vector Vector::Random() { Vector state0, state1; _doRandomStep(state0, state1); return HT::sub(HV::or_(_cast(_mm256_srli_epi32(state0.data(), 2)), HT::one()), HT::one()); } template<> Vc_ALWAYS_INLINE Vector Vector::Random() { const m256i state = VectorHelper::load(&Vc::RandomState[0], Vc::Aligned); for (size_t k = 0; k < 8; k += 2) { typedef unsigned long long uint64 Vc_MAY_ALIAS; const uint64 stateX = *reinterpret_cast(&Vc::RandomState[k]); *reinterpret_cast(&Vc::RandomState[k]) = (stateX * 0x5deece66dull + 11); } return (Vector(_cast(_mm256_srli_epi64(state, 12))) | One()) - One(); } // }}}1 // shifted / rotated {{{1 template struct VectorShift; template<> struct VectorShift<32, 4, m256d, double> { static Vc_INTRINSIC m256d shifted(param256d v, int amount) { switch (amount) { case 0: return v; case 1: return avx_cast(_mm256_srli_si256(avx_cast(v), 1 * sizeof(double))); case 2: return avx_cast(_mm256_srli_si256(avx_cast(v), 2 * sizeof(double))); case 3: return avx_cast(_mm256_srli_si256(avx_cast(v), 3 * sizeof(double))); case -1: return avx_cast(_mm256_slli_si256(avx_cast(v), 1 * sizeof(double))); case -2: return avx_cast(_mm256_slli_si256(avx_cast(v), 2 * sizeof(double))); case -3: return avx_cast(_mm256_slli_si256(avx_cast(v), 3 * sizeof(double))); } return _mm256_setzero_pd(); } }; template struct VectorShift<32, 8, VectorType, EntryType> { typedef typename SseVectorType::Type SmallV; static Vc_INTRINSIC VectorType shifted(VC_ALIGNED_PARAMETER(VectorType) v, int amount) { switch (amount) { case 0: return v; case 1: return avx_cast(_mm256_srli_si256(avx_cast(v), 1 * sizeof(EntryType))); case 2: return avx_cast(_mm256_srli_si256(avx_cast(v), 2 * sizeof(EntryType))); case 3: return avx_cast(_mm256_srli_si256(avx_cast(v), 3 * sizeof(EntryType))); case 4: return avx_cast(_mm256_srli_si256(avx_cast(v), 4 * sizeof(EntryType))); case 5: return avx_cast(_mm256_srli_si256(avx_cast(v), 5 * sizeof(EntryType))); case 6: return avx_cast(_mm256_srli_si256(avx_cast(v), 6 * sizeof(EntryType))); case 7: return 
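// Random() above combines one linear congruential step per lane (state = state * 0xdeece66d + 11)
// with the usual bit trick for mapping random bits onto [0, 1): OR the bits into the
// mantissa of 1.0f, giving a uniform value in [1, 2), then subtract 1.0f.  Scalar sketch
// of that last step (illustration only; needs <cstdint> and <cstring>):
//     float bits_to_unit_float(std::uint32_t r)
//     {
//         const std::uint32_t u = (r >> 2) | 0x3f800000u; // sign/exponent of 1.0f, random mantissa
//         float f;
//         std::memcpy(&f, &u, 4);
//         return f - 1.0f;                                // [1, 2)  ->  [0, 1)
//     }
// The double variant works the same way with a 64-bit state, a shift by 12 and 1.0.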
avx_cast(_mm256_srli_si256(avx_cast(v), 7 * sizeof(EntryType))); case -1: return avx_cast(_mm256_slli_si256(avx_cast(v), 1 * sizeof(EntryType))); case -2: return avx_cast(_mm256_slli_si256(avx_cast(v), 2 * sizeof(EntryType))); case -3: return avx_cast(_mm256_slli_si256(avx_cast(v), 3 * sizeof(EntryType))); case -4: return avx_cast(_mm256_slli_si256(avx_cast(v), 4 * sizeof(EntryType))); case -5: return avx_cast(_mm256_slli_si256(avx_cast(v), 5 * sizeof(EntryType))); case -6: return avx_cast(_mm256_slli_si256(avx_cast(v), 6 * sizeof(EntryType))); case -7: return avx_cast(_mm256_slli_si256(avx_cast(v), 7 * sizeof(EntryType))); } return avx_cast(_mm256_setzero_ps()); } }; template struct VectorShift<16, 8, VectorType, EntryType> { enum { EntryTypeSizeof = sizeof(EntryType) }; static Vc_INTRINSIC VectorType shifted(VC_ALIGNED_PARAMETER(VectorType) v, int amount) { switch (amount) { case 0: return v; case 1: return avx_cast(_mm_srli_si128(avx_cast(v), 1 * EntryTypeSizeof)); case 2: return avx_cast(_mm_srli_si128(avx_cast(v), 2 * EntryTypeSizeof)); case 3: return avx_cast(_mm_srli_si128(avx_cast(v), 3 * EntryTypeSizeof)); case 4: return avx_cast(_mm_srli_si128(avx_cast(v), 4 * EntryTypeSizeof)); case 5: return avx_cast(_mm_srli_si128(avx_cast(v), 5 * EntryTypeSizeof)); case 6: return avx_cast(_mm_srli_si128(avx_cast(v), 6 * EntryTypeSizeof)); case 7: return avx_cast(_mm_srli_si128(avx_cast(v), 7 * EntryTypeSizeof)); case -1: return avx_cast(_mm_slli_si128(avx_cast(v), 1 * EntryTypeSizeof)); case -2: return avx_cast(_mm_slli_si128(avx_cast(v), 2 * EntryTypeSizeof)); case -3: return avx_cast(_mm_slli_si128(avx_cast(v), 3 * EntryTypeSizeof)); case -4: return avx_cast(_mm_slli_si128(avx_cast(v), 4 * EntryTypeSizeof)); case -5: return avx_cast(_mm_slli_si128(avx_cast(v), 5 * EntryTypeSizeof)); case -6: return avx_cast(_mm_slli_si128(avx_cast(v), 6 * EntryTypeSizeof)); case -7: return avx_cast(_mm_slli_si128(avx_cast(v), 7 * EntryTypeSizeof)); } return _mm_setzero_si128(); } }; template Vc_INTRINSIC Vector Vector::shifted(int amount) const { return VectorShift::shifted(d.v(), amount); } template struct VectorRotate; template struct VectorRotate<32, 4, VectorType, EntryType> { typedef typename SseVectorType::Type SmallV; enum { EntryTypeSizeof = sizeof(EntryType) }; static Vc_INTRINSIC VectorType rotated(VC_ALIGNED_PARAMETER(VectorType) v, int amount) { const m128i vLo = avx_cast(lo128(v)); const m128i vHi = avx_cast(hi128(v)); switch (static_cast(amount) % 4) { case 0: return v; case 1: return concat(avx_cast(_mm_alignr_epi8(vHi, vLo, 1 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vLo, vHi, 1 * EntryTypeSizeof))); case 2: return Mem::permute128(v); case 3: return concat(avx_cast(_mm_alignr_epi8(vLo, vHi, 1 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vHi, vLo, 1 * EntryTypeSizeof))); } return _mm256_setzero_pd(); } }; template struct VectorRotate<32, 8, VectorType, EntryType> { typedef typename SseVectorType::Type SmallV; enum { EntryTypeSizeof = sizeof(EntryType) }; static Vc_INTRINSIC VectorType rotated(VC_ALIGNED_PARAMETER(VectorType) v, int amount) { const m128i vLo = avx_cast(lo128(v)); const m128i vHi = avx_cast(hi128(v)); switch (static_cast(amount) % 8) { case 0: return v; case 1: return concat(avx_cast(_mm_alignr_epi8(vHi, vLo, 1 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vLo, vHi, 1 * EntryTypeSizeof))); case 2: return concat(avx_cast(_mm_alignr_epi8(vHi, vLo, 2 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vLo, vHi, 2 * EntryTypeSizeof))); case 3: return 
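// shifted(n) moves every entry n places towards index 0 and fills with zeros; rotated(n)
// wraps the entries around instead.  Scalar model of what the VectorShift / VectorRotate
// helpers here compute for an 8-entry vector (illustration only):
//     for (int i = 0; i < 8; ++i) out[i] = in[(i + n) % 8];   // rotated(n)
// The AVX code builds each rotation from _mm_alignr_epi8 on the two 128-bit halves:
// alignr(hi, lo, k * sizeof(EntryType)) yields lo[k..3] followed by hi[0..k-1], so
// concatenating alignr(hi, lo, k) and alignr(lo, hi, k) rotates all eight entries by k.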
concat(avx_cast(_mm_alignr_epi8(vHi, vLo, 3 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vLo, vHi, 3 * EntryTypeSizeof))); case 4: return Mem::permute128(v); case 5: return concat(avx_cast(_mm_alignr_epi8(vLo, vHi, 1 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vHi, vLo, 1 * EntryTypeSizeof))); case 6: return concat(avx_cast(_mm_alignr_epi8(vLo, vHi, 2 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vHi, vLo, 2 * EntryTypeSizeof))); case 7: return concat(avx_cast(_mm_alignr_epi8(vLo, vHi, 3 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vHi, vLo, 3 * EntryTypeSizeof))); } return avx_cast(_mm256_setzero_ps()); } }; template struct VectorRotate<16, 8, VectorType, EntryType> { enum { EntryTypeSizeof = sizeof(EntryType) }; static Vc_INTRINSIC VectorType rotated(VC_ALIGNED_PARAMETER(VectorType) v, int amount) { switch (static_cast(amount) % 8) { case 0: return v; case 1: return avx_cast(_mm_alignr_epi8(v, v, 1 * EntryTypeSizeof)); case 2: return avx_cast(_mm_alignr_epi8(v, v, 2 * EntryTypeSizeof)); case 3: return avx_cast(_mm_alignr_epi8(v, v, 3 * EntryTypeSizeof)); case 4: return avx_cast(_mm_alignr_epi8(v, v, 4 * EntryTypeSizeof)); case 5: return avx_cast(_mm_alignr_epi8(v, v, 5 * EntryTypeSizeof)); case 6: return avx_cast(_mm_alignr_epi8(v, v, 6 * EntryTypeSizeof)); case 7: return avx_cast(_mm_alignr_epi8(v, v, 7 * EntryTypeSizeof)); } return _mm_setzero_si128(); } }; template Vc_INTRINSIC Vector Vector::rotated(int amount) const { return VectorRotate::rotated(d.v(), amount); /* const m128i v0 = avx_cast(d.v()[0]); const m128i v1 = avx_cast(d.v()[1]); switch (static_cast(amount) % Size) { case 0: return *this; case 1: return concat(avx_cast(_mm_alignr_epi8(v1, v0, 1 * sizeof(EntryType))), avx_cast(_mm_alignr_epi8(v0, v1, 1 * sizeof(EntryType)))); case 2: return concat(avx_cast(_mm_alignr_epi8(v1, v0, 2 * sizeof(EntryType))), avx_cast(_mm_alignr_epi8(v0, v1, 2 * sizeof(EntryType)))); case 3: return concat(avx_cast(_mm_alignr_epi8(v1, v0, 3 * sizeof(EntryType))), avx_cast(_mm_alignr_epi8(v0, v1, 3 * sizeof(EntryType)))); case 4: return concat(d.v()[1], d.v()[0]); case 5: return concat(avx_cast(_mm_alignr_epi8(v0, v1, 1 * sizeof(EntryType))), avx_cast(_mm_alignr_epi8(v1, v0, 1 * sizeof(EntryType)))); case 6: return concat(avx_cast(_mm_alignr_epi8(v0, v1, 2 * sizeof(EntryType))), avx_cast(_mm_alignr_epi8(v1, v0, 2 * sizeof(EntryType)))); case 7: return concat(avx_cast(_mm_alignr_epi8(v0, v1, 3 * sizeof(EntryType))), avx_cast(_mm_alignr_epi8(v1, v0, 3 * sizeof(EntryType)))); } */ } // }}}1 } // namespace AVX } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "undomacros.h" // vim: foldmethod=marker Vc-0.7.4/avx/vectorhelper.h000066400000000000000000001245261233512346000155540ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
*/ #ifndef AVX_VECTORHELPER_H #define AVX_VECTORHELPER_H #include #include "types.h" #include "intrinsics.h" #include "casts.h" #include "macros.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace AVX { namespace Internal { Vc_INTRINSIC Vc_CONST m256 exponent(param256 v) { m128i tmp0 = _mm_srli_epi32(avx_cast(v), 23); m128i tmp1 = _mm_srli_epi32(avx_cast(hi128(v)), 23); tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x7f)); tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x7f)); return _mm256_cvtepi32_ps(concat(tmp0, tmp1)); } Vc_INTRINSIC Vc_CONST m256d exponent(param256d v) { m128i tmp0 = _mm_srli_epi64(avx_cast(v), 52); m128i tmp1 = _mm_srli_epi64(avx_cast(hi128(v)), 52); tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x3ff)); tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x3ff)); return _mm256_cvtepi32_pd(avx_cast(Mem::shuffle(avx_cast(tmp0), avx_cast(tmp1)))); } } // namespace Internal #define OP0(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name() { return code; } #define OP1(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(VTArg a) { return code; } #define OP2(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(VTArg a, VTArg b) { return code; } #define OP3(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(VTArg a, VTArg b, VTArg c) { return code; } template<> struct VectorHelper { typedef m256 VectorType; #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN typedef const VectorType & VTArg; #else typedef const VectorType VTArg; #endif template static Vc_ALWAYS_INLINE_L Vc_PURE_L VectorType load(const float *x, A) Vc_ALWAYS_INLINE_R Vc_PURE_R; static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, AlignedFlag) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, UnalignedFlag) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, VTArg m, AlignedFlag) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, VTArg m, UnalignedFlag) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, VTArg m, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE Vc_CONST VectorType cdab(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(2, 3, 0, 1)); } static Vc_ALWAYS_INLINE Vc_CONST VectorType badc(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(1, 0, 3, 2)); } static Vc_ALWAYS_INLINE Vc_CONST VectorType aaaa(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(0, 0, 0, 0)); } static Vc_ALWAYS_INLINE Vc_CONST VectorType bbbb(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(1, 1, 1, 1)); } static Vc_ALWAYS_INLINE Vc_CONST VectorType cccc(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(2, 2, 2, 2)); } static Vc_ALWAYS_INLINE Vc_CONST VectorType dddd(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(3, 3, 3, 3)); } static Vc_ALWAYS_INLINE Vc_CONST VectorType dacb(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(3, 0, 2, 1)); } OP0(allone, _mm256_setallone_ps()) OP0(zero, _mm256_setzero_ps()) OP2(or_, _mm256_or_ps(a, b)) OP2(xor_, _mm256_xor_ps(a, b)) OP2(and_, _mm256_and_ps(a, b)) OP2(andnot_, _mm256_andnot_ps(a, b)) OP3(blend, _mm256_blendv_ps(a, b, c)) }; template<> struct VectorHelper { typedef m256d VectorType; #ifdef 
VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN typedef const VectorType & VTArg; #else typedef const VectorType VTArg; #endif template static Vc_ALWAYS_INLINE_L Vc_PURE_L VectorType load(const double *x, A) Vc_ALWAYS_INLINE_R Vc_PURE_R; static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, AlignedFlag) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, UnalignedFlag) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, VTArg m, AlignedFlag) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, VTArg m, UnalignedFlag) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, VTArg m, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R; static VectorType cdab(VTArg x) { return _mm256_permute_pd(x, 5); } static VectorType badc(VTArg x) { return _mm256_permute2f128_pd(x, x, 1); } // aaaa bbbb cccc dddd specialized in vector.tcc static VectorType dacb(VTArg x) { const m128d cb = avx_cast(_mm_alignr_epi8(avx_cast(lo128(x)), avx_cast(hi128(x)), sizeof(double))); // XXX: lo and hi swapped? const m128d da = _mm_blend_pd(lo128(x), hi128(x), 0 + 2); // XXX: lo and hi swapped? return concat(cb, da); } OP0(allone, _mm256_setallone_pd()) OP0(zero, _mm256_setzero_pd()) OP2(or_, _mm256_or_pd(a, b)) OP2(xor_, _mm256_xor_pd(a, b)) OP2(and_, _mm256_and_pd(a, b)) OP2(andnot_, _mm256_andnot_pd(a, b)) OP3(blend, _mm256_blendv_pd(a, b, c)) }; template<> struct VectorHelper { typedef m256i VectorType; #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN typedef const VectorType & VTArg; #else typedef const VectorType VTArg; #endif template static VectorType load(const T *x, AlignedFlag) Vc_PURE; template static VectorType load(const T *x, UnalignedFlag) Vc_PURE; template static VectorType load(const T *x, StreamingAndAlignedFlag) Vc_PURE; template static VectorType load(const T *x, StreamingAndUnalignedFlag) Vc_PURE; template static void store(T *mem, VTArg x, AlignedFlag); template static void store(T *mem, VTArg x, UnalignedFlag); template static void store(T *mem, VTArg x, StreamingAndAlignedFlag); template static void store(T *mem, VTArg x, StreamingAndUnalignedFlag); template static void store(T *mem, VTArg x, VTArg m, AlignedFlag); template static void store(T *mem, VTArg x, VTArg m, UnalignedFlag); template static void store(T *mem, VTArg x, VTArg m, StreamingAndAlignedFlag); template static void store(T *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag); static VectorType cdab(VTArg x) { return avx_cast(_mm256_permute_ps(avx_cast(x), _MM_SHUFFLE(2, 3, 0, 1))); } static VectorType badc(VTArg x) { return avx_cast(_mm256_permute_ps(avx_cast(x), _MM_SHUFFLE(1, 0, 3, 2))); } static VectorType aaaa(VTArg x) { return avx_cast(_mm256_permute_ps(avx_cast(x), _MM_SHUFFLE(0, 0, 0, 0))); } static VectorType bbbb(VTArg x) { return avx_cast(_mm256_permute_ps(avx_cast(x), _MM_SHUFFLE(1, 1, 1, 1))); } static VectorType cccc(VTArg x) { return avx_cast(_mm256_permute_ps(avx_cast(x), _MM_SHUFFLE(2, 2, 2, 2))); } static VectorType dddd(VTArg x) { return avx_cast(_mm256_permute_ps(avx_cast(x), _MM_SHUFFLE(3, 3, 3, 3))); } static VectorType dacb(VTArg x) { return avx_cast(_mm256_permute_ps(avx_cast(x), _MM_SHUFFLE(3, 0, 2, 1))); } 
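// The Internal::exponent() helpers near the top of this file read the IEEE-754 exponent
// field directly: reinterpret the value as an integer, shift the mantissa bits away and
// subtract the bias.  Scalar sketch (illustration only; like the vector code it assumes a
// positive input; needs <cstdint> and <cstring>):
//     int exponent_of(float x)
//     {
//         std::uint32_t u;
//         std::memcpy(&u, &x, 4);
//         return static_cast<int>(u >> 23) - 0x7f;  // 8 exponent bits, bias 127
//     }
//     // double: shift by 52 and subtract 0x3ff (11 exponent bits, bias 1023)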
OP0(allone, _mm256_setallone_si256()) OP0(zero, _mm256_setzero_si256()) OP2(or_, _mm256_or_si256(a, b)) OP2(xor_, _mm256_xor_si256(a, b)) OP2(and_, _mm256_and_si256(a, b)) OP2(andnot_, _mm256_andnot_si256(a, b)) OP3(blend, _mm256_blendv_epi8(a, b, c)) }; template<> struct VectorHelper { typedef m128i VectorType; #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN typedef const VectorType & VTArg; #else typedef const VectorType VTArg; #endif template static VectorType load(const T *x, AlignedFlag) Vc_PURE; template static VectorType load(const T *x, UnalignedFlag) Vc_PURE; template static VectorType load(const T *x, StreamingAndAlignedFlag) Vc_PURE; template static VectorType load(const T *x, StreamingAndUnalignedFlag) Vc_PURE; template static void store(T *mem, VTArg x, AlignedFlag); template static void store(T *mem, VTArg x, UnalignedFlag); template static void store(T *mem, VTArg x, StreamingAndAlignedFlag); template static void store(T *mem, VTArg x, StreamingAndUnalignedFlag); template static void store(T *mem, VTArg x, VTArg m, AlignedFlag); template static void store(T *mem, VTArg x, VTArg m, UnalignedFlag); template static void store(T *mem, VTArg x, VTArg m, StreamingAndAlignedFlag); template static void store(T *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag); static VectorType cdab(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(2, 3, 0, 1)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(2, 3, 0, 1)); } static VectorType badc(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(1, 0, 3, 2)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)); } static VectorType aaaa(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(0, 0, 0, 0)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(0, 0, 0, 0)); } static VectorType bbbb(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(1, 1, 1, 1)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(1, 1, 1, 1)); } static VectorType cccc(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(2, 2, 2, 2)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(2, 2, 2, 2)); } static VectorType dddd(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(3, 3, 3, 3)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(3, 3, 3, 3)); } static VectorType dacb(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(3, 0, 2, 1)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(3, 0, 2, 1)); } OP0(allone, _mm_setallone_si128()) OP0(zero, _mm_setzero_si128()) OP2(or_, _mm_or_si128(a, b)) OP2(xor_, _mm_xor_si128(a, b)) OP2(and_, _mm_and_si128(a, b)) OP2(andnot_, _mm_andnot_si128(a, b)) OP3(blend, _mm_blendv_epi8(a, b, c)) }; #undef OP1 #undef OP2 #undef OP3 #define OP1(op) \ static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a) { return CAT(_mm256_##op##_, SUFFIX)(a); } #define OP(op) \ static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return CAT(_mm256_##op##_ , SUFFIX)(a, b); } #define OP_(op) \ static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return CAT(_mm256_##op , SUFFIX)(a, b); } #define OPx(op, op2) \ static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return CAT(_mm256_##op2##_, SUFFIX)(a, b); } #define OPcmp(op) \ static Vc_INTRINSIC VectorType Vc_CONST cmp##op(VTArg a, VTArg b) { return CAT(_mm256_cmp##op##_, SUFFIX)(a, b); } #define OP_CAST_(op) \ static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return CAT(_mm256_castps_, SUFFIX)( \ _mm256_##op##ps(CAT(CAT(_mm256_cast, SUFFIX), _ps)(a), \ CAT(CAT(_mm256_cast, SUFFIX), _ps)(b))); \ } #define 
MINMAX \ static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return CAT(_mm256_min_, SUFFIX)(a, b); } \ static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return CAT(_mm256_max_, SUFFIX)(a, b); } template<> struct VectorHelper { typedef m256d VectorType; #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN typedef const VectorType & VTArg; #else typedef const VectorType VTArg; #endif typedef double EntryType; typedef double ConcatType; #define SUFFIX pd static Vc_ALWAYS_INLINE VectorType notMaskedToZero(VTArg a, param256 mask) { return CAT(_mm256_and_, SUFFIX)(_mm256_castps_pd(mask), a); } static Vc_ALWAYS_INLINE VectorType set(const double a) { return CAT(_mm256_set1_, SUFFIX)(a); } static Vc_ALWAYS_INLINE VectorType set(const double a, const double b, const double c, const double d) { return CAT(_mm256_set_, SUFFIX)(a, b, c, d); } static Vc_ALWAYS_INLINE VectorType zero() { return CAT(_mm256_setzero_, SUFFIX)(); } static Vc_ALWAYS_INLINE VectorType one() { return CAT(_mm256_setone_, SUFFIX)(); }// set(1.); } static inline void fma(VectorType &v1, VTArg v2, VTArg v3) { #ifdef VC_IMPL_FMA4 v1 = _mm256_macc_pd(v1, v2, v3); #else VectorType h1 = _mm256_and_pd(v1, _mm256_broadcast_sd(reinterpret_cast(&c_general::highMaskDouble))); VectorType h2 = _mm256_and_pd(v2, _mm256_broadcast_sd(reinterpret_cast(&c_general::highMaskDouble))); #if defined(VC_GCC) && VC_GCC < 0x40703 // GCC before 4.7.3 uses an incorrect optimization where it replaces the subtraction with an andnot // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=54703 asm("":"+x"(h1), "+x"(h2)); #endif const VectorType l1 = _mm256_sub_pd(v1, h1); const VectorType l2 = _mm256_sub_pd(v2, h2); const VectorType ll = mul(l1, l2); const VectorType lh = add(mul(l1, h2), mul(h1, l2)); const VectorType hh = mul(h1, h2); // ll < lh < hh for all entries is certain const VectorType lh_lt_v3 = cmplt(abs(lh), abs(v3)); // |lh| < |v3| const VectorType b = _mm256_blendv_pd(v3, lh, lh_lt_v3); const VectorType c = _mm256_blendv_pd(lh, v3, lh_lt_v3); v1 = add(add(ll, b), add(c, hh)); #endif } OP(add) OP(sub) OP(mul) OPcmp(eq) OPcmp(neq) OPcmp(lt) OPcmp(nlt) OPcmp(le) OPcmp(nle) OP1(sqrt) static Vc_ALWAYS_INLINE Vc_CONST VectorType rsqrt(VTArg x) { return _mm256_div_pd(one(), sqrt(x)); } static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VTArg x) { return _mm256_div_pd(one(), x); } static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VTArg x) { return _mm256_cmpunord_pd(x, x); } static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VTArg x) { return _mm256_cmpord_pd(x, _mm256_mul_pd(zero(), x)); } static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(VTArg a) { return CAT(_mm256_and_, SUFFIX)(a, _mm256_setabsmask_pd()); } MINMAX static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VTArg a) { m128d b = _mm_min_pd(avx_cast(a), _mm256_extractf128_pd(a, 1)); b = _mm_min_sd(b, _mm_unpackhi_pd(b, b)); return _mm_cvtsd_f64(b); } static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VTArg a) { m128d b = _mm_max_pd(avx_cast(a), _mm256_extractf128_pd(a, 1)); b = _mm_max_sd(b, _mm_unpackhi_pd(b, b)); return _mm_cvtsd_f64(b); } static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VTArg a) { m128d b = _mm_mul_pd(avx_cast(a), _mm256_extractf128_pd(a, 1)); b = _mm_mul_sd(b, _mm_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1))); return _mm_cvtsd_f64(b); } static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VTArg a) { m128d b = _mm_add_pd(avx_cast(a), _mm256_extractf128_pd(a, 1)); b = _mm_hadd_pd(b, b); // or: b = _mm_add_sd(b, _mm256_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1))); return _mm_cvtsd_f64(b); } 
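// The fma() fallback above emulates a fused multiply-add where no FMA instruction is
// available: each factor is split into a high part (low mantissa bits cleared with
// c_general::highMaskDouble, chosen so that the product of two high parts is exact) and a
// low remainder; the partial products ll = l1*l2, lh = l1*h2 + h1*l2 and hh = h1*h2 are
// then summed together with v3 smallest-magnitude first to limit the rounding error.
// Scalar model of the final summation (illustration only; needs <cmath>):
//     // |ll| <= |lh| <= |hh|
//     const double b = std::fabs(lh) < std::fabs(v3) ? lh : v3;  // smaller of lh, v3
//     const double c = std::fabs(lh) < std::fabs(v3) ? v3 : lh;  // larger of lh, v3
//     const double result = (ll + b) + (c + hh);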
#undef SUFFIX static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) { return _mm256_round_pd(a, _MM_FROUND_NINT); } }; template<> struct VectorHelper { typedef float EntryType; typedef m256 VectorType; #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN typedef const VectorType & VTArg; #else typedef const VectorType VTArg; #endif typedef double ConcatType; #define SUFFIX ps static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VTArg a, param256 mask) { return CAT(_mm256_and_, SUFFIX)(mask, a); } static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a) { return CAT(_mm256_set1_, SUFFIX)(a); } static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d, const float e, const float f, const float g, const float h) { return CAT(_mm256_set_, SUFFIX)(a, b, c, d, e, f, g, h); } static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return CAT(_mm256_setzero_, SUFFIX)(); } static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return CAT(_mm256_setone_, SUFFIX)(); }// set(1.f); } static Vc_ALWAYS_INLINE Vc_CONST m256 concat(param256d a, param256d b) { return _mm256_insertf128_ps(avx_cast(_mm256_cvtpd_ps(a)), _mm256_cvtpd_ps(b), 1); } static inline void fma(VectorType &v1, VTArg v2, VTArg v3) { #ifdef VC_IMPL_FMA4 v1 = _mm256_macc_ps(v1, v2, v3); #else m256d v1_0 = _mm256_cvtps_pd(lo128(v1)); m256d v1_1 = _mm256_cvtps_pd(hi128(v1)); m256d v2_0 = _mm256_cvtps_pd(lo128(v2)); m256d v2_1 = _mm256_cvtps_pd(hi128(v2)); m256d v3_0 = _mm256_cvtps_pd(lo128(v3)); m256d v3_1 = _mm256_cvtps_pd(hi128(v3)); v1 = AVX::concat( _mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_0, v2_0), v3_0)), _mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_1, v2_1), v3_1))); #endif } OP(add) OP(sub) OP(mul) OPcmp(eq) OPcmp(neq) OPcmp(lt) OPcmp(nlt) OPcmp(le) OPcmp(nle) OP1(sqrt) OP1(rsqrt) static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VTArg x) { return _mm256_cmpunord_ps(x, x); } static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VTArg x) { return _mm256_cmpord_ps(x, _mm256_mul_ps(zero(), x)); } static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VTArg x) { return _mm256_rcp_ps(x); } static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(VTArg a) { return CAT(_mm256_and_, SUFFIX)(a, _mm256_setabsmask_ps()); } MINMAX static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VTArg a) { m128 b = _mm_min_ps(avx_cast(a), _mm256_extractf128_ps(a, 1)); b = _mm_min_ps(b, _mm_movehl_ps(b, b)); // b = min(a0, a2), min(a1, a3), min(a2, a2), min(a3, a3) b = _mm_min_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); // b = min(a0, a1), a1, a2, a3 return _mm_cvtss_f32(b); } static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VTArg a) { m128 b = _mm_max_ps(avx_cast(a), _mm256_extractf128_ps(a, 1)); b = _mm_max_ps(b, _mm_movehl_ps(b, b)); // b = max(a0, a2), max(a1, a3), max(a2, a2), max(a3, a3) b = _mm_max_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); // b = max(a0, a1), a1, a2, a3 return _mm_cvtss_f32(b); } static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VTArg a) { m128 b = _mm_mul_ps(avx_cast(a), _mm256_extractf128_ps(a, 1)); b = _mm_mul_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3))); b = _mm_mul_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1))); return _mm_cvtss_f32(b); } static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VTArg a) { m128 b = _mm_add_ps(avx_cast(a), _mm256_extractf128_ps(a, 1)); b = _mm_add_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3))); b = _mm_add_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1))); return _mm_cvtss_f32(b); } #undef SUFFIX static Vc_ALWAYS_INLINE 
Vc_CONST VectorType round(VTArg a) { return _mm256_round_ps(a, _MM_FROUND_NINT); } }; template<> struct VectorHelper : public VectorHelper {}; template<> struct VectorHelper { typedef int EntryType; typedef m256i VectorType; #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN typedef const VectorType & VTArg; #else typedef const VectorType VTArg; #endif typedef long long ConcatType; #define SUFFIX si256 OP_(or_) OP_(and_) OP_(xor_) static Vc_INTRINSIC VectorType Vc_CONST zero() { return CAT(_mm256_setzero_, SUFFIX)(); } static Vc_INTRINSIC VectorType Vc_CONST notMaskedToZero(VTArg a, param256 mask) { return CAT(_mm256_and_, SUFFIX)(_mm256_castps_si256(mask), a); } #undef SUFFIX #define SUFFIX epi32 static Vc_INTRINSIC VectorType Vc_CONST one() { return CAT(_mm256_setone_, SUFFIX)(); } static Vc_INTRINSIC VectorType Vc_CONST set(const int a) { return CAT(_mm256_set1_, SUFFIX)(a); } static Vc_INTRINSIC VectorType Vc_CONST set(const int a, const int b, const int c, const int d, const int e, const int f, const int g, const int h) { return CAT(_mm256_set_, SUFFIX)(a, b, c, d, e, f, g, h); } static Vc_INTRINSIC void fma(VectorType &v1, VTArg v2, VTArg v3) { v1 = add(mul(v1, v2), v3); } static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VTArg a, int shift) { return CAT(_mm256_slli_, SUFFIX)(a, shift); } static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VTArg a, int shift) { return CAT(_mm256_srai_, SUFFIX)(a, shift); } OP1(abs) MINMAX static Vc_INTRINSIC EntryType Vc_CONST min(VTArg a) { m128i b = _mm_min_epi32(avx_cast(a), _mm256_extractf128_si256(a, 1)); b = _mm_min_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2))); b = _mm_min_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here return _mm_cvtsi128_si32(b); } static Vc_INTRINSIC EntryType Vc_CONST max(VTArg a) { m128i b = _mm_max_epi32(avx_cast(a), _mm256_extractf128_si256(a, 1)); b = _mm_max_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2))); b = _mm_max_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here return _mm_cvtsi128_si32(b); } static Vc_INTRINSIC EntryType Vc_CONST add(VTArg a) { m128i b = _mm_add_epi32(avx_cast(a), _mm256_extractf128_si256(a, 1)); b = _mm_add_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2))); b = _mm_add_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); return _mm_cvtsi128_si32(b); } static Vc_INTRINSIC EntryType Vc_CONST mul(VTArg a) { m128i b = _mm_mullo_epi32(avx_cast(a), _mm256_extractf128_si256(a, 1)); b = _mm_mullo_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2))); b = _mm_mullo_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); return _mm_cvtsi128_si32(b); } static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm256_mullo_epi32(a, b); } OP(add) OP(sub) OPcmp(eq) OPcmp(lt) OPcmp(gt) static Vc_INTRINSIC VectorType Vc_CONST cmpneq(VTArg a, VTArg b) { m256i x = cmpeq(a, b); return _mm256_andnot_si256(x, _mm256_setallone_si256()); } static Vc_INTRINSIC VectorType Vc_CONST cmpnlt(VTArg a, VTArg b) { m256i x = cmplt(a, b); return _mm256_andnot_si256(x, _mm256_setallone_si256()); } static Vc_INTRINSIC VectorType Vc_CONST cmple (VTArg a, VTArg b) { m256i x = cmpgt(a, b); return _mm256_andnot_si256(x, _mm256_setallone_si256()); } static Vc_INTRINSIC VectorType Vc_CONST cmpnle(VTArg a, VTArg b) { return cmpgt(a, b); } #undef SUFFIX static Vc_INTRINSIC VectorType Vc_CONST round(VTArg a) { return a; } }; template<> struct VectorHelper { typedef unsigned int EntryType; typedef m256i 
VectorType; #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN typedef const VectorType & VTArg; #else typedef const VectorType VTArg; #endif typedef unsigned long long ConcatType; #define SUFFIX si256 OP_CAST_(or_) OP_CAST_(and_) OP_CAST_(xor_) static Vc_INTRINSIC VectorType Vc_CONST zero() { return CAT(_mm256_setzero_, SUFFIX)(); } static Vc_INTRINSIC VectorType Vc_CONST notMaskedToZero(VTArg a, param256 mask) { return CAT(_mm256_and_, SUFFIX)(_mm256_castps_si256(mask), a); } #undef SUFFIX #define SUFFIX epu32 static Vc_INTRINSIC VectorType Vc_CONST one() { return CAT(_mm256_setone_, SUFFIX)(); } MINMAX static Vc_INTRINSIC EntryType Vc_CONST min(VTArg a) { m128i b = _mm_min_epu32(avx_cast(a), _mm256_extractf128_si256(a, 1)); b = _mm_min_epu32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2))); b = _mm_min_epu32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here return _mm_cvtsi128_si32(b); } static Vc_INTRINSIC EntryType Vc_CONST max(VTArg a) { m128i b = _mm_max_epu32(avx_cast(a), _mm256_extractf128_si256(a, 1)); b = _mm_max_epu32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2))); b = _mm_max_epu32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here return _mm_cvtsi128_si32(b); } static Vc_INTRINSIC EntryType Vc_CONST add(VTArg a) { m128i b = _mm_add_epi32(avx_cast(a), _mm256_extractf128_si256(a, 1)); b = _mm_add_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2))); b = _mm_add_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); return _mm_cvtsi128_si32(b); } static Vc_INTRINSIC EntryType Vc_CONST mul(VTArg a) { m128i b = _mm_mullo_epi32(avx_cast(a), _mm256_extractf128_si256(a, 1)); b = _mm_mullo_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2))); b = _mm_mullo_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); return _mm_cvtsi128_si32(b); } static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm256_mullo_epi32(a, b); } static Vc_INTRINSIC void fma(VectorType &v1, VTArg v2, VTArg v3) { v1 = add(mul(v1, v2), v3); } #undef SUFFIX #define SUFFIX epi32 static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VTArg a, int shift) { return CAT(_mm256_slli_, SUFFIX)(a, shift); } static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VTArg a, int shift) { return CAT(_mm256_srli_, SUFFIX)(a, shift); } static Vc_INTRINSIC VectorType Vc_CONST set(const unsigned int a) { return CAT(_mm256_set1_, SUFFIX)(a); } static Vc_INTRINSIC VectorType Vc_CONST set(const unsigned int a, const unsigned int b, const unsigned int c, const unsigned int d, const unsigned int e, const unsigned int f, const unsigned int g, const unsigned int h) { return CAT(_mm256_set_, SUFFIX)(a, b, c, d, e, f, g, h); } OP(add) OP(sub) OPcmp(eq) static Vc_INTRINSIC VectorType Vc_CONST cmpneq(VTArg a, VTArg b) { return _mm256_andnot_si256(cmpeq(a, b), _mm256_setallone_si256()); } #ifndef USE_INCORRECT_UNSIGNED_COMPARE static Vc_INTRINSIC VectorType Vc_CONST cmplt(VTArg a, VTArg b) { return _mm256_cmplt_epu32(a, b); } static Vc_INTRINSIC VectorType Vc_CONST cmpgt(VTArg a, VTArg b) { return _mm256_cmpgt_epu32(a, b); } #else OPcmp(lt) OPcmp(gt) #endif static Vc_INTRINSIC VectorType Vc_CONST cmpnlt(VTArg a, VTArg b) { return _mm256_andnot_si256(cmplt(a, b), _mm256_setallone_si256()); } static Vc_INTRINSIC VectorType Vc_CONST cmple (VTArg a, VTArg b) { return _mm256_andnot_si256(cmpgt(a, b), _mm256_setallone_si256()); } static Vc_INTRINSIC VectorType Vc_CONST cmpnle(VTArg a, VTArg b) { return cmpgt(a, b); } #undef SUFFIX static Vc_INTRINSIC 
VectorType Vc_CONST round(VTArg a) { return a; } }; template<> struct VectorHelper { typedef VectorTypeHelper::Type VectorType; #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN typedef const VectorType & VTArg; #else typedef const VectorType VTArg; #endif typedef signed short EntryType; typedef int ConcatType; static Vc_INTRINSIC VectorType Vc_CONST or_(VTArg a, VTArg b) { return _mm_or_si128(a, b); } static Vc_INTRINSIC VectorType Vc_CONST and_(VTArg a, VTArg b) { return _mm_and_si128(a, b); } static Vc_INTRINSIC VectorType Vc_CONST xor_(VTArg a, VTArg b) { return _mm_xor_si128(a, b); } static Vc_INTRINSIC VectorType Vc_CONST zero() { return _mm_setzero_si128(); } static Vc_INTRINSIC VectorType Vc_CONST notMaskedToZero(VTArg a, param128 mask) { return _mm_and_si128(_mm_castps_si128(mask), a); } #define SUFFIX epi16 static Vc_INTRINSIC VectorType Vc_CONST one() { return CAT(_mm_setone_, SUFFIX)(); } static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VTArg a, int shift) { return CAT(_mm_slli_, SUFFIX)(a, shift); } static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VTArg a, int shift) { return CAT(_mm_srai_, SUFFIX)(a, shift); } static Vc_INTRINSIC VectorType Vc_CONST set(const EntryType a) { return CAT(_mm_set1_, SUFFIX)(a); } static Vc_INTRINSIC VectorType Vc_CONST set(const EntryType a, const EntryType b, const EntryType c, const EntryType d, const EntryType e, const EntryType f, const EntryType g, const EntryType h) { return CAT(_mm_set_, SUFFIX)(a, b, c, d, e, f, g, h); } static Vc_INTRINSIC void fma(VectorType &v1, VTArg v2, VTArg v3) { v1 = add(mul(v1, v2), v3); } static Vc_INTRINSIC VectorType Vc_CONST abs(VTArg a) { return _mm_abs_epi16(a); } static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm_mullo_epi16(a, b); } static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return _mm_min_epi16(a, b); } static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return _mm_max_epi16(a, b); } static Vc_INTRINSIC EntryType Vc_CONST min(VTArg _a) { // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change" VectorType a = min(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2))); a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); return _mm_cvtsi128_si32(a); // & 0xffff is implicit } static Vc_INTRINSIC EntryType Vc_CONST max(VTArg _a) { // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change" VectorType a = max(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2))); a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); return _mm_cvtsi128_si32(a); // & 0xffff is implicit } static Vc_INTRINSIC EntryType Vc_CONST mul(VTArg _a) { VectorType a = mul(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2))); a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); return _mm_cvtsi128_si32(a); // & 0xffff is implicit } static Vc_INTRINSIC EntryType Vc_CONST add(VTArg _a) { VectorType a = add(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2))); a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); return _mm_cvtsi128_si32(a); // & 0xffff is implicit } static Vc_INTRINSIC VectorType Vc_CONST add(VTArg a, VTArg b) { return _mm_add_epi16(a, b); } static Vc_INTRINSIC VectorType Vc_CONST sub(VTArg a, VTArg b) { return _mm_sub_epi16(a, b); } static Vc_INTRINSIC VectorType Vc_CONST cmpeq(VTArg a, VTArg b) { return 
_mm_cmpeq_epi16(a, b); } static Vc_INTRINSIC VectorType Vc_CONST cmplt(VTArg a, VTArg b) { return _mm_cmplt_epi16(a, b); } static Vc_INTRINSIC VectorType Vc_CONST cmpgt(VTArg a, VTArg b) { return _mm_cmpgt_epi16(a, b); } static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpneq(VTArg a, VTArg b) { m128i x = cmpeq(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); } static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnlt(VTArg a, VTArg b) { m128i x = cmplt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); } static Vc_ALWAYS_INLINE Vc_CONST VectorType cmple (VTArg a, VTArg b) { m128i x = cmpgt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); } static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnle(VTArg a, VTArg b) { return cmpgt(a, b); } #undef SUFFIX static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) { return a; } }; template<> struct VectorHelper { typedef VectorTypeHelper::Type VectorType; #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN typedef const VectorType & VTArg; #else typedef const VectorType VTArg; #endif typedef unsigned short EntryType; typedef unsigned int ConcatType; static Vc_INTRINSIC VectorType Vc_CONST or_(VTArg a, VTArg b) { return _mm_or_si128(a, b); } static Vc_INTRINSIC VectorType Vc_CONST and_(VTArg a, VTArg b) { return _mm_and_si128(a, b); } static Vc_INTRINSIC VectorType Vc_CONST xor_(VTArg a, VTArg b) { return _mm_xor_si128(a, b); } static Vc_INTRINSIC VectorType Vc_CONST zero() { return _mm_setzero_si128(); } static Vc_INTRINSIC VectorType Vc_CONST notMaskedToZero(VTArg a, param128 mask) { return _mm_and_si128(_mm_castps_si128(mask), a); } static Vc_INTRINSIC VectorType Vc_CONST one() { return _mm_setone_epu16(); } static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm_mullo_epi16(a, b); } static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return _mm_min_epu16(a, b); } static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return _mm_max_epu16(a, b); } #define SUFFIX epi16 static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VTArg a, int shift) { return CAT(_mm_slli_, SUFFIX)(a, shift); } static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VTArg a, int shift) { return CAT(_mm_srli_, SUFFIX)(a, shift); } static Vc_INTRINSIC EntryType Vc_CONST min(VTArg _a) { // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change" VectorType a = min(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2))); a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); return _mm_cvtsi128_si32(a); // & 0xffff is implicit } static Vc_INTRINSIC EntryType Vc_CONST max(VTArg _a) { // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change" VectorType a = max(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2))); a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); return _mm_cvtsi128_si32(a); // & 0xffff is implicit } static Vc_INTRINSIC EntryType Vc_CONST mul(VTArg _a) { // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change" VectorType a = mul(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2))); a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); return _mm_cvtsi128_si32(a); // & 0xffff is implicit } static Vc_INTRINSIC EntryType Vc_CONST add(VTArg _a) { // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change" VectorType a = add(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2))); a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); a = 
add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); return _mm_cvtsi128_si32(a); // & 0xffff is implicit } static Vc_INTRINSIC VectorType Vc_CONST set(const EntryType a) { return CAT(_mm_set1_, SUFFIX)(a); } static Vc_INTRINSIC VectorType Vc_CONST set(const EntryType a, const EntryType b, const EntryType c, const EntryType d, const EntryType e, const EntryType f, const EntryType g, const EntryType h) { return CAT(_mm_set_, SUFFIX)(a, b, c, d, e, f, g, h); } static Vc_INTRINSIC void fma(VectorType &v1, VTArg v2, VTArg v3) { v1 = add(mul(v1, v2), v3); } static Vc_INTRINSIC VectorType Vc_CONST add(VTArg a, VTArg b) { return _mm_add_epi16(a, b); } static Vc_INTRINSIC VectorType Vc_CONST sub(VTArg a, VTArg b) { return _mm_sub_epi16(a, b); } static Vc_INTRINSIC VectorType Vc_CONST cmpeq(VTArg a, VTArg b) { return _mm_cmpeq_epi16(a, b); } static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpneq(VTArg a, VTArg b) { return _mm_andnot_si128(cmpeq(a, b), _mm_setallone_si128()); } #ifndef USE_INCORRECT_UNSIGNED_COMPARE static Vc_INTRINSIC VectorType Vc_CONST cmplt(VTArg a, VTArg b) { return _mm_cmplt_epu16(a, b); } static Vc_INTRINSIC VectorType Vc_CONST cmpgt(VTArg a, VTArg b) { return _mm_cmpgt_epu16(a, b); } #else static Vc_INTRINSIC VectorType Vc_CONST cmplt(VTArg a, VTArg b) { return _mm_cmplt_epi16(a, b); } static Vc_INTRINSIC VectorType Vc_CONST cmpgt(VTArg a, VTArg b) { return _mm_cmpgt_epi16(a, b); } #endif static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnlt(VTArg a, VTArg b) { return _mm_andnot_si128(cmplt(a, b), _mm_setallone_si128()); } static Vc_ALWAYS_INLINE Vc_CONST VectorType cmple (VTArg a, VTArg b) { return _mm_andnot_si128(cmpgt(a, b), _mm_setallone_si128()); } static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnle(VTArg a, VTArg b) { return cmpgt(a, b); } #undef SUFFIX static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) { return a; } }; #undef OP1 #undef OP #undef OP_ #undef OPx #undef OPcmp template<> struct VectorHelper { typedef VectorTypeHelper::Type VectorType; #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN typedef const VectorType & VTArg; #else typedef const VectorType VTArg; #endif typedef char EntryType; typedef short ConcatType; }; template<> struct VectorHelper { typedef VectorTypeHelper::Type VectorType; #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN typedef const VectorType & VTArg; #else typedef const VectorType VTArg; #endif typedef unsigned char EntryType; typedef unsigned short ConcatType; }; } // namespace AVX } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "vectorhelper.tcc" #include "undomacros.h" #endif // AVX_VECTORHELPER_H Vc-0.7.4/avx/vectorhelper.tcc000066400000000000000000000277361233512346000161030ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2011 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
*/ #include "casts.h" #include /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace AVX { //////////////////////////////////////////////////////////////////////////////////////////////////// // float_v //////////////////////////////////////////////////////////////////////////////////////////////////// //// loads template<> Vc_ALWAYS_INLINE Vc_PURE m256 VectorHelper::load(const float *m, AlignedFlag) { return _mm256_load_ps(m); } template<> Vc_ALWAYS_INLINE Vc_PURE m256 VectorHelper::load(const float *m, UnalignedFlag) { return _mm256_loadu_ps(m); } template<> Vc_ALWAYS_INLINE Vc_PURE m256 VectorHelper::load(const float *m, StreamingAndAlignedFlag) { return avx_cast(concat(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast(m))), _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast(&m[4]))))); } template<> Vc_ALWAYS_INLINE Vc_PURE m256 VC_WARN("AVX does not support streaming unaligned loads. Will use non-streaming unaligned load instead.") VectorHelper::load(const float *m, StreamingAndUnalignedFlag) { return _mm256_loadu_ps(m); } //////////////////////////////////////////////////////////////////////////////////////////////////// //// stores Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VTArg x, AlignedFlag) { _mm256_store_ps(mem, x); } Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VTArg x, UnalignedFlag) { _mm256_storeu_ps(mem, x); } Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VTArg x, StreamingAndAlignedFlag) { _mm256_stream_ps(mem, x); } Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VTArg x, StreamingAndUnalignedFlag) { _mm_maskmoveu_si128(avx_cast(x), _mm_setallone_si128(), reinterpret_cast(mem)); _mm_maskmoveu_si128(_mm256_extractf128_si256(avx_cast(x), 1), _mm_setallone_si128(), reinterpret_cast(mem + 4)); } Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VTArg x, VTArg m, AlignedFlag) { _mm256_maskstore(mem, m, x); } Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VTArg x, VTArg m, UnalignedFlag) { _mm256_maskstore(mem, m, x); } Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VTArg x, VTArg m, StreamingAndAlignedFlag) { _mm_maskmoveu_si128(avx_cast(x), avx_cast(m), reinterpret_cast(mem)); _mm_maskmoveu_si128(_mm256_extractf128_si256(avx_cast(x), 1), _mm256_extractf128_si256(avx_cast(m), 1), reinterpret_cast(mem + 4)); } Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag) { _mm_maskmoveu_si128(avx_cast(x), avx_cast(m), reinterpret_cast(mem)); _mm_maskmoveu_si128(_mm256_extractf128_si256(avx_cast(x), 1), _mm256_extractf128_si256(avx_cast(m), 1), reinterpret_cast(mem + 4)); } //////////////////////////////////////////////////////////////////////////////////////////////////// // double_v //////////////////////////////////////////////////////////////////////////////////////////////////// //// loads template<> Vc_ALWAYS_INLINE Vc_PURE m256d VectorHelper::load(const double *m, AlignedFlag) { return _mm256_load_pd(m); } template<> Vc_ALWAYS_INLINE Vc_PURE m256d VectorHelper::load(const double *m, UnalignedFlag) { return _mm256_loadu_pd(m); } template<> Vc_ALWAYS_INLINE Vc_PURE m256d VectorHelper::load(const double *m, StreamingAndAlignedFlag) { return avx_cast(concat( _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast(m))), _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast(&m[2]))))); } template<> Vc_ALWAYS_INLINE Vc_PURE m256d VC_WARN("AVX does not support streaming unaligned loads. 
Will use non-streaming unaligned load instead.") VectorHelper::load(const double *m, StreamingAndUnalignedFlag) { return _mm256_loadu_pd(m); } //////////////////////////////////////////////////////////////////////////////////////////////////// //// stores Vc_ALWAYS_INLINE void VectorHelper::store(double *mem, VTArg x, AlignedFlag) { _mm256_store_pd(mem, x); } Vc_ALWAYS_INLINE void VectorHelper::store(double *mem, VTArg x, UnalignedFlag) { _mm256_storeu_pd(mem, x); } Vc_ALWAYS_INLINE void VectorHelper::store(double *mem, VTArg x, StreamingAndAlignedFlag) { _mm256_stream_pd(mem, x); } Vc_ALWAYS_INLINE void VectorHelper::store(double *mem, VTArg x, StreamingAndUnalignedFlag) { _mm_maskmoveu_si128(avx_cast(x), _mm_setallone_si128(), reinterpret_cast(mem)); _mm_maskmoveu_si128(avx_cast(_mm256_extractf128_pd(x, 1)), _mm_setallone_si128(), reinterpret_cast(mem + 2)); } Vc_ALWAYS_INLINE void VectorHelper::store(double *mem, VTArg x, VTArg m, AlignedFlag) { _mm256_maskstore(mem, m, x); } Vc_ALWAYS_INLINE void VectorHelper::store(double *mem, VTArg x, VTArg m, UnalignedFlag) { _mm256_maskstore(mem, m, x); } Vc_ALWAYS_INLINE void VectorHelper::store(double *mem, VTArg x, VTArg m, StreamingAndAlignedFlag) { _mm_maskmoveu_si128(avx_cast(x), avx_cast(m), reinterpret_cast(mem)); _mm_maskmoveu_si128(avx_cast(_mm256_extractf128_pd(x, 1)), avx_cast(_mm256_extractf128_pd(m, 1)), reinterpret_cast(mem + 2)); } Vc_ALWAYS_INLINE void VectorHelper::store(double *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag) { _mm_maskmoveu_si128(avx_cast(x), avx_cast(m), reinterpret_cast(mem)); _mm_maskmoveu_si128(avx_cast(_mm256_extractf128_pd(x, 1)), avx_cast(_mm256_extractf128_pd(m, 1)), reinterpret_cast(mem + 2)); } //////////////////////////////////////////////////////////////////////////////////////////////////// // (u)int_v //////////////////////////////////////////////////////////////////////////////////////////////////// //// loads template Vc_ALWAYS_INLINE Vc_PURE m256i VectorHelper::load(const T *m, AlignedFlag) { return _mm256_load_si256(reinterpret_cast(m)); } template Vc_ALWAYS_INLINE Vc_PURE m256i VectorHelper::load(const T *m, UnalignedFlag) { return _mm256_loadu_si256(reinterpret_cast(m)); } template Vc_ALWAYS_INLINE Vc_PURE m256i VectorHelper::load(const T *m, StreamingAndAlignedFlag) { return concat(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast(m))), _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast(&m[4])))); } template Vc_ALWAYS_INLINE Vc_PURE m256i VC_WARN("AVX does not support streaming unaligned loads. 
Will use non-streaming unaligned load instead.") VectorHelper::load(const T *m, StreamingAndUnalignedFlag) { return _mm256_loadu_si256(reinterpret_cast(m)); } //////////////////////////////////////////////////////////////////////////////////////////////////// //// stores template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, AlignedFlag) { _mm256_store_si256(reinterpret_cast<__m256i *>(mem), x); } template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, UnalignedFlag) { _mm256_storeu_si256(reinterpret_cast<__m256i *>(mem), x); } template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, StreamingAndAlignedFlag) { _mm256_stream_si256(reinterpret_cast<__m256i *>(mem), x); } template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, StreamingAndUnalignedFlag) { _mm_maskmoveu_si128(avx_cast(x), _mm_setallone_si128(), reinterpret_cast(mem)); _mm_maskmoveu_si128(_mm256_extractf128_si256(x, 1), _mm_setallone_si128(), reinterpret_cast(mem + 4)); } template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, VTArg m, AlignedFlag) { _mm256_maskstore(mem, m, x); } template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, VTArg m, UnalignedFlag) { _mm256_maskstore(mem, m, x); } template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, VTArg m, StreamingAndAlignedFlag) { _mm_maskmoveu_si128(lo128(x), lo128(m), reinterpret_cast(mem)); _mm_maskmoveu_si128(hi128(x), hi128(m), reinterpret_cast(mem + 4)); } template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag) { _mm_maskmoveu_si128(lo128(x), lo128(m), reinterpret_cast(mem)); _mm_maskmoveu_si128(hi128(x), hi128(m), reinterpret_cast(mem + 4)); } //////////////////////////////////////////////////////////////////////////////////////////////////// // (u)short_v //////////////////////////////////////////////////////////////////////////////////////////////////// //// loads template Vc_ALWAYS_INLINE Vc_PURE m128i VectorHelper::load(const T *m, AlignedFlag) { return _mm_load_si128(reinterpret_cast(m)); } template Vc_ALWAYS_INLINE Vc_PURE m128i VectorHelper::load(const T *m, UnalignedFlag) { return _mm_loadu_si128(reinterpret_cast(m)); } template Vc_ALWAYS_INLINE Vc_PURE m128i VectorHelper::load(const T *m, StreamingAndAlignedFlag) { return _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast(m))); } template Vc_ALWAYS_INLINE Vc_PURE m128i VC_WARN("AVX does not support streaming unaligned loads. 
Will use non-streaming unaligned load instead.") VectorHelper::load(const T *m, StreamingAndUnalignedFlag) { return _mm_loadu_si128(reinterpret_cast(m)); } //////////////////////////////////////////////////////////////////////////////////////////////////// //// stores template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, AlignedFlag) { _mm_store_si128(reinterpret_cast<__m128i *>(mem), x); } template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, UnalignedFlag) { _mm_storeu_si128(reinterpret_cast<__m128i *>(mem), x); } template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, StreamingAndAlignedFlag) { _mm_stream_si128(reinterpret_cast<__m128i *>(mem), x); } template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, StreamingAndUnalignedFlag) { _mm_maskmoveu_si128(x, _mm_setallone_si128(), reinterpret_cast(mem)); } template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, VTArg m, AlignedFlag align) { store(mem, _mm_blendv_epi8(load(mem, align), x, m), align); } template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, VTArg m, UnalignedFlag align) { store(mem, _mm_blendv_epi8(load(mem, align), x, m), align); } template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, VTArg m, StreamingAndAlignedFlag) { _mm_maskmoveu_si128(x, m, reinterpret_cast(mem)); } template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag) { _mm_maskmoveu_si128(x, m, reinterpret_cast(mem)); } } // namespace AVX } // namespace Vc /*OUTER_NAMESPACE_END*/ Vc-0.7.4/avx/writemaskedvector.h000066400000000000000000000064101233512346000166030ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
*/ #ifndef VC_AVX_WRITEMASKEDVECTOR_H #define VC_AVX_WRITEMASKEDVECTOR_H #include "macros.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace AVX { template class WriteMaskedVector { friend class Vector; typedef typename VectorTypeHelper::Type VectorType; typedef typename DetermineEntryType::Type EntryType; enum Constants { Size = sizeof(VectorType) / sizeof(EntryType) }; typedef typename Vc::AVX::Mask Mask; public: FREE_STORE_OPERATORS_ALIGNED(32) //prefix Vector Vc_ALWAYS_INLINE_L &operator++() Vc_ALWAYS_INLINE_R; Vector Vc_ALWAYS_INLINE_L &operator--() Vc_ALWAYS_INLINE_R; //postfix Vector Vc_ALWAYS_INLINE_L operator++(int) Vc_ALWAYS_INLINE_R; Vector Vc_ALWAYS_INLINE_L operator--(int) Vc_ALWAYS_INLINE_R; Vector Vc_ALWAYS_INLINE_L &operator+=(const Vector &x) Vc_ALWAYS_INLINE_R; Vector Vc_ALWAYS_INLINE_L &operator-=(const Vector &x) Vc_ALWAYS_INLINE_R; Vector Vc_ALWAYS_INLINE_L &operator*=(const Vector &x) Vc_ALWAYS_INLINE_R; Vector Vc_ALWAYS_INLINE_L &operator/=(const Vector &x) Vc_ALWAYS_INLINE_R; Vector Vc_ALWAYS_INLINE &operator+=(EntryType x) { return operator+=(Vector(x)); } Vector Vc_ALWAYS_INLINE &operator-=(EntryType x) { return operator-=(Vector(x)); } Vector Vc_ALWAYS_INLINE &operator*=(EntryType x) { return operator*=(Vector(x)); } Vector Vc_ALWAYS_INLINE &operator/=(EntryType x) { return operator/=(Vector(x)); } Vector Vc_ALWAYS_INLINE_L &operator=(const Vector &x) Vc_ALWAYS_INLINE_R; Vector Vc_ALWAYS_INLINE &operator=(EntryType x) { return operator=(Vector(x)); } template Vc_INTRINSIC void call(const F &f) const { return vec->call(f, mask); } template Vc_INTRINSIC void call(F &f) const { return vec->call(f, mask); } template Vc_INTRINSIC Vector apply(const F &f) const { return vec->apply(f, mask); } template Vc_INTRINSIC Vector apply(F &f) const { return vec->apply(f, mask); } private: Vc_ALWAYS_INLINE WriteMaskedVector(Vector *v, const Mask &k) : vec(v), mask(k) {} Vector *const vec; Mask mask; }; } // namespace AVX } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "writemaskedvector.tcc" #include "undomacros.h" #endif // VC_AVX_WRITEMASKEDVECTOR_H Vc-0.7.4/avx/writemaskedvector.tcc000066400000000000000000000055331233512346000171320ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2010-2011 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
*/ /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace AVX { template Vc_ALWAYS_INLINE Vector &WriteMaskedVector::operator++() { vec->data() = VectorHelper::add(vec->data(), VectorHelper::notMaskedToZero(VectorHelper::one(), mask.data()) ); return *vec; } template Vc_ALWAYS_INLINE Vector &WriteMaskedVector::operator--() { vec->data() = VectorHelper::sub(vec->data(), VectorHelper::notMaskedToZero(VectorHelper::one(), mask.data()) ); return *vec; } template Vc_ALWAYS_INLINE Vector WriteMaskedVector::operator++(int) { Vector ret(*vec); vec->data() = VectorHelper::add(vec->data(), VectorHelper::notMaskedToZero(VectorHelper::one(), mask.data()) ); return ret; } template Vc_ALWAYS_INLINE Vector WriteMaskedVector::operator--(int) { Vector ret(*vec); vec->data() = VectorHelper::sub(vec->data(), VectorHelper::notMaskedToZero(VectorHelper::one(), mask.data()) ); return ret; } template Vc_ALWAYS_INLINE Vector &WriteMaskedVector::operator+=(const Vector &x) { vec->data() = VectorHelper::add(vec->data(), VectorHelper::notMaskedToZero(x.data(), mask.data())); return *vec; } template Vc_ALWAYS_INLINE Vector &WriteMaskedVector::operator-=(const Vector &x) { vec->data() = VectorHelper::sub(vec->data(), VectorHelper::notMaskedToZero(x.data(), mask.data())); return *vec; } template Vc_ALWAYS_INLINE Vector &WriteMaskedVector::operator*=(const Vector &x) { vec->assign(VectorHelper::mul(vec->data(), x.data()), mask); return *vec; } template Vc_ALWAYS_INLINE Vector &WriteMaskedVector::operator/=(const Vector &x) { vec->assign(*vec / x, mask); return *vec; } template Vc_ALWAYS_INLINE Vector &WriteMaskedVector::operator=(const Vector &x) { vec->assign(x, mask); return *vec; } } // namespace AVX } // namespace Vc /*OUTER_NAMESPACE_END*/ Vc-0.7.4/changeVersion.sh000077500000000000000000000017041233512346000152250ustar00rootroot00000000000000#!/bin/bash cd "`dirname "$0"`" # Read version number eval `awk '/VC_VERSION_NUMBER 0x[0-9]+/ { h=$3 } END { major=strtonum(substr(h, 1, 4)) minor=strtonum("0x" substr(h, 5, 2)) patch=strtonum("0x" substr(h, 7, 2)) / 2 printf "oldVersion=\"%d.%d.%d\"\n", major, minor, patch printf "newVersion=\"%d.%d.%d\"\n", major, minor, patch + 1 }' include/Vc/version.h` echo "current version: $oldVersion" echo -n " new version: " read -e -i "$newVersion" newVersion versionString="$newVersion-dev" versionNumber=`echo $newVersion | awk '{ split($0, v, "."); printf "0x%02x%02x%02x", v[1], v[2], v[3] * 2 }'` versionNumber=`echo $versionNumber | awk '{ printf "0x%06x", (strtonum($0) + 1) }'` sed -i "s/^PROJECT_NUMBER = .*\$/PROJECT_NUMBER = $versionString/" doc/Doxyfile sed -i \ -e "s/VC_VERSION_STRING \".*\"\$/VC_VERSION_STRING \"$versionString\"/" \ -e "s/VC_VERSION_NUMBER 0x.*\$/VC_VERSION_NUMBER $versionNumber/" \ include/Vc/version.h Vc-0.7.4/cmake/000077500000000000000000000000001233512346000131515ustar00rootroot00000000000000Vc-0.7.4/cmake/AddCompilerFlag.cmake000066400000000000000000000113001233512346000171230ustar00rootroot00000000000000# - Add a given compiler flag to flags variables. # AddCompilerFlag( []) # or # AddCompilerFlag( [C_FLAGS ] [CXX_FLAGS ] [C_RESULT ] # [CXX_RESULT ]) #============================================================================= # Copyright 2010-2013 Matthias Kretz # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. 
# # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # # * The names of Kitware, Inc., the Insight Consortium, or the names of # any consortium members, or of any contributors, may not be used to # endorse or promote products derived from this software without # specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #============================================================================= get_filename_component(_currentDir "${CMAKE_CURRENT_LIST_FILE}" PATH) include("${_currentDir}/CheckCCompilerFlag.cmake") include("${_currentDir}/CheckCXXCompilerFlag.cmake") macro(AddCompilerFlag _flag) string(REGEX REPLACE "[-.+/:= ]" "_" _flag_esc "${_flag}") set(_c_flags "CMAKE_C_FLAGS") set(_cxx_flags "CMAKE_CXX_FLAGS") set(_c_result tmp) set(_cxx_result tmp) if(${ARGC} EQUAL 2) message(WARNING "Deprecated use of the AddCompilerFlag macro.") unset(_c_result) set(_cxx_result ${ARGV1}) elseif(${ARGC} GREATER 2) set(state 0) unset(_c_flags) unset(_cxx_flags) unset(_c_result) unset(_cxx_result) foreach(_arg ${ARGN}) if(_arg STREQUAL "C_FLAGS") set(state 1) if(NOT DEFINED _c_result) set(_c_result tmp) endif() elseif(_arg STREQUAL "CXX_FLAGS") set(state 2) if(NOT DEFINED _cxx_result) set(_cxx_result tmp) endif() elseif(_arg STREQUAL "C_RESULT") set(state 3) elseif(_arg STREQUAL "CXX_RESULT") set(state 4) elseif(state EQUAL 1) set(_c_flags "${_arg}") elseif(state EQUAL 2) set(_cxx_flags "${_arg}") elseif(state EQUAL 3) set(_c_result "${_arg}") elseif(state EQUAL 4) set(_cxx_result "${_arg}") else() message(FATAL_ERROR "Syntax error for AddCompilerFlag") endif() endforeach() endif() if("${_flag}" STREQUAL "-mfma") # Compiling with FMA3 support may fail only at the assembler level. 
# In that case we need to have such an instruction in the test code set(_code "#include __m128 foo(__m128 x) { return _mm_fmadd_ps(x, x, x); } int main() { return 0; }") elseif("${_flag}" STREQUAL "-stdlib=libc++") # Compiling with libc++ not only requires a compiler that understands it, but also # the libc++ headers itself set(_code "#include int main() { return 0; }") else() set(_code "int main() { return 0; }") endif() if(DEFINED _c_result) check_c_compiler_flag("${_flag}" check_c_compiler_flag_${_flag_esc} "${_code}") set(${_c_result} ${check_c_compiler_flag_${_flag_esc}}) endif() if(DEFINED _cxx_result) check_cxx_compiler_flag("${_flag}" check_cxx_compiler_flag_${_flag_esc} "${_code}") set(${_cxx_result} ${check_cxx_compiler_flag_${_flag_esc}}) endif() if(check_c_compiler_flag_${_flag_esc} AND DEFINED _c_flags) set(${_c_flags} "${${_c_flags}} ${_flag}") endif() if(check_cxx_compiler_flag_${_flag_esc} AND DEFINED _cxx_flags) set(${_cxx_flags} "${${_cxx_flags}} ${_flag}") endif() endmacro(AddCompilerFlag) Vc-0.7.4/cmake/AddTargetProperty.cmake000066400000000000000000000040671233512346000175660ustar00rootroot00000000000000#============================================================================= # Copyright 2010-2013 Matthias Kretz # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # # * The names of Kitware, Inc., the Insight Consortium, or the names of # any consortium members, or of any contributors, may not be used to # endorse or promote products derived from this software without # specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #============================================================================= macro(add_target_property _target _prop _value) get_target_property(_oldprop "${_target}" ${_prop}) if(NOT _oldprop) set_target_properties("${_target}" PROPERTIES ${_prop} "${_value}") else(NOT _oldprop) set_target_properties("${_target}" PROPERTIES ${_prop} "${_oldprop} ${_value}") endif(NOT _oldprop) endmacro(add_target_property) Vc-0.7.4/cmake/CheckCCompilerFlag.cmake000066400000000000000000000067371233512346000175750ustar00rootroot00000000000000# - Check whether the C compiler supports a given flag. # CHECK_C_COMPILER_FLAG( ) # - the compiler flag # - variable to store the result # This internally calls the check_c_source_compiles macro. 
# See help for CheckCSourceCompiles for a listing of variables # that can modify the build. #============================================================================= # Copyright 2006-2009 Kitware, Inc. # Copyright 2006 Alexander Neundorf # Copyright 2011-2013 Matthias Kretz # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # # * The names of Kitware, Inc., the Insight Consortium, or the names of # any consortium members, or of any contributors, may not be used to # endorse or promote products derived from this software without # specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #============================================================================= INCLUDE(CheckCSourceCompiles) MACRO (CHECK_C_COMPILER_FLAG _FLAG _RESULT) SET(SAFE_CMAKE_REQUIRED_DEFINITIONS "${CMAKE_REQUIRED_DEFINITIONS}") SET(CMAKE_REQUIRED_DEFINITIONS "${_FLAG}") if(${ARGC} GREATER 2) SET(TEST_SOURCE "${ARGV2}") else() SET(TEST_SOURCE "int main() { return 0;}") endif() CHECK_C_SOURCE_COMPILES("${TEST_SOURCE}" ${_RESULT} # Some compilers do not fail with a bad flag FAIL_REGEX "error: bad value (.*) for .* switch" # GNU FAIL_REGEX "argument unused during compilation" # clang FAIL_REGEX "is valid for .* but not for C" # GNU FAIL_REGEX "unrecognized .*option" # GNU FAIL_REGEX "ignored for target" # GNU FAIL_REGEX "ignoring unknown option" # MSVC FAIL_REGEX "[Uu]nknown option" # HP FAIL_REGEX "[Ww]arning: [Oo]ption" # SunPro FAIL_REGEX "command option .* is not recognized" # XL FAIL_REGEX "WARNING: unknown flag:" # Open64 FAIL_REGEX " #10159: " # ICC FAIL_REGEX " #10353: " # ICC: option '-mfma' ignored, suggest using '-march=core-avx2' ) SET (CMAKE_REQUIRED_DEFINITIONS "${SAFE_CMAKE_REQUIRED_DEFINITIONS}") ENDMACRO (CHECK_C_COMPILER_FLAG) Vc-0.7.4/cmake/CheckCXXCompilerFlag.cmake000066400000000000000000000067601233512346000200510ustar00rootroot00000000000000# - Check whether the CXX compiler supports a given flag. # CHECK_CXX_COMPILER_FLAG( ) # - the compiler flag # - variable to store the result # This internally calls the check_cxx_source_compiles macro. See help # for CheckCXXSourceCompiles for a listing of variables that can # modify the build. #============================================================================= # Copyright 2006-2009 Kitware, Inc. 
# Copyright 2006 Alexander Neundorf # Copyright 2011-2013 Matthias Kretz # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # # * The names of Kitware, Inc., the Insight Consortium, or the names of # any consortium members, or of any contributors, may not be used to # endorse or promote products derived from this software without # specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #============================================================================= INCLUDE(CheckCXXSourceCompiles) MACRO (CHECK_CXX_COMPILER_FLAG _FLAG _RESULT) SET(SAFE_CMAKE_REQUIRED_DEFINITIONS "${CMAKE_REQUIRED_DEFINITIONS}") SET(CMAKE_REQUIRED_DEFINITIONS "${_FLAG}") if(${ARGC} GREATER 2) SET(TEST_SOURCE "${ARGV2}") else() SET(TEST_SOURCE "int main() { return 0;}") endif() CHECK_CXX_SOURCE_COMPILES("${TEST_SOURCE}" ${_RESULT} # Some compilers do not fail with a bad flag FAIL_REGEX "error: bad value (.*) for .* switch" # GNU FAIL_REGEX "argument unused during compilation" # clang FAIL_REGEX "is valid for .* but not for C\\\\+\\\\+" # GNU FAIL_REGEX "unrecognized .*option" # GNU FAIL_REGEX "ignored for target" # GNU FAIL_REGEX "ignoring unknown option" # MSVC FAIL_REGEX "[Uu]nknown option" # HP FAIL_REGEX "[Ww]arning: [Oo]ption" # SunPro FAIL_REGEX "command option .* is not recognized" # XL FAIL_REGEX "WARNING: unknown flag:" # Open64 FAIL_REGEX " #10159: " # ICC FAIL_REGEX " #10353: " # ICC: option '-mfma' ignored, suggest using '-march=core-avx2' ) SET (CMAKE_REQUIRED_DEFINITIONS "${SAFE_CMAKE_REQUIRED_DEFINITIONS}") ENDMACRO (CHECK_CXX_COMPILER_FLAG) Vc-0.7.4/cmake/FindSSE.cmake000066400000000000000000000104071233512346000154100ustar00rootroot00000000000000# Check if SSE instructions are available on the machine where # the project is compiled. 
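# Usage note (added commentary): the checks below only inspect what the CPU of the *build host*
# supports and cache the result in SSE2_FOUND, SSE3_FOUND, SSSE3_FOUND and SSE4_1_FOUND; they do
# not add any compile flags themselves.  A minimal, hypothetical wiring in a project could look
# like this (illustrative sketch only -- the include path and preprocessor defines are
# assumptions, only the *_FOUND variables come from this module):
#
#   include(FindSSE)
#   if(SSE4_1_FOUND)
#     add_definitions(-DHAVE_SSE4_1)
#   elseif(SSE2_FOUND)
#     add_definitions(-DHAVE_SSE2)
#   endif()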
IF(CMAKE_SYSTEM_NAME MATCHES "Linux") EXEC_PROGRAM(cat ARGS "/proc/cpuinfo" OUTPUT_VARIABLE CPUINFO) STRING(REGEX REPLACE "^.*(sse2).*$" "\\1" SSE_THERE ${CPUINFO}) STRING(COMPARE EQUAL "sse2" "${SSE_THERE}" SSE2_TRUE) IF (SSE2_TRUE) set(SSE2_FOUND true CACHE BOOL "SSE2 available on host") ELSE (SSE2_TRUE) set(SSE2_FOUND false CACHE BOOL "SSE2 available on host") ENDIF (SSE2_TRUE) # /proc/cpuinfo apparently omits sse3 :( STRING(REGEX REPLACE "^.*[^s](sse3).*$" "\\1" SSE_THERE ${CPUINFO}) STRING(COMPARE EQUAL "sse3" "${SSE_THERE}" SSE3_TRUE) IF (NOT SSE3_TRUE) STRING(REGEX REPLACE "^.*(T2300).*$" "\\1" SSE_THERE ${CPUINFO}) STRING(COMPARE EQUAL "T2300" "${SSE_THERE}" SSE3_TRUE) ENDIF (NOT SSE3_TRUE) STRING(REGEX REPLACE "^.*(ssse3).*$" "\\1" SSE_THERE ${CPUINFO}) STRING(COMPARE EQUAL "ssse3" "${SSE_THERE}" SSSE3_TRUE) IF (SSE3_TRUE OR SSSE3_TRUE) set(SSE3_FOUND true CACHE BOOL "SSE3 available on host") ELSE (SSE3_TRUE OR SSSE3_TRUE) set(SSE3_FOUND false CACHE BOOL "SSE3 available on host") ENDIF (SSE3_TRUE OR SSSE3_TRUE) IF (SSSE3_TRUE) set(SSSE3_FOUND true CACHE BOOL "SSSE3 available on host") ELSE (SSSE3_TRUE) set(SSSE3_FOUND false CACHE BOOL "SSSE3 available on host") ENDIF (SSSE3_TRUE) STRING(REGEX REPLACE "^.*(sse4_1).*$" "\\1" SSE_THERE ${CPUINFO}) STRING(COMPARE EQUAL "sse4_1" "${SSE_THERE}" SSE41_TRUE) IF (SSE41_TRUE) set(SSE4_1_FOUND true CACHE BOOL "SSE4.1 available on host") ELSE (SSE41_TRUE) set(SSE4_1_FOUND false CACHE BOOL "SSE4.1 available on host") ENDIF (SSE41_TRUE) ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Darwin") EXEC_PROGRAM("/usr/sbin/sysctl -n machdep.cpu.features" OUTPUT_VARIABLE CPUINFO) STRING(REGEX REPLACE "^.*[^S](SSE2).*$" "\\1" SSE_THERE ${CPUINFO}) STRING(COMPARE EQUAL "SSE2" "${SSE_THERE}" SSE2_TRUE) IF (SSE2_TRUE) set(SSE2_FOUND true CACHE BOOL "SSE2 available on host") ELSE (SSE2_TRUE) set(SSE2_FOUND false CACHE BOOL "SSE2 available on host") ENDIF (SSE2_TRUE) STRING(REGEX REPLACE "^.*[^S](SSE3).*$" "\\1" SSE_THERE ${CPUINFO}) STRING(COMPARE EQUAL "SSE3" "${SSE_THERE}" SSE3_TRUE) IF (SSE3_TRUE) set(SSE3_FOUND true CACHE BOOL "SSE3 available on host") ELSE (SSE3_TRUE) set(SSE3_FOUND false CACHE BOOL "SSE3 available on host") ENDIF (SSE3_TRUE) STRING(REGEX REPLACE "^.*(SSSE3).*$" "\\1" SSE_THERE ${CPUINFO}) STRING(COMPARE EQUAL "SSSE3" "${SSE_THERE}" SSSE3_TRUE) IF (SSSE3_TRUE) set(SSSE3_FOUND true CACHE BOOL "SSSE3 available on host") ELSE (SSSE3_TRUE) set(SSSE3_FOUND false CACHE BOOL "SSSE3 available on host") ENDIF (SSSE3_TRUE) STRING(REGEX REPLACE "^.*(SSE4.1).*$" "\\1" SSE_THERE ${CPUINFO}) STRING(COMPARE EQUAL "SSE4.1" "${SSE_THERE}" SSE41_TRUE) IF (SSE41_TRUE) set(SSE4_1_FOUND true CACHE BOOL "SSE4.1 available on host") ELSE (SSE41_TRUE) set(SSE4_1_FOUND false CACHE BOOL "SSE4.1 available on host") ENDIF (SSE41_TRUE) ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Windows") # TODO set(SSE2_FOUND true CACHE BOOL "SSE2 available on host") set(SSE3_FOUND false CACHE BOOL "SSE3 available on host") set(SSSE3_FOUND false CACHE BOOL "SSSE3 available on host") set(SSE4_1_FOUND false CACHE BOOL "SSE4.1 available on host") ELSE(CMAKE_SYSTEM_NAME MATCHES "Linux") set(SSE2_FOUND true CACHE BOOL "SSE2 available on host") set(SSE3_FOUND false CACHE BOOL "SSE3 available on host") set(SSSE3_FOUND false CACHE BOOL "SSSE3 available on host") set(SSE4_1_FOUND false CACHE BOOL "SSE4.1 available on host") ENDIF(CMAKE_SYSTEM_NAME MATCHES "Linux") if(NOT SSE2_FOUND) MESSAGE(STATUS "Could not find hardware support for SSE2 on this machine.") endif(NOT SSE2_FOUND) if(NOT SSE3_FOUND) MESSAGE(STATUS 
"Could not find hardware support for SSE3 on this machine.") endif(NOT SSE3_FOUND) if(NOT SSSE3_FOUND) MESSAGE(STATUS "Could not find hardware support for SSSE3 on this machine.") endif(NOT SSSE3_FOUND) if(NOT SSE4_1_FOUND) MESSAGE(STATUS "Could not find hardware support for SSE4.1 on this machine.") endif(NOT SSE4_1_FOUND) mark_as_advanced(SSE2_FOUND SSE3_FOUND SSSE3_FOUND SSE4_1_FOUND) Vc-0.7.4/cmake/FindVc.cmake000066400000000000000000000052341233512346000153300ustar00rootroot00000000000000# Locate the Vc template library. Vc can be found at http://gitorious.org/Vc/ # # This file is meant to be copied into projects that want to use Vc. It will # search for VcConfig.cmake, which ships with Vc and will provide up-to-date # buildsystem changes. Thus there should not be any need to update FindVc.cmake # again after you integrated it into your project. # # This module defines the following variables: # Vc_FOUND # Vc_INCLUDE_DIR # Vc_LIBRARIES # Vc_DEFINITIONS # Vc_VERSION_MAJOR # Vc_VERSION_MINOR # Vc_VERSION_PATCH # Vc_VERSION # Vc_VERSION_STRING # Vc_INSTALL_DIR # Vc_LIB_DIR # Vc_CMAKE_MODULES_DIR # # The following two variables are set according to the compiler used. Feel free # to use them to skip whole compilation units. # Vc_SSE_INTRINSICS_BROKEN # Vc_AVX_INTRINSICS_BROKEN # #============================================================================= # Copyright 2009-2012 Matthias Kretz # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # # * The names of Kitware, Inc., the Insight Consortium, or the names of # any consortium members, or of any contributors, may not be used to # endorse or promote products derived from this software without # specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#============================================================================= find_package(Vc ${Vc_FIND_VERSION} QUIET NO_MODULE PATHS $ENV{HOME} /opt/Vc) include(FindPackageHandleStandardArgs) find_package_handle_standard_args(Vc CONFIG_MODE) Vc-0.7.4/cmake/OptimizeForArchitecture.cmake000066400000000000000000000575041233512346000210000ustar00rootroot00000000000000#============================================================================= # Copyright 2010-2013 Matthias Kretz # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # # * The names of Kitware, Inc., the Insight Consortium, or the names of # any consortium members, or of any contributors, may not be used to # endorse or promote products derived from this software without # specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #============================================================================= get_filename_component(_currentDir "${CMAKE_CURRENT_LIST_FILE}" PATH) include("${_currentDir}/AddCompilerFlag.cmake") include(CheckIncludeFile) macro(_my_find _list _value _ret) list(FIND ${_list} "${_value}" _found) if(_found EQUAL -1) set(${_ret} FALSE) else(_found EQUAL -1) set(${_ret} TRUE) endif(_found EQUAL -1) endmacro(_my_find) macro(AutodetectHostArchitecture) set(TARGET_ARCHITECTURE "generic") set(Vc_ARCHITECTURE_FLAGS) set(_vendor_id) set(_cpu_family) set(_cpu_model) if(CMAKE_SYSTEM_NAME STREQUAL "Linux") file(READ "/proc/cpuinfo" _cpuinfo) string(REGEX REPLACE ".*vendor_id[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _vendor_id "${_cpuinfo}") string(REGEX REPLACE ".*cpu family[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_family "${_cpuinfo}") string(REGEX REPLACE ".*model[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_model "${_cpuinfo}") string(REGEX REPLACE ".*flags[ \t]*:[ \t]+([^\n]+).*" "\\1" _cpu_flags "${_cpuinfo}") elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") exec_program("/usr/sbin/sysctl -n machdep.cpu.vendor" OUTPUT_VARIABLE _vendor_id) exec_program("/usr/sbin/sysctl -n machdep.cpu.model" OUTPUT_VARIABLE _cpu_model) exec_program("/usr/sbin/sysctl -n machdep.cpu.family" OUTPUT_VARIABLE _cpu_family) exec_program("/usr/sbin/sysctl -n machdep.cpu.features" OUTPUT_VARIABLE _cpu_flags) string(TOLOWER "${_cpu_flags}" _cpu_flags) string(REPLACE "." 
"_" _cpu_flags "${_cpu_flags}") elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows") get_filename_component(_vendor_id "[HKEY_LOCAL_MACHINE\\Hardware\\Description\\System\\CentralProcessor\\0;VendorIdentifier]" NAME CACHE) get_filename_component(_cpu_id "[HKEY_LOCAL_MACHINE\\Hardware\\Description\\System\\CentralProcessor\\0;Identifier]" NAME CACHE) mark_as_advanced(_vendor_id _cpu_id) string(REGEX REPLACE ".* Family ([0-9]+) .*" "\\1" _cpu_family "${_cpu_id}") string(REGEX REPLACE ".* Model ([0-9]+) .*" "\\1" _cpu_model "${_cpu_id}") endif(CMAKE_SYSTEM_NAME STREQUAL "Linux") if(_vendor_id STREQUAL "GenuineIntel") if(_cpu_family EQUAL 6) # Any recent Intel CPU except NetBurst if(_cpu_model EQUAL 62) set(TARGET_ARCHITECTURE "ivy-bridge") elseif(_cpu_model EQUAL 58) set(TARGET_ARCHITECTURE "ivy-bridge") elseif(_cpu_model EQUAL 47) # Xeon E7 4860 set(TARGET_ARCHITECTURE "westmere") elseif(_cpu_model EQUAL 46) # Xeon 7500 series set(TARGET_ARCHITECTURE "westmere") elseif(_cpu_model EQUAL 45) # Xeon TNG set(TARGET_ARCHITECTURE "sandy-bridge") elseif(_cpu_model EQUAL 44) # Xeon 5600 series set(TARGET_ARCHITECTURE "westmere") elseif(_cpu_model EQUAL 42) # Core TNG set(TARGET_ARCHITECTURE "sandy-bridge") elseif(_cpu_model EQUAL 37) # Core i7/i5/i3 set(TARGET_ARCHITECTURE "westmere") elseif(_cpu_model EQUAL 31) # Core i7/i5 set(TARGET_ARCHITECTURE "westmere") elseif(_cpu_model EQUAL 30) # Core i7/i5 set(TARGET_ARCHITECTURE "westmere") elseif(_cpu_model EQUAL 29) set(TARGET_ARCHITECTURE "penryn") elseif(_cpu_model EQUAL 28) set(TARGET_ARCHITECTURE "atom") elseif(_cpu_model EQUAL 26) set(TARGET_ARCHITECTURE "nehalem") elseif(_cpu_model EQUAL 23) set(TARGET_ARCHITECTURE "penryn") elseif(_cpu_model EQUAL 15) set(TARGET_ARCHITECTURE "merom") elseif(_cpu_model EQUAL 14) set(TARGET_ARCHITECTURE "core") elseif(_cpu_model LESS 14) message(WARNING "Your CPU (family ${_cpu_family}, model ${_cpu_model}) is not known. Auto-detection of optimization flags failed and will use the generic CPU settings with SSE2.") set(TARGET_ARCHITECTURE "generic") else() message(WARNING "Your CPU (family ${_cpu_family}, model ${_cpu_model}) is not known. Auto-detection of optimization flags failed and will use the 65nm Core 2 CPU settings.") set(TARGET_ARCHITECTURE "merom") endif() elseif(_cpu_family EQUAL 7) # Itanium (not supported) message(WARNING "Your CPU (Itanium: family ${_cpu_family}, model ${_cpu_model}) is not supported by OptimizeForArchitecture.cmake.") elseif(_cpu_family EQUAL 15) # NetBurst list(APPEND _available_vector_units_list "sse" "sse2") if(_cpu_model GREATER 2) # Not sure whether this must be 3 or even 4 instead list(APPEND _available_vector_units_list "sse" "sse2" "sse3") endif(_cpu_model GREATER 2) endif(_cpu_family EQUAL 6) elseif(_vendor_id STREQUAL "AuthenticAMD") if(_cpu_family EQUAL 22) # 16h set(TARGET_ARCHITECTURE "AMD 16h") elseif(_cpu_family EQUAL 21) # 15h if(_cpu_model LESS 2) set(TARGET_ARCHITECTURE "bulldozer") else() set(TARGET_ARCHITECTURE "piledriver") endif() elseif(_cpu_family EQUAL 20) # 14h set(TARGET_ARCHITECTURE "AMD 14h") elseif(_cpu_family EQUAL 18) # 12h elseif(_cpu_family EQUAL 16) # 10h set(TARGET_ARCHITECTURE "barcelona") elseif(_cpu_family EQUAL 15) set(TARGET_ARCHITECTURE "k8") if(_cpu_model GREATER 64) # I don't know the right number to put here. 
This is just a guess from the hardware I have access to set(TARGET_ARCHITECTURE "k8-sse3") endif(_cpu_model GREATER 64) endif() endif(_vendor_id STREQUAL "GenuineIntel") endmacro() macro(OptimizeForArchitecture) set(TARGET_ARCHITECTURE "auto" CACHE STRING "CPU architecture to optimize for. Using an incorrect setting here can result in crashes of the resulting binary because of invalid instructions used.\nSetting the value to \"auto\" will try to optimize for the architecture where cmake is called.\nOther supported values are: \"none\", \"generic\", \"core\", \"merom\" (65nm Core2), \"penryn\" (45nm Core2), \"nehalem\", \"westmere\", \"sandy-bridge\", \"ivy-bridge\", \"atom\", \"k8\", \"k8-sse3\", \"barcelona\", \"istanbul\", \"magny-cours\", \"bulldozer\", \"interlagos\", \"piledriver\", \"AMD 14h\", \"AMD 16h\".") set(_force) if(NOT _last_target_arch STREQUAL "${TARGET_ARCHITECTURE}") message(STATUS "target changed from \"${_last_target_arch}\" to \"${TARGET_ARCHITECTURE}\"") set(_force FORCE) endif() set(_last_target_arch "${TARGET_ARCHITECTURE}" CACHE STRING "" FORCE) mark_as_advanced(_last_target_arch) string(TOLOWER "${TARGET_ARCHITECTURE}" TARGET_ARCHITECTURE) set(_march_flag_list) set(_available_vector_units_list) if(TARGET_ARCHITECTURE STREQUAL "auto") AutodetectHostArchitecture() message(STATUS "Detected CPU: ${TARGET_ARCHITECTURE}") endif(TARGET_ARCHITECTURE STREQUAL "auto") if(TARGET_ARCHITECTURE STREQUAL "core") list(APPEND _march_flag_list "core2") list(APPEND _available_vector_units_list "sse" "sse2" "sse3") elseif(TARGET_ARCHITECTURE STREQUAL "merom") list(APPEND _march_flag_list "merom") list(APPEND _march_flag_list "core2") list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3") elseif(TARGET_ARCHITECTURE STREQUAL "penryn") list(APPEND _march_flag_list "penryn") list(APPEND _march_flag_list "core2") list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3") message(STATUS "Sadly the Penryn architecture exists in variants with SSE4.1 and without SSE4.1.") if(_cpu_flags MATCHES "sse4_1") message(STATUS "SSE4.1: enabled (auto-detected from this computer's CPU flags)") list(APPEND _available_vector_units_list "sse4.1") else() message(STATUS "SSE4.1: disabled (auto-detected from this computer's CPU flags)") endif() elseif(TARGET_ARCHITECTURE STREQUAL "nehalem") list(APPEND _march_flag_list "nehalem") list(APPEND _march_flag_list "corei7") list(APPEND _march_flag_list "core2") list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4.1" "sse4.2") elseif(TARGET_ARCHITECTURE STREQUAL "westmere") list(APPEND _march_flag_list "westmere") list(APPEND _march_flag_list "corei7") list(APPEND _march_flag_list "core2") list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4.1" "sse4.2") elseif(TARGET_ARCHITECTURE STREQUAL "ivy-bridge") list(APPEND _march_flag_list "core-avx-i") list(APPEND _march_flag_list "corei7-avx") list(APPEND _march_flag_list "core2") list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4.1" "sse4.2" "avx" "rdrnd" "f16c") elseif(TARGET_ARCHITECTURE STREQUAL "sandy-bridge") list(APPEND _march_flag_list "sandybridge") list(APPEND _march_flag_list "corei7-avx") list(APPEND _march_flag_list "corei7") list(APPEND _march_flag_list "core2") list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4.1" "sse4.2" "avx") elseif(TARGET_ARCHITECTURE STREQUAL "atom") list(APPEND _march_flag_list "atom") list(APPEND _march_flag_list "core2") list(APPEND _available_vector_units_list 
"sse" "sse2" "sse3" "ssse3") elseif(TARGET_ARCHITECTURE STREQUAL "k8") list(APPEND _march_flag_list "k8") list(APPEND _available_vector_units_list "sse" "sse2") elseif(TARGET_ARCHITECTURE STREQUAL "k8-sse3") list(APPEND _march_flag_list "k8-sse3") list(APPEND _march_flag_list "k8") list(APPEND _available_vector_units_list "sse" "sse2" "sse3") elseif(TARGET_ARCHITECTURE STREQUAL "AMD 16h") list(APPEND _march_flag_list "btver2") list(APPEND _march_flag_list "btver1") list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "f16c") elseif(TARGET_ARCHITECTURE STREQUAL "AMD 14h") list(APPEND _march_flag_list "btver1") list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4a") elseif(TARGET_ARCHITECTURE STREQUAL "piledriver") list(APPEND _march_flag_list "bdver2") list(APPEND _march_flag_list "bdver1") list(APPEND _march_flag_list "bulldozer") list(APPEND _march_flag_list "barcelona") list(APPEND _march_flag_list "core2") list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4" "fma" "f16c") elseif(TARGET_ARCHITECTURE STREQUAL "interlagos") list(APPEND _march_flag_list "bdver1") list(APPEND _march_flag_list "bulldozer") list(APPEND _march_flag_list "barcelona") list(APPEND _march_flag_list "core2") list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4") elseif(TARGET_ARCHITECTURE STREQUAL "bulldozer") list(APPEND _march_flag_list "bdver1") list(APPEND _march_flag_list "bulldozer") list(APPEND _march_flag_list "barcelona") list(APPEND _march_flag_list "core2") list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4") elseif(TARGET_ARCHITECTURE STREQUAL "barcelona") list(APPEND _march_flag_list "barcelona") list(APPEND _march_flag_list "core2") list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "sse4a") elseif(TARGET_ARCHITECTURE STREQUAL "istanbul") list(APPEND _march_flag_list "barcelona") list(APPEND _march_flag_list "core2") list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "sse4a") elseif(TARGET_ARCHITECTURE STREQUAL "magny-cours") list(APPEND _march_flag_list "barcelona") list(APPEND _march_flag_list "core2") list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "sse4a") elseif(TARGET_ARCHITECTURE STREQUAL "generic") list(APPEND _march_flag_list "generic") elseif(TARGET_ARCHITECTURE STREQUAL "none") # add this clause to remove it from the else clause else(TARGET_ARCHITECTURE STREQUAL "core") message(FATAL_ERROR "Unknown target architecture: \"${TARGET_ARCHITECTURE}\". 
Please set TARGET_ARCHITECTURE to a supported value.") endif(TARGET_ARCHITECTURE STREQUAL "core") if(NOT TARGET_ARCHITECTURE STREQUAL "none") set(_disable_vector_unit_list) set(_enable_vector_unit_list) _my_find(_available_vector_units_list "sse2" SSE2_FOUND) _my_find(_available_vector_units_list "sse3" SSE3_FOUND) _my_find(_available_vector_units_list "ssse3" SSSE3_FOUND) _my_find(_available_vector_units_list "sse4.1" SSE4_1_FOUND) _my_find(_available_vector_units_list "sse4.2" SSE4_2_FOUND) _my_find(_available_vector_units_list "sse4a" SSE4a_FOUND) if(DEFINED Vc_AVX_INTRINSICS_BROKEN AND Vc_AVX_INTRINSICS_BROKEN) UserWarning("AVX disabled per default because of old/broken compiler") set(AVX_FOUND false) set(XOP_FOUND false) set(FMA4_FOUND false) else() _my_find(_available_vector_units_list "avx" AVX_FOUND) if(DEFINED Vc_FMA4_INTRINSICS_BROKEN AND Vc_FMA4_INTRINSICS_BROKEN) UserWarning("FMA4 disabled per default because of old/broken compiler") set(FMA4_FOUND false) else() _my_find(_available_vector_units_list "fma4" FMA4_FOUND) endif() if(DEFINED Vc_XOP_INTRINSICS_BROKEN AND Vc_XOP_INTRINSICS_BROKEN) UserWarning("XOP disabled per default because of old/broken compiler") set(XOP_FOUND false) else() _my_find(_available_vector_units_list "xop" XOP_FOUND) endif() endif() set(USE_SSE2 ${SSE2_FOUND} CACHE BOOL "Use SSE2. If SSE2 instructions are not enabled the SSE implementation will be disabled." ${_force}) set(USE_SSE3 ${SSE3_FOUND} CACHE BOOL "Use SSE3. If SSE3 instructions are not enabled they will be emulated." ${_force}) set(USE_SSSE3 ${SSSE3_FOUND} CACHE BOOL "Use SSSE3. If SSSE3 instructions are not enabled they will be emulated." ${_force}) set(USE_SSE4_1 ${SSE4_1_FOUND} CACHE BOOL "Use SSE4.1. If SSE4.1 instructions are not enabled they will be emulated." ${_force}) set(USE_SSE4_2 ${SSE4_2_FOUND} CACHE BOOL "Use SSE4.2. If SSE4.2 instructions are not enabled they will be emulated." ${_force}) set(USE_SSE4a ${SSE4a_FOUND} CACHE BOOL "Use SSE4a. If SSE4a instructions are not enabled they will be emulated." ${_force}) set(USE_AVX ${AVX_FOUND} CACHE BOOL "Use AVX. This will double some of the vector sizes relative to SSE." ${_force}) set(USE_XOP ${XOP_FOUND} CACHE BOOL "Use XOP." ${_force}) set(USE_FMA4 ${FMA4_FOUND} CACHE BOOL "Use FMA4." ${_force}) mark_as_advanced(USE_SSE2 USE_SSE3 USE_SSSE3 USE_SSE4_1 USE_SSE4_2 USE_SSE4a USE_AVX USE_XOP USE_FMA4) if(USE_SSE2) list(APPEND _enable_vector_unit_list "sse2") else(USE_SSE2) list(APPEND _disable_vector_unit_list "sse2") endif(USE_SSE2) if(USE_SSE3) list(APPEND _enable_vector_unit_list "sse3") else(USE_SSE3) list(APPEND _disable_vector_unit_list "sse3") endif(USE_SSE3) if(USE_SSSE3) list(APPEND _enable_vector_unit_list "ssse3") else(USE_SSSE3) list(APPEND _disable_vector_unit_list "ssse3") endif(USE_SSSE3) if(USE_SSE4_1) list(APPEND _enable_vector_unit_list "sse4.1") else(USE_SSE4_1) list(APPEND _disable_vector_unit_list "sse4.1") endif(USE_SSE4_1) if(USE_SSE4_2) list(APPEND _enable_vector_unit_list "sse4.2") else(USE_SSE4_2) list(APPEND _disable_vector_unit_list "sse4.2") endif(USE_SSE4_2) if(USE_SSE4a) list(APPEND _enable_vector_unit_list "sse4a") else(USE_SSE4a) list(APPEND _disable_vector_unit_list "sse4a") endif(USE_SSE4a) if(USE_AVX) list(APPEND _enable_vector_unit_list "avx") # we want SSE intrinsics to result in instructions using the VEX prefix. # Otherwise integer ops (which require the older SSE intrinsics) would # always have a large penalty. 
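        # The "sse2avx" entry becomes the -msse2avx flag in the flag loop below; with GCC this
        # makes SSE intrinsics emit VEX-encoded instructions instead of legacy SSE encodings.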
list(APPEND _enable_vector_unit_list "sse2avx") else(USE_AVX) list(APPEND _disable_vector_unit_list "avx") endif(USE_AVX) if(USE_XOP) list(APPEND _enable_vector_unit_list "xop") else() list(APPEND _disable_vector_unit_list "xop") endif() if(USE_FMA4) list(APPEND _enable_vector_unit_list "fma4") else() list(APPEND _disable_vector_unit_list "fma4") endif() if(MSVC) # MSVC on 32 bit can select /arch:SSE2 (since 2010 also /arch:AVX) # MSVC on 64 bit cannot select anything (should have changed with MSVC 2010) _my_find(_enable_vector_unit_list "avx" _avx) set(_avx_flag FALSE) if(_avx) AddCompilerFlag("/arch:AVX" CXX_FLAGS Vc_ARCHITECTURE_FLAGS CXX_RESULT _avx_flag) endif() if(NOT _avx_flag) _my_find(_enable_vector_unit_list "sse2" _found) if(_found) AddCompilerFlag("/arch:SSE2" CXX_FLAGS Vc_ARCHITECTURE_FLAGS) endif() endif() foreach(_flag ${_enable_vector_unit_list}) string(TOUPPER "${_flag}" _flag) string(REPLACE "." "_" _flag "__${_flag}__") add_definitions("-D${_flag}") endforeach(_flag) elseif(CMAKE_CXX_COMPILER MATCHES "/(icpc|icc)$") # ICC (on Linux) _my_find(_available_vector_units_list "avx2" _found) if(_found) AddCompilerFlag("-xCORE-AVX2" CXX_FLAGS Vc_ARCHITECTURE_FLAGS) else(_found) _my_find(_available_vector_units_list "f16c" _found) if(_found) AddCompilerFlag("-xCORE-AVX-I" CXX_FLAGS Vc_ARCHITECTURE_FLAGS) else(_found) _my_find(_available_vector_units_list "avx" _found) if(_found) AddCompilerFlag("-xAVX" CXX_FLAGS Vc_ARCHITECTURE_FLAGS) else(_found) _my_find(_available_vector_units_list "sse4.2" _found) if(_found) AddCompilerFlag("-xSSE4.2" CXX_FLAGS Vc_ARCHITECTURE_FLAGS) else(_found) _my_find(_available_vector_units_list "sse4.1" _found) if(_found) AddCompilerFlag("-xSSE4.1" CXX_FLAGS Vc_ARCHITECTURE_FLAGS) else(_found) _my_find(_available_vector_units_list "ssse3" _found) if(_found) AddCompilerFlag("-xSSSE3" CXX_FLAGS Vc_ARCHITECTURE_FLAGS) else(_found) _my_find(_available_vector_units_list "sse3" _found) if(_found) # If the target host is an AMD machine then we still want to use -xSSE2 because the binary would refuse to run at all otherwise _my_find(_march_flag_list "barcelona" _found) if(NOT _found) _my_find(_march_flag_list "k8-sse3" _found) endif(NOT _found) if(_found) AddCompilerFlag("-xSSE2" CXX_FLAGS Vc_ARCHITECTURE_FLAGS) else(_found) AddCompilerFlag("-xSSE3" CXX_FLAGS Vc_ARCHITECTURE_FLAGS) endif(_found) else(_found) _my_find(_available_vector_units_list "sse2" _found) if(_found) AddCompilerFlag("-xSSE2" CXX_FLAGS Vc_ARCHITECTURE_FLAGS) endif(_found) endif(_found) endif(_found) endif(_found) endif(_found) endif(_found) endif(_found) endif(_found) else() # not MSVC and not ICC => GCC, Clang, Open64 foreach(_flag ${_march_flag_list}) AddCompilerFlag("-march=${_flag}" CXX_RESULT _good CXX_FLAGS Vc_ARCHITECTURE_FLAGS) if(_good) break() endif(_good) endforeach(_flag) foreach(_flag ${_enable_vector_unit_list}) AddCompilerFlag("-m${_flag}" CXX_RESULT _result) if(_result) set(_header FALSE) if(_flag STREQUAL "sse3") set(_header "pmmintrin.h") elseif(_flag STREQUAL "ssse3") set(_header "tmmintrin.h") elseif(_flag STREQUAL "sse4.1") set(_header "smmintrin.h") elseif(_flag STREQUAL "sse4.2") set(_header "smmintrin.h") elseif(_flag STREQUAL "sse4a") set(_header "ammintrin.h") elseif(_flag STREQUAL "avx") set(_header "immintrin.h") elseif(_flag STREQUAL "fma4") set(_header "x86intrin.h") elseif(_flag STREQUAL "xop") set(_header "x86intrin.h") endif() set(_resultVar "HAVE_${_header}") string(REPLACE "." 
"_" _resultVar "${_resultVar}") if(_header) CHECK_INCLUDE_FILE("${_header}" ${_resultVar} "-m${_flag}") if(NOT ${_resultVar}) set(_useVar "USE_${_flag}") string(TOUPPER "${_useVar}" _useVar) string(REPLACE "." "_" _useVar "${_useVar}") message(STATUS "disabling ${_useVar} because ${_header} is missing") set(${_useVar} FALSE) list(APPEND _disable_vector_unit_list "${_flag}") endif() endif() if(NOT _header OR ${_resultVar}) set(Vc_ARCHITECTURE_FLAGS "${Vc_ARCHITECTURE_FLAGS} -m${_flag}") endif() endif() endforeach(_flag) foreach(_flag ${_disable_vector_unit_list}) AddCompilerFlag("-mno-${_flag}" CXX_FLAGS Vc_ARCHITECTURE_FLAGS) endforeach(_flag) endif() endif() endmacro(OptimizeForArchitecture) Vc-0.7.4/cmake/UserWarning.cmake000066400000000000000000000003561233512346000164230ustar00rootroot00000000000000macro(UserWarning _msg) if("$ENV{DASHBOARD_TEST_FROM_CTEST}" STREQUAL "") # developer (non-dashboard) build message(WARNING "${_msg}") else() # dashboard build message(STATUS "${_msg}") endif() endmacro() Vc-0.7.4/cmake/VcConfig.cmake.in000066400000000000000000000012571233512346000162630ustar00rootroot00000000000000set(Vc_VERSION_MAJOR @Vc_VERSION_MAJOR@) set(Vc_VERSION_MINOR @Vc_VERSION_MINOR@) set(Vc_VERSION_PATCH @Vc_VERSION_PATCH@) set(Vc_VERSION @Vc_VERSION_MAJOR@.@Vc_VERSION_MINOR@.@Vc_VERSION_PATCH@) set(Vc_VERSION_STRING "@Vc_VERSION_MAJOR@.@Vc_VERSION_MINOR@.@Vc_VERSION_PATCH@") set(Vc_INSTALL_DIR "@CMAKE_INSTALL_PREFIX@") set(Vc_LIB_DIR "@CMAKE_INSTALL_PREFIX@/lib") set(Vc_INCLUDE_DIR "@CMAKE_INSTALL_PREFIX@/include") set(Vc_CMAKE_MODULES_DIR "@CMAKE_INSTALL_PREFIX@/lib/cmake/Vc") find_library(Vc_LIBRARIES Vc PATHS "${Vc_LIB_DIR}" NO_DEFAULT_PATH) include("${Vc_CMAKE_MODULES_DIR}/VcMacros.cmake") set(Vc_DEFINITIONS) vc_set_preferred_compiler_flags() Vc-0.7.4/cmake/VcConfigVersion.cmake.in000066400000000000000000000005401233512346000176230ustar00rootroot00000000000000set(PACKAGE_VERSION @Vc_VERSION_MAJOR@.@Vc_VERSION_MINOR@.@Vc_VERSION_PATCH@) if("${PACKAGE_VERSION}" VERSION_LESS "${PACKAGE_FIND_VERSION}") set(PACKAGE_VERSION_COMPATIBLE FALSE) else() set(PACKAGE_VERSION_COMPATIBLE TRUE) if("${PACKAGE_FIND_VERSION}" STREQUAL "${PACKAGE_VERSION}") set(PACKAGE_VERSION_EXACT TRUE) endif() endif() Vc-0.7.4/cmake/VcMacros.cmake000066400000000000000000000674241233512346000157050ustar00rootroot00000000000000# Macros for use with the Vc library. Vc can be found at http://code.compeng.uni-frankfurt.de/projects/vc # # The following macros are provided: # vc_determine_compiler # vc_set_preferred_compiler_flags # #============================================================================= # Copyright 2009-2013 Matthias Kretz # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # # * The names of Kitware, Inc., the Insight Consortium, or the names of # any consortium members, or of any contributors, may not be used to # endorse or promote products derived from this software without # specific prior written permission. 
# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #============================================================================= cmake_minimum_required(VERSION 2.8.3) get_filename_component(_currentDir "${CMAKE_CURRENT_LIST_FILE}" PATH) include ("${_currentDir}/UserWarning.cmake") include ("${_currentDir}/AddCompilerFlag.cmake") include ("${_currentDir}/OptimizeForArchitecture.cmake") macro(vc_determine_compiler) if(NOT DEFINED Vc_COMPILER_IS_INTEL) execute_process(COMMAND "${CMAKE_CXX_COMPILER}" "--version" OUTPUT_VARIABLE _cxx_compiler_version ERROR_VARIABLE _cxx_compiler_version) set(Vc_COMPILER_IS_INTEL false) set(Vc_COMPILER_IS_OPEN64 false) set(Vc_COMPILER_IS_CLANG false) set(Vc_COMPILER_IS_MSVC false) set(Vc_COMPILER_IS_GCC false) if(CMAKE_CXX_COMPILER MATCHES "/(icpc|icc)$") set(Vc_COMPILER_IS_INTEL true) exec_program(${CMAKE_C_COMPILER} ARGS -dumpversion OUTPUT_VARIABLE Vc_ICC_VERSION) message(STATUS "Detected Compiler: Intel ${Vc_ICC_VERSION}") elseif(CMAKE_CXX_COMPILER MATCHES "(opencc|openCC)$") set(Vc_COMPILER_IS_OPEN64 true) message(STATUS "Detected Compiler: Open64") elseif(CMAKE_CXX_COMPILER MATCHES "clang\\+\\+$" OR "${_cxx_compiler_version}" MATCHES "clang") set(Vc_COMPILER_IS_CLANG true) exec_program(${CMAKE_CXX_COMPILER} ARGS --version OUTPUT_VARIABLE Vc_CLANG_VERSION) string(REGEX MATCH "[0-9]+\\.[0-9]+(\\.[0-9]+)?" Vc_CLANG_VERSION "${Vc_CLANG_VERSION}") message(STATUS "Detected Compiler: Clang ${Vc_CLANG_VERSION}") elseif(MSVC) set(Vc_COMPILER_IS_MSVC true) message(STATUS "Detected Compiler: MSVC ${MSVC_VERSION}") elseif(CMAKE_COMPILER_IS_GNUCXX) set(Vc_COMPILER_IS_GCC true) exec_program(${CMAKE_C_COMPILER} ARGS -dumpversion OUTPUT_VARIABLE Vc_GCC_VERSION) message(STATUS "Detected Compiler: GCC ${Vc_GCC_VERSION}") # some distributions patch their GCC to return nothing or only major and minor version on -dumpversion. # In that case we must extract the version number from --version. if(NOT Vc_GCC_VERSION OR Vc_GCC_VERSION MATCHES "^[0-9]\\.[0-9]+$") exec_program(${CMAKE_C_COMPILER} ARGS --version OUTPUT_VARIABLE Vc_GCC_VERSION) string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" Vc_GCC_VERSION "${Vc_GCC_VERSION}") message(STATUS "GCC Version from --version: ${Vc_GCC_VERSION}") endif() # some distributions patch their GCC to be API incompatible to what the FSF released. 
In # those cases we require a macro to identify the distribution version find_program(_lsb_release lsb_release) mark_as_advanced(_lsb_release) if(_lsb_release) execute_process(COMMAND ${_lsb_release} -is OUTPUT_VARIABLE _distributor_id OUTPUT_STRIP_TRAILING_WHITESPACE) execute_process(COMMAND ${_lsb_release} -rs OUTPUT_VARIABLE _distributor_release OUTPUT_STRIP_TRAILING_WHITESPACE) string(TOUPPER "${_distributor_id}" _distributor_id) if(_distributor_id STREQUAL "UBUNTU") execute_process(COMMAND ${CMAKE_C_COMPILER} --version OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE _gcc_version) string(REGEX MATCH "\\(.* ${Vc_GCC_VERSION}-([0-9]+).*\\)" _tmp "${_gcc_version}") if(_tmp) set(_patch ${CMAKE_MATCH_1}) string(REGEX MATCH "^([0-9]+)\\.([0-9]+)$" _tmp "${_distributor_release}") execute_process(COMMAND printf 0x%x%02x%02x ${CMAKE_MATCH_1} ${CMAKE_MATCH_2} ${_patch} OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE _tmp) set(Vc_DEFINITIONS "${Vc_DEFINITIONS} -D__GNUC_UBUNTU_VERSION__=${_tmp}") endif() endif() endif() else() message(WARNING "Untested/-supported Compiler for use with Vc.\nPlease fill out the missing parts in the CMake scripts and submit a patch to http://code.compeng.uni-frankfurt.de/projects/vc") endif() endif() endmacro() macro(vc_set_gnu_buildtype_flags) set(CMAKE_CXX_FLAGS_DEBUG "-g3" CACHE STRING "Flags used by the compiler during debug builds." FORCE) set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG" CACHE STRING "Flags used by the compiler during release minsize builds." FORCE) set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG" CACHE STRING "Flags used by the compiler during release builds (/MD /Ob1 /Oi /Ot /Oy /Gs will produce slightly less optimized but smaller files)." FORCE) set(CMAKE_CXX_FLAGS_RELWITHDEBUG "-O3" CACHE STRING "Flags used by the compiler during release builds containing runtime checks." FORCE) set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBUG} -g" CACHE STRING "Flags used by the compiler during Release with Debug Info builds." FORCE) set(CMAKE_C_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}" CACHE STRING "Flags used by the compiler during debug builds." FORCE) set(CMAKE_C_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL}" CACHE STRING "Flags used by the compiler during release minsize builds." FORCE) set(CMAKE_C_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}" CACHE STRING "Flags used by the compiler during release builds (/MD /Ob1 /Oi /Ot /Oy /Gs will produce slightly less optimized but smaller files)." FORCE) set(CMAKE_C_FLAGS_RELWITHDEBUG "${CMAKE_CXX_FLAGS_RELWITHDEBUG}" CACHE STRING "Flags used by the compiler during release builds containing runtime checks." FORCE) set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}" CACHE STRING "Flags used by the compiler during Release with Debug Info builds." FORCE) if(CMAKE_BUILD_TYPE STREQUAL "Release" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebug") set(ENABLE_STRICT_ALIASING true CACHE BOOL "Enables strict aliasing rules for more aggressive optimizations") if(NOT ENABLE_STRICT_ALIASING) AddCompilerFlag(-fno-strict-aliasing) endif(NOT ENABLE_STRICT_ALIASING) endif() mark_as_advanced(CMAKE_CXX_FLAGS_RELWITHDEBUG CMAKE_C_FLAGS_RELWITHDEBUG) endmacro() macro(vc_add_compiler_flag VAR _flag) AddCompilerFlag("${_flag}" CXX_FLAGS ${VAR}) endmacro() macro(vc_check_assembler) if(APPLE) if(NOT Vc_COMPILER_IS_CLANG) message(WARNING "Apple does not provide an assembler with AVX support. AVX will not be available. 
Please use Clang if you want to use AVX.") set(Vc_DEFINITIONS "${Vc_DEFINITIONS} -DVC_NO_XGETBV") set(Vc_AVX_INTRINSICS_BROKEN true) endif() else(APPLE) if(${ARGC} EQUAL 1) set(_as "${ARGV1}") else() exec_program(${CMAKE_CXX_COMPILER} ARGS -print-prog-name=as OUTPUT_VARIABLE _as) mark_as_advanced(_as) endif() if(NOT _as) message(WARNING "Could not find 'as', the assembler used by GCC. Hoping everything will work out...") else() exec_program(${_as} ARGS --version OUTPUT_VARIABLE _as_version) string(REGEX REPLACE "\\([^\\)]*\\)" "" _as_version "${_as_version}") string(REGEX MATCH "[1-9]\\.[0-9]+(\\.[0-9]+)?" _as_version "${_as_version}") if(_as_version VERSION_LESS "2.18.93") UserWarning("Your binutils is too old (${_as_version}). Some optimizations of Vc will be disabled.") add_definitions(-DVC_NO_XGETBV) # old assembler doesn't know the xgetbv instruction set(Vc_AVX_INTRINSICS_BROKEN true) set(Vc_XOP_INTRINSICS_BROKEN true) set(Vc_FMA4_INTRINSICS_BROKEN true) elseif(_as_version VERSION_LESS "2.21.0") UserWarning("Your binutils is too old (${_as_version}) for XOP instructions. They will therefore not be provided in libVc.") set(Vc_XOP_INTRINSICS_BROKEN true) endif() endif() endif(APPLE) endmacro() macro(vc_check_fpmath) # if compiling for 32 bit x86 we need to use the -mfpmath=sse since the x87 is broken by design include (CheckCXXSourceRuns) check_cxx_source_runs("int main() { return sizeof(void*) != 8; }" Vc_VOID_PTR_IS_64BIT) if(NOT Vc_VOID_PTR_IS_64BIT) exec_program(${CMAKE_C_COMPILER} ARGS -dumpmachine OUTPUT_VARIABLE _gcc_machine) if(_gcc_machine MATCHES "[x34567]86" OR _gcc_machine STREQUAL "mingw32") vc_add_compiler_flag(Vc_DEFINITIONS "-mfpmath=sse") endif() endif() endmacro() macro(vc_set_preferred_compiler_flags) vc_determine_compiler() set(_add_warning_flags false) set(_add_buildtype_flags false) foreach(_arg ${ARGN}) if(_arg STREQUAL "WARNING_FLAGS") set(_add_warning_flags true) elseif(_arg STREQUAL "BUILDTYPE_FLAGS") set(_add_buildtype_flags true) endif() endforeach() set(Vc_SSE_INTRINSICS_BROKEN false) set(Vc_AVX_INTRINSICS_BROKEN false) set(Vc_XOP_INTRINSICS_BROKEN false) set(Vc_FMA4_INTRINSICS_BROKEN false) if(Vc_COMPILER_IS_OPEN64) ################################################################################################## # Open64 # ################################################################################################## if(_add_warning_flags) AddCompilerFlag("-W") AddCompilerFlag("-Wall") AddCompilerFlag("-Wimplicit") AddCompilerFlag("-Wswitch") AddCompilerFlag("-Wformat") AddCompilerFlag("-Wchar-subscripts") AddCompilerFlag("-Wparentheses") AddCompilerFlag("-Wmultichar") AddCompilerFlag("-Wtrigraphs") AddCompilerFlag("-Wpointer-arith") AddCompilerFlag("-Wcast-align") AddCompilerFlag("-Wreturn-type") AddCompilerFlag("-ansi") AddCompilerFlag("-pedantic") AddCompilerFlag("-Wno-long-long") AddCompilerFlag("-Wshadow") AddCompilerFlag("-Wold-style-cast") AddCompilerFlag("-Wno-variadic-macros") endif() if(_add_buildtype_flags) vc_set_gnu_buildtype_flags() endif() vc_check_assembler() # Open64 4.5.1 still doesn't ship immintrin.h set(Vc_AVX_INTRINSICS_BROKEN true) elseif(Vc_COMPILER_IS_GCC) ################################################################################################## # GCC # ################################################################################################## if(_add_warning_flags) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -W -Wall -Wswitch -Wformat -Wchar-subscripts -Wparentheses -Wmultichar -Wtrigraphs -Wpointer-arith -Wcast-align 
-Wreturn-type -pedantic -Wno-long-long -Wshadow") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -W -Wall -Wswitch -Wformat -Wchar-subscripts -Wparentheses -Wmultichar -Wtrigraphs -Wpointer-arith -Wcast-align -Wreturn-type -pedantic -Wno-long-long -Wshadow") if(NOT WIN32) # the -ansi flag makes MinGW unusable, so maybe it's better to omit it set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ansi") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ansi") endif() AddCompilerFlag("-Wundef") AddCompilerFlag("-Wold-style-cast") AddCompilerFlag("-Wno-variadic-macros") if(Vc_GCC_VERSION VERSION_GREATER "4.5.2" AND Vc_GCC_VERSION VERSION_LESS "4.6.4") # GCC gives bogus "array subscript is above array bounds" warnings in math.cpp AddCompilerFlag("-Wno-array-bounds") endif() endif() vc_add_compiler_flag(Vc_DEFINITIONS "-Wabi") vc_add_compiler_flag(Vc_DEFINITIONS "-fabi-version=0") # ABI version 4 is required to make __m128 and __m256 appear as different types. 0 should give us the latest version. if(_add_buildtype_flags) vc_set_gnu_buildtype_flags() endif() # GCC 4.5.[01] fail at inlining some functions, creating functions with a single instructions, # thus creating a large overhead. if(Vc_GCC_VERSION VERSION_LESS "4.5.2" AND NOT Vc_GCC_VERSION VERSION_LESS "4.5.0") UserWarning("GCC 4.5.0 and 4.5.1 have problems with inlining correctly. Setting early-inlining-insns=12 as workaround.") AddCompilerFlag("--param early-inlining-insns=12") endif() if(Vc_GCC_VERSION VERSION_LESS "4.1.99") UserWarning("Your GCC is ancient and crashes on some important optimizations. The full set of SSE2 intrinsics is not supported. Vc will fall back to the scalar implementation. Use of the may_alias and always_inline attributes will be disabled. In turn all code using Vc must be compiled with -fno-strict-aliasing") vc_add_compiler_flag(Vc_DEFINITIONS "-fno-strict-aliasing") set(Vc_AVX_INTRINSICS_BROKEN true) set(Vc_SSE_INTRINSICS_BROKEN true) elseif(Vc_GCC_VERSION VERSION_LESS "4.4.6") UserWarning("Your GCC is older than 4.4.6. This is known to cause problems/bugs. Please update to the latest GCC if you can.") set(Vc_AVX_INTRINSICS_BROKEN true) if(Vc_GCC_VERSION VERSION_LESS "4.3.0") UserWarning("Your GCC is older than 4.3.0. It is unable to handle the full set of SSE2 intrinsics. All SSE code will be disabled. Please update to the latest GCC if you can.") set(Vc_SSE_INTRINSICS_BROKEN true) endif() endif() if(Vc_GCC_VERSION VERSION_LESS 4.5.0) UserWarning("GCC 4.4.x shows false positives for -Wparentheses, thus we rather disable the warning.") string(REPLACE " -Wparentheses " " " CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") string(REPLACE " -Wparentheses " " " CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") set(Vc_DEFINITIONS "${Vc_DEFINITIONS} -Wno-parentheses") UserWarning("GCC 4.4.x shows false positives for -Wstrict-aliasing, thus we rather disable the warning. Use a newer GCC for better warnings.") AddCompilerFlag("-Wno-strict-aliasing") UserWarning("GCC 4.4.x shows false positives for -Wuninitialized, thus we rather disable the warning. Use a newer GCC for better warnings.") AddCompilerFlag("-Wno-uninitialized") elseif(Vc_GCC_VERSION VERSION_EQUAL 4.6.0) UserWarning("GCC 4.6.0 miscompiles AVX loads/stores, leading to spurious segfaults. 
Disabling AVX per default.") set(Vc_AVX_INTRINSICS_BROKEN true) elseif(Vc_GCC_VERSION VERSION_EQUAL 4.7.0) UserWarning("GCC 4.7.0 miscompiles at -O3, adding -fno-predictive-commoning to the compiler flags as workaround") set(Vc_DEFINITIONS "${Vc_DEFINITIONS} -fno-predictive-commoning") elseif(Vc_GCC_VERSION VERSION_EQUAL 4.8.0) UserWarning("GCC 4.8.0 miscompiles at -O3, adding -fno-tree-vectorize to the compiler flags as workaround") set(Vc_DEFINITIONS "${Vc_DEFINITIONS} -fno-tree-vectorize") endif() vc_check_fpmath() vc_check_assembler() elseif(Vc_COMPILER_IS_INTEL) ################################################################################################## # Intel Compiler # ################################################################################################## if(_add_buildtype_flags) set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -DNDEBUG -O3") set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O3") set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -DNDEBUG -O3") set(ALIAS_FLAGS "-no-ansi-alias") if(CMAKE_BUILD_TYPE STREQUAL "Release" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") # default ICC to -no-ansi-alias because otherwise tests/utils_sse fails. So far I suspect a miscompilation... set(ENABLE_STRICT_ALIASING false CACHE BOOL "Enables strict aliasing rules for more aggressive optimizations") if(ENABLE_STRICT_ALIASING) set(ALIAS_FLAGS "-ansi-alias") endif(ENABLE_STRICT_ALIASING) endif() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ALIAS_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ALIAS_FLAGS}") endif() vc_add_compiler_flag(Vc_DEFINITIONS "-diag-disable 913") # Disable warning #13211 "Immediate parameter to intrinsic call too large". (sse/vector.tcc rotated(int)) vc_add_compiler_flag(Vc_DEFINITIONS "-diag-disable 13211") if(NOT "$ENV{DASHBOARD_TEST_FROM_CTEST}" STREQUAL "") # disable warning #2928: the __GXX_EXPERIMENTAL_CXX0X__ macro is disabled when using GNU version 4.6 with the c++0x option # this warning just adds noise about problems in the compiler - but I'm only interested in seeing problems in Vc vc_add_compiler_flag(Vc_DEFINITIONS "-diag-disable 2928") endif() # Intel doesn't implement the XOP or FMA4 intrinsics set(Vc_XOP_INTRINSICS_BROKEN true) set(Vc_FMA4_INTRINSICS_BROKEN true) elseif(Vc_COMPILER_IS_MSVC) if(_add_warning_flags) AddCompilerFlag("/wd4800") # Disable warning "forcing value to bool" AddCompilerFlag("/wd4996") # Disable warning about strdup vs. _strdup AddCompilerFlag("/wd4244") # Disable warning "conversion from 'unsigned int' to 'float', possible loss of data" AddCompilerFlag("/wd4146") # Disable warning "unary minus operator applied to unsigned type, result still unsigned" AddCompilerFlag("/wd4227") # Disable warning "anachronism used : qualifiers on reference are ignored" (this is about 'restrict' usage on references, stupid MSVC) AddCompilerFlag("/wd4722") # Disable warning "destructor never returns, potential memory leak" (warns about ~_UnitTest_Global_Object which we don't care about) AddCompilerFlag("/wd4748") # Disable warning "/GS can not protect parameters and local variables from local buffer overrun because optimizations are disabled in function" (I don't get it) add_definitions(-D_CRT_SECURE_NO_WARNINGS) endif() # MSVC does not support inline assembly on 64 bit! :( # searching the help for xgetbv doesn't turn up anything. 
So just fall back to not supporting AVX on Windows :( # TODO: apparently MSVC 2010 SP1 added _xgetbv set(Vc_DEFINITIONS "${Vc_DEFINITIONS} -DVC_NO_XGETBV") # get rid of the min/max macros set(Vc_DEFINITIONS "${Vc_DEFINITIONS} -DNOMINMAX") # MSVC doesn't implement the XOP or FMA4 intrinsics set(Vc_XOP_INTRINSICS_BROKEN true) set(Vc_FMA4_INTRINSICS_BROKEN true) if(MSVC_VERSION LESS 1700) UserWarning("MSVC before 2012 has a broken std::vector::resize implementation. STL + Vc code will probably not compile.") endif() elseif(Vc_COMPILER_IS_CLANG) # for now I don't know of any arguments I want to pass. -march and stuff is tried by OptimizeForArchitecture... if(Vc_CLANG_VERSION VERSION_EQUAL "3.0") UserWarning("Clang 3.0 has serious issues to compile Vc code and will most likely crash when trying to do so.\nPlease update to a recent clang version.") elseif(Vc_CLANG_VERSION VERSION_EQUAL "3.2" AND NOT APPLE) # the LLVM assembler gets FMAs wrong (bug 15040) vc_add_compiler_flag(Vc_DEFINITIONS "-no-integrated-as") endif() # disable these warnings because clang shows them for function overloads that were discarded via SFINAE vc_add_compiler_flag(Vc_DEFINITIONS "-Wno-local-type-template-args") vc_add_compiler_flag(Vc_DEFINITIONS "-Wno-unnamed-type-template-args") if(NOT DEFINED Vc_INSIDE_ROOT) # ROOT has to set this up AddCompilerFlag(-stdlib=libc++) endif() endif() if(NOT Vc_COMPILER_IS_MSVC) vc_add_compiler_flag(Vc_DEFINITIONS "-ffp-contract=fast") endif() OptimizeForArchitecture() set(Vc_DEFINITIONS "${Vc_ARCHITECTURE_FLAGS} ${Vc_DEFINITIONS}") set(VC_IMPL "auto" CACHE STRING "Force the Vc implementation globally to the selected instruction set. \"auto\" lets Vc use the best available instructions.") if(NOT VC_IMPL STREQUAL "auto") set(Vc_DEFINITIONS "${Vc_DEFINITIONS} -DVC_IMPL=${VC_IMPL}") if(NOT VC_IMPL STREQUAL "Scalar") set(_use_var "USE_${VC_IMPL}") if(VC_IMPL STREQUAL "SSE") set(_use_var "USE_SSE2") endif() if(NOT ${_use_var}) message(WARNING "The selected value for VC_IMPL (${VC_IMPL}) will not work because the relevant instructions are not enabled via compiler flags.") endif() endif() endif() endmacro() # helper macro for vc_compile_for_all_implementations macro(_vc_compile_one_implementation _objs _impl) list(FIND _disabled_targets "${_impl}" _disabled_index) list(FIND _only_targets "${_impl}" _only_index) if(${_disabled_index} EQUAL -1 AND (NOT _only_targets OR ${_only_index} GREATER -1)) set(_extra_flags) set(_ok FALSE) foreach(_flag ${ARGN}) if(_flag STREQUAL "NO_FLAG") set(_ok TRUE) break() endif() string(REPLACE " " ";" _flag_list "${_flag}") foreach(_flag ${_flag_list}) AddCompilerFlag(${_flag} CXX_RESULT _ok) if(NOT _ok) break() endif() endforeach() if(_ok) set(_extra_flags ${_flag_list}) break() endif() endforeach() set(_outfile_flag -c -o) if(Vc_COMPILER_IS_MSVC) # MSVC for 64bit does not recognize /arch:SSE2 anymore. 
Therefore we set override _ok if _impl # says SSE if("${_impl}" MATCHES "SSE") set(_ok TRUE) endif() set(_outfile_flag /c /Fo) endif() if(_ok) get_filename_component(_out "${_vc_compile_src}" NAME_WE) get_filename_component(_ext "${_vc_compile_src}" EXT) if(Vc_COMPILER_IS_MSVC) set(_out "${_out}_${_impl}${_ext}.obj") else() set(_out "${_out}_${_impl}${_ext}.o") endif() add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${_out} COMMAND ${CMAKE_CXX_COMPILER} ${_flags} ${_extra_flags} -DVC_IMPL=${_impl} ${_outfile_flag}${_out} ${CMAKE_CURRENT_SOURCE_DIR}/${_vc_compile_src} MAIN_DEPENDENCY ${CMAKE_CURRENT_SOURCE_DIR}/${_vc_compile_src} IMPLICIT_DEPENDS CXX ${CMAKE_CURRENT_SOURCE_DIR}/${_vc_compile_src} COMMENT "Building CXX object ${_out}" WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" VERBATIM ) list(APPEND ${_objs} "${CMAKE_CURRENT_BINARY_DIR}/${_out}") endif() endif() endmacro() # Generate compile rules for the given C++ source file for all available implementations and return # the resulting list of object files in _obj # all remaining arguments are additional flags # Example: # vc_compile_for_all_implementations(_objs src/trigonometric.cpp FLAGS -DCOMPILE_BLAH EXCLUDE Scalar) # add_executable(executable main.cpp ${_objs}) macro(vc_compile_for_all_implementations _objs _src) set(${_objs}) # remove all -march, -msse, etc. flags from the flags we want to pass string(REPLACE "${Vc_ARCHITECTURE_FLAGS}" "" _flags "${Vc_DEFINITIONS}") string(REPLACE "-DVC_IMPL=[^ ]*" "" _flags "${_flags}") # capture the -march= switch as -mtune; if there is none skip it if(Vc_ARCHITECTURE_FLAGS MATCHES "-march=") string(REGEX REPLACE "^.*-march=([^ ]*).*$" "-mtune=\\1" _tmp "${Vc_ARCHITECTURE_FLAGS}") set(_flags "${_flags} ${_tmp}") endif() unset(_disabled_targets) unset(_only_targets) set(_state 0) foreach(_arg ${ARGN}) if(_arg STREQUAL "FLAGS") set(_state 1) elseif(_arg STREQUAL "EXCLUDE") set(_state 2) elseif(_arg STREQUAL "ONLY") set(_state 3) elseif(_state EQUAL 1) set(_flags "${_flags} ${_arg}") elseif(_state EQUAL 2) list(APPEND _disabled_targets "${_arg}") elseif(_state EQUAL 3) list(APPEND _only_targets "${_arg}") else() message(FATAL_ERROR "incorrect argument to vc_compile_for_all_implementations") endif() endforeach() # make a semicolon separated list of all flags string(TOUPPER "${CMAKE_BUILD_TYPE}" _tmp) set(_tmp "CMAKE_CXX_FLAGS_${_tmp}") string(REPLACE " " ";" _tmp "${CMAKE_CXX_FLAGS} ${${_tmp}} ${_flags}") set(_flags) foreach(item ${_tmp}) if(item MATCHES "^[^']*'[^']*$") if(_str) list(APPEND _flags "${_str} ${item}") unset(_str) else() set(_str "${item}") endif() else() list(APPEND _flags "${item}") endif() endforeach() get_directory_property(_inc INCLUDE_DIRECTORIES) foreach(_i ${_inc}) list(APPEND _flags "-I${_i}") endforeach() set(_vc_compile_src "${_src}") _vc_compile_one_implementation(${_objs} Scalar NO_FLAG) if(NOT Vc_SSE_INTRINSICS_BROKEN) _vc_compile_one_implementation(${_objs} SSE2 "-msse2" "-xSSE2" "/arch:SSE2") _vc_compile_one_implementation(${_objs} SSE3 "-msse3" "-xSSE3" "/arch:SSE2") _vc_compile_one_implementation(${_objs} SSSE3 "-mssse3" "-xSSSE3" "/arch:SSE2") _vc_compile_one_implementation(${_objs} SSE4_1 "-msse4.1" "-xSSE4.1" "/arch:SSE2") _vc_compile_one_implementation(${_objs} SSE4_2 "-msse4.2" "-xSSE4.2" "/arch:SSE2") _vc_compile_one_implementation(${_objs} SSE3+SSE4a "-msse4a") endif() if(NOT Vc_AVX_INTRINSICS_BROKEN) _vc_compile_one_implementation(${_objs} AVX "-mavx" "-xAVX" "/arch:AVX") if(NOT Vc_XOP_INTRINSICS_BROKEN) if(NOT Vc_FMA4_INTRINSICS_BROKEN) 
_vc_compile_one_implementation(${_objs} SSE+XOP+FMA4 "-mxop -mfma4" "" "") _vc_compile_one_implementation(${_objs} AVX+XOP+FMA4 "-mavx -mxop -mfma4" "" "") endif() _vc_compile_one_implementation(${_objs} SSE+XOP+FMA "-mxop -mfma" "" "") _vc_compile_one_implementation(${_objs} AVX+XOP+FMA "-mavx -mxop -mfma" "" "") endif() _vc_compile_one_implementation(${_objs} AVX+FMA "-mavx -mfma" "" "") endif() endmacro() Vc-0.7.4/cmake/msvc_version.c000066400000000000000000000000231233512346000160250ustar00rootroot00000000000000MSVC _MSC_FULL_VER Vc-0.7.4/common/000077500000000000000000000000001233512346000133615ustar00rootroot00000000000000Vc-0.7.4/common/aliasingentryhelper.h000066400000000000000000000162401233512346000176060ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2010-2011 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef VC_COMMON_ALIASINGENTRYHELPER_H #define VC_COMMON_ALIASINGENTRYHELPER_H #include "macros.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace Common { template class AliasingEntryHelper { private: typedef typename StorageType::EntryType T; #ifdef VC_ICC StorageType *const m_storage; const int m_index; public: Vc_ALWAYS_INLINE AliasingEntryHelper(StorageType *d, int index) : m_storage(d), m_index(index) {} Vc_ALWAYS_INLINE AliasingEntryHelper(const AliasingEntryHelper &rhs) : m_storage(rhs.m_storage), m_index(rhs.m_index) {} Vc_ALWAYS_INLINE AliasingEntryHelper &operator=(const AliasingEntryHelper &rhs) { m_storage->assign(m_index, rhs); return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator =(T x) { m_storage->assign(m_index, x); return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator +=(T x) { m_storage->assign(m_index, m_storage->m(m_index) + x); return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator -=(T x) { m_storage->assign(m_index, m_storage->m(m_index) - x); return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator /=(T x) { m_storage->assign(m_index, m_storage->m(m_index) / x); return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator *=(T x) { m_storage->assign(m_index, m_storage->m(m_index) * x); return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator |=(T x) { m_storage->assign(m_index, m_storage->m(m_index) | x); return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator &=(T x) { m_storage->assign(m_index, m_storage->m(m_index) & x); return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator ^=(T x) { m_storage->assign(m_index, m_storage->m(m_index) ^ x); return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator %=(T x) { m_storage->assign(m_index, m_storage->m(m_index) % x); return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator<<=(T x) { m_storage->assign(m_index, m_storage->m(m_index)<< x); return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator>>=(T x) { m_storage->assign(m_index, m_storage->m(m_index)>> x); return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator++() { m_storage->assign(m_index, 
m_storage->m(m_index) + T(1)); return *this; } Vc_ALWAYS_INLINE T operator++(int) { T r = m_storage->m(m_index); m_storage->assign(m_index, m_storage->m(m_index) + T(1)); return r; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator--() { m_storage->assign(m_index, m_storage->m(m_index) - T(1)); return *this; } Vc_ALWAYS_INLINE T operator--(int) { T r = m_storage->m(m_index); m_storage->assign(m_index, m_storage->m(m_index) - T(1)); return r; } #define m_data m_storage->read(m_index) #else typedef T A Vc_MAY_ALIAS; A &m_data; public: template Vc_ALWAYS_INLINE AliasingEntryHelper(T2 &d) : m_data(reinterpret_cast(d)) {} Vc_ALWAYS_INLINE AliasingEntryHelper(A &d) : m_data(d) {} Vc_ALWAYS_INLINE AliasingEntryHelper &operator=(const AliasingEntryHelper &rhs) { m_data = rhs.m_data; return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator =(T x) { m_data = x; return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator+=(T x) { m_data += x; return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator-=(T x) { m_data -= x; return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator/=(T x) { m_data /= x; return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator*=(T x) { m_data *= x; return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator|=(T x) { m_data |= x; return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator&=(T x) { m_data &= x; return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator^=(T x) { m_data ^= x; return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator%=(T x) { m_data %= x; return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator<<=(T x) { m_data <<= x; return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator>>=(T x) { m_data >>= x; return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator++() { ++m_data; return *this; } Vc_ALWAYS_INLINE T operator++(int) { T r = m_data; ++m_data; return r; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator--() { --m_data; return *this; } Vc_ALWAYS_INLINE T operator--(int) { T r = m_data; --m_data; return r; } #endif Vc_ALWAYS_INLINE Vc_PURE operator const T() const { return m_data; } Vc_ALWAYS_INLINE Vc_PURE bool operator==(T x) const { return static_cast(m_data) == x; } Vc_ALWAYS_INLINE Vc_PURE bool operator!=(T x) const { return static_cast(m_data) != x; } Vc_ALWAYS_INLINE Vc_PURE bool operator<=(T x) const { return static_cast(m_data) <= x; } Vc_ALWAYS_INLINE Vc_PURE bool operator>=(T x) const { return static_cast(m_data) >= x; } Vc_ALWAYS_INLINE Vc_PURE bool operator< (T x) const { return static_cast(m_data) < x; } Vc_ALWAYS_INLINE Vc_PURE bool operator> (T x) const { return static_cast(m_data) > x; } Vc_ALWAYS_INLINE Vc_PURE T operator-() const { return -static_cast(m_data); } Vc_ALWAYS_INLINE Vc_PURE T operator~() const { return ~static_cast(m_data); } Vc_ALWAYS_INLINE Vc_PURE T operator+(T x) const { return static_cast(m_data) + x; } Vc_ALWAYS_INLINE Vc_PURE T operator-(T x) const { return static_cast(m_data) - x; } Vc_ALWAYS_INLINE Vc_PURE T operator/(T x) const { return static_cast(m_data) / x; } Vc_ALWAYS_INLINE Vc_PURE T operator*(T x) const { return static_cast(m_data) * x; } Vc_ALWAYS_INLINE Vc_PURE T operator|(T x) const { return static_cast(m_data) | x; } Vc_ALWAYS_INLINE Vc_PURE T operator&(T x) const { return static_cast(m_data) & x; } Vc_ALWAYS_INLINE Vc_PURE T operator^(T x) const { return static_cast(m_data) ^ x; } Vc_ALWAYS_INLINE Vc_PURE T operator%(T x) const { return static_cast(m_data) % x; } //T operator<<(T x) const { return static_cast(m_data) << x; } //T 
operator>>(T x) const { return static_cast(m_data) >> x; } #ifdef m_data #undef m_data #endif }; } // namespace Common } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "undomacros.h" #endif // VC_COMMON_ALIASINGENTRYHELPER_H Vc-0.7.4/common/bitscanintrinsics.h000066400000000000000000000037571233512346000172770ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2011-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef VC_COMMON_BITSCANINTRINSICS_H #define VC_COMMON_BITSCANINTRINSICS_H #if defined(VC_GCC) || defined(VC_CLANG) # if VC_GCC >= 0x40500 // GCC 4.5.0 introduced _bit_scan_forward / _bit_scan_reverse # include # else // GCC <= 4.4 and clang have x86intrin.h, but not the required functions # define _bit_scan_forward(x) __builtin_ctz(x) #include "macros.h" static Vc_ALWAYS_INLINE Vc_CONST int _Vc_bit_scan_reverse_asm(unsigned int x) { int r; __asm__("bsr %1,%0" : "=r"(r) : "X"(x)); return r; } #include "undomacros.h" # define _bit_scan_reverse(x) _Vc_bit_scan_reverse_asm(x) # endif #elif defined(VC_ICC) // for all I know ICC supports the _bit_scan_* intrinsics #elif defined(VC_OPEN64) // TODO #elif defined(VC_MSVC) #include "windows_fix_intrin.h" #pragma intrinsic(_BitScanForward) #pragma intrinsic(_BitScanReverse) static inline __forceinline unsigned long _bit_scan_forward(unsigned long x) { unsigned long index; _BitScanForward(&index, x); return index; } static inline __forceinline unsigned long _bit_scan_reverse(unsigned long x) { unsigned long index; _BitScanReverse(&index, x); return index; } #else // just assume the compiler can do it #endif #endif // VC_COMMON_BITSCANINTRINSICS_H Vc-0.7.4/common/deinterleave.h000066400000000000000000000057331233512346000162110ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2010-2011 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef VC_COMMON_DEINTERLEAVE_H #define VC_COMMON_DEINTERLEAVE_H #include "macros.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { /** * \ingroup Vectors * * Loads two vectors of values from an interleaved array. * * \param a, b The vectors to load the values from memory into. * \param memory The memory location where to read the next 2 * V::Size values from * \param align Either pass Vc::Aligned or Vc::Unaligned. It defaults to Vc::Aligned if nothing is * specified. 
* * If you store your data as * \code * struct { float x, y; } m[1000]; * \endcode * then the deinterleave function allows you to read \p Size concurrent x and y values like this: * \code * Vc::float_v x, y; * Vc::deinterleave(&x, &y, &m[10], Vc::Unaligned); * \endcode * This code will load m[10], m[12], m[14], ... into \p x and m[11], m[13], m[15], ... into \p y. * * The deinterleave function supports the following type combinations: \verbatim V \ M | float | double | ushort | short | uint | int =========|=======|========|========|=======|======|===== float_v | X | | X | X | | ---------|-------|--------|--------|-------|------|----- sfloat_v | X | | X | X | | ---------|-------|--------|--------|-------|------|----- double_v | | X | | | | ---------|-------|--------|--------|-------|------|----- int_v | | | | X | | X ---------|-------|--------|--------|-------|------|----- uint_v | | | X | | X | ---------|-------|--------|--------|-------|------|----- short_v | | | | X | | ---------|-------|--------|--------|-------|------|----- ushort_v | | | X | | | \endverbatim */ template Vc_ALWAYS_INLINE void deinterleave(V *a, V *b, const M *memory, A align) { Internal::Helper::deinterleave(*a, *b, memory, align); } // documented as default for align above template Vc_ALWAYS_INLINE void deinterleave(V *a, V *b, const M *memory) { Internal::Helper::deinterleave(*a, *b, memory, Aligned); } } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "undomacros.h" #endif // VC_COMMON_DEINTERLEAVE_H Vc-0.7.4/common/exponential.h000066400000000000000000000120521233512346000160600ustar00rootroot00000000000000#ifndef COMMON_EXPONENTIAL_H #define COMMON_EXPONENTIAL_H /* This file is part of the Vc library. {{{ Copyright (C) 2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . ------------------------------------------------------------------- The exp implementation is derived from Cephes, which carries the following Copyright notice: Cephes Math Library Release 2.2: June, 1992 Copyright 1984, 1987, 1989 by Stephen L. 
Moshier Direct inquiries to 30 Frost Street, Cambridge, MA 02140 }}}*/ #ifndef VC_COMMON_EXPONENTIAL_H #define VC_COMMON_EXPONENTIAL_H #include "macros.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace Common { using Vc::VC__USE_NAMESPACE::c_log; using Vc::VC__USE_NAMESPACE::Vector; using Vc::VC__USE_NAMESPACE::floor; using Vc::VC__USE_NAMESPACE::ldexp; static const float log2_e = 1.44269504088896341f; static const float MAXLOGF = 88.72283905206835f; static const float MINLOGF = -103.278929903431851103f; /* log(2^-149) */ static const float MAXNUMF = 3.4028234663852885981170418348451692544e38f; template struct TypenameForLdexp { typedef Vector Type; }; template<> struct TypenameForLdexp { typedef Vector Type; }; template static inline Vector exp(VC_ALIGNED_PARAMETER(Vector) _x) { typedef Vector V; typedef typename V::Mask M; typedef typename TypenameForLdexp::Type I; typedef Const C; V x(_x); const M overflow = x > MAXLOGF; const M underflow = x < MINLOGF; // log₂(eˣ) = x * log₂(e) * log₂(2) // = log₂(2^(x * log₂(e))) // => eˣ = 2^(x * log₂(e)) // => n = ⌊x * log₂(e) + ½⌋ // => y = x - n * ln(2) | recall that: ln(2) * log₂(e) == 1 // <=> eˣ = 2ⁿ * eʸ V z = floor(C::log2_e() * x + 0.5f); I n = static_cast(z); x -= z * C::ln2_large(); x -= z * C::ln2_small(); /* Theoretical peak relative error in [-0.5, +0.5] is 4.2e-9. */ z = ((((( 1.9875691500E-4f * x + 1.3981999507E-3f) * x + 8.3334519073E-3f) * x + 4.1665795894E-2f) * x + 1.6666665459E-1f) * x + 5.0000001201E-1f) * (x * x) + x + 1.0f; x = ldexp(z, n); // == z * 2ⁿ x(overflow) = std::numeric_limits::infinity(); x.setZero(underflow); return x; } static inline Vector exp(Vector::AsArg _x) { Vector x = _x; typedef Vector V; typedef V::Mask M; typedef Const C; const M overflow = x > Vc_buildDouble( 1, 0x0006232bdd7abcd2ull, 9); // max log const M underflow = x < Vc_buildDouble(-1, 0x0006232bdd7abcd2ull, 9); // min log V px = floor(C::log2_e() * x + 0.5); #ifdef VC_IMPL_SSE Vector n(px); n.data() = Mem::permute(n.data()); #elif defined(VC_IMPL_AVX) __m128i tmp = _mm256_cvttpd_epi32(px.data()); Vector n = AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp)); #endif x -= px * C::ln2_large(); //Vc_buildDouble(1, 0x00062e4000000000ull, -1); // ln2 x -= px * C::ln2_small(); //Vc_buildDouble(1, 0x0007f7d1cf79abcaull, -20); // ln2 const double P[] = { Vc_buildDouble(1, 0x000089cdd5e44be8ull, -13), Vc_buildDouble(1, 0x000f06d10cca2c7eull, -6), Vc_buildDouble(1, 0x0000000000000000ull, 0) }; const double Q[] = { Vc_buildDouble(1, 0x00092eb6bc365fa0ull, -19), Vc_buildDouble(1, 0x0004ae39b508b6c0ull, -9), Vc_buildDouble(1, 0x000d17099887e074ull, -3), Vc_buildDouble(1, 0x0000000000000000ull, 1) }; const V x2 = x * x; px = x * ((P[0] * x2 + P[1]) * x2 + P[2]); x = px / ((((Q[0] * x2 + Q[1]) * x2 + Q[2]) * x2 + Q[3]) - px); x = V::One() + 2.0 * x; x = ldexp(x, n); // == x * 2ⁿ x(overflow) = std::numeric_limits::infinity(); x.setZero(underflow); return x; } } // namespace Common namespace VC__USE_NAMESPACE { using Vc::Common::exp; } // namespace VC__USE_NAMESPACE } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "undomacros.h" #endif // VC_COMMON_EXPONENTIAL_H #endif // COMMON_EXPONENTIAL_H Vc-0.7.4/common/fix_clang_emmintrin.h000066400000000000000000000057361233512346000175610ustar00rootroot00000000000000/*{{{ Copyright (C) 2013 Matthias Kretz Permission to use, copy, modify, and distribute this software and its documentation for any purpose and without fee is hereby granted, provided that the above copyright notice appear in all 
copies and that both that the copyright notice and this permission notice and warranty disclaimer appear in supporting documentation, and that the name of the author not be used in advertising or publicity pertaining to distribution of the software without specific, written prior permission. The author disclaim all warranties with regard to this software, including all implied warranties of merchantability and fitness. In no event shall the author be liable for any special, indirect or consequential damages or any damages whatsoever resulting from loss of use, data or profits, whether in an action of contract, negligence or other tortious action, arising out of or in connection with the use or performance of this software. }}}*/ #ifndef VC_COMMON_FIX_CLANG_EMMINTRIN_H #define VC_COMMON_FIX_CLANG_EMMINTRIN_H #include #ifdef VC_CLANG #ifdef _mm_slli_si128 #undef _mm_slli_si128 #define _mm_slli_si128(a, count) __extension__ ({ \ (__m128i)__builtin_ia32_pslldqi128((__m128i)(a), (count)*8); }) #endif #ifdef _mm_srli_si128 #undef _mm_srli_si128 #define _mm_srli_si128(a, count) __extension__ ({ \ (__m128i)__builtin_ia32_psrldqi128((__m128i)(a), (count)*8); }) #endif #ifdef _mm_shuffle_epi32 #undef _mm_shuffle_epi32 #define _mm_shuffle_epi32(a, imm) __extension__ ({ \ (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), (__v4si) _mm_set1_epi32(0), \ (imm) & 0x3, ((imm) & 0xc) >> 2, \ ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); }) #endif #ifdef _mm_shufflelo_epi16 #undef _mm_shufflelo_epi16 #define _mm_shufflelo_epi16(a, imm) __extension__ ({ \ (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), (__v8hi) _mm_set1_epi16(0), \ (imm) & 0x3, ((imm) & 0xc) >> 2, \ ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \ 4, 5, 6, 7); }) #endif #ifdef _mm_shufflehi_epi16 #undef _mm_shufflehi_epi16 #define _mm_shufflehi_epi16(a, imm) __extension__ ({ \ (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), (__v8hi) _mm_set1_epi16(0), \ 0, 1, 2, 3, \ 4 + (((imm) & 0x03) >> 0), \ 4 + (((imm) & 0x0c) >> 2), \ 4 + (((imm) & 0x30) >> 4), \ 4 + (((imm) & 0xc0) >> 6)); }) #endif #ifdef _mm_shuffle_pd #undef _mm_shuffle_pd #define _mm_shuffle_pd(a, b, i) __extension__ ({ \ __builtin_shufflevector((__m128d)(a), (__m128d)(b), (i) & 1, (((i) & 2) >> 1) + 2); }) #endif #endif // VC_CLANG #endif // VC_COMMON_FIX_CLANG_EMMINTRIN_H Vc-0.7.4/common/iif.h000066400000000000000000000037371233512346000143130ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright (C) 2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . }}}*/ #ifndef VC_COMMON_IIF_H #define VC_COMMON_IIF_H #include "macros.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { /** * Function to mimic the ternary operator '?:'. * * \param condition Determines which values are returned. This is analog to the first argument to * the ternary operator. * \param trueValue The values to return where \p condition is \c true. * \param falseValue The values to return where \p condition is \c false. 
* \return A combination of entries from \p trueValue and \p falseValue, according to \p condition. * * So instead of the scalar variant * \code * float x = a > 1.f ? b : b + c; * \endcode * you'd write * \code * float_v x = Vc::iif (a > 1.f, b, b + c); * \endcode */ #ifndef VC_MSVC template static Vc_ALWAYS_INLINE Vector iif (typename Vector::Mask condition, Vector trueValue, Vector falseValue) { #else template static Vc_ALWAYS_INLINE Vector iif (const typename Vector::Mask &condition, const Vector &trueValue, const Vector &_falseValue) { Vector falseValue(_falseValue); #endif falseValue(condition) = trueValue; return falseValue; } } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "undomacros.h" #endif // VC_COMMON_IIF_H Vc-0.7.4/common/interleavedmemory.h000066400000000000000000000224551233512346000172750ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright (C) 2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . }}}*/ #ifndef VC_COMMON_INTERLEAVEDMEMORY_H #define VC_COMMON_INTERLEAVEDMEMORY_H #include "macros.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace Common { namespace Internal { template struct CopyConst { typedef B Type; }; template struct CopyConst { typedef const B Type; }; template struct EnableInterleaves { typedef R Type; }; template struct EnableInterleaves; } // namespace Internal /** * \internal */ template struct InterleavedMemoryAccessBase { typedef typename V::EntryType T; typedef typename V::IndexType I; typedef typename V::AsArg VArg; typedef T Ta Vc_MAY_ALIAS; const I m_indexes; Ta *const m_data; Vc_ALWAYS_INLINE InterleavedMemoryAccessBase(typename I::AsArg indexes, Ta *data) : m_indexes(indexes), m_data(data) { } // implementations of the following are in {scalar,sse,avx}/interleavedmemory.tcc void deinterleave(V &v0, V &v1) const; void deinterleave(V &v0, V &v1, V &v2) const; void deinterleave(V &v0, V &v1, V &v2, V &v3) const; void deinterleave(V &v0, V &v1, V &v2, V &v3, V &v4) const; void deinterleave(V &v0, V &v1, V &v2, V &v3, V &v4, V &v5) const; void deinterleave(V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6) const; void deinterleave(V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7) const; void interleave(VArg v0, VArg v1); void interleave(VArg v0, VArg v1, VArg v2); void interleave(VArg v0, VArg v1, VArg v2, VArg v3); void interleave(VArg v0, VArg v1, VArg v2, VArg v3, VArg v4); void interleave(VArg v0, VArg v1, VArg v2, VArg v3, VArg v4, VArg v5); void interleave(VArg v0, VArg v1, VArg v2, VArg v3, VArg v4, VArg v5, VArg v6); void interleave(VArg v0, VArg v1, VArg v2, VArg v3, VArg v4, VArg v5, VArg v6, VArg v7); }; /** * \internal */ // delay execution of the deinterleaving gather until operator= template struct InterleavedMemoryReadAccess : public InterleavedMemoryAccessBase { typedef InterleavedMemoryAccessBase Base; typedef typename Base::Ta Ta; typedef typename Base::I I; Vc_ALWAYS_INLINE InterleavedMemoryReadAccess(const Ta *data, typename I::AsArg indexes) : Base(indexes * 
I(StructSize), const_cast(data)) // this needs to be refactored to properly keep the constness { } }; /** * \internal */ template struct InterleavedMemoryAccess : public InterleavedMemoryReadAccess { typedef InterleavedMemoryAccessBase Base; typedef typename Base::Ta Ta; typedef typename Base::I I; Vc_ALWAYS_INLINE InterleavedMemoryAccess(Ta *data, typename I::AsArg indexes) : InterleavedMemoryReadAccess(data, indexes) { } #define _VC_SCATTER_ASSIGNMENT(LENGTH, parameters) \ Vc_ALWAYS_INLINE void operator=(const VectorTuple &rhs) \ { \ VC_STATIC_ASSERT(LENGTH <= StructSize, You_are_trying_to_scatter_more_data_into_the_struct_than_it_has); \ this->interleave parameters ; \ } \ Vc_ALWAYS_INLINE void operator=(const VectorTuple &rhs) \ { \ VC_STATIC_ASSERT(LENGTH <= StructSize, You_are_trying_to_scatter_more_data_into_the_struct_than_it_has); \ checkIndexesUnique(); \ this->interleave parameters ; \ } _VC_SCATTER_ASSIGNMENT(2, (rhs.l, rhs.r)) _VC_SCATTER_ASSIGNMENT(3, (rhs.l.l, rhs.l.r, rhs.r)); _VC_SCATTER_ASSIGNMENT(4, (rhs.l.l.l, rhs.l.l.r, rhs.l.r, rhs.r)); _VC_SCATTER_ASSIGNMENT(5, (rhs.l.l.l.l, rhs.l.l.l.r, rhs.l.l.r, rhs.l.r, rhs.r)); _VC_SCATTER_ASSIGNMENT(6, (rhs.l.l.l.l.l, rhs.l.l.l.l.r, rhs.l.l.l.r, rhs.l.l.r, rhs.l.r, rhs.r)); _VC_SCATTER_ASSIGNMENT(7, (rhs.l.l.l.l.l.l, rhs.l.l.l.l.l.r, rhs.l.l.l.l.r, rhs.l.l.l.r, rhs.l.l.r, rhs.l.r, rhs.r)); _VC_SCATTER_ASSIGNMENT(8, (rhs.l.l.l.l.l.l.l, rhs.l.l.l.l.l.l.r, rhs.l.l.l.l.l.r, rhs.l.l.l.l.r, rhs.l.l.l.r, rhs.l.l.r, rhs.l.r, rhs.r)); #undef _VC_SCATTER_ASSIGNMENT private: #ifdef NDEBUG Vc_ALWAYS_INLINE void checkIndexesUnique() const {} #else void checkIndexesUnique() const { const I test = Base::m_indexes.sorted(); VC_ASSERT(I::Size == 1 || (test == test.rotated(1)).isEmpty()) } #endif }; #ifdef DOXYGEN } // namespace Common // in doxygen InterleavedMemoryWrapper should appear in the Vc namespace (see the using statement // below) #endif /** * Wraps a pointer to memory with convenience functions to access it via vectors. * * \param S The type of the struct. * \param V The type of the vector to be returned when read. This should reflect the type of the * members inside the struct. * * \see operator[] * \ingroup Utilities * \headerfile interleavedmemory.h */ template class InterleavedMemoryWrapper { typedef typename V::EntryType T; typedef typename V::IndexType I; typedef typename V::AsArg VArg; typedef typename I::AsArg IndexType; typedef InterleavedMemoryAccess Access; typedef InterleavedMemoryReadAccess ReadAccess; typedef typename Internal::CopyConst::Type Ta Vc_MAY_ALIAS; Ta *const m_data; VC_STATIC_ASSERT((sizeof(S) / sizeof(T)) * sizeof(T) == sizeof(S), InterleavedMemoryAccess_does_not_support_packed_structs); public: /** * Constructs the wrapper object. * * \param s A pointer to a C-array. */ Vc_ALWAYS_INLINE InterleavedMemoryWrapper(S *s) : m_data(reinterpret_cast(s)) { } /** * Interleaved scatter/gather access. * * Assuming you have a struct of floats and a vector of \p indexes into the array, this function * can be used to access the struct entries as vectors using the minimal number of store or load * instructions. * * \param indexes Vector of indexes that determine the gather locations. * * \return A special (magic) object that executes the loads and deinterleave on assignment to a * vector tuple. 
* * Example: * \code * struct Foo { * float x, y, z; * }; * * void fillWithBar(Foo *_data, uint_v indexes) * { * Vc::InterleavedMemoryWrapper data(_data); * const float_v x = bar(1); * const float_v y = bar(2); * const float_v z = bar(3); * data[indexes] = (x, y, z); * // it's also possible to just store a subset at the front of the struct: * data[indexes] = (x, y); * // if you want to store a single entry, use scatter: * z.scatter(_data, &Foo::x, indexes); * } * * float_v normalizeStuff(Foo *_data, uint_v indexes) * { * Vc::InterleavedMemoryWrapper data(_data); * float_v x, y, z; * (x, y, z) = data[indexes]; * // it is also possible to just load a subset from the front of the struct: * // (x, y) = data[indexes]; * return Vc::sqrt(x * x + y * y + z * z); * } * \endcode * * You may think of the gather operation (or scatter as the inverse) like this: \verbatim Memory: {x0 y0 z0 x1 y1 z1 x2 y2 z2 x3 y3 z3 x4 y4 z4 x5 y5 z5 x6 y6 z6 x7 y7 z7 x8 y8 z8} indexes: [5, 0, 1, 7] Result in (x, y, z): ({x5 x0 x1 x7}, {y5 y0 y1 y7}, {z5 z0 z1 z7}) \endverbatim * * \warning If \p indexes contains non-unique entries on scatter, the result is undefined. If * \c NDEBUG is not defined the implementation will assert that the \p indexes entries are unique. */ #ifdef DOXYGEN Vc_ALWAYS_INLINE Access operator[](IndexType indexes) #else // need to SFINAE disable this for objects that wrap constant data template Vc_ALWAYS_INLINE typename Internal::EnableInterleaves::Type operator[]( VC_ALIGNED_PARAMETER(U) indexes) #endif { return Access(m_data, indexes); } /// const overload (gathers only) of the above function Vc_ALWAYS_INLINE ReadAccess operator[](VC_ALIGNED_PARAMETER(IndexType) indexes) const { return ReadAccess(m_data, indexes); } /// alias of the above function Vc_ALWAYS_INLINE ReadAccess gather(VC_ALIGNED_PARAMETER(IndexType) indexes) const { return operator[](indexes); } //Vc_ALWAYS_INLINE Access scatter(I indexes, VArg v0, VArg v1); }; #ifndef DOXYGEN } // namespace Common using Common::InterleavedMemoryWrapper; #endif } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "undomacros.h" #endif // VC_COMMON_INTERLEAVEDMEMORY_H Vc-0.7.4/common/logarithm.h000066400000000000000000000245321233512346000155260ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ /* The log implementations are based on code from Julien Pommier which carries the following copyright information: */ /* Inspired by Intel Approximate Math library, and based on the corresponding algorithms of the cephes math library */ /* Copyright (C) 2007 Julien Pommier This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software. Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions: 1. 
The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 3. This notice may not be removed or altered from any source distribution. (this is the zlib license) */ #ifndef VC_COMMON_LOGARITHM_H #define VC_COMMON_LOGARITHM_H #include "macros.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace Common { #ifdef VC__USE_NAMESPACE using Vc::VC__USE_NAMESPACE::Const; using Vc::VC__USE_NAMESPACE::Vector; namespace Internal { using namespace Vc::VC__USE_NAMESPACE::Internal; } // namespace Internal #endif enum LogarithmBase { BaseE, Base10, Base2 }; template struct LogImpl { template static Vc_ALWAYS_INLINE void log_series(Vector &VC_RESTRICT x, typename Vector::AsArg exponent) { typedef Vector V; typedef Const C; // Taylor series around x = 2^exponent // f(x) = ln(x) → exponent * ln(2) → C::ln2_small + C::ln2_large // f'(x) = x⁻¹ → x → 1 // f''(x) = - x⁻² → -x² / 2 → C::_1_2() // = 2!x⁻³ → x³ / 3 → C::P(8) // = -3!x⁻⁴ → -x⁴ / 4 → C::P(7) // = 4!x⁻⁵ → x⁵ / 5 → C::P(6) // ... // The high order coefficients are adjusted to reduce the error that occurs from ommission // of higher order terms. // P(0) is the smallest term and |x| < 1 ⇒ |xⁿ| > |xⁿ⁺¹| // The order of additions must go from smallest to largest terms const V x2 = x * x; // 0 → 4 #ifdef VC_LOG_ILP V y2 = (C::P(6) * /*4 → 8*/ x2 + /* 8 → 11*/ C::P(7) * /*1 → 5*/ x) + /*11 → 14*/ C::P(8); V y0 = (C::P(0) * /*5 → 9*/ x2 + /* 9 → 12*/ C::P(1) * /*2 → 6*/ x) + /*12 → 15*/ C::P(2); V y1 = (C::P(3) * /*6 → 10*/ x2 + /*10 → 13*/ C::P(4) * /*3 → 7*/ x) + /*13 → 16*/ C::P(5); const V x3 = x2 * x; // 7 → 11 const V x6 = x3 * x3; // 11 → 15 const V x9 = x6 * x3; // 15 → 19 V y = (y0 * /*19 → 23*/ x9 + /*23 → 26*/ y1 * /*16 → 20*/ x6) + /*26 → 29*/ y2 * /*14 → 18*/ x3; #elif defined VC_LOG_ILP2 /* * name start done * movaps %xmm0, %xmm1 ; x 0 1 * movaps %xmm0, %xmm2 ; x 0 1 * mulps %xmm1, %xmm1 ; x2 1 5 *xmm1 * movaps , %xmm15 ; y8 1 2 * mulps %xmm1, %xmm2 ; x3 5 9 *xmm2 * movaps %xmm1, %xmm3 ; x2 5 6 * movaps %xmm1, %xmm4 ; x2 5 6 * mulps %xmm3, %xmm3 ; x4 6 10 *xmm3 * movaps %xmm2, %xmm5 ; x3 9 10 * movaps %xmm2, %xmm6 ; x3 9 10 * mulps %xmm2, %xmm4 ; x5 9 13 *xmm4 * movaps %xmm3, %xmm7 ; x4 10 11 * movaps %xmm3, %xmm8 ; x4 10 11 * movaps %xmm3, %xmm9 ; x4 10 11 * mulps %xmm5, %xmm5 ; x6 10 14 *xmm5 * mulps %xmm3, %xmm6 ; x7 11 15 *xmm6 * mulps %xmm7, %xmm7 ; x8 12 16 *xmm7 * movaps %xmm4, %xmm10 ; x5 13 14 * mulps %xmm4, %xmm8 ; x9 13 17 *xmm8 * mulps %xmm5, %xmm10 ; x11 14 18 *xmm10 * mulps %xmm5, %xmm9 ; x10 15 19 *xmm9 * mulps , %xmm10 ; y0 18 22 * mulps , %xmm9 ; y1 19 23 * mulps , %xmm8 ; y2 20 24 * mulps , %xmm7 ; y3 21 25 * addps %xmm10, %xmm9 ; y 23 26 * addps %xmm9, %xmm8 ; y 26 29 * addps %xmm8, %xmm7 ; y 29 32 */ const V x3 = x2 * x; // 4 → 8 const V x4 = x2 * x2; // 5 → 9 const V x5 = x2 * x3; // 8 → 12 const V x6 = x3 * x3; // 9 → 13 const V x7 = x4 * x3; // const V x8 = x4 * x4; const V x9 = x5 * x4; const V x10 = x5 * x5; const V x11 = x5 * x6; // 13 → 17 V y = C::P(0) * x11 + C::P(1) * x10 + C::P(2) * x9 + C::P(3) * x8 + C::P(4) * x7 + C::P(5) * x6 + C::P(6) * x5 + C::P(7) * x4 + C::P(8) * x3; #else V y = C::P(0); unrolled_loop16(i, 1, 9, y = y * x + C::P(i); ); y *= x * x2; #endif switch (Base) { case BaseE: // ln(2) is split in two 
parts to increase precision (i.e. ln2_small + ln2_large = ln(2)) y += exponent * C::ln2_small(); y -= x2 * C::_1_2(); // [0, 0.25[ x += y; x += exponent * C::ln2_large(); break; case Base10: y += exponent * C::ln2_small(); y -= x2 * C::_1_2(); // [0, 0.25[ x += y; x += exponent * C::ln2_large(); x *= C::log10_e(); break; case Base2: { const V x_ = x; x *= C::log2_e(); y *= C::log2_e(); y -= x_ * x * C::_1_2(); // [0, 0.25[ x += y; x += exponent; break; } } } static Vc_ALWAYS_INLINE void log_series(Vector &VC_RESTRICT x, Vector::AsArg exponent) { typedef Vector V; typedef Const C; const V x2 = x * x; V y = C::P(0); V y2 = C::Q(0) + x; unrolled_loop16(i, 1, 5, y = y * x + C::P(i); y2 = y2 * x + C::Q(i); ); y2 = x / y2; y = y * x + C::P(5); y = x2 * y * y2; // TODO: refactor the following with the float implementation: switch (Base) { case BaseE: // ln(2) is split in two parts to increase precision (i.e. ln2_small + ln2_large = ln(2)) y += exponent * C::ln2_small(); y -= x2 * C::_1_2(); // [0, 0.25[ x += y; x += exponent * C::ln2_large(); break; case Base10: y += exponent * C::ln2_small(); y -= x2 * C::_1_2(); // [0, 0.25[ x += y; x += exponent * C::ln2_large(); x *= C::log10_e(); break; case Base2: { const V x_ = x; x *= C::log2_e(); y *= C::log2_e(); y -= x_ * x * C::_1_2(); // [0, 0.25[ x += y; x += exponent; break; } } } template static inline Vector calc(VC_ALIGNED_PARAMETER(Vector) _x) { typedef Vector V; typedef typename V::Mask M; typedef Const C; V x(_x); const M invalidMask = x < V::Zero(); const M infinityMask = x == V::Zero(); const M denormal = x <= C::min(); x(denormal) *= V(Vc_buildDouble(1, 0, 54)); // 2²⁵ V exponent = Internal::exponent(x.data()); // = ⎣log₂(x)⎦ exponent(denormal) -= 54; x.setZero(C::exponentMask()); // keep only the fractional part ⇒ x ∈ [1, 2[ x |= C::_1_2(); // and set the exponent to 2⁻¹ ⇒ x ∈ [½, 1[ // split calculation in two cases: // A: x ∈ [½, √½[ // B: x ∈ [√½, 1[ // √½ defines the point where Δe(x) := log₂(x) - ⎣log₂(x)⎦ = ½, i.e. // log₂(√½) - ⎣log₂(√½)⎦ = ½ * -1 - ⎣½ * -1⎦ = -½ + 1 = ½ const M smallX = x < C::_1_sqrt2(); x(smallX) += x; // => x ∈ [√½, 1[ ∪ [1.5, 1 + √½[ x -= V::One(); // => x ∈ [√½ - 1, 0[ ∪ [0.5, √½[ exponent(!smallX) += V::One(); log_series(x, exponent); // A: (ˣ⁄₂ᵉ - 1, e) B: (ˣ⁄₂ᵉ⁺¹ - 1, e + 1) x.setQnan(invalidMask); // x < 0 → NaN x(infinityMask) = C::neginf(); // x = 0 → -∞ return x; } }; template static Vc_ALWAYS_INLINE Vc_CONST Vector log(VC_ALIGNED_PARAMETER(Vector) x) { return LogImpl::calc(x); } template static Vc_ALWAYS_INLINE Vc_CONST Vector log10(VC_ALIGNED_PARAMETER(Vector) x) { return LogImpl::calc(x); } template static Vc_ALWAYS_INLINE Vc_CONST Vector log2(VC_ALIGNED_PARAMETER(Vector) x) { return LogImpl::calc(x); } } // namespace Common #ifdef VC__USE_NAMESPACE namespace VC__USE_NAMESPACE { using Vc::Common::log; using Vc::Common::log10; using Vc::Common::log2; } // namespace VC__USE_NAMESPACE #undef VC__USE_NAMESPACE #endif } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "undomacros.h" #endif // VC_COMMON_LOGARITHM_H Vc-0.7.4/common/macros.h000066400000000000000000000404421233512346000150220ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2010-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef VC_COMMON_MACROS_H #define VC_COMMON_MACROS_H #undef VC_COMMON_UNDOMACROS_H #include #if defined(VC_GCC) && !defined(__OPTIMIZE__) # if VC_GCC >= 0x40500 # pragma GCC diagnostic push # define Vc_POP_GCC_DIAGNOSTIC__ 1 # endif // GCC uses lots of old-style-casts in macros that disguise as intrinsics # pragma GCC diagnostic ignored "-Wold-style-cast" #endif #ifdef VC_MSVC # define ALIGN(n) __declspec(align(n)) # define STRUCT_ALIGN1(n) ALIGN(n) # define STRUCT_ALIGN2(n) # define ALIGNED_TYPEDEF(n, _type_, _newType_) typedef ALIGN(n) _type_ _newType_ #else # define ALIGN(n) __attribute__((aligned(n))) # define STRUCT_ALIGN1(n) # define STRUCT_ALIGN2(n) ALIGN(n) # define ALIGNED_TYPEDEF(n, _type_, _newType_) typedef _type_ _newType_ ALIGN(n) #endif #ifdef VC_CXX11 #define Vc_ALIGNOF(_TYPE_) alignof(_TYPE_) #else #define Vc_ALIGNOF(_TYPE_) __alignof(_TYPE_) #endif #ifdef VC_CLANG # define Vc_INTRINSIC_L inline # define Vc_INTRINSIC_R __attribute__((always_inline)) # define Vc_INTRINSIC Vc_INTRINSIC_L Vc_INTRINSIC_R # define Vc_FLATTEN # define Vc_CONST __attribute__((const)) # define Vc_CONST_L # define Vc_CONST_R Vc_CONST # define Vc_PURE __attribute__((pure)) # define Vc_PURE_L # define Vc_PURE_R Vc_PURE # define Vc_MAY_ALIAS __attribute__((may_alias)) # define Vc_ALWAYS_INLINE_L inline # define Vc_ALWAYS_INLINE_R __attribute__((always_inline)) # define Vc_ALWAYS_INLINE Vc_ALWAYS_INLINE_L Vc_ALWAYS_INLINE_R # define VC_IS_UNLIKELY(x) __builtin_expect(x, 0) # define VC_IS_LIKELY(x) __builtin_expect(x, 1) # define VC_RESTRICT __restrict__ # define VC_DEPRECATED(msg) #elif defined(__GNUC__) # if (defined(VC_GCC) && VC_GCC < 0x40300) || defined(VC_OPEN64) // GCC 4.1 and 4.2 ICE on may_alias. Since Open64 uses the GCC 4.2 frontend it has the same problem. 
# define Vc_MAY_ALIAS # else # define Vc_MAY_ALIAS __attribute__((__may_alias__)) # endif # if (defined(VC_GCC) && VC_GCC < 0x40300) // GCC 4.1 fails with "sorry unimplemented: inlining failed" # define Vc_INTRINSIC_R __attribute__((__flatten__)) # elif defined(VC_OPEN64) // the GCC 4.2 frontend doesn't know the __artificial__ attribute # define Vc_INTRINSIC_R __attribute__((__flatten__, __always_inline__)) # else # define Vc_INTRINSIC_R __attribute__((__flatten__, __always_inline__, __artificial__)) # endif # define Vc_INTRINSIC_L inline # define Vc_INTRINSIC Vc_INTRINSIC_L Vc_INTRINSIC_R # define Vc_FLATTEN __attribute__((__flatten__)) # define Vc_ALWAYS_INLINE_L inline # define Vc_ALWAYS_INLINE_R __attribute__((__always_inline__)) # define Vc_ALWAYS_INLINE Vc_ALWAYS_INLINE_L Vc_ALWAYS_INLINE_R # ifdef VC_ICC // ICC miscompiles if there are functions marked as pure or const # define Vc_PURE # define Vc_CONST # else # define Vc_PURE __attribute__((__pure__)) # define Vc_CONST __attribute__((__const__)) # endif # define Vc_CONST_L # define Vc_CONST_R Vc_CONST # define Vc_PURE_L # define Vc_PURE_R Vc_PURE # define VC_IS_UNLIKELY(x) __builtin_expect(x, 0) # define VC_IS_LIKELY(x) __builtin_expect(x, 1) # define VC_RESTRICT __restrict__ # define VC_DEPRECATED(msg) __attribute__((__deprecated__(msg))) #else # define Vc_FLATTEN # ifdef Vc_PURE # undef Vc_PURE # endif # define Vc_MAY_ALIAS # ifdef VC_MSVC # define Vc_ALWAYS_INLINE inline __forceinline # define Vc_ALWAYS_INLINE_L Vc_ALWAYS_INLINE # define Vc_ALWAYS_INLINE_R # define Vc_CONST __declspec(noalias) # define Vc_CONST_L Vc_CONST # define Vc_CONST_R # define Vc_PURE /*Vc_CONST*/ # define Vc_PURE_L Vc_PURE # define Vc_PURE_R # define Vc_INTRINSIC inline __forceinline # define Vc_INTRINSIC_L Vc_INTRINSIC # define Vc_INTRINSIC_R # else # define Vc_ALWAYS_INLINE # define Vc_ALWAYS_INLINE_L # define Vc_ALWAYS_INLINE_R # define Vc_CONST # define Vc_CONST_L # define Vc_CONST_R # define Vc_PURE # define Vc_PURE_L # define Vc_PURE_R # define Vc_INTRINSIC # define Vc_INTRINSIC_L # define Vc_INTRINSIC_R # endif # define VC_IS_UNLIKELY(x) x # define VC_IS_LIKELY(x) x # define VC_RESTRICT __restrict # define VC_DEPRECATED(msg) __declspec(deprecated(msg)) #endif #if __cplusplus >= 201103 /*C++11*/ #define _VC_CONSTEXPR constexpr #define _VC_CONSTEXPR_L _VC_CONSTEXPR #define _VC_CONSTEXPR_R #else #define _VC_CONSTEXPR Vc_INTRINSIC Vc_CONST #define _VC_CONSTEXPR_L Vc_INTRINSIC_L Vc_CONST_L #define _VC_CONSTEXPR_R Vc_INTRINSIC_R Vc_CONST_R #endif #ifdef VC_CXX11 # define _VC_NOEXCEPT noexcept #else # define _VC_NOEXCEPT throw() #endif #define FREE_STORE_OPERATORS_ALIGNED(alignment) \ Vc_ALWAYS_INLINE void *operator new(size_t size) { return _mm_malloc(size, alignment); } \ Vc_ALWAYS_INLINE void *operator new(size_t, void *p) { return p; } \ Vc_ALWAYS_INLINE void *operator new[](size_t size) { return _mm_malloc(size, alignment); } \ Vc_ALWAYS_INLINE void *operator new[](size_t , void *p) { return p; } \ Vc_ALWAYS_INLINE void operator delete(void *ptr, size_t) { _mm_free(ptr); } \ Vc_ALWAYS_INLINE void operator delete(void *, void *) {} \ Vc_ALWAYS_INLINE void operator delete[](void *ptr, size_t) { _mm_free(ptr); } \ Vc_ALWAYS_INLINE void operator delete[](void *, void *) {} #ifdef VC_GCC # define VC_WARN_INLINE # define VC_WARN(msg) __attribute__((warning("\n\t" msg))) #else # define VC_WARN_INLINE inline # define VC_WARN(msg) #endif #define unrolled_loop16(_it_, _start_, _end_, _code_) \ if (_start_ + 0 < _end_) { enum { _it_ = (_start_ + 0) < _end_ ? 
(_start_ + 0) : _start_ }; _code_ } \ if (_start_ + 1 < _end_) { enum { _it_ = (_start_ + 1) < _end_ ? (_start_ + 1) : _start_ }; _code_ } \ if (_start_ + 2 < _end_) { enum { _it_ = (_start_ + 2) < _end_ ? (_start_ + 2) : _start_ }; _code_ } \ if (_start_ + 3 < _end_) { enum { _it_ = (_start_ + 3) < _end_ ? (_start_ + 3) : _start_ }; _code_ } \ if (_start_ + 4 < _end_) { enum { _it_ = (_start_ + 4) < _end_ ? (_start_ + 4) : _start_ }; _code_ } \ if (_start_ + 5 < _end_) { enum { _it_ = (_start_ + 5) < _end_ ? (_start_ + 5) : _start_ }; _code_ } \ if (_start_ + 6 < _end_) { enum { _it_ = (_start_ + 6) < _end_ ? (_start_ + 6) : _start_ }; _code_ } \ if (_start_ + 7 < _end_) { enum { _it_ = (_start_ + 7) < _end_ ? (_start_ + 7) : _start_ }; _code_ } \ if (_start_ + 8 < _end_) { enum { _it_ = (_start_ + 8) < _end_ ? (_start_ + 8) : _start_ }; _code_ } \ if (_start_ + 9 < _end_) { enum { _it_ = (_start_ + 9) < _end_ ? (_start_ + 9) : _start_ }; _code_ } \ if (_start_ + 10 < _end_) { enum { _it_ = (_start_ + 10) < _end_ ? (_start_ + 10) : _start_ }; _code_ } \ if (_start_ + 11 < _end_) { enum { _it_ = (_start_ + 11) < _end_ ? (_start_ + 11) : _start_ }; _code_ } \ if (_start_ + 12 < _end_) { enum { _it_ = (_start_ + 12) < _end_ ? (_start_ + 12) : _start_ }; _code_ } \ if (_start_ + 13 < _end_) { enum { _it_ = (_start_ + 13) < _end_ ? (_start_ + 13) : _start_ }; _code_ } \ if (_start_ + 14 < _end_) { enum { _it_ = (_start_ + 14) < _end_ ? (_start_ + 14) : _start_ }; _code_ } \ if (_start_ + 15 < _end_) { enum { _it_ = (_start_ + 15) < _end_ ? (_start_ + 15) : _start_ }; _code_ } \ do {} while ( false ) #define for_all_vector_entries(_it_, _code_) \ unrolled_loop16(_it_, 0, Size, _code_) #ifdef VC_ASSERT #define VC_EXTERNAL_ASSERT 1 #else #ifdef NDEBUG #define VC_ASSERT(x) #else #include #define VC_ASSERT(x) assert(x); #endif #endif #ifdef VC_CLANG #define VC_HAS_BUILTIN(x) __has_builtin(x) #else #define VC_HAS_BUILTIN(x) 0 #endif #ifndef VC_COMMON_MACROS_H_ONCE #define VC_COMMON_MACROS_H_ONCE #define _VC_CAT_HELPER(a, b, c, d) a##b##c##d #define _VC_CAT(a, b, c, d) _VC_CAT_HELPER(a, b, c, d) #if __cplusplus >= 201103 /*C++11*/ || (defined(VC_MSVC) && VC_MSVC >= 160000000) #define VC_STATIC_ASSERT_NC(cond, msg) \ static_assert(cond, #msg) #define VC_STATIC_ASSERT(cond, msg) VC_STATIC_ASSERT_NC(cond, msg) #else // C++98 /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace { template struct STATIC_ASSERT_FAILURE; template<> struct STATIC_ASSERT_FAILURE {}; }} /*OUTER_NAMESPACE_END*/ #define VC_STATIC_ASSERT_NC(cond, msg) \ typedef STATIC_ASSERT_FAILURE _VC_CAT(static_assert_failed_on_line_,__LINE__,_,msg); \ enum { \ _VC_CAT(static_assert_failed__on_line_,__LINE__,_,msg) = sizeof(_VC_CAT(static_assert_failed_on_line_,__LINE__,_,msg)) \ } #define VC_STATIC_ASSERT(cond, msg) VC_STATIC_ASSERT_NC(cond, msg) #endif // C++11/98 template struct exponentToMultiplier { enum Values__ { X = exponentToMultiplier::X * ((e - center < 31) ? 2 : 1), Value = (X == 0 ? 
1 : X) }; }; template struct exponentToMultiplier { enum Values__ { X = 1, Value = X }; }; template struct exponentToMultiplier< -1, center> { enum Values__ { X = 0, Value = 1 }; }; template struct exponentToMultiplier< -128, center> { enum Values__ { X = 0, Value = 1 }; }; template struct exponentToMultiplier< -256, center> { enum Values__ { X = 0, Value = 1 }; }; template struct exponentToMultiplier< -384, center> { enum Values__ { X = 0, Value = 1 }; }; template struct exponentToMultiplier< -512, center> { enum Values__ { X = 0, Value = 1 }; }; template struct exponentToMultiplier< -640, center> { enum Values__ { X = 0, Value = 1 }; }; template struct exponentToMultiplier< -768, center> { enum Values__ { X = 0, Value = 1 }; }; template struct exponentToMultiplier< -896, center> { enum Values__ { X = 0, Value = 1 }; }; template struct exponentToMultiplier<-1024, center> { enum Values__ { X = 0, Value = 1 }; }; template struct exponentToDivisor { enum Values__ { X = exponentToDivisor::X * ((center - e < 31) ? 2 : 1), Value = (X == 0 ? 1 : X) }; }; template struct exponentToDivisor { enum Values__ { X = 1, Value = X }; }; template struct exponentToDivisor< 1, center> { enum Values__ { X = 0, Value = 1 }; }; template struct exponentToDivisor< 128, center> { enum Values__ { X = 0, Value = 1 }; }; template struct exponentToDivisor< 256, center> { enum Values__ { X = 0, Value = 1 }; }; template struct exponentToDivisor< 384, center> { enum Values__ { X = 0, Value = 1 }; }; template struct exponentToDivisor< 512, center> { enum Values__ { X = 0, Value = 1 }; }; template struct exponentToDivisor< 640, center> { enum Values__ { X = 0, Value = 1 }; }; template struct exponentToDivisor< 768, center> { enum Values__ { X = 0, Value = 1 }; }; template struct exponentToDivisor< 896, center> { enum Values__ { X = 0, Value = 1 }; }; template struct exponentToDivisor< 1024, center> { enum Values__ { X = 0, Value = 1 }; }; #endif // VC_COMMON_MACROS_H_ONCE #define _CAT_IMPL(a, b) a##b #define CAT(a, b) _CAT_IMPL(a, b) #define Vc_buildDouble(sign, mantissa, exponent) \ ((static_cast((mantissa & 0x000fffffffffffffull) | 0x0010000000000000ull) / 0x0010000000000000ull) \ * exponentToMultiplier::Value \ * exponentToMultiplier::Value \ * exponentToMultiplier::Value \ * exponentToMultiplier::Value \ / exponentToDivisor::Value \ / exponentToDivisor::Value \ / exponentToDivisor::Value \ / exponentToDivisor::Value \ * static_cast(sign)) #define Vc_buildFloat(sign, mantissa, exponent) \ ((static_cast((mantissa & 0x007fffffu) | 0x00800000) / 0x00800000) \ * exponentToMultiplier::Value \ * exponentToMultiplier::Value \ * exponentToMultiplier::Value \ * exponentToMultiplier::Value \ / exponentToDivisor::Value \ / exponentToDivisor::Value \ / exponentToDivisor::Value \ / exponentToDivisor::Value \ * static_cast(sign)) #define _VC_APPLY_IMPL_1(macro, a, b, c, d, e) macro(a) #define _VC_APPLY_IMPL_2(macro, a, b, c, d, e) macro(a, b) #define _VC_APPLY_IMPL_3(macro, a, b, c, d, e) macro(a, b, c) #define _VC_APPLY_IMPL_4(macro, a, b, c, d, e) macro(a, b, c, d) #define _VC_APPLY_IMPL_5(macro, a, b, c, d, e) macro(a, b, c, d, e) #define VC_LIST_FLOAT_VECTOR_TYPES(size, macro, a, b, c, d) \ size(macro, double_v, a, b, c, d) \ size(macro, float_v, a, b, c, d) \ size(macro, sfloat_v, a, b, c, d) #define VC_LIST_INT_VECTOR_TYPES(size, macro, a, b, c, d) \ size(macro, int_v, a, b, c, d) \ size(macro, uint_v, a, b, c, d) \ size(macro, short_v, a, b, c, d) \ size(macro, ushort_v, a, b, c, d) #define VC_LIST_VECTOR_TYPES(size, macro, 
a, b, c, d) \ VC_LIST_FLOAT_VECTOR_TYPES(size, macro, a, b, c, d) \ VC_LIST_INT_VECTOR_TYPES(size, macro, a, b, c, d) #define VC_LIST_COMPARES(size, macro, a, b, c, d) \ size(macro, ==, a, b, c, d) \ size(macro, !=, a, b, c, d) \ size(macro, <=, a, b, c, d) \ size(macro, >=, a, b, c, d) \ size(macro, < , a, b, c, d) \ size(macro, > , a, b, c, d) #define VC_LIST_LOGICAL(size, macro, a, b, c, d) \ size(macro, &&, a, b, c, d) \ size(macro, ||, a, b, c, d) #define VC_LIST_BINARY(size, macro, a, b, c, d) \ size(macro, |, a, b, c, d) \ size(macro, &, a, b, c, d) \ size(macro, ^, a, b, c, d) #define VC_LIST_SHIFTS(size, macro, a, b, c, d) \ size(macro, <<, a, b, c, d) \ size(macro, >>, a, b, c, d) #define VC_LIST_ARITHMETICS(size, macro, a, b, c, d) \ size(macro, +, a, b, c, d) \ size(macro, -, a, b, c, d) \ size(macro, *, a, b, c, d) \ size(macro, /, a, b, c, d) \ size(macro, %, a, b, c, d) #define VC_APPLY_0(_list, macro) _list(_VC_APPLY_IMPL_1, macro, 0, 0, 0, 0) #define VC_APPLY_1(_list, macro, a) _list(_VC_APPLY_IMPL_2, macro, a, 0, 0, 0) #define VC_APPLY_2(_list, macro, a, b) _list(_VC_APPLY_IMPL_3, macro, a, b, 0, 0) #define VC_APPLY_3(_list, macro, a, b, c) _list(_VC_APPLY_IMPL_4, macro, a, b, c, 0) #define VC_APPLY_4(_list, macro, a, b, c, d) _list(_VC_APPLY_IMPL_5, macro, a, b, c, d) #define VC_ALL_COMPARES(macro) VC_APPLY_0(VC_LIST_COMPARES, macro) #define VC_ALL_LOGICAL(macro) VC_APPLY_0(VC_LIST_LOGICAL, macro) #define VC_ALL_BINARY(macro) VC_APPLY_0(VC_LIST_BINARY, macro) #define VC_ALL_SHIFTS(macro) VC_APPLY_0(VC_LIST_SHIFTS, macro) #define VC_ALL_ARITHMETICS(macro) VC_APPLY_0(VC_LIST_ARITHMETICS, macro) #define VC_ALL_FLOAT_VECTOR_TYPES(macro) VC_APPLY_0(VC_LIST_FLOAT_VECTOR_TYPES, macro) #define VC_ALL_VECTOR_TYPES(macro) VC_APPLY_0(VC_LIST_VECTOR_TYPES, macro) #define VC_EXACT_TYPE(_test, _reference, _type) \ typename EnableIf::Value, _type>::Value #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN #define VC_ALIGNED_PARAMETER(_Type) const _Type & #else #define VC_ALIGNED_PARAMETER(_Type) const _Type #endif #ifndef Vc__make_unique #define Vc__make_unique(name) _VC_CAT(Vc__,name,_,__LINE__) #endif #if defined(VC_ICC) || defined(VC_CLANG) #define VC_OFFSETOF(Type, member) (reinterpret_cast(&reinterpret_cast(0)->member) - reinterpret_cast(0)) #elif defined(VC_GCC) && VC_GCC < 0x40500 #define VC_OFFSETOF(Type, member) (reinterpret_cast(&reinterpret_cast(0x1000)->member) - reinterpret_cast(0x1000)) #else #define VC_OFFSETOF(Type, member) offsetof(Type, member) #endif #endif // VC_COMMON_MACROS_H Vc-0.7.4/common/memory.h000066400000000000000000000564771233512346000150650ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
*/ #ifndef VC_COMMON_MEMORY_H #define VC_COMMON_MEMORY_H #include "memorybase.h" #include #include #include #include #include "memoryfwd.h" #include "macros.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { /** * Allocates memory on the Heap with alignment and padding suitable for vectorized access. * * Memory that was allocated with this function must be released with Vc::free! Other methods might * work but are not portable. * * \param n Specifies the number of objects the allocated memory must be able to store. * \tparam T The type of the allocated memory. Note, that the constructor is not called. * \tparam A Determines the alignment of the memory. See \ref Vc::MallocAlignment. * * \return Pointer to memory of the requested type, or 0 on error. The allocated memory is padded at * the end to be a multiple of the requested alignment \p A. Thus if you request memory for 21 * int objects, aligned via Vc::AlignOnCacheline, you can safely read a full cacheline until the * end of the array, without generating an out-of-bounds access. For a cacheline size of 64 Bytes * and an int size of 4 Bytes you would thus get an array of 128 Bytes to work with. * * \warning * \li The standard malloc function specifies the number of Bytes to allocate whereas this * function specifies the number of values, thus differing in a factor of sizeof(T). * \li This function is mainly meant for use with builtin types. If you use a custom * type with a sizeof that is not a multiple of 2 the results might not be what you expect. * \li The constructor of T is not called. You can make up for this: * \code * SomeType *array = new(Vc::malloc(N)) SomeType[N]; * \endcode * * \see Vc::free * * \ingroup Utilities * \headerfile memory.h */ template Vc_ALWAYS_INLINE_L T *Vc_ALWAYS_INLINE_R malloc(size_t n) { return static_cast(Internal::Helper::malloc(n * sizeof(T))); } /** * Frees memory that was allocated with Vc::malloc. * * \param p The pointer to the memory to be freed. * * \tparam T The type of the allocated memory. * * \warning The destructor of T is not called. If needed, you can call the destructor before calling * free: * \code * for (int i = 0; i < N; ++i) { * p[i].~T(); * } * Vc::free(p); * \endcode * * \ingroup Utilities * \headerfile memory.h * * \see Vc::malloc */ template Vc_ALWAYS_INLINE void free(T *p) { Internal::Helper::free(p); } template struct _MemorySizeCalculation { enum AlignmentCalculations { Alignment = V::Size, AlignmentMask = Alignment - 1, MaskedSize = Size & AlignmentMask, Padding = Alignment - MaskedSize, PaddedSize = MaskedSize == 0 ? Size : Size + Padding }; }; /** * \ingroup Utilities * \headerfile memory.h * * A helper class for fixed-size two-dimensional arrays. * * \param V The vector type you want to operate on. (e.g. 
float_v or uint_v) * \param Size1 Number of rows * \param Size2 Number of columns */ template class Memory : public VectorAlignedBaseT, public MemoryBase, 2, Memory > { public: typedef typename V::EntryType EntryType; private: typedef MemoryBase, 2, Memory > Base; friend class MemoryBase, 2, Memory >; friend class MemoryDimensionBase, 2, Memory >; enum InternalConstants { PaddedSize2 = _MemorySizeCalculation::PaddedSize }; #if defined(VC_ICC) && defined(_WIN32) __declspec(align(__alignof(VectorAlignedBaseT))) #elif defined(VC_CLANG) __attribute__((aligned(__alignof(VectorAlignedBaseT)))) #elif defined(VC_MSVC) VectorAlignedBaseT _force_alignment; // __declspec(align(#)) accepts only numbers not __alignof nor just VectorAlignment // by putting VectorAlignedBaseT here _force_alignment is aligned correctly. // the downside is that there's a lot of padding before m_mem (32 Bytes with SSE) :( #endif EntryType m_mem[Size1][PaddedSize2]; public: using Base::vector; enum Constants { RowCount = Size1, VectorsCount = PaddedSize2 / V::Size }; /** * \return the number of rows in the array. * * \note This function can be eliminated by an optimizing compiler. */ _VC_CONSTEXPR size_t rowsCount() const { return RowCount; } /** * \return the number of scalar entries in the whole array. * * \warning Do not use this function for scalar iteration over the array since there will be * padding between rows if \c Size2 is not divisible by \c V::Size. * * \note This function can be optimized into a compile-time constant. */ _VC_CONSTEXPR size_t entriesCount() const { return Size1 * Size2; } /** * \return the number of vectors in the whole array. * * \note This function can be optimized into a compile-time constant. */ _VC_CONSTEXPR size_t vectorsCount() const { return VectorsCount * Size1; } /** * Copies the data from a different object. * * \param rhs The object to copy the data from. * * \return reference to the modified Memory object. * * \note Both objects must have the exact same vectorsCount(). */ template Vc_ALWAYS_INLINE Memory &operator=(const MemoryBase &rhs) { assert(vectorsCount() == rhs.vectorsCount()); Internal::copyVectors(*this, rhs); return *this; } Vc_ALWAYS_INLINE Memory &operator=(const Memory &rhs) { Internal::copyVectors(*this, rhs); return *this; } /** * Initialize all data with the given vector. * * \param v This vector will be used to initialize the memory. * * \return reference to the modified Memory object. */ inline Memory &operator=(const V &v) { for (size_t i = 0; i < vectorsCount(); ++i) { vector(i) = v; } return *this; } } #if defined(VC_ICC) && VC_ICC < 20120212 && !defined(_WIN32) __attribute__((__aligned__(__alignof(VectorAlignedBaseT)))) #endif ; /** * A helper class to simplify usage of correctly aligned and padded memory, allowing both vector and * scalar access. * * Example: * \code Vc::Memory array; // scalar access: for (size_t i = 0; i < array.entriesCount(); ++i) { int x = array[i]; // read array[i] = x; // write } // more explicit alternative: for (size_t i = 0; i < array.entriesCount(); ++i) { int x = array.scalar(i); // read array.scalar(i) = x; // write } // vector access: for (size_t i = 0; i < array.vectorsCount(); ++i) { int_v x = array.vector(i); // read array.vector(i) = x; // write } * \endcode * This code allocates a small array and implements three equivalent loops (that do nothing useful). * The loops show how scalar and vector read/write access is best implemented. 
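 *
 * A further example, as a minimal sketch assuming \c Vc::int_v and the horizontal
 * reduction \c sum() of the vector classes: it accumulates all entries with one aligned
 * vector load per iteration, zero-initializing the array first so that the padding
 * entries at the end cannot contribute to the result.
 * \code
 * Vc::Memory<Vc::int_v, 11> array;
 * array = Vc::int_v::Zero();              // also zeroes the padding entries
 * for (size_t i = 0; i < array.entriesCount(); ++i) {
 *     array[i] = static_cast<int>(i);     // scalar writes to the 11 real entries
 * }
 * Vc::int_v acc = Vc::int_v::Zero();
 * for (size_t i = 0; i < array.vectorsCount(); ++i) {
 *     acc += array.vector(i);             // aligned vector load; padding stays zero
 * }
 * const int total = acc.sum();            // horizontal add, assumes Vector<T>::sum()
 * \endcode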
* * Since the size of 11 is not a multiple of int_v::Size (unless you use the * scalar Vc implementation) the last write access of the vector loop would normally be out of * bounds. But the Memory class automatically pads the memory such that the whole array can be * accessed with correctly aligned memory addresses. * * \param V The vector type you want to operate on. (e.g. float_v or uint_v) * \param Size The number of entries of the scalar base type the memory should hold. This * is thus the same number as you would use for a normal C array (e.g. float mem[11] becomes * Memory mem). * * \see Memory * * \ingroup Utilities * \headerfile memory.h */ template class Memory : public VectorAlignedBaseT, public MemoryBase, 1, void> { public: typedef typename V::EntryType EntryType; private: typedef MemoryBase, 1, void> Base; friend class MemoryBase, 1, void>; friend class MemoryDimensionBase, 1, void>; enum InternalConstants { Alignment = V::Size, AlignmentMask = Alignment - 1, MaskedSize = Size & AlignmentMask, Padding = Alignment - MaskedSize, PaddedSize = MaskedSize == 0 ? Size : Size + Padding }; #if defined(VC_ICC) && defined(_WIN32) __declspec(align(__alignof(VectorAlignedBaseT))) #elif defined(VC_CLANG) __attribute__((aligned(__alignof(VectorAlignedBaseT)))) #elif defined(VC_MSVC) VectorAlignedBaseT _force_alignment; // __declspec(align(#)) accepts only numbers not __alignof nor just VectorAlignment // by putting VectorAlignedBaseT here _force_alignment is aligned correctly. // the downside is that there's a lot of padding before m_mem (32 Bytes with SSE) :( #endif EntryType m_mem[PaddedSize]; public: using Base::vector; enum Constants { EntriesCount = Size, VectorsCount = PaddedSize / V::Size }; /** * Wrap existing data with the Memory convenience class. * * This function returns a \em reference to a Memory object that you must * capture to avoid a copy of the whole data: * \code * Memory &m = Memory::fromRawData(someAlignedPointerToFloat) * \endcode * * \param ptr An aligned pointer to memory of type \p V::EntryType (e.g. \c float for * Vc::float_v). * \return A Memory object placed at the given location in memory. * * \warning The pointer \p ptr passed to this function must be aligned according to the * alignment restrictions of \p V. * \warning The size of the accessible memory must match \p Size. This includes the * required padding at the end to allow the last entries to be accessed via vectors. If * you know what you are doing you might violate this constraint. * \warning It is your responsibility to ensure that the memory is released correctly * (not too early/not leaked). This function simply adds convenience functions to \em * access the memory. */ static Vc_ALWAYS_INLINE Vc_CONST Memory &fromRawData(EntryType *ptr) { // DANGER! This placement new has to use the right address. If the compiler decides // RowMemory requires padding before the actual data then the address has to be adjusted // accordingly char *addr = reinterpret_cast(ptr); typedef Memory MM; addr -= VC_OFFSETOF(MM, m_mem); return *new(addr) MM; } /** * \return the number of scalar entries in the whole array. * * \note This function can be optimized into a compile-time constant. */ _VC_CONSTEXPR size_t entriesCount() const { return EntriesCount; } /** * \return the number of vectors in the whole array. * * \note This function can be optimized into a compile-time constant. 
*/ _VC_CONSTEXPR size_t vectorsCount() const { return VectorsCount; } #ifdef VC_CXX11 Vc_ALWAYS_INLINE Memory() = default; #else Vc_ALWAYS_INLINE Memory() {} #endif inline Memory(const Memory &rhs) { Internal::copyVectors(*this, rhs); } template inline Memory(const Memory &rhs) { assert(vectorsCount() == rhs.vectorsCount()); Internal::copyVectors(*this, rhs); } inline Memory &operator=(const Memory &rhs) { Internal::copyVectors(*this, rhs); return *this; } template inline Memory &operator=(const Memory &rhs) { assert(vectorsCount() == rhs.vectorsCount()); Internal::copyVectors(*this, rhs); return *this; } Vc_ALWAYS_INLINE Memory &operator=(const EntryType *rhs) { std::memcpy(m_mem, rhs, entriesCount() * sizeof(EntryType)); return *this; } inline Memory &operator=(const V &v) { for (size_t i = 0; i < vectorsCount(); ++i) { vector(i) = v; } return *this; } } #if defined(VC_ICC) && VC_ICC < 20120212 && !defined(_WIN32) __attribute__((__aligned__(__alignof(VectorAlignedBaseT)) )) #endif ; /** * A helper class that is very similar to Memory but with dynamically allocated memory and * thus dynamic size. * * Example: * \code size_t size = 11; Vc::Memory array(size); // scalar access: for (size_t i = 0; i < array.entriesCount(); ++i) { array[i] = i; } // vector access: for (size_t i = 0; i < array.vectorsCount(); ++i) { array.vector(i) = int_v::IndexesFromZero() + i * int_v::Size; } * \endcode * This code allocates a small array with 11 scalar entries * and implements two equivalent loops that initialize the memory. * The scalar loop writes each individual int. The vectorized loop writes int_v::Size values to * memory per iteration. Since the size of 11 is not a multiple of int_v::Size (unless you use the * scalar Vc implementation) the last write access of the vector loop would normally be out of * bounds. But the Memory class automatically pads the memory such that the whole array can be * accessed with correctly aligned memory addresses. * (Note: the scalar loop can be auto-vectorized, except for the last three assignments.) * * \note The internal data pointer is not declared with the \c __restrict__ keyword. Therefore * modifying memory of V::EntryType will require the compiler to assume aliasing. If you want to use * the \c __restrict__ keyword you need to use a standard pointer to memory and do the vector * address calculation and loads and stores manually. * * \param V The vector type you want to operate on. (e.g. float_v or uint_v) * * \see Memory * * \ingroup Utilities * \headerfile memory.h */ template class Memory : public MemoryBase, 1, void> { public: typedef typename V::EntryType EntryType; private: typedef MemoryBase, 1, void> Base; friend class MemoryBase, 1, void>; friend class MemoryDimensionBase, 1, void>; enum InternalConstants { Alignment = V::Size, AlignmentMask = Alignment - 1 }; size_t m_entriesCount; size_t m_vectorsCount; EntryType *m_mem; size_t calcPaddedEntriesCount(size_t x) { size_t masked = x & AlignmentMask; return (masked == 0 ? x : x + (Alignment - masked)); } public: using Base::vector; /** * Allocate enough memory to access \p size values of type \p V::EntryType. * * The allocated memory is aligned and padded correctly for fully vectorized access. * * \param size Determines how many scalar values will fit into the allocated memory. */ Vc_ALWAYS_INLINE Memory(size_t size) : m_entriesCount(size), m_vectorsCount(calcPaddedEntriesCount(m_entriesCount)), m_mem(Vc::malloc(m_vectorsCount)) { m_vectorsCount /= V::Size; } /** * Copy the memory into a new memory area. 
* * The allocated memory is aligned and padded correctly for fully vectorized access. * * \param rhs The Memory object to copy from. */ template Vc_ALWAYS_INLINE Memory(const MemoryBase &rhs) : m_entriesCount(rhs.entriesCount()), m_vectorsCount(rhs.vectorsCount()), m_mem(Vc::malloc(m_vectorsCount * V::Size)) { Internal::copyVectors(*this, rhs); } /** * Overload of the above function. * * (Because C++ would otherwise not use the templated cctor and use a default-constructed cctor instead.) * * \param rhs The Memory object to copy from. */ Vc_ALWAYS_INLINE Memory(const Memory &rhs) : m_entriesCount(rhs.entriesCount()), m_vectorsCount(rhs.vectorsCount()), m_mem(Vc::malloc(m_vectorsCount * V::Size)) { Internal::copyVectors(*this, rhs); } /** * Frees the memory which was allocated in the constructor. */ Vc_ALWAYS_INLINE ~Memory() { Vc::free(m_mem); } /** * Swap the contents and size information of two Memory objects. * * \param rhs The other Memory object to swap. */ inline void swap(Memory &rhs) { std::swap(m_mem, rhs.m_mem); std::swap(m_entriesCount, rhs.m_entriesCount); std::swap(m_vectorsCount, rhs.m_vectorsCount); } /** * \return the number of scalar entries in the whole array. */ Vc_ALWAYS_INLINE Vc_PURE size_t entriesCount() const { return m_entriesCount; } /** * \return the number of vectors in the whole array. */ Vc_ALWAYS_INLINE Vc_PURE size_t vectorsCount() const { return m_vectorsCount; } /** * Overwrite all entries with the values stored in \p rhs. * * \param rhs The object to copy the data from. * * \return reference to the modified Memory object. * * \note this function requires the vectorsCount() of both Memory objects to be equal. */ template Vc_ALWAYS_INLINE Memory &operator=(const MemoryBase &rhs) { assert(vectorsCount() == rhs.vectorsCount()); Internal::copyVectors(*this, rhs); return *this; } Vc_ALWAYS_INLINE Memory &operator=(const Memory &rhs) { assert(vectorsCount() == rhs.vectorsCount()); Internal::copyVectors(*this, rhs); return *this; } /** * Overwrite all entries with the values stored in the memory at \p rhs. * * \param rhs The array to copy the data from. * * \return reference to the modified Memory object. * * \note this function requires that there are entriesCount() many values accessible from \p rhs. */ Vc_ALWAYS_INLINE Memory &operator=(const EntryType *rhs) { std::memcpy(m_mem, rhs, entriesCount() * sizeof(EntryType)); return *this; } }; /** * Prefetch the cacheline containing \p addr for a single read access. * * This prefetch completely bypasses the cache, not evicting any other data. * * \param addr The cacheline containing \p addr will be prefetched. * * \ingroup Utilities * \headerfile memory.h */ Vc_ALWAYS_INLINE void prefetchForOneRead(const void *addr) { Internal::Helper::prefetchForOneRead(addr); } /** * Prefetch the cacheline containing \p addr for modification. * * This prefetch evicts data from the cache. So use it only for data you really will use. When the * target system supports it the cacheline will be marked as modified while prefetching, saving work * later on. * * \param addr The cacheline containing \p addr will be prefetched. * * \ingroup Utilities * \headerfile memory.h */ Vc_ALWAYS_INLINE void prefetchForModify(const void *addr) { Internal::Helper::prefetchForModify(addr); } /** * Prefetch the cacheline containing \p addr to L1 cache. * * This prefetch evicts data from the cache. So use it only for data you really will use. * * \param addr The cacheline containing \p addr will be prefetched. 
* * \ingroup Utilities * \headerfile memory.h */ Vc_ALWAYS_INLINE void prefetchClose(const void *addr) { Internal::Helper::prefetchClose(addr); } /** * Prefetch the cacheline containing \p addr to L2 cache. * * This prefetch evicts data from the cache. So use it only for data you really will use. * * \param addr The cacheline containing \p addr will be prefetched. * * \ingroup Utilities * \headerfile memory.h */ Vc_ALWAYS_INLINE void prefetchMid(const void *addr) { Internal::Helper::prefetchMid(addr); } /** * Prefetch the cacheline containing \p addr to L3 cache. * * This prefetch evicts data from the cache. So use it only for data you really will use. * * \param addr The cacheline containing \p addr will be prefetched. * * \ingroup Utilities * \headerfile memory.h */ Vc_ALWAYS_INLINE void prefetchFar(const void *addr) { Internal::Helper::prefetchFar(addr); } } // namespace Vc /*OUTER_NAMESPACE_END*/ namespace std { template Vc_ALWAYS_INLINE void swap(Vc::Memory &a, Vc::Memory &b) { a.swap(b); } } // namespace std #include "undomacros.h" #endif // VC_COMMON_MEMORY_H Vc-0.7.4/common/memorybase.h000066400000000000000000000604201233512346000156770ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef VC_COMMON_MEMORYBASE_H #define VC_COMMON_MEMORYBASE_H #include #include "macros.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { #if __cplusplus >= 201103 || defined(VC_MSVC) #define VC_DECLTYPE(T1, op, T2) decltype(T1() op T2()) #elif defined(VC_OPEN64) || (defined(VC_GCC) && VC_GCC < 0x40300) #define VC_DECLTYPE(T1, op, T2) T1 #else namespace { struct one { char x; }; struct two { one x, y; }; template struct DecltypeHelper { static one test(const T1 &) { return one(); } static two test(const T2 &) { return two(); } //static void test(...) {} }; template struct DecltypeHelper { static one test(const T1 &) { return one(); } //static void test(...) {} }; template struct Decltype { typedef T1 Value; }; template struct Decltype { typedef T1 Value; }; template struct Decltype { typedef T2 Value; }; #ifdef VC_CLANG // this special case is only necessary to silence a warning (which is rather a note that clang // did the expected optimization): // warning: variable 'SOME_PTR' is not needed and will not be emitted [-Wunneeded-internal-declaration] // Then again, I don't remember why the SOME_PTR hack was necessary in the first place - some // strange compiler quirk... 
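// Illustrative summary of the emulation below: DecltypeHelper<T1, T2>::test is overloaded
// for T1 and T2, so applying sizeof to the unevaluated call test(a op b) yields
// sizeof(one) when the result of "a op b" is (best converted to) T1 and sizeof(two) when
// it is T2; Decltype<T1, T2, N> then maps that size back onto a typedef. Inside a template
// this gives decltype-like behaviour on compilers without native decltype, e.g. (hypothetical):
//     VC_DECLTYPE(V1, +, V2) result = x + y;   // 'result' has the type of "V1() + V2()"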
#define VC_DECLTYPE(T1, op, T2) typename Decltype::test(T1() op T2()))>::Value #else static const void *SOME_PTR; #define VC_DECLTYPE(T1, op, T2) typename Decltype::test(*static_cast(SOME_PTR) op *static_cast(SOME_PTR)))>::Value #endif } // anonymous namespace #endif #define VC_MEM_OPERATOR_EQ(op) \ template \ Vc_ALWAYS_INLINE VectorPointerHelper &operator op##=(const T &x) { \ const V result = V(m_ptr, Internal::FlagObject::the()) op x; \ result.store(m_ptr, Internal::FlagObject::the()); \ return *this; \ } /** * Helper class for the Memory::vector(size_t) class of functions. * * You will never need to directly make use of this class. It is an implementation detail of the * Memory API. * * \headerfile memorybase.h */ template class VectorPointerHelperConst { typedef typename V::EntryType EntryType; typedef typename V::Mask Mask; public: const EntryType *const m_ptr; explicit VectorPointerHelperConst(const EntryType *ptr) : m_ptr(ptr) {} /** * Cast to \p V operator. * * This function allows to assign this object to any object of type \p V. */ Vc_ALWAYS_INLINE Vc_PURE operator const V() const { return V(m_ptr, Internal::FlagObject::the()); } }; /** * Helper class for the Memory::vector(size_t) class of functions. * * You will never need to directly make use of this class. It is an implementation detail of the * Memory API. * * \headerfile memorybase.h */ template class VectorPointerHelper { typedef typename V::EntryType EntryType; typedef typename V::Mask Mask; public: EntryType *const m_ptr; explicit VectorPointerHelper(EntryType *ptr) : m_ptr(ptr) {} /** * Cast to \p V operator. * * This function allows to assign this object to any object of type \p V. */ Vc_ALWAYS_INLINE Vc_PURE operator const V() const { return V(m_ptr, Internal::FlagObject::the()); } template Vc_ALWAYS_INLINE VectorPointerHelper &operator=(const T &x) { V v; v = x; v.store(m_ptr, Internal::FlagObject::the()); return *this; } VC_ALL_BINARY(VC_MEM_OPERATOR_EQ) VC_ALL_ARITHMETICS(VC_MEM_OPERATOR_EQ) }; #undef VC_MEM_OPERATOR_EQ #define VC_VPH_OPERATOR(op) \ template \ VC_DECLTYPE(V1, op, V2) operator op(const VectorPointerHelper &x, const VectorPointerHelper &y) { \ return V1(x.m_ptr, Internal::FlagObject::the()) op V2(y.m_ptr, Internal::FlagObject::the()); \ } VC_ALL_ARITHMETICS(VC_VPH_OPERATOR) VC_ALL_BINARY (VC_VPH_OPERATOR) VC_ALL_COMPARES (VC_VPH_OPERATOR) #undef VC_VPH_OPERATOR template class MemoryDimensionBase; template class MemoryDimensionBase // {{{1 { private: Parent *p() { return static_cast(this); } const Parent *p() const { return static_cast(this); } public: /** * The type of the scalar entries in the array. */ typedef typename V::EntryType EntryType; /** * Returns a pointer to the start of the allocated memory. */ Vc_ALWAYS_INLINE Vc_PURE EntryType *entries() { return &p()->m_mem[0]; } /// Const overload of the above function. Vc_ALWAYS_INLINE Vc_PURE const EntryType *entries() const { return &p()->m_mem[0]; } /** * Returns the \p i-th scalar value in the memory. */ Vc_ALWAYS_INLINE Vc_PURE EntryType &scalar(size_t i) { return entries()[i]; } /// Const overload of the above function. Vc_ALWAYS_INLINE Vc_PURE const EntryType scalar(size_t i) const { return entries()[i]; } /** * Cast operator to the scalar type. This allows to use the object very much like a standard * C array. */ Vc_ALWAYS_INLINE Vc_PURE operator EntryType*() { return entries(); } /// Const overload of the above function. 
Vc_ALWAYS_INLINE Vc_PURE operator const EntryType*() const { return entries(); } // omit operator[] because the EntryType* cast operator suffices, for dox it makes sense to // show it, though because it helps API discoverability. #ifdef DOXYGEN /** * Returns the \p i-th scalar value in the memory. */ inline EntryType &operator[](size_t i); /// Const overload of the above function. inline const EntryType &operator[](size_t i) const; #endif /** * Uses a vector gather to combine the entries at the indexes in \p i into the returned * vector object. * * \param i An integer vector. It determines the entries to be gathered. * \returns A vector object. Modification of this object will not modify the values in * memory. * * \warning The API of this function might change in future versions of Vc to additionally * support scatters. */ template Vc_ALWAYS_INLINE Vc_PURE V operator[](Vector i) const { return V(entries(), i); } }; template class MemoryDimensionBase // {{{1 { private: Parent *p() { return static_cast(this); } const Parent *p() const { return static_cast(this); } public: /** * The type of the scalar entries in the array. */ typedef typename V::EntryType EntryType; static _VC_CONSTEXPR size_t rowCount() { return Parent::RowCount; } /** * Returns a pointer to the start of the allocated memory. */ Vc_ALWAYS_INLINE Vc_PURE EntryType *entries(size_t x = 0) { return &p()->m_mem[x][0]; } /// Const overload of the above function. Vc_ALWAYS_INLINE Vc_PURE const EntryType *entries(size_t x = 0) const { return &p()->m_mem[x][0]; } /** * Returns the \p i,j-th scalar value in the memory. */ Vc_ALWAYS_INLINE Vc_PURE EntryType &scalar(size_t i, size_t j) { return entries(i)[j]; } /// Const overload of the above function. Vc_ALWAYS_INLINE Vc_PURE const EntryType scalar(size_t i, size_t j) const { return entries(i)[j]; } /** * Returns the \p i-th row in the memory. */ Vc_ALWAYS_INLINE Vc_PURE RowMemory &operator[](size_t i) { return RowMemory::fromRawData(entries(i)); } /// Const overload of the above function. Vc_ALWAYS_INLINE Vc_PURE const RowMemory &operator[](size_t i) const { return RowMemory::fromRawData(const_cast(entries(i))); } /** * \return the number of rows in the array. * * \note This function can be eliminated by an optimizing compiler. */ Vc_ALWAYS_INLINE Vc_PURE size_t rowsCount() const { return p()->rowsCount(); } }; //{{{1 /** * \headerfile memorybase.h * * Common interface to all Memory classes, independent of allocation on the stack or heap. * * \param V The vector type you want to operate on. (e.g. float_v or uint_v) * \param Parent This type is the complete type of the class that derives from MemoryBase. * \param Dimension The number of dimensions the implementation provides. * \param RowMemory Class to be used to work on a single row. */ template class MemoryBase : public MemoryDimensionBase //{{{1 { private: Parent *p() { return static_cast(this); } const Parent *p() const { return static_cast(this); } public: /** * The type of the scalar entries in the array. */ typedef typename V::EntryType EntryType; /** * \return the number of scalar entries in the array. This function is optimized away * if a constant size array is used. */ Vc_ALWAYS_INLINE Vc_PURE size_t entriesCount() const { return p()->entriesCount(); } /** * \return the number of vector entries that span the array. This function is optimized away * if a constant size array is used. 
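*
* A typical traversal, as a sketch (it assumes a Vc::Memory object named \c mem that was
* declared with \c float_v as its vector type; the names are illustrative, not part of
* this header):
* \code
* for (size_t i = 0; i < mem.vectorsCount(); ++i) {
*     mem.vector(i) += float_v::One(); // aligned load, add 1, aligned store
* }
* \endcode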
*/ Vc_ALWAYS_INLINE Vc_PURE size_t vectorsCount() const { return p()->vectorsCount(); } using MemoryDimensionBase::entries; using MemoryDimensionBase::scalar; /** * \param i Selects the offset, where the vector should be read. * * \return a smart object to wrap the \p i-th vector in the memory. * * The return value can be used as any other vector object. I.e. you can substitute * something like * \code * float_v a = ..., b = ...; * a += b; * \endcode * with * \code * mem.vector(i) += b; * \endcode * * This function ensures that only \em aligned loads and stores are used. Thus it only allows to * access memory at fixed strides. If access to known offsets from the aligned vectors is * needed the vector(size_t, int) function can be used. */ Vc_ALWAYS_INLINE Vc_PURE VectorPointerHelper vector(size_t i) { return VectorPointerHelper(&entries()[i * V::Size]); } /** \brief Const overload of the above function * * \param i Selects the offset, where the vector should be read. * * \return a smart object to wrap the \p i-th vector in the memory. */ Vc_ALWAYS_INLINE Vc_PURE const VectorPointerHelperConst vector(size_t i) const { return VectorPointerHelperConst(&entries()[i * V::Size]); } /** * \return a smart object to wrap the vector starting from the \p i-th scalar entry in the memory. * * Example: * \code * Memory mem; * mem.setZero(); * for (int i = 0; i < mem.entriesCount(); i += float_v::Size) { * mem.vectorAt(i) += b; * } * \endcode * * \param i Specifies the scalar entry from where the vector will be loaded/stored. I.e. the * values scalar(i), scalar(i + 1), ..., scalar(i + V::Size - 1) will be read/overwritten. * * \param align You must take care to determine whether an unaligned load/store is * required. Per default an aligned load/store is used. If \p i is not a multiple of \c V::Size * you must pass Vc::Unaligned here. */ #ifdef DOXYGEN template inline VectorPointerHelper vectorAt(size_t i, A align = Vc::Aligned); /** \brief Const overload of the above function * * \return a smart object to wrap the vector starting from the \p i-th scalar entry in the memory. * * \param i Specifies the scalar entry from where the vector will be loaded/stored. I.e. the * values scalar(i), scalar(i + 1), ..., scalar(i + V::Size - 1) will be read/overwritten. * * \param align You must take care to determine whether an unaligned load/store is * required. Per default an aligned load/store is used. If \p i is not a multiple of \c V::Size * you must pass Vc::Unaligned here. */ template inline const VectorPointerHelperConst vectorAt(size_t i, A align = Vc::Aligned) const; #else template Vc_ALWAYS_INLINE Vc_PURE VectorPointerHelper vectorAt(size_t i, A) { return VectorPointerHelper(&entries()[i]); } template Vc_ALWAYS_INLINE Vc_PURE const VectorPointerHelperConst vectorAt(size_t i, A) const { return VectorPointerHelperConst(&entries()[i]); } Vc_ALWAYS_INLINE Vc_PURE VectorPointerHelper vectorAt(size_t i) { return VectorPointerHelper(&entries()[i]); } Vc_ALWAYS_INLINE Vc_PURE const VectorPointerHelperConst vectorAt(size_t i) const { return VectorPointerHelperConst(&entries()[i]); } #endif /** * \return a smart object to wrap the \p i-th vector + \p shift in the memory. * * This function ensures that only \em unaligned loads and stores are used. * It allows to access memory at any location aligned to the entry type. * * \param i Selects the memory location of the i-th vector. Thus if \p V::Size == 4 and * \p i is set to 3 the base address for the load/store will be the 12th entry * (same as \p &mem[12]). 
* \param shift Shifts the base address determined by parameter \p i by \p shift many * entries. Thus \p vector(3, 1) for \p V::Size == 4 will load/store the * 13th - 16th entries (same as \p &mem[13]). * * \note Any shift value is allowed as long as you make sure it stays within bounds of the * allocated memory. Shift values that are a multiple of \p V::Size will \em not result in * aligned loads. You have to use the above vector(size_t) function for aligned loads * instead. * * \note Thus a simple way to access vectors randomly is to set \p i to 0 and use \p shift as the * parameter to select the memory address: * \code * // don't use: * mem.vector(i / V::Size, i % V::Size) += 1; * // instead use: * mem.vector(0, i) += 1; * \endcode */ Vc_ALWAYS_INLINE Vc_PURE VectorPointerHelper vector(size_t i, int shift) { return VectorPointerHelper(&entries()[i * V::Size + shift]); } /// Const overload of the above function. Vc_ALWAYS_INLINE Vc_PURE const VectorPointerHelperConst vector(size_t i, int shift) const { return VectorPointerHelperConst(&entries()[i * V::Size + shift]); } /** * \return the first vector in the allocated memory. * * This function is simply a shorthand for vector(0). */ Vc_ALWAYS_INLINE Vc_PURE VectorPointerHelper firstVector() { return VectorPointerHelper(entries()); } /// Const overload of the above function. Vc_ALWAYS_INLINE Vc_PURE const VectorPointerHelperConst firstVector() const { return VectorPointerHelperConst(entries()); } /** * \return the last vector in the allocated memory. * * This function is simply a shorthand for vector(vectorsCount() - 1). */ Vc_ALWAYS_INLINE Vc_PURE VectorPointerHelper lastVector() { return VectorPointerHelper(&entries()[vectorsCount() * V::Size - V::Size]); } /// Const overload of the above function. 
Vc_ALWAYS_INLINE Vc_PURE const VectorPointerHelperConst lastVector() const { return VectorPointerHelperConst(&entries()[vectorsCount() * V::Size - V::Size]); } Vc_ALWAYS_INLINE Vc_PURE V gather(const unsigned char *indexes) const { return V(entries(), indexes); } Vc_ALWAYS_INLINE Vc_PURE V gather(const unsigned short *indexes) const { return V(entries(), indexes); } Vc_ALWAYS_INLINE Vc_PURE V gather(const unsigned int *indexes) const { return V(entries(), indexes); } Vc_ALWAYS_INLINE Vc_PURE V gather(const unsigned long *indexes) const { return V(entries(), indexes); } Vc_ALWAYS_INLINE void setZero() { V zero(Vc::Zero); for (size_t i = 0; i < vectorsCount(); ++i) { vector(i) = zero; } } template inline Parent &operator+=(const MemoryBase &rhs) { assert(vectorsCount() == rhs.vectorsCount()); for (size_t i = 0; i < vectorsCount(); ++i) { vector(i) += rhs.vector(i); } return static_cast(*this); } template inline Parent &operator-=(const MemoryBase &rhs) { assert(vectorsCount() == rhs.vectorsCount()); for (size_t i = 0; i < vectorsCount(); ++i) { vector(i) -= rhs.vector(i); } return static_cast(*this); } template inline Parent &operator*=(const MemoryBase &rhs) { assert(vectorsCount() == rhs.vectorsCount()); for (size_t i = 0; i < vectorsCount(); ++i) { vector(i) *= rhs.vector(i); } return static_cast(*this); } template inline Parent &operator/=(const MemoryBase &rhs) { assert(vectorsCount() == rhs.vectorsCount()); for (size_t i = 0; i < vectorsCount(); ++i) { vector(i) /= rhs.vector(i); } return static_cast(*this); } inline Parent &operator+=(EntryType rhs) { V v(rhs); for (size_t i = 0; i < vectorsCount(); ++i) { vector(i) += v; } return static_cast(*this); } inline Parent &operator-=(EntryType rhs) { V v(rhs); for (size_t i = 0; i < vectorsCount(); ++i) { vector(i) -= v; } return static_cast(*this); } inline Parent &operator*=(EntryType rhs) { V v(rhs); for (size_t i = 0; i < vectorsCount(); ++i) { vector(i) *= v; } return static_cast(*this); } inline Parent &operator/=(EntryType rhs) { V v(rhs); for (size_t i = 0; i < vectorsCount(); ++i) { vector(i) /= v; } return static_cast(*this); } template inline bool operator==(const MemoryBase &rhs) const { assert(vectorsCount() == rhs.vectorsCount()); for (size_t i = 0; i < vectorsCount(); ++i) { if (!(V(vector(i)) == V(rhs.vector(i))).isFull()) { return false; } } return true; } template inline bool operator!=(const MemoryBase &rhs) const { assert(vectorsCount() == rhs.vectorsCount()); for (size_t i = 0; i < vectorsCount(); ++i) { if (!(V(vector(i)) == V(rhs.vector(i))).isEmpty()) { return false; } } return true; } template inline bool operator<(const MemoryBase &rhs) const { assert(vectorsCount() == rhs.vectorsCount()); for (size_t i = 0; i < vectorsCount(); ++i) { if (!(V(vector(i)) < V(rhs.vector(i))).isFull()) { return false; } } return true; } template inline bool operator<=(const MemoryBase &rhs) const { assert(vectorsCount() == rhs.vectorsCount()); for (size_t i = 0; i < vectorsCount(); ++i) { if (!(V(vector(i)) <= V(rhs.vector(i))).isFull()) { return false; } } return true; } template inline bool operator>(const MemoryBase &rhs) const { assert(vectorsCount() == rhs.vectorsCount()); for (size_t i = 0; i < vectorsCount(); ++i) { if (!(V(vector(i)) > V(rhs.vector(i))).isFull()) { return false; } } return true; } template inline bool operator>=(const MemoryBase &rhs) const { assert(vectorsCount() == rhs.vectorsCount()); for (size_t i = 0; i < vectorsCount(); ++i) { if (!(V(vector(i)) >= V(rhs.vector(i))).isFull()) { return false; } } return 
true; } }; namespace Internal { template inline void copyVectors(MemoryBase &dst, const MemoryBase &src) { const size_t vectorsCount = dst.vectorsCount(); size_t i = 3; for (; i < vectorsCount; i += 4) { const V tmp3 = src.vector(i - 3); const V tmp2 = src.vector(i - 2); const V tmp1 = src.vector(i - 1); const V tmp0 = src.vector(i - 0); dst.vector(i - 3) = tmp3; dst.vector(i - 2) = tmp2; dst.vector(i - 1) = tmp1; dst.vector(i - 0) = tmp0; } for (i -= 3; i < vectorsCount; ++i) { dst.vector(i) = src.vector(i); } } } // namespace Internal } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "undomacros.h" #endif // VC_COMMON_MEMORYBASE_H Vc-0.7.4/common/memoryfwd.h000066400000000000000000000017451233512346000155520ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2011 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef VC_COMMON_MEMORYFWD_H #define VC_COMMON_MEMORYFWD_H /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { template class Memory; } // namespace Vc /*OUTER_NAMESPACE_END*/ #endif // VC_COMMON_MEMORYFWD_H Vc-0.7.4/common/operators.h000066400000000000000000000341121233512346000155510ustar00rootroot00000000000000#ifndef VC_ICC // ICC ICEs if the following type-traits are in the anonymous namespace namespace { #endif template struct EnableIfNeitherIntegerNorVector : public EnableIf::Value, T> {}; template struct EnableIfNeitherIntegerNorVector, T>; template struct IsVector { enum { Value = false }; }; template struct IsVector > { enum { Value = true }; }; template struct IsTypeCombinationOf { enum { Value = IsVector::Value ? (IsVector::Value ? ( // Vec × Vec ( IsEqualType::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || (HasImplicitCast::Value && IsEqualType::Value && !HasImplicitCast::Value) || ( IsEqualType::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || (HasImplicitCast::Value && IsEqualType::Value && !HasImplicitCast::Value) ) : ( // Vec × Scalar (HasImplicitCast::Value && IsEqualType::Value && !HasImplicitCast::Value) || ( IsEqualType::Value && HasImplicitCast::Value && !HasImplicitCast::Value) )) : (IsVector::Value ? ( // Scalar × Vec ( IsEqualType::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || (HasImplicitCast::Value && IsEqualType::Value && !HasImplicitCast::Value) ) : ( // Scalar × Scalar ( IsEqualType::Value && IsEqualType::Value) || ( IsEqualType::Value && IsEqualType::Value) )) }; }; template struct IsVectorOperands { enum { Value = (HasImplicitCast::Value && !HasImplicitCast::Value && !IsEqualType::Value && IsEqualType::Value) || (HasImplicitCast::Value && !HasImplicitCast::Value && !IsEqualType::Value && IsEqualType::Value) }; }; #ifndef VC_ICC } #endif // float-int arithmetic operators //{{{1 // These operators must be very picky about the exact types they want to handle. Once (uncontrolled) // implicit type conversions get involved, ambiguous overloads will occur. E.g. 
a simple int × enum // will become ambiguous because it can convert both to a vector type, which then can execute the // operator. We can't argue that such code should not be used - it could break existing code, not // under control of the developer, just by putting the Vc header somewhere on top. // // The following type combinations are safe (always symmetric): // 1. Vector × Vector // 2. Vector × Scalar (int, float, enum value, ...) // 3. Some object that has a vector cast operator × Vector // 4. Some object that has a vector cast operator × Scalar // // Additionally there are restrictions on which types combine to what resulting type: // 1.a. float × double_v -> double_v // 1.b. any int × double_v -> double_v // 2.a. (u)int_v × float_v -> float_v // 2.b. (u)int_v × float -> float_v // 2.c. any int × float_v -> float_v // 3.a. (u)short_v × sfloat_v -> sfloat_v // 3.b. (u)short_v × float -> sfloat_v // 3.c. short × sfloat_v -> sfloat_v // 4.a. int_v × uint_v -> uint_v // 4.b. any int × uint_v -> uint_v // 4.c. unsigned int × int_v -> uint_v // 4.d. signed int × int_v -> int_v // 5. shorts like ints #define VC_OPERATOR_FORWARD_(ret, op) \ template static Vc_ALWAYS_INLINE typename EnableIf< \ IsVectorOperands::Value || \ ((IsEqualType::Value || IsLikeInteger::Value) && HasImplicitCast::Value && !HasImplicitCast::Value) || \ ((IsEqualType::Value || IsLikeInteger::Value) && HasImplicitCast::Value && !HasImplicitCast::Value) || \ false, double_##ret>::Value operator op(const T0 &x, const T1 &y) { return double_v(x) op double_v(y); } \ \ template static Vc_ALWAYS_INLINE typename EnableIf< \ IsVectorOperands::Value || \ IsTypeCombinationOf::Value || \ IsTypeCombinationOf::Value || \ IsTypeCombinationOf::Value || \ IsTypeCombinationOf::Value || \ (IsLikeInteger::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ (IsLikeInteger::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ false, float_##ret>::Value operator op(const T0 &x, const T1 &y) { return float_v(x) op float_v(y); } \ \ template static Vc_ALWAYS_INLINE typename EnableIf< \ IsVectorOperands::Value || \ IsTypeCombinationOf::Value || \ IsTypeCombinationOf::Value || \ IsTypeCombinationOf::Value || \ IsTypeCombinationOf::Value || \ (IsLikeInteger::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ (IsLikeInteger::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ false, sfloat_##ret>::Value operator op(const T0 &x, const T1 &y) { return sfloat_v(x) op sfloat_v(y); } \ \ template static Vc_ALWAYS_INLINE typename EnableIf< \ IsVectorOperands::Value || \ IsTypeCombinationOf::Value || \ (IsUnsignedInteger::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ (IsUnsignedInteger::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ (IsLikeInteger::Value && !IsEqualType::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ (IsLikeInteger::Value && !IsEqualType::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ false, uint_##ret>::Value operator op(const T0 &x, const T1 &y) { return uint_v(x) op uint_v(y); } \ template static Vc_ALWAYS_INLINE typename EnableIf< \ IsVectorOperands::Value || \ (IsLikeSignedInteger::Value && !IsEqualType::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ (IsLikeSignedInteger::Value && !IsEqualType::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ false, int_##ret>::Value operator op(const T0 &x, const T1 &y) { return int_v(x) op int_v(y); } \ \ template static Vc_ALWAYS_INLINE 
typename EnableIf< \ IsVectorOperands::Value || \ IsTypeCombinationOf::Value || \ (IsUnsignedInteger::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ (IsUnsignedInteger::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ (IsLikeInteger::Value && !IsEqualType::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ (IsLikeInteger::Value && !IsEqualType::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ false, ushort_##ret>::Value operator op(const T0 &x, const T1 &y) { return ushort_v(x) op ushort_v(y); } \ template static Vc_ALWAYS_INLINE typename EnableIf< \ IsVectorOperands::Value || \ (IsLikeSignedInteger::Value && !IsEqualType::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ (IsLikeSignedInteger::Value && !IsEqualType::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ false, short_##ret>::Value operator op(const T0 &x, const T1 &y) { return short_v(x) op short_v(y); } // break incorrect combinations #define VC_OPERATOR_INTENTIONAL_ERROR_1(V, op) \ template static inline typename EnableIfNeitherIntegerNorVector >::Value operator op(const V &, const T &) { return Vc::Error::invalid_operands_of_types(); } \ template static inline typename EnableIfNeitherIntegerNorVector >::Value operator op(const T &, const V &) { return Vc::Error::invalid_operands_of_types(); } #define VC_OPERATOR_INTENTIONAL_ERROR_2(V1, V2, op) \ static inline Vc::Error::invalid_operands_of_types operator op(V1::AsArg, V2::AsArg) { return Vc::Error::invalid_operands_of_types(); } \ static inline Vc::Error::invalid_operands_of_types operator op(V2::AsArg, V1::AsArg) { return Vc::Error::invalid_operands_of_types(); } #define VC_OPERATOR_INTENTIONAL_ERROR_3(V, _T, op) \ template static inline typename EnableIf::Value, Vc::Error::invalid_operands_of_types >::Value operator op(const V &, const T &) { return Vc::Error::invalid_operands_of_types(); } \ template static inline typename EnableIf::Value, Vc::Error::invalid_operands_of_types >::Value operator op(const T &, const V &) { return Vc::Error::invalid_operands_of_types(); } //#define VC_EXTRA_CHECKING #ifdef VC_EXTRA_CHECKING #define VC_OPERATOR_INTENTIONAL_ERROR(op) \ VC_OPERATOR_INTENTIONAL_ERROR_2(double_v, sfloat_v, op) \ VC_OPERATOR_INTENTIONAL_ERROR_2(double_v, float_v, op) \ VC_OPERATOR_INTENTIONAL_ERROR_2(double_v, int_v, op) \ VC_OPERATOR_INTENTIONAL_ERROR_2(double_v, uint_v, op) \ VC_OPERATOR_INTENTIONAL_ERROR_2(double_v, short_v, op) \ VC_OPERATOR_INTENTIONAL_ERROR_2(double_v, ushort_v, op) \ VC_OPERATOR_INTENTIONAL_ERROR_2( int_v, short_v, op) \ VC_OPERATOR_INTENTIONAL_ERROR_2( uint_v, short_v, op) \ VC_OPERATOR_INTENTIONAL_ERROR_2( int_v, ushort_v, op) \ VC_OPERATOR_INTENTIONAL_ERROR_2( uint_v, ushort_v, op) \ VC_APPLY_1(VC_LIST_VECTOR_TYPES, VC_OPERATOR_INTENTIONAL_ERROR_1, op) \ VC_OPERATOR_INTENTIONAL_ERROR_2( float_v, short_v, op) \ VC_OPERATOR_INTENTIONAL_ERROR_2( float_v, ushort_v, op) \ VC_OPERATOR_INTENTIONAL_ERROR_2(sfloat_v, float_v, op) \ VC_OPERATOR_INTENTIONAL_ERROR_2(sfloat_v, int_v, op) \ VC_OPERATOR_INTENTIONAL_ERROR_2(sfloat_v, uint_v, op) \ VC_OPERATOR_INTENTIONAL_ERROR_3( float_v, double, op) \ VC_OPERATOR_INTENTIONAL_ERROR_3(sfloat_v, double, op) #else #define VC_OPERATOR_INTENTIONAL_ERROR(op) #endif #define VC_OPERATOR_FORWARD_COMMUTATIVE(ret, op, op2) \ template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, double, double_##ret) operator op(T x, double_v::AsArg y) { return y op2 x; } \ template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, float, sfloat_##ret) 
operator op(T x, sfloat_v::AsArg y) { return y op2 x; } \ template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, float, float_##ret) operator op(T x, float_v::AsArg y) { return y op2 x; } \ template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, int, int_##ret) operator op(T x, int_v::AsArg y) { return y op2 x; } \ template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, unsigned int, uint_##ret) operator op(T x, uint_v::AsArg y) { return y op2 x; } \ template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, short, short_##ret) operator op(T x, short_v::AsArg y) { return y op2 x; } \ template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, unsigned short, ushort_##ret) operator op(T x, ushort_v::AsArg y) { return y op2 x; } \ VC_OPERATOR_FORWARD_(ret, op) \ VC_OPERATOR_INTENTIONAL_ERROR(op) #define VC_OPERATOR_FORWARD(ret, op) \ template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, double, double_##ret) operator op(T x, double_v::AsArg y) { return double_v(x) op y; } \ template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, float, sfloat_##ret) operator op(T x, sfloat_v::AsArg y) { return sfloat_v(x) op y; } \ template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, float, float_##ret) operator op(T x, float_v::AsArg y) { return float_v(x) op y; } \ template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, int, int_##ret) operator op(T x, int_v::AsArg y) { return int_v(x) op y; } \ template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, unsigned int, uint_##ret) operator op(T x, uint_v::AsArg y) { return uint_v(x) op y; } \ template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, short, short_##ret) operator op(T x, short_v::AsArg y) { return short_v(x) op y; } \ template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, unsigned short, ushort_##ret) operator op(T x, ushort_v::AsArg y) { return ushort_v(x) op y; } \ VC_OPERATOR_FORWARD_(ret, op) \ VC_OPERATOR_INTENTIONAL_ERROR(op) VC_OPERATOR_FORWARD_COMMUTATIVE(v, *, *) VC_OPERATOR_FORWARD(v, /) VC_OPERATOR_FORWARD_COMMUTATIVE(v, +, +) VC_OPERATOR_FORWARD(v, -) VC_OPERATOR_FORWARD_COMMUTATIVE(v, |, |) VC_OPERATOR_FORWARD_COMMUTATIVE(v, &, &) VC_OPERATOR_FORWARD_COMMUTATIVE(v, ^, ^) VC_OPERATOR_FORWARD_COMMUTATIVE(m, <, >) VC_OPERATOR_FORWARD_COMMUTATIVE(m, >, <) VC_OPERATOR_FORWARD_COMMUTATIVE(m, <=, >=) VC_OPERATOR_FORWARD_COMMUTATIVE(m, >=, <=) VC_OPERATOR_FORWARD_COMMUTATIVE(m, ==, ==) VC_OPERATOR_FORWARD_COMMUTATIVE(m, !=, !=) #undef VC_OPERATOR_FORWARD_ #undef VC_OPERATOR_INTENTIONAL_ERROR_1 #undef VC_OPERATOR_INTENTIONAL_ERROR_2 #undef VC_OPERATOR_INTENTIONAL_ERROR #undef VC_OPERATOR_FORWARD_COMMUTATIVE #undef VC_OPERATOR_FORWARD // }}}1 Vc-0.7.4/common/storage.h000066400000000000000000000110761233512346000152030ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2010-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
*/ #ifndef VC_COMMON_STORAGE_H #define VC_COMMON_STORAGE_H #include "aliasingentryhelper.h" #include "macros.h" #include "types.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace Common { template class VectorMemoryUnion { public: typedef _VectorType VectorType; typedef _EntryType EntryType; typedef EntryType AliasingEntryType Vc_MAY_ALIAS; Vc_ALWAYS_INLINE VectorMemoryUnion() { assertCorrectAlignment(&v()); } #if defined VC_ICC || defined VC_MSVC Vc_ALWAYS_INLINE VectorMemoryUnion(const VectorType &x) { data.v = x; assertCorrectAlignment(&data.v); } Vc_ALWAYS_INLINE VectorMemoryUnion &operator=(const VectorType &x) { data.v = x; return *this; } Vc_ALWAYS_INLINE Vc_PURE VectorType &v() { return reinterpret_cast(data.v); } Vc_ALWAYS_INLINE Vc_PURE const VectorType &v() const { return reinterpret_cast(data.v); } #if defined VC_ICC Vc_ALWAYS_INLINE Vc_PURE AliasingEntryHelper m(size_t index) { return AliasingEntryHelper(this, index); } Vc_ALWAYS_INLINE void assign(size_t index, EntryType x) { data.m[index] = x; } Vc_ALWAYS_INLINE Vc_PURE EntryType read(size_t index) const { return data.m[index]; } #else Vc_ALWAYS_INLINE Vc_PURE EntryType &m(size_t index) { return data.m[index]; } #endif Vc_ALWAYS_INLINE Vc_PURE EntryType m(size_t index) const { return data.m[index]; } #ifdef VC_COMPILE_BENCHMARKS public: #endif private: union VectorScalarUnion { VectorTypeBase v; EntryType m[sizeof(VectorTypeBase)/sizeof(EntryType)]; } data; #else Vc_ALWAYS_INLINE VectorMemoryUnion(VectorType x) : data(x) { assertCorrectAlignment(&data); } Vc_ALWAYS_INLINE VectorMemoryUnion &operator=(VectorType x) { data = x; return *this; } Vc_ALWAYS_INLINE Vc_PURE VectorType &v() { return data; } Vc_ALWAYS_INLINE Vc_PURE const VectorType &v() const { return data; } Vc_ALWAYS_INLINE Vc_PURE AliasingEntryType &m(size_t index) { return reinterpret_cast(&data)[index]; } Vc_ALWAYS_INLINE Vc_PURE EntryType m(size_t index) const { return reinterpret_cast(&data)[index]; } private: #ifdef VC_COMPILE_BENCHMARKS public: #endif VectorType data; #endif }; #if VC_GCC == 0x40700 || (VC_GCC >= 0x40600 && VC_GCC <= 0x40603) // workaround bug 52736 in GCC template static Vc_ALWAYS_INLINE Vc_CONST T &vectorMemoryUnionAliasedMember(V *data, size_t index) { if (__builtin_constant_p(index) && index == 0) { T *ret; asm("mov %1,%0" : "=r"(ret) : "r"(data)); return *ret; } else { return reinterpret_cast(data)[index]; } } template<> Vc_ALWAYS_INLINE Vc_PURE VectorMemoryUnion<__m128d, double>::AliasingEntryType &VectorMemoryUnion<__m128d, double>::m(size_t index) { return vectorMemoryUnionAliasedMember(&data, index); } template<> Vc_ALWAYS_INLINE Vc_PURE VectorMemoryUnion<__m128i, long long>::AliasingEntryType &VectorMemoryUnion<__m128i, long long>::m(size_t index) { return vectorMemoryUnionAliasedMember(&data, index); } template<> Vc_ALWAYS_INLINE Vc_PURE VectorMemoryUnion<__m128i, unsigned long long>::AliasingEntryType &VectorMemoryUnion<__m128i, unsigned long long>::m(size_t index) { return vectorMemoryUnionAliasedMember(&data, index); } #endif } // namespace Common } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "undomacros.h" #endif // VC_COMMON_STORAGE_H Vc-0.7.4/common/support.h000066400000000000000000000003751233512346000152530ustar00rootroot00000000000000#ifndef VC_DEPRECATED_COMMON_SUPPORT_H #define VC_DEPRECATED_COMMON_SUPPORT_H #ifdef __GNUC__ #warning "the header is deprecated. Use instead." 
#endif #include #endif // VC_DEPRECATED_COMMON_SUPPORT_H Vc-0.7.4/common/trigonometric.h000066400000000000000000000064411233512346000164240ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef VC_COMMON_TRIGONOMETRIC_H #define VC_COMMON_TRIGONOMETRIC_H #ifndef VC__USE_NAMESPACE #error "Do not include Vc/common/trigonometric.h outside of Vc itself" #endif #include "macros.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace { using Vc::VC__USE_NAMESPACE::Vector; } // namespace namespace Internal { template struct MapImpl { enum Dummy { Value = Impl }; }; template<> struct MapImpl { enum Dummy { Value = MapImpl::Value }; }; typedef ImplementationT::Value #if defined(VC_IMPL_XOP) && defined(VC_IMPL_FMA4) + Vc::XopInstructions + Vc::Fma4Instructions #endif > TrigonometricImplementation; } // namespace Internal template struct Trigonometric { template static Vector sin(const Vector &_x); template static Vector cos(const Vector &_x); template static void sincos(const Vector &_x, Vector *_sin, Vector *_cos); template static Vector asin (const Vector &_x); template static Vector atan (const Vector &_x); template static Vector atan2(const Vector &y, const Vector &x); }; namespace VC__USE_NAMESPACE #undef VC__USE_NAMESPACE { template static Vc_ALWAYS_INLINE Vc_PURE Vector sin(const Vector &_x) { return Vc::Trigonometric::sin(_x); } template static Vc_ALWAYS_INLINE Vc_PURE Vector cos(const Vector &_x) { return Vc::Trigonometric::cos(_x); } template static Vc_ALWAYS_INLINE void sincos(const Vector &_x, Vector *_sin, Vector *_cos) { Vc::Trigonometric::sincos(_x, _sin, _cos); } template static Vc_ALWAYS_INLINE Vc_PURE Vector asin (const Vector &_x) { return Vc::Trigonometric::asin(_x); } template static Vc_ALWAYS_INLINE Vc_PURE Vector atan (const Vector &_x) { return Vc::Trigonometric::atan(_x); } template static Vc_ALWAYS_INLINE Vc_PURE Vector atan2(const Vector &y, const Vector &x) { return Vc::Trigonometric::atan2(y, x); } } // namespace VC__USE_NAMESPACE } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "undomacros.h" #endif // VC_COMMON_TRIGONOMETRIC_H Vc-0.7.4/common/types.h000066400000000000000000000305121233512346000146770ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright (C) 2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
}}}*/ #ifndef VC_COMMON_TYPES_H #define VC_COMMON_TYPES_H #ifdef VC_CHECK_ALIGNMENT #include #include #endif /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { // helper type to implement sfloat_v (Vector) struct sfloat {}; template struct DetermineEntryType { typedef T Type; }; template<> struct DetermineEntryType { typedef float Type; }; template struct NegateTypeHelper { typedef T Type; }; template<> struct NegateTypeHelper { typedef char Type; }; template<> struct NegateTypeHelper { typedef short Type; }; template<> struct NegateTypeHelper { typedef int Type; }; namespace VectorSpecialInitializerZero { enum ZEnum { Zero = 0 }; } namespace VectorSpecialInitializerOne { enum OEnum { One = 1 }; } namespace VectorSpecialInitializerIndexesFromZero { enum IEnum { IndexesFromZero }; } template class Memory; #ifdef VC_MSVC # if defined(VC_IMPL_Scalar) namespace Scalar { template class Vector; template class Mask; } #define _Vector Vc::Scalar::Vector # elif defined(VC_IMPL_SSE) namespace SSE { template class Vector; template class Mask; class Float8Mask; } #define _Vector Vc::SSE::Vector # elif defined(VC_IMPL_AVX) namespace AVX { template class Vector; template class Mask; } #define _Vector Vc::AVX::Vector # else # error "Sorry, MSVC is a nasty compiler and needs extra care. Please help." # endif #endif namespace { template struct EnableIf { typedef T Value; }; template struct EnableIf {}; template struct IsSignedInteger { enum { Value = 0 }; }; template<> struct IsSignedInteger { enum { Value = 1 }; }; template<> struct IsSignedInteger { enum { Value = 1 }; }; template<> struct IsSignedInteger { enum { Value = 1 }; }; template<> struct IsSignedInteger { enum { Value = 1 }; }; template<> struct IsSignedInteger { enum { Value = 1 }; }; template struct IsUnsignedInteger { enum { Value = 0 }; }; template<> struct IsUnsignedInteger { enum { Value = 1 }; }; template<> struct IsUnsignedInteger { enum { Value = 1 }; }; template<> struct IsUnsignedInteger { enum { Value = 1 }; }; template<> struct IsUnsignedInteger { enum { Value = 1 }; }; template<> struct IsUnsignedInteger { enum { Value = 1 }; }; template struct IsInteger { enum { Value = IsSignedInteger::Value | IsUnsignedInteger::Value }; }; template struct IsReal { enum { Value = 0 }; }; template<> struct IsReal { enum { Value = 1 }; }; template<> struct IsReal { enum { Value = 1 }; }; template struct IsEqualType { enum { Value = 0 }; }; template struct IsEqualType { enum { Value = 1 }; }; template struct IsInTypelist { enum { Value = false }; }; template struct IsInTypelist { enum { Value = true }; }; template struct IsInTypelist { enum { Value = true }; }; template struct IsInTypelist { enum { Value = true }; }; template struct IsInTypelist { enum { Value = true }; }; template struct IsInTypelist { enum { Value = true }; }; template struct IsInTypelist { enum { Value = true }; }; template struct IsInTypelist { enum { Value = true }; }; template struct IsCombinationOf { enum { Value = false }; }; template struct IsCombinationOf { enum { Value = true }; }; template struct IsCombinationOf { enum { Value = true }; }; namespace { struct yes { char x; }; struct no { yes x, y; }; } // anonymous namespace template struct HasImplicitCast { #ifdef VC_MSVC // MSVC can't compile this code if we pass a type that has large alignment restrictions by // value // clang OTOH warns about this code if we pass a null-reference, thus we ifdef the const-ref // for MSVC only static yes test(const To &) { return yes(); } #else static yes test( To) { return yes(); } #endif 
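// Unevaluated overload resolution does the work here: if From converts implicitly to To,
// the test(To) overload above is viable and yields 'yes'; otherwise only the catch-all
// test(...) below matches and yields 'no'. Comparing sizeof of the two return types turns
// that choice into the compile-time constant Value, e.g. (sketch)
// HasImplicitCast<float, int>::Value is true, HasImplicitCast<void *, int>::Value is false.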
static no test(...) { return no(); } enum { #ifdef VC_MSVC // I want to test whether implicit cast works. If it works MSVC thinks it should give a warning. Wrong. Shut up. #pragma warning(suppress : 4257 4267) #endif Value = !!(sizeof(test(*static_cast(0))) == sizeof(yes)) }; }; #if defined(VC_GCC) && VC_GCC < 0x40300 // GCC 4.1 is very noisy because of the float->int and double->int type trait tests. We get // around this noise with a little specialization. template<> struct HasImplicitCast { enum { Value = true }; }; template<> struct HasImplicitCast { enum { Value = true }; }; #endif #ifdef VC_MSVC // MSVC is such a broken compiler :'( // HasImplicitCast breaks if From has an __declspec(align(#)) modifier and has no implicit cast // to To. That's because it'll call test(...) as test(From) and not test(const From &). // This results in C2718. And MSVC is too stupid to see that it should just shut up and // everybody would be happy. // // Because the HasImplicitCast specializations can only be implemented after the Vector class // was declared we have to write some nasty hacks. template struct HasImplicitCast<_Vector, T2> { enum { Value = false }; }; #if defined(VC_IMPL_Scalar) template struct HasImplicitCast, T2> { enum { Value = false }; }; template struct HasImplicitCast, Vc::Scalar::Mask > { enum { Value = true }; }; #elif defined(VC_IMPL_SSE) template struct HasImplicitCast, T2> { enum { Value = false }; }; template struct HasImplicitCast, Vc::SSE::Mask > { enum { Value = true }; }; template struct HasImplicitCast { enum { Value = false }; }; template<> struct HasImplicitCast { enum { Value = true }; }; #elif defined(VC_IMPL_AVX) template struct HasImplicitCast, T2> { enum { Value = false }; }; template struct HasImplicitCast, Vc::AVX::Mask > { enum { Value = true }; }; #endif template struct HasImplicitCast<_Vector, _Vector > { enum { Value = true }; }; //template<> struct HasImplicitCast<_Vector< int>, _Vector< unsigned int>> { enum { Value = true }; }; //template<> struct HasImplicitCast<_Vector< unsigned int>, _Vector< int>> { enum { Value = true }; }; //template<> struct HasImplicitCast<_Vector< short>, _Vector> { enum { Value = true }; }; //template<> struct HasImplicitCast<_Vector, _Vector< short>> { enum { Value = true }; }; template struct HasImplicitCast, T2> { enum { Value = false }; }; template struct HasImplicitCast, Vc::Memory > { enum { Value = true }; }; #undef _Vector #endif template struct CanConvertToInt : public HasImplicitCast {}; template<> struct CanConvertToInt { enum { Value = 0 }; }; //template<> struct CanConvertToInt { enum { Value = 0 }; }; //template<> struct CanConvertToInt { enum { Value = 0 }; }; enum TestEnum {}; VC_STATIC_ASSERT(CanConvertToInt::Value == 1, CanConvertToInt_is_broken); VC_STATIC_ASSERT(CanConvertToInt::Value == 1, CanConvertToInt_is_broken); VC_STATIC_ASSERT(CanConvertToInt::Value == 0, CanConvertToInt_is_broken); VC_STATIC_ASSERT(CanConvertToInt::Value == 1, CanConvertToInt_is_broken); VC_STATIC_ASSERT(CanConvertToInt::Value == 1, CanConvertToInt_is_broken); VC_STATIC_ASSERT(CanConvertToInt::Value == 0, CanConvertToInt_is_broken); VC_STATIC_ASSERT(CanConvertToInt::Value == 1, CanConvertToInt_is_broken); typedef HasImplicitCast HasImplicitCastTest0; typedef HasImplicitCast HasImplicitCastTest1; typedef HasImplicitCast HasImplicitCastTest2; typedef HasImplicitCast HasImplicitCastTest3; typedef HasImplicitCast HasImplicitCastTest4; VC_STATIC_ASSERT(HasImplicitCastTest0::Value == true, HasImplicitCast0_is_broken); 
VC_STATIC_ASSERT(HasImplicitCastTest1::Value == true, HasImplicitCast1_is_broken); VC_STATIC_ASSERT(HasImplicitCastTest2::Value == true, HasImplicitCast2_is_broken); VC_STATIC_ASSERT(HasImplicitCastTest3::Value == true, HasImplicitCast3_is_broken); VC_STATIC_ASSERT(HasImplicitCastTest4::Value == false, HasImplicitCast4_is_broken); template struct IsLikeInteger { enum { Value = !IsReal::Value && CanConvertToInt::Value }; }; template struct IsLikeSignedInteger { enum { Value = IsLikeInteger::Value && !IsUnsignedInteger::Value }; }; } // anonymous namespace #ifndef VC_CHECK_ALIGNMENT template static Vc_ALWAYS_INLINE void assertCorrectAlignment(const _T *){} #else template static Vc_ALWAYS_INLINE void assertCorrectAlignment(const _T *ptr) { const size_t s = Vc_ALIGNOF(_T); if((reinterpret_cast(ptr) & ((s ^ (s & (s - 1))) - 1)) != 0) { fprintf(stderr, "A vector with incorrect alignment has just been created. Look at the stacktrace to find the guilty object.\n"); abort(); } } #endif } // namespace Vc /*OUTER_NAMESPACE_END*/ #endif // VC_COMMON_TYPES_H Vc-0.7.4/common/undomacros.h000066400000000000000000000046471233512346000157170ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2010-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
*/ #ifndef VC_COMMON_UNDOMACROS_H #define VC_COMMON_UNDOMACROS_H #undef VC_COMMON_MACROS_H #undef Vc_ALIGNOF #undef Vc_INTRINSIC #undef Vc_INTRINSIC_L #undef Vc_INTRINSIC_R #undef Vc_CONST #undef Vc_CONST_L #undef Vc_CONST_R #undef Vc_PURE #undef Vc_PURE_L #undef Vc_PURE_R #undef Vc_MAY_ALIAS #undef Vc_ALWAYS_INLINE #undef Vc_ALWAYS_INLINE_L #undef Vc_ALWAYS_INLINE_R #undef VC_IS_UNLIKELY #undef VC_IS_LIKELY #undef VC_RESTRICT #undef VC_DEPRECATED #undef _VC_CONSTEXPR #undef _VC_CONSTEXPR_L #undef _VC_CONSTEXPR_R #undef _VC_NOEXCEPT #undef ALIGN #undef STRUCT_ALIGN1 #undef STRUCT_ALIGN2 #undef ALIGNED_TYPEDEF #undef _CAT_IMPL #undef CAT #undef unrolled_loop16 #undef for_all_vector_entries #undef FREE_STORE_OPERATORS_ALIGNED #undef VC_WARN_INLINE #undef VC_WARN #ifdef VC_EXTERNAL_ASSERT #undef VC_EXTERNAL_ASSERT #else #undef VC_ASSERT #endif #undef VC_HAS_BUILTIN #undef Vc_buildDouble #undef Vc_buildFloat #undef _VC_APPLY_IMPL_1 #undef _VC_APPLY_IMPL_2 #undef _VC_APPLY_IMPL_3 #undef _VC_APPLY_IMPL_4 #undef _VC_APPLY_IMPL_5 #undef VC_LIST_FLOAT_VECTOR_TYPES #undef VC_LIST_INT_VECTOR_TYPES #undef VC_LIST_VECTOR_TYPES #undef VC_LIST_COMPARES #undef VC_LIST_LOGICAL #undef VC_LIST_BINARY #undef VC_LIST_SHIFTS #undef VC_LIST_ARITHMETICS #undef VC_APPLY_0 #undef VC_APPLY_1 #undef VC_APPLY_2 #undef VC_APPLY_3 #undef VC_APPLY_4 #undef VC_ALL_COMPARES #undef VC_ALL_LOGICAL #undef VC_ALL_BINARY #undef VC_ALL_SHIFTS #undef VC_ALL_ARITHMETICS #undef VC_ALL_FLOAT_VECTOR_TYPES #undef VC_ALL_VECTOR_TYPES #undef VC_EXACT_TYPE #undef VC_ALIGNED_PARAMETER #undef VC_OFFSETOF #ifdef Vc_POP_GCC_DIAGNOSTIC__ #pragma GCC diagnostic pop #undef Vc_POP_GCC_DIAGNOSTIC__ #endif #endif // VC_COMMON_UNDOMACROS_H Vc-0.7.4/common/vectortuple.h000066400000000000000000000113361233512346000161120ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright (C) 2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
}}}*/ #ifndef VC_COMMON_VECTORTUPLE_H #define VC_COMMON_VECTORTUPLE_H #include "macros.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace Common { template struct InterleavedMemoryReadAccess; template struct VectorTuple; template struct VectorTuple<2, V> { typedef typename V::EntryType T; typedef V &VC_RESTRICT Reference; Reference l, r; Vc_ALWAYS_INLINE VectorTuple(Reference a, Reference b) : l(a), r(b) { } Vc_ALWAYS_INLINE VectorTuple<3, V> operator,(V &a) const { return VectorTuple<3, V>(*this, a); } Vc_ALWAYS_INLINE VectorTuple<3, const V> operator,(const V &a) const { return VectorTuple<3, const V>(*this, a); } template Vc_ALWAYS_INLINE void operator=(const InterleavedMemoryReadAccess &access) const { VC_STATIC_ASSERT(2 <= StructSize, You_are_trying_to_extract_more_data_from_the_struct_than_it_has); access.deinterleave(l, r); } }; template struct VectorTuple<2, const V> { typedef typename V::EntryType T; typedef const V &VC_RESTRICT Reference; Reference l, r; Vc_ALWAYS_INLINE VectorTuple(Reference a, Reference b) : l(a), r(b) { } Vc_ALWAYS_INLINE VectorTuple<3, const V> operator,(const V &a) const { return VectorTuple<3, const V>(*this, a); } }; #define _VC_VECTORTUPLE_SPECIALIZATION(LENGTH, parameters) \ template struct VectorTuple \ { \ typedef typename V::EntryType T; \ typedef V &VC_RESTRICT Reference; \ const VectorTuple &l; \ Reference r; \ \ Vc_ALWAYS_INLINE VectorTuple(const VectorTuple &tuple, Reference a) \ : l(tuple), r(a) \ { \ } \ \ Vc_ALWAYS_INLINE VectorTuple operator,(V &a) const \ { \ return VectorTuple(*this, a); \ } \ \ template \ Vc_ALWAYS_INLINE void operator=(const InterleavedMemoryReadAccess &access) const \ { \ VC_STATIC_ASSERT(LENGTH <= StructSize, You_are_trying_to_extract_more_data_from_the_struct_than_it_has); \ access.deinterleave parameters; \ } \ }; \ template struct VectorTuple \ { \ typedef typename V::EntryType T; \ typedef const V &VC_RESTRICT Reference; \ const VectorTuple &l; \ Reference r; \ \ Vc_ALWAYS_INLINE VectorTuple(const VectorTuple &tuple, Reference a) \ : l(tuple), r(a) \ { \ } \ \ Vc_ALWAYS_INLINE VectorTuple operator,(const V &a) const \ { \ return VectorTuple(*this, a); \ } \ } _VC_VECTORTUPLE_SPECIALIZATION(3, (l.l, l.r, r)); _VC_VECTORTUPLE_SPECIALIZATION(4, (l.l.l, l.l.r, l.r, r)); _VC_VECTORTUPLE_SPECIALIZATION(5, (l.l.l.l, l.l.l.r, l.l.r, l.r, r)); _VC_VECTORTUPLE_SPECIALIZATION(6, (l.l.l.l.l, l.l.l.l.r, l.l.l.r, l.l.r, l.r, r)); _VC_VECTORTUPLE_SPECIALIZATION(7, (l.l.l.l.l.l, l.l.l.l.l.r, l.l.l.l.r, l.l.l.r, l.l.r, l.r, r)); _VC_VECTORTUPLE_SPECIALIZATION(8, (l.l.l.l.l.l.l, l.l.l.l.l.l.r, l.l.l.l.l.r, l.l.l.l.r, l.l.l.r, l.l.r, l.r, r)); // VC_STATIC_ASSERT(false, You_are_gathering_too_many_vectors__This_is_not_implemented); } // namespace Common #ifdef VC_IMPL_Scalar namespace Scalar #elif defined VC_IMPL_SSE namespace SSE #elif defined VC_IMPL_AVX namespace AVX #endif { template Vc_ALWAYS_INLINE Common::VectorTuple<2, Vc::Vector > operator,(Vc::Vector &a, Vc::Vector &b) { return Common::VectorTuple<2, Vc::Vector >(a, b); } template Vc_ALWAYS_INLINE Common::VectorTuple<2, const Vc::Vector > operator,(const Vc::Vector &a, const Vc::Vector &b) { return Common::VectorTuple<2, const Vc::Vector >(a, b); } } // namespace Scalar/SSE/AVX } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "undomacros.h" #endif // VC_COMMON_VECTORTUPLE_H Vc-0.7.4/common/windows_fix_intrin.h000066400000000000000000000346661233512346000174740ustar00rootroot00000000000000/* This file is part of the Vc library. 
Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef VC_COMMON_WINDOWS_FIX_INTRIN_H #define VC_COMMON_WINDOWS_FIX_INTRIN_H #if defined(VC_MSVC) && !defined(__midl) // MSVC sucks. If you include intrin.h you get all SSE and AVX intrinsics // declared. This is a problem because we need to implement the intrinsics // that are not supported in hardware ourselves. // Something always includes intrin.h even if you don't // do it explicitly. Therefore we try to be the first to include it // but with __midl defined, in which case it is basically empty. #ifdef __INTRIN_H_ #error "intrin.h was already included, polluting the namespace. Please fix your code to include the Vc headers before anything that includes intrin.h. (Vc will declare the relevant intrinsics as they are required by some system headers.)" #endif #define __midl #include #undef __midl #include #include #include extern "C" { #ifdef _WIN64 _CRTIMP double ceil(_In_ double); __int64 _InterlockedDecrement64(__int64 volatile *); __int64 _InterlockedExchange64(__int64 volatile *, __int64); void * _InterlockedExchangePointer(void * volatile *, void *); __int64 _InterlockedExchangeAdd64(__int64 volatile *, __int64); void *_InterlockedCompareExchangePointer (void * volatile *, void *, void *); __int64 _InterlockedIncrement64(__int64 volatile *); int __cdecl _setjmpex(jmp_buf); void __faststorefence(void); __int64 __mulh(__int64,__int64); unsigned __int64 __umulh(unsigned __int64,unsigned __int64); unsigned __int64 __readcr0(void); unsigned __int64 __readcr2(void); unsigned __int64 __readcr3(void); unsigned __int64 __readcr4(void); unsigned __int64 __readcr8(void); void __writecr0(unsigned __int64); void __writecr3(unsigned __int64); void __writecr4(unsigned __int64); void __writecr8(unsigned __int64); unsigned __int64 __readdr(unsigned int); void __writedr(unsigned int, unsigned __int64); unsigned __int64 __readeflags(void); void __writeeflags(unsigned __int64); void __movsq(unsigned long long *, unsigned long long const *, size_t); unsigned char __readgsbyte(unsigned long Offset); unsigned short __readgsword(unsigned long Offset); unsigned long __readgsdword(unsigned long Offset); unsigned __int64 __readgsqword(unsigned long Offset); void __writegsbyte(unsigned long Offset, unsigned char Data); void __writegsword(unsigned long Offset, unsigned short Data); void __writegsdword(unsigned long Offset, unsigned long Data); void __writegsqword(unsigned long Offset, unsigned __int64 Data); void __addgsbyte(unsigned long Offset, unsigned char Data); void __addgsword(unsigned long Offset, unsigned short Data); void __addgsdword(unsigned long Offset, unsigned long Data); void __addgsqword(unsigned long Offset, unsigned __int64 Data); void __incgsbyte(unsigned long Offset); void __incgsword(unsigned long Offset); void __incgsdword(unsigned long Offset); void __incgsqword(unsigned long Offset); unsigned char __vmx_vmclear(unsigned __int64*); unsigned char __vmx_vmlaunch(void); unsigned char 
__vmx_vmptrld(unsigned __int64*); unsigned char __vmx_vmread(size_t, size_t*); unsigned char __vmx_vmresume(void); unsigned char __vmx_vmwrite(size_t, size_t); unsigned char __vmx_on(unsigned __int64*); void __stosq(unsigned __int64 *, unsigned __int64, size_t); unsigned char _interlockedbittestandset64(__int64 volatile *a, __int64 b); unsigned char _interlockedbittestandreset64(__int64 volatile *a, __int64 b); short _InterlockedCompareExchange16_np(short volatile *Destination, short Exchange, short Comparand); long _InterlockedCompareExchange_np (long volatile *, long, long); __int64 _InterlockedCompareExchange64_np(__int64 volatile *, __int64, __int64); void *_InterlockedCompareExchangePointer_np (void * volatile *, void *, void *); unsigned char _InterlockedCompareExchange128(__int64 volatile *, __int64, __int64, __int64 *); unsigned char _InterlockedCompareExchange128_np(__int64 volatile *, __int64, __int64, __int64 *); long _InterlockedAnd_np(long volatile *, long); char _InterlockedAnd8_np(char volatile *, char); short _InterlockedAnd16_np(short volatile *, short); __int64 _InterlockedAnd64_np(__int64 volatile *, __int64); long _InterlockedOr_np(long volatile *, long); char _InterlockedOr8_np(char volatile *, char); short _InterlockedOr16_np(short volatile *, short); __int64 _InterlockedOr64_np(__int64 volatile *, __int64); long _InterlockedXor_np(long volatile *, long); char _InterlockedXor8_np(char volatile *, char); short _InterlockedXor16_np(short volatile *, short); __int64 _InterlockedXor64_np(__int64 volatile *, __int64); unsigned __int64 __lzcnt64(unsigned __int64); unsigned __int64 __popcnt64(unsigned __int64); __int64 _InterlockedOr64(__int64 volatile *, __int64); __int64 _InterlockedXor64(__int64 volatile *, __int64); __int64 _InterlockedAnd64(__int64 volatile *, __int64); unsigned char _bittest64(__int64 const *a, __int64 b); unsigned char _bittestandset64(__int64 *a, __int64 b); unsigned char _bittestandreset64(__int64 *a, __int64 b); unsigned char _bittestandcomplement64(__int64 *a, __int64 b); unsigned char _BitScanForward64(unsigned long* Index, unsigned __int64 Mask); unsigned char _BitScanReverse64(unsigned long* Index, unsigned __int64 Mask); unsigned __int64 __shiftleft128(unsigned __int64 LowPart, unsigned __int64 HighPart, unsigned char Shift); unsigned __int64 __shiftright128(unsigned __int64 LowPart, unsigned __int64 HighPart, unsigned char Shift); unsigned __int64 _umul128(unsigned __int64 multiplier, unsigned __int64 multiplicand, unsigned __int64 *highproduct); __int64 _mul128(__int64 multiplier, __int64 multiplicand, __int64 *highproduct); #endif long _InterlockedOr(long volatile *, long); char _InterlockedOr8(char volatile *, char); short _InterlockedOr16(short volatile *, short); long _InterlockedXor(long volatile *, long); char _InterlockedXor8(char volatile *, char); short _InterlockedXor16(short volatile *, short); long _InterlockedAnd(long volatile *, long); char _InterlockedAnd8(char volatile *, char); short _InterlockedAnd16(short volatile *, short); unsigned char _bittest(long const *a, long b); unsigned char _bittestandset(long *a, long b); unsigned char _bittestandreset(long *a, long b); unsigned char _bittestandcomplement(long *a, long b); unsigned char _BitScanForward(unsigned long* Index, unsigned long Mask); unsigned char _BitScanReverse(unsigned long* Index, unsigned long Mask); _CRTIMP wchar_t * __cdecl wcscat( _Pre_cap_for_(_Source) _Prepost_z_ wchar_t *, _In_z_ const wchar_t * _Source); _Check_return_ _CRTIMP int __cdecl wcscmp(_In_z_ 
const wchar_t *,_In_z_ const wchar_t *); _CRTIMP wchar_t * __cdecl wcscpy(_Pre_cap_for_(_Source) _Post_z_ wchar_t *, _In_z_ const wchar_t * _Source); _Check_return_ _CRTIMP size_t __cdecl wcslen(_In_z_ const wchar_t *); #pragma warning(suppress: 4985) _CRTIMP wchar_t * __cdecl _wcsset(_Inout_z_ wchar_t *, wchar_t); void _ReadBarrier(void); unsigned char _rotr8(unsigned char value, unsigned char shift); unsigned short _rotr16(unsigned short value, unsigned char shift); unsigned char _rotl8(unsigned char value, unsigned char shift); unsigned short _rotl16(unsigned short value, unsigned char shift); short _InterlockedIncrement16(short volatile *Addend); short _InterlockedDecrement16(short volatile *Addend); short _InterlockedCompareExchange16(short volatile *Destination, short Exchange, short Comparand); void __nvreg_save_fence(void); void __nvreg_restore_fence(void); #ifdef _M_IX86 unsigned long __readcr0(void); unsigned long __readcr2(void); unsigned long __readcr3(void); unsigned long __readcr4(void); unsigned long __readcr8(void); void __writecr0(unsigned); void __writecr3(unsigned); void __writecr4(unsigned); void __writecr8(unsigned); unsigned __readdr(unsigned int); void __writedr(unsigned int, unsigned); unsigned __readeflags(void); void __writeeflags(unsigned); void __addfsbyte(unsigned long Offset, unsigned char Data); void __addfsword(unsigned long Offset, unsigned short Data); void __addfsdword(unsigned long Offset, unsigned long Data); void __incfsbyte(unsigned long Offset); void __incfsword(unsigned long Offset); void __incfsdword(unsigned long Offset); unsigned char __readfsbyte(unsigned long Offset); unsigned short __readfsword(unsigned long Offset); unsigned long __readfsdword(unsigned long Offset); unsigned __int64 __readfsqword(unsigned long Offset); void __writefsbyte(unsigned long Offset, unsigned char Data); void __writefsword(unsigned long Offset, unsigned short Data); void __writefsdword(unsigned long Offset, unsigned long Data); void __writefsqword(unsigned long Offset, unsigned __int64 Data); long _InterlockedAddLargeStatistic(__int64 volatile *, long); #endif _Ret_bytecap_(_Size) void * __cdecl _alloca(size_t _Size); int __cdecl abs(_In_ int); _Check_return_ unsigned short __cdecl _byteswap_ushort(_In_ unsigned short value); _Check_return_ unsigned long __cdecl _byteswap_ulong(_In_ unsigned long value); _Check_return_ unsigned __int64 __cdecl _byteswap_uint64(_In_ unsigned __int64 value); void __cdecl __debugbreak(void); void __cdecl _disable(void); __int64 __emul(int,int); unsigned __int64 __emulu(unsigned int,unsigned int); void __cdecl _enable(void); long __cdecl _InterlockedDecrement(long volatile *); long _InterlockedExchange(long volatile *, long); short _InterlockedExchange16(short volatile *, short); char _InterlockedExchange8(char volatile *, char); long _InterlockedExchangeAdd(long volatile *, long); short _InterlockedExchangeAdd16(short volatile *, short); char _InterlockedExchangeAdd8(char volatile *, char); long _InterlockedCompareExchange (long volatile *, long, long); __int64 _InterlockedCompareExchange64(__int64 volatile *, __int64, __int64); long __cdecl _InterlockedIncrement(long volatile *); int __cdecl _inp(unsigned short); int __cdecl inp(unsigned short); unsigned long __cdecl _inpd(unsigned short); unsigned long __cdecl inpd(unsigned short); unsigned short __cdecl _inpw(unsigned short); unsigned short __cdecl inpw(unsigned short); long __cdecl labs(_In_ long); _Check_return_ unsigned long __cdecl _lrotl(_In_ unsigned long,_In_ int); 
_Check_return_ unsigned long __cdecl _lrotr(_In_ unsigned long,_In_ int); unsigned __int64 __ll_lshift(unsigned __int64,int); __int64 __ll_rshift(__int64,int); _Check_return_ int __cdecl memcmp(_In_opt_bytecount_(_Size) const void *,_In_opt_bytecount_(_Size) const void *,_In_ size_t _Size); void * __cdecl memcpy(_Out_opt_bytecapcount_(_Size) void *,_In_opt_bytecount_(_Size) const void *,_In_ size_t _Size); void * __cdecl memset(_Out_opt_bytecapcount_(_Size) void *,_In_ int,_In_ size_t _Size); int __cdecl _outp(unsigned short,int); int __cdecl outp(unsigned short,int); unsigned long __cdecl _outpd(unsigned short,unsigned long); unsigned long __cdecl outpd(unsigned short,unsigned long); unsigned short __cdecl _outpw(unsigned short,unsigned short); unsigned short __cdecl outpw(unsigned short,unsigned short); void * _ReturnAddress(void); _Check_return_ unsigned int __cdecl _rotl(_In_ unsigned int,_In_ int); _Check_return_ unsigned int __cdecl _rotr(_In_ unsigned int,_In_ int); int __cdecl _setjmp(jmp_buf); _Check_return_ int __cdecl strcmp(_In_z_ const char *,_In_z_ const char *); _Check_return_ size_t __cdecl strlen(_In_z_ const char *); char * __cdecl strset(_Inout_z_ char *,_In_ int); unsigned __int64 __ull_rshift(unsigned __int64,int); void * _AddressOfReturnAddress(void); void _WriteBarrier(void); void _ReadWriteBarrier(void); void __wbinvd(void); void __invlpg(void*); unsigned __int64 __readmsr(unsigned long); void __writemsr(unsigned long, unsigned __int64); unsigned __int64 __rdtsc(void); void __movsb(unsigned char *, unsigned char const *, size_t); void __movsw(unsigned short *, unsigned short const *, size_t); void __movsd(unsigned long *, unsigned long const *, size_t); unsigned char __inbyte(unsigned short Port); unsigned short __inword(unsigned short Port); unsigned long __indword(unsigned short Port); void __outbyte(unsigned short Port, unsigned char Data); void __outword(unsigned short Port, unsigned short Data); void __outdword(unsigned short Port, unsigned long Data); void __inbytestring(unsigned short Port, unsigned char *Buffer, unsigned long Count); void __inwordstring(unsigned short Port, unsigned short *Buffer, unsigned long Count); void __indwordstring(unsigned short Port, unsigned long *Buffer, unsigned long Count); void __outbytestring(unsigned short Port, unsigned char *Buffer, unsigned long Count); void __outwordstring(unsigned short Port, unsigned short *Buffer, unsigned long Count); void __outdwordstring(unsigned short Port, unsigned long *Buffer, unsigned long Count); unsigned int __getcallerseflags(); void __vmx_vmptrst(unsigned __int64 *); void __vmx_off(void); void __svm_clgi(void); void __svm_invlpga(void*, int); void __svm_skinit(int); void __svm_stgi(void); void __svm_vmload(size_t); void __svm_vmrun(size_t); void __svm_vmsave(size_t); void __halt(void); void __sidt(void*); void __lidt(void*); void __ud2(void); void __nop(void); void __stosb(unsigned char *, unsigned char, size_t); void __stosw(unsigned short *, unsigned short, size_t); void __stosd(unsigned long *, unsigned long, size_t); unsigned char _interlockedbittestandset(long volatile *a, long b); unsigned char _interlockedbittestandreset(long volatile *a, long b); void __cpuid(int a[4], int b); void __cpuidex(int a[4], int b, int c); unsigned __int64 __readpmc(unsigned long a); unsigned long __segmentlimit(unsigned long a); _Check_return_ unsigned __int64 __cdecl _rotl64(_In_ unsigned __int64,_In_ int); _Check_return_ unsigned __int64 __cdecl _rotr64(_In_ unsigned __int64,_In_ int); __int64 __cdecl 
_abs64(__int64); void __int2c(void); char _InterlockedCompareExchange8(char volatile *Destination, char Exchange, char Comparand); unsigned short __lzcnt16(unsigned short); unsigned int __lzcnt(unsigned int); unsigned short __popcnt16(unsigned short); unsigned int __popcnt(unsigned int); unsigned __int64 __rdtscp(unsigned int*); } #endif #endif // VC_COMMON_WINDOWS_FIX_INTRIN_H Vc-0.7.4/configure-icc.sh000077500000000000000000000001121233512346000151370ustar00rootroot00000000000000#!/bin/sh CXX=icpc CC=icc cmake -DCMAKE_BUILD_TYPE=Release "`dirname $0`" Vc-0.7.4/doc/000077500000000000000000000000001233512346000126365ustar00rootroot00000000000000Vc-0.7.4/doc/Doxyfile000066400000000000000000002406251233512346000143550ustar00rootroot00000000000000# Doxyfile 1.8.4 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project. # # All text after a double hash (##) is considered a comment and is placed # in front of the TAG it is preceding . # All text after a hash (#) is considered a comment and will be ignored. # The format is: # TAG = value [value, ...] # For lists items can also be appended using: # TAG += value [value, ...] # Values that contain spaces should be placed between quotes (" "). #--------------------------------------------------------------------------- # Project related configuration options #--------------------------------------------------------------------------- # This tag specifies the encoding used for all characters in the config file # that follow. The default is UTF-8 which is also the encoding used for all # text before the first occurrence of this tag. Doxygen uses libiconv (or the # iconv built into libc) for the transcoding. See # http://www.gnu.org/software/libiconv for the list of possible encodings. DOXYFILE_ENCODING = UTF-8 # The PROJECT_NAME tag is a single word (or sequence of words) that should # identify the project. Note that if you do not use Doxywizard you need # to put quotes around the project name if it contains spaces. PROJECT_NAME = "Vc" # The PROJECT_NUMBER tag can be used to enter a project or revision number. # This could be handy for archiving the generated documentation or # if some version control system is used. PROJECT_NUMBER = 0.7.4 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer # a quick idea about the purpose of the project. Keep the description short. PROJECT_BRIEF = "SIMD Vector Classes for C++" # With the PROJECT_LOGO tag one can specify an logo or icon that is # included in the documentation. The maximum height of the logo should not # exceed 55 pixels and the maximum width should not exceed 200 pixels. # Doxygen will copy the logo to the output directory. PROJECT_LOGO = logo_small.png # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put. # If a relative path is entered, it will be relative to the location # where doxygen was started. If left blank the current directory will be used. OUTPUT_DIRECTORY = ./ # If the CREATE_SUBDIRS tag is set to YES, then doxygen will create # 4096 sub-directories (in 2 levels) under the output directory of each output # format and will distribute the generated files over these directories. 
# Enabling this option can be useful when feeding doxygen a huge amount of # source files, where putting all generated files in the same directory would # otherwise cause performance problems for the file system. CREATE_SUBDIRS = NO # The OUTPUT_LANGUAGE tag is used to specify the language in which all # documentation generated by doxygen is written. Doxygen will use this # information to generate all constant output in the proper language. # The default language is English, other supported languages are: # Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, # Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German, # Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English # messages), Korean, Korean-en, Latvian, Lithuanian, Norwegian, Macedonian, # Persian, Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, # Slovak, Slovene, Spanish, Swedish, Ukrainian, and Vietnamese. OUTPUT_LANGUAGE = English # If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will # include brief member descriptions after the members that are listed in # the file and class documentation (similar to JavaDoc). # Set to NO to disable this. BRIEF_MEMBER_DESC = YES # If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend # the brief description of a member or function before the detailed description. # Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the # brief descriptions will be completely suppressed. REPEAT_BRIEF = YES # This tag implements a quasi-intelligent brief description abbreviator # that is used to form the text in various listings. Each string # in this list, if found as the leading text of the brief description, will be # stripped from the text and the result after processing the whole list, is # used as the annotated text. Otherwise, the brief description is used as-is. # If left blank, the following values are used ("$name" is automatically # replaced with the name of the entity): "The $name class" "The $name widget" # "The $name file" "is" "provides" "specifies" "contains" # "represents" "a" "an" "the" ABBREVIATE_BRIEF = "The $name class" \ "The $name widget" \ "The $name file" \ is \ provides \ specifies \ contains \ represents \ a \ an \ the # If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then # Doxygen will generate a detailed section even if there is only a brief # description. ALWAYS_DETAILED_SEC = NO # If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all # inherited members of a class in the documentation of that class as if those # members were ordinary class members. Constructors, destructors and assignment # operators of the base classes will not be shown. INLINE_INHERITED_MEMB = YES # If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full # path before files name in the file list and in the header files. If set # to NO the shortest path that makes the file name unique will be used. FULL_PATH_NAMES = YES # If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag # can be used to strip a user-defined part of the path. Stripping is # only done if one of the specified strings matches the left-hand part of # the path. The tag can be used to show relative paths in the file list. # If left blank the directory from which doxygen is run is used as the # path to strip. Note that you specify absolute paths here, but also # relative paths, which will be relative from the directory where doxygen is # started. 
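# For illustration (hypothetical path): with STRIP_FROM_PATH = /home/user/Vc/ the
# file /home/user/Vc/include/Vc/global.h would be listed as include/Vc/global.h
# in the generated documentation.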
STRIP_FROM_PATH = ./ # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of # the path mentioned in the documentation of a class, which tells # the reader which header file to include in order to use a class. # If left blank only the name of the header file containing the class # definition is used. Otherwise one should specify the include paths that # are normally passed to the compiler using the -I flag. STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter # (but less readable) file names. This can be useful if your file system # doesn't support long names like on DOS, Mac, or CD-ROM. SHORT_NAMES = NO # If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen # will interpret the first line (until the first dot) of a JavaDoc-style # comment as the brief description. If set to NO, the JavaDoc # comments will behave just like regular Qt-style comments # (thus requiring an explicit @brief command for a brief description.) JAVADOC_AUTOBRIEF = YES # If the QT_AUTOBRIEF tag is set to YES then Doxygen will # interpret the first line (until the first dot) of a Qt-style # comment as the brief description. If set to NO, the comments # will behave just like regular Qt-style comments (thus requiring # an explicit \brief command for a brief description.) QT_AUTOBRIEF = NO # The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen # treat a multi-line C++ special comment block (i.e. a block of //! or /// # comments) as a brief description. This used to be the default behaviour. # The new default is to treat a multi-line C++ comment block as a detailed # description. Set this tag to YES if you prefer the old behaviour instead. MULTILINE_CPP_IS_BRIEF = NO # If the INHERIT_DOCS tag is set to YES (the default) then an undocumented # member inherits the documentation from any documented member that it # re-implements. INHERIT_DOCS = YES # If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce # a new page for each member. If set to NO, the documentation of a member will # be part of the file/class/namespace that contains it. SEPARATE_MEMBER_PAGES = NO # The TAB_SIZE tag can be used to set the number of spaces in a tab. # Doxygen uses this value to replace tabs by spaces in code fragments. TAB_SIZE = 8 # This tag can be used to specify a number of aliases that acts # as commands in the documentation. An alias has the form "name=value". # For example adding "sideeffect=\par Side Effects:\n" will allow you to # put the command \sideeffect (or @sideeffect) in the documentation, which # will result in a user-defined paragraph with heading "Side Effects:". # You can put \n's in the value part of an alias to insert newlines. ALIASES = # This tag can be used to specify a number of word-keyword mappings (TCL only). # A mapping has the form "name=value". For example adding # "class=itcl::class" will allow you to use the command class in the # itcl::class meaning. TCL_SUBST = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C # sources only. Doxygen will then generate output that is more tailored for C. # For instance, some of the names that are used will be different. The list # of all members will be omitted, etc. OPTIMIZE_OUTPUT_FOR_C = NO # Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java # sources only. Doxygen will then generate output that is more tailored for # Java. For instance, namespaces will be presented as packages, qualified # scopes will look different, etc. 
OPTIMIZE_OUTPUT_JAVA = NO # Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran # sources only. Doxygen will then generate output that is more tailored for # Fortran. OPTIMIZE_FOR_FORTRAN = NO # Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL # sources. Doxygen will then generate output that is tailored for # VHDL. OPTIMIZE_OUTPUT_VHDL = NO # Doxygen selects the parser to use depending on the extension of the files it # parses. With this tag you can assign which parser to use for a given # extension. Doxygen has a built-in mapping, but you can override or extend it # using this tag. The format is ext=language, where ext is a file extension, # and language is one of the parsers supported by doxygen: IDL, Java, # Javascript, CSharp, C, C++, D, PHP, Objective-C, Python, Fortran, VHDL, C, # C++. For instance to make doxygen treat .inc files as Fortran files (default # is PHP), and .f files as C (default is Fortran), use: inc=Fortran f=C. Note # that for custom extensions you also need to set FILE_PATTERNS otherwise the # files are not read by doxygen. EXTENSION_MAPPING = # If MARKDOWN_SUPPORT is enabled (the default) then doxygen pre-processes all # comments according to the Markdown format, which allows for more readable # documentation. See http://daringfireball.net/projects/markdown/ for details. # The output of markdown processing is further processed by doxygen, so you # can mix doxygen, HTML, and XML commands with Markdown formatting. # Disable only in case of backward compatibility issues. MARKDOWN_SUPPORT = NO # When enabled doxygen tries to link words that correspond to documented # classes, or namespaces to their corresponding documentation. Such a link can # be prevented in individual cases by putting a % sign in front of the word # or globally by setting AUTOLINK_SUPPORT to NO. AUTOLINK_SUPPORT = YES # If you use STL classes (i.e. std::string, std::vector, etc.) but do not want # to include (a tag file for) the STL sources as input, then you should # set this tag to YES in order to let doxygen match function declarations and # definitions whose arguments contain STL classes (e.g. func(std::string); vs. # func(std::string) {}). This also makes the inheritance and collaboration # diagrams that involve STL classes more complete and accurate. BUILTIN_STL_SUPPORT = NO # If you use Microsoft's C++/CLI language, you should set this option to YES to # enable parsing support. CPP_CLI_SUPPORT = NO # Set the SIP_SUPPORT tag to YES if your project consists of sip sources only. # Doxygen will parse them like normal C++ but will assume all classes use public # instead of private inheritance when no explicit protection keyword is present. SIP_SUPPORT = NO # For Microsoft's IDL there are propget and propput attributes to indicate # getter and setter methods for a property. Setting this option to YES (the # default) will make doxygen replace the get and set methods by a property in # the documentation. This will only work if the methods are indeed getting or # setting a simple type. If this is not the case, or you want to show the # methods anyway, you should set this option to NO. IDL_PROPERTY_SUPPORT = YES # If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC # tag is set to YES, then doxygen will reuse the documentation of the first # member in the group (if any) for the other members of the group. By default # all members of a group must be documented explicitly.
DISTRIBUTE_GROUP_DOC = NO # Set the SUBGROUPING tag to YES (the default) to allow class member groups of # the same type (for instance a group of public functions) to be put as a # subgroup of that type (e.g. under the Public Functions section). Set it to # NO to prevent subgrouping. Alternatively, this can be done per class using # the \nosubgrouping command. SUBGROUPING = YES # When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and # unions are shown inside the group in which they are included (e.g. using # @ingroup) instead of on a separate page (for HTML and Man pages) or # section (for LaTeX and RTF). INLINE_GROUPED_CLASSES = NO # When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and # unions with only public data fields or simple typedef fields will be shown # inline in the documentation of the scope in which they are defined (i.e. file, # namespace, or group documentation), provided this scope is documented. If set # to NO (the default), structs, classes, and unions are shown on a separate # page (for HTML and Man pages) or section (for LaTeX and RTF). INLINE_SIMPLE_STRUCTS = NO # When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum # is documented as struct, union, or enum with the name of the typedef. So # typedef struct TypeS {} TypeT, will appear in the documentation as a struct # with name TypeT. When disabled the typedef will appear as a member of a file, # namespace, or class. And the struct will be named TypeS. This can typically # be useful for C code in case the coding convention dictates that all compound # types are typedef'ed and only the typedef is referenced, never the tag name. TYPEDEF_HIDES_STRUCT = NO # The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This # cache is used to resolve symbols given their name and scope. Since this can # be an expensive process and often the same symbol appear multiple times in # the code, doxygen keeps a cache of pre-resolved symbols. If the cache is too # small doxygen will become slower. If the cache is too large, memory is wasted. # The cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid # range is 0..9, the default is 0, corresponding to a cache size of 2^16 = 65536 # symbols. LOOKUP_CACHE_SIZE = 0 #--------------------------------------------------------------------------- # Build related configuration options #--------------------------------------------------------------------------- # If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in # documentation are documented, even if no documentation was available. # Private class members and static file members will be hidden unless # the EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES EXTRACT_ALL = NO # If the EXTRACT_PRIVATE tag is set to YES all private members of a class # will be included in the documentation. EXTRACT_PRIVATE = NO # If the EXTRACT_PACKAGE tag is set to YES all members with package or internal # scope will be included in the documentation. EXTRACT_PACKAGE = NO # If the EXTRACT_STATIC tag is set to YES all static members of a file # will be included in the documentation. EXTRACT_STATIC = NO # If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) # defined locally in source files will be included in the documentation. # If set to NO only classes defined in header files are included. EXTRACT_LOCAL_CLASSES = NO # This flag is only useful for Objective-C code. 
When set to YES local # methods, which are defined in the implementation section but not in # the interface are included in the documentation. # If set to NO (the default) only methods in the interface are included. EXTRACT_LOCAL_METHODS = NO # If this flag is set to YES, the members of anonymous namespaces will be # extracted and appear in the documentation as a namespace called # 'anonymous_namespace{file}', where file will be replaced with the base # name of the file that contains the anonymous namespace. By default # anonymous namespaces are hidden. EXTRACT_ANON_NSPACES = NO # If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all # undocumented members of documented classes, files or namespaces. # If set to NO (the default) these members will be included in the # various overviews, but no documentation section is generated. # This option has no effect if EXTRACT_ALL is enabled. HIDE_UNDOC_MEMBERS = YES # If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all # undocumented classes that are normally visible in the class hierarchy. # If set to NO (the default) these classes will be included in the various # overviews. This option has no effect if EXTRACT_ALL is enabled. HIDE_UNDOC_CLASSES = YES # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all # friend (class|struct|union) declarations. # If set to NO (the default) these declarations will be included in the # documentation. HIDE_FRIEND_COMPOUNDS = YES # If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any # documentation blocks found inside the body of a function. # If set to NO (the default) these blocks will be appended to the # function's detailed documentation block. HIDE_IN_BODY_DOCS = NO # The INTERNAL_DOCS tag determines if documentation # that is typed after a \internal command is included. If the tag is set # to NO (the default) then the documentation will be excluded. # Set it to YES to include the internal documentation. INTERNAL_DOCS = NO # If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate # file names in lower-case letters. If set to YES upper-case letters are also # allowed. This is useful if you have classes or files whose names only differ # in case and if your file system supports case sensitive file names. Windows # and Mac users are advised to set this option to NO. CASE_SENSE_NAMES = YES # If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen # will show members with their full class and namespace scopes in the # documentation. If set to YES the scope will be hidden. HIDE_SCOPE_NAMES = YES # If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen # will put a list of the files that are included by a file in the documentation # of that file. SHOW_INCLUDE_FILES = YES # If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen # will list include files with double quotes in the documentation # rather than with sharp brackets. FORCE_LOCAL_INCLUDES = NO # If the INLINE_INFO tag is set to YES (the default) then a tag [inline] # is inserted in the documentation for inline members. INLINE_INFO = NO # If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen # will sort the (detailed) documentation of file and class members # alphabetically by member name. If set to NO the members will appear in # declaration order. SORT_MEMBER_DOCS = NO # If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the # brief documentation of file, namespace and class members alphabetically # by member name. 
If set to NO (the default) the members will appear in # declaration order. SORT_BRIEF_DOCS = NO # If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen # will sort the (brief and detailed) documentation of class members so that # constructors and destructors are listed first. If set to NO (the default) # the constructors will appear in the respective orders defined by # SORT_MEMBER_DOCS and SORT_BRIEF_DOCS. # This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO # and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO. SORT_MEMBERS_CTORS_1ST = NO # If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the # hierarchy of group names into alphabetical order. If set to NO (the default) # the group names will appear in their defined order. SORT_GROUP_NAMES = NO # If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be # sorted by fully-qualified names, including namespaces. If set to # NO (the default), the class list will be sorted only by class name, # not including the namespace part. # Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. # Note: This option applies only to the class list, not to the # alphabetical list. SORT_BY_SCOPE_NAME = NO # If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to # do proper type resolution of all parameters of a function it will reject a # match between the prototype and the implementation of a member function even # if there is only one candidate or it is obvious which candidate to choose # by doing a simple string match. By disabling STRICT_PROTO_MATCHING doxygen # will still accept a match between prototype and implementation in such cases. STRICT_PROTO_MATCHING = NO # The GENERATE_TODOLIST tag can be used to enable (YES) or # disable (NO) the todo list. This list is created by putting \todo # commands in the documentation. GENERATE_TODOLIST = NO # The GENERATE_TESTLIST tag can be used to enable (YES) or # disable (NO) the test list. This list is created by putting \test # commands in the documentation. GENERATE_TESTLIST = YES # The GENERATE_BUGLIST tag can be used to enable (YES) or # disable (NO) the bug list. This list is created by putting \bug # commands in the documentation. GENERATE_BUGLIST = YES # The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or # disable (NO) the deprecated list. This list is created by putting # \deprecated commands in the documentation. GENERATE_DEPRECATEDLIST= YES # The ENABLED_SECTIONS tag can be used to enable conditional # documentation sections, marked by \if section-label ... \endif # and \cond section-label ... \endcond blocks. ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines # the initial value of a variable or macro consists of for it to appear in # the documentation. If the initializer consists of more lines than specified # here it will be hidden. Use a value of 0 to hide initializers completely. # The appearance of the initializer of individual variables and macros in the # documentation can be controlled using \showinitializer or \hideinitializer # command in the documentation regardless of this setting. MAX_INITIALIZER_LINES = 30 # Set the SHOW_USED_FILES tag to NO to disable the list of files generated # at the bottom of the documentation of classes and structs. If set to YES the # list will mention the files that were used to generate the documentation. SHOW_USED_FILES = NO # Set the SHOW_FILES tag to NO to disable the generation of the Files page. 
# This will remove the Files entry from the Quick Index and from the # Folder Tree View (if specified). The default is YES. SHOW_FILES = NO # Set the SHOW_NAMESPACES tag to NO to disable the generation of the # Namespaces page. # This will remove the Namespaces entry from the Quick Index # and from the Folder Tree View (if specified). The default is YES. SHOW_NAMESPACES = YES # The FILE_VERSION_FILTER tag can be used to specify a program or script that # doxygen should invoke to get the current version for each file (typically from # the version control system). Doxygen will invoke the program by executing (via # popen()) the command <command> <input-file>, where <command> is the value of # the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file # provided by doxygen. Whatever the program writes to standard output # is used as the file version. See the manual for examples. FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated # output files in an output format independent way. To create the layout file # that represents doxygen's defaults, run doxygen with the -l option. # You can optionally specify a file name after the option, if omitted # DoxygenLayout.xml will be used as the name of the layout file. LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files # containing the references data. This must be a list of .bib files. The # .bib extension is automatically appended if omitted. Using this command # requires the bibtex tool to be installed. See also # http://en.wikipedia.org/wiki/BibTeX for more info. For LaTeX the style # of the bibliography can be controlled using LATEX_BIB_STYLE. To use this # feature you need bibtex and perl available in the search path. Do not use # file names with spaces, bibtex cannot handle them. CITE_BIB_FILES = #--------------------------------------------------------------------------- # configuration options related to warning and progress messages #--------------------------------------------------------------------------- # The QUIET tag can be used to turn on/off the messages that are generated # by doxygen. Possible values are YES and NO. If left blank NO is used. QUIET = NO # The WARNINGS tag can be used to turn on/off the warning messages that are # generated by doxygen. Possible values are YES and NO. If left blank # NO is used. WARNINGS = YES # If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings # for undocumented members. If EXTRACT_ALL is set to YES then this flag will # automatically be disabled. WARN_IF_UNDOCUMENTED = YES # If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for # potential errors in the documentation, such as not documenting some # parameters in a documented function, or documenting parameters that # don't exist or using markup commands wrongly. WARN_IF_DOC_ERROR = YES # The WARN_NO_PARAMDOC option can be enabled to get warnings for # functions that are documented, but have no documentation for their parameters # or return value. If set to NO (the default) doxygen will only warn about # wrong or incomplete parameter documentation, but not about the absence of # documentation. WARN_NO_PARAMDOC = YES # The WARN_FORMAT tag determines the format of the warning messages that # doxygen can produce. The string should contain the $file, $line, and $text # tags, which will be replaced by the file and line number from which the # warning originated and the warning text.
Optionally the format may contain # $version, which will be replaced by the version of the file (if it could # be obtained via FILE_VERSION_FILTER) WARN_FORMAT = "$file:$line: $text" # The WARN_LOGFILE tag can be used to specify a file to which warning # and error messages should be written. If left blank the output is written # to stderr. WARN_LOGFILE = #--------------------------------------------------------------------------- # configuration options related to the input files #--------------------------------------------------------------------------- # The INPUT tag can be used to specify the files and/or directories that contain # documented source files. You may enter file names like "myfile.cpp" or # directories like "/usr/src/myproject". Separate the files or directories # with spaces. INPUT = ../include/Vc/global.h \ ../include/Vc/cpuid.h \ ../include/Vc/support.h \ ../include/Vc/Allocator \ ./ \ ../common/ # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is # also the default input encoding. Doxygen uses libiconv (or the iconv built # into libc) for the transcoding. See http://www.gnu.org/software/libiconv for # the list of possible encodings. INPUT_ENCODING = UTF-8 # If the value of the INPUT tag contains directories, you can use the # FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp # and *.h) to filter out the source-files in the directories. If left # blank the following patterns are tested: # *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh # *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py # *.f90 *.f *.for *.vhd *.vhdl FILE_PATTERNS = dox.h \ examples.h \ memory.h \ memorybase.h \ interleavedmemory.h \ deinterleave.h \ support.h # The RECURSIVE tag can be used to specify whether or not subdirectories # should be searched for input files as well. Possible values are YES and NO. # If left blank NO is used. RECURSIVE = NO # The EXCLUDE tag can be used to specify files and/or directories that should be # excluded from the INPUT source files. This way you can easily exclude a # subdirectory from a directory tree whose root is specified with the INPUT tag. # Note that relative paths are relative to the directory from which doxygen is # run. EXCLUDE = # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded # from the input. EXCLUDE_SYMLINKS = NO # If the value of the INPUT tag contains directories, you can use the # EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude # certain files from those directories. Note that the wildcards are matched # against the file with absolute path, so to exclude all test directories # for example use the pattern */test/* EXCLUDE_PATTERNS = # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the # output. The symbol name can be a fully qualified name, a word, or if the # wildcard * is used, a substring. Examples: ANamespace, AClass, # AClass::ANamespace, ANamespace::*Test EXCLUDE_SYMBOLS = # The EXAMPLE_PATH tag can be used to specify one or more files or # directories that contain example code fragments that are included (see # the \include command).
EXAMPLE_PATH = ../examples/ # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp # and *.h) to filter out the source-files in the directories. If left # blank all files are included. EXAMPLE_PATTERNS = * # If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be # searched for input files to be used with the \include or \dontinclude # commands irrespective of the value of the RECURSIVE tag. # Possible values are YES and NO. If left blank NO is used. EXAMPLE_RECURSIVE = NO # The IMAGE_PATH tag can be used to specify one or more files or # directories that contain images that are included in the documentation (see # the \image command). IMAGE_PATH = ./ # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program # by executing (via popen()) the command <filter> <input-file>, where # <filter> is the value of the INPUT_FILTER tag, and <input-file> is the name of an # input file. Doxygen will then use the output that the filter program writes # to standard output. # If FILTER_PATTERNS is specified, this tag will be ignored. # Note that the filter must not add or remove lines; it is applied before the # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. # Doxygen will compare the file name with each pattern and apply the # filter if there is a match. # The filters are a list of the form: # pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further # info on how filters are used. If FILTER_PATTERNS is empty or if # none of the patterns match the file name, INPUT_FILTER is applied. FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will be used to filter the input files when producing source # files to browse (i.e. when SOURCE_BROWSER is set to YES). FILTER_SOURCE_FILES = NO # The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file # pattern. A pattern will override the setting for FILTER_PATTERN (if any) # and it is also possible to disable source filtering for a specific pattern # using *.ext= (so without naming a filter). This option only has effect when # FILTER_SOURCE_FILES is enabled. FILTER_SOURCE_PATTERNS = # If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that # is part of the input, its contents will be placed on the main page # (index.html). This can be useful if you have a project on for instance GitHub # and want to reuse the introduction page also for the doxygen output. USE_MDFILE_AS_MAINPAGE = #--------------------------------------------------------------------------- # configuration options related to source browsing #--------------------------------------------------------------------------- # If the SOURCE_BROWSER tag is set to YES then a list of source files will # be generated. Documented entities will be cross-referenced with these sources. # Note: To get rid of all source code in the generated output, make sure also # VERBATIM_HEADERS is set to NO. SOURCE_BROWSER = NO # Setting the INLINE_SOURCES tag to YES will include the body # of functions and classes directly in the documentation.
INLINE_SOURCES = NO # Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct # doxygen to hide any special comment blocks from generated source code # fragments. Normal C, C++ and Fortran comments will always remain visible. STRIP_CODE_COMMENTS = YES # If the REFERENCED_BY_RELATION tag is set to YES # then for each documented function all documented # functions referencing it will be listed. REFERENCED_BY_RELATION = NO # If the REFERENCES_RELATION tag is set to YES # then for each documented function all documented entities # called/used by that function will be listed. REFERENCES_RELATION = NO # If the REFERENCES_LINK_SOURCE tag is set to YES (the default) # and SOURCE_BROWSER tag is set to YES, then the hyperlinks from # functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will # link to the source code. # Otherwise they will link to the documentation. REFERENCES_LINK_SOURCE = YES # If the USE_HTAGS tag is set to YES then the references to source code # will point to the HTML generated by the htags(1) tool instead of doxygen # built-in source browser. The htags tool is part of GNU's global source # tagging system (see http://www.gnu.org/software/global/global.html). You # will need version 4.8.6 or higher. USE_HTAGS = NO # If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen # will generate a verbatim copy of the header file for each class for # which an include is specified. Set to NO to disable this. VERBATIM_HEADERS = NO #--------------------------------------------------------------------------- # configuration options related to the alphabetical class index #--------------------------------------------------------------------------- # If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index # of all compounds will be generated. Enable this if the project # contains a lot of classes, structs, unions or interfaces. ALPHABETICAL_INDEX = NO # If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then # the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns # in which this list will be split (can be a number in the range [1..20]) COLS_IN_ALPHA_INDEX = 5 # In case all classes in a project start with a common prefix, all # classes will be put under the same header in the alphabetical index. # The IGNORE_PREFIX tag can be used to specify one or more prefixes that # should be ignored while generating the index headers. IGNORE_PREFIX = #--------------------------------------------------------------------------- # configuration options related to the HTML output #--------------------------------------------------------------------------- # If the GENERATE_HTML tag is set to YES (the default) Doxygen will # generate HTML output. GENERATE_HTML = YES # The HTML_OUTPUT tag is used to specify where the HTML docs will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `html' will be used as the default path. HTML_OUTPUT = html # The HTML_FILE_EXTENSION tag can be used to specify the file extension for # each generated HTML page (for example: .htm,.php,.asp). If it is left blank # doxygen will generate files with .html extension. HTML_FILE_EXTENSION = .html # The HTML_HEADER tag can be used to specify a personal HTML header for # each generated HTML page. If it is left blank doxygen will generate a # standard header. 
Note that when using a custom header you are responsible # for the proper inclusion of any scripts and style sheets that doxygen # needs, which is dependent on the configuration options used. # It is advised to generate a default header using "doxygen -w html # header.html footer.html stylesheet.css YourConfigFile" and then modify # that header. Note that the header is subject to change so you typically # have to redo this when upgrading to a newer version of doxygen or when # changing the value of configuration settings such as GENERATE_TREEVIEW! HTML_HEADER = # The HTML_FOOTER tag can be used to specify a personal HTML footer for # each generated HTML page. If it is left blank doxygen will generate a # standard footer. HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading # style sheet that is used by each HTML page. It can be used to # fine-tune the look of the HTML output. If left blank doxygen will # generate a default style sheet. Note that it is recommended to use # HTML_EXTRA_STYLESHEET instead of this one, as it is more robust and this # tag will in the future become obsolete. HTML_STYLESHEET = # The HTML_EXTRA_STYLESHEET tag can be used to specify an additional # user-defined cascading style sheet that is included after the standard # style sheets created by doxygen. Using this option one can overrule # certain style aspects. This is preferred over using HTML_STYLESHEET # since it does not replace the standard style sheet and is therefore more # robust against future updates. Doxygen will copy the style sheet file to # the output directory. HTML_EXTRA_STYLESHEET = # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note # that these files will be copied to the base HTML output directory. Use the # $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these # files. In the HTML_STYLESHEET file, use the file name only. Also note that # the files will be copied as-is; there are no commands or markers available. HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. # Doxygen will adjust the colors in the style sheet and background images # according to this color. Hue is specified as an angle on a colorwheel, # see http://en.wikipedia.org/wiki/Hue for more information. # For instance the value 0 represents red, 60 is yellow, 120 is green, # 180 is cyan, 240 is blue, 300 purple, and 360 is red again. # The allowed range is 0 to 359. HTML_COLORSTYLE_HUE = 120 # The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of # the colors in the HTML output. For a value of 0 the output will use # grayscales only. A value of 255 will produce the most vivid colors. HTML_COLORSTYLE_SAT = 100 # The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to # the luminance component of the colors in the HTML output. Values below # 100 gradually make the output lighter, whereas values above 100 make # the output darker. The value divided by 100 is the actual gamma applied, # so 80 represents a gamma of 0.8. The value 220 represents a gamma of 2.2, # and 100 does not change the gamma. HTML_COLORSTYLE_GAMMA = 120 # If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML # page will contain the date and time when the page was generated. Setting # this to NO can help when comparing the output of multiple runs.
HTML_TIMESTAMP = NO # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML # documentation will contain sections that can be hidden and shown after the # page has loaded. HTML_DYNAMIC_SECTIONS = YES # With HTML_INDEX_NUM_ENTRIES one can control the preferred number of # entries shown in the various tree structured indices initially; the user # can expand and collapse entries dynamically later on. Doxygen will expand # the tree to such a level that at most the specified number of entries are # visible (unless a fully collapsed tree already exceeds this amount). # So setting the number of entries 1 will produce a full collapsed tree by # default. 0 is a special value representing an infinite number of entries # and will result in a full expanded tree by default. HTML_INDEX_NUM_ENTRIES = 100 # If the GENERATE_DOCSET tag is set to YES, additional index files # will be generated that can be used as input for Apple's Xcode 3 # integrated development environment, introduced with OSX 10.5 (Leopard). # To create a documentation set, doxygen will generate a Makefile in the # HTML output directory. Running make will produce the docset in that # directory and running "make install" will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find # it at startup. # See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html # for more information. GENERATE_DOCSET = NO # When GENERATE_DOCSET tag is set to YES, this tag determines the name of the # feed. A documentation feed provides an umbrella under which multiple # documentation sets from a single provider (such as a company or product suite) # can be grouped. DOCSET_FEEDNAME = "Doxygen generated docs" # When GENERATE_DOCSET tag is set to YES, this tag specifies a string that # should uniquely identify the documentation set bundle. This should be a # reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen # will append .docset to the name. DOCSET_BUNDLE_ID = org.doxygen.Project # When GENERATE_PUBLISHER_ID tag specifies a string that should uniquely # identify the documentation publisher. This should be a reverse domain-name # style string, e.g. com.mycompany.MyDocSet.documentation. DOCSET_PUBLISHER_ID = org.doxygen.Publisher # The GENERATE_PUBLISHER_NAME tag identifies the documentation publisher. DOCSET_PUBLISHER_NAME = Publisher # If the GENERATE_HTMLHELP tag is set to YES, additional index files # will be generated that can be used as input for tools like the # Microsoft HTML help workshop to generate a compiled HTML help file (.chm) # of the generated HTML documentation. GENERATE_HTMLHELP = NO # If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can # be used to specify the file name of the resulting .chm file. You # can add a path in front of the file if the result should not be # written to the html output directory. CHM_FILE = # If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can # be used to specify the location (absolute path including file name) of # the HTML help compiler (hhc.exe). If non-empty doxygen will try to run # the HTML help compiler on the generated index.hhp. HHC_LOCATION = # If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag # controls if a separate .chi index file is generated (YES) or that # it should be included in the master .chm file (NO). 
GENERATE_CHI = NO # If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING # is used to encode HtmlHelp index (hhk), content (hhc) and project file # content. CHM_INDEX_ENCODING = # If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag # controls whether a binary table of contents is generated (YES) or a # normal table of contents (NO) in the .chm file. BINARY_TOC = NO # The TOC_EXPAND flag can be set to YES to add extra items for group members # to the contents of the HTML help documentation and to the tree view. TOC_EXPAND = YES # If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and # QHP_VIRTUAL_FOLDER are set, an additional index file will be generated # that can be used as input for Qt's qhelpgenerator to generate a # Qt Compressed Help (.qch) of the generated HTML documentation. GENERATE_QHP = YES # If the QHG_LOCATION tag is specified, the QCH_FILE tag can # be used to specify the file name of the resulting .qch file. # The path specified is relative to the HTML output folder. QCH_FILE = vc-0.7.qch # The QHP_NAMESPACE tag specifies the namespace to use when generating # Qt Help Project output. For more information please see # http://doc.trolltech.com/qthelpproject.html#namespace QHP_NAMESPACE = de.uni-frankfurt.compeng.vc # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating # Qt Help Project output. For more information please see # http://doc.trolltech.com/qthelpproject.html#virtual-folders QHP_VIRTUAL_FOLDER = "Vc (master)" # If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to # add. For more information please see # http://doc.trolltech.com/qthelpproject.html#custom-filters QHP_CUST_FILTER_NAME = # The QHP_CUST_FILT_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see # # Qt Help Project / Custom Filters. QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's # filter section matches. # # Qt Help Project / Filter Attributes. QHP_SECT_FILTER_ATTRS = # If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can # be used to specify the location of Qt's qhelpgenerator. # If non-empty doxygen will try to run qhelpgenerator on the generated # .qhp file. QHG_LOCATION = "../qhelpgenerator-wrapper" # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files # will be generated, which together with the HTML files, form an Eclipse help # plugin. To install this plugin and make it available under the help contents # menu in Eclipse, the contents of the directory containing the HTML and XML # files needs to be copied into the plugins directory of eclipse. The name of # the directory within the plugins directory should be the same as # the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before # the help appears. GENERATE_ECLIPSEHELP = NO # A unique identifier for the eclipse help plugin. When installing the plugin # the directory name containing the HTML and XML files should also have # this name. ECLIPSE_DOC_ID = org.doxygen.Project # The DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) # at top of each HTML page. The value NO (the default) enables the index and # the value YES disables it. Since the tabs have the same information as the # navigation tree you can set this option to NO if you already set # GENERATE_TREEVIEW to YES. 
DISABLE_INDEX = YES # The GENERATE_TREEVIEW tag is used to specify whether a tree-like index # structure should be generated to display hierarchical information. # If the tag value is set to YES, a side panel will be generated # containing a tree-like index structure (just like the one that # is generated for HTML Help). For this to work a browser that supports # JavaScript, DHTML, CSS and frames is required (i.e. any modern browser). # Windows users are probably better off using the HTML help feature. # Since the tree basically has the same information as the tab index you # could consider setting DISABLE_INDEX to NO when enabling this option. GENERATE_TREEVIEW = YES # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values # (range [0,1..20]) that doxygen will group on one line in the generated HTML # documentation. Note that a value of 0 will completely suppress the enum # values from appearing in the overview section. ENUM_VALUES_PER_LINE = 4 # If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be # used to set the initial width (in pixels) of the frame in which the tree # is shown. TREEVIEW_WIDTH = 250 # When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open # links to external symbols imported via tag files in a separate window. EXT_LINKS_IN_WINDOW = NO # Use this tag to change the font size of Latex formulas included # as images in the HTML documentation. The default is 10. Note that # when you change the font size after a successful doxygen run you need # to manually remove any form_*.png images from the HTML output directory # to force them to be regenerated. FORMULA_FONTSIZE = 10 # Use the FORMULA_TRANSPARENT tag to determine whether or not the images # generated for formulas are transparent PNGs. Transparent PNGs are # not supported properly for IE 6.0, but are supported on all modern browsers. # Note that when changing this option you need to delete any form_*.png files # in the HTML output before the changes have effect. FORMULA_TRANSPARENT = YES # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax # (see http://www.mathjax.org) which uses client side Javascript for the # rendering instead of using prerendered bitmaps. Use this if you do not # have LaTeX installed or if you want the formulas to look prettier in the HTML # output. When enabled you may also need to install MathJax separately and # configure the path to it using the MATHJAX_RELPATH option. USE_MATHJAX = NO # When MathJax is enabled you can set the default output format to be used for # the MathJax output. Supported types are HTML-CSS, NativeMML (i.e. MathML) and # SVG. The default value is HTML-CSS, which is slower, but has the best # compatibility. MATHJAX_FORMAT = HTML-CSS # When MathJax is enabled you need to specify the location relative to the # HTML output directory using the MATHJAX_RELPATH option. The destination # directory should contain the MathJax.js script. For instance, if the mathjax # directory is located at the same level as the HTML output directory, then # MATHJAX_RELPATH should be ../mathjax. The default value points to # the MathJax Content Delivery Network so you can quickly see the result without # installing MathJax. # However, it is strongly recommended to install a local # copy of MathJax from http://www.mathjax.org before deployment. MATHJAX_RELPATH = http://www.mathjax.org/mathjax # The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax extension # names that should be enabled during MathJax rendering.
MATHJAX_EXTENSIONS = # The MATHJAX_CODEFILE tag can be used to specify a file with javascript # pieces of code that will be used on startup of the MathJax code. MATHJAX_CODEFILE = # When the SEARCHENGINE tag is enabled doxygen will generate a search box # for the HTML output. The underlying search engine uses javascript # and DHTML and should work on any modern browser. Note that when using # HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets # (GENERATE_DOCSET) there is already a search function so this one should # typically be disabled. For large projects the javascript based search engine # can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution. SEARCHENGINE = YES # When the SERVER_BASED_SEARCH tag is enabled the search engine will be # implemented using a web server instead of a web client using Javascript. # There are two flavours of web server based search depending on the # EXTERNAL_SEARCH setting. When disabled, doxygen will generate a PHP script for # searching and an index file used by the script. When EXTERNAL_SEARCH is # enabled the indexing and searching needs to be provided by external tools. # See the manual for details. SERVER_BASED_SEARCH = YES # When EXTERNAL_SEARCH is enabled doxygen will no longer generate the PHP # script for searching. Instead the search results are written to an XML file # which needs to be processed by an external indexer. Doxygen will invoke an # external search engine pointed to by the SEARCHENGINE_URL option to obtain # the search results. Doxygen ships with an example indexer (doxyindexer) and # search engine (doxysearch.cgi) which are based on the open source search # engine library Xapian. See the manual for configuration details. EXTERNAL_SEARCH = NO # The SEARCHENGINE_URL should point to a search engine hosted by a web server # which will return the search results when EXTERNAL_SEARCH is enabled. # Doxygen ships with an example search engine (doxysearch) which is based on # the open source search engine library Xapian. See the manual for configuration # details. SEARCHENGINE_URL = # When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed # search data is written to a file for indexing by an external tool. With the # SEARCHDATA_FILE tag the name of this file can be specified. SEARCHDATA_FILE = searchdata.xml # When SERVER_BASED_SEARCH AND EXTERNAL_SEARCH are both enabled the # EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is # useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple # projects and redirect the results back to the right project. EXTERNAL_SEARCH_ID = # The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen # projects other than the one defined by this configuration file, but that are # all added to the same external search index. Each project needs to have a # unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id # of to a relative location where the documentation can be found. # The format is: EXTRA_SEARCH_MAPPINGS = id1=loc1 id2=loc2 ... EXTRA_SEARCH_MAPPINGS = #--------------------------------------------------------------------------- # configuration options related to the LaTeX output #--------------------------------------------------------------------------- # If the GENERATE_LATEX tag is set to YES (the default) Doxygen will # generate Latex output. GENERATE_LATEX = NO # The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
# If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `latex' will be used as the default path. LATEX_OUTPUT = latex # The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be # invoked. If left blank `latex' will be used as the default command name. # Note that when enabling USE_PDFLATEX this option is only used for # generating bitmaps for formulas in the HTML output, but not in the # Makefile that is written to the output directory. LATEX_CMD_NAME = latex # The MAKEINDEX_CMD_NAME tag can be used to specify the command name to # generate index for LaTeX. If left blank `makeindex' will be used as the # default command name. MAKEINDEX_CMD_NAME = makeindex # If the COMPACT_LATEX tag is set to YES Doxygen generates more compact # LaTeX documents. This may be useful for small projects and may help to # save some trees in general. COMPACT_LATEX = NO # The PAPER_TYPE tag can be used to set the paper type that is used # by the printer. Possible values are: a4, letter, legal and # executive. If left blank a4 will be used. PAPER_TYPE = a4wide # The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX # packages that should be included in the LaTeX output. EXTRA_PACKAGES = # The LATEX_HEADER tag can be used to specify a personal LaTeX header for # the generated latex document. The header should contain everything until # the first chapter. If it is left blank doxygen will generate a # standard header. Notice: only use this tag if you know what you are doing! LATEX_HEADER = # The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for # the generated latex document. The footer should contain everything after # the last chapter. If it is left blank doxygen will generate a # standard footer. Notice: only use this tag if you know what you are doing! LATEX_FOOTER = # The LATEX_EXTRA_FILES tag can be used to specify one or more extra images # or other source files which should be copied to the LaTeX output directory. # Note that the files will be copied as-is; there are no commands or markers # available. LATEX_EXTRA_FILES = # If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated # is prepared for conversion to pdf (using ps2pdf). The pdf file will # contain links (just like the HTML output) instead of page references. # This makes the output suitable for online browsing using a pdf viewer. PDF_HYPERLINKS = YES # If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of # plain latex in the generated Makefile. Set this option to YES to get a # higher quality PDF documentation. USE_PDFLATEX = YES # If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode. # command to the generated LaTeX files. This will instruct LaTeX to keep # running if errors occur, instead of asking the user for help. # This option is also used when generating formulas in HTML. LATEX_BATCHMODE = NO # If LATEX_HIDE_INDICES is set to YES then doxygen will not # include the index chapters (such as File Index, Compound Index, etc.) # in the output. LATEX_HIDE_INDICES = NO # If LATEX_SOURCE_CODE is set to YES then doxygen will include # source code with syntax highlighting in the LaTeX output. # Note that which sources are shown also depends on other settings # such as SOURCE_BROWSER. LATEX_SOURCE_CODE = NO # The LATEX_BIB_STYLE tag can be used to specify the style to use for the # bibliography, e.g. plainnat, or ieeetr. The default style is "plain".
See # http://en.wikipedia.org/wiki/BibTeX for more info. LATEX_BIB_STYLE = plain #--------------------------------------------------------------------------- # configuration options related to the RTF output #--------------------------------------------------------------------------- # If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output # The RTF output is optimized for Word 97 and may not look very pretty with # other RTF readers or editors. GENERATE_RTF = NO # The RTF_OUTPUT tag is used to specify where the RTF docs will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `rtf' will be used as the default path. RTF_OUTPUT = rtf # If the COMPACT_RTF tag is set to YES Doxygen generates more compact # RTF documents. This may be useful for small projects and may help to # save some trees in general. COMPACT_RTF = NO # If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated # will contain hyperlink fields. The RTF file will # contain links (just like the HTML output) instead of page references. # This makes the output suitable for online browsing using WORD or other # programs which support those fields. # Note: wordpad (write) and others do not support links. RTF_HYPERLINKS = NO # Load style sheet definitions from file. Syntax is similar to doxygen's # config file, i.e. a series of assignments. You only have to provide # replacements, missing definitions are set to their default value. RTF_STYLESHEET_FILE = # Set optional variables used in the generation of an rtf document. # Syntax is similar to doxygen's config file. RTF_EXTENSIONS_FILE = #--------------------------------------------------------------------------- # configuration options related to the man page output #--------------------------------------------------------------------------- # If the GENERATE_MAN tag is set to YES (the default) Doxygen will # generate man pages GENERATE_MAN = NO # The MAN_OUTPUT tag is used to specify where the man pages will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `man' will be used as the default path. MAN_OUTPUT = man # The MAN_EXTENSION tag determines the extension that is added to # the generated man pages (default is the subroutine's section .3) MAN_EXTENSION = .3 # If the MAN_LINKS tag is set to YES and Doxygen generates man output, # then it will generate one additional man file for each entity # documented in the real man page(s). These additional files # only source the real man page, but without them the man command # would be unable to find the correct page. The default is NO. MAN_LINKS = NO #--------------------------------------------------------------------------- # configuration options related to the XML output #--------------------------------------------------------------------------- # If the GENERATE_XML tag is set to YES Doxygen will # generate an XML file that captures the structure of # the code including all documentation. GENERATE_XML = NO # The XML_OUTPUT tag is used to specify where the XML pages will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `xml' will be used as the default path. XML_OUTPUT = xml # The XML_SCHEMA tag can be used to specify an XML schema, # which can be used by a validating XML parser to check the # syntax of the XML files. 
XML_SCHEMA = # The XML_DTD tag can be used to specify an XML DTD, # which can be used by a validating XML parser to check the # syntax of the XML files. XML_DTD = # If the XML_PROGRAMLISTING tag is set to YES Doxygen will # dump the program listings (including syntax highlighting # and cross-referencing information) to the XML output. Note that # enabling this will significantly increase the size of the XML output. XML_PROGRAMLISTING = YES #--------------------------------------------------------------------------- # configuration options related to the DOCBOOK output #--------------------------------------------------------------------------- # If the GENERATE_DOCBOOK tag is set to YES Doxygen will generate DOCBOOK files # that can be used to generate PDF. GENERATE_DOCBOOK = NO # The DOCBOOK_OUTPUT tag is used to specify where the DOCBOOK pages will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be put in # front of it. If left blank docbook will be used as the default path. DOCBOOK_OUTPUT = docbook #--------------------------------------------------------------------------- # configuration options for the AutoGen Definitions output #--------------------------------------------------------------------------- # If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will # generate an AutoGen Definitions (see autogen.sf.net) file # that captures the structure of the code including all # documentation. Note that this feature is still experimental # and incomplete at the moment. GENERATE_AUTOGEN_DEF = NO #--------------------------------------------------------------------------- # configuration options related to the Perl module output #--------------------------------------------------------------------------- # If the GENERATE_PERLMOD tag is set to YES Doxygen will # generate a Perl module file that captures the structure of # the code including all documentation. Note that this # feature is still experimental and incomplete at the # moment. GENERATE_PERLMOD = NO # If the PERLMOD_LATEX tag is set to YES Doxygen will generate # the necessary Makefile rules, Perl scripts and LaTeX code to be able # to generate PDF and DVI output from the Perl module output. PERLMOD_LATEX = NO # If the PERLMOD_PRETTY tag is set to YES the Perl module output will be # nicely formatted so it can be parsed by a human reader. # This is useful # if you want to understand what is going on. # On the other hand, if this # tag is set to NO the size of the Perl module output will be much smaller # and Perl will parse it just the same. PERLMOD_PRETTY = YES # The names of the make variables in the generated doxyrules.make file # are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. # This is useful so different doxyrules.make files included by the same # Makefile don't overwrite each other's variables. PERLMOD_MAKEVAR_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the preprocessor #--------------------------------------------------------------------------- # If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will # evaluate all C-preprocessor directives found in the sources and include # files. ENABLE_PREPROCESSING = YES # If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro # names in the source code. If set to NO (the default) only conditional # compilation will be performed. Macro expansion can be done in a controlled # way by setting EXPAND_ONLY_PREDEF to YES. 
MACRO_EXPANSION = YES # If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES # then the macro expansion is limited to the macros specified with the # PREDEFINED and EXPAND_AS_DEFINED tags. EXPAND_ONLY_PREDEF = NO # If the SEARCH_INCLUDES tag is set to YES (the default) the includes files # pointed to by INCLUDE_PATH will be searched when a #include is found. SEARCH_INCLUDES = YES # The INCLUDE_PATH tag can be used to specify one or more directories that # contain include files that are not input files but should be processed by # the preprocessor. INCLUDE_PATH = # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard # patterns (like *.h and *.hpp) to filter out the header-files in the # directories. If left blank, the patterns specified with FILE_PATTERNS will # be used. INCLUDE_FILE_PATTERNS = # The PREDEFINED tag can be used to specify one or more macro names that # are defined before the preprocessor is started (similar to the -D option of # gcc). The argument of the tag is a list of macros of the form: name # or name=definition (no spaces). If the definition and the = are # omitted =1 is assumed. To prevent a macro definition from being # undefined via #undef or recursively expanded use the := operator # instead of the = operator. PREDEFINED = DOXYGEN \ Vc_INTRINSIC=inline \ Vc_INTRINSIC_L=inline \ Vc_INTRINSIC_R \ Vc_ALWAYS_INLINE=inline \ Vc_ALWAYS_INLINE_L=inline \ Vc_ALWAYS_INLINE_R \ Vc_PURE \ Vc_PURE_L \ Vc_PURE_R \ Vc_CONST \ Vc_CONST_L \ Vc_CONST_R \ _VC_CONSTEXPR=constexpr # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then # this tag can be used to specify a list of macro names that should be expanded. # The macro definition that is found in the sources will be used. # Use the PREDEFINED tag if you want to use a different macro definition that # overrules the definition found in the source code. EXPAND_AS_DEFINED = VECTOR_TYPE \ INDEX_TYPE \ ENTRY_TYPE \ MASK_TYPE \ INTEGER \ EXPONENT_TYPE \ VECTOR_TYPE_HAS_SHIFTS # If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then # doxygen's preprocessor will remove all references to function-like macros # that are alone on a line, have an all uppercase name, and do not end with a # semicolon, because these will confuse the parser if not removed. SKIP_FUNCTION_MACROS = YES #--------------------------------------------------------------------------- # Configuration::additions related to external references #--------------------------------------------------------------------------- # The TAGFILES option can be used to specify one or more tagfiles. For each # tag file the location of the external documentation should be added. The # format of a tag file without this location is as follows: # # TAGFILES = file1 file2 ... # Adding location for the tag files is done as follows: # # TAGFILES = file1=loc1 "file2 = loc2" ... # where "loc1" and "loc2" can be relative or absolute paths # or URLs. Note that each tag file must have a unique name (where the name does # NOT include the path). If a tag file is not located in the directory in which # doxygen is run, you must also specify the path to the tagfile here. TAGFILES = # When a file name is specified after GENERATE_TAGFILE, doxygen will create # a tag file that is based on the input files it reads. GENERATE_TAGFILE = # If the ALLEXTERNALS tag is set to YES all external classes will be listed # in the class index. If set to NO only the inherited external classes # will be listed. 
ALLEXTERNALS = NO # If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed # in the modules index. If set to NO, only the current project's groups will # be listed. EXTERNAL_GROUPS = YES # If the EXTERNAL_PAGES tag is set to YES all external pages will be listed # in the related pages index. If set to NO, only the current project's # pages will be listed. EXTERNAL_PAGES = YES # The PERL_PATH should be the absolute path and name of the perl script # interpreter (i.e. the result of `which perl'). PERL_PATH = /usr/bin/perl #--------------------------------------------------------------------------- # Configuration options related to the dot tool #--------------------------------------------------------------------------- # If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will # generate a inheritance diagram (in HTML, RTF and LaTeX) for classes with base # or super classes. Setting the tag to NO turns the diagrams off. Note that # this option also works with HAVE_DOT disabled, but it is recommended to # install and use dot, since it yields more powerful graphs. CLASS_DIAGRAMS = NO # You can define message sequence charts within doxygen comments using the \msc # command. Doxygen will then run the mscgen tool (see # http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the # documentation. The MSCGEN_PATH tag allows you to specify the directory where # the mscgen tool resides. If left empty the tool is assumed to be found in the # default search path. MSCGEN_PATH = # If set to YES, the inheritance and collaboration graphs will hide # inheritance and usage relations if the target is undocumented # or is not a class. HIDE_UNDOC_RELATIONS = YES # If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is # available from the path. This tool is part of Graphviz, a graph visualization # toolkit from AT&T and Lucent Bell Labs. The other options in this section # have no effect if this option is set to NO (the default) HAVE_DOT = NO # The DOT_NUM_THREADS specifies the number of dot invocations doxygen is # allowed to run in parallel. When set to 0 (the default) doxygen will # base this on the number of processors available in the system. You can set it # explicitly to a value larger than 0 to get control over the balance # between CPU load and processing speed. DOT_NUM_THREADS = 0 # By default doxygen will use the Helvetica font for all dot files that # doxygen generates. When you want a differently looking font you can specify # the font name using DOT_FONTNAME. You need to make sure dot is able to find # the font, which can be done by putting it in a standard location or by setting # the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the # directory containing the font. DOT_FONTNAME = # The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs. # The default size is 10pt. DOT_FONTSIZE = 10 # By default doxygen will tell dot to use the Helvetica font. # If you specify a different font using DOT_FONTNAME you can use DOT_FONTPATH to # set the path where dot can find it. DOT_FONTPATH = # If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen # will generate a graph for each documented class showing the direct and # indirect inheritance relations. Setting this tag to YES will force the # CLASS_DIAGRAMS tag to NO. 
CLASS_GRAPH = YES # If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen # will generate a graph for each documented class showing the direct and # indirect implementation dependencies (inheritance, containment, and # class references variables) of the class with other documented classes. COLLABORATION_GRAPH = YES # If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen # will generate a graph for groups, showing the direct groups dependencies GROUP_GRAPHS = YES # If the UML_LOOK tag is set to YES doxygen will generate inheritance and # collaboration diagrams in a style similar to the OMG's Unified Modeling # Language. UML_LOOK = NO # If the UML_LOOK tag is enabled, the fields and methods are shown inside # the class node. If there are many fields or methods and many nodes the # graph may become too big to be useful. The UML_LIMIT_NUM_FIELDS # threshold limits the number of items for each type to make the size more # manageable. Set this to 0 for no limit. Note that the threshold may be # exceeded by 50% before the limit is enforced. UML_LIMIT_NUM_FIELDS = 10 # If set to YES, the inheritance and collaboration graphs will show the # relations between templates and their instances. TEMPLATE_RELATIONS = NO # If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT # tags are set to YES then doxygen will generate a graph for each documented # file showing the direct and indirect include dependencies of the file with # other documented files. INCLUDE_GRAPH = YES # If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and # HAVE_DOT tags are set to YES then doxygen will generate a graph for each # documented header file showing the documented files that directly or # indirectly include this file. INCLUDED_BY_GRAPH = YES # If the CALL_GRAPH and HAVE_DOT options are set to YES then # doxygen will generate a call dependency graph for every global function # or class method. Note that enabling this option will significantly increase # the time of a run. So in most cases it will be better to enable call graphs # for selected functions only using the \callgraph command. CALL_GRAPH = NO # If the CALLER_GRAPH and HAVE_DOT tags are set to YES then # doxygen will generate a caller dependency graph for every global function # or class method. Note that enabling this option will significantly increase # the time of a run. So in most cases it will be better to enable caller # graphs for selected functions only using the \callergraph command. CALLER_GRAPH = NO # If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen # will generate a graphical hierarchy of all classes instead of a textual one. GRAPHICAL_HIERARCHY = YES # If the DIRECTORY_GRAPH and HAVE_DOT tags are set to YES # then doxygen will show the dependencies a directory has on other directories # in a graphical way. The dependency relations are determined by the #include # relations between the files in the directories. DIRECTORY_GRAPH = YES # The DOT_IMAGE_FORMAT tag can be used to set the image format of the images # generated by dot. Possible values are svg, png, jpg, or gif. # If left blank png will be used. If you choose svg you need to set # HTML_FILE_EXTENSION to xhtml in order to make the SVG files # visible in IE 9+ (other browsers do not have this requirement). DOT_IMAGE_FORMAT = png # If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to # enable generation of interactive SVG images that allow zooming and panning. 
# Note that this requires a modern browser other than Internet Explorer. # Tested and working are Firefox, Chrome, Safari, and Opera. For IE 9+ you # need to set HTML_FILE_EXTENSION to xhtml in order to make the SVG files # visible. Older versions of IE do not have SVG support. INTERACTIVE_SVG = NO # The tag DOT_PATH can be used to specify the path where the dot tool can be # found. If left blank, it is assumed the dot tool can be found in the path. DOT_PATH = # The DOTFILE_DIRS tag can be used to specify one or more directories that # contain dot files that are included in the documentation (see the # \dotfile command). DOTFILE_DIRS = # The MSCFILE_DIRS tag can be used to specify one or more directories that # contain msc files that are included in the documentation (see the # \mscfile command). MSCFILE_DIRS = # The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of # nodes that will be shown in the graph. If the number of nodes in a graph # becomes larger than this value, doxygen will truncate the graph, which is # visualized by representing a node as a red box. Note that doxygen if the # number of direct children of the root node in a graph is already larger than # DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note # that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH. DOT_GRAPH_MAX_NODES = 50 # The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the # graphs generated by dot. A depth value of 3 means that only nodes reachable # from the root by following a path via at most 3 edges will be shown. Nodes # that lay further from the root node will be omitted. Note that setting this # option to 1 or 2 may greatly reduce the computation time needed for large # code bases. Also note that the size of a graph can be further restricted by # DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction. MAX_DOT_GRAPH_DEPTH = 1000 # Set the DOT_TRANSPARENT tag to YES to generate images with a transparent # background. This is disabled by default, because dot on Windows does not # seem to support this out of the box. Warning: Depending on the platform used, # enabling this option may lead to badly anti-aliased labels on the edges of # a graph (i.e. they become hard to read). DOT_TRANSPARENT = YES # Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output # files in one run (i.e. multiple -o and -T options on the command line). This # makes dot run faster, but since only newer versions of dot (>1.8.10) # support this, this feature is disabled by default. DOT_MULTI_TARGETS = NO # If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will # generate a legend page explaining the meaning of the various boxes and # arrows in the dot generated graphs. GENERATE_LEGEND = YES # If the DOT_CLEANUP tag is set to YES (the default) Doxygen will # remove the intermediate dot files that are used to generate # the various graphs. DOT_CLEANUP = YES Vc-0.7.4/doc/DoxygenLayout.xml000066400000000000000000000123311233512346000161730ustar00rootroot00000000000000 Vc-0.7.4/doc/dox-common-mask-ops.h000066400000000000000000000076701233512346000166310ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ /** * default constructor * * Leaves the mask uninitialized. */ MASK_TYPE(); /** * Constructs a mask with the entries initialized to zero. */ explicit MASK_TYPE(Vc::Zero); /** * Constructs a mask with the entries initialized to one. */ explicit MASK_TYPE(Vc::One); /** * Constructs a mask with the entries initialized to * \li one if \p b is \p true * \li zero if \p b is \p false * * \param b Determines the initial state of the mask. */ explicit MASK_TYPE(bool b); /** * Standard copy constructor */ MASK_TYPE(const MASK_TYPE &); /** * Constructs a mask object from a mask of different size/type. */ template MASK_TYPE(const OtherMask &); /** * Returns whether the two masks are equal in all entries. */ bool operator==(const MASK_TYPE &) const; /** * Returns whether the two masks differ in at least one entry. */ bool operator!=(const MASK_TYPE &) const; /** * Return the per-entry resulting mask of a logical (in this case same as bitwise) AND operation. */ MASK_TYPE operator&&(const MASK_TYPE &) const; /** * Return the per-entry resulting mask of a logical (in this case same as bitwise) AND operation. */ MASK_TYPE operator& (const MASK_TYPE &) const; /** * Return the per-entry resulting mask of a logical (in this case same as bitwise) OR operation. */ MASK_TYPE operator||(const MASK_TYPE &) const; /** * Return the per-entry resulting mask of a logical (in this case same as bitwise) OR operation. */ MASK_TYPE operator| (const MASK_TYPE &) const; /** * Return the per-entry resulting mask of a logical (in this case same as bitwise) XOR operation. */ MASK_TYPE operator^ (const MASK_TYPE &) const; /** * Return the per-entry resulting mask of a logical (in this case same as bitwise) NOT operation. */ MASK_TYPE operator! () const; /** * Modify the mask per-entry using a logical (in this case same as bitwise) AND operation. */ MASK_TYPE operator&=(const MASK_TYPE &); /** * Modify the mask per-entry using a logical (in this case same as bitwise) OR operation. */ MASK_TYPE operator|=(const MASK_TYPE &); /** * Return whether all entries of the mask are one. */ bool isFull() const; /** * Return whether all entries of the mask are zero. */ bool isEmpty() const; /** * Return whether the mask is neither full nor empty. */ bool isMix() const; /** * Cast to bool operator. Returns the same as isFull(). * * \warning Be careful with the cast to bool. Often it is better to write explicitly whether you * want isFull or !isEmpty or something else. If you define VC_NO_AUTOMATIC_BOOL_FROM_MASK this * operator will be disabled, requiring you to explicitly reduce masks to bools. */ operator bool() const; /** * Read-only access to mask entries. * * \param i Determines the boolean to be accessed. * \return the \p i th entry of the mask as a bool. */ bool operator[](int i) const; /** * Return how many entries of the mask are set to one. */ int count() const; /** * Returns the index of the first one in the mask. */ int firstOne() const; /** * Convert mask to an integer. * * \return An int where each bit corresponds to the boolean value in the mask. * * E.g. a mask like [true, false, false, true] would result in a 9 (in binary: 1001). 
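 *
 * A minimal sketch of this in use (int_v/int_m are chosen for illustration; the four-entry
 * values shown assume an SSE build):
 * \code
 * int_v v = int_v::IndexesFromZero();   // [0, 1, 2, 3]
 * int_m m = (v < 1) || (v == 3);        // [true, false, false, true]
 * int bits = m.toInt();                 // bits == 9 (binary 1001)
 * \endcode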
*/ int toInt() const; Vc-0.7.4/doc/dox-common-ops.h000066400000000000000000000547431233512346000157030ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ /** * The type of the vector used for indexes in gather and scatter operations. */ typedef INDEX_TYPE IndexType; /** * The type of the entries in the vector. */ typedef ENTRY_TYPE EntryType; /** * The type of the mask used for masked operations and returned from comparisons. */ typedef MASK_TYPE Mask; enum { /** * The size of the vector. I.e. the number of scalar entries in the vector. Do not make any * assumptions about the size of vectors. If you need a vector of float vs. integer of the same * size make use of IndexType instead. Note that this still does not guarantee the same size * (e.g. double_v on SSE has two entries but there exists no 64 bit integer vector type in Vc - * which would have two entries; thus double_v::IndexType is uint_v). * * Also you can easily use if clauses that compare sizes. The * compiler can statically evaluate and fully optimize dead code away (very much like \#ifdef, * but with syntax checking). */ Size }; /** * Construct an uninitialized vector. */ VECTOR_TYPE(); /** * Construct a vector with the entries initialized to zero. * * \see Vc::Zero, Zero() */ VECTOR_TYPE(Vc::Zero); /** * Returns a vector with the entries initialized to zero. */ static VECTOR_TYPE Zero(); /** * Construct a vector with the entries initialized to one. * * \see Vc::One */ VECTOR_TYPE(Vc::One); /** * Returns a vector with the entries initialized to one. */ static VECTOR_TYPE One(); #ifdef INTEGER /** * Construct a vector with the entries initialized to 0, 1, 2, 3, 4, 5, ... * * \see Vc::IndexesFromZero, IndexesFromZero() */ VECTOR_TYPE(Vc::IndexesFromZero); /** * Returns a vector with the entries initialized to 0, 1, 2, 3, 4, 5, ... */ static VECTOR_TYPE IndexesFromZero(); #endif /** * Returns a vector with pseudo-random entries. * * Currently the state of the random number generator cannot be modified and starts off with the * same state. Thus you will get the same sequence of numbers for the same sequence of calls. * * \return a new random vector. Floating-point values will be in the 0-1 range. Integers will use * the full range the integer representation allows. * * \note This function may use a very small amount of state and thus will be a weak random number generator. */ static VECTOR_TYPE Random(); /** * Construct a vector loading its entries from \p alignedMemory. * * \param alignedMemory A pointer to data. The pointer must be aligned on a * Vc::VectorAlignment boundary. */ VECTOR_TYPE(ENTRY_TYPE *alignedMemory); /** * Convert from another vector type. */ template explicit VECTOR_TYPE(const OtherVector &); /** * Broadcast Constructor. * * Constructs a vector with all entries of the vector filled with the given value. * * \param x The scalar value to broadcast to all entries of the constructed vector. 
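 *
 * A short sketch of the broadcast in use (float_v chosen for illustration):
 * \code
 * float_v x(1.5f);      // x = [1.5, 1.5, 1.5, ...]
 * float_v y = x + 2.f;  // the scalar 2.f is broadcast to all entries the same way
 * \endcode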
* * \note If you want to set it to 0 or 1 use the special initializer constructors above. Calling * this constructor with 0 will cause a compilation error because the compiler cannot know which * constructor you meant. */ VECTOR_TYPE(ENTRY_TYPE x); /** * Construct a vector from an array of vectors with different Size. * * E.g. convert from two double_v to one float_v. * * \see expand */ //VECTOR_TYPE(const OtherVector *array); /** * Expand the values into an array of vectors that have a different Size. * * E.g. convert from one float_v to two double_v. * * This is the reverse of the above constructor. */ //void expand(OtherVector *array) const; /** * Load the vector entries from \p memory, overwriting the previous values. * * \param memory A pointer to data. * \param align Determines whether \p memory is an aligned pointer or not. * * \see Memory */ void load(const ENTRY_TYPE *memory, LoadStoreFlags align = Aligned); /** * Set all entries to zero. */ void setZero(); /** * Set all entries to zero where the mask is set. I.e. a 4-vector with a mask of 0111 would * set the last three entries to 0. * * \param mask Selects the entries to be set to zero. */ void setZero(const MASK_TYPE &mask); /** * Store the vector data to \p memory. * * \param memory A pointer to memory, where to store. * \param align Determines whether \p memory is an aligned pointer or not. * * \see Memory */ void store(EntryType *memory, LoadStoreFlags align = Aligned) const; /** * This operator can be used to modify scalar entries of the vector. * * \param index A value between 0 and Size. This value is not checked internally so you must make/be * sure it is in range. * * \return a reference to the vector entry at the given \p index. * * \warning This operator is known to miscompile with GCC 4.3.x. * \warning The use of this function may result in suboptimal performance. Please check whether you * can find a more vector-friendly way to do what you need. */ ENTRY_TYPE &operator[](int index); /** * This operator can be used to read scalar entries of the vector. * * \param index A value between 0 and Size. This value is not checked internally so you must make/be * sure it is in range. * * \return the vector entry at the given \p index. */ ENTRY_TYPE operator[](int index) const; /** * Writemask the vector before an assignment. * * \param mask The writemask to be used. * * \return an object that can be used for any kind of masked assignment. * * The returned object is only to be used for assignments and should not be assigned to a variable. * * Examples: * \code * float_v v = float_v::Zero(); // v = [0, 0, 0, 0] * int_v v2 = int_v::IndexesFromZero(); // v2 = [0, 1, 2, 3] * v(v2 < 2) = 1.f; // v = [1, 1, 0, 0] * v(v2 < 3) += 1.f; // v = [2, 2, 1, 0] * ++v2(v < 1.f); // v2 = [0, 1, 2, 4] * \endcode */ MaskedVector operator()(const MASK_TYPE &mask); /** * \name Gather and Scatter Functions * The gather and scatter functions allow you to easily use vectors with structured data and random * accesses. * * There are several variants: * \li random access in arrays (a[i]) * \li random access of members of structs in an array (a[i].member) * \li random access of members of members of structs in an array (a[i].member1.member2) * * All gather and scatter functions optionally take a mask as last argument. In that case only the * entries that are selected in the mask are read in memory and copied to the vector. This allows * you to have invalid indexes in the \p indexes vector if those are masked off in \p mask. 
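 *
 * A plain array gather/scatter, as a sketch (float_v/uint_v chosen for illustration; \p data
 * is a hypothetical array and all indexes are assumed to stay within it):
 * \code
 * void addOneAtIndexes(float *data, uint_v indexes)
 * {
 *     float_v v(data, indexes);   // gather: v[i] = data[indexes[i]]
 *     v += 1.f;
 *     v.scatter(data, indexes);   // scatter the updated values back
 * }
 * \endcode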
* * \note If you use a constructor for a masked gather then the unmodified entries of the vector are * initilized to 0 before the gather. If you really want them uninitialized you can create a * uninitialized vector object first and then call the masked gather function on it. * * The index type (IndexT) can either be a pointer to integers (array) or a vector of integers. * * Accessing values of a struct works like this: * \code * struct MyData { * float a; * int b; * }; * * void foo(MyData *data, uint_v indexes) { * const float_v v1(data, &MyData::a, indexes); * const int_v v2(data, &MyData::b, indexes); * v1.scatter(data, &MyData::a, indexes - float_v::Size); * v2.scatter(data, &MyData::b, indexes - 1); * } * \endcode * * \param array A pointer into memory (without alignment restrictions). * \param member1 If \p array points to a struct, \p member1 determines the member in the struct to * be read. Thus the offsets in \p indexes are relative to the \p array and not to * the size of the gathered type (i.e. array[i].*member1 is accessed instead of * (&(array->*member1))[i]) * \param member2 If \p member1 is a struct then \p member2 selects the member to be read from that * struct (i.e. array[i].*member1.*member2 is read). * \param indexes Determines the offsets into \p array where the values are gathered from/scattered * to. The type of indexes can either be an integer vector or a type that supports * operator[] access. * \param mask If a mask is given only the active entries will be gathered/scattered. */ //@{ /// gather constructor template VECTOR_TYPE(const ENTRY_TYPE *array, const IndexT indexes); /// masked gather constructor, initialized to zero template VECTOR_TYPE(const ENTRY_TYPE *array, const IndexT indexes, const MASK_TYPE &mask); /// gather template void gather(const ENTRY_TYPE *array, const IndexT indexes); /// masked gather template void gather(const ENTRY_TYPE *array, const IndexT indexes, const MASK_TYPE &mask); /// scatter template void scatter(ENTRY_TYPE *array, const IndexT indexes) const; /// masked scatter template void scatter(ENTRY_TYPE *array, const IndexT indexes, const MASK_TYPE &mask) const; ///////////////////////// /// struct member gather constructor template VECTOR_TYPE(const S1 *array, const ENTRY_TYPE S1::* member1, const IndexT indexes); /// masked struct member gather constructor, initialized to zero template VECTOR_TYPE(const S1 *array, const ENTRY_TYPE S1::* member1, const IndexT indexes, const MASK_TYPE &mask); /// struct member gather template void gather(const S1 *array, const ENTRY_TYPE S1::* member1, const IndexT indexes); /// masked struct member gather template void gather(const S1 *array, const ENTRY_TYPE S1::* member1, const IndexT indexes, const MASK_TYPE &mask); /// struct member scatter template void scatter(S1 *array, ENTRY_TYPE S1::* member1, const IndexT indexes) const ; /// masked struct member scatter template void scatter(S1 *array, ENTRY_TYPE S1::* member1, const IndexT indexes, const MASK_TYPE &mask) const ; ///////////////////////// /// struct member of struct member gather constructor template VECTOR_TYPE(const S1 *array, const S2 S1::* member1, const ENTRY_TYPE S2::* member2, const IndexT indexes); /// masked struct member of struct member gather constructor, initialized to zero template VECTOR_TYPE(const S1 *array, const S2 S1::* member1, const ENTRY_TYPE S2::* member2, const IndexT indexes, const MASK_TYPE &mask); /// struct member of struct member gather template void gather(const S1 *array, const S2 S1::* member1, const 
ENTRY_TYPE S2::* member2, const IndexT indexes); /// masked struct member of struct member gather template void gather(const S1 *array, const S2 S1::* member1, const ENTRY_TYPE S2::* member2, const IndexT indexes, const MASK_TYPE &mask); /// struct member of struct member scatter template void scatter(S1 *array, S2 S1::* member1, ENTRY_TYPE S2::* member2, const IndexT indexes) const ; /// maksed struct member of struct member scatter template void scatter(S1 *array, S2 S1::* member1, ENTRY_TYPE S2::* member2, const IndexT indexes, const MASK_TYPE &mask) const ; //@} /** * \name Comparisons * * All comparison operators return a mask object. * * \code * void foo(const float_v &a, const float_v &b) { * const float_m mask = a < b; * ... * } * \endcode * * \param x The vector to compare with. */ //@{ /// Returns mask that is \c true where vector entries are equal and \c false otherwise. MASK_TYPE operator==(const VECTOR_TYPE &x) const; /// Returns mask that is \c true where vector entries are not equal and \c false otherwise. MASK_TYPE operator!=(const VECTOR_TYPE &x) const; /// Returns mask that is \c true where the left vector entries are greater than on the right and \c false otherwise. MASK_TYPE operator> (const VECTOR_TYPE &x) const; /// Returns mask that is \c true where the left vector entries are greater than on the right or equal and \c false otherwise. MASK_TYPE operator>=(const VECTOR_TYPE &x) const; /// Returns mask that is \c true where the left vector entries are less than on the right and \c false otherwise. MASK_TYPE operator< (const VECTOR_TYPE &x) const; /// Returns mask that is \c true where the left vector entries are less than on the right or equal and \c false otherwise. MASK_TYPE operator<=(const VECTOR_TYPE &x) const; //@} /** * \name Arithmetic Operations * * The vector classes implement all the arithmetic and (bitwise) logical operations as you know from * builtin types. * * \code * void foo(const float_v &a, const float_v &b) { * const float_v product = a * b; * const float_v difference = a - b; * } * \endcode */ //@{ /// Returns a new vector with the sum of the respective entries of the left and right vector. VECTOR_TYPE operator+(VECTOR_TYPE x) const; /// Adds the respective entries of \p x to this vector. VECTOR_TYPE &operator+=(VECTOR_TYPE x); /// Returns a new vector with the difference of the respective entries of the left and right vector. VECTOR_TYPE operator-(VECTOR_TYPE x) const; /// Subtracts the respective entries of \p x from this vector. VECTOR_TYPE &operator-=(VECTOR_TYPE x); /// Returns a new vector with the product of the respective entries of the left and right vector. VECTOR_TYPE operator*(VECTOR_TYPE x) const; /// Multiplies the respective entries of \p x from to vector. VECTOR_TYPE &operator*=(VECTOR_TYPE x); /// Returns a new vector with the quotient of the respective entries of the left and right vector. VECTOR_TYPE operator/(VECTOR_TYPE x) const; /// Divides the respective entries of this vector by \p x. VECTOR_TYPE &operator/=(VECTOR_TYPE x); /// Returns a new vector with all entries negated. VECTOR_TYPE operator-() const; /// Returns a new vector with the binary or of the respective entries of the left and right vector. VECTOR_TYPE operator|(VECTOR_TYPE x) const; /// Returns a new vector with the binary and of the respective entries of the left and right vector. VECTOR_TYPE operator&(VECTOR_TYPE x) const; /// Returns a new vector with the binary xor of the respective entries of the left and right vector. 
VECTOR_TYPE operator^(VECTOR_TYPE x) const; #ifdef VECTOR_TYPE_HAS_SHIFTS /// Returns a new vector with each entry bitshifted to the left by \p x bits. VECTOR_TYPE operator<<(int x) const; /// Bitshift each entry to the left by \p x bits. VECTOR_TYPE &operator<<=(int x); /// Returns a new vector with each entry bitshifted to the right by \p x bits. VECTOR_TYPE operator>>(int x) const; /// Bitshift each entry to the right by \p x bits. VECTOR_TYPE &operator>>=(int x); /// Returns a new vector with each entry bitshifted to the left by \p x[i] bits. VECTOR_TYPE operator<<(VECTOR_TYPE x) const; /// Bitshift each entry to the left by \p x[i] bits. VECTOR_TYPE &operator<<=(VECTOR_TYPE x); /// Returns a new vector with each entry bitshifted to the right by \p x[i] bits. VECTOR_TYPE operator>>(VECTOR_TYPE x) const; /// Bitshift each entry to the right by \p x[i] bits. VECTOR_TYPE &operator>>=(VECTOR_TYPE x); #endif /** * Multiplies this vector with \p factor and then adds \p summand, without rounding between the * multiplication and the addition. * * \param factor The multiplication factor. * \param summand The summand that will be added after multiplication. * * \note This operation may have explicit hardware support, in which case it is normally faster to * use the FMA instead of separate multiply and add instructions. * \note If the target hardware does not have FMA support this function will be considerably slower * than a normal a * b + c. This is due to the increased precision fusedMultiplyAdd provides. */ void fusedMultiplyAdd(VECTOR_TYPE factor, VECTOR_TYPE summand); //@} /** * \name Horizontal Reduction Operations * * There are four horizontal operations available to reduce the values of a vector to a scalar * value. * * \code * void foo(const float_v &v) { * float min = v.min(); // smallest value in v * float sum = v.sum(); // sum of all values in v * } * \endcode */ //@{ /// Returns the smallest entry in the vector. ENTRY_TYPE min() const; /// Returns the largest entry in the vector. ENTRY_TYPE max() const; /// Returns the product of all entries in the vector. ENTRY_TYPE product() const; /// Returns the sum of all entries in the vector. ENTRY_TYPE sum() const; //@} /** * \name Apply/Call/Fill Functions * * There are still many situations where the code needs to switch from SIMD operations to scalar * execution. In this case you can, of course rely on operator[]. But there are also a number of * functions that can help with common patterns. * * The apply functions expect a function that returns a scalar value, i.e. a function of the form "T f(T)". * The call functions do not return a value and thus the function passed does not need a return * value. The fill functions are used to serially set the entries of the vector from the return * values of a function. * * Example: * \code * void foo(float_v v) { * float_v logarithm = v.apply(std::log); * float_v exponential = v.apply(std::exp); * } * \endcode * * Of course, with C++11, you can also use lambdas here: * \code * float_v power = v.apply([](float f) { return std::pow(f, 0.6f); }) * \endcode * * \param f A functor: this can either be a function or an object that implements operator(). */ //@{ /// Return a new vector where each entry is the return value of \p f called on the current value. template VECTOR_TYPE apply(Functor &f) const; /// Const overload of the above function. template VECTOR_TYPE apply(const Functor &f) const; /// As above, but skip the entries where \p mask is not set. 
template VECTOR_TYPE apply(Functor &f, MASK_TYPE mask) const; /// Const overload of the above function. template VECTOR_TYPE apply(const Functor &f, MASK_TYPE mask) const; /// Call \p f with the scalar entries of the vector. template void call(Functor &f) const; /// Const overload of the above function. template void call(const Functor &f) const; /// As above, but skip the entries where \p mask is not set. template void call(Functor &f, MASK_TYPE mask) const; /// Const overload of the above function. template void call(const Functor &f, MASK_TYPE mask) const; /// Fill the vector with the values [f(), f(), f(), ...]. void fill(ENTRY_TYPE (&f)()); /// Fill the vector with the values [f(0), f(1), f(2), ...]. template void fill(ENTRY_TYPE (&f)(IndexT)); //@} /** * \name Swizzles * * Swizzles are a special form of shuffles that, depending on the target hardware and swizzle type, * may be used without extra cost. The swizzles act on every successive four entries in the vector. * Thus the swizzle \verbatim [0, 1, 2, 3, 4, 5, 6, 7].dcba() \endverbatim results in * \verbatim [3, 2, 1, 0, 7, 6, 5, 4] \endverbatim. * * This implies a portability issue. The swizzles can only work on vectors where Size is a * multiple of four. * On Vc::Scalar all swizzles are implemented as no-ops. If a swizzle is used on a vector of Size == * 2 compilation will fail. */ //@{ /// Identity. const VECTOR_TYPE abcd() const; /// Permute pairs. const VECTOR_TYPE badc() const; /// Permute pairs of two / Rotate twice. const VECTOR_TYPE cdab() const; /// Broadcast a. const VECTOR_TYPE aaaa() const; /// Broadcast b. const VECTOR_TYPE bbbb() const; /// Broadcast c. const VECTOR_TYPE cccc() const; /// Broadcast d. const VECTOR_TYPE dddd() const; /// Rotate three: cross-product swizzle. const VECTOR_TYPE bcad() const; /// Rotate left. const VECTOR_TYPE bcda() const; /// Rotate right. const VECTOR_TYPE dabc() const; /// Permute inner pair. const VECTOR_TYPE acbd() const; /// Permute outer pair. const VECTOR_TYPE dbca() const; /// Reverse. const VECTOR_TYPE dcba() const; //@} /** * \name Shift and Rotate * * These functions allow to shift or rotate the entries in a vector by the given \p amount. Both * functions support positive and negative numbers for the shift/rotate value. * * Example: * \code * using namespace Vc; * int_v foo = int_v::IndexesFromZero() + 1; // e.g. [1, 2, 3, 4] with SSE * int_v x; * x = foo.shifted( 1); // [2, 3, 4, 0] * x = foo.shifted( 2); // [3, 4, 0, 0] * x = foo.shifted( 3); // [4, 0, 0, 0] * x = foo.shifted( 4); // [0, 0, 0, 0] * x = foo.shifted(-1); // [0, 1, 2, 3] * x = foo.shifted(-2); // [0, 0, 1, 2] * x = foo.shifted(-3); // [0, 0, 0, 1] * x = foo.shifted(-4); // [0, 0, 0, 0] * * x = foo.rotated( 1); // [2, 3, 4, 1] * x = foo.rotated( 2); // [3, 4, 1, 2] * x = foo.rotated( 3); // [4, 1, 2, 3] * x = foo.rotated( 4); // [1, 2, 3, 4] * x = foo.rotated(-1); // [4, 1, 2, 3] * x = foo.rotated(-2); // [3, 4, 1, 2] * x = foo.rotated(-3); // [2, 3, 4, 1] * x = foo.rotated(-4); // [1, 2, 3, 4] * \endcode * * These functions are slightly related to the above swizzles. In any case, they are often useful for * communication between SIMD lanes or binary decoding operations. */ //@{ /// Shift vector entries to the left by \p amount; shifting in zeros. const VECTOR_TYPE shifted(int amount) const; /// Rotate vector entries to the left by \p amount. const VECTOR_TYPE rotated(int amount) const; //@} /** * Return a sorted copy of the vector. * * \return A sorted vector. 
The returned values are in ascending order: \verbatim v[0] <= v[1] <= v[2] <= v[3] ... \endverbatim * * Example: * \code * int_v v = int_v::Random(); * int_v s = v.sorted(); * std::cout << v << '\n' << s << '\n'; * \endcode * * With SSE the output would be: * \verbatim [1513634383, -963914658, 1763536262, -1285037745] [-1285037745, -963914658, 1513634383, 1763536262] \endverbatim * * With the Scalar implementation: \verbatim [1513634383] [1513634383] \endverbatim */ VECTOR_TYPE sorted() const; Vc-0.7.4/doc/dox-math.h000066400000000000000000000144541233512346000145400ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ /** * \ingroup Math * * Returns the square root of \p v. */ VECTOR_TYPE sqrt(const VECTOR_TYPE &v); /** * \ingroup Math * * Returns the reciprocal square root of \p v. */ VECTOR_TYPE rsqrt(const VECTOR_TYPE &v); /** * \ingroup Math * * Returns the reciprocal of \p v. */ VECTOR_TYPE reciprocal(const VECTOR_TYPE &v); /** * \ingroup Math * * Returns the absolute value of \p v. */ VECTOR_TYPE abs(const VECTOR_TYPE &v); /** * \ingroup Math * * Returns the closest integer to \p v; 0.5 is rounded to even. */ VECTOR_TYPE round(const VECTOR_TYPE &v); /** * \ingroup Math * * Returns the natural logarithm of \p v. * * \note The single-precision implementation has an error of max. 1 ulp (mean 0.020 ulp) in the range ]0, 1000] (including denormals). * \note The double-precision implementation has an error of max. 1 ulp (mean 0.020 ulp) in the range ]0, 1000] (including denormals). */ VECTOR_TYPE log(const VECTOR_TYPE &v); /** * \ingroup Math * * Returns the base-2 logarithm of \p v. * * \note The single-precision implementation has an error of max. 1 ulp (mean 0.016 ulp) in the range ]0, 1000] (including denormals). * \note The double-precision implementation has an error of max. 1 ulp (mean 0.016 ulp) in the range ]0, 1000] (including denormals). */ VECTOR_TYPE log2(const VECTOR_TYPE &v); /** * \ingroup Math * * Returns the base-10 logarithm of \p v. * * \note The single-precision implementation has an error of max. 2 ulp (mean 0.31 ulp) in the range ]0, 1000] (including denormals). * \note The double-precision implementation has an error of max. 2 ulp (mean 0.26 ulp) in the range ]0, 1000] (including denormals). */ VECTOR_TYPE log10(const VECTOR_TYPE &v); /** * \ingroup Math * * Returns the exponential of \p v. */ VECTOR_TYPE exp(const VECTOR_TYPE &v); /** * \ingroup Math * * Returns the sine of \p v. * * \note The single-precision implementation has an error of max. 2 ulp (mean 0.17 ulp) in the range [-8192, 8192]. * \note The double-precision implementation has an error of max. 8e6 ulp (mean 1040 ulp) in the range [-8192, 8192]. * \note Vc versions before 0.7 had much larger errors. */ VECTOR_TYPE sin(const VECTOR_TYPE &v); /** * \ingroup Math * * Returns the cosine of \p v. * * \note The single-precision implementation has an error of max. 
2 ulp (mean 0.18 ulp) in the range [-8192, 8192]. * \note The double-precision implementation has an error of max. 8e6 ulp (mean 1160 ulp) in the range [-8192, 8192]. * \note Vc versions before 0.7 had much larger errors. */ VECTOR_TYPE cos(const VECTOR_TYPE &v); /** * \ingroup Math * * Calculates the sine and cosine of \p v. * The values are returned in the \p sin and \p cos parameters. * * \param[in] v input value to sin and cos * \param[out] sin Needs to be a non-null pointer which will be set to the sine of \p v. * \param[out] cos Needs to be a non-null pointer which will be set to the cosine of \p v. * * \note The single-precision implementation has an error of max. 2 ulp (mean 0.18 ulp) in the range [-8192, 8192]. * \note The double-precision implementation has an error of max. 8e6 ulp (mean 1160 ulp) in the range [-8192, 8192]. * \note Vc versions before 0.7 had much larger errors. */ void sincos(const VECTOR_TYPE &v, VECTOR_TYPE *sin, VECTOR_TYPE *cos); /** * \ingroup Math * * Returns the arcsine of \p v. * * \note The single-precision implementation has an error of max. 2 ulp (mean 0.3 ulp). * \note The double-precision implementation has an error of max. 36 ulp (mean 0.4 ulp). */ VECTOR_TYPE asin(const VECTOR_TYPE &v); /** * \ingroup Math * * Returns the arctangent of \p v. * \note The single-precision implementation has an error of max. 3 ulp (mean 0.4 ulp) in the range [-8192, 8192]. * \note The double-precision implementation has an error of max. 2 ulp (mean 0.1 ulp) in the range [-8192, 8192]. */ VECTOR_TYPE atan(const VECTOR_TYPE &v); /** * \ingroup Math * * Returns the arctangent of \p x / \p y. */ VECTOR_TYPE atan2(const VECTOR_TYPE &x, const VECTOR_TYPE &y); /** * \ingroup Math * * Returns the minimum of \p x and \p y. */ VECTOR_TYPE min(const VECTOR_TYPE &x, const VECTOR_TYPE &y); /** * \ingroup Math * * Returns the maximum of \p x and \p y. */ VECTOR_TYPE max(const VECTOR_TYPE &x, const VECTOR_TYPE &y); /** * \ingroup Math * * Convert floating-point number to fractional and integral components. * * \param x value to be split into normalized fraction and exponent * \param e the exponent to base 2 of \p x * * \returns the normalized fraction. If \p x is non-zero, the return value is \p x times a power of two, and * its absolute value is always in the range [0.5,1). * * \returns * If \p x is zero, then the normalized fraction is zero and zero is stored in \p e. * * \returns * If \p x is a NaN, a NaN is returned, and the value of \p *e is unspecified. * * \returns * If \p x is positive infinity (negative infinity), positive infinity (nega‐ * tive infinity) is returned, and the value of \p *e is unspecified. */ VECTOR_TYPE frexp(const VECTOR_TYPE &x, EXPONENT_TYPE *e); /** * \ingroup Math * * Multiply floating-point number by integral power of 2 * * \param x value to be multiplied by 2 ^ \p e * \param e exponent * * \returns \p x * 2 ^ \p e */ VECTOR_TYPE ldexp(VECTOR_TYPE x, EXPONENT_TYPE e); /** * \ingroup Math * * Returns a mask that tells whether the values in the vector are finite (i.e.\ not NaN or +/-inf). */ MASK_TYPE isfinite(const VECTOR_TYPE &x); /** * \ingroup Math * * Returns a mask that tells whether the values in the vector are NaN. */ MASK_TYPE isnan(const VECTOR_TYPE &x); Vc-0.7.4/doc/dox-real-ops.h000066400000000000000000000050421233512346000153220ustar00rootroot00000000000000/* This file is part of the Vc library. 
Copyright (C) 2011-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ /** * Copies the sign of \p reference. * * \param reference This values sign bit will be transferred. * * \return a value where the sign of the value equals the sign of \p reference. I.e. * sign(v.copySign(r)) == sign(r). */ inline VECTOR_TYPE copySign(VECTOR_TYPE reference) const; /** * Extracts the exponent. * * \return the exponent to base 2. * * This function provides efficient access to the exponent of the floating point number. The * returned value is a fast approximation to the logarithm of base 2. The absolute error of that * approximation is between [0, 1[. * * Examples: \verbatim value | exponent | log2 =======|==========|======= 1.0 | 0 | 0 2.0 | 1 | 1 3.0 | 1 | 1.585 3.9 | 1 | 1.963 4.0 | 2 | 2 4.1 | 2 | 2.036 \endverbatim * * \warning This function assumes a positive value (non-zero). If the value is negative the sign bit will * modify the returned value. An input value of zero will return the bias of the floating-point * representation. If you compile with Vc runtime checks, the function will assert * values greater than or equal to zero. * * You may use abs to apply this function to negative values: * \code * abs(v).exponent() * \endcode */ inline VECTOR_TYPE exponent() const; /** * Check the sign bit of each vector entry. * * \return whether the sign bit is set. * * This function is especially useful to distinguish negative zero. * \code * float_v z = float_v::Zero(); // z.isNegative() will be m[0000], z < float_v::Zero() will be m[0000] * float_v nz = -0.f; // nz.isNegative() will be m[1111], nz < float_v::Zero() will be m[0000] * float_v n = -1.f; // n.isNegative() will be m[1111], n < float_v::Zero() will be m[1111] * \endcode */ inline MASK_TYPE isNegative() const; Vc-0.7.4/doc/dox.h000066400000000000000000001200571233512346000136060ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ /** * \mainpage * \image html logo.png * * The %Vc library is a collection of SIMD vector classes with existing implementations for SSE, AVX, * and a scalar fallback. An implementation for the Intel Xeon Phi is expected to be ready for %Vc * 0.8. 
* * \section background Background information and learning material * \li \ref intro * \li \ref portability * \li \ref featuremacros * \li \ref buildsystem * \li \ref examples * * \section apidox API documentation * \li \ref Vectors * \li \ref Masks * \li \ref Utilities * \li \ref Math * * Per default, code compiled against the %Vc headers will use the instruction set that the compiler * says is available. For example compiling with "g++ -mssse3" will enable compilation against the * SSE implementation using SSE the instruction sets SSE, SSE2, SSE3 and SSSE3. If you want to force * compilation against a specific implementation of the vector classes you can set the macro * VC_IMPL to either "Scalar", "SSE", "SSE2", "SSE3", "SSSE3", "SSE4_1", "SSE4_2", or "AVX". * You may additionally append "+XOP", "+FMA4", "+SSE4a", "+F16C", and "+POPCNT", e.g. "-D VC_IMPL=SSE+XOP+FMA4" * Setting VC_IMPL to * "SSE" will force the SSE instruction set, but lets the headers figure out the version to use or, * if that fails, uses SSE4.1. * After you include a %Vc header, you will have the following macros available, which you can (but * normally should not) use to determine the implementation %Vc uses: * \li \c VC_IMPL_Scalar * \li \c VC_IMPL_SSE (shorthand for SSE2 || SSE3 || SSSE3 || SSE4_1. SSE1 alone is not supported.) * \li \c VC_IMPL_SSE2 * \li \c VC_IMPL_SSE3 * \li \c VC_IMPL_SSSE3 * \li \c VC_IMPL_SSE4_1 * \li \c VC_IMPL_SSE4_2 * \li \c VC_IMPL_AVX * * Another set of macros you may use for target specific implementations are the \c VC_*_V_SIZE * macros: \ref Utilities */ /** * \page intro Introduction * * If you are new to vectorization please read this following part and make sure you understand it: * \li Forget what you learned about vectors in math classes. SIMD vectors are a different concept! * \li Forget about containers that also go by the name of a vector. SIMD vectors are a different concept! * \li A vector is defined by the hardware as a special register which is wider than required for a * single value. Thus multiple values fit into one register. The width of this register and the * size of the scalar data type in use determine the number of entries in the vector. * Therefore this number is an unchangeable property of the hardware and not a variable in the * %Vc API. * \li Note that hardware is free to use different vector register widths for different data types. * For example AVX has instructions for 256-bit floating point registers, but only 128-bit integer * instructions. * * \par Example 1: * * You can modify a function to use vector types and thus implement a horizontal vectorization. The * original scalar function could look like this: * \code * void normalize(float &x, float &y, float &z) * { * const float d = std::sqrt(x * x + y * y + z * z); * x /= d; * y /= d; * z /= d; * } * \endcode * To vectorize the \c normalize function with %Vc, the types must be substituted by their %Vc counterparts and math functions * must use the %Vc implementation (which is, per default, also imported into \c std namespace): * \code * using Vc::float_v; * * void normalize(float_v &x, float_v &y, float_v &z) * { * const float_v d = Vc::sqrt(x * x + y * y + z * z); * x /= d; * y /= d; * z /= d; * } * \endcode * The latter function is able to normalize four 3D vectors when compiled for SSE in the same * time the former function normalizes one 3D vector. 
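 *
 * As a sketch of how such a vectorized function can be driven from plain arrays (the caller below
 * is an assumed illustration, not part of the example; it requires \c n to be a multiple of
 * float_v::Size and suitably aligned pointers):
 * \code
 * void normalizeArrays(float *x, float *y, float *z, size_t n)
 * {
 *     using Vc::float_v;
 *     for (size_t i = 0; i < n; i += float_v::Size) {
 *         float_v vx, vy, vz;
 *         vx.load(&x[i]); vy.load(&y[i]); vz.load(&z[i]); // aligned loads of Size values each
 *         normalize(vx, vy, vz);                          // the vectorized function from above
 *         vx.store(&x[i]); vy.store(&y[i]); vz.store(&z[i]);
 *     }
 * }
 * \endcode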
* * For completeness, note that you can optimize the division in the normalize function further: * \code * const float_v d_inv = float_v::One() / Vc::sqrt(x * x + y * y + z * z); * const float_v d_inv = Vc::rsqrt(x * x + y * y + z * z); // less accurate, but faster * \endcode * Then you can multiply \c x, \c y, and \c z with \c d_inv, which is considerably faster than three * divisions. * * As you can probably see, the new challenge with %Vc is the use of good data-structures which * support horizontal vectorization. Depending on your problem at hand this may become the main * focus of design (it does not have to be, though). * * \section intro_alignment Alignment * * \subsection intro_alignment_background What is Alignment * * If you do not know what alignment is, and why it is important, read on, otherwise skip to \ref * intro_alignment_tools. Normally the alignment of data is an implementation detail left to the * compiler. Until C++11, the language did not even have any (official) means to query or modify * alignment. * * Most data types require more than one Byte for storage. Thus, even most atomic data types span * several locations in memory. E.g. if you have a pointer to \c float, the address stored in this * pointer just determines the first of four Bytes of the \c float. Naively, one could think that * any address (which belongs to the process) can be used to store such a float. While this is true * for some architectures, some architectures may terminate the process when a misaligned pointer is * dereferenced. The natural alignment for atomic data types typically is the same as their size. * Thus the address of a \c float object should always be a multiple of 4 Bytes. * * Alignment becomes more important for SIMD data types. * 1. There are different instructions to load/store aligned and unaligned vectors. The unaligned * load/stores recently were greatly improved in x86 CPUs. Still, the rule of thumb * says that aligned loads/stores are faster. * 2. Access to an unaligned vector with an instruction that expects an aligned vector crashes the * application. Once you write vectorized code you might want to make it a habit to check crashes * for unaligned addresses. * 3. Memory allocation on the heap will return addresses aligned to some system specific alignment * rule. E.g. Linux 32bit aligns on 8 Bytes, while Linux 64bit aligns on 16 Bytes. Both alignments * are not strict enough for AVX vectors. Worse, if you develop on Linux 64bit with SSE you won't * notice any problems until you switch to a 32bit build or AVX. * 4. Placement on the stack is determined at compile time and requires the compiler to know the * alignment restrictions of the type. * 5. The size of a cache line is just two or four times larger than the SIMD types (if not equal). * Thus, if you load several vectors consecutively from memory every fourth, second, or even every * load will have to be read from two different cache lines. This is called a cache line split. They * lead to degraded performance, which becomes very noticeable for memory intensive code. * * \subsection intro_alignment_tools Tools * * %Vc provides several classes and functions to get alignment right. * \li Vc::VectorAlignment is a compile time constant that equals the largest alignment restriction * (in Bytes) for the selected target architecture. * \li Vc::VectorAlignedBase and Vc::VectorAlignedBaseT are helper classes that use compiler * specific extensions to annotate the alignment restrictions for vector types. 
* Additionally they reimplement \c new and \c delete to return correctly aligned * pointers to the heap. * \li Vc::malloc and Vc::free are meant as replacements for \c malloc and \c free. They can be used * to allocate any type of memory with an abstract alignment restriction: \ref * Vc::MallocAlignment. Note, that (like \c malloc) the memory is only allocated * and not initialized. If you allocate memory for a type that has a constructor, * use the placement new syntax to initialize the memory. * \li Vc::Allocator is an STL compatible allocator class that behaves as specified in the C++ * specification, implementing the optional support for over-aligned types. * Therefore, memory addresses returned from this allocator will always be * aligned to at least the constraints attached to the type \c T. STL containers * will already default to Vc::Allocator for Vc::Vector. For all other * composite types you want to use, you can take the \ref VC_DECLARE_ALLOCATOR * convenience macro to set is as default. * \li Vc::Memory, Vc::Memory, Vc::Memory * The three different variants of the memory class can be used like a more * convenient C-array. It supports two-dimensional statically sized arrays and * one-dimensional statically and dynamically sized arrays. The memory can be * accessed easily via aligned vectors, but also via unaligned vectors or * gathers/scatters. */ /** * \page portability Portability Issues * * One of the major goals of %Vc is to ease development of portable code, while achieving highest * possible performance that requires target architecture specific instructions. This is possible * through having just a single type use different implementations of the same API depending on the * target architecture. Many of the details of the target architecture are often dependent on the * compiler flags that were used. Also there can be subtle differences between the implementations * that could lead to problems. This page aims to document all issues you might need to know about. * * \par Compiler Flags * * \li \e GCC: The compiler should be called with the -march=\ flag. Take a look at the GCC * manpage to find all possibilities for \. Additionally it is best to also add the -msse2 * -msse3 ... -mavx flags. If no SIMD instructions are enabled via compiler flags, %Vc must fall back * to the scalar implementation. * \li \e Clang: The same as for GCC applies. * \li \e ICC: Same as GCC, but the flags are called -xAVX -xSSE4.2 -xSSE4.1 -xSSSE3 -xSSE3 -xSSE2. * \li \e MSVC: On 32bit you can add the /arch:SSE2 flag. That's about all the MSVC documentation * says. Still the MSVC compiler knows about the newer instructions in SSE3 and upwards. How you can * determine what CPUs will be supported by the resulting binary is unclear. * * \par Where does the final executable run? * * You must be aware of the fact that a binary that is built for a given SIMD hardware may not run * on a processor that does not have these instructions. The executable will work fine as long as no * such instruction is actually executed and only crash at the place where such an instruction is * used. Thus it is better to check at application start whether the compiled in SIMD hardware is * really supported on the executing CPU. This can be determined with the * currentImplementationSupported function. 
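 *
 * A minimal sketch of such a startup check (the error handling shown is only an assumption):
 * \code
 * int main()
 * {
 *     if (!Vc::currentImplementationSupported()) {
 *         std::cerr << "This binary requires a CPU that supports the compiled-in SIMD instruction set.\n";
 *         return 1;
 *     }
 *     // ... the rest of the program can safely use the Vc vector types
 * }
 * \endcode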
* * If you want to distribute a binary that runs correctly on many different systems you either must * restrict it to the least common denominator (which often is SSE2), or you must compile the code * several times, with the different target architecture compiler options. A simple way to combine * the resulting executables would be via a wrapping script/executable that determines the correct * executable to use. A more sophisticated option is the use of the ifunc attribute GCC provides. * Other compilers might provide similar functionality. * * \par Guarantees * * It is guaranteed that: * \li \code int_v::Size == uint_v::Size == float_v::Size \endcode * \li \code short_v::Size == ushort_v::Size == sfloat_v::Size \endcode * * \par Important Differences between Implementations * * \li Obviously the number of entries in a vector depends on the target architecture. * \li Because of the guarantees above, sfloat_v does not necessarily map to a single SIMD register * and thus there could be a higher register pressure when this type is used. * \li Hardware that does not support 16-Bit integer vectors can implement the short_v and ushort_v * API via 32-Bit integer vectors. Thus, some of the overflow behavior might be slightly different, * and truncation will only happen when the vector is stored to memory. * * \section portability_compilerquirks Compiler Quirks * * Since SIMD is not part of the C/C++ language standards %Vc abstracts more or less standardized * compiler extensions. Sadly, not every issue can be transparently abstracted. * Therefore this will be the place where differences are documented: * \li MSVC is incapable of parameter passing by value, if the type has alignment restrictions. The * consequence is that all %Vc vector types and any type derived from Vc::VectorAlignedBase cannot be * used as function parameters, unless a pointer is used (this includes reference and * const-reference). So \code * void foo(Vc::float_v) {}\endcode does not compile, while \code * void foo(Vc::float_v &) {} * void foo(const Vc::float_v &) {} * void foo(Vc::float_v *) {} * \endcode all work. * Normally you should prefer passing by value since a sane compiler will then pass the data in a * register and does not have to store/load the data to/from the stack. %Vc defines \c * VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN for such cases. Also the %Vc vector types contain a composite * typedef \c AsArg which resolves to either const-ref or const-by-value. Thus, you can always use * \code void foo(Vc::float_v::AsArg) {}\endcode. */ /** * \page featuremacros Feature Macros * * The following macros are available to enable/disable selected features: * * \par VC_NO_STD_FUNCTIONS * * If this macro is defined, the %Vc math functions are * not imported into the \c std namespace. They are still available in the %Vc namespace. * * \par VC_CLEAN_NAMESPACE * * If this macro is defined, any symbol or macro that does not have a %Vc * prefix will be disabled. * * \par VC_NO_AUTOMATIC_BOOL_FROM_MASK * * Define this macro to disable automatic conversion from %Vc * mask types to bool. The automatic conversion corresponds to the isFull() function. By disabling * the automatic conversion you can find places where the implicit isFull() conversion is not the * correct reduction. * * \par VC_NO_VERSION_CHECK * * Define this macro to disable the safety check for the libVc version. * The check generates a small check for every object file, which is called at startup, i.e. before * the main function. 
* * \par VC_CHECK_ALIGNMENT * * If this macro is defined %Vc will assert correct alignment for all * objects that require correct alignment. This can be very useful to debug crashes resulting * from misaligned memory accesses. This check will introduce a significant overhead. */ /** * \page buildsystem Build System * * %Vc uses CMake as its buildsystem. It also provides much of the CMake logic it * uses for itself for other projects that use CMake and %Vc. Here's an (incomplete) list of features * you can get from the CMake scripts provided with %Vc: * \li check for a required %Vc version * \li locate libVc and %Vc includes * \li compiler flags to workaround %Vc related quirks/bugs in specific compilers * \li compiler flags to enable/disable SIMD instruction sets, defaulting to full support for the * host system * * \section buildsystem_variables CMake Variables * * To make use of these features simply copy the FindVc.cmake as installed by %Vc to your project. * Add \code find_package(Vc [version] [REQUIRED]) \endcode to your CMakeLists.txt. After that you * can use the following variables: * \li \e Vc_FOUND: tells whether the package was found * \li \e Vc_INCLUDE_DIR: you must add this to your include directories for the targets that you * want to compile against %Vc: \code include_directories(${Vc_INCLUDE_DIR}) \endcode * \li \e Vc_DEFINITIONS: recommended compiler flags. You can use them via add_definitions or the * COMPILE_FLAGS property. * * The following variables might be of interest, too: * \li \e Vc_SSE_INTRINSICS_BROKEN * \li \e Vc_AVX_INTRINSICS_BROKEN * \li \e Vc_XOP_INTRINSICS_BROKEN * \li \e Vc_FMA4_INTRINSICS_BROKEN * * \section buildsystem_macros CMake Macros * * The macro vc_compile_for_all_implementations is provided to help with compiling a given source * file multiple times with all different possible SIMD targets for the given architecture. * Example: \verbatim vc_compile_for_all_implementations(objs src/trigonometric.cpp FLAGS -DSOME_FLAG EXCLUDE Scalar SSE2) \endverbatim * You can specify an arbitrary number of additional compiler flags after the FLAGS argument. These * flags will be used for all compiler calls. After an optional EXCLUDE argument you can specify targets * that you want to exclude. After an optional ONLY argument you can specify targets that you want * to compile for. (So either you exclude some, or you explicitly list the targets you want.) * * Often it suffices to have SSE2 or SSE3 as the least common denominator and provide SSE4_1 and * AVX. Here is the currently complete list of possible targets the macro will compile for: * \li Scalar * \li SSE2 * \li SSE3 * \li SSSE3 * \li SSE4_1 * \li SSE4_2 * \li SSE3+SSE4a * \li SSE+XOP+FMA4 * \li AVX * \li AVX+XOP+FMA4 * * \section buildsystem_other Using Vc without CMake * * If your project does not use CMake all you need to do is the following: * \li Find the header file "Vc/Vc" and add its path to your include paths. * \li Find the library libVc and link to it. * \li Ensure you use the right compiler flags to enable the relevant SIMD instructions. */ /** * \defgroup Vectors Vectors * * The vector classes abstract the SIMD registers and their according instructions into types that * feel very familiar to C++ developers. * * Note that the documented types Vc::float_v, Vc::double_v, Vc::int_v, Vc::uint_v, Vc::short_v, * and Vc::ushort_v are actually \c typedefs of the \c Vc::Vector class: * \code * namespace Vc { * template class Vector; * typedef Vector double_v; * typedef Vector float_v; * // ... 
* } * \endcode * * \par Some general information on using the vector classes: * * Generally you can always mix scalar values with vectors as %Vc will automatically broadcast the * scalar to a vector and then execute a vector operation. But, in order to ensure that implicit * type conversions only happen as defined by the C standard, there is only a very strict implicit * scalar to vector constructor: * \code * int_v a = 1; // good: int_v(int) * uint_v b = 1u; // good: uint_v(unsigned int) * uint_v c = 1; // does not compile: uint_v(int) * float_v d = 1; // does not compile: float_v(int) * float_v e = 1.; // does not compile: float_v(double) * float_v f = 1.f; // good: float_v(float) * \endcode * * The following ways of initializing a vector are not allowed: * \code * int_v v(3, 2, 8, 0); // constructor does not exist because it is not portable * int_v v; * v[0] = 3; v[1] = 2; v[2] = 8; v[3] = 0; // do not hardcode the number of entries! * // You can not know whether somebody will compile with %Vc Scalar where int_v::Size == 1 * \endcode * * Instead, if really necessary you can do: * \code * Vc::int_v v; * for (int i = 0; i < int_v::Size; ++i) { * v[i] = f(i); * } * // which is equivalent to: * v.fill(f); * // or: * v = int_v::IndexesFromZero().apply(f); * \endcode */ /** * \defgroup Masks Masks * * Mask classes are abstractions for the results of vector comparisons. The actual implementation * differs depending on the SIMD instruction set. On SSE they contain a full 128-bit datatype while * on a different architecture they might be bit-fields. */ /** * \defgroup Utilities Utilities * * Additional classes, macros, and functions that help to work more easily with the main vector * types. */ /** * \defgroup Math Math * * Functions that implement math functions. Take care that some of the implementations will return * results with less precision than what the FPU calculates. */ /** * \brief Vector Classes Namespace * * All functions and types of %Vc are defined inside the %Vc namespace. * * To be precise, most types are actually defined inside a second namespace, such as Vc::SSE. At * compile-time the correct implementation is simply imported into the %Vc namespace. */ namespace Vc { /** * \class Vector dox.h * \ingroup Vectors * * The main SIMD vector class. * * \li Vc::float_v * \li Vc::sfloat_v * \li Vc::double_v * \li Vc::int_v * \li Vc::uint_v * \li Vc::short_v * \li Vc::ushort_v * * are only specializations of this class. For the full documentation take a look at the * specialized classes. For most cases there are no API differences for the specializations. * Thus you can make use of \c Vector for generic programming. */ template class Vector { public: #define INDEX_TYPE uint_v #define VECTOR_TYPE Vector #define ENTRY_TYPE T #define MASK_TYPE float_m #define EXPONENT_TYPE int_v #include "dox-common-ops.h" #include "dox-real-ops.h" #undef INDEX_TYPE #undef VECTOR_TYPE #undef ENTRY_TYPE #undef MASK_TYPE #undef EXPONENT_TYPE }; /** * \ingroup Vectors * * Enum to declare platform specific constants */ enum PlatformConstants { /** * Specifies the byte boundary for memory alignments necessary for aligned loads and stores. */ VectorAlignment }; /** * \ingroup Vectors * * Enum to declare special initializers for vector constructors. */ enum SpecialInitializer { /** * Used for optimized construction of vectors initialized to zero. */ Zero, /** * Used for optimized construction of vectors initialized to one. */ One, /** * Parameter to create a vector with the entries 0, 1, 2, * 3, 4, 5, ... 
(depending on the vector's size, of course). */ IndexesFromZero }; /** * \ingroup Vectors * * Enum for load and store functions to select the optimizations that are safe to use. */ enum LoadStoreFlags { /** * Tells %Vc that the load/store can expect a memory address that is aligned on the correct * boundary. * * If you specify Aligned, but the memory address is not aligned the program will most * likely crash. */ Aligned, /** * Tells %Vc that the load/store can \em not expect a memory address that is aligned on the correct * boundary. * * If you specify Unaligned, but the memory address is aligned the load/store will execute * slightly slower than necessary. */ Unaligned, /** * Tells %Vc to bypass the cache for the load/store. Whether this will actually be done * depends on the instruction set in use. * * Streaming stores can be interesting when the code calculates values that, after being * written to memory, will not be used for a long time or used by a different thread. * * \note Passing Streaming as only alignment flag implies Aligned! If you need unaligned * memory access you can use * \code * v.store(mem, Vc::Unaligned | Vc::Streaming); * \endcode */ Streaming }; #define INDEX_TYPE uint_v #define VECTOR_TYPE float_v #define ENTRY_TYPE float #define MASK_TYPE float_m #define EXPONENT_TYPE int_v /** * \class float_v dox.h * \ingroup Vectors * * SIMD Vector of single precision floats. * * \note This is the same type as Vc::Vector. */ class VECTOR_TYPE { public: #include "dox-common-ops.h" #include "dox-real-ops.h" }; /** * \class float_m dox.h * \ingroup Masks * * Mask object to use with float_v objects. * * Of the same type as int_m and uint_m. */ class MASK_TYPE { public: #include "dox-common-mask-ops.h" }; #include "dox-math.h" #undef VECTOR_TYPE #undef ENTRY_TYPE #undef MASK_TYPE #define VECTOR_TYPE double_v #define ENTRY_TYPE double #define MASK_TYPE double_m /** * \class double_v dox.h * \ingroup Vectors * * SIMD Vector of double precision floats. * * \note This is the same type as Vc::Vector. */ class VECTOR_TYPE { public: #include "dox-common-ops.h" #include "dox-real-ops.h" }; /** * \class double_m dox.h * \ingroup Masks * * Mask object to use with double_v objects. */ class MASK_TYPE { public: #include "dox-common-mask-ops.h" }; #include "dox-math.h" #undef VECTOR_TYPE #undef ENTRY_TYPE #undef MASK_TYPE #define VECTOR_TYPE_HAS_SHIFTS 1 #define VECTOR_TYPE int_v #define ENTRY_TYPE int #define MASK_TYPE int_m #define INTEGER /** * \class int_v dox.h * \ingroup Vectors * * SIMD Vector of 32 bit signed integers. * * \note This is the same type as Vc::Vector. */ class VECTOR_TYPE { public: #include "dox-common-ops.h" }; /** * \class int_m dox.h * \ingroup Masks * * Mask object to use with int_v objects. * * Of the same type as float_m and uint_m. */ class MASK_TYPE { public: #include "dox-common-mask-ops.h" }; #undef VECTOR_TYPE #undef ENTRY_TYPE #undef MASK_TYPE #define VECTOR_TYPE uint_v #define ENTRY_TYPE unsigned int #define MASK_TYPE uint_m /** * \class uint_v dox.h * \ingroup Vectors * * SIMD Vector of 32 bit unsigned integers. * * \note This is the same type as Vc::Vector. */ class VECTOR_TYPE { public: #include "dox-common-ops.h" }; /** * \class uint_m dox.h * \ingroup Masks * * Mask object to use with uint_v objects. * * Of the same type as int_m and float_m. 
*/ class MASK_TYPE { public: #include "dox-common-mask-ops.h" }; #undef VECTOR_TYPE #undef ENTRY_TYPE #undef MASK_TYPE #undef INDEX_TYPE #define INDEX_TYPE ushort_v #define VECTOR_TYPE short_v #define ENTRY_TYPE short #define MASK_TYPE short_m /** * \class short_v dox.h * \ingroup Vectors * * SIMD Vector of 16 bit signed integers. * * \note This is the same type as Vc::Vector. * * \warning Vectors of this type are not supported on all platforms. In that case the vector * class will silently fall back to a Vc::int_v. */ class VECTOR_TYPE { public: #include "dox-common-ops.h" }; /** * \class short_m dox.h * \ingroup Masks * * Mask object to use with short_v objects. * * Of the same type as ushort_m. */ class MASK_TYPE { public: #include "dox-common-mask-ops.h" }; #undef VECTOR_TYPE #undef ENTRY_TYPE #undef MASK_TYPE #define VECTOR_TYPE ushort_v #define ENTRY_TYPE unsigned short #define MASK_TYPE ushort_m /** * \class ushort_v dox.h * \ingroup Vectors * * SIMD Vector of 16 bit unsigned integers. * * \note This is the same type as Vc::Vector. * * \warning Vectors of this type are not supported on all platforms. In that case the vector * class will silently fall back to a Vc::uint_v. */ class VECTOR_TYPE { public: #include "dox-common-ops.h" }; /** * \class ushort_m dox.h * \ingroup Masks * * Mask object to use with ushort_v objects. * * Of the same type as short_m. */ class MASK_TYPE { public: #include "dox-common-mask-ops.h" }; #undef VECTOR_TYPE #undef ENTRY_TYPE #undef MASK_TYPE #undef INTEGER #undef EXPONENT_TYPE #undef VECTOR_TYPE_HAS_SHIFTS #define EXPONENT_TYPE short_v #define VECTOR_TYPE sfloat_v #define ENTRY_TYPE float #define MASK_TYPE sfloat_m /** * \class sfloat_v dox.h * \ingroup Vectors * * SIMD Vector of single precision floats that is guaranteed to have as many entries as a * Vc::short_v and Vc::ushort_v. */ class VECTOR_TYPE { public: #include "dox-common-ops.h" #include "dox-real-ops.h" }; /** * \class sfloat_m dox.h * \ingroup Masks * \ingroup Masks * * Mask object to use with sfloat_v objects. */ class MASK_TYPE { public: #include "dox-common-mask-ops.h" }; #include "dox-math.h" #undef EXPONENT_TYPE #undef VECTOR_TYPE #undef ENTRY_TYPE #undef MASK_TYPE #undef INDEX_TYPE /** * \ingroup Math * \note Often int_v::Size == double_v::Size * 2, then only every second value in \p *e is defined. */ double_v frexp(const double_v &x, int_v *e); /** * \ingroup Math * \note Often int_v::Size == double_v::Size * 2, then only every second value in \p *e is defined. */ double_v ldexp(double_v x, int_v e); /** * \ingroup Utilities * * Force the vectors passed to the function into registers. This can be useful after looking at * the emitted assembly to force the compiler to optimize properly. * * \note Currently only has an effect for SSE vectors. * \note MSVC does not support this function at all. * * \warning Be careful with this function, especially since it can render the compiler unable to * compile for 32 bit systems if it forces more than 8 vectors in registers. */ void forceToRegisters(const vec &, ...); /** * \ingroup Utilities * * Helper class to ensure proper alignment. * * This class reimplements the \c new and \c delete operators to align the allocated object * suitably for vector data. Additionally the type is annotated to require that same alignment * when placed on the stack. 
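 *
 * A brief usage sketch (the \c Coordinate type is only an assumed example):
 * \code
 * struct Coordinate : public Vc::VectorAlignedBase
 * {
 *     Vc::float_v x, y;
 * };
 * Coordinate *c = new Coordinate; // the reimplemented operator new returns correctly aligned memory
 * \endcode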
* * \see Vc::VectorAlignedBaseT */ class VectorAlignedBase { public: void *operator new(size_t size); void *operator new(size_t, void *p); void *operator new[](size_t size); void operator delete(void *ptr, size_t); void operator delete[](void *ptr, size_t); }; /** * \ingroup Utilities * * Helper class to ensure proper alignment. * * This class reimplements the \c new and \c delete operators to align the allocated object * suitably for vector data. Additionally the type is annotated to require that same alignment * when placed on the stack. * * This class differs from Vc::VectorAlignedBase in that the template parameter determines the * alignment. The alignment rules for different vector types might be different. If you use * Vc::VectorAlignedBase you will get the most restrictive alignment (i.e. it will work for all * vector types, but might lead to unnecessary padding). * * \tparam V One of the %Vc vector types. * * \see Vc::VectorAlignedBase */ template class VectorAlignedBaseT { public: void *operator new(size_t size); void *operator new(size_t, void *p); void *operator new[](size_t size); void operator delete(void *ptr, size_t); void operator delete[](void *ptr, size_t); }; } /** * \ingroup Utilities * * Loop over all set bits in the mask. The iterator variable will be set to the position of the set * bits. A mask of e.g. 00011010 would result in the loop being called with the iterator being set to * 1, 3, and 4. * * This allows you to write: * \code * float_v a = ...; * Vc_foreach_bit(int i, a < 0.f) { * std::cout << a[i] << "\n"; * } * \endcode * The example prints all the values in \p a that are negative, and only those. * * \param iterator The iterator variable. For example "int i". * \param mask The mask to iterate over. You can also just write a vector operation that returns a * mask. * * \note Since %Vc 0.7 break and continue are supported in foreach_bit loops. */ #define Vc_foreach_bit(iterator, mask) /** * \ingroup Utilities * * Alias for Vc_foreach_bit unless VC_CLEAN_NAMESPACE is defined. */ #define foreach_bit(iterator, mask) /** * \ingroup Vectors * \headerfile dox.h * * Prints the contents of a vector into a stream object. * * \code * const Vc::int_v v(Vc::IndexesFromZero); * std::cout << v << std::endl; * \endcode * will output (with SSE): \verbatim [0, 1, 2, 3] \endverbatim * * \param s Any standard C++ ostream object. For example std::cout or a std::stringstream object. * \param v Any Vc::Vector object. * \return The ostream object: to chain multiple stream operations. * * \note With the GNU standard library this function will check, whether the output stream is a tty. * In that case it will colorize the output. */ template std::ostream &operator<<(std::ostream &s, const Vc::Vector &v); /** * \ingroup Masks * \headerfile dox.h * * Prints the contents of a mask into a stream object. * * \code * const Vc::short_m m = Vc::short_v::IndexesFromZero() < 3; * std::cout << m << std::endl; * \endcode * will output (with SSE): \verbatim m[1110 0000] \endverbatim * * \param s Any standard C++ ostream object. For example std::cout or a std::stringstream object. * \param v Any %Vc mask object. * \return The ostream object: to chain multiple stream operations. * * \note With the GNU standard library this function will check, whether the output stream is a tty. * In that case it will colorize the output. 
*/ template std::ostream &operator<<(std::ostream &s, const typename Vc::Vector::Mask &v); /** * \ingroup Utilities * \headerfile dox.h * * Prints the contents of a Memory object into a stream object. * * \code * Vc::Memory m; * for (int i = 0; i < m.entriesCount(); ++i) { * m[i] = i; * } * std::cout << m << std::endl; * \endcode * will output (with SSE): \verbatim {[0, 1, 2, 3] [4, 5, 6, 7] [8, 9, 0, 0]} \endverbatim * * \param s Any standard C++ ostream object. For example std::cout or a std::stringstream object. * \param m Any Vc::Memory object. * \return The ostream object: to chain multiple stream operations. * * \note With the GNU standard library this function will check, whether the output stream is a tty. * In that case it will colorize the output. * * \warning Please do not forget that printing a large memory object can take a long time. */ template inline std::ostream &operator<<(std::ostream &s, const Vc::MemoryBase &m); namespace Vc { /** * \ingroup Utilities * \headerfile dox.h * * \returns the version string of the %Vc headers. * * \note There exists a built-in check that ensures on application startup that the %Vc version of the * library (link time) and the headers (compile time) are equal. A mismatch between headers and * library could lead to errors that are very hard to debug. * \note If you need to disable the check (it costs a very small amount of application startup time) * you can define VC_NO_VERSION_CHECK at compile time. */ const char *versionString(); /** * \ingroup Utilities * \headerfile dox.h * * \returns the version of the %Vc headers encoded in an integer. */ unsigned int versionNumber(); /** * \name SIMD Support Feature Macros * \ingroup Utilities */ //@{ /** * \ingroup Utilities * This macro is set to the value of \ref Vc::Implementation that the current translation unit is * compiled with. */ #define VC_IMPL /** * \ingroup Utilities * This macro is defined if the current translation unit is compiled with XOP instruction support. */ #define VC_IMPL_XOP /** * \ingroup Utilities * This macro is defined if the current translation unit is compiled with FMA4 instruction support. */ #define VC_IMPL_FMA4 /** * \ingroup Utilities * This macro is defined if the current translation unit is compiled with F16C instruction support. */ #define VC_IMPL_F16C /** * \ingroup Utilities * This macro is defined if the current translation unit is compiled with POPCNT instruction support. */ #define VC_IMPL_POPCNT /** * \ingroup Utilities * This macro is defined if the current translation unit is compiled with SSE4a instruction support. */ #define VC_IMPL_SSE4a /** * \ingroup Utilities * This macro is defined if the current translation unit is compiled without any SIMD support. */ #define VC_IMPL_Scalar /** * \ingroup Utilities * This macro is defined if the current translation unit is compiled with any version of SSE (but not * AVX). */ #define VC_IMPL_SSE /** * \ingroup Utilities * This macro is defined if the current translation unit is compiled with SSE2 instruction support * (excluding SSE3 and up). */ #define VC_IMPL_SSE2 /** * \ingroup Utilities * This macro is defined if the current translation unit is compiled with SSE3 instruction support (excluding SSSE3 and up). */ #define VC_IMPL_SSE3 /** * \ingroup Utilities * This macro is defined if the current translation unit is compiled with SSSE3 instruction support (excluding SSE4.1 and up). 
*/ #define VC_IMPL_SSSE3 /** * \ingroup Utilities * This macro is defined if the current translation unit is compiled with SSE4.1 instruction support (excluding SSE4.2 and up). */ #define VC_IMPL_SSE4_1 /** * \ingroup Utilities * This macro is defined if the current translation unit is compiled with SSE4.2 instruction support (excluding AVX and up). */ #define VC_IMPL_SSE4_2 /** * \ingroup Utilities * This macro is defined if the current translation unit is compiled with AVX instruction support (excluding AVX2 and up). */ #define VC_IMPL_AVX //@} /** * \name Version Macros * \ingroup Utilities */ //@{ /** * \ingroup Utilities * Contains the version string of the %Vc headers. Same as Vc::versionString(). */ #define VC_VERSION_STRING /** * \ingroup Utilities * Contains the encoded version number of the %Vc headers. Same as Vc::versionNumber(). */ #define VC_VERSION_NUMBER /** * \ingroup Utilities * * Helper macro to compare against an encoded version number. * Example: * \code * #if VC_VERSION_CHECK(0.5.1) >= VC_VERSION_NUMBER * \endcode */ #define VC_VERSION_CHECK(major, minor, patch) //@} /** * \name SIMD Vector Size Macros * \ingroup Utilities */ //@{ /** * \ingroup Utilities * An integer (for use with the preprocessor) that gives the number of entries in a double_v. */ #define VC_DOUBLE_V_SIZE /** * \ingroup Utilities * An integer (for use with the preprocessor) that gives the number of entries in a float_v. */ #define VC_FLOAT_V_SIZE /** * \ingroup Utilities * An integer (for use with the preprocessor) that gives the number of entries in a sfloat_v. */ #define VC_SFLOAT_V_SIZE /** * \ingroup Utilities * An integer (for use with the preprocessor) that gives the number of entries in a int_v. */ #define VC_INT_V_SIZE /** * \ingroup Utilities * An integer (for use with the preprocessor) that gives the number of entries in a uint_v. */ #define VC_UINT_V_SIZE /** * \ingroup Utilities * An integer (for use with the preprocessor) that gives the number of entries in a short_v. */ #define VC_SHORT_V_SIZE /** * \ingroup Utilities * An integer (for use with the preprocessor) that gives the number of entries in a ushort_v. */ #define VC_USHORT_V_SIZE //@} } // namespace Vc Vc-0.7.4/doc/examples.h000066400000000000000000000323271233512346000146340ustar00rootroot00000000000000/** * \page examples Examples * * There are several examples shipping with %Vc. If you have a suggestion for a useful or interesting * example, please contact vc@compeng.uni-frankfurt.de. * * \li \subpage ex-polarcoord * This example explains the basic approaches to vectorization of a given problem. It contains a * discussion of storage issues. * \li \subpage ex-finitediff * \li \subpage ex-matrix * \li \subpage ex-mandelbrot * \li \subpage ex-buddhabrot * ********************************************************************************* * * \page ex-polarcoord Polar Coordinates * * The \c polarcoord example generates 1000 random Cartesian 2D coordinates that are then * converted to polar coordinates and printed to the terminal. * This is a very simple example but shows the concept of vertical versus horizontal * vectorization quite nicely. * * \section ex_polarcoord_background Background * * In our problem we start with the allocation and random initialization of 1000 Cartesian 2D * coordinates. Thus every coordinate consists of two floating-point values (x and y). 
* \code * struct CartesianCoordinate * { * float x, y; * }; * CartesianCoordinate input[1000]; * \endcode * \image html polarcoord-cartesian.png "Cartesian coordinate" * * We want to convert them to 1000 polar coordinates. * \code * struct PolarCoordinate * { * float r, phi; * }; * PolarCoordinate output[1000]; * \endcode * \image html polarcoord-polar.png "Polar coordinate" * * Recall that: * \f[ * r^2 = x^2 + y^2 * \f]\f[ * \tan\phi = y/x * \f] * (One typically uses \c atan2 to calculate \c phi efficiently.) * * \section ex_polarcoord_vectorization Identify Vectorizable Parts * * When you look into vectorization of your application/algorithm, the first task is to identify the * data parallelism to use for vectorization. * A scalar implementation of our problem could look like this: * \code * for (int i = 0; i < ArraySize; ++i) { * const float x = input[i].x; * const float y = input[i].y; * output[i].r = std::sqrt(x * x + y * y); * output[i].phi = std::atan2(y, x) * 57.295780181884765625f; // 180/pi * if (output[i].phi < 0.f) { * output[i].phi += 360.f; * } * } * \endcode * The data parallelism inside the loop is minimal. It basically consists of two multiplications * that can be executed in parallel. This kind of parallelism is already exploited by all modern * processors via pipelining, which is one form of instruction level parallelism (ILP). * Thus, if one were to put the x and y values into a SIMD vector, this one multiplication could be * executed with just a single SIMD instruction. This vectorization is called \e vertical * vectorization, because the vector is placed vertically into the object. * * There is much more data parallelism in this code snippet, though. The different iteration steps * are all independent, which means that subsequent steps do not depend on results of the preceding steps. * Therefore, several steps of the loop can be executed in parallel. This is the most * straightforward vectorization strategy for our problem: * From a loop, always execute N steps in parallel, where N is the number of entries in the SIMD vector. * The input values to the loop need to be placed into a vector. * Then all intermediate values and results are also vectors. Using the %Vc datatypes a single loop * step would then look like this: * \code * // x and y are of type float_v * float_v r = Vc::sqrt(x * x + y * y); * float_v phi = Vc::atan2(y, x) * 57.295780181884765625f; // 180/pi * phi(output[i].phi < 0.f) += 360.f; * \endcode * This vectorization is called \e horizontal vectorization, because the vector is placed * horizontally over several objects. * * \section ex_polarcoord_data Data Structures * * To form the \c x vector from the previously used storage format, one would thus write: * \code * float_v x; * for (int i = 0; i < float_v::Size; ++i) { * x[i] = input[offset + i].x; * } * \endcode * Notice how the memory access is rather inefficient. * * \subsection ex_polarcoord_data_aos Array of Structs (AoS) * * The data was originally stored as array of * structs (\e AoS). Another way to call it is \e interleaved storage. That's because the entries of * the \c x and \c y vectors are interleaved in memory. * Let us assume the storage format is given and we cannot change it. * We would rather not load and store all our vectors entry by entry as this would lead to * inefficient code, which mainly occupies the load/store ports of the processor. 
Instead, we can use * a little helper function %Vc provides to load the data as vectors with subsequent deinterleaving: * \code * Vc::float_v x, y; * Vc::deinterleave(&x, &y, &input[i], Vc::Aligned); * \endcode * This pattern can be very efficient if the interleaved data members are always accessed together. * This optimizes for data locality and thus cache usage. * * \subsection ex_polarcoord_data_vectors Interleaved Vectors * * If you can change the data structures, it might be a good option to store interleaved vectors: * \code * struct CartesianCoordinate * { * Vc::float_v x, y; * }; * CartesianCoordinate input[(1000 + Vc::float_v::Size - 1) / Vc::float_v::Size]; * \endcode * Accessing vectors of \c x and \c y is then as simple as accessing the members of a \c * CartesianCoordinate object. This can be slightly more efficient than the previous method because * the deinterleaving step is not required anymore. On the downside your data structure now depends * on the target architecture, which can be a portability concern. * In short, the \c sizeof operator returns different values depending on Vc::float_v::Size. * Thus, you would have to ensure correct conversion to target independent data * formats for any data exchange (storage, network). (But if you are really careful about portable * data exchange, you already have to handle endian conversion anyway.) * * Note the unfortunate complication of determining the size of the array. In order to fit 1000 * scalar values into the array, the number of vectors times the vector size must be greater or equal * than 1000. But integer division truncates. * * Sadly, there is one last issue with alignment. If the \c CartesianCoordinate object is allocated * on the stack everything is fine (because the compiler knows about the alignment restrictions of * \c x and \c y and thus of \c CartesianCoordinate). But if \c CartesianCoordinate is allocated on * the heap (with \c new or inside an STL container), the correct alignment is not ensured. %Vc provides * Vc::VectorAlignedBase, which contains the correct reimplementations of the \c new and \c delete operators: * \code * struct CartesianCoordinate : public Vc::VectorAlignedBase * { * Vc::float_v x, y; * } * CartesianCoordinate *input = new CartesianCoordinate[(1000 + Vc::float_v::Size - 1) / Vc::float_v::Size]; * \endcode * To ensure correctly aligned storage with STL containers you can use Vc::Allocator: * \code * struct CartesianCoordinate * { * Vc::float_v x, y; * } * VC_DECLARE_ALLOCATOR(CartesianCoordinate) * std::vector input((1000 + Vc::float_v::Size - 1) / Vc::float_v::Size); * \endcode * * For a thorough discussion of alignment see \ref intro_alignment. * * \subsection ex_polarcoord_data_soa Struct of Arrays (SoA) * * A third option is storage in the form of a single struct instance that contains arrays of the * data members: * \code * template struct CartesianCoordinate * { * float x[Size], y[Size]; * } * CartesianCoordinate<1000> input; * \endcode * Now all \c x values are adjacent in memory and thus can easily be loaded and stored as vectors. * Well, two problems remain: * 1. The alignment of \c x and \c y is not defined and therefore not guaranteed. Vector loads and * stores thus must assume unaligned pointers, which is bad for performance. Even worse, if an * instruction that expects an aligned pointer is executed with an unaligned address the program * will crash. * 2. 
The size of the \c x and \c y arrays is not guaranteed to be large enough to allow the last * values in the arrays to be loaded/stored as vectors. * * %Vc provides the Vc::Memory class to solve both issues: * \code * template struct CartesianCoordinate * { * Vc::Memory x, y; * } * CartesianCoordinate<1000> input; * \endcode * * \section ex_polarcoord_complete The Complete Example * * Now that we have covered the background and know what we need - let us take a look at the * complete example code. * * \snippet polarcoord/main.cpp includes * The example starts with the main include directive to use for %Vc: \c \#include \c . * The remaining includes are required for terminal output. * Note that we include Vc::float_v into the global namespace. * It is not recommended to include the whole %Vc namespace into the global namespace * except maybe inside a function scope. * * \snippet polarcoord/main.cpp memory allocation * At the start of the program, the input and output memory is allocated. * Of course, you can abstract these variables into structs/classes for Cartesian and polar * coordinates. * The Vc::Memory class can be used to allocate memory on the stack or on the heap. * In this case the memory is allocated on the stack, since the size of the memory is given at * compile time. * The first \c float value of Vc::Memory (e.g. x_mem[0]) is guaranteed to be aligned to the * natural SIMD vector alignment. * Also, the size of the allocated memory may be padded at the end to allow access to the last \c * float value (e.g. x_mem[999]) with a SIMD vector. * Thus, if this example is compiled for a target with a vector width (\c Vc::float_v::Size) of 16 * entries, the four arrays would internally be allocated as size 1008 (63 vectors with 16 entries = * 1008 entries). * * \snippet polarcoord/main.cpp random init * Next the x and y values are initialized with random numbers. * %Vc includes a simple vectorized random number generator. * The floating point RNGs in %Vc produce values in the range from 0 to 1. * Thus the value has to be scaled and subtracted to get into the desired range of -1 to 1. * The iteration over the memory goes from 0 (no surprise) to a value determined by the Vc::Memory * class. In the case of fixed-size allocation, this number is also available to the compiler as a * compile time constant. Vc::Memory has two functions to use as upper bound for iterations: * Vc::Memory::entriesCount and Vc::Memory::vectorsCount. The former returns the same number as was * used for allocation. The latter returns the number of SIMD vectors that fit into the (padded) * allocated memory. Thus, if Vc::float_v::Size were 16, \c x_mem.vectorsCount() would expand to 63. * Inside the loop, the memory i-th vector is then set to a random value. * * \warning Please do not use this RNG until you have read its documentation. It may not be as * random as you need it to be. * * \snippet polarcoord/main.cpp conversion * Finally we arrive at the conversion of the Cartesian coordinates to polar coordinates. * The for loop is equivalent to the one above. * * Inside the loop we first assign the x and y values to local variables. * This is not necessary; but it can help the compiler with optimization. The issue is that when you * access values from some memory area, the compiler cannot always be sure that the pointers to * memory do not alias (i.e. point to the same location). Thus, the compiler might rather take the * safe way out and load the value from memory more often than necessary. 
By using local variables, * the compiler can easily prove that the value does not change and can therefore be cached in a * register. This is a general issue, not one specific to SIMD. In this case it mainly serves * to make the following code more readable. * * * * \snippet polarcoord/main.cpp output * ********************************************************************************* * * \page ex-finitediff Finite Differences * ********************************************************************************* * * \page ex-matrix Matrix Class * ********************************************************************************* * * \page ex-buddhabrot Buddhabrot * ********************************************************************************* * * \page ex-mandelbrot Mandelbrot * * This example draws a colorized Mandelbrot image on screen using Qt4 widgets. * * The example uses a simple class to abstract complex numbers. In principle, one could just use * std::complex, if it performed well enough. But the norm function in particular is very slow for * scalar float/double. Also, complex multiplication is implemented correctly to handle NaN and * infinity. This is not required for Mandelbrot, as these special cases will not occur. * Additionally, the provided complex abstraction stores the squares of the real and imaginary parts * to help the compiler optimize the code as well as possible. * \snippet mandelbrot/mandel.cpp MyComplex * * Mandelbrot uses the function z = z² + c for iteration. * \snippet mandelbrot/mandel.cpp P function * */ Vc-0.7.4/doc/logo.png [binary PNG image data omitted]
Vc-0.7.4/doc/logo.svg [SVG image data omitted]
Vc-0.7.4/doc/logo_small.png [binary PNG image data omitted]
Vc-0.7.4/doc/polarcoord-polar.png [binary PNG image data omitted]
#A8 E*U"bă7{(@C̦nNݐ;y 7lţ- 0ōx:x}k^ݩjݥ)+cۛlvLq,8 )h'cCEZlM|<ݑrQDQbϗ>mVf vm2e^?˦]Zz9+sTI%bc7P,NaK7{y+t˘8 wtAu- ԋR"RHT@*R )HE ؊[/& vIENDB`Vc-0.7.4/doc/qhelpgenerator-wrapper000077500000000000000000000007771233512346000172750ustar00rootroot00000000000000#!/bin/sh for retry in `seq 20`; do output="`qhelpgenerator "$@" 2>&1`" errors="`echo "$output"|grep "Error in line .*: Opening and ending tag mismatch."`" if test -z "$errors"; then echo "$output" exit fi if echo "$1"|grep -q '\.qhp'; then file="$1" elif echo "$2"|grep -q '\.qhp'; then file="$2" elif echo "$3"|grep -q '\.qhp'; then file="$3" fi for i in `echo "$errors"|tac|sed 's/^.* \([0-9]\+\):.*$/\1/'`; do sed -i "${i}d" "$file" done done qhelpgenerator "$@" Vc-0.7.4/examples/000077500000000000000000000000001233512346000137075ustar00rootroot00000000000000Vc-0.7.4/examples/CMakeLists.txt000066400000000000000000000051461233512346000164550ustar00rootroot00000000000000find_package(Qt4) set(SAFE_CMAKE_REQUIRED_INCLUDES "${CMAKE_REQUIRED_INCLUDES}") set(SAFE_CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}") set(CMAKE_REQUIRED_INCLUDES "${QT_INCLUDES}") set(CMAKE_REQUIRED_LIBRARIES "${QT_QTCORE_LIBRARY}") CHECK_CXX_SOURCE_COMPILES("#include int main() { QObject o; return 0;}" QT4_USABLE) mark_as_advanced(QT4_USABLE) set(CMAKE_REQUIRED_INCLUDES "${SAFE_CMAKE_REQUIRED_INCLUDES}") set(CMAKE_REQUIRED_LIBRARIES "${SAFE_CMAKE_REQUIRED_LIBRARIES}") macro(build_example name) set(_SRCS) set(_LIBS) set(_state 1) foreach(ARG ${ARGN}) if(ARG STREQUAL "LIBS") set(_state 2) elseif(_state EQUAL 1) set(_SRCS ${_SRCS} ${ARG}) elseif(_state EQUAL 2) set(_LIBS ${_LIBS} ${ARG}) endif() endforeach() add_executable("example_${name}_default" ${_SRCS}) target_link_libraries("example_${name}_default" Vc ${_LIBS}) add_executable("example_${name}_scalar" ${_SRCS}) add_target_property("example_${name}_scalar" COMPILE_FLAGS "-DVC_IMPL=Scalar") add_target_property("example_${name}_scalar" LABELS "Scalar") add_dependencies(Scalar "example_${name}_scalar") target_link_libraries("example_${name}_scalar" Vc ${_LIBS}) if(USE_SSE2) add_executable("example_${name}_sse" ${_SRCS}) add_target_property("example_${name}_sse" COMPILE_FLAGS "-DVC_IMPL=SSE") add_target_property("example_${name}_sse" LABELS "SSE") add_dependencies(SSE "example_${name}_sse") target_link_libraries("example_${name}_sse" Vc ${_LIBS}) endif() if(USE_AVX) add_executable("example_${name}_avx" ${_SRCS}) add_target_property("example_${name}_avx" COMPILE_FLAGS "-DVC_IMPL=AVX") add_target_property("example_${name}_avx" LABELS "AVX") add_dependencies(AVX "example_${name}_avx") target_link_libraries("example_${name}_avx" Vc ${_LIBS}) add_target_property("example_${name}_default" LABELS "AVX") add_dependencies(AVX "example_${name}_default") elseif(USE_SSE2) add_target_property("example_${name}_default" LABELS "SSE") add_dependencies(SSE "example_${name}_default") else() add_target_property("example_${name}_default" LABELS "Scalar") add_dependencies(Scalar "example_${name}_default") endif() endmacro(build_example) macro(my_add_subdirectory _name) list(FIND disabled_targets "example_${_name}" _disabled) if(_disabled EQUAL -1) add_subdirectory(${_name}) endif() endmacro() my_add_subdirectory(polarcoord) my_add_subdirectory(matrix) my_add_subdirectory(mandelbrot) my_add_subdirectory(buddhabrot) my_add_subdirectory(finitediff) 
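# Usage sketch for the build_example macro defined above (the example name and source file are
# hypothetical):
#   build_example(myexample main.cpp LIBS ${QT_LIBRARIES})
# This creates the targets example_myexample_default and example_myexample_scalar, plus
# example_myexample_sse and example_myexample_avx when USE_SSE2 and USE_AVX are enabled.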
Vc-0.7.4/examples/buddhabrot/000077500000000000000000000000001233512346000160255ustar00rootroot00000000000000Vc-0.7.4/examples/buddhabrot/CMakeLists.txt000066400000000000000000000012131233512346000205620ustar00rootroot00000000000000if(QT4_FOUND AND QT4_USABLE) include(${QT_USE_FILE}) include_directories(${CMAKE_CURRENT_BINARY_DIR}) add_executable(buddhabrot_sse main.cpp) add_target_property(buddhabrot_sse COMPILE_FLAGS "-DVC_IMPL=SSE") target_link_libraries(buddhabrot_sse ${QT_LIBRARIES} Vc) add_executable(buddhabrot_scalar main.cpp) add_target_property(buddhabrot_scalar COMPILE_FLAGS "-DVC_IMPL=Scalar") target_link_libraries(buddhabrot_scalar ${QT_LIBRARIES} Vc) add_executable(buddhabrot_scalar2 main.cpp) add_target_property(buddhabrot_scalar2 COMPILE_FLAGS "-DScalar") target_link_libraries(buddhabrot_scalar2 ${QT_LIBRARIES}) endif() Vc-0.7.4/examples/buddhabrot/main.cpp000066400000000000000000000534671233512346000174740ustar00rootroot00000000000000/* Copyright (C) 2010-2011 Matthias Kretz Permission to use, copy, modify, and distribute this software and its documentation for any purpose and without fee is hereby granted, provided that the above copyright notice appear in all copies and that both that the copyright notice and this permission notice and warranty disclaimer appear in supporting documentation, and that the name of the author not be used in advertising or publicity pertaining to distribution of the software without specific, written prior permission. The author disclaim all warranties with regard to this software, including all implied warranties of merchantability and fitness. In no event shall the author be liable for any special, indirect or consequential damages or any damages whatsoever resulting from loss of use, data or profits, whether in an action of contract, negligence or other tortious action, arising out of or in connection with the use or performance of this software. 
*/ #include "main.h" #include "../tsc.h" #include #include #include #include #include #include #include #include #ifdef Scalar typedef float float_v; typedef int int_v; typedef bool int_m; #else #include using Vc::float_v; using Vc::float_m; using Vc::int_v; using Vc::int_m; #endif ProgressWriter::ProgressWriter() : m_out(stdout) { } void ProgressWriter::setValue(float vf) { static int lastPercent = -1; static int lastHash = 0; int p = static_cast(vf + 0.5f); int h = static_cast(vf * 0.78f + 0.5f); bool flush = false; if (p != lastPercent) { flush = true; if (lastPercent == -1) { m_out << "\033[80D\033[K" << "[ "; m_out.setFieldWidth(3); m_out << p; m_out.setFieldWidth(0); m_out << "% ]" << "\033[79D"; } else { m_out << "\033[s\033[80D\033[37C"; m_out.setFieldWidth(3); m_out << p; m_out.setFieldWidth(0); m_out << "\033[u"; } lastPercent = p; } for (; lastHash < h; ++lastHash) { flush = true; if (lastHash < 36 || lastHash > 39) { m_out << '#'; } else { m_out << "\033[1C"; } } if (flush) { m_out.flush(); } } void ProgressWriter::done() { setValue(100.f); m_out << "\033[2C"; m_out.flush(); } Baker::Baker() { } void Baker::setSize(int w, int h) { m_y = -1.f; m_height = 2.f; m_width = w * m_height / h; m_x = m_width * -0.667f; m_image = QImage(w, h, QImage::Format_RGB32); } void Baker::setFilename(const QString &filename) { m_filename = filename; } typedef std::complex Z; static inline Z P(Z z, Z c) { return z * z + c; } static inline Z::value_type fastNorm(const Z &z) { return z.real() * z.real() + z.imag() * z.imag(); } template static inline T square(T a) { return a * a; } template static inline T minOf(T a, T b) { return a < b ? a : b; } template static inline T maxOf(T a, T b) { return a < b ? b : a; } template static inline T clamp(T min, T value, T max) { if (value > max) { return max; } return value < min ? 
min : value; } struct Pixel { float blue; float green; float red; }; static const Pixel NULL_PIXEL = { 0, 0, 0 }; class Canvas { public: Canvas(int h, int w); void addDot(float x, float y, int red, int green, int blue); void toQImage(QImage *); private: void addDot(int x, int y, float red, float green, float blue) { Pixel &p = m_pixels[x + y * m_width]; p.blue += blue; p.green += green; p.red += red; } const int m_width; std::vector m_pixels; }; Canvas::Canvas(int h, int w) : m_width(w), m_pixels(h * w, NULL_PIXEL) { } void Canvas::addDot(float x, float y, int red, int green, int blue) { const int x1 = static_cast(std::floor(x)); const int x2 = static_cast(std::ceil (x)); const int y1 = static_cast(std::floor(y)); const int y2 = static_cast(std::ceil (y)); const float xfrac = x - std::floor(x); const float yfrac = y - std::floor(y); const float r = red; const float g = green; const float b = blue; const float frac11 = (1.f - xfrac) * (1.f - yfrac); const float frac12 = (1.f - xfrac) * yfrac; const float frac21 = xfrac * (1.f - yfrac); const float frac22 = xfrac * yfrac; addDot(x1, y1, r * frac11, g * frac11, b * frac11); addDot(x2, y1, r * frac21, g * frac21, b * frac21); addDot(x1, y2, r * frac12, g * frac12, b * frac12); addDot(x2, y2, r * frac22, g * frac22, b * frac22); } #define BUDDHABROT_USE_FUNCTION1 #ifdef BUDDHABROT_USE_FUNCTION2 static inline uchar reduceRange(float x, float m, float h) { /* m: max, h: median * +- -+ * | 3 3 2 | * | 510 h + 127 m - 765 h m | * | -------------------------- | * | 3 3 2 2 | * | h m + h m - 2 h m | * | | * | 3 3 2 | * | - 255 h - 254 m + 765 h m | * | ---------------------------- | * | 4 2 3 3 2 | * | h m - 2 h m + h m | * | | * | 2 2 | * | - 510 h m + 255 h + 127 m | * | --------------------------- | * | 4 2 3 3 2 | * | h m - 2 h m + h m | * +- -+ */ const float h2 = h * h; const float h3 = h2 * h; const float m2 = m * m; const float m3 = m2 * m; const float denom = h * m * square(m - h); return minOf(255.f, 0.5f //rounding + x / denom * ( 510.f * h3 + 127.f * m3 - 765.f * h2 * m + x / m * ( 765.f * h * m2 - 255.f * h3 - 254.f * m3 + x * ( 255.f * h2 + 127.f * m2 - 510.f * h * m) ))); } #elif defined(BUDDHABROT_USE_FUNCTION1) static inline unsigned int reduceRange(float x, float m, float h) { if (x <= m) { return 0.5f // rounding + 4.f / 255.f * h * h / m * x + square(x) * (h / square(m)) * (4.f - 8.f / 255.f * h); } else { return 0.5f // rounding + 255.f - 4.f * h + 4.f / 255.f * square(h) + x / m * (16.f * h - 1020.f - 12.f / 255.f * square(h)) + square(x / m) * (1020.f - 12.f * h + 8.f / 255.f * square(h)); } } #endif void Canvas::toQImage(QImage *img) { uchar *line = img->scanLine(0); const Pixel *p = &m_pixels[0]; #ifdef BUDDHABROT_USE_FUNCTION2 float max [3] = { 0.f, 0.f, 0.f }; std::vector sorted[3]; for (int i = 0; i < 3; ++i) { sorted[i].reserve(m_pixels.size()); } for (unsigned int i = 0; i < m_pixels.size(); ++i) { max[0] = maxOf(max[0], m_pixels[i].red); max[1] = maxOf(max[1], m_pixels[i].green); max[2] = maxOf(max[2], m_pixels[i].blue); if (m_pixels[i].red > 1.f) { sorted[0].push_back(m_pixels[i].red); } if (m_pixels[i].green > 1.f) { sorted[1].push_back(m_pixels[i].green); } if (m_pixels[i].blue > 1.f) { sorted[2].push_back(m_pixels[i].blue); } } for (int i = 0; i < 3; ++i) { std::sort(sorted[i].begin(), sorted[i].end()); } const float median[3] = { sorted[0][sorted[0].size() / 2], sorted[1][sorted[1].size() / 2], sorted[2][sorted[2].size() / 2] }; /* int hist[3][2]; for (int i = 0; i < 3; ++i) { hist[i][0] = hist[i][1] = 0; } for 
(unsigned int i = 0; i < m_pixels.size(); ++i) { ++hist[0][reduceRange(m_pixels[i].red , max[0], median[0]) / 128]; ++hist[1][reduceRange(m_pixels[i].green, max[1], median[1]) / 128]; ++hist[2][reduceRange(m_pixels[i].blue , max[2], median[2]) / 128]; } qDebug() << "Histogram:\n red:" << median[0] << hist[0][0] << hist[0][1] << "\ngreen:" << median[1] << hist[1][0] << hist[1][1] << "\n blue:" << median[2] << hist[2][0] << hist[2][1]; */ for (int yy = 0; yy < img->height(); ++yy) { for (int xx = 0; xx < img->width(); ++xx) { line[0] = reduceRange(p->blue , max[2], median[2]); line[1] = reduceRange(p->green, max[1], median[1]); line[2] = reduceRange(p->red , max[0], median[0]); line += 4; ++p; } } #elif defined(BUDDHABROT_USE_FUNCTION1) float max[3] = { 0.f, 0.f, 0.f }; for (unsigned int i = 0; i < m_pixels.size(); ++i) { max[0] = maxOf(max[0], m_pixels[i].red); max[1] = maxOf(max[1], m_pixels[i].green); max[2] = maxOf(max[2], m_pixels[i].blue); } float h[3] = { 220.f, 220.f, 220.f }; /* int hist[3][2]; for (int i = 0; i < 3; ++i) { hist[i][0] = hist[i][1] = 0; } for (unsigned int i = 0; i < m_pixels.size(); ++i) { ++hist[0][reduceRange(m_pixels[i].red , max[0], h[0]) / 128]; ++hist[1][reduceRange(m_pixels[i].green, max[1], h[1]) / 128]; ++hist[2][reduceRange(m_pixels[i].blue , max[2], h[2]) / 128]; } qDebug() << "Histogram:\n red:" << hist[0][0] << hist[0][1] << "\ngreen:" << hist[1][0] << hist[1][1] << "\n blue:" << hist[2][0] << hist[2][1]; */ for (int yy = 0; yy < img->height(); ++yy) { for (int xx = 0; xx < img->width(); ++xx) { line[0] = reduceRange(p->blue , max[2], h[2]); line[1] = reduceRange(p->green, max[1], h[1]); line[2] = reduceRange(p->red , max[0], h[0]); line += 4; ++p; } } #else float max [3] = { 0.f, 0.f, 0.f }; float mean [3] = { 0.f, 0.f, 0.f }; float stddev[3] = { 0.f, 0.f, 0.f }; for (unsigned int i = 0; i < m_pixels.size(); ++i) { max[0] = maxOf(max[0], m_pixels[i].red); max[1] = maxOf(max[1], m_pixels[i].green); max[2] = maxOf(max[2], m_pixels[i].blue); mean[0] += m_pixels[i].red; mean[1] += m_pixels[i].green; mean[2] += m_pixels[i].blue; stddev[0] += square(m_pixels[i].red); stddev[1] += square(m_pixels[i].green); stddev[2] += square(m_pixels[i].blue); } const float normalization = 1.f / m_pixels.size(); mean[0] *= normalization; mean[1] *= normalization; mean[2] *= normalization; stddev[0] = std::sqrt(stddev[0] * normalization - square(mean[0])); stddev[1] = std::sqrt(stddev[1] * normalization - square(mean[1])); stddev[2] = std::sqrt(stddev[2] * normalization - square(mean[2])); qDebug() << " max:" << max[0] << max[1] << max[2]; qDebug() << " mean:" << mean[0] << mean[1] << mean[2]; qDebug() << "stddev:" << stddev[0] << stddev[1] << stddev[2]; // colors have the range 0..max at this point // they should be transformed such that for the resulting mean and stddev: // mean - stddev = 0 // mean + stddev = min(min(2 * mean, max), 255) // // newColor = (c - mean) * min(min(2 * mean, max), 255) * 0.5 / stddev + 127.5 const float center[3] = { minOf(minOf(2.f * mean[0], max[0]), 255.f) * 0.5f, minOf(minOf(2.f * mean[1], max[1]), 255.f) * 0.5f, minOf(minOf(2.f * mean[2], max[2]), 255.f) * 0.5f }; const float sdFactor[3] = { 2.f, 2.f, 2.f }; const float redFactor = center[0] / (sdFactor[0] * stddev[0]); const float greenFactor = center[1] / (sdFactor[1] * stddev[1]); const float blueFactor = center[2] / (sdFactor[2] * stddev[2]); for (int yy = 0; yy < img->height(); ++yy) { for (int xx = 0; xx < img->width(); ++xx) { line[0] = clamp(0, static_cast(center[2] + (p->blue - 
mean[2]) * blueFactor ), 255); line[1] = clamp(0, static_cast(center[1] + (p->green - mean[1]) * greenFactor), 255); line[2] = clamp(0, static_cast(center[0] + (p->red - mean[0]) * redFactor ), 255); line += 4; ++p; } } #endif } Baker::Options::Options() { red[0] = 2; red[1] = 10; green[0] = 0; green[1] = 1; blue[0] = 11; blue[1] = 20; it[0] = 10000; it[1] = 50000; steps[0] = steps[1] = -1; } void Baker::createImage() { const int iHeight = m_image.height(); const int iWidth = m_image.width(); // Parameters Begin const float S = 4.f; const float nSteps[2] = { static_cast(m_opt.steps[0] == -1 ? std::sqrt(iWidth) * iWidth : m_opt.steps[0]), static_cast(m_opt.steps[1] == -1 ? std::sqrt(iHeight) * iHeight : m_opt.steps[1]) }; const int upperBound[3] = { m_opt.red[1], m_opt.green[1], m_opt.blue[1] }; const int lowerBound[3] = { m_opt.red[0], m_opt.green[0], m_opt.blue[0] }; int overallLowerBound = m_opt.it[0]; int maxIterations = m_opt.it[1];// maxOf(maxOf(overallLowerBound, upperBound[0]), maxOf(upperBound[1], upperBound[2])); float realMin = -2.102613f; float realMax = 1.200613f; float imagMin = 0.f; float imagMax = 1.23971f; // Parameters End TimeStampCounter timer; timer.Start(); // helper constants const int overallUpperBound = maxOf(upperBound[0], maxOf(upperBound[1], upperBound[2])); const float maxX = static_cast(iWidth ) - 1.f; const float maxY = static_cast(iHeight) - 1.f; const float xFact = iWidth / m_width; const float yFact = iHeight / m_height; const float realStep = (realMax - realMin) / nSteps[0]; const float imagStep = (imagMax - imagMin) / nSteps[1]; Canvas canvas(iHeight, iWidth); #ifdef Scalar for (float real = realMin; real <= realMax; real += realStep) { m_progress.setValue(99.f * (real - realMin) / (realMax - realMin)); for (float imag = imagMin; imag <= imagMax; imag += imagStep) { Z c(real, imag); Z c2 = Z(1.08f * real + 0.15f, imag); if (fastNorm(Z(real + 1.f, imag)) < 0.06f || (std::real(c2) < 0.42f && fastNorm(c2) < 0.417f)) { continue; } Z z = c; int n; for (n = 0; n <= maxIterations && fastNorm(z) < S; ++n) { z = P(z, c); } if (n <= maxIterations && n >= overallLowerBound) { // point is outside of the Mandelbrot set and required enough (overallLowerBound) // iterations to reach the cut-off value S Z cn(real, -imag); Z zn = cn; z = c; for (int i = 0; i <= overallUpperBound; ++i) { const float y2 = (std::imag(z) - m_y) * yFact; const float yn2 = (std::imag(zn) - m_y) * yFact; if (y2 >= 0.f && y2 < maxY && yn2 >= 0.f && yn2 < maxY) { const float x2 = (std::real(z) - m_x) * xFact; if (x2 >= 0.f && x2 < maxX) { const int red = (i >= lowerBound[0] && i <= upperBound[0]) ? 1 : 0; const int green = (i >= lowerBound[1] && i <= upperBound[1]) ? 1 : 0; const int blue = (i >= lowerBound[2] && i <= upperBound[2]) ? 
1 : 0; canvas.addDot(x2, y2 , red, green, blue); canvas.addDot(x2, yn2, red, green, blue); } } z = P(z, c); zn = P(zn, cn); if (fastNorm(z) >= S) { // optimization: skip some useless looping break; } } } } } #else const float imagStep2 = imagStep * float_v::Size; const float_v imagMin2 = imagMin + imagStep * static_cast(int_v::IndexesFromZero()); for (float real = realMin; real <= realMax; real += realStep) { m_progress.setValue(99.f * (real - realMin) / (realMax - realMin)); for (float_v imag = imagMin2; imag <= imagMax; imag += imagStep2) { // FIXME: extra "tracks" if nSteps[1] is not a multiple of float_v::Size Z c(float_v(real), imag); Z c2 = Z(float_v(1.08f * real + 0.15f), imag); if (fastNorm(Z(float_v(real + 1.f), imag)) < 0.06f || (std::real(c2) < 0.42f && fastNorm(c2) < 0.417f)) { continue; } Z z = c; int_v n(Vc::Zero); int_m inside = fastNorm(z) < S; while (!(inside && n <= maxIterations).isEmpty()) { z = P(z, c); ++n(inside); inside &= fastNorm(z) < S; } inside |= n < overallLowerBound; if (inside.isFull()) { continue; } Z cn(float_v(real), -imag); Z zn = cn; z = c; for (int i = 0; i <= overallUpperBound; ++i) { const float_v y2 = (std::imag(z) - m_y) * yFact; const float_v yn2 = (std::imag(zn) - m_y) * yFact; const float_v x2 = (std::real(z) - m_x) * xFact; z = P(z, c); zn = P(zn, cn); const float_m drawMask = !inside && y2 >= 0.f && x2 >= 0.f && y2 < maxY && x2 < maxX && yn2 >= 0.f && yn2 < maxY; const int red = (i >= lowerBound[0] && i <= upperBound[0]) ? 1 : 0; const int green = (i >= lowerBound[1] && i <= upperBound[1]) ? 1 : 0; const int blue = (i >= lowerBound[2] && i <= upperBound[2]) ? 1 : 0; foreach_bit(int j, drawMask) { canvas.addDot(x2[j], y2 [j], red, green, blue); canvas.addDot(x2[j], yn2[j], red, green, blue); } if (fastNorm(z) >= S) { // optimization: skip some useless looping break; } } } } #endif canvas.toQImage(&m_image); timer.Stop(); m_progress.done(); qDebug() << timer.Cycles() << "cycles"; if (m_filename.isEmpty()) { m_filename = QString("r%1-%2_g%3-%4_b%5-%6_s%7-%8_i%9-%10_%11x%12.png") .arg(lowerBound[0]).arg(upperBound[0]) .arg(lowerBound[1]).arg(upperBound[1]) .arg(lowerBound[2]).arg(upperBound[2]) .arg(nSteps[0]).arg(nSteps[1]) .arg(overallLowerBound).arg(maxIterations) .arg(m_image.width()).arg(m_image.height()); } m_image.save(m_filename); } static void usage(const char *argv0) { Baker::Options o; QTextStream out(stdout); out << "Usage: " << argv0 << " [options] []\n\n" << "Options:\n" << " -h|--help This message.\n" << " -s|--size Specify the width and height of the resulting image file. [1024 768]\n" << " -r|--red Specify lower and upper iteration bounds for a red trace. [" << o.red[0] << ' ' << o.red[1] << "]\n" << " -g|--green Specify lower and upper iteration bounds for a green trace. [" << o.green[0] << ' ' << o.green[1] << "]\n" << " -b|--blue Specify lower and upper iteration bounds for a blue trace. [" << o.blue[0] << ' ' << o.blue[1] << "]\n" << " --steps Specify the steps in real and imaginary direction. [width^1.5 height^1.5]\n" << " --minIt Overall lower iteration bound. [" << o.it[0] << "]\n" << " --maxIt Overall upper iteration bound. 
[" << o.it[1] << "]\n" ; } int main(int argc, char **argv) { QCoreApplication app(argc, argv); const QStringList &args = QCoreApplication::arguments(); if (args.contains("--help") || args.contains("-h")) { usage(argv[0]); return 0; } Baker b; Baker::Options opt; int width = 1024; int height = 768; // parse args for (int i = 1; i < args.size(); ++i) { const QString &arg = args[i]; bool ok = true; if (arg == QLatin1String("--red") || arg == QLatin1String("-r")) { opt.red[0] = args[++i].toInt(&ok); if (ok) { opt.red[1] = args[++i].toInt(&ok); } } else if (arg == QLatin1String("--green") || arg == QLatin1String("-g")) { opt.green[0] = args[++i].toInt(&ok); if (ok) { opt.green[1] = args[++i].toInt(&ok); } } else if (arg == QLatin1String("--blue") || arg == QLatin1String("-b")) { opt.blue[0] = args[++i].toInt(&ok); if (ok) { opt.blue[1] = args[++i].toInt(&ok); } } else if (arg == QLatin1String("--steps")) { opt.steps[0] = args[++i].toInt(&ok); if (ok) { opt.steps[1] = args[++i].toInt(&ok); } } else if (arg == QLatin1String("--minIt")) { opt.it[0] = args[++i].toInt(&ok); } else if (arg == QLatin1String("--maxIt")) { opt.it[1] = args[++i].toInt(&ok); } else if (arg == QLatin1String("--size") || arg == QLatin1String("-s")) { width = args[++i].toInt(&ok); if (ok) { height = args[++i].toInt(&ok); } } else { static bool filenameSet = false; ok = !filenameSet; filenameSet = true; b.setFilename(arg); } if (!ok) { usage(argv[0]); return 1; } } b.setOptions(opt); b.setSize(width, height); b.createImage(); return 0; } Vc-0.7.4/examples/buddhabrot/main.h000066400000000000000000000036611233512346000171300ustar00rootroot00000000000000/* Copyright (C) 2010-2011 Matthias Kretz Permission to use, copy, modify, and distribute this software and its documentation for any purpose and without fee is hereby granted, provided that the above copyright notice appear in all copies and that both that the copyright notice and this permission notice and warranty disclaimer appear in supporting documentation, and that the name of the author not be used in advertising or publicity pertaining to distribution of the software without specific, written prior permission. The author disclaim all warranties with regard to this software, including all implied warranties of merchantability and fitness. In no event shall the author be liable for any special, indirect or consequential damages or any damages whatsoever resulting from loss of use, data or profits, whether in an action of contract, negligence or other tortious action, arising out of or in connection with the use or performance of this software. 
*/ #ifndef MAIN_H #define MAIN_H #include #include #include #include class ProgressWriter { public: ProgressWriter(); void setValue(float v); void done(); private: QTextStream m_out; }; class Baker { public: struct Options { int red[2]; int green[2]; int blue[2]; int steps[2]; int it[2]; Options(); }; Baker(); void setOptions(Options o) { m_opt = o; } void setSize(int w, int h); void setFilename(const QString &); void createImage(); private: Options m_opt; float m_x; // left float m_y; // top float m_width; float m_height; QImage m_image; QString m_filename; ProgressWriter m_progress; }; #endif // MAIN_H Vc-0.7.4/examples/finitediff/000077500000000000000000000000001233512346000160165ustar00rootroot00000000000000Vc-0.7.4/examples/finitediff/CMakeLists.txt000066400000000000000000000000431233512346000205530ustar00rootroot00000000000000build_example(finitediff main.cpp) Vc-0.7.4/examples/finitediff/main.cpp000066400000000000000000000241001233512346000174430ustar00rootroot00000000000000/* Copyright (C) 2010 Jochen Gerhard Copyright (C) 2010-2012 Matthias Kretz Permission to use, copy, modify, and distribute this software and its documentation for any purpose and without fee is hereby granted, provided that the above copyright notice appear in all copies and that both that the copyright notice and this permission notice and warranty disclaimer appear in supporting documentation, and that the name of the author not be used in advertising or publicity pertaining to distribution of the software without specific, written prior permission. The author disclaim all warranties with regard to this software, including all implied warranties of merchantability and fitness. In no event shall the author be liable for any special, indirect or consequential damages or any damages whatsoever resulting from loss of use, data or profits, whether in an action of contract, negligence or other tortious action, arising out of or in connection with the use or performance of this software. */ /*! Finite difference method example We calculate central differences for a given function and compare it to the analytical solution. */ #include #include #include #include #include "../tsc.h" #include #define USE_SCALAR_SINCOS enum { N = 10240000, PrintStep = 1000000 }; static const float epsilon = 1e-7f; static const float lower = 0.f; static const float upper = 40000.f; static const float h = (upper - lower) / N; // dfu is the derivative of fu. This is really easy for sine and cosine: static inline float fu(float x) { return ( std::sin(x) ); } static inline float dfu(float x) { return ( std::cos(x) ); } static inline Vc::float_v fu(Vc::float_v::AsArg x) { #ifdef USE_SCALAR_SINCOS Vc::float_v r; for (int i = 0; i < Vc::float_v::Size; ++i) { r[i] = std::sin(x[i]); } return r; #else return Vc::sin(x); #endif } static inline Vc::float_v dfu(Vc::float_v::AsArg x) { #ifdef USE_SCALAR_SINCOS Vc::float_v r; for (int i = 0; i < Vc::float_v::Size; ++i) { r[i] = std::cos(x[i]); } return r; #else return Vc::cos(x); #endif } using Vc::float_v; // It is important for this example that the following variables (especially dy_points) are global // variables. Else the compiler can optimze all calculations of dy away except for the few places // where the value is used in printResults. 
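// The derivative is approximated with the usual difference quotients: in the interior the
// central difference dy[i] = (y[i+1] - y[i-1]) / (2h) is used (second-order accurate in h),
// while the first and last points fall back to the one-sided quotients
// dy[0] = (y[1] - y[0]) / h and dy[N-1] = (y[N-1] - y[N-2]) / h.
// The classical loop and the vectorized loop below implement exactly this scheme; they differ
// only in how loads, stores, unrolling, and prefetching are organized.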
Vc::Memory x_points; Vc::Memory y_points; float *VC_RESTRICT dy_points; void printResults() { std::cout << "------------------------------------------------------------\n" << std::setw(15) << "fu(x_i)" << std::setw(15) << "FD fu'(x_i)" << std::setw(15) << "SYM fu'(x)" << std::setw(15) << "error %\n"; for (int i = 0; i < N; i += PrintStep) { std::cout << std::setw(15) << y_points[i] << std::setw(15) << dy_points[i] << std::setw(15) << dfu(x_points[i]) << std::setw(15) << std::abs((dy_points[i] - dfu(x_points[i])) / (dfu(x_points[i] + epsilon)) * 100) << "\n"; } std::cout << std::setw(15) << y_points[N - 1] << std::setw(15) << dy_points[N - 1] << std::setw(15) << dfu(x_points[N - 1]) << std::setw(15) << std::abs((dy_points[N - 1] - dfu(x_points[N - 1])) / (dfu(x_points[N - 1] + epsilon)) * 100) << std::endl; } int main() { { float_v x_i(float_v::IndexType::IndexesFromZero()); for ( unsigned int i = 0; i < x_points.vectorsCount(); ++i, x_i += float_v::Size ) { const float_v x = x_i * h; x_points.vector(i) = x; y_points.vector(i) = fu(x); } } dy_points = Vc::malloc(N + float_v::Size - 1) + (float_v::Size - 1); double speedup; TimeStampCounter timer; { ///////// ignore this part - it only wakes up the CPU //////////////////////////// const float oneOver2h = 0.5f / h; // set borders explicit as up- or downdifferential dy_points[0] = (y_points[1] - y_points[0]) / h; // GCC auto-vectorizes the following loop. It is interesting to see that both Vc::Scalar and // Vc::SSE are faster, though. for ( int i = 1; i < N - 1; ++i) { dy_points[i] = (y_points[i + 1] - y_points[i - 1]) * oneOver2h; } dy_points[N - 1] = (y_points[N - 1] - y_points[N - 2]) / h; } ////////////////////////////////////////////////////////////////////////////////// { std::cout << "\n" << std::setw(60) << "Classical finite difference method" << std::endl; timer.Start(); const float oneOver2h = 0.5f / h; // set borders explicit as up- or downdifferential dy_points[0] = (y_points[1] - y_points[0]) / h; // GCC auto-vectorizes the following loop. It is interesting to see that both Vc::Scalar and // Vc::SSE are faster, though. for ( int i = 1; i < N - 1; ++i) { dy_points[i] = (y_points[i + 1] - y_points[i - 1]) * oneOver2h; } dy_points[N - 1] = (y_points[N - 1] - y_points[N - 2]) / h; timer.Stop(); printResults(); std::cout << "cycle count: " << timer.Cycles() << " | " << static_cast(N * 2) / timer.Cycles() << " FLOP/cycle" << " | " << static_cast(N * 2 * sizeof(float)) / timer.Cycles() << " Byte/cycle" << "\n"; } speedup = timer.Cycles(); { std::cout << std::setw(60) << "Vectorized finite difference method" << std::endl; timer.Start(); // All the differentials require to calculate (r - l) / 2h, where we calculate 1/2h as a // constant before the loop to avoid unnecessary calculations. Note that a good compiler can // already do this for you. const float_v oneOver2h = 0.5f / h; // Calculate the left border dy_points[0] = (y_points[1] - y_points[0]) / h; // Calculate the differentials streaming through the y and dy memory. The picture below // should give an idea of what values in y get read and what values are written to dy in // each iteration: // // y [...................................] // 00001111222233334444555566667777 // 00001111222233334444555566667777 // dy [...................................] // 00001111222233334444555566667777 // // The loop is manually unrolled four times to improve instruction level parallelism and // prefetching on architectures where four vectors fill one cache line. 
(Note that this // unrolling breaks auto-vectorization of the Vc::Scalar implementation when compiling with // GCC.) for (unsigned int i = 0; i < (y_points.entriesCount() - 2) / float_v::Size; i += 4) { // Prefetches make sure the data which is going to be used in 24/4 iterations is already // in the L1 cache. The prefetchForOneRead additionally instructs the CPU to not evict // these cache lines to L2/L3. Vc::prefetchForOneRead(&y_points[(i + 24) * float_v::Size]); // calculate float_v::Size differentials per (left - right) / 2h const float_v dy0 = (y_points.vector(i + 0, 2) - y_points.vector(i + 0)) * oneOver2h; const float_v dy1 = (y_points.vector(i + 1, 2) - y_points.vector(i + 1)) * oneOver2h; const float_v dy2 = (y_points.vector(i + 2, 2) - y_points.vector(i + 2)) * oneOver2h; const float_v dy3 = (y_points.vector(i + 3, 2) - y_points.vector(i + 3)) * oneOver2h; // Use streaming stores to reduce the required memory bandwidth. Without streaming // stores the CPU would first have to load the cache line, where the store occurs, from // memory into L1, then overwrite the data, and finally write it back to memory. But // since we never actually need the data that the CPU fetched from memory we'd like to // keep that bandwidth free for real work. Streaming stores allow us to issue stores // which the CPU gathers in store buffers to form full cache lines, which then get // written back to memory directly without the costly read. Thus we make better use of // the available memory bandwidth. dy0.store(&dy_points[(i + 0) * float_v::Size + 1], Vc::Streaming); dy1.store(&dy_points[(i + 1) * float_v::Size + 1], Vc::Streaming); dy2.store(&dy_points[(i + 2) * float_v::Size + 1], Vc::Streaming); dy3.store(&dy_points[(i + 3) * float_v::Size + 1], Vc::Streaming); } // Process the last vector. Note that this works for any N because Vc::Memory adds padding // to y_points and dy_points such that the last scalar value is somewhere inside lastVector. // The correct right border value for dy_points is overwritten in the last step unless N is // a multiple of float_v::Size + 2. // y [...................................] // 8888 // 8888 // dy [...................................] // 8888 { const size_t i = y_points.vectorsCount() - 1; const float_v left = y_points.vector(i, -2); const float_v right = y_points.lastVector(); ((right - left) * oneOver2h).store(&dy_points[i * float_v::Size - 1], Vc::Unaligned); } // ... 
and finally the right border dy_points[N - 1] = (y_points[N - 1] - y_points[N - 2]) / h; timer.Stop(); printResults(); std::cout << "cycle count: " << timer.Cycles() << " | " << static_cast(N * 2) / timer.Cycles() << " FLOP/cycle" << " | " << static_cast(N * 2 * sizeof(float)) / timer.Cycles() << " Byte/cycle" << "\n"; } speedup /= timer.Cycles(); std::cout << "Speedup: " << speedup << "\n"; Vc::free(dy_points - float_v::Size + 1); return 0; } Vc-0.7.4/examples/mandelbrot/000077500000000000000000000000001233512346000160365ustar00rootroot00000000000000Vc-0.7.4/examples/mandelbrot/CMakeLists.txt000066400000000000000000000045341233512346000206040ustar00rootroot00000000000000if(QT4_FOUND AND QT4_USABLE) include(${QT_USE_FILE}) include_directories(${CMAKE_CURRENT_BINARY_DIR}) qt4_generate_moc(main.h moc_main.cpp) qt4_generate_moc(mandel.h moc_mandel.cpp) set(SOURCES main.cpp ${CMAKE_CURRENT_BINARY_DIR}/moc_main.cpp mandel.cpp ${CMAKE_CURRENT_BINARY_DIR}/moc_mandel.cpp) build_example(mandelbrot ${SOURCES} LIBS ${QT_LIBRARIES}) # It is an interesting test if we can compare against the autovect # capabilities of Open64 and ICC, so we try to find those and compile extra # binaries with them find_program(O64_CXX openCC HINTS /opt/x86_open64-4.2.4/bin /opt/open64/bin) find_program(ICC_CXX icpc HINTS /opt/intel/bin $ENV{HOME}/intel/Compiler/11.1/072/bin/intel64) if(FALSE AND O64_CXX) add_custom_command(OUTPUT mandelbrot_open64 COMMAND ${O64_CXX} -O3 -Wall -msse3 -o ${CMAKE_CURRENT_BINARY_DIR}/mandelbrot_open64 -I ${CMAKE_CURRENT_BINARY_DIR} -I ${QT_INCLUDE_DIR} -I ${QT_QTCORE_INCLUDE_DIR} -I ${QT_QTGUI_INCLUDE_DIR} -I ${CMAKE_SOURCE_DIR} -I ${CMAKE_SOURCE_DIR}/include -L ${QT_LIBRARY_DIR} -lQtGui ${SOURCES} -DVC_IMPL=Scalar DEPENDS ${SOURCES} Vc WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMENT "Bulding mandelbrot_open64" VERBATIM) add_custom_target(build_mandelbrot_open64 ALL DEPENDS mandelbrot_open64) add_target_property(build_mandelbrot_open64 LABELS "other") add_dependencies(other build_mandelbrot_open64) endif() if(FALSE AND ICC_CXX) get_target_property(VcLocation Vc LOCATION) add_custom_command(OUTPUT mandelbrot_icc COMMAND ${ICC_CXX} -O3 -xSSE3 -o ${CMAKE_CURRENT_BINARY_DIR}/mandelbrot_icc -I ${CMAKE_CURRENT_BINARY_DIR} -I ${QT_INCLUDE_DIR} -I ${QT_QTCORE_INCLUDE_DIR} -I ${QT_QTGUI_INCLUDE_DIR} -I ${CMAKE_SOURCE_DIR} -I ${CMAKE_SOURCE_DIR}/include -L ${QT_LIBRARY_DIR} -lQtGui ${SOURCES} ${VcLocation} -DVC_IMPL=Scalar DEPENDS ${SOURCES} Vc WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMENT "Bulding mandelbrot_icc" VERBATIM) add_custom_target(build_mandelbrot_icc ALL DEPENDS mandelbrot_icc) add_target_property(build_mandelbrot_icc LABELS "other") add_dependencies(other build_mandelbrot_icc) endif() endif() Vc-0.7.4/examples/mandelbrot/main.cpp000066400000000000000000000110721233512346000174670ustar00rootroot00000000000000/* Copyright (C) 2010 Matthias Kretz Permission to use, copy, modify, and distribute this software and its documentation for any purpose and without fee is hereby granted, provided that the above copyright notice appear in all copies and that both that the copyright notice and this permission notice and warranty disclaimer appear in supporting documentation, and that the name of the author not be used in advertising or publicity pertaining to distribution of the software without specific, written prior permission. The author disclaim all warranties with regard to this software, including all implied warranties of merchantability and fitness. 
In no event shall the author be liable for any special, indirect or consequential damages or any damages whatsoever resulting from loss of use, data or profits, whether in an action of contract, negligence or other tortious action, arising out of or in connection with the use or performance of this software. */ #include "main.h" #include #include //#include MainWindow::MainWindow(QWidget *_parent) : QWidget(_parent), m_scale(0.01f) { m_x = width() * m_scale * -0.667f; m_y = height() * m_scale * -0.5f; m_rect1 = m_rect2 = rect(); m_rect1.setWidth(m_rect1.width() / 2); m_rect2.setX(m_rect1.width()); qRegisterMetaType(); qRegisterMetaType(); connect(&m_mandelVc, SIGNAL(ready(QImage, quint64)), SLOT(vcImage(QImage, quint64))); connect(&m_mandelScalar, SIGNAL(ready(QImage, quint64)), SLOT(scalarImage(QImage, quint64))); setWindowTitle(tr("Mandelbrot")); setCursor(Qt::CrossCursor); } void MainWindow::vcImage(const QImage &img, quint64 cycles) { m_img1 = img; update(m_rect1); if (cycles > 1) { m_cycles1 = cycles; updateTitle(); } if (QCoreApplication::arguments().contains("--benchmark")) { m_mandelScalar.brot(m_rect2.size(), m_x, m_y, m_scale); } } void MainWindow::scalarImage(const QImage &img, quint64 cycles) { m_img2 = img; update(m_rect2); if (cycles > 1) { m_cycles2 = cycles; updateTitle(); } } void MainWindow::updateTitle() { setWindowTitle(tr("Mandelbrot [Speedup: %1] [%2]").arg(m_cycles2 / m_cycles1).arg(m_img1 == m_img2 ? "Equal" : "Not Equal")); } void MainWindow::paintEvent(QPaintEvent *e) { QPainter p(this); QRect r1 = m_rect1 & e->rect(); p.drawImage(r1, m_img1, r1.translated(m_dragDelta)); QRect r2 = m_rect2 & e->rect(); p.drawImage(r2, m_img2, QRect(QPoint(), r2.size()).translated(m_dragDelta)); } void MainWindow::mousePressEvent(QMouseEvent *e) { m_dragStart = e->pos(); } void MainWindow::mouseMoveEvent(QMouseEvent *e) { m_dragDelta = m_dragStart - e->pos(); update(); } void MainWindow::mouseReleaseEvent(QMouseEvent *e) { m_dragDelta = m_dragStart - e->pos(); // translate m_x, m_y accordingly and recreate the image m_x += m_dragDelta.x() * m_scale; m_y += m_dragDelta.y() * m_scale; recreateImage(); m_dragDelta = QPoint(); } void MainWindow::wheelEvent(QWheelEvent *e) { if (e->delta() < 0 && width() * m_scale > 3.f && height() * m_scale > 2.f) { return; } const float xx = e->x() >= m_rect1.width() ? 
e->x() - m_rect1.width() : e->x(); const float constX = m_x + m_scale * xx; const float constY = m_y + m_scale * e->y(); if (e->delta() > 0) { m_scale *= 1.f / (1.f + e->delta() * 0.001f); } else { m_scale *= 1.f - e->delta() * 0.001f; } m_x = constX - m_scale * xx; m_y = constY - m_scale * e->y(); recreateImage(); //update(); } void MainWindow::resizeEvent(QResizeEvent *e) { if (e->oldSize().isValid()) { m_x += 0.25f * m_scale * (e->oldSize().width() - e->size().width()); m_y += 0.5f * m_scale * (e->oldSize().height() - e->size().height()); } else { m_x = e->size().width() * m_scale * -0.333f; m_y = e->size().height() * m_scale * -0.5f; } m_rect1 = m_rect2 = QRect(QPoint(), e->size()); m_rect1.setWidth(m_rect1.width() / 2); m_rect2.setX(m_rect1.width()); recreateImage(); update(); } void MainWindow::recreateImage() { if (!QCoreApplication::arguments().contains("--benchmark")) { m_mandelScalar.brot(m_rect2.size(), m_x, m_y, m_scale); } m_mandelVc.brot(m_rect1.size(), m_x, m_y, m_scale); } int main(int argc, char **argv) { QApplication app(argc, argv); MainWindow w; w.resize(600, 200); w.show(); return app.exec(); } Vc-0.7.4/examples/mandelbrot/main.h000066400000000000000000000041621233512346000171360ustar00rootroot00000000000000/* Copyright (C) 2010 Matthias Kretz Permission to use, copy, modify, and distribute this software and its documentation for any purpose and without fee is hereby granted, provided that the above copyright notice appear in all copies and that both that the copyright notice and this permission notice and warranty disclaimer appear in supporting documentation, and that the name of the author not be used in advertising or publicity pertaining to distribution of the software without specific, written prior permission. The author disclaim all warranties with regard to this software, including all implied warranties of merchantability and fitness. In no event shall the author be liable for any special, indirect or consequential damages or any damages whatsoever resulting from loss of use, data or profits, whether in an action of contract, negligence or other tortious action, arising out of or in connection with the use or performance of this software. 
*/ #ifndef MAIN_H #define MAIN_H #include #include #include #include #include #include #include "mandel.h" class MainWindow : public QWidget { Q_OBJECT public: MainWindow(QWidget *parent = 0); protected: void paintEvent(QPaintEvent *); void resizeEvent(QResizeEvent *); void mousePressEvent(QMouseEvent *); void mouseMoveEvent(QMouseEvent *); void mouseReleaseEvent(QMouseEvent *); void wheelEvent(QWheelEvent *); private slots: void vcImage(const QImage &, quint64); void scalarImage(const QImage &, quint64); private: void recreateImage(); void updateTitle(); float m_x; // left float m_y; // top float m_scale; QImage m_img1; QImage m_img2; QRect m_rect1; QRect m_rect2; QPoint m_dragStart; QPoint m_dragDelta; float m_cycles1, m_cycles2; Mandel m_mandelVc; Mandel m_mandelScalar; }; #endif // MAIN_H Vc-0.7.4/examples/mandelbrot/mandel.cpp000066400000000000000000000145131233512346000200060ustar00rootroot00000000000000/* Copyright (C) 2010-2011 Matthias Kretz Permission to use, copy, modify, and distribute this software and its documentation for any purpose and without fee is hereby granted, provided that the above copyright notice appear in all copies and that both that the copyright notice and this permission notice and warranty disclaimer appear in supporting documentation, and that the name of the author not be used in advertising or publicity pertaining to distribution of the software without specific, written prior permission. The author disclaim all warranties with regard to this software, including all implied warranties of merchantability and fitness. In no event shall the author be liable for any special, indirect or consequential damages or any damages whatsoever resulting from loss of use, data or profits, whether in an action of contract, negligence or other tortious action, arising out of or in connection with the use or performance of this software. 
*/ #include "mandel.h" #include #include #include "../tsc.h" #include #include using Vc::float_v; using Vc::float_m; using Vc::uint_v; using Vc::uint_m; template Mandel::Mandel(QObject *_parent) : MandelBase(_parent) { } MandelBase::MandelBase(QObject *_parent) : QThread(_parent), m_restart(false), m_abort(false) { } MandelBase::~MandelBase() { m_mutex.lock(); m_abort = true; m_wait.wakeOne(); m_mutex.unlock(); wait(); } void MandelBase::brot(const QSize &size, float x, float y, float scale) { QMutexLocker lock(&m_mutex); m_size = size; m_x = x; m_y = y; m_scale = scale; if (!isRunning()) { start(LowPriority); } else { m_restart = true; m_wait.wakeOne(); } } void MandelBase::run() { while (!m_abort) { // first we copy the parameters to our local data so that the main main thread can give a // new task while we're working m_mutex.lock(); // destination image, RGB is good - no need for alpha QImage image(m_size, QImage::Format_RGB32); float x = m_x; float y = m_y; float scale = m_scale; m_mutex.unlock(); // benchmark the number of cycles it takes TimeStampCounter timer; timer.Start(); // calculate the mandelbrot set/image mandelMe(image, x, y, scale, 255); timer.Stop(); // if no new set was requested in the meantime - return the finished image if (!m_restart) { emit ready(image, timer.Cycles()); } // wait for more work m_mutex.lock(); if (!m_restart) { m_wait.wait(&m_mutex); } m_restart = false; m_mutex.unlock(); } } static const float S = 4.f; /** * std::complex is way too slow for our limited purposes: * * norm is implemented as std::abs(z) * std::abs(z) for float * z * z is implemented as multiplication & lots of branches looking for NaN and inf * * since we know that we require the square of r and i for norm and multiplication we can * explicitely cache it in the object */ //! [MyComplex] template class MyComplex { public: MyComplex(T r, T i) : m_real(r), m_imag(i), m_real2(r * r), m_imag2(i * i) { } MyComplex squaredPlus(T r, T i) const { return MyComplex( m_real2 + r - m_imag2, (m_real + m_real) * m_imag + i ); } T norm() const { return m_real2 + m_imag2; } private: T m_real, m_imag; T m_real2, m_imag2; }; //! [MyComplex] //! [P function] template inline MyComplex P(MyComplex z, T c_real, T c_imag) { return z.squaredPlus(c_real, c_imag); } //! 
[P function] template<> void Mandel::mandelMe(QImage &image, float x0, float y0, float scale, int maxIt) { typedef MyComplex Z; const unsigned int height = image.height(); const unsigned int width = image.width(); const float_v colorScale = 0xff / static_cast(maxIt); for (unsigned int y = 0; y < height; ++y) { unsigned int *VC_RESTRICT line = reinterpret_cast(image.scanLine(y)); const float_v c_imag = y0 + y * scale; uint_m toStore; for (uint_v x = uint_v::IndexesFromZero(); !(toStore = x < width).isEmpty(); x += float_v::Size) { const float_v c_real = x0 + x * scale; Z z(c_real, c_imag); float_v n = 0.f; float_m inside = z.norm() < S; while (!(inside && n < maxIt).isEmpty()) { z = P(z, c_real, c_imag); ++n(inside); inside = z.norm() < S; } uint_v colorValue = static_cast((maxIt - n) * colorScale) * 0x10101; if (toStore.isFull()) { colorValue.store(line, Vc::Unaligned); line += uint_v::Size; } else { colorValue.store(line, toStore, Vc::Unaligned); break; // we don't need to check again wether x[0] + float_v::Size < width to break out of the loop } } if (restart()) { break; } } } template<> void Mandel::mandelMe(QImage &image, float x0, float y0, float scale, int maxIt) { typedef MyComplex Z; const int height = image.height(); const int width = image.width(); const float colorScale = 0xff / static_cast(maxIt); for (int y = 0; y < height; ++y) { unsigned int *VC_RESTRICT line = reinterpret_cast(image.scanLine(y)); const float c_imag = y0 + y * scale; for (int x = 0; x < width; ++x) { const float c_real = x0 + x * scale; Z z(c_real, c_imag); int n = 0; for (; z.norm() < S && n < maxIt; ++n) { z = P(z, c_real, c_imag); } *line++ = static_cast((maxIt - n) * colorScale) * 0x10101; } if (restart()) { break; } } } template class Mandel; template class Mandel; // vim: sw=4 sts=4 et tw=100 Vc-0.7.4/examples/mandelbrot/mandel.h000066400000000000000000000041601233512346000174500ustar00rootroot00000000000000/* Copyright (C) 2010 Matthias Kretz Permission to use, copy, modify, and distribute this software and its documentation for any purpose and without fee is hereby granted, provided that the above copyright notice appear in all copies and that both that the copyright notice and this permission notice and warranty disclaimer appear in supporting documentation, and that the name of the author not be used in advertising or publicity pertaining to distribution of the software without specific, written prior permission. The author disclaim all warranties with regard to this software, including all implied warranties of merchantability and fitness. In no event shall the author be liable for any special, indirect or consequential damages or any damages whatsoever resulting from loss of use, data or profits, whether in an action of contract, negligence or other tortious action, arising out of or in connection with the use or performance of this software. 
*/ #include #include #include #include #include #include enum MandelImpl { VcImpl, ScalarImpl }; class MandelBase : public QThread { Q_OBJECT public: void brot(const QSize &size, float x, float y, float scale); protected: MandelBase(QObject* _parent = 0); ~MandelBase(); void emitImage(const QImage &image, quint64 cycles) { emit ready(image, cycles); } void run(); virtual void mandelMe(QImage &image, float x, float y, float scale, int maxIterations) = 0; inline bool restart() const { return m_restart; } signals: void ready(const QImage &image, quint64 cycles); private: QMutex m_mutex; QWaitCondition m_wait; QSize m_size; float m_x, m_y, m_scale; bool m_restart; bool m_abort; }; template class Mandel : public MandelBase { public: Mandel(QObject *_parent = 0); protected: void mandelMe(QImage &image, float x, float y, float scale, int maxIterations); }; Vc-0.7.4/examples/matrix/000077500000000000000000000000001233512346000152135ustar00rootroot00000000000000Vc-0.7.4/examples/matrix/CMakeLists.txt000066400000000000000000000000371233512346000177530ustar00rootroot00000000000000build_example(matrix main.cpp) Vc-0.7.4/examples/matrix/main.cpp000066400000000000000000000051231233512346000166440ustar00rootroot00000000000000/* This file is part of the Vc project Copyright (C) 2009-2010 Matthias Kretz Permission to use, copy, modify, and distribute this software and its documentation for any purpose and without fee is hereby granted, provided that the above copyright notice appear in all copies and that both that the copyright notice and this permission notice and warranty disclaimer appear in supporting documentation, and that the name of the author not be used in advertising or publicity pertaining to distribution of the software without specific, written prior permission. The author disclaim all warranties with regard to this software, including all implied warranties of merchantability and fitness. In no event shall the author be liable for any special, indirect or consequential damages or any damages whatsoever resulting from loss of use, data or profits, whether in an action of contract, negligence or other tortious action, arising out of or in connection with the use or performance of this software. 
*/ #include #include #include #include template class Matrix; template std::ostream &operator<<(std::ostream &, const Matrix &); template class Matrix { friend std::ostream &operator<< <>(std::ostream &, const Matrix &); private: typedef Vc::Vector V; Vc::Memory m_mem; public: Matrix &operator=(const T &val) { V vec(val); for (unsigned int i = 0; i < m_mem.vectorsCount(); ++i) { m_mem.vector(i) = vec; } return *this; } Matrix &operator+=(const Matrix &rhs) { for (unsigned int i = 0; i < m_mem.vectorsCount(); ++i) { V v1(m_mem.vector(i)); v1 += V(rhs.m_mem.vector(i)); m_mem.vector(i) = v1; } return *this; } }; template std::ostream &operator<<(std::ostream &out, const Matrix &m) { for (unsigned int i = 0; i < Size; ++i) { std::cout << "[" << std::setw(6) << m.m_mem[i * Size]; for (unsigned int j = 1; j < Size; ++j) { std::cout << std::setw(6) << m.m_mem[i * Size + j]; } std::cout << " ]\n"; } return out; } int main() { Matrix m1; m1 = 1.f; Matrix m2; m2 = 2.f; m1 += m2; std::cout << m1 << std::endl; return 0; } Vc-0.7.4/examples/polarcoord/000077500000000000000000000000001233512346000160535ustar00rootroot00000000000000Vc-0.7.4/examples/polarcoord/CMakeLists.txt000066400000000000000000000000431233512346000206100ustar00rootroot00000000000000build_example(polarcoord main.cpp) Vc-0.7.4/examples/polarcoord/main.cpp000066400000000000000000000052561233512346000175130ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Permission to use, copy, modify, and distribute this software and its documentation for any purpose and without fee is hereby granted, provided that the above copyright notice appear in all copies and that both that the copyright notice and this permission notice and warranty disclaimer appear in supporting documentation, and that the name of the author not be used in advertising or publicity pertaining to distribution of the software without specific, written prior permission. The author disclaim all warranties with regard to this software, including all implied warranties of merchantability and fitness. In no event shall the author be liable for any special, indirect or consequential damages or any damages whatsoever resulting from loss of use, data or profits, whether in an action of contract, negligence or other tortious action, arising out of or in connection with the use or performance of this software. */ //! [includes] #include #include #include using Vc::float_v; //! [includes] //! [memory allocation] int main() { // allocate memory for our initial x and y coordinates. Note that you can also put it into a // normal float C-array but that you then must ensure alignment to Vc::VectorAlignment! Vc::Memory x_mem; Vc::Memory y_mem; Vc::Memory r_mem; Vc::Memory phi_mem; //! [memory allocation] //! [random init] // fill the memory with values from -1.f to 1.f for (size_t i = 0; i < x_mem.vectorsCount(); ++i) { x_mem.vector(i) = float_v::Random() * 2.f - 1.f; y_mem.vector(i) = float_v::Random() * 2.f - 1.f; } //! [random init] //! [conversion] // calculate the polar coordinates for all coordinates and overwrite the euclidian coordinates // with the result for (size_t i = 0; i < x_mem.vectorsCount(); ++i) { const float_v x = x_mem.vector(i); const float_v y = y_mem.vector(i); r_mem.vector(i) = Vc::sqrt(x * x + y * y); float_v phi = Vc::atan2(y, x) * 57.295780181884765625f; // 180/pi phi(phi < 0.f) += 360.f; phi_mem.vector(i) = phi; } //! [conversion] //! 
[output] // print the results for (size_t i = 0; i < x_mem.entriesCount(); ++i) { std::cout << std::setw(3) << i << ": "; std::cout << std::setw(10) << x_mem[i] << ", " << std::setw(10) << y_mem[i] << " -> "; std::cout << std::setw(10) << r_mem[i] << ", " << std::setw(10) << phi_mem[i] << '\n'; } return 0; } //! [output] Vc-0.7.4/examples/tsc.h000066400000000000000000000032461233512346000146560ustar00rootroot00000000000000/* Copyright (C) 2009-2012 Matthias Kretz This program is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) version 3. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #ifndef TSC_H #define TSC_H #ifdef _MSC_VER #include #pragma intrinsic(__rdtsc) #endif class TimeStampCounter { public: void Start(); void Stop(); unsigned long long Cycles() const; private: union Data { unsigned long long a; unsigned int b[2]; } m_start, m_end; }; inline void TimeStampCounter::Start() { #ifdef _MSC_VER unsigned int tmp; m_start.a = __rdtscp(&tmp); #else asm volatile("rdtscp" : "=a"(m_start.b[0]), "=d"(m_start.b[1]) :: "ecx" ); #endif } inline void TimeStampCounter::Stop() { #ifdef _MSC_VER unsigned int tmp; m_end.a = __rdtscp(&tmp); #else asm volatile("rdtscp" : "=a"(m_end.b[0]), "=d"(m_end.b[1]) :: "ecx" ); #endif } inline unsigned long long TimeStampCounter::Cycles() const { return m_end.a - m_start.a; } #endif // TSC_H Vc-0.7.4/generateForceToRegisters.rb000077500000000000000000000030701233512346000173650ustar00rootroot00000000000000#!/usr/bin/env ruby puts '#ifdef VC_GNU_ASM' 1.upto 8 do |max| print 'template<' max.downto 2 do |i| print "typename T#{i}, " end print "typename T1>\nstatic inline void ALWAYS_INLINE forceToRegisters(" max.downto 2 do |i| print "const Vector &x#{i}, " end print "const Vector &x1) {\n" print " __asm__ __volatile__(\"\"::" max.downto 2 do |i| print "\"x\"(x#{i}.data()), " end print "\"x\"(x1.data()));\n}\n" print 'template<' max.downto 2 do |i| print "typename T#{i}, " end print "typename T1>\nstatic inline void ALWAYS_INLINE forceToRegistersDirty(" max.downto 2 do |i| print "Vector &x#{i}, " end print "Vector &x1) {\n" print " __asm__ __volatile__(\"\":" max.downto 2 do |i| print "\"+x\"(x#{i}.data()), " end print "\"+x\"(x1.data()));\n}\n" end puts '#elif defined(VC_MSVC)' 1.upto 8 do |max| puts '#pragma optimize("g", off)' print 'template<' max.downto 2 do |i| print "typename T#{i}, " end print "typename T1>\nstatic inline void ALWAYS_INLINE forceToRegisters(" max.downto 2 do |i| print "const Vector &/*x#{i}*/, " end print "const Vector &/*x1*/) {\n" print "}\n" puts '#pragma optimize("g", off)' print 'template<' max.downto 2 do |i| print "typename T#{i}, " end print "typename T1>\nstatic inline void ALWAYS_INLINE forceToRegistersDirty(" max.downto 2 do |i| print "Vector &/*x#{i}*/, " end print "Vector &/*x1*/) {\n" print "}\n" puts '#pragma optimize("g", on)' end puts '#else' puts '#error "forceToRegisters unsupported on this compiler"' puts '#endif' 
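The generateForceToRegisters.rb script above emits a family of forceToRegisters and forceToRegistersDirty overloads for one to eight Vector arguments. On GCC-compatible compilers each overload is a single empty inline-asm statement that merely names the vectors as "x" register operands ("+x" in the dirty variant), forcing the values into registers and acting as an optimization barrier; for MSVC the script instead emits empty functions bracketed by #pragma optimize("g", off) and #pragma optimize("g", on), and any other compiler hits the #error branch. As a sketch of the generated output, the two-argument GNU-assembler overloads should look roughly like this (note: the angle-bracket template arguments such as Vector<T2> do not survive in this dump and are reconstructed here as an assumption):

template<typename T2, typename T1>
static inline void ALWAYS_INLINE forceToRegisters(const Vector<T2> &x2, const Vector<T1> &x1) {
    // empty asm statement whose operand list pins both vectors into registers
    __asm__ __volatile__(""::"x"(x2.data()), "x"(x1.data()));
}
template<typename T2, typename T1>
static inline void ALWAYS_INLINE forceToRegistersDirty(Vector<T2> &x2, Vector<T1> &x1) {
    // "+x" marks the operands as read-write, so the compiler must assume they were modified
    __asm__ __volatile__("":"+x"(x2.data()), "+x"(x1.data()));
}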
Vc-0.7.4/include/000077500000000000000000000000001233512346000135145ustar00rootroot00000000000000Vc-0.7.4/include/Vc/000077500000000000000000000000001233512346000140645ustar00rootroot00000000000000Vc-0.7.4/include/Vc/Allocator000066400000000000000000000212231233512346000157270ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright (C) 2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . }}}*/ #ifndef VC_ALLOCATOR_H #define VC_ALLOCATOR_H #include #include #include #include "global.h" #ifdef VC_CXX11 #include #endif #include "common/macros.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { using std::size_t; using std::ptrdiff_t; /** * \headerfile Allocator * \ingroup Utilities * * Convenience macro to set the default allocator for a given \p Type to Vc::Allocator. * * \param Type Your type that you want to use with STL containers. * * \note You have to use this macro in the global namespace. */ #define VC_DECLARE_ALLOCATOR(Type) \ namespace std \ { \ template<> class allocator : public ::Vc::Allocator \ { \ public: \ template struct rebind { typedef ::std::allocator other; }; \ }; \ } #ifdef VC_MSVC #undef Vc_DECLARE_ALLOCATOR #define Vc_DECLARE_ALLOCATOR(Type) \ namespace std \ { \ template<> class allocator : public ::Vc::Allocator \ { \ public: \ template struct rebind { typedef ::std::allocator other; }; \ /* MSVC brokenness: the following function is optional - just doesn't compile without it */ \ const allocator &select_on_container_copy_construction() const { return *this; } \ }; \ } #endif /** * \headerfile Allocator * An allocator that uses global new and supports over-aligned types, as per [C++11 20.6.9]. * * Meant as a simple replacement for the allocator defined in the C++ Standard. * Allocation is done using the global new/delete operators. But if the alignment property of \p * T is larger than the size of a pointer, the allocate function allocates slightly more memory * to adjust the pointer for correct alignment. * * If the \p T does not require over-alignment no additional memory will be allocated. * * \tparam T The type of objects to allocate. * * Example: * \code * struct Data { * Vc::float_v x, y, z; * }; * * void fun() * { * std::vector dat0; // this will use std::allocator, which probably ignores the * // alignment requirements for Data. Thus any access to dat0 may * // crash your program. * * std::vector > dat1; // now std::vector will get correctly aligned * // memory. Accesses to dat1 are safe. * ... * \endcode * * %Vc ships a macro to conveniently tell STL to use Vc::Allocator per default for a given type: * \code * struct Data { * Vc::float_v x, y, z; * }; * VC_DECLARE_ALLOCATOR(Data) * * void fun() * { * std::vector dat0; // good now * ... 
* \endcode * * \ingroup Utilities */ template class Allocator { private: enum Constants { #ifdef VC_HAVE_STD_MAX_ALIGN_T NaturalAlignment = alignof(std::max_align_t), #elif defined(VC_HAVE_MAX_ALIGN_T) NaturalAlignment = alignof(::max_align_t), #else NaturalAlignment = sizeof(void *) > Vc_ALIGNOF(long double) ? sizeof(void *) : (Vc_ALIGNOF(long double) > Vc_ALIGNOF(long long) ? Vc_ALIGNOF(long double) : Vc_ALIGNOF(long long)), #endif #ifdef VC_IMPL_AVX SimdAlignment = 32, #elif defined VC_IMPL_SSE SimdAlignment = 16, #else SimdAlignment = 1, #endif Alignment = Vc_ALIGNOF(T) > SimdAlignment ? Vc_ALIGNOF(T) : SimdAlignment, /* The number of extra bytes allocated must be large enough to put a pointer right * before the adjusted address. This pointer stores the original address, which is * required to call ::operator delete in deallocate. * * The address we get from ::operator new is a multiple of NaturalAlignment: * p = N * NaturalAlignment * * Since all alignments are powers of two, Alignment is a multiple of NaturalAlignment: * Alignment = k * NaturalAlignment * * two cases: * 1. If p is already aligned to Alignment then allocate will return p + Alignment. In * this case there are Alignment Bytes available to store a pointer. * 2. If p is not aligned then p + (k - (N modulo k)) * NaturalAlignment will be * returned. Since NaturalAlignment >= sizeof(void*) the pointer fits. */ ExtraBytes = Alignment > NaturalAlignment ? Alignment : 0, AlignmentMask = Alignment - 1 }; public: typedef size_t size_type; typedef ptrdiff_t difference_type; typedef T* pointer; typedef const T* const_pointer; typedef T& reference; typedef const T& const_reference; typedef T value_type; template struct rebind { typedef Allocator other; }; Allocator() throw() { } Allocator(const Allocator&) throw() { } template Allocator(const Allocator&) throw() { } pointer address(reference x) const { return &x; } const_pointer address(const_reference x) const { return &x; } pointer allocate(size_type n, const void* = 0) { if (n > this->max_size()) { throw std::bad_alloc(); } char *p = static_cast(::operator new(n * sizeof(T) + ExtraBytes)); if (ExtraBytes > 0) { char *const pp = p; p += ExtraBytes; const char *null = 0; p -= ((p - null) & AlignmentMask); // equivalent to p &= ~AlignmentMask; reinterpret_cast(p)[-1] = pp; } return reinterpret_cast(p); } void deallocate(pointer p, size_type) { if (ExtraBytes > 0) { p = reinterpret_cast(p)[-1]; } ::operator delete(p); } size_type max_size() const throw() { return size_t(-1) / sizeof(T); } #ifdef VC_MSVC // MSVC brokenness: the following function is optional - just doesn't compile without it const Allocator &select_on_container_copy_construction() const { return *this; } // MSVC also requires a function that neither C++98 nor C++11 mention // but it doesn't support variadic templates... otherwise the VC_CXX11 clause would be nice void construct(pointer p) { ::new(p) T(); } // we still need the C++98 version: void construct(pointer p, const T& __val) { ::new(p) T(__val); } void destroy(pointer p) { p->~T(); } #elif defined(VC_CXX11) template void construct(U* p, Args&&... 
args) { ::new(p) U(std::forward(args)...); } template void destroy(U* p) { p->~U(); } #else void construct(pointer p, const T& __val) { ::new(p) T(__val); } void destroy(pointer p) { p->~T(); } #endif }; template inline bool operator==(const Allocator&, const Allocator&) { return true; } template inline bool operator!=(const Allocator&, const Allocator&) { return false; } } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "common/undomacros.h" #include "vector.h" namespace std { template class allocator > : public ::Vc::Allocator > { public: template struct rebind { typedef ::std::allocator other; }; #ifdef VC_MSVC // MSVC brokenness: the following function is optional - just doesn't compile without it const allocator &select_on_container_copy_construction() const { return *this; } #endif }; } #endif // VC_ALLOCATOR_H // vim: ft=cpp et sw=4 sts=4 Vc-0.7.4/include/Vc/IO000066400000000000000000000126631233512346000143260ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2011 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef VECIO_H #define VECIO_H #include "vector.h" #include "Memory" #include #if defined(__GNUC__) && !defined(_WIN32) && defined(_GLIBCXX_OSTREAM) #define VC_HACK_OSTREAM_FOR_TTY 1 #endif #ifdef VC_HACK_OSTREAM_FOR_TTY #include #include #endif #include "internal/namespace.h" namespace { namespace AnsiColor { struct Type { const char *data; }; static const Type green = { "\033[1;40;32m" }; static const Type yellow = { "\033[1;40;33m" }; static const Type blue = { "\033[1;40;34m" }; static const Type normal = { "\033[0m" }; } // namespace AnsiColor #ifdef VC_HACK_OSTREAM_FOR_TTY class hacked_ostream : public std::ostream { public: using std::ostream::_M_streambuf; }; __attribute__((__const__)) bool mayUseColor(const std::ostream &os) { std::basic_streambuf *hack1 = const_cast *>(os.*(&hacked_ostream::_M_streambuf)); __gnu_cxx::stdio_sync_filebuf *hack = dynamic_cast<__gnu_cxx::stdio_sync_filebuf *>(hack1); if (!hack) { return false; } FILE *file = hack->file(); return 1 == isatty(fileno(file)); } #else inline bool mayUseColor(const std::ostream &) { return false; } #endif } // anonymous namespace namespace std { inline std::ostream &operator<<(std::ostream &out, const AnsiColor::Type &c) { if (mayUseColor(out)) { out << c.data; } return out; } template inline std::ostream &operator<<(std::ostream &out, const VECTOR_NAMESPACE::Vector &v) { out << AnsiColor::green << "["; out << v[0]; for (int i = 1; i < v.Size; ++i) { out << ", " << v[i]; } out << "]" << AnsiColor::normal; return out; } inline std::ostream &operator<<(std::ostream &out, const VECTOR_NAMESPACE::Vector &v) { out << AnsiColor::green << "["; out << int(v[0]); for (int i = 1; i < v.Size; ++i) { out << ", " << int(v[i]); } out << "]" << AnsiColor::normal; return out; } inline std::ostream &operator<<(std::ostream &out, const VECTOR_NAMESPACE::Vector &v) { out << AnsiColor::green << "["; out << int(v[0]); for (int i = 1; i < v.Size; ++i) { out << ", 
" << int(v[i]); } out << "]" << AnsiColor::normal; return out; } #ifdef VC_HAVE_FMA template inline std::ostream &operator<<(std::ostream &out, const VECTOR_NAMESPACE::VectorMultiplication &v) { return out << VECTOR_NAMESPACE::Vector(v); } #endif #ifdef VC_IMPL_AVX template inline std::ostream &operator<<(std::ostream &out, const VECTOR_NAMESPACE::Mask &m) #else template inline std::ostream &operator<<(std::ostream &out, const VECTOR_NAMESPACE::Mask &m) #endif { out << AnsiColor::blue << "m["; for (unsigned int i = 0; i < VectorSize; ++i) { if (i > 0 && (i % 4) == 0) { out << " "; } if ( m[i] ) { out << AnsiColor::yellow << '1'; } else { out << AnsiColor::blue << '0'; } } out << AnsiColor::blue << "]" << AnsiColor::normal; return out; } #ifdef VC_IMPL_SSE inline std::ostream &operator<<(std::ostream &out, const VECTOR_NAMESPACE::Float8Mask &m) { out << AnsiColor::blue << "m["; for (unsigned int i = 0; i < 8; ++i) { if (i > 0 && (i % 4) == 0) { out << " "; } if ( m[i] ) { out << AnsiColor::yellow << '1'; } else { out << AnsiColor::blue << '0'; } } out << AnsiColor::blue << "]" << AnsiColor::normal; return out; } #endif template inline std::ostream &operator<<(std::ostream &out, const Vc::MemoryBase &m ) { out << AnsiColor::blue << "{" << AnsiColor::normal; for (unsigned int i = 0; i < m.vectorsCount(); ++i) { out << V(m.vector(i)); } out << AnsiColor::blue << "}" << AnsiColor::normal; return out; } template inline std::ostream &operator<<(std::ostream &out, const Vc::MemoryBase &m ) { out << AnsiColor::blue << "{" << AnsiColor::normal; for (size_t i = 0; i < m.rowsCount(); ++i) { if (i > 0) { out << "\n "; } const size_t vcount = m[i].vectorsCount(); for (size_t j = 0; j < vcount; ++j) { out << V(m[i].vector(j)); } } out << AnsiColor::blue << "}" << AnsiColor::normal; return out; } } // namespace std #undef VECTOR_NAMESPACE #endif // VECIO_H // vim: ft=cpp Vc-0.7.4/include/Vc/Memory000066400000000000000000000021511233512346000152560ustar00rootroot00000000000000 /* This file is part of the Vc library. Copyright (C) 2009 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef INCLUDE_VC_MEMORY #define INCLUDE_VC_MEMORY #include "vector.h" #include "common/memory.h" #include "common/interleavedmemory.h" #ifdef VC_IMPL_Scalar # include "scalar/interleavedmemory.tcc" #elif defined(VC_IMPL_AVX) # include "avx/interleavedmemory.tcc" #elif defined(VC_IMPL_SSE) # include "sse/interleavedmemory.tcc" #endif #endif // INCLUDE_VC_MEMORY // vim: ft=cpp Vc-0.7.4/include/Vc/Utils000066400000000000000000000016711233512346000151140ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2010 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef VC_UTILS #define VC_UTILS #include "global.h" #ifdef VC_IMPL_Scalar # define VECTOR_NAMESPACE Scalar #else # define VECTOR_NAMESPACE SSE #endif #include "common/deinterleave.h" #endif // VC_UTILS Vc-0.7.4/include/Vc/Vc000066400000000000000000000015771233512346000143710ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef VC_VC #define VC_VC #include "vector.h" #include "IO" #include "Memory" #include "Utils" #include "Allocator" #endif // VC_VC // vim: ft=cpp Vc-0.7.4/include/Vc/cpuid.h000066400000000000000000000275551233512346000153570ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef CPUID_H #define CPUID_H /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { /** * \ingroup Utilities * \headerfile cpuid.h * * This class is available for x86 / AMD64 systems to read and interpret information about the CPU's * capabilities. * * Before any of the getter functions may be called, the init() function must have been called. It * will be called automatically, but for any function executing before main, you better call * \c CpuId::init() first. * * %Vc users will most likely not need this class directly, but rely on the * isImplementationSupported, bestImplementationSupported, extraInstructionsSupported, and * currentImplementationSupported functions. */ class CpuId { typedef unsigned char uchar; typedef unsigned short ushort; typedef unsigned int uint; public: enum ProcessorType { OriginalOemProcessor = 0, IntelOverDriveProcessor = 1, DualProcessor = 2, IntelReserved = 3 }; /** * Reads the CPU capabilities and stores them for faster subsequent access. * * Will be executed automatically before main, but not necessarily before other functions * executing before main. */ static void init(); //! Return the cache line size in bits. static inline ushort cacheLineSize() { return static_cast(s_cacheLineSize) * 8u; } //! Return the ProcessorType. static inline ProcessorType processorType() { return s_processorType; } //! 
Return the family number of the processor (vendor dependent). static inline uint processorFamily() { return s_processorFamily; } //! Return the model number of the processor (vendor dependent). static inline uint processorModel() { return s_processorModel; } //! Return the number of logical processors. static inline uint logicalProcessors() { return s_logicalProcessors; } //! Return whether the CPU vendor is AMD. static inline bool isAmd () { return s_ecx0 == 0x444D4163; } //! Return whether the CPU vendor is Intel. static inline bool isIntel () { return s_ecx0 == 0x6C65746E; } //! Return whether the CPU supports SSE3. static inline bool hasSse3 () { return s_processorFeaturesC & (1 << 0); } //! Return whether the CPU supports the PCLMULQDQ instruction. static inline bool hasPclmulqdq() { return (s_processorFeaturesC & (1 << 1)) != 0; } //! Return whether the CPU supports the MONITOR/MWAIT instructions. static inline bool hasMonitor() { return (s_processorFeaturesC & (1 << 3)) != 0; } //! Return whether the CPU supports the Virtual Machine Extensions. static inline bool hasVmx () { return (s_processorFeaturesC & (1 << 5)) != 0; } //! Return whether the CPU supports the Safer Mode Extensions. static inline bool hasSmx () { return (s_processorFeaturesC & (1 << 6)) != 0; } //! Return whether the CPU supports the Enhanced Intel SpeedStep technology. static inline bool hasEist () { return (s_processorFeaturesC & (1 << 7)) != 0; } //! Return whether the CPU supports Thermal Monitor 2. static inline bool hasTm2 () { return (s_processorFeaturesC & (1 << 8)) != 0; } //! Return whether the CPU supports SSSE3. static inline bool hasSsse3() { return (s_processorFeaturesC & (1 << 9)) != 0; } //! Return whether the CPU supports FMA extensions using YMM state. static inline bool hasFma () { return (s_processorFeaturesC & (1 << 12)) != 0; } //! Return whether the CPU supports CMPXCHG16B. static inline bool hasCmpXchg16b() { return (s_processorFeaturesC & (1 << 13)) != 0; } //! Return whether the CPU supports the Perfmon and Debug Capability. static inline bool hasPdcm () { return (s_processorFeaturesC & (1 << 15)) != 0; } //! Return whether the CPU supports Direct Cache Access: prefetch data from a memory mapped device. static inline bool hasDca() { return (s_processorFeaturesC & (1 << 18)) != 0; } //! Return whether the CPU supports SSE 4.1 static inline bool hasSse41() { return (s_processorFeaturesC & (1 << 19)) != 0; } //! Return whether the CPU supports SSE 4.2 static inline bool hasSse42() { return (s_processorFeaturesC & (1 << 20)) != 0; } //! Return whether the CPU supports the MOVBE instruction. static inline bool hasMovbe() { return (s_processorFeaturesC & (1 << 22)) != 0; } //! Return whether the CPU supports the POPCNT instruction. static inline bool hasPopcnt(){ return (s_processorFeaturesC & (1 << 23)) != 0; } //static inline bool hasTscDeadline() { return (s_processorFeaturesC & (1 << 24)) != 0; } //! Return whether the CPU supports the AESNI instructions. static inline bool hasAes () { return (s_processorFeaturesC & (1 << 25)) != 0; } //static inline bool hasXsave() { return (s_processorFeaturesC & (1 << 26)) != 0; } //! Return whether the CPU and OS support the XSETBV/XGETBV instructions. static inline bool hasOsxsave() { return (s_processorFeaturesC & (1 << 27)) != 0; } //! Return whether the CPU supports AVX. static inline bool hasAvx () { return (s_processorFeaturesC & (1 << 28)) != 0; } //! Return whether the CPU supports 16-bit floating-point conversion instructions. 
static inline bool hasF16c () { return (s_processorFeaturesC & (1 << 29)) != 0; } //! Return whether the CPU supports the RDRAND instruction. static inline bool hasRdrand(){ return (s_processorFeaturesC & (1 << 30)) != 0; } //! Return whether the CPU contains an x87 FPU. static inline bool hasFpu () { return (s_processorFeaturesD & (1 << 0)) != 0; } static inline bool hasVme () { return (s_processorFeaturesD & (1 << 1)) != 0; } //! Return whether the CPU contains Debugging Extensions. static inline bool hasDe () { return (s_processorFeaturesD & (1 << 2)) != 0; } //! Return whether the CPU contains Page Size Extensions. static inline bool hasPse () { return (s_processorFeaturesD & (1 << 3)) != 0; } //! Return whether the CPU supports the RDTSC instruction. static inline bool hasTsc () { return (s_processorFeaturesD & (1 << 4)) != 0; } //! Return whether the CPU supports the Model Specific Registers instructions. static inline bool hasMsr () { return (s_processorFeaturesD & (1 << 5)) != 0; } //! Return whether the CPU supports the Physical Address Extension. static inline bool hasPae () { return (s_processorFeaturesD & (1 << 6)) != 0; } //! Return whether the CPU supports the CMPXCHG8B instruction. static inline bool hasCx8 () { return (s_processorFeaturesD & (1 << 8)) != 0; } //! Return whether the CPU supports Memory Type Range Registers. static inline bool hasMtrr () { return (s_processorFeaturesD & (1 << 12)) != 0; } //! Return whether the CPU supports CMOV instructions. static inline bool hasCmov () { return (s_processorFeaturesD & (1 << 15)) != 0; } //! Return whether the CPU supports the CLFLUSH instruction. static inline bool hasClfsh() { return (s_processorFeaturesD & (1 << 19)) != 0; } //! Return whether the CPU supports ACPI. static inline bool hasAcpi () { return (s_processorFeaturesD & (1 << 22)) != 0; } //! Return whether the CPU supports MMX. static inline bool hasMmx () { return (s_processorFeaturesD & (1 << 23)) != 0; } //! Return whether the CPU supports SSE. static inline bool hasSse () { return (s_processorFeaturesD & (1 << 25)) != 0; } //! Return whether the CPU supports SSE2. static inline bool hasSse2 () { return (s_processorFeaturesD & (1 << 26)) != 0; } static inline bool hasHtt () { return (s_processorFeaturesD & (1 << 28)) != 0; } //! Return whether the CPU supports SSE4a. static inline bool hasSse4a() { return (s_processorFeatures8C & (1 << 6)) != 0; } //! Return whether the CPU supports misaligned SSE instructions. static inline bool hasMisAlignSse() { return (s_processorFeatures8C & (1 << 7)) != 0; } //! Return whether the CPU supports the AMD prefetchw instruction. static inline bool hasAmdPrefetch() { return (s_processorFeatures8C & (1 << 8)) != 0; } //! Return whether the CPU supports the XOP instructions. static inline bool hasXop () { return (s_processorFeatures8C & (1 << 11)) != 0; } //! Return whether the CPU supports the FMA4 instructions. static inline bool hasFma4 () { return (s_processorFeatures8C & (1 << 16)) != 0; } //! Return whether the CPU supports the RDTSCP instruction. static inline bool hasRdtscp() { return (s_processorFeatures8D & (1 << 27)) != 0; } static inline bool has3DNow() { return (s_processorFeatures8D & (1u << 31)) != 0; } static inline bool has3DNowExt() { return (s_processorFeatures8D & (1 << 30)) != 0; } //! Return the size of the L1 instruction cache. static inline uint L1Instruction() { return s_L1Instruction; } //! Return the size of the L1 data cache. static inline uint L1Data() { return s_L1Data; } //! 
Return the size of the L2 cache. static inline uint L2Data() { return s_L2Data; } //! Return the size of the L3 cache. static inline uint L3Data() { return s_L3Data; } static inline ushort L1InstructionLineSize() { return s_L1InstructionLineSize; } static inline ushort L1DataLineSize() { return s_L1DataLineSize; } static inline ushort L2DataLineSize() { return s_L2DataLineSize; } static inline ushort L3DataLineSize() { return s_L3DataLineSize; } static inline uint L1Associativity() { return s_L1Associativity; } static inline uint L2Associativity() { return s_L2Associativity; } static inline uint L3Associativity() { return s_L3Associativity; } static inline ushort prefetch() { return s_prefetch; } private: static void interpret(uchar byte, bool *checkLeaf4); static uint s_ecx0; static uint s_logicalProcessors; static uint s_processorFeaturesC; static uint s_processorFeaturesD; static uint s_processorFeatures8C; static uint s_processorFeatures8D; static uint s_L1Instruction; static uint s_L1Data; static uint s_L2Data; static uint s_L3Data; static ushort s_L1InstructionLineSize; static ushort s_L1DataLineSize; static ushort s_L2DataLineSize; static ushort s_L3DataLineSize; static uint s_L1Associativity; static uint s_L2Associativity; static uint s_L3Associativity; static ushort s_prefetch; static uchar s_brandIndex; static uchar s_cacheLineSize; static uchar s_processorModel; static uchar s_processorFamily; static ProcessorType s_processorType; static bool s_noL2orL3; }; } // namespace Vc /*OUTER_NAMESPACE_END*/ #endif // CPUID_H Vc-0.7.4/include/Vc/double_v000066400000000000000000000002121233512346000156010ustar00rootroot00000000000000#ifdef __GNUC__ #warning "Use of the Vc/double_v header is deprecated. The header file will be removed in a future version of Vc." #endif Vc-0.7.4/include/Vc/float_v000066400000000000000000000002111233512346000154330ustar00rootroot00000000000000#ifdef __GNUC__ #warning "Use of the Vc/float_v header is deprecated. The header file will be removed in a future version of Vc." #endif Vc-0.7.4/include/Vc/global.h000066400000000000000000000353031233512346000155010ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
*/ #ifndef VC_GLOBAL_H #define VC_GLOBAL_H #ifndef DOXYGEN // Compiler defines #ifdef __INTEL_COMPILER #define VC_ICC __INTEL_COMPILER_BUILD_DATE #elif defined(__OPENCC__) #define VC_OPEN64 1 #elif defined(__clang__) #define VC_CLANG (__clang_major__ * 0x10000 + __clang_minor__ * 0x100 + __clang_patchlevel__) #elif defined(__GNUC__) #define VC_GCC (__GNUC__ * 0x10000 + __GNUC_MINOR__ * 0x100 + __GNUC_PATCHLEVEL__) #elif defined(_MSC_VER) #define VC_MSVC _MSC_FULL_VER #else #define VC_UNSUPPORTED_COMPILER 1 #endif // Features/Quirks defines #if defined VC_MSVC && defined _WIN32 // the Win32 ABI can't handle function parameters with alignment >= 16 #define VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN 1 #endif #if defined(__GNUC__) && !defined(VC_NO_INLINE_ASM) #define VC_GNU_ASM 1 #endif #if defined(VC_GCC) && (VC_GCC <= 0x40405 || (VC_GCC >= 0x40500 && VC_GCC <= 0x40502)) && !(VC_GCC == 0x40502 && defined(__GNUC_UBUNTU_VERSION__) && __GNUC_UBUNTU_VERSION__ == 0xb0408) // GCC 4.6.0 / 4.5.3 / 4.4.6 switched to the interface as defined by ICC // (Ubuntu 11.04 ships a GCC 4.5.2 with the new interface) #define VC_MM256_MASKSTORE_WRONG_MASK_TYPE 1 #endif #if defined(VC_GCC) && VC_GCC >= 0x40300 #define VC_HAVE_ATTRIBUTE_ERROR 1 #define VC_HAVE_ATTRIBUTE_WARNING 1 #endif #if (defined(__GXX_EXPERIMENTAL_CXX0X__) && VC_GCC >= 0x40600) || __cplusplus >= 201103 # define VC_CXX11 1 # ifdef VC_GCC # if VC_GCC >= 0x40700 // && VC_GCC < 0x408000) // ::max_align_t was introduced with GCC 4.7. std::max_align_t took a bit longer. # define VC_HAVE_MAX_ALIGN_T 1 # endif # elif defined(VC_ICC) # define VC_HAVE_MAX_ALIGN_T 1 # elif !defined(VC_CLANG) // Clang doesn't provide max_align_t at all # define VC_HAVE_STD_MAX_ALIGN_T 1 # endif #endif // ICC ships the AVX2 intrinsics inside the AVX1 header. // FIXME: the number 20120731 is too large, but I don't know which one is the right one #if (defined(VC_ICC) && VC_ICC >= 20120731) || (defined(VC_MSVC) && VC_MSVC >= 170000000) #define VC_UNCONDITIONAL_AVX2_INTRINSICS 1 #endif /* Define the following strings to a unique integer, which is the only type the preprocessor can * compare. This allows to use -DVC_IMPL=SSE3. The preprocessor will then consider VC_IMPL and SSE3 * to be equal. Of course, it is important to undefine the strings later on! 
*/ #define Scalar 0x00100000 #define SSE 0x00200000 #define SSE2 0x00300000 #define SSE3 0x00400000 #define SSSE3 0x00500000 #define SSE4_1 0x00600000 #define SSE4_2 0x00700000 #define AVX 0x00800000 #define XOP 0x00000001 #define FMA4 0x00000002 #define F16C 0x00000004 #define POPCNT 0x00000008 #define SSE4a 0x00000010 #define FMA 0x00000020 #define IMPL_MASK 0xFFF00000 #define EXT_MASK 0x000FFFFF #ifdef VC_MSVC # ifdef _M_IX86_FP # if _M_IX86_FP >= 1 # ifndef __SSE__ # define __SSE__ 1 # endif # endif # if _M_IX86_FP >= 2 # ifndef __SSE2__ # define __SSE2__ 1 # endif # endif # elif defined(_M_AMD64) // If the target is x86_64 then SSE2 is guaranteed # ifndef __SSE__ # define __SSE__ 1 # endif # ifndef __SSE2__ # define __SSE2__ 1 # endif # endif #endif #ifndef VC_IMPL # if defined(__AVX__) # define VC_IMPL_AVX 1 # else # if defined(__SSE4_2__) # define VC_IMPL_SSE 1 # define VC_IMPL_SSE4_2 1 # endif # if defined(__SSE4_1__) # define VC_IMPL_SSE 1 # define VC_IMPL_SSE4_1 1 # endif # if defined(__SSE3__) # define VC_IMPL_SSE 1 # define VC_IMPL_SSE3 1 # endif # if defined(__SSSE3__) # define VC_IMPL_SSE 1 # define VC_IMPL_SSSE3 1 # endif # if defined(__SSE2__) # define VC_IMPL_SSE 1 # define VC_IMPL_SSE2 1 # endif # if defined(VC_IMPL_SSE) // nothing # else # define VC_IMPL_Scalar 1 # endif # endif # if defined(VC_IMPL_AVX) || defined(VC_IMPL_SSE) # ifdef __FMA4__ # define VC_IMPL_FMA4 1 # endif # ifdef __XOP__ # define VC_IMPL_XOP 1 # endif # ifdef __F16C__ # define VC_IMPL_F16C 1 # endif # ifdef __POPCNT__ # define VC_IMPL_POPCNT 1 # endif # ifdef __SSE4A__ # define VC_IMPL_SSE4a 1 # endif # ifdef __FMA__ # define VC_IMPL_FMA 1 # endif # endif #else // VC_IMPL # if (VC_IMPL & IMPL_MASK) == AVX // AVX supersedes SSE # define VC_IMPL_AVX 1 # elif (VC_IMPL & IMPL_MASK) == Scalar # define VC_IMPL_Scalar 1 # elif (VC_IMPL & IMPL_MASK) == SSE4_2 # define VC_IMPL_SSE4_2 1 # define VC_IMPL_SSE4_1 1 # define VC_IMPL_SSSE3 1 # define VC_IMPL_SSE3 1 # define VC_IMPL_SSE2 1 # define VC_IMPL_SSE 1 # elif (VC_IMPL & IMPL_MASK) == SSE4_1 # define VC_IMPL_SSE4_1 1 # define VC_IMPL_SSSE3 1 # define VC_IMPL_SSE3 1 # define VC_IMPL_SSE2 1 # define VC_IMPL_SSE 1 # elif (VC_IMPL & IMPL_MASK) == SSSE3 # define VC_IMPL_SSSE3 1 # define VC_IMPL_SSE3 1 # define VC_IMPL_SSE2 1 # define VC_IMPL_SSE 1 # elif (VC_IMPL & IMPL_MASK) == SSE3 # define VC_IMPL_SSE3 1 # define VC_IMPL_SSE2 1 # define VC_IMPL_SSE 1 # elif (VC_IMPL & IMPL_MASK) == SSE2 # define VC_IMPL_SSE2 1 # define VC_IMPL_SSE 1 # elif (VC_IMPL & IMPL_MASK) == SSE # define VC_IMPL_SSE 1 # if defined(__SSE4_2__) # define VC_IMPL_SSE4_2 1 # endif # if defined(__SSE4_1__) # define VC_IMPL_SSE4_1 1 # endif # if defined(__SSE3__) # define VC_IMPL_SSE3 1 # endif # if defined(__SSSE3__) # define VC_IMPL_SSSE3 1 # endif # if defined(__SSE2__) # define VC_IMPL_SSE2 1 # endif # elif (VC_IMPL & IMPL_MASK) == 0 && (VC_IMPL & SSE4a) // this is for backward compatibility only where SSE4a was included in the main // line of available SIMD instruction sets # define VC_IMPL_SSE3 1 # define VC_IMPL_SSE2 1 # define VC_IMPL_SSE 1 # endif # if (VC_IMPL & XOP) # define VC_IMPL_XOP 1 # endif # if (VC_IMPL & FMA4) # define VC_IMPL_FMA4 1 # endif # if (VC_IMPL & F16C) # define VC_IMPL_F16C 1 # endif # if (VC_IMPL & POPCNT) # define VC_IMPL_POPCNT 1 # endif # if (VC_IMPL & SSE4a) # define VC_IMPL_SSE4a 1 # endif # if (VC_IMPL & FMA) # define VC_IMPL_FMA 1 # endif # undef VC_IMPL #endif // VC_IMPL // If AVX is enabled in the compiler it will use VEX coding for the SIMD 
instructions. #ifdef __AVX__ # define VC_USE_VEX_CODING 1 #endif #if defined(VC_GCC) && VC_GCC < 0x40300 && !defined(VC_IMPL_Scalar) # ifndef VC_DONT_WARN_OLD_GCC # warning "GCC < 4.3 does not have full support for SSE2 intrinsics. Using scalar types/operations only. Define VC_DONT_WARN_OLD_GCC to silence this warning." # endif # undef VC_IMPL_SSE # undef VC_IMPL_SSE2 # undef VC_IMPL_SSE3 # undef VC_IMPL_SSE4_1 # undef VC_IMPL_SSE4_2 # undef VC_IMPL_SSSE3 # undef VC_IMPL_AVX # undef VC_IMPL_FMA4 # undef VC_IMPL_XOP # undef VC_IMPL_F16C # undef VC_IMPL_POPCNT # undef VC_IMPL_SSE4a # undef VC_IMPL_FMA # undef VC_USE_VEX_CODING # define VC_IMPL_Scalar 1 #endif # if !defined(VC_IMPL_Scalar) && !defined(VC_IMPL_SSE) && !defined(VC_IMPL_AVX) # error "No suitable Vc implementation was selected! Probably VC_IMPL was set to an invalid value." # elif defined(VC_IMPL_SSE) && !defined(VC_IMPL_SSE2) # error "SSE requested but no SSE2 support. Vc needs at least SSE2!" # endif #undef Scalar #undef SSE #undef SSE2 #undef SSE3 #undef SSSE3 #undef SSE4_1 #undef SSE4_2 #undef AVX #undef XOP #undef FMA4 #undef F16C #undef POPCNT #undef SSE4a #undef FMA #undef IMPL_MASK #undef EXT_MASK /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { enum AlignedFlag { Aligned = 0 }; enum UnalignedFlag { Unaligned = 1 }; enum StreamingAndAlignedFlag { // implies Aligned Streaming = 2 }; enum StreamingAndUnalignedFlag { StreamingAndUnaligned = 3 }; #endif // DOXYGEN /** * \ingroup Utilities * * Enum that specifies the alignment and padding restrictions to use for memory allocation with * Vc::malloc. */ enum MallocAlignment { /** * Align on boundary of vector sizes (e.g. 16 Bytes on SSE platforms) and pad to allow * vector access to the end. Thus the allocated memory contains a multiple of * VectorAlignment bytes. */ AlignOnVector, /** * Align on boundary of cache line sizes (e.g. 64 Bytes on x86) and pad to allow * full cache line access to the end. Thus the allocated memory contains a multiple of * 64 bytes. */ AlignOnCacheline, /** * Align on boundary of page sizes (e.g. 4096 Bytes on x86) and pad to allow * full page access to the end. Thus the allocated memory contains a multiple of * 4096 bytes. */ AlignOnPage }; #if __cplusplus >= 201103 /*C++11*/ #define Vc_CONSTEXPR constexpr #elif defined(__GNUC__) #define Vc_CONSTEXPR inline __attribute__((__always_inline__, __const__)) #elif defined(VC_MSVC) #define Vc_CONSTEXPR inline __forceinline #else #define Vc_CONSTEXPR inline #endif Vc_CONSTEXPR StreamingAndUnalignedFlag operator|(UnalignedFlag, StreamingAndAlignedFlag) { return StreamingAndUnaligned; } Vc_CONSTEXPR StreamingAndUnalignedFlag operator|(StreamingAndAlignedFlag, UnalignedFlag) { return StreamingAndUnaligned; } Vc_CONSTEXPR StreamingAndUnalignedFlag operator&(UnalignedFlag, StreamingAndAlignedFlag) { return StreamingAndUnaligned; } Vc_CONSTEXPR StreamingAndUnalignedFlag operator&(StreamingAndAlignedFlag, UnalignedFlag) { return StreamingAndUnaligned; } Vc_CONSTEXPR StreamingAndAlignedFlag operator|(AlignedFlag, StreamingAndAlignedFlag) { return Streaming; } Vc_CONSTEXPR StreamingAndAlignedFlag operator|(StreamingAndAlignedFlag, AlignedFlag) { return Streaming; } Vc_CONSTEXPR StreamingAndAlignedFlag operator&(AlignedFlag, StreamingAndAlignedFlag) { return Streaming; } Vc_CONSTEXPR StreamingAndAlignedFlag operator&(StreamingAndAlignedFlag, AlignedFlag) { return Streaming; } /** * \ingroup Utilities * * Enum to identify a certain SIMD instruction set. * * You can use \ref VC_IMPL for the currently active implementation. 
* * \see ExtraInstructions */ enum Implementation { /// uses only fundamental types ScalarImpl, /// x86 SSE + SSE2 SSE2Impl, /// x86 SSE + SSE2 + SSE3 SSE3Impl, /// x86 SSE + SSE2 + SSE3 + SSSE3 SSSE3Impl, /// x86 SSE + SSE2 + SSE3 + SSSE3 + SSE4.1 SSE41Impl, /// x86 SSE + SSE2 + SSE3 + SSSE3 + SSE4.1 + SSE4.2 SSE42Impl, /// x86 AVX AVXImpl, /// x86 AVX + AVX2 AVX2Impl, ImplementationMask = 0xfff }; /** * \ingroup Utilities * * The list of available instructions is not easily described by a linear list of instruction sets. * On x86 the following instruction sets always include their predecessors: * SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2 * * But there are additional instructions that are not necessarily required by this list. These are * covered in this enum. */ enum ExtraInstructions { //! Support for float16 conversions in hardware Float16cInstructions = 0x01000, //! Support for FMA4 instructions Fma4Instructions = 0x02000, //! Support for XOP instructions XopInstructions = 0x04000, //! Support for the population count instruction PopcntInstructions = 0x08000, //! Support for SSE4a instructions Sse4aInstructions = 0x10000, //! Support for FMA instructions (3 operand variant) FmaInstructions = 0x20000, // PclmulqdqInstructions, // AesInstructions, // RdrandInstructions ExtraInstructionsMask = 0xfffff000u }; #ifndef DOXYGEN #ifdef VC_IMPL_Scalar #define VC_IMPL ::Vc::ScalarImpl #elif defined(VC_IMPL_AVX) #define VC_IMPL ::Vc::AVXImpl #elif defined(VC_IMPL_SSE4_2) #define VC_IMPL ::Vc::SSE42Impl #elif defined(VC_IMPL_SSE4_1) #define VC_IMPL ::Vc::SSE41Impl #elif defined(VC_IMPL_SSSE3) #define VC_IMPL ::Vc::SSSE3Impl #elif defined(VC_IMPL_SSE3) #define VC_IMPL ::Vc::SSE3Impl #elif defined(VC_IMPL_SSE2) #define VC_IMPL ::Vc::SSE2Impl #endif template struct ImplementationT { enum _Value { Value = Features, Implementation = Features & Vc::ImplementationMask, ExtraInstructions = Features & Vc::ExtraInstructionsMask }; }; typedef ImplementationT< #ifdef VC_USE_VEX_CODING // everything will use VEX coding, so the system has to support AVX even if VC_IMPL_AVX is not set // but AFAIU the OSXSAVE and xgetbv tests do not have to positive (unless, of course, the // compiler decides to insert an instruction that uses the full register size - so better be on // the safe side) AVXImpl #else VC_IMPL #endif #ifdef VC_IMPL_SSE4a + Vc::Sse4aInstructions #ifdef VC_IMPL_XOP + Vc::XopInstructions #ifdef VC_IMPL_FMA4 + Vc::Fma4Instructions #endif #endif #endif #ifdef VC_IMPL_POPCNT + Vc::PopcntInstructions #endif #ifdef VC_IMPL_FMA + Vc::FmaInstructions #endif > CurrentImplementation; namespace Internal { template struct HelperImpl; typedef HelperImpl Helper; template struct FlagObject; template<> struct FlagObject { static Vc_CONSTEXPR AlignedFlag the() { return Aligned; } }; template<> struct FlagObject { static Vc_CONSTEXPR UnalignedFlag the() { return Unaligned; } }; template<> struct FlagObject { static Vc_CONSTEXPR StreamingAndAlignedFlag the() { return Streaming; } }; template<> struct FlagObject { static Vc_CONSTEXPR StreamingAndUnalignedFlag the() { return StreamingAndUnaligned; } }; } // namespace Internal namespace Warnings { void _operator_bracket_warning() #ifdef VC_HAVE_ATTRIBUTE_WARNING __attribute__((warning("\n\tUse of Vc::Vector::operator[] to modify scalar entries is known to miscompile with GCC 4.3.x.\n\tPlease upgrade to a more recent GCC or avoid operator[] altogether.\n\t(This warning adds an unnecessary function call to operator[] which should work around the problem at a little extra 
cost.)"))) #endif ; } // namespace Warnings namespace Error { template struct invalid_operands_of_types {}; } // namespace Error #endif // DOXYGEN } // namespace Vc /*OUTER_NAMESPACE_END*/ #undef Vc_CONSTEXPR #include "version.h" #endif // VC_GLOBAL_H Vc-0.7.4/include/Vc/int_v000066400000000000000000000002071233512346000151250ustar00rootroot00000000000000#ifdef __GNUC__ #warning "Use of the Vc/int_v header is deprecated. The header file will be removed in a future version of Vc." #endif Vc-0.7.4/include/Vc/internal/000077500000000000000000000000001233512346000157005ustar00rootroot00000000000000Vc-0.7.4/include/Vc/internal/namespace.h000066400000000000000000000020341233512346000200040ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2010-2011 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifdef VC_IMPL_Scalar # define VECTOR_NAMESPACE Vc::Scalar #elif defined(VC_IMPL_AVX) # define VECTOR_NAMESPACE Vc::AVX #elif defined(VC_IMPL_SSE) # define VECTOR_NAMESPACE Vc::SSE #else # error "No known Vc implementation was selected. This should not happen. The logic in Vc/global.h failed." #endif Vc-0.7.4/include/Vc/limits000066400000000000000000000044141233512346000153130ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
*/ #ifndef INCLUDE_VC_LIMITS #define INCLUDE_VC_LIMITS #include "vector.h" #include "common/macros.h" #include namespace std { template struct numeric_limits > : public numeric_limits::EntryType> { private: typedef numeric_limits::EntryType> _Base; public: static Vc_INTRINSIC Vc_CONST Vc::Vector max() { return Vc::Vector(_Base::max()); } static Vc_INTRINSIC Vc_CONST Vc::Vector min() { return Vc::Vector(_Base::min()); } static Vc_INTRINSIC Vc_CONST Vc::Vector lowest() { return Vc::Vector(_Base::lowest()); } static Vc_INTRINSIC Vc_CONST Vc::Vector epsilon() { return Vc::Vector(_Base::epsilon()); } static Vc_INTRINSIC Vc_CONST Vc::Vector round_error() { return Vc::Vector(_Base::round_error()); } static Vc_INTRINSIC Vc_CONST Vc::Vector infinity() { return Vc::Vector(_Base::infinity()); } static Vc_INTRINSIC Vc_CONST Vc::Vector quiet_NaN() { return Vc::Vector(_Base::quiet_NaN()); } static Vc_INTRINSIC Vc_CONST Vc::Vector signaling_NaN() { return Vc::Vector(_Base::signaling_NaN()); } static Vc_INTRINSIC Vc_CONST Vc::Vector denorm_min() { return Vc::Vector(_Base::denorm_min()); } }; } // namespace std #include "common/undomacros.h" #ifdef VC_IMPL_Scalar # include "scalar/limits.h" #elif defined(VC_IMPL_AVX) # include "avx/limits.h" #elif defined(VC_IMPL_SSE) # include "sse/limits.h" #endif #endif // INCLUDE_VC_LIMITS // vim: ft=cpp Vc-0.7.4/include/Vc/sfloat_v000066400000000000000000000002121233512346000156170ustar00rootroot00000000000000#ifdef __GNUC__ #warning "Use of the Vc/sfloat_v header is deprecated. The header file will be removed in a future version of Vc." #endif Vc-0.7.4/include/Vc/short_v000066400000000000000000000002111233512346000154650ustar00rootroot00000000000000#ifdef __GNUC__ #warning "Use of the Vc/short_v header is deprecated. The header file will be removed in a future version of Vc." #endif Vc-0.7.4/include/Vc/support.h000066400000000000000000000102741233512346000157550ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2010-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef VC_COMMON_SUPPORT_H #define VC_COMMON_SUPPORT_H #ifndef VC_GLOBAL_H #error "Vc/global.h must be included first!" #endif #include #if defined(VC_GCC) && VC_GCC >= 0x40400 #define VC_TARGET_NO_SIMD __attribute__((target("no-sse2,no-avx"))) #else #define VC_TARGET_NO_SIMD #endif /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { /** * \name Micro-Architecture Feature Tests */ //@{ /** * \ingroup Utilities * \headerfile support.h * Determines the extra instructions supported by the current CPU. * * \return A combination of flags from Vc::ExtraInstructions that the current CPU supports. */ VC_TARGET_NO_SIMD unsigned int extraInstructionsSupported(); /** * \ingroup Utilities * \headerfile support.h * * Tests whether the given implementation is supported by the system the code is executing on. * * \return \c true if the OS and hardware support execution of instructions defined by \p impl. 
* \return \c false otherwise * * \param impl The SIMD target to test for. */ VC_TARGET_NO_SIMD bool isImplementationSupported(Vc::Implementation impl); /** * \internal * \ingroup Utilities * \headerfile support.h * * Tests whether the given implementation is supported by the system the code is executing on. * * \code * if (!isImplementationSupported()) { * std::cerr << "This code was compiled with features that this system does not support.\n"; * return EXIT_FAILURE; * } * \endcode * * \return \c true if the OS and hardware support execution of instructions defined by \p impl. * \return \c false otherwise * * \tparam Impl The SIMD target to test for. */ template VC_TARGET_NO_SIMD static inline bool isImplementationSupported() { return isImplementationSupported(static_cast(Impl::Implementation)) && (extraInstructionsSupported() & Impl::ExtraInstructions) == Impl::ExtraInstructions; } /** * \ingroup Utilities * \headerfile support.h * * Determines the best supported implementation for the current system. * * \return The enum value for the best implementation. */ VC_TARGET_NO_SIMD Vc::Implementation bestImplementationSupported(); #ifndef VC_COMPILE_LIB /** * \ingroup Utilities * \headerfile support.h * * Tests that the CPU and Operating System support the vector unit which was compiled for. This * function should be called before any other Vc functionality is used. It checks whether the program * will work. If this function returns \c false then the program should exit with a useful error * message before the OS has to kill it because of an invalid instruction exception. * * If the program continues and makes use of any vector features not supported by * hard- or software then the program will crash. * * Example: * \code * int main() * { * if (!Vc::currentImplementationSupported()) { * std::cerr << "CPU or OS requirements not met for the compiled in vector unit!\n"; * exit -1; * } * ... * } * \endcode * * \return \c true if the OS and hardware support execution of the currently selected SIMD * instructions. * \return \c false otherwise */ VC_TARGET_NO_SIMD #ifndef DOXYGEN static #endif inline bool currentImplementationSupported() { return isImplementationSupported(); } #endif // VC_COMPILE_LIB //@} } // namespace Vc /*OUTER_NAMESPACE_END*/ #undef VC_TARGET_NO_SIMD #endif // VC_COMMON_SUPPORT_H Vc-0.7.4/include/Vc/uint_v000066400000000000000000000002101233512346000153040ustar00rootroot00000000000000#ifdef __GNUC__ #warning "Use of the Vc/uint_v header is deprecated. The header file will be removed in a future version of Vc." #endif Vc-0.7.4/include/Vc/ushort_v000066400000000000000000000002121233512346000156530ustar00rootroot00000000000000#ifdef __GNUC__ #warning "Use of the Vc/ushort_v header is deprecated. The header file will be removed in a future version of Vc." #endif Vc-0.7.4/include/Vc/vector.h000066400000000000000000000104441233512346000155420ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef VECTOR_H #define VECTOR_H #include "global.h" #include "internal/namespace.h" #ifdef VC_IMPL_Scalar # include "scalar/vector.h" # include "scalar/helperimpl.h" #elif defined(VC_IMPL_AVX) # include "avx/vector.h" # include "avx/helperimpl.h" #elif defined(VC_IMPL_SSE) # include "sse/vector.h" # include "sse/helperimpl.h" #endif #ifdef isfinite #undef isfinite #endif #ifdef isnan #undef isnan #endif /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { using VECTOR_NAMESPACE::VectorAlignment; using VECTOR_NAMESPACE::VectorAlignedBaseT; typedef VectorAlignedBaseT<> VectorAlignedBase; using namespace VectorSpecialInitializerZero; using namespace VectorSpecialInitializerOne; using namespace VectorSpecialInitializerIndexesFromZero; using VECTOR_NAMESPACE::min; using VECTOR_NAMESPACE::max; using VECTOR_NAMESPACE::sqrt; using VECTOR_NAMESPACE::rsqrt; using VECTOR_NAMESPACE::abs; using VECTOR_NAMESPACE::sin; using VECTOR_NAMESPACE::asin; using VECTOR_NAMESPACE::cos; using VECTOR_NAMESPACE::sincos; using VECTOR_NAMESPACE::trunc; using VECTOR_NAMESPACE::floor; using VECTOR_NAMESPACE::ceil; using VECTOR_NAMESPACE::exp; using VECTOR_NAMESPACE::log; using VECTOR_NAMESPACE::log2; using VECTOR_NAMESPACE::log10; using VECTOR_NAMESPACE::reciprocal; using VECTOR_NAMESPACE::atan; using VECTOR_NAMESPACE::atan2; using VECTOR_NAMESPACE::frexp; using VECTOR_NAMESPACE::ldexp; using VECTOR_NAMESPACE::round; using VECTOR_NAMESPACE::isfinite; using VECTOR_NAMESPACE::isnan; using VECTOR_NAMESPACE::forceToRegisters; using VECTOR_NAMESPACE::Vector; typedef VECTOR_NAMESPACE::double_v double_v; typedef double_v::Mask double_m; typedef VECTOR_NAMESPACE::sfloat_v sfloat_v; typedef sfloat_v::Mask sfloat_m; typedef VECTOR_NAMESPACE::float_v float_v; typedef float_v::Mask float_m; typedef VECTOR_NAMESPACE::int_v int_v; typedef int_v::Mask int_m; typedef VECTOR_NAMESPACE::uint_v uint_v; typedef uint_v::Mask uint_m; typedef VECTOR_NAMESPACE::short_v short_v; typedef short_v::Mask short_m; typedef VECTOR_NAMESPACE::ushort_v ushort_v; typedef ushort_v::Mask ushort_m; namespace { #if defined(VC_IMPL_SSE) || defined(VC_IMPL_AVX) using VECTOR_NAMESPACE::Const; #endif VC_STATIC_ASSERT_NC(double_v::Size == VC_DOUBLE_V_SIZE, VC_DOUBLE_V_SIZE_MACRO_WRONG); VC_STATIC_ASSERT_NC(float_v::Size == VC_FLOAT_V_SIZE , VC_FLOAT_V_SIZE_MACRO_WRONG ); VC_STATIC_ASSERT_NC(sfloat_v::Size == VC_SFLOAT_V_SIZE, VC_SFLOAT_V_SIZE_MACRO_WRONG); VC_STATIC_ASSERT_NC(int_v::Size == VC_INT_V_SIZE , VC_INT_V_SIZE_MACRO_WRONG ); VC_STATIC_ASSERT_NC(uint_v::Size == VC_UINT_V_SIZE , VC_UINT_V_SIZE_MACRO_WRONG ); VC_STATIC_ASSERT_NC(short_v::Size == VC_SHORT_V_SIZE , VC_SHORT_V_SIZE_MACRO_WRONG ); VC_STATIC_ASSERT_NC(ushort_v::Size == VC_USHORT_V_SIZE, VC_USHORT_V_SIZE_MACRO_WRONG); } } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "common/vectortuple.h" #include "common/iif.h" #ifndef VC_NO_NAMESPACE_ALIAS /*NAMESPACE_ALIAS*/ #endif #ifndef VC_NO_STD_FUNCTIONS namespace std { using Vc::min; using Vc::max; using Vc::abs; using Vc::asin; using Vc::atan; using Vc::atan2; using Vc::ceil; using Vc::cos; using Vc::exp; using Vc::floor; using Vc::frexp; using Vc::ldexp; using Vc::log; using Vc::log10; using Vc::log2; using Vc::round; using Vc::sin; using Vc::sqrt; using Vc::isfinite; using Vc::isnan; } // namespace std #endif #ifndef VC_CLEAN_NAMESPACE #define foreach_bit(_it_, _mask_) Vc_foreach_bit(_it_, _mask_) #endif #undef VECTOR_NAMESPACE #endif // VECTOR_H 
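The header above exports the user-visible vector and mask typedefs (double_v, float_v, int_v, ... and their ::Mask counterparts) together with the overloaded math functions. What follows is a minimal, illustrative usage sketch and not a file from this archive: it assumes a program that links against the Vc library (needed for the runtime CPU check declared in support.h) and has the include/ directory on its include path; the function main and the example values are assumptions made purely for illustration.

#include <Vc/Vc>         // pulls in vector.h, IO, Memory, Utils, Allocator
#include <Vc/support.h>  // Vc::currentImplementationSupported()
#include <iostream>

int main()
{
    // Refuse to run if the binary was built for a SIMD target this CPU/OS lacks.
    if (!Vc::currentImplementationSupported()) {
        std::cerr << "CPU or OS requirements not met for the compiled-in vector unit!\n";
        return 1;
    }

    Vc::float_v x(Vc::One);                    // broadcast: every entry is 1.f
    x += Vc::float_v(Vc::IndexesFromZero);     // entries become 1, 2, 3, ...
    const Vc::float_v r = Vc::sqrt(x);         // math overload exported by vector.h
    const Vc::float_m lt2 = r < 2.f;           // per-entry comparison yields a mask

    std::cout << r << '\n' << lt2 << '\n';     // stream operators from the IO header
    return 0;
}

The same pattern applies to the other exported types (double_v, int_v, short_v, ...); only the number of entries per vector (the ::Size constant checked by the static asserts above) differs between the Scalar, SSE, and AVX implementations.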
Vc-0.7.4/include/Vc/version.h000066400000000000000000000032451233512346000157260ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2010-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef VC_VERSION_H #define VC_VERSION_H #define VC_VERSION_STRING "0.7.4" #define VC_VERSION_NUMBER 0x000708 #define VC_VERSION_CHECK(major, minor, patch) ((major << 16) | (minor << 8) | (patch << 1)) #define VC_LIBRARY_ABI_VERSION 3 /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { static inline const char *versionString() { return VC_VERSION_STRING; } static inline unsigned int versionNumber() { return VC_VERSION_NUMBER; } #if !defined(VC_NO_VERSION_CHECK) && !defined(VC_COMPILE_LIB) void checkLibraryAbi(unsigned int compileTimeAbi, unsigned int versionNumber, const char *versionString); namespace { static struct runLibraryAbiCheck { runLibraryAbiCheck() { checkLibraryAbi(VC_LIBRARY_ABI_VERSION, VC_VERSION_NUMBER, VC_VERSION_STRING); } } _runLibraryAbiCheck; } #endif } // namespace Vc /*OUTER_NAMESPACE_END*/ #endif // VC_VERSION_H Vc-0.7.4/makeApidox.sh000077500000000000000000000001111233512346000145030ustar00rootroot00000000000000#!/bin/sh cd "`dirname "$0"`/doc" rm -rf html doxygen rm -f html/*.qhp Vc-0.7.4/makeRelease.sh000077500000000000000000000052531233512346000146530ustar00rootroot00000000000000#!/bin/bash cd "`dirname "$0"`" # Read version number eval `awk '/VC_VERSION_NUMBER 0x[0-9]+/ { h=$3 } END { major=strtonum(substr(h, 1, 4)) minor=strtonum("0x" substr(h, 5, 2)) patch=strtonum("0x" substr(h, 7, 2)) / 2 printf "oldVersion=\"%d.%d.%d\"\n", major, minor, patch printf "newVersion=\"%d.%d.%d\"\n", major, minor, patch + 1 }' include/Vc/version.h` echo "current version: $oldVersion" echo -n " new release: " read -e -i "$newVersion" newVersion versionString=$newVersion versionNumber=`echo $newVersion | awk '{ split($0, v, "."); printf "0x%02x%02x%02x", v[1], v[2], v[3] * 2 }'` # Update the version number sed -i \ -e "s/^PROJECT_NUMBER = .*\$/PROJECT_NUMBER = $versionString/" \ -e "s/^HTML_TIMESTAMP = YES/HTML_TIMESTAMP = NO/" \ doc/Doxyfile sed -i \ -e "s/VC_VERSION_STRING \".*\"\$/VC_VERSION_STRING \"$versionString\"/" \ -e "s/VC_VERSION_NUMBER 0x.*\$/VC_VERSION_NUMBER $versionNumber/" \ include/Vc/version.h cat include/Vc/version.h # Don't build tests with make all sed -i \ -e 's/add_custom_target(build_tests ALL VERBATIM)/add_custom_target(build_tests VERBATIM)/' \ CMakeLists.txt git commit CMakeLists.txt doc/Doxyfile include/Vc/version.h -s -F- < ../"Vc-$versionString.tar.gz" # Create API docs tarball ./makeApidox.sh mv doc/html/*.qch "../Vc-${versionString}.qch" mv doc/html "Vc-docs-$versionString" && tar -czf "../Vc-docs-$versionString".tar.gz "Vc-docs-$versionString" rm -rf "Vc-docs-$versionString" # Update the version number of the after-release code versionString="$versionString-dev" versionNumber=`echo $versionNumber | awk '{ printf "0x%06x", (strtonum($0) + 1) }'` sed -i \ -e "s/^PROJECT_NUMBER = .*\$/PROJECT_NUMBER = 
$versionString/" \ -e "s/^HTML_TIMESTAMP = YES/HTML_TIMESTAMP = NO/" \ doc/Doxyfile sed -i \ -e "s/VC_VERSION_STRING \".*\"\$/VC_VERSION_STRING \"$versionString\"/" \ -e "s/VC_VERSION_NUMBER 0x.*\$/VC_VERSION_NUMBER $versionNumber/" \ include/Vc/version.h # Revert the build_tests change sed -i \ -e 's/add_custom_target(build_tests VERBATIM)/add_custom_target(build_tests ALL VERBATIM)/' \ CMakeLists.txt git commit CMakeLists.txt doc/Doxyfile include/Vc/version.h -s -F- <]" } function fatal() { echo "${1:-Error. Quit.}" >&2 exit 1 } function readWithDefault() { local default="${(P)1}" test -n "$2" && echo -n "$2 " || echo -n "$1 " test -n "$default" && echo -n "[$default] " read $1 test -z "${(P)1}" && eval ${1}="${default}" || eval ${1}="${(e)${(P)1}}" } function sourcesFor() { local pattern=$1 local file=$2 local output=$3 list=() local inside=false for i in `grep -A20 "$pattern\>" "$file"`; do case "$i" in STATIC|SHARED|MODULE|EXCLUDE_FROM_ALL) ;; "$pattern") inside=true ;; *')') $inside && test -n "${i%)}" && list=(${list} ${i%)}) inside=false ;; *) $inside && list=(${list} ${i}) ;; esac done eval "${output}=("${(u)list[@]}")" } rootDir= while [[ $# > 0 ]]; do case "$1" in -h|--help) usage exit ;; -r|--root) if ! test -f "$2/core/base/inc/TObject.h"; then echo "$2/core/base/inc/TObject.h not found" >&2 usage >&2 exit 1 fi rootDir="$2" shift ;; esac shift done if [[ -z "$rootDir" ]]; then rootDir="$HOME/src/root" readWithDefault rootDir "ROOT Sources" fi rootVcDir="$rootDir/misc/vc" echo "Clean up $rootVcDir" rm -r "$rootVcDir" vcDir="`dirname "$0"`" sourcesFor "add_library(Vc" "$vcDir/CMakeLists.txt" libVc_files pushd $vcDir includes=({scalar,sse,avx}{,/*.{h,tcc}} common{,/*.h} include/Vc/**/*) popd mkdir -p $rootVcDir/{inc/Vc,src,test} || fatal "Failed to create directories inside ROOT" for file in $includes; do src="$vcDir/$file" dst="$rootVcDir/inc/Vc/${file/include\/Vc\//}" if [[ -d "$src" ]]; then echo "mkdir $dst" mkdir -p "$dst" || fatal else echo "copying $dst" cp "$src" "$dst" || fatal fi done function copy() { while [[ $# > 0 ]]; do file="$1"; shift dstfile="src/${file//\//-}" src="$vcDir/$file" dst="$rootVcDir/$dstfile" echo "copying $dst" cp "$src" "$dst" || fatal done } copy "${libVc_files[@]}" # TODO: copy cmake files for installation # Read version number eval `awk '/VC_VERSION_NUMBER 0x[0-9]+/ { h=$3 } END { major=strtonum(substr(h, 1, 4)) minor=strtonum("0x" substr(h, 5, 2)) patch=strtonum("0x" substr(h, 7, 2)) / 2 printf "vcVersion=\"%d.%d.%d\"\n", major, minor, patch }' $vcDir/include/Vc/version.h` rootVcVersion="${vcVersion%%-*}-root" sed -i "s/${vcVersion}.*\"/$rootVcVersion\"/" $rootVcDir/inc/Vc/version.h # TODO: generate $rootVcDir/Module.mk cat > $rootVcDir/Module.mk < Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
*/ #ifndef VC_SCALAR_DEINTERLEAVE_H #define VC_SCALAR_DEINTERLEAVE_H #include "macros.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace Internal { template<> struct HelperImpl { template static Vc_ALWAYS_INLINE void deinterleave(V &a, V &b, const M *mem, A) { a = mem[0]; b = mem[1]; } static Vc_ALWAYS_INLINE void prefetchForOneRead(const void *) {} static Vc_ALWAYS_INLINE void prefetchForModify(const void *) {} static Vc_ALWAYS_INLINE void prefetchClose(const void *) {} static Vc_ALWAYS_INLINE void prefetchMid(const void *) {} static Vc_ALWAYS_INLINE void prefetchFar(const void *) {} template static Vc_ALWAYS_INLINE_L void *malloc(size_t n) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void free(void *p) Vc_ALWAYS_INLINE_R; }; } // namespace Scalar } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "helperimpl.tcc" #include "undomacros.h" #endif // VC_SCALAR_DEINTERLEAVE_H Vc-0.7.4/scalar/helperimpl.tcc000066400000000000000000000044071233512346000161770ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2010-2011 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef VC_SCALAR_HELPERIMPL_TCC #define VC_SCALAR_HELPERIMPL_TCC #include #if defined _WIN32 || defined _WIN64 #include #endif /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace Internal { template static _VC_CONSTEXPR size_t nextMultipleOf(size_t value) { return (value % X) > 0 ? value + X - (value % X) : value; } template Vc_ALWAYS_INLINE void *HelperImpl::malloc(size_t n) { void *ptr = 0; switch (A) { case Vc::AlignOnVector: return std::malloc(n); case Vc::AlignOnCacheline: // TODO: hardcoding 64 is not such a great idea #ifdef _WIN32 #ifdef __GNUC__ #define _VC_ALIGNED_MALLOC __mingw_aligned_malloc #else #define _VC_ALIGNED_MALLOC _aligned_malloc #endif ptr = _VC_ALIGNED_MALLOC(nextMultipleOf<64>(n), 64); #else if (0 == posix_memalign(&ptr, 64, nextMultipleOf<64>(n))) { return ptr; } #endif break; case Vc::AlignOnPage: // TODO: hardcoding 4096 is not such a great idea #ifdef _WIN32 ptr = _VC_ALIGNED_MALLOC(nextMultipleOf<4096>(n), 4096); #undef _VC_ALIGNED_MALLOC #else if (0 == posix_memalign(&ptr, 4096, nextMultipleOf<4096>(n))) { return ptr; } #endif break; } return ptr; } Vc_ALWAYS_INLINE void HelperImpl::free(void *p) { std::free(p); } } // namespace Internal } // namespace Vc /*OUTER_NAMESPACE_END*/ #endif // VC_SCALAR_HELPERIMPL_TCC Vc-0.7.4/scalar/interleavedmemory.tcc000066400000000000000000000157061233512346000175750ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright (C) 2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . }}}*/ #ifndef VC_SCALAR_INTERLEAVEDMEMORY_TCC #define VC_SCALAR_INTERLEAVEDMEMORY_TCC #include "macros.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace Common { template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ const typename V::AsArg v1) { m_data[m_indexes.data() + 0] = v0.data(); m_data[m_indexes.data() + 1] = v1.data(); }/*}}}*/ template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ const typename V::AsArg v1, const typename V::AsArg v2) { m_data[m_indexes.data() + 0] = v0.data(); m_data[m_indexes.data() + 1] = v1.data(); m_data[m_indexes.data() + 2] = v2.data(); }/*}}}*/ template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3) { m_data[m_indexes.data() + 0] = v0.data(); m_data[m_indexes.data() + 1] = v1.data(); m_data[m_indexes.data() + 2] = v2.data(); m_data[m_indexes.data() + 3] = v3.data(); }/*}}}*/ template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4) { m_data[m_indexes.data() + 0] = v0.data(); m_data[m_indexes.data() + 1] = v1.data(); m_data[m_indexes.data() + 2] = v2.data(); m_data[m_indexes.data() + 3] = v3.data(); m_data[m_indexes.data() + 4] = v4.data(); }/*}}}*/ template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5) { m_data[m_indexes.data() + 0] = v0.data(); m_data[m_indexes.data() + 1] = v1.data(); m_data[m_indexes.data() + 2] = v2.data(); m_data[m_indexes.data() + 3] = v3.data(); m_data[m_indexes.data() + 4] = v4.data(); m_data[m_indexes.data() + 5] = v5.data(); }/*}}}*/ template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5, const typename V::AsArg v6) { m_data[m_indexes.data() + 0] = v0.data(); m_data[m_indexes.data() + 1] = v1.data(); m_data[m_indexes.data() + 2] = v2.data(); m_data[m_indexes.data() + 3] = v3.data(); m_data[m_indexes.data() + 4] = v4.data(); m_data[m_indexes.data() + 5] = v5.data(); m_data[m_indexes.data() + 6] = v6.data(); }/*}}}*/ template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5, const typename V::AsArg v6, const typename V::AsArg v7) { m_data[m_indexes.data() + 0] = v0.data(); m_data[m_indexes.data() + 1] = v1.data(); m_data[m_indexes.data() + 2] = v2.data(); m_data[m_indexes.data() + 3] = v3.data(); m_data[m_indexes.data() + 4] = v4.data(); m_data[m_indexes.data() + 5] = v5.data(); m_data[m_indexes.data() + 6] = v6.data(); m_data[m_indexes.data() + 7] = v7.data(); }/*}}}*/ template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1) const/*{{{*/ { v0.data() = m_data[m_indexes.data() + 0]; v1.data() 
= m_data[m_indexes.data() + 1]; }/*}}}*/ template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2) const/*{{{*/ { v0.data() = m_data[m_indexes.data() + 0]; v1.data() = m_data[m_indexes.data() + 1]; v2.data() = m_data[m_indexes.data() + 2]; }/*}}}*/ template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2, V &v3) const/*{{{*/ { v0.data() = m_data[m_indexes.data() + 0]; v1.data() = m_data[m_indexes.data() + 1]; v2.data() = m_data[m_indexes.data() + 2]; v3.data() = m_data[m_indexes.data() + 3]; }/*}}}*/ template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2, V &v3, V &v4) const/*{{{*/ { v0.data() = m_data[m_indexes.data() + 0]; v1.data() = m_data[m_indexes.data() + 1]; v2.data() = m_data[m_indexes.data() + 2]; v3.data() = m_data[m_indexes.data() + 3]; v4.data() = m_data[m_indexes.data() + 4]; }/*}}}*/ template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2, V &v3, V &v4, V &v5) const/*{{{*/ { v0.data() = m_data[m_indexes.data() + 0]; v1.data() = m_data[m_indexes.data() + 1]; v2.data() = m_data[m_indexes.data() + 2]; v3.data() = m_data[m_indexes.data() + 3]; v4.data() = m_data[m_indexes.data() + 4]; v5.data() = m_data[m_indexes.data() + 5]; }/*}}}*/ template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6) const/*{{{*/ { v0.data() = m_data[m_indexes.data() + 0]; v1.data() = m_data[m_indexes.data() + 1]; v2.data() = m_data[m_indexes.data() + 2]; v3.data() = m_data[m_indexes.data() + 3]; v4.data() = m_data[m_indexes.data() + 4]; v5.data() = m_data[m_indexes.data() + 5]; v6.data() = m_data[m_indexes.data() + 6]; }/*}}}*/ template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7) const/*{{{*/ { v0.data() = m_data[m_indexes.data() + 0]; v1.data() = m_data[m_indexes.data() + 1]; v2.data() = m_data[m_indexes.data() + 2]; v3.data() = m_data[m_indexes.data() + 3]; v4.data() = m_data[m_indexes.data() + 4]; v5.data() = m_data[m_indexes.data() + 5]; v6.data() = m_data[m_indexes.data() + 6]; v7.data() = m_data[m_indexes.data() + 7]; }/*}}}*/ } // namespace Common } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "undomacros.h" #endif // VC_SCALAR_INTERLEAVEDMEMORY_TCC // vim: foldmethod=marker Vc-0.7.4/scalar/limits.h000066400000000000000000000015031233512346000150070ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2010 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef VC_SCALAR_LIMITS_H #define VC_SCALAR_LIMITS_H #endif // VC_SCALAR_LIMITS_H Vc-0.7.4/scalar/macros.h000066400000000000000000000015411233512346000147740ustar00rootroot00000000000000/* This file is part of the Vc library. 
Copyright (C) 2009-2010 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #include "../common/macros.h" #ifndef VC_SCALAR_MACROS_H #define VC_SCALAR_MACROS_H #endif // VC_SCALAR_MACROS_H Vc-0.7.4/scalar/mask.h000066400000000000000000000077061233512346000144540ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef VC_SCALAR_MASK_H #define VC_SCALAR_MASK_H #include "types.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace Scalar { template class Mask { public: Vc_ALWAYS_INLINE Mask() {} Vc_ALWAYS_INLINE explicit Mask(bool b) : m(b) {} Vc_ALWAYS_INLINE explicit Mask(VectorSpecialInitializerZero::ZEnum) : m(false) {} Vc_ALWAYS_INLINE explicit Mask(VectorSpecialInitializerOne::OEnum) : m(true) {} Vc_ALWAYS_INLINE Mask(const Mask *a) : m(a[0].m) {} Vc_ALWAYS_INLINE Mask &operator=(const Mask &rhs) { m = rhs.m; return *this; } Vc_ALWAYS_INLINE Mask &operator=(bool rhs) { m = rhs; return *this; } Vc_ALWAYS_INLINE void expand(Mask *x) { x[0].m = m; } Vc_ALWAYS_INLINE bool operator==(const Mask &rhs) const { return Mask(m == rhs.m); } Vc_ALWAYS_INLINE bool operator!=(const Mask &rhs) const { return Mask(m != rhs.m); } Vc_ALWAYS_INLINE Mask operator&&(const Mask &rhs) const { return Mask(m && rhs.m); } Vc_ALWAYS_INLINE Mask operator& (const Mask &rhs) const { return Mask(m && rhs.m); } Vc_ALWAYS_INLINE Mask operator||(const Mask &rhs) const { return Mask(m || rhs.m); } Vc_ALWAYS_INLINE Mask operator| (const Mask &rhs) const { return Mask(m || rhs.m); } Vc_ALWAYS_INLINE Mask operator^ (const Mask &rhs) const { return Mask(m ^ rhs.m); } Vc_ALWAYS_INLINE Mask operator!() const { return Mask(!m); } Vc_ALWAYS_INLINE Mask &operator&=(const Mask &rhs) { m &= rhs.m; return *this; } Vc_ALWAYS_INLINE Mask &operator|=(const Mask &rhs) { m |= rhs.m; return *this; } Vc_ALWAYS_INLINE Mask &operator^=(const Mask &rhs) { m ^= rhs.m; return *this; } Vc_ALWAYS_INLINE bool isFull () const { return m; } Vc_ALWAYS_INLINE bool isEmpty() const { return !m; } Vc_ALWAYS_INLINE bool isMix () const { return false; } Vc_ALWAYS_INLINE bool data () const { return m; } Vc_ALWAYS_INLINE bool dataI() const { return m; } Vc_ALWAYS_INLINE bool dataD() const { return m; } #ifndef VC_NO_AUTOMATIC_BOOL_FROM_MASK Vc_ALWAYS_INLINE operator bool() const { return isFull(); } #endif template Vc_ALWAYS_INLINE Mask cast() const { return *this; } Vc_ALWAYS_INLINE bool operator[](int) const { 
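// Illustrative usage sketch, not part of the original file: a scalar Mask simply wraps one
// bool, so count() is 0 or 1 and firstOne() is always 0, but the generic mask API still works:
//   Vc::float_v x = Vc::float_v::Random();
//   Vc::float_m m = x < 0.5f;   // a comparison yields a mask
//   x(m) = 0.f;                 // write-masked assignment: only entries where m is true
//   if (m.isEmpty()) { /* nothing was below 0.5 */ }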
return m; } Vc_ALWAYS_INLINE int count() const { return m ? 1 : 0; } /** * Returns the index of the first one in the mask. * * The return value is undefined if the mask is empty. */ Vc_ALWAYS_INLINE int firstOne() const { return 0; } private: bool m; }; struct ForeachHelper { bool continu; Vc_ALWAYS_INLINE ForeachHelper(bool mask) : continu(mask) {} Vc_ALWAYS_INLINE void next() { continu = false; } }; #define Vc_foreach_bit(_it_, _mask_) \ for (Vc::Scalar::ForeachHelper Vc__make_unique(foreach_bit_obj)(_mask_); Vc__make_unique(foreach_bit_obj).continu; Vc__make_unique(foreach_bit_obj).next()) \ for (_it_ = 0; Vc__make_unique(foreach_bit_obj).continu; Vc__make_unique(foreach_bit_obj).next()) } // namespace Scalar } // namespace Vc /*OUTER_NAMESPACE_END*/ #endif // VC_SCALAR_MASK_H Vc-0.7.4/scalar/math.h000066400000000000000000000173361233512346000144520ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef VC_SCALAR_MATH_H #define VC_SCALAR_MATH_H #include "macros.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace Scalar { #define VC_MINMAX(V) \ static Vc_ALWAYS_INLINE V min(const V &x, const V &y) { return V(std::min(x.data(), y.data())); } \ static Vc_ALWAYS_INLINE V max(const V &x, const V &y) { return V(std::max(x.data(), y.data())); } VC_ALL_VECTOR_TYPES(VC_MINMAX) #undef VC_MINMAX template static Vc_ALWAYS_INLINE Vector sqrt (const Vector &x) { return Vector(std::sqrt(x.data())); } template static Vc_ALWAYS_INLINE Vector rsqrt(const Vector &x) { const typename Vector::EntryType one = 1; return Vector(one / std::sqrt(x.data())); } template static Vc_ALWAYS_INLINE Vector abs (const Vector &x) { return Vector(std::abs(x.data())); } template<> Vc_ALWAYS_INLINE int_v abs(const int_v &x) { return x < 0 ? -x : x; } template<> Vc_ALWAYS_INLINE uint_v abs(const uint_v &x) { return x; } template<> Vc_ALWAYS_INLINE short_v abs(const short_v &x) { return x < 0 ? 
-x : x; } template<> Vc_ALWAYS_INLINE ushort_v abs(const ushort_v &x) { return x; } template static Vc_ALWAYS_INLINE void sincos(const Vector &x, Vector *sin, Vector *cos) { #if (defined(VC_CLANG) && VC_HAS_BUILTIN(__builtin_sincosf)) || (!defined(VC_CLANG) && defined(__GNUC__) && !defined(_WIN32)) __builtin_sincosf(x.data(), &sin->data(), &cos->data()); #elif defined(_GNU_SOURCE) sincosf(x.data(), &sin->data(), &cos->data()); #else sin->data() = std::sin(x.data()); cos->data() = std::cos(x.data()); #endif } template<> Vc_ALWAYS_INLINE void sincos(const Vector &x, Vector *sin, Vector *cos) { #if (defined(VC_CLANG) && VC_HAS_BUILTIN(__builtin_sincos)) || (!defined(VC_CLANG) && defined(__GNUC__) && !defined(_WIN32)) __builtin_sincos(x.data(), &sin->data(), &cos->data()); #elif defined(_GNU_SOURCE) ::sincos(x.data(), &sin->data(), &cos->data()); #else sin->data() = std::sin(x.data()); cos->data() = std::cos(x.data()); #endif } template static Vc_ALWAYS_INLINE Vector sin (const Vector &x) { return Vector(std::sin(x.data())); } template static Vc_ALWAYS_INLINE Vector asin (const Vector &x) { return Vector(std::asin(x.data())); } template static Vc_ALWAYS_INLINE Vector cos (const Vector &x) { return Vector(std::cos(x.data())); } template static Vc_ALWAYS_INLINE Vector log (const Vector &x) { return Vector(std::log(x.data())); } template static Vc_ALWAYS_INLINE Vector log10(const Vector &x) { return Vector(std::log10(x.data())); } #if (defined(_XOPEN_SOURCE) && _XOPEN_SOURCE >= 600) || defined(_ISOC99_SOURCE) || (defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112L) static Vc_ALWAYS_INLINE double_v log2(double_v::AsArg x) { return double_v(::log2 (x.data())); } static Vc_ALWAYS_INLINE sfloat_v log2(sfloat_v::AsArg x) { return sfloat_v(::log2f(x.data())); } static Vc_ALWAYS_INLINE float_v log2( float_v::AsArg x) { return float_v(::log2f(x.data())); } #else namespace { template static _VC_CONSTEXPR T c_ln2() { return Vc_buildFloat(1, 0x317218, -1); } // .693147182464599609375 template<> _VC_CONSTEXPR double c_ln2() { return Vc_buildDouble(1, 0x62E42FEFA39EFull, -1); } // .69314718055994528622676398299518041312694549560546875 } #define VC_LOG2(V) \ static Vc_ALWAYS_INLINE V log2(const V &x) \ { \ return V(std::log(x.data()) / c_ln2()); \ } VC_ALL_FLOAT_VECTOR_TYPES(VC_LOG2) #undef VC_LOG2 #endif template static Vc_ALWAYS_INLINE Vector exp (const Vector &x) { return Vector(std::exp(x.data())); } template static Vc_ALWAYS_INLINE Vector atan (const Vector &x) { return Vector(std::atan( x.data() )); } template static Vc_ALWAYS_INLINE Vector atan2(const Vector &x, const Vector &y) { return Vector(std::atan2( x.data(), y.data() )); } template static Vc_ALWAYS_INLINE Vector trunc(const Vector &x) { #if __cplusplus >= 201103 /*C++11*/ return std::trunc(x.data()); #else return x.data() > 0 ? std::floor(x.data()) : std::ceil(x.data()); #endif } template static Vc_ALWAYS_INLINE Vector floor(const Vector &x) { return Vector(std::floor(x.data())); } template static Vc_ALWAYS_INLINE Vector ceil(const Vector &x) { return Vector(std::ceil(x.data())); } template static Vc_ALWAYS_INLINE Vector round(const Vector &x) { return x; } namespace { template bool _realIsEvenHalf(T x) { const T two = 2; const T half = 0.5; const T f = std::floor(x * half) * two; return (x - f) == half; } } // namespace template<> Vc_ALWAYS_INLINE Vector round(const Vector &x) { return float_v(std::floor(x.data() + 0.5f) - (_realIsEvenHalf(x.data()) ? 
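// Illustrative note, not part of the original file: _realIsEvenHalf() makes the
// std::floor(x + 0.5) expression in these round() specializations round halfway cases to
// even ("banker's rounding"), matching what the SIMD backends get from the hardware
// rounding mode. Worked examples:
//   round(2.3f) == 2.f   // floor(2.8f) = 2, not a halfway case
//   round(2.5f) == 2.f   // floor(3.0f) = 3, but 2.5 is an even halfway case, so 1 is subtracted
//   round(3.5f) == 4.f   // floor(4.0f) = 4, 3.5 rounds up to its even neighbour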
1.f : 0.f)); } template<> Vc_ALWAYS_INLINE Vector round(const Vector &x) { return sfloat_v(std::floor(x.data() + 0.5f) - (_realIsEvenHalf(x.data()) ? 1.f : 0.f)); } template<> Vc_ALWAYS_INLINE Vector round(const Vector &x) { return double_v(std::floor(x.data() + 0.5 ) - (_realIsEvenHalf(x.data()) ? 1. : 0. )); } template static Vc_ALWAYS_INLINE Vector reciprocal(const Vector &x) { const typename Vector::EntryType one = 1; return Vector(one / x.data()); } #ifdef isfinite #undef isfinite #endif #ifdef isnan #undef isnan #endif template static Vc_ALWAYS_INLINE typename Vector::Mask isfinite(const Vector &x) { return typename Vector::Mask( #ifdef _MSC_VER !!_finite(x.data()) #elif defined(__INTEL_COMPILER) ::isfinite(x.data()) #else std::isfinite(x.data()) #endif ); } template static Vc_ALWAYS_INLINE typename Vector::Mask isnan(const Vector &x) { return typename Vector::Mask( #ifdef _MSC_VER !!_isnan(x.data()) #elif defined(__INTEL_COMPILER) ::isnan(x.data()) #else std::isnan(x.data()) #endif ); } Vc_ALWAYS_INLINE Vector frexp(Vector x, Vector *e) { return float_v(::frexpf(x.data(), &e->data())); } Vc_ALWAYS_INLINE Vector frexp(Vector x, Vector *e) { return double_v(::frexp(x.data(), &e->data())); } Vc_ALWAYS_INLINE sfloat_v frexp(sfloat_v x, short_v *e) { int ee; const float r = ::frexpf(x.data(), &ee); e->data() = ee; return sfloat_v(r); } Vc_ALWAYS_INLINE Vector ldexp(Vector x, Vector e) { return float_v(::ldexpf(x.data(), e.data())); } Vc_ALWAYS_INLINE Vector ldexp(Vector x, Vector e) { return double_v(::ldexp(x.data(), e.data())); } Vc_ALWAYS_INLINE sfloat_v ldexp(sfloat_v x, short_v e) { return sfloat_v(::ldexpf(x.data(), e.data())); } } // namespace Scalar } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "undomacros.h" #endif // VC_SCALAR_MATH_H Vc-0.7.4/scalar/types.h000066400000000000000000000024121233512346000146520ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2011 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef VC_SCALAR_TYPES_H #define VC_SCALAR_TYPES_H #define VC_DOUBLE_V_SIZE 1 #define VC_FLOAT_V_SIZE 1 #define VC_SFLOAT_V_SIZE 1 #define VC_INT_V_SIZE 1 #define VC_UINT_V_SIZE 1 #define VC_SHORT_V_SIZE 1 #define VC_USHORT_V_SIZE 1 #include "../common/types.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace Scalar { template class VectorAlignedBaseT {}; template class Vector; } // namespace Scalar } // namespace Vc /*OUTER_NAMESPACE_END*/ #endif // VC_SCALAR_TYPES_H Vc-0.7.4/scalar/undomacros.h000066400000000000000000000015611233512346000156640ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2010 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef VC_SCALAR_UNDOMACROS_H #define VC_SCALAR_UNDOMACROS_H #endif // VC_SCALAR_UNDOMACROS_H #include "../common/undomacros.h" Vc-0.7.4/scalar/vector.h000066400000000000000000000572001233512346000150150ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef SCALAR_VECTOR_H #define SCALAR_VECTOR_H #include #include #include #ifdef _MSC_VER #include #endif #include "../common/memoryfwd.h" #include "macros.h" #include "types.h" #include "mask.h" #include "writemaskedvector.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace Scalar { enum VectorAlignmentEnum { VectorAlignment = 4 }; template class Vector { friend class WriteMaskedVector; public: typedef typename DetermineEntryType::Type EntryType; protected: EntryType m_data; public: typedef Vc::Memory, 1> Memory; typedef Vector IndexType; typedef Scalar::Mask<1u> Mask; typedef Vector AsArg; Vc_ALWAYS_INLINE EntryType &data() { return m_data; } Vc_ALWAYS_INLINE EntryType data() const { return m_data; } enum Constants { Size = 1 }; /////////////////////////////////////////////////////////////////////////////////////////// // uninitialized Vc_ALWAYS_INLINE Vector() {} /////////////////////////////////////////////////////////////////////////////////////////// // constants Vc_ALWAYS_INLINE Vector(VectorSpecialInitializerZero::ZEnum) : m_data(0) {} Vc_ALWAYS_INLINE Vector(VectorSpecialInitializerOne::OEnum) : m_data(1) {} Vc_ALWAYS_INLINE Vector(VectorSpecialInitializerIndexesFromZero::IEnum) : m_data(0) {} static Vc_ALWAYS_INLINE Vector Zero() { Vector r; r.m_data = 0; return r; } static Vc_ALWAYS_INLINE Vector One() { Vector r; r.m_data = 1; return r; } static Vc_ALWAYS_INLINE Vector IndexesFromZero() { return Zero(); } static Vc_INTRINSIC_L Vector Random() Vc_INTRINSIC_R; /////////////////////////////////////////////////////////////////////////////////////////// // static_cast / copy ctor template explicit Vc_ALWAYS_INLINE Vector(const Vector &x) : m_data(static_cast(x.data())) {} // implicit cast template Vc_ALWAYS_INLINE_L Vector &operator=(const Vector &x) Vc_ALWAYS_INLINE_R; // copy assignment Vc_ALWAYS_INLINE Vector &operator=(Vector v) { m_data = v.data(); return *this; } /////////////////////////////////////////////////////////////////////////////////////////// // broadcast explicit Vc_ALWAYS_INLINE Vector(EntryType x) : m_data(x) {} template Vc_ALWAYS_INLINE Vector(TT x, VC_EXACT_TYPE(TT, EntryType, void *) = 0) : m_data(x) {} Vc_ALWAYS_INLINE Vector &operator=(EntryType a) { m_data = a; return *this; } 
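// Illustrative usage sketch, not part of the original file: broadcast construction and
// assignment work exactly as in the SIMD backends, only with Size == 1:
//   Vc::float_v a(3.f);    // broadcast constructor, the single entry becomes 3.f
//   a = 5.f;               // broadcast assignment
//   float x = a[0];        // element access; 0 is the only valid index here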
/////////////////////////////////////////////////////////////////////////////////////////// // load ctors explicit Vc_ALWAYS_INLINE Vector(const EntryType *x) : m_data(x[0]) {} template Vc_ALWAYS_INLINE Vector(const EntryType *x, A) : m_data(x[0]) {} template explicit Vc_ALWAYS_INLINE Vector(const Other *x) : m_data(x[0]) {} template Vc_ALWAYS_INLINE Vector(const Other *x, A) : m_data(x[0]) {} /////////////////////////////////////////////////////////////////////////////////////////// // expand 1 float_v to 2 double_v XXX rationale? remove it for release? XXX template Vc_ALWAYS_INLINE void expand(Vector *x) const { x->data() = static_cast(m_data); } template explicit Vc_ALWAYS_INLINE Vector(const Vector *a) : m_data(static_cast(a->data())) {} /////////////////////////////////////////////////////////////////////////////////////////// // zeroing Vc_ALWAYS_INLINE void setZero() { m_data = 0; } Vc_ALWAYS_INLINE void setZero(Mask k) { if (k) m_data = 0; } Vc_INTRINSIC_L void setQnan() Vc_INTRINSIC_R; Vc_INTRINSIC_L void setQnan(Mask m) Vc_INTRINSIC_R; /////////////////////////////////////////////////////////////////////////////////////////// // load member functions template Vc_ALWAYS_INLINE void load(const Other *mem) { m_data = mem[0]; } template Vc_ALWAYS_INLINE void load(const Other *mem, A) { m_data = mem[0]; } template Vc_ALWAYS_INLINE void load(const Other *mem, Mask m) { if (m.data()) m_data = mem[0]; } Vc_ALWAYS_INLINE void load(const EntryType *mem) { m_data = mem[0]; } template Vc_ALWAYS_INLINE void load(const EntryType *mem, A) { m_data = mem[0]; } Vc_ALWAYS_INLINE void load(const EntryType *mem, Mask m) { if (m.data()) m_data = mem[0]; } /////////////////////////////////////////////////////////////////////////////////////////// // stores Vc_ALWAYS_INLINE void store(EntryType *mem) const { mem[0] = m_data; } Vc_ALWAYS_INLINE void store(EntryType *mem, Mask m) const { if (m.data()) mem[0] = m_data; } template Vc_ALWAYS_INLINE void store(EntryType *mem, A) const { store(mem); } template Vc_ALWAYS_INLINE void store(EntryType *mem, Mask m, A) const { store(mem, m); } /////////////////////////////////////////////////////////////////////////////////////////// // swizzles Vc_INTRINSIC const Vector &abcd() const { return *this; } Vc_INTRINSIC const Vector cdab() const { return *this; } Vc_INTRINSIC const Vector badc() const { return *this; } Vc_INTRINSIC const Vector aaaa() const { return *this; } Vc_INTRINSIC const Vector bbbb() const { return *this; } Vc_INTRINSIC const Vector cccc() const { return *this; } Vc_INTRINSIC const Vector dddd() const { return *this; } Vc_INTRINSIC const Vector bcad() const { return *this; } Vc_INTRINSIC const Vector bcda() const { return *this; } Vc_INTRINSIC const Vector dabc() const { return *this; } Vc_INTRINSIC const Vector acbd() const { return *this; } Vc_INTRINSIC const Vector dbca() const { return *this; } Vc_INTRINSIC const Vector dcba() const { return *this; } /////////////////////////////////////////////////////////////////////////////////////////// // gathers template Vc_ALWAYS_INLINE Vector(const EntryType *array, const IndexT *indexes) : m_data(array[indexes[0]]) {} template Vc_ALWAYS_INLINE Vector(const EntryType *array, Vector indexes) : m_data(array[indexes[0]]) {} template Vc_ALWAYS_INLINE Vector(const EntryType *array, IndexT indexes, Mask m) : m_data(m.data() ? array[indexes[0]] : 0) {} template Vc_ALWAYS_INLINE Vector(const S1 *array, const EntryType S1::* member1, IT indexes, Mask mask = Mask(true)) : m_data(mask.data() ? 
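// Illustrative usage sketch, not part of the original file: for the scalar backend a gather
// reads and a scatter writes exactly one element, selected by indexes[0]:
//   float table[100] = { /* ... */ };
//   Vc::uint_v idx(7u);           // index "vector" with a single entry
//   Vc::float_v v(table, idx);    // gather: v[0] = table[7]
//   v += 1.f;
//   v.scatter(table, idx);        // scatter: table[7] = v[0]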
(&array[indexes[0]])->*(member1) : 0) {} template Vc_ALWAYS_INLINE Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, IT indexes, Mask mask = Mask(true)) : m_data(mask.data() ? array[indexes[0]].*(member1).*(member2) : 0) {} template Vc_ALWAYS_INLINE Vector(const S1 *array, const EntryType *const S1::* ptrMember1, IT1 outerIndex, IT2 innerIndex, Mask mask = Mask(true)) : m_data(mask.data() ? (array[outerIndex[0]].*(ptrMember1))[innerIndex[0]] : 0) {} template Vc_ALWAYS_INLINE void gather(const EntryType *array, IT indexes, Mask mask = Mask(true)) { if (mask.data()) m_data = array[indexes[0]]; } template Vc_ALWAYS_INLINE void gather(const S1 *array, const EntryType S1::* member1, IT indexes, Mask mask = Mask(true)) { if (mask.data()) m_data = (&array[indexes[0]])->*(member1); } template Vc_ALWAYS_INLINE void gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, IT indexes, Mask mask = Mask(true)) { if (mask.data()) m_data = array[indexes[0]].*(member1).*(member2); } template Vc_ALWAYS_INLINE void gather(const S1 *array, const EntryType *const S1::* ptrMember1, IT1 outerIndex, IT2 innerIndex, Mask mask = Mask(true)) { if (mask.data()) m_data = (array[outerIndex[0]].*(ptrMember1))[innerIndex[0]]; } /////////////////////////////////////////////////////////////////////////////////////////// // scatters Vc_ALWAYS_INLINE void scatter(EntryType *array, const Vector &indexes, Mask m = Mask(true)) const { if (m.data()) array[indexes[0]] = m_data; } template Vc_ALWAYS_INLINE void scatter(S1 *array, EntryType S1::* member, const Vector &indexes, Mask m = Mask(true)) const { if (m.data()) array[indexes[0]].*(member) = m_data; } template Vc_ALWAYS_INLINE void scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, const Vector &indexes, Mask m = Mask(true)) const { if (m.data()) array[indexes[0]].*(member1).*(member2) = m_data; } Vc_ALWAYS_INLINE void scatter(EntryType *array, const Vector &indexes, Mask m = Mask(true)) const { if (m.data()) array[indexes[0]] = m_data; } template Vc_ALWAYS_INLINE void scatter(S1 *array, EntryType S1::* member, const Vector &indexes, Mask m = Mask(true)) const { if (m.data()) array[indexes[0]].*(member) = m_data; } template Vc_ALWAYS_INLINE void scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, const Vector &indexes, Mask m = Mask(true)) const { if (m.data()) array[indexes[0]].*(member1).*(member2) = m_data; } //prefix Vc_ALWAYS_INLINE Vector &operator++() { ++m_data; return *this; } Vc_ALWAYS_INLINE Vector &operator--() { --m_data; return *this; } //postfix Vc_ALWAYS_INLINE Vector operator++(int) { return m_data++; } Vc_ALWAYS_INLINE Vector operator--(int) { return m_data--; } Vc_ALWAYS_INLINE EntryType &operator[](size_t index) { assert(index == 0); if(index) {} return m_data; } Vc_ALWAYS_INLINE EntryType operator[](size_t index) const { assert(index == 0); if(index) {} return m_data; } Vc_ALWAYS_INLINE Vector operator~() const { return Vector(~m_data); } Vc_ALWAYS_INLINE Vector::Type> operator-() const { return Vector::Type>(-m_data); } Vc_INTRINSIC Vector Vc_PURE operator+() const { return *this; } #define OPshift(symbol) \ Vc_ALWAYS_INLINE Vector &operator symbol##=(const Vector &x) { m_data symbol##= x.m_data; return *this; } \ Vc_ALWAYS_INLINE Vector &operator symbol##=(EntryType x) { return operator symbol##=(Vector(x)); } \ Vc_ALWAYS_INLINE Vector operator symbol(const Vector &x) const { return Vector(m_data symbol x.m_data); } #define OPshift_int(symbol) \ Vc_ALWAYS_INLINE Vector operator 
symbol(int x) const { return Vector(m_data symbol x); } #define OP(symbol) \ OPshift(symbol) \ template Vc_ALWAYS_INLINE VC_EXACT_TYPE(TT, EntryType, Vector) operator symbol(TT x) const { return operator symbol(Vector(x)); } #define OPcmp(symbol) \ Vc_ALWAYS_INLINE Mask operator symbol(const Vector &x) const { return Mask(m_data symbol x.m_data); } \ template Vc_ALWAYS_INLINE VC_EXACT_TYPE(TT, EntryType, Mask) operator symbol(TT x) const { return Mask(m_data symbol x); } VC_ALL_ARITHMETICS(OP) VC_ALL_BINARY(OP) VC_ALL_SHIFTS(OPshift) VC_ALL_SHIFTS(OPshift_int) VC_ALL_COMPARES(OPcmp) #undef OP #undef OPcmp #undef OPshift #undef OPshift_int Vc_INTRINSIC_L Vc_PURE_L Mask isNegative() const Vc_PURE_R Vc_INTRINSIC_R; Vc_ALWAYS_INLINE void fusedMultiplyAdd(const Vector &factor, const Vector &summand) { m_data = m_data * factor.data() + summand.data(); } Vc_ALWAYS_INLINE void assign(const Vector &v, const Mask &m) { if (m.data()) m_data = v.m_data; } template Vc_ALWAYS_INLINE V2 staticCast() const { return V2(static_cast(m_data)); } template Vc_ALWAYS_INLINE V2 reinterpretCast() const { typedef typename V2::EntryType AliasT2 Vc_MAY_ALIAS; return V2(*reinterpret_cast(&m_data)); } Vc_ALWAYS_INLINE WriteMaskedVector operator()(Mask m) { return WriteMaskedVector(this, m); } Vc_ALWAYS_INLINE bool pack(Mask &m1, Vector &v2, Mask &m2) { if (!m1.data() && m2.data()) { m_data = v2.m_data; m1 = true; m2 = false; return true; } return m1; } Vc_ALWAYS_INLINE EntryType min() const { return m_data; } Vc_ALWAYS_INLINE EntryType max() const { return m_data; } Vc_ALWAYS_INLINE EntryType product() const { return m_data; } Vc_ALWAYS_INLINE EntryType sum() const { return m_data; } Vc_ALWAYS_INLINE EntryType min(Mask) const { return m_data; } Vc_ALWAYS_INLINE EntryType max(Mask) const { return m_data; } Vc_ALWAYS_INLINE EntryType product(Mask) const { return m_data; } Vc_ALWAYS_INLINE EntryType sum(Mask m) const { if (m) return m_data; return static_cast(0); } Vc_INTRINSIC Vector shifted(int amount) const { return amount == 0 ? 
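// Illustrative usage sketch, not part of the original file: call()/apply() forward the single
// entry to a callable, and the horizontal reductions (min, max, sum, product) are trivial
// identities for Size == 1:
//   struct Square { float operator()(float x) const { return x * x; } };
//   Vc::float_v v(4.f);
//   Vc::float_v r = v.apply(Square());   // r[0] == 16.f
//   float s = v.sum();                   // == v[0] in the scalar backend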
*this : Zero(); } Vc_INTRINSIC Vector rotated(int) const { return *this; } Vector sorted() const { return *this; } template void callWithValuesSorted(F &f) { f(m_data); } template Vc_INTRINSIC void call(const F &f) const { f(m_data); } template Vc_INTRINSIC void call(F &f) const { f(m_data); } template Vc_INTRINSIC void call(const F &f, Mask mask) const { if (mask) { f(m_data); } } template Vc_INTRINSIC void call(F &f, Mask mask) const { if (mask) { f(m_data); } } template Vc_INTRINSIC Vector apply(const F &f) const { return Vector(f(m_data)); } template Vc_INTRINSIC Vector apply(F &f) const { return Vector(f(m_data)); } template Vc_INTRINSIC Vector apply(const F &f, Mask mask) const { if (mask) { return Vector(f(m_data)); } else { return *this; } } template Vc_INTRINSIC Vector apply(F &f, Mask mask) const { if (mask) { return Vector(f(m_data)); } else { return *this; } } template Vc_INTRINSIC void fill(EntryType (&f)(IndexT)) { m_data = f(0); } Vc_INTRINSIC void fill(EntryType (&f)()) { m_data = f(); } Vc_INTRINSIC_L Vector copySign(Vector reference) const Vc_INTRINSIC_R; Vc_INTRINSIC_L Vector exponent() const Vc_INTRINSIC_R; }; typedef Vector double_v; typedef Vector float_v; typedef Vector sfloat_v; typedef Vector int_v; typedef Vector uint_v; typedef Vector short_v; typedef Vector ushort_v; typedef double_v::Mask double_m; typedef float_v::Mask float_m; typedef sfloat_v::Mask sfloat_m; typedef int_v::Mask int_m; typedef uint_v::Mask uint_m; typedef short_v::Mask short_m; typedef ushort_v::Mask ushort_m; template class SwizzledVector : public Vector {}; #ifdef _MSC_VER template static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &) { } #else template static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x01) { __asm__ __volatile__(""::"r"(x01.data())); } template<> Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x01) { __asm__ __volatile__(""::"x"(x01.data())); } template<> Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x01) { __asm__ __volatile__(""::"x"(x01.data())); } #endif template static Vc_ALWAYS_INLINE void forceToRegisters( const Vector &x01, const Vector &x02) { forceToRegisters(x01); forceToRegisters(x02); } template static Vc_ALWAYS_INLINE void forceToRegisters( const Vector &, const Vector &, const Vector &) {} template static Vc_ALWAYS_INLINE void forceToRegisters( const Vector &, const Vector &, const Vector &, const Vector &) {} template static Vc_ALWAYS_INLINE void forceToRegisters( const Vector &, const Vector &, const Vector &, const Vector &, const Vector &) {} template static Vc_ALWAYS_INLINE void forceToRegisters( const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &) {} template static Vc_ALWAYS_INLINE void forceToRegisters( const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &) {} template static Vc_ALWAYS_INLINE void forceToRegisters( const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &) {} template static Vc_ALWAYS_INLINE void forceToRegisters( const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &) {} template static Vc_ALWAYS_INLINE void forceToRegisters( const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &) {} template static Vc_ALWAYS_INLINE void forceToRegisters( const 
Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &) {} template static Vc_ALWAYS_INLINE void forceToRegisters( const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &) {} template static Vc_ALWAYS_INLINE void forceToRegisters( const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &) {} template static Vc_ALWAYS_INLINE void forceToRegisters( const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &) {} template static Vc_ALWAYS_INLINE void forceToRegisters( const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &) {} template static Vc_ALWAYS_INLINE void forceToRegisters( const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &, const Vector &) {} } // namespace Scalar } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "vector.tcc" #include "math.h" #include "undomacros.h" #endif // SCALAR_VECTOR_H Vc-0.7.4/scalar/vector.tcc000066400000000000000000000174161233512346000153440ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2011-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
*/ /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { ALIGN(64) extern unsigned int RandomState[16]; namespace Scalar { // conversion/casts {{{1 template<> template<> Vc_INTRINSIC short_v &Vector::operator=(const ushort_v &x) { data() = static_cast(x.data()); return *this; } template<> template<> Vc_INTRINSIC ushort_v &Vector::operator=(const short_v &x) { data() = static_cast(x.data()); return *this; } template<> template<> Vc_INTRINSIC int_v &Vector::operator=(const uint_v &x) { data() = static_cast(x.data()); return *this; } template<> template<> Vc_INTRINSIC uint_v &Vector::operator=(const int_v &x) { data() = static_cast(x.data()); return *this; } // copySign ///////////////////////////////////////////////////////////////////////// {{{1 template<> Vc_INTRINSIC Vector Vector::copySign(Vector reference) const { union { float f; unsigned int i; } value, sign; value.f = data(); sign.f = reference.data(); value.i = (sign.i & 0x80000000u) | (value.i & 0x7fffffffu); return float_v(value.f); } template<> Vc_INTRINSIC sfloat_v Vector::copySign(sfloat_v reference) const { return sfloat_v(float_v(m_data).copySign(float_v(reference.data())).data()); } template<> Vc_INTRINSIC Vector Vector::copySign(Vector reference) const { union { double f; unsigned long long i; } value, sign; value.f = data(); sign.f = reference.data(); value.i = (sign.i & 0x8000000000000000ull) | (value.i & 0x7fffffffffffffffull); return double_v(value.f); } // }}}1 // bitwise operators {{{1 #define VC_CAST_OPERATOR_FORWARD(op, IntT, VecT) \ template<> Vc_ALWAYS_INLINE VecT &VecT::operator op##=(const VecT &x) { \ typedef IntT uinta Vc_MAY_ALIAS; \ uinta *left = reinterpret_cast(&m_data); \ const uinta *right = reinterpret_cast(&x.m_data); \ *left op##= *right; \ return *this; \ } \ template<> Vc_ALWAYS_INLINE Vc_PURE VecT VecT::operator op(const VecT &x) const { \ VecT ret = *this; \ return VecT(ret op##= x); \ } #define VC_CAST_OPERATOR_FORWARD_FLOAT(op) VC_CAST_OPERATOR_FORWARD(op, unsigned int, Vector) #define VC_CAST_OPERATOR_FORWARD_SFLOAT(op) VC_CAST_OPERATOR_FORWARD(op, unsigned int, Vector) #define VC_CAST_OPERATOR_FORWARD_DOUBLE(op) VC_CAST_OPERATOR_FORWARD(op, unsigned long long, Vector) VC_ALL_BINARY(VC_CAST_OPERATOR_FORWARD_FLOAT) VC_ALL_BINARY(VC_CAST_OPERATOR_FORWARD_SFLOAT) VC_ALL_BINARY(VC_CAST_OPERATOR_FORWARD_DOUBLE) #undef VC_CAST_OPERATOR_FORWARD #undef VC_CAST_OPERATOR_FORWARD_FLOAT #undef VC_CAST_OPERATOR_FORWARD_SFLOAT #undef VC_CAST_OPERATOR_FORWARD_DOUBLE // }}}1 // operators {{{1 #include "../common/operators.h" // }}}1 // exponent {{{1 template<> Vc_INTRINSIC Vector Vector::exponent() const { VC_ASSERT(m_data >= 0.f); union { float f; int i; } value; value.f = m_data; return float_v(static_cast((value.i >> 23) - 0x7f)); } template<> Vc_INTRINSIC sfloat_v Vector::exponent() const { return sfloat_v(float_v(m_data).exponent().data()); } template<> Vc_INTRINSIC Vector Vector::exponent() const { VC_ASSERT(m_data >= 0.); union { double f; long long i; } value; value.f = m_data; return double_v(static_cast((value.i >> 52) - 0x3ff)); } // }}}1 // FMA {{{1 static Vc_ALWAYS_INLINE float highBits(float x) { union { float f; unsigned int i; } y; y.f = x; y.i &= 0xfffff000u; return y.f; } static Vc_ALWAYS_INLINE double highBits(double x) { union { double f; unsigned long long i; } y; y.f = x; y.i &= 0xfffffffff8000000ull; return y.f; } template Vc_ALWAYS_INLINE T _fusedMultiplyAdd(T a, T b, T c) { const T h1 = highBits(a); const T l1 = a - h1; const T h2 = highBits(b); const T l2 = b - h2; const T ll = l1 * l2; const T lh 
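// Illustrative note, not part of the original file: this _fusedMultiplyAdd() helper emulates
// a fused multiply-add without hardware FMA via Dekker-style splitting. highBits() truncates
// each factor to a "head" carrying roughly half the significand bits, so the partial products
// h1*h2, h1*l2 + l1*h2 and l1*l2 are computed (nearly) exactly; the branch on std::abs(c)
// below then picks a summation order that adds the small terms first, recovering most of the
// rounding error a plain a * b + c would lose.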
= l1 * h2 + h1 * l2; const T hh = h1 * h2; if (std::abs(c) < std::abs(lh)) { return (ll + c) + (lh + hh); } else { return (ll + lh) + (c + hh); } } template<> Vc_ALWAYS_INLINE void float_v::fusedMultiplyAdd(const float_v &f, const float_v &s) { data() = _fusedMultiplyAdd(data(), f.data(), s.data()); } template<> Vc_ALWAYS_INLINE void sfloat_v::fusedMultiplyAdd(const sfloat_v &f, const sfloat_v &s) { data() = _fusedMultiplyAdd(data(), f.data(), s.data()); } template<> Vc_ALWAYS_INLINE void double_v::fusedMultiplyAdd(const double_v &f, const double_v &s) { data() = _fusedMultiplyAdd(data(), f.data(), s.data()); } // Random {{{1 static Vc_ALWAYS_INLINE void _doRandomStep(Vector &state0, Vector &state1) { state0.load(&Vc::RandomState[0]); state1.load(&Vc::RandomState[uint_v::Size]); (state1 * 0xdeece66du + 11).store(&Vc::RandomState[uint_v::Size]); uint_v((state0 * 0xdeece66du + 11).data() ^ (state1.data() >> 16)).store(&Vc::RandomState[0]); } template Vc_INTRINSIC Vector Vector::Random() { Vector state0, state1; _doRandomStep(state0, state1); return Vector(static_cast(state0.data())); } template<> Vc_INTRINSIC Vector Vector::Random() { Vector state0, state1; _doRandomStep(state0, state1); union { unsigned int i; float f; } x; x.i = (state0.data() & 0x0fffffffu) | 0x3f800000u; return float_v(x.f - 1.f); } template<> Vc_INTRINSIC sfloat_v Vector::Random() { return sfloat_v(Vector::Random().data()); } template<> Vc_INTRINSIC Vector Vector::Random() { typedef unsigned long long uint64 Vc_MAY_ALIAS; uint64 state0 = *reinterpret_cast(&Vc::RandomState[8]); state0 = (state0 * 0x5deece66dull + 11) & 0x000fffffffffffffull; *reinterpret_cast(&Vc::RandomState[8]) = state0; union { unsigned long long i; double f; } x; x.i = state0 | 0x3ff0000000000000ull; return double_v(x.f - 1.); } // isNegative {{{1 template Vc_INTRINSIC Vc_PURE typename Vector::Mask Vector::isNegative() const { union { float f; unsigned int i; } u; u.f = m_data; return Mask(0u != (u.i & 0x80000000u)); } template<> Vc_INTRINSIC Vc_PURE double_m double_v::isNegative() const { union { double d; unsigned long long l; } u; u.d = m_data; return double_m(0ull != (u.l & 0x8000000000000000ull)); } // setQnan {{{1 template Vc_INTRINSIC void Vector::setQnan() { union { float f; unsigned int i; } u; u.i = 0xffffffffu; m_data = u.f; } template<> Vc_INTRINSIC void double_v::setQnan() { union { double d; unsigned long long l; } u; u.l = 0xffffffffffffffffull; m_data = u.d; } template Vc_INTRINSIC void Vector::setQnan(Mask m) { if (m) { setQnan(); } } template<> Vc_INTRINSIC void double_v::setQnan(Mask m) { if (m) { setQnan(); } } // }}}1 } // namespace Scalar } // namespace Vc /*OUTER_NAMESPACE_END*/ // vim: foldmethod=marker Vc-0.7.4/scalar/writemaskedvector.h000066400000000000000000000067461233512346000172660ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
*/ #ifndef VC_SCALAR_WRITEMASKEDVECTOR_H #define VC_SCALAR_WRITEMASKEDVECTOR_H /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace Scalar { template class WriteMaskedVector { friend class Vector; typedef typename Vector::Mask Mask; typedef typename Vector::EntryType EntryType; public: //prefix Vc_ALWAYS_INLINE Vector &operator++() { if (mask) ++vec->m_data; return *vec; } Vc_ALWAYS_INLINE Vector &operator--() { if (mask) --vec->m_data; return *vec; } //postfix Vc_ALWAYS_INLINE Vector operator++(int) { if (mask) vec->m_data++; return *vec; } Vc_ALWAYS_INLINE Vector operator--(int) { if (mask) vec->m_data--; return *vec; } Vc_ALWAYS_INLINE Vector &operator+=(Vector x) { if (mask) vec->m_data += x.m_data; return *vec; } Vc_ALWAYS_INLINE Vector &operator-=(Vector x) { if (mask) vec->m_data -= x.m_data; return *vec; } Vc_ALWAYS_INLINE Vector &operator*=(Vector x) { if (mask) vec->m_data *= x.m_data; return *vec; } Vc_ALWAYS_INLINE Vector &operator/=(Vector x) { if (mask) vec->m_data /= x.m_data; return *vec; } Vc_ALWAYS_INLINE Vector &operator=(Vector x) { vec->assign(x, mask); return *vec; } Vc_ALWAYS_INLINE Vector &operator+=(EntryType x) { if (mask) vec->m_data += x; return *vec; } Vc_ALWAYS_INLINE Vector &operator-=(EntryType x) { if (mask) vec->m_data -= x; return *vec; } Vc_ALWAYS_INLINE Vector &operator*=(EntryType x) { if (mask) vec->m_data *= x; return *vec; } Vc_ALWAYS_INLINE Vector &operator/=(EntryType x) { if (mask) vec->m_data /= x; return *vec; } Vc_ALWAYS_INLINE Vector &operator=(EntryType x) { vec->assign(Vector(x), mask); return *vec; } template Vc_ALWAYS_INLINE void call(const F &f) const { vec->call(f, mask); } template Vc_ALWAYS_INLINE void call(F &f) const { vec->call(f, mask); } template Vc_ALWAYS_INLINE Vector apply(const F &f) const { if (mask) { return Vector(f(vec->m_data)); } else { return *vec; } } template Vc_ALWAYS_INLINE Vector apply(F &f) const { if (mask) { return Vector(f(vec->m_data)); } else { return *vec; } } private: Vc_ALWAYS_INLINE WriteMaskedVector(Vector *v, Mask k) : vec(v), mask(k) {} Vector *const vec; Mask mask; }; } // namespace Scalar } // namespace Vc /*OUTER_NAMESPACE_END*/ #endif // VC_SCALAR_WRITEMASKEDVECTOR_H Vc-0.7.4/src/000077500000000000000000000000001233512346000126605ustar00rootroot00000000000000Vc-0.7.4/src/avx_sorthelper.cpp000066400000000000000000000431521233512346000164360ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2011 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
*/ #include #include #include #include /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace AVX { template<> m128i SortHelper::sort(VTArg _x) { m128i lo, hi, y, x = _x; // sort pairs y = _mm_shufflelo_epi16(_mm_shufflehi_epi16(x, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1)); lo = _mm_min_epi16(x, y); hi = _mm_max_epi16(x, y); x = _mm_blend_epi16(lo, hi, 0xaa); // merge left and right quads y = _mm_shufflelo_epi16(_mm_shufflehi_epi16(x, _MM_SHUFFLE(0, 1, 2, 3)), _MM_SHUFFLE(0, 1, 2, 3)); lo = _mm_min_epi16(x, y); hi = _mm_max_epi16(x, y); x = _mm_blend_epi16(lo, hi, 0xcc); y = _mm_srli_si128(x, 2); lo = _mm_min_epi16(x, y); hi = _mm_max_epi16(x, y); x = _mm_blend_epi16(lo, _mm_slli_si128(hi, 2), 0xaa); // merge quads into octs y = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)); y = _mm_shufflelo_epi16(y, _MM_SHUFFLE(0, 1, 2, 3)); lo = _mm_min_epi16(x, y); hi = _mm_max_epi16(x, y); x = _mm_unpacklo_epi16(lo, hi); y = _mm_srli_si128(x, 8); lo = _mm_min_epi16(x, y); hi = _mm_max_epi16(x, y); x = _mm_unpacklo_epi16(lo, hi); y = _mm_srli_si128(x, 8); lo = _mm_min_epi16(x, y); hi = _mm_max_epi16(x, y); return _mm_unpacklo_epi16(lo, hi); } template<> m128i SortHelper::sort(VTArg _x) { m128i lo, hi, y, x = _x; // sort pairs y = _mm_shufflelo_epi16(_mm_shufflehi_epi16(x, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1)); lo = _mm_min_epu16(x, y); hi = _mm_max_epu16(x, y); x = _mm_blend_epi16(lo, hi, 0xaa); // merge left and right quads y = _mm_shufflelo_epi16(_mm_shufflehi_epi16(x, _MM_SHUFFLE(0, 1, 2, 3)), _MM_SHUFFLE(0, 1, 2, 3)); lo = _mm_min_epu16(x, y); hi = _mm_max_epu16(x, y); x = _mm_blend_epi16(lo, hi, 0xcc); y = _mm_srli_si128(x, 2); lo = _mm_min_epu16(x, y); hi = _mm_max_epu16(x, y); x = _mm_blend_epi16(lo, _mm_slli_si128(hi, 2), 0xaa); // merge quads into octs y = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)); y = _mm_shufflelo_epi16(y, _MM_SHUFFLE(0, 1, 2, 3)); lo = _mm_min_epu16(x, y); hi = _mm_max_epu16(x, y); x = _mm_unpacklo_epi16(lo, hi); y = _mm_srli_si128(x, 8); lo = _mm_min_epu16(x, y); hi = _mm_max_epu16(x, y); x = _mm_unpacklo_epi16(lo, hi); y = _mm_srli_si128(x, 8); lo = _mm_min_epu16(x, y); hi = _mm_max_epu16(x, y); return _mm_unpacklo_epi16(lo, hi); } template<> m256i SortHelper::sort(VTArg _hgfedcba) { VectorType hgfedcba = _hgfedcba; const m128i hgfe = hi128(hgfedcba); const m128i dcba = lo128(hgfedcba); m128i l = _mm_min_epi32(hgfe, dcba); // ↓hd ↓gc ↓fb ↓ea m128i h = _mm_max_epi32(hgfe, dcba); // ↑hd ↑gc ↑fb ↑ea m128i x = _mm_unpacklo_epi32(l, h); // ↑fb ↓fb ↑ea ↓ea m128i y = _mm_unpackhi_epi32(l, h); // ↑hd ↓hd ↑gc ↓gc l = _mm_min_epi32(x, y); // ↓(↑fb,↑hd) ↓hfdb ↓(↑ea,↑gc) ↓geca h = _mm_max_epi32(x, y); // ↑hfdb ↑(↓fb,↓hd) ↑geca ↑(↓ea,↓gc) x = _mm_min_epi32(l, Reg::permute(h)); // 2(hfdb) 1(hfdb) 2(geca) 1(geca) y = _mm_max_epi32(h, Reg::permute(l)); // 4(hfdb) 3(hfdb) 4(geca) 3(geca) m128i b = Reg::shuffle(y, x); // b3 <= b2 <= b1 <= b0 m128i a = _mm_unpackhi_epi64(x, y); // a3 >= a2 >= a1 >= a0 // _mm_extract_epi32 from clang < 3.4 returns an unsigned int - the static_cast is free for // conforming compilers, but fixes broken ones if (VC_IS_UNLIKELY(static_cast(_mm_extract_epi32(x, 2)) >= static_cast(_mm_extract_epi32(y, 1)))) { return concat(Reg::permute(b), a); } else if (VC_IS_UNLIKELY(static_cast(_mm_extract_epi32(x, 0)) >= static_cast(_mm_extract_epi32(y, 3)))) { return concat(a, Reg::permute(b)); } // merge l = _mm_min_epi32(a, b); // ↓a3b3 ↓a2b2 ↓a1b1 ↓a0b0 h = _mm_max_epi32(a, b); // ↑a3b3 ↑a2b2 ↑a1b1 ↑a0b0 a = _mm_unpacklo_epi32(l, h); // ↑a1b1 ↓a1b1 
↑a0b0 ↓a0b0 b = _mm_unpackhi_epi32(l, h); // ↑a3b3 ↓a3b3 ↑a2b2 ↓a2b2 l = _mm_min_epi32(a, b); // ↓(↑a1b1,↑a3b3) ↓a1b3 ↓(↑a0b0,↑a2b2) ↓a0b2 h = _mm_max_epi32(a, b); // ↑a3b1 ↑(↓a1b1,↓a3b3) ↑a2b0 ↑(↓a0b0,↓a2b2) a = _mm_unpacklo_epi32(l, h); // ↑a2b0 ↓(↑a0b0,↑a2b2) ↑(↓a0b0,↓a2b2) ↓a0b2 b = _mm_unpackhi_epi32(l, h); // ↑a3b1 ↓(↑a1b1,↑a3b3) ↑(↓a1b1,↓a3b3) ↓a1b3 l = _mm_min_epi32(a, b); // ↓(↑a2b0,↑a3b1) ↓(↑a0b0,↑a2b2,↑a1b1,↑a3b3) ↓(↑(↓a0b0,↓a2b2) ↑(↓a1b1,↓a3b3)) ↓a0b3 h = _mm_max_epi32(a, b); // ↑a3b0 ↑(↓(↑a0b0,↑a2b2) ↓(↑a1b1,↑a3b3)) ↑(↓a0b0,↓a2b2,↓a1b1,↓a3b3) ↑(↓a0b2,↓a1b3) return concat(_mm_unpacklo_epi32(l, h), _mm_unpackhi_epi32(l, h)); } template<> m256i SortHelper::sort(VTArg _hgfedcba) { VectorType hgfedcba = _hgfedcba; const m128i hgfe = hi128(hgfedcba); const m128i dcba = lo128(hgfedcba); m128i l = _mm_min_epu32(hgfe, dcba); // ↓hd ↓gc ↓fb ↓ea m128i h = _mm_max_epu32(hgfe, dcba); // ↑hd ↑gc ↑fb ↑ea m128i x = _mm_unpacklo_epi32(l, h); // ↑fb ↓fb ↑ea ↓ea m128i y = _mm_unpackhi_epi32(l, h); // ↑hd ↓hd ↑gc ↓gc l = _mm_min_epu32(x, y); // ↓(↑fb,↑hd) ↓hfdb ↓(↑ea,↑gc) ↓geca h = _mm_max_epu32(x, y); // ↑hfdb ↑(↓fb,↓hd) ↑geca ↑(↓ea,↓gc) x = _mm_min_epu32(l, Reg::permute(h)); // 2(hfdb) 1(hfdb) 2(geca) 1(geca) y = _mm_max_epu32(h, Reg::permute(l)); // 4(hfdb) 3(hfdb) 4(geca) 3(geca) m128i b = Reg::shuffle(y, x); // b3 <= b2 <= b1 <= b0 m128i a = _mm_unpackhi_epi64(x, y); // a3 >= a2 >= a1 >= a0 if (VC_IS_UNLIKELY(_mm_extract_epu32(x, 2) >= _mm_extract_epu32(y, 1))) { return concat(Reg::permute(b), a); } else if (VC_IS_UNLIKELY(_mm_extract_epu32(x, 0) >= _mm_extract_epu32(y, 3))) { return concat(a, Reg::permute(b)); } // merge l = _mm_min_epu32(a, b); // ↓a3b3 ↓a2b2 ↓a1b1 ↓a0b0 h = _mm_max_epu32(a, b); // ↑a3b3 ↑a2b2 ↑a1b1 ↑a0b0 a = _mm_unpacklo_epi32(l, h); // ↑a1b1 ↓a1b1 ↑a0b0 ↓a0b0 b = _mm_unpackhi_epi32(l, h); // ↑a3b3 ↓a3b3 ↑a2b2 ↓a2b2 l = _mm_min_epu32(a, b); // ↓(↑a1b1,↑a3b3) ↓a1b3 ↓(↑a0b0,↑a2b2) ↓a0b2 h = _mm_max_epu32(a, b); // ↑a3b1 ↑(↓a1b1,↓a3b3) ↑a2b0 ↑(↓a0b0,↓a2b2) a = _mm_unpacklo_epi32(l, h); // ↑a2b0 ↓(↑a0b0,↑a2b2) ↑(↓a0b0,↓a2b2) ↓a0b2 b = _mm_unpackhi_epi32(l, h); // ↑a3b1 ↓(↑a1b1,↑a3b3) ↑(↓a1b1,↓a3b3) ↓a1b3 l = _mm_min_epu32(a, b); // ↓(↑a2b0,↑a3b1) ↓(↑a0b0,↑a2b2,↑a1b1,↑a3b3) ↓(↑(↓a0b0,↓a2b2) ↑(↓a1b1,↓a3b3)) ↓a0b3 h = _mm_max_epu32(a, b); // ↑a3b0 ↑(↓(↑a0b0,↑a2b2) ↓(↑a1b1,↑a3b3)) ↑(↓a0b0,↓a2b2,↓a1b1,↓a3b3) ↑(↓a0b2,↓a1b3) return concat(_mm_unpacklo_epi32(l, h), _mm_unpackhi_epi32(l, h)); } template<> m256 SortHelper::sort(VTArg _hgfedcba) { VectorType hgfedcba = _hgfedcba; const m128 hgfe = hi128(hgfedcba); const m128 dcba = lo128(hgfedcba); m128 l = _mm_min_ps(hgfe, dcba); // ↓hd ↓gc ↓fb ↓ea m128 h = _mm_max_ps(hgfe, dcba); // ↑hd ↑gc ↑fb ↑ea m128 x = _mm_unpacklo_ps(l, h); // ↑fb ↓fb ↑ea ↓ea m128 y = _mm_unpackhi_ps(l, h); // ↑hd ↓hd ↑gc ↓gc l = _mm_min_ps(x, y); // ↓(↑fb,↑hd) ↓hfdb ↓(↑ea,↑gc) ↓geca h = _mm_max_ps(x, y); // ↑hfdb ↑(↓fb,↓hd) ↑geca ↑(↓ea,↓gc) x = _mm_min_ps(l, Reg::permute(h)); // 2(hfdb) 1(hfdb) 2(geca) 1(geca) y = _mm_max_ps(h, Reg::permute(l)); // 4(hfdb) 3(hfdb) 4(geca) 3(geca) m128 a = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(x), _mm_castps_pd(y))); // a3 >= a2 >= a1 >= a0 m128 b = Reg::shuffle(y, x); // b3 <= b2 <= b1 <= b0 // merge l = _mm_min_ps(a, b); // ↓a3b3 ↓a2b2 ↓a1b1 ↓a0b0 h = _mm_max_ps(a, b); // ↑a3b3 ↑a2b2 ↑a1b1 ↑a0b0 a = _mm_unpacklo_ps(l, h); // ↑a1b1 ↓a1b1 ↑a0b0 ↓a0b0 b = _mm_unpackhi_ps(l, h); // ↑a3b3 ↓a3b3 ↑a2b2 ↓a2b2 l = _mm_min_ps(a, b); // ↓(↑a1b1,↑a3b3) ↓a1b3 ↓(↑a0b0,↑a2b2) ↓a0b2 h = _mm_max_ps(a, b); // ↑a3b1 ↑(↓a1b1,↓a3b3) ↑a2b0 
↑(↓a0b0,↓a2b2) a = _mm_unpacklo_ps(l, h); // ↑a2b0 ↓(↑a0b0,↑a2b2) ↑(↓a0b0,↓a2b2) ↓a0b2 b = _mm_unpackhi_ps(l, h); // ↑a3b1 ↓(↑a1b1,↑a3b3) ↑(↓a1b1,↓a3b3) ↓a1b3 l = _mm_min_ps(a, b); // ↓(↑a2b0,↑a3b1) ↓(↑a0b0,↑a2b2,↑a1b1,↑a3b3) ↓(↑(↓a0b0,↓a2b2) ↑(↓a1b1,↓a3b3)) ↓a0b3 h = _mm_max_ps(a, b); // ↑a3b0 ↑(↓(↑a0b0,↑a2b2) ↓(↑a1b1,↑a3b3)) ↑(↓a0b0,↓a2b2,↓a1b1,↓a3b3) ↑(↓a0b2,↓a1b3) return concat(_mm_unpacklo_ps(l, h), _mm_unpackhi_ps(l, h)); } template<> m256 SortHelper::sort(VTArg hgfedcba) { return SortHelper::sort(hgfedcba); } template<> void SortHelper::sort(m256d &VC_RESTRICT x, m256d &VC_RESTRICT y) { m256d l = _mm256_min_pd(x, y); // ↓x3y3 ↓x2y2 ↓x1y1 ↓x0y0 m256d h = _mm256_max_pd(x, y); // ↑x3y3 ↑x2y2 ↑x1y1 ↑x0y0 x = _mm256_unpacklo_pd(l, h); // ↑x2y2 ↓x2y2 ↑x0y0 ↓x0y0 y = _mm256_unpackhi_pd(l, h); // ↑x3y3 ↓x3y3 ↑x1y1 ↓x1y1 l = _mm256_min_pd(x, y); // ↓(↑x2y2,↑x3y3) ↓x3x2y3y2 ↓(↑x0y0,↑x1y1) ↓x1x0y1y0 h = _mm256_max_pd(x, y); // ↑x3x2y3y2 ↑(↓x2y2,↓x3y3) ↑x1x0y1y0 ↑(↓x0y0,↓x1y1) x = _mm256_unpacklo_pd(l, h); // ↑(↓x2y2,↓x3y3) ↓x3x2y3y2 ↑(↓x0y0,↓x1y1) ↓x1x0y1y0 y = _mm256_unpackhi_pd(h, l); // ↓(↑x2y2,↑x3y3) ↑x3x2y3y2 ↓(↑x0y0,↑x1y1) ↑x1x0y1y0 l = _mm256_min_pd(x, y); // ↓(↑(↓x2y2,↓x3y3) ↓(↑x2y2,↑x3y3)) ↓x3x2y3y2 ↓(↑(↓x0y0,↓x1y1) ↓(↑x0y0,↑x1y1)) ↓x1x0y1y0 h = _mm256_max_pd(x, y); // ↑(↑(↓x2y2,↓x3y3) ↓(↑x2y2,↑x3y3)) ↑x3x2y3y2 ↑(↑(↓x0y0,↓x1y1) ↓(↑x0y0,↑x1y1)) ↑x1x0y1y0 m256d a = Reg::permute(Reg::permute128(h, h)); // h0 h1 h3 h2 m256d b = Reg::permute(l); // l2 l3 l1 l0 // a3 >= a2 >= b1 >= b0 // b3 <= b2 <= a1 <= a0 // merge l = _mm256_min_pd(a, b); // ↓a3b3 ↓a2b2 ↓a1b1 ↓a0b0 h = _mm256_min_pd(a, b); // ↑a3b3 ↑a2b2 ↑a1b1 ↑a0b0 x = _mm256_unpacklo_pd(l, h); // ↑a2b2 ↓a2b2 ↑a0b0 ↓a0b0 y = _mm256_unpackhi_pd(l, h); // ↑a3b3 ↓a3b3 ↑a1b1 ↓a1b1 l = _mm256_min_pd(x, y); // ↓(↑a2b2,↑a3b3) ↓a2b3 ↓(↑a0b0,↑a1b1) ↓a1b0 h = _mm256_min_pd(x, y); // ↑a3b2 ↑(↓a2b2,↓a3b3) ↑a0b1 ↑(↓a0b0,↓a1b1) x = Reg::permute128(l, h); // ↑a0b1 ↑(↓a0b0,↓a1b1) ↓(↑a0b0,↑a1b1) ↓a1b0 y = Reg::permute128(l, h); // ↑a3b2 ↑(↓a2b2,↓a3b3) ↓(↑a2b2,↑a3b3) ↓a2b3 l = _mm256_min_pd(x, y); // ↓(↑a0b1,↑a3b2) ↓(↑(↓a0b0,↓a1b1) ↑(↓a2b2,↓a3b3)) ↓(↑a0b0,↑a1b1,↑a2b2,↑a3b3) ↓b0b3 h = _mm256_min_pd(x, y); // ↑a0a3 ↑(↓a0b0,↓a1b1,↓a2b2,↓a3b3) ↑(↓(↑a0b0,↑a1b1) ↓(↑a2b2,↑a3b3)) ↑(↓a1b0,↓a2b3) x = _mm256_unpacklo_pd(l, h); // h2 l2 h0 l0 y = _mm256_unpackhi_pd(l, h); // h3 l3 h1 l1 } template<> m256d SortHelper::sort(VTArg _dcba) { VectorType dcba = _dcba; /* * to find the second largest number find * max(min(max(ab),max(cd)), min(max(ad),max(bc))) * or * max(max(min(ab),min(cd)), min(max(ab),max(cd))) * const m256d adcb = avx_cast(concat(_mm_alignr_epi8(avx_cast(dc), avx_cast(ba), 8), _mm_alignr_epi8(avx_cast(ba), avx_cast(dc), 8))); const m256d l = _mm256_min_pd(dcba, adcb); // min(ad cd bc ab) const m256d h = _mm256_max_pd(dcba, adcb); // max(ad cd bc ab) // max(h3, h1) // max(min(h0,h2), min(h3,h1)) // min(max(l0,l2), max(l3,l1)) // min(l3, l1) const m256d ll = _mm256_min_pd(h, Reg::permute128(h, h)); // min(h3h1 h2h0 h1h3 h0h2) //const m256d hh = _mm256_max_pd(h3 ll1_3 l1 l0, h1 ll0_2 l3 l2); const m256d hh = _mm256_max_pd( Reg::permute128(_mm256_unpackhi_pd(ll, h), l), Reg::permute128(_mm256_blend_pd(h ll, 0x1), l)); _mm256_min_pd(hh0, hh1 */ ////////////////////////////////////////////////////////////////////////////////// // max(max(ac), max(bd)) // max(max(min(ac),min(bd)), min(max(ac),max(bd))) // min(max(min(ac),min(bd)), min(max(ac),max(bd))) // min(min(ac), min(bd)) m128d l = _mm_min_pd(lo128(dcba), hi128(dcba)); // min(bd) min(ac) m128d h = 
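// Illustrative note, not part of the original file: these sort() specializations are
// branch-free sorting networks. Every _mm_min_* / _mm_max_* pair is one layer of
// compare-exchange operations, and the unpack/shuffle/permute calls route lanes into the
// next layer; the ↓/↑ arrows in the comments denote the minimum respectively maximum of the
// named lanes. The 8-wide versions follow the usual "sort both halves, then merge" scheme.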
_mm_max_pd(lo128(dcba), hi128(dcba)); // max(bd) max(ac) m128d h0_l0 = _mm_unpacklo_pd(l, h); m128d h1_l1 = _mm_unpackhi_pd(l, h); l = _mm_min_pd(h0_l0, h1_l1); h = _mm_max_pd(h0_l0, h1_l1); return concat( _mm_min_pd(l, Reg::permute(h)), _mm_max_pd(h, Reg::permute(l)) ); // extract: 1 cycle // min/max: 4 cycles // unpacklo/hi: 2 cycles // min/max: 4 cycles // permute: 1 cycle // min/max: 4 cycles // insert: 1 cycle // ---------------------- // total: 17 cycles /* m256d cdab = Reg::permute(dcba); m256d l = _mm256_min_pd(dcba, cdab); m256d h = _mm256_max_pd(dcba, cdab); m256d maxmin_ba = Reg::permute128(l, h); m256d maxmin_dc = Reg::permute128(l, h); l = _mm256_min_pd(maxmin_ba, maxmin_dc); h = _mm256_max_pd(maxmin_ba, maxmin_dc); return _mm256_blend_pd(h, l, 0x55); */ /* // a b c d // b a d c // sort pairs m256d y, l, h; m128d l2, h2; y = shuffle(x, x); l = _mm256_min_pd(x, y); // min[ab ab cd cd] h = _mm256_max_pd(x, y); // max[ab ab cd cd] // 1 of 2 is at [0] // 1 of 4 is at [1] // 1 of 4 is at [2] // 1 of 2 is at [3] // don't be fooled by unpack here. It works differently for AVX pd than for SSE ps x = _mm256_unpacklo_pd(l, h); // l_ab h_ab l_cd h_cd l2 = _mm_min_pd(lo128(x), hi128(x)); // l_abcd l(h_ab hcd) h2 = _mm_max_pd(lo128(x), hi128(x)); // h(l_ab l_cd) h_abcd // either it is: return concat(l2, h2); // or: // concat(_mm_unpacklo_pd(l2, h2), _mm_unpackhi_pd(l2, h2)); // I'd like to have four useful compares const m128d dc = hi128(dcba); const m128d ba = lo128(dcba); const m256d adcb = avx_cast(concat(_mm_alignr_epi8(avx_cast(dc), avx_cast(ba), 8), _mm_alignr_epi8(avx_cast(ba), avx_cast(dc), 8))); const int extraCmp = _mm_movemask_pd(_mm_cmpgt_pd(dc, ba)); // 0x0: d <= b && c <= a // 0x1: d <= b && c > a // 0x2: d > b && c <= a // 0x3: d > b && c > a switch (_mm256_movemask_pd(_mm256_cmpgt_pd(dcba, adcb))) { // impossible: 0x0, 0xf case 0x1: // a <= b && b <= c && c <= d && d > a // abcd return Reg::permute(Reg::permute(dcba, dcba)); case 0x2: // a <= b && b <= c && c > d && d <= a // dabc return Reg::permute(adcb); case 0x3: // a <= b && b <= c && c > d && d > a // a[bd]c if (extraCmp & 2) { // abdc return Reg::permute(Reg::permute(dcba, dcba)); } else { // adbc return Reg::permute(adcb); } case 0x4: // a <= b && b > c && c <= d && d <= a // cdab; return Reg::permute(dcba); case 0x5: // a <= b && b > c && c <= d && d > a // [ac] < [bd] switch (extraCmp) { case 0x0: // d <= b && c <= a // cadb return shuffle<>(dcba, bcda); case 0x1: // d <= b && c > a case 0x2: // d > b && c <= a case 0x3: // d > b && c > a } case 0x6: // a <= b && b > c && c > d && d <= a // d[ac]b case 0x7: // a <= b && b > c && c > d && d > a // adcb; return permute(permute128(bcda, bcda)); case 0x8: // a > b && b <= c && c <= d && d <= a return bcda; case 0x9: // a > b && b <= c && c <= d && d > a // b[ac]d; case 0xa: // a > b && b <= c && c > d && d <= a // [ac] > [bd] case 0xb: // a > b && b <= c && c > d && d > a // badc; return permute128(dcba); case 0xc: // a > b && b > c && c <= d && d <= a // c[bd]a; case 0xd: // a > b && b > c && c <= d && d > a // cbad; return permute(bcda); case 0xe: // a > b && b > c && c > d && d <= a return dcba; } */ } } // namespace AVX } // namespace Vc /*OUTER_NAMESPACE_END*/ Vc-0.7.4/src/const.cpp000066400000000000000000000632111233512346000145150ustar00rootroot00000000000000/* This file is part of the Vc library. 
Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef V_ALIGN # ifdef __GNUC__ # define V_ALIGN(n) __attribute__((aligned(n))) # else # define V_ALIGN(n) __declspec(align(n)) # endif #endif #include "avx/const_data.h" #include "sse/const_data.h" #include #include #include #include #include "common/macros.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace AVX { // cacheline 1 V_ALIGN(64) extern const unsigned int _IndexesFromZero32[8] = { 0, 1, 2, 3, 4, 5, 6, 7 }; V_ALIGN(16) extern const unsigned short _IndexesFromZero16[8] = { 0, 1, 2, 3, 4, 5, 6, 7 }; V_ALIGN(16) extern const unsigned char _IndexesFromZero8 [16]= { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; template<> const double c_trig::data[] = { // cacheline 4 Vc_buildDouble(1, 0x921fb54442d18ull, -1), // π/4 Vc_buildDouble(1, 0x921fb40000000ull, -1), // π/4 - 30bits precision Vc_buildDouble(1, 0x4442d00000000ull, -25), // π/4 remainder1 - 32bits precision Vc_buildDouble(1, 0x8469898cc5170ull, -49), // π/4 remainder2 0.0625, 16., 0., // padding 0., // padding // cacheline 5 Vc_buildDouble( 1, 0x555555555554bull, -5), // ~ 1/4! Vc_buildDouble(-1, 0x6c16c16c14f91ull, -10), // ~-1/6! Vc_buildDouble( 1, 0xa01a019c844f5ull, -16), // ~ 1/8! Vc_buildDouble(-1, 0x27e4f7eac4bc6ull, -22), // ~-1/10! Vc_buildDouble( 1, 0x1ee9d7b4e3f05ull, -29), // ~ 1/12! Vc_buildDouble(-1, 0x8fa49a0861a9bull, -37), // ~-1/14! Vc_buildDouble(-1, 0x5555555555548ull, -3), // ~-1/3! Vc_buildDouble( 1, 0x111111110f7d0ull, -7), // ~ 1/5! // cacheline 8 Vc_buildDouble(-1, 0xa01a019bfdf03ull, -13), // ~-1/7! Vc_buildDouble( 1, 0x71de3567d48a1ull, -19), // ~ 1/9! Vc_buildDouble(-1, 0xae5e5a9291f5dull, -26), // ~-1/11! Vc_buildDouble( 1, 0x5d8fd1fd19ccdull, -33), // ~ 1/13! 
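    // How to read the Vc_buildDouble(sign, mantissa, exponent) entries above (a sketch of the
    // macro's intent, assuming the usual definition in common/macros.h): the value is
    //     sign * (1 + mantissa / 2^52) * 2^exponent,
    // i.e. an IEEE-754 double assembled from an explicit 52-bit mantissa and an unbiased exponent.
    // Worked example for the first entry:
    //     Vc_buildDouble(1, 0x921fb54442d18ull, -1)
    //       = (1 + 0x921fb54442d18 / 2^52) * 2^-1 ≈ 1.5707963267948966 * 0.5 ≈ 0.7853981633... = π/4
    // Spelling the constants this way pins down every mantissa bit, which matters for the
    // Cody-Waite style split π/4 = hi + rem1 + rem2 consumed by the argument reduction in
    // src/trigonometric.cpp.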
0., // padding (for alignment with float) Vc_buildDouble(1, 0x8BE60DB939105ull, 0), // 4/π Vc_buildDouble(1, 0x921fb54442d18ull, 0), // π/2 Vc_buildDouble(1, 0x921fb54442d18ull, 1), // π // cacheline 10 Vc_buildDouble(-1, 0xc007fa1f72594ull, -1), // atan P coefficients Vc_buildDouble(-1, 0x028545b6b807aull, 4), // atan P coefficients Vc_buildDouble(-1, 0x2c08c36880273ull, 6), // atan P coefficients Vc_buildDouble(-1, 0xeb8bf2d05ba25ull, 6), // atan P coefficients Vc_buildDouble(-1, 0x03669fd28ec8eull, 6), // atan P coefficients Vc_buildDouble( 1, 0x8dbc45b14603cull, 4), // atan Q coefficients Vc_buildDouble( 1, 0x4a0dd43b8fa25ull, 7), // atan Q coefficients Vc_buildDouble( 1, 0xb0e18d2e2be3bull, 8), // atan Q coefficients // cacheline 12 Vc_buildDouble( 1, 0xe563f13b049eaull, 8), // atan Q coefficients Vc_buildDouble( 1, 0x8519efbbd62ecull, 7), // atan Q coefficients Vc_buildDouble( 1, 0x3504f333f9de6ull, 1), // tan( 3/8 π ) 0.66, // lower threshold for special casing in atan Vc_buildDouble(1, 0x1A62633145C07ull, -54), // remainder of pi/2 1.e-8, // small asin input threshold 0.625, // large asin input threshold 0., // padding // cacheline 14 Vc_buildDouble( 1, 0x84fc3988e9f08ull, -9), // asinCoeff0 Vc_buildDouble(-1, 0x2079259f9290full, -1), // asinCoeff0 Vc_buildDouble( 1, 0xbdff5baf33e6aull, 2), // asinCoeff0 Vc_buildDouble(-1, 0x991aaac01ab68ull, 4), // asinCoeff0 Vc_buildDouble( 1, 0xc896240f3081dull, 4), // asinCoeff0 Vc_buildDouble(-1, 0x5f2a2b6bf5d8cull, 4), // asinCoeff1 Vc_buildDouble( 1, 0x26219af6a7f42ull, 7), // asinCoeff1 Vc_buildDouble(-1, 0x7fe08959063eeull, 8), // asinCoeff1 // cacheline 16 Vc_buildDouble( 1, 0x56709b0b644beull, 8), // asinCoeff1 Vc_buildDouble( 1, 0x16b9b0bd48ad3ull, -8), // asinCoeff2 Vc_buildDouble(-1, 0x34341333e5c16ull, -1), // asinCoeff2 Vc_buildDouble( 1, 0x5c74b178a2dd9ull, 2), // asinCoeff2 Vc_buildDouble(-1, 0x04331de27907bull, 4), // asinCoeff2 Vc_buildDouble( 1, 0x39007da779259ull, 4), // asinCoeff2 Vc_buildDouble(-1, 0x0656c06ceafd5ull, 3), // asinCoeff2 Vc_buildDouble(-1, 0xd7b590b5e0eabull, 3), // asinCoeff3 // cacheline 18 Vc_buildDouble( 1, 0x19fc025fe9054ull, 6), // asinCoeff3 Vc_buildDouble(-1, 0x265bb6d3576d7ull, 7), // asinCoeff3 Vc_buildDouble( 1, 0x1705684ffbf9dull, 7), // asinCoeff3 Vc_buildDouble(-1, 0x898220a3607acull, 5), // asinCoeff3 }; #define _4(x) x template<> const float c_trig::data[] = { // cacheline _4(Vc_buildFloat( 1, 0x490FDB, -1)), // π/4 _4(Vc_buildFloat( 1, 0x491000, -1)), // π/4 - 12 bits precision _4(Vc_buildFloat(-1, 0x157000, -19)), // π/4 remainder1 - 12 bits precision _4(Vc_buildFloat(-1, 0x6F4B9F, -32)), // π/4 remainder2 _4(0.0625f), _4(16.f), _4(0.f), // padding _4(0.f), // padding _4(4.166664568298827e-2f), // ~ 1/4! _4(-1.388731625493765e-3f), // ~-1/6! _4(2.443315711809948e-5f), // ~ 1/8! _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) _4(-1.6666654611e-1f), // ~-1/3! _4(8.3321608736e-3f), // ~ 1/5! // cacheline _4(-1.9515295891e-4f), // ~-1/7! 
_4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) _4(8192.f), // loss threshold _4(Vc_buildFloat(1, 0x22F983, 0)), // 1.27323949337005615234375 = 4/π _4(Vc_buildFloat(1, 0x490FDB, 0)), // π/2 _4(Vc_buildFloat(1, 0x490FDB, 1)), // π _4(8.05374449538e-2f), // atan P coefficients _4(1.38776856032e-1f), // atan P coefficients _4(1.99777106478e-1f), // atan P coefficients _4(3.33329491539e-1f), // atan P coefficients _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) // cacheline _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) _4(2.414213562373095f), // tan( 3/8 π ) _4(0.414213562373095f), // tan( 1/8 π ) lower threshold for special casing in atan _4(Vc_buildFloat(-1, 0x3BBD2E, -25)), // remainder of pi/2 _4(1.e-4f), // small asin input threshold _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) _4(4.2163199048e-2f), // asinCoeff0 _4(2.4181311049e-2f), // asinCoeff0 _4(4.5470025998e-2f), // asinCoeff0 _4(7.4953002686e-2f), // asinCoeff0 _4(1.6666752422e-1f), // asinCoeff0 _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) // cacheline _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) }; #undef _4 const unsigned int c_general::absMaskFloat[2] = { 0xffffffffu, 0x7fffffffu }; const unsigned int c_general::signMaskFloat[2] = { 0x0u, 0x80000000u }; const unsigned int c_general::highMaskFloat = 0xfffff000u; const float c_general::oneFloat = 1.f; const unsigned short c_general::minShort[2] = { 0x8000u, 0x8000u }; const unsigned short c_general::one16[2] = { 1, 1 }; const float c_general::_2power31 = 1u << 31; // cacheline 4 const unsigned long long c_general::highMaskDouble = 0xfffffffff8000000ull; const double c_general::oneDouble = 1.; const unsigned long long c_general::frexpMask = 0xbfefffffffffffffull; const unsigned long long c_log::data[21] = { 0x000003ff000003ffull // bias TODO: remove , 0x7ff0000000000000ull // exponentMask (+inf) , 0x3f1ab4c293c31bb0ull // P[0] , 0x3fdfd6f53f5652f2ull // P[1] , 0x4012d2baed926911ull // P[2] , 0x402cff72c63eeb2eull // P[3] , 0x4031efd6924bc84dull // P[4] , 0x401ed5637d7edcf8ull // P[5] , 0x40269320ae97ef8eull // Q[0] , 0x40469d2c4e19c033ull // Q[1] , 0x4054bf33a326bdbdull // Q[2] , 0x4051c9e2eb5eae21ull // Q[3] , 0x4037200a9e1f25b2ull // Q[4] , 0xfff0000000000000ull // -inf , 0x0010000000000000ull // min() , 0x3fe6a09e667f3bcdull // 1/sqrt(2) , 0x3fe6300000000000ull // round(ln(2) * 512) / 512 , 0xbf2bd0105c610ca8ull // ln(2) - round(ln(2) * 512) / 512 , 0x3fe0000000000000ull // 0.5 , 0x3fdbcb7b1526e50eull // log10(e) , 0x3ff71547652b82feull // log2(e) }; template<> const unsigned int c_log::data[21] = { 0x0000007fu // bias TODO: 
remove , 0x7f800000u // exponentMask (+inf) , 0x3d9021bbu // 7.0376836292e-2f // P[0] , 0xbdebd1b8u // -1.1514610310e-1f // P[1] , 0x3def251au // 1.1676998740e-1f // P[2] , 0xbdfe5d4fu // -1.2420140846e-1f // P[3] , 0x3e11e9bfu // 1.4249322787e-1f // P[4] , 0xbe2aae50u // -1.6668057665e-1f // P[5] , 0x3e4cceacu // 2.0000714765e-1f // P[6] , 0xbe7ffffcu // -2.4999993993e-1f // P[7] , 0x3eaaaaaau // 3.3333331174e-1f // P[8] , 0 // padding because of c_log , 0 // padding because of c_log , 0xff800000u // -inf , 0x00800000u // min() , 0x3f3504f3u // 1/sqrt(2) , 0x3f318000u // round(ln(2) * 512) / 512 , 0xb95e8083u // ln(2) - round(ln(2) * 512) / 512 , 0x3f000000u // 0.5 , 0x3ede5bd9u // log10(e) , 0x3fb8aa3bu // log2(e) }; } // namespace AVX namespace SSE { // cacheline 1 V_ALIGN(64) const int c_general::absMaskFloat[4] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; V_ALIGN(16) const unsigned int c_general::signMaskFloat[4] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; V_ALIGN(16) const unsigned int c_general::highMaskFloat[4] = { 0xfffff000u, 0xfffff000u, 0xfffff000u, 0xfffff000u }; V_ALIGN(16) const short c_general::minShort[8] = { -0x8000, -0x8000, -0x8000, -0x8000, -0x8000, -0x8000, -0x8000, -0x8000 }; V_ALIGN(16) extern const unsigned short _IndexesFromZero8[8] = { 0, 1, 2, 3, 4, 5, 6, 7 }; // cacheline 2 V_ALIGN(16) extern const unsigned int _IndexesFromZero4[4] = { 0, 1, 2, 3 }; V_ALIGN(16) const unsigned short c_general::one16[8] = { 1, 1, 1, 1, 1, 1, 1, 1 }; V_ALIGN(16) const unsigned int c_general::one32[4] = { 1, 1, 1, 1 }; V_ALIGN(16) const float c_general::oneFloat[4] = { 1.f, 1.f, 1.f, 1.f }; // cacheline 3 V_ALIGN(16) const unsigned long long c_general::highMaskDouble[2] = { 0xfffffffff8000000ull, 0xfffffffff8000000ull }; V_ALIGN(16) const double c_general::oneDouble[2] = { 1., 1. }; V_ALIGN(16) const long long c_general::absMaskDouble[2] = { 0x7fffffffffffffffll, 0x7fffffffffffffffll }; V_ALIGN(16) const unsigned long long c_general::signMaskDouble[2] = { 0x8000000000000000ull, 0x8000000000000000ull }; V_ALIGN(16) const unsigned long long c_general::frexpMask[2] = { 0xbfefffffffffffffull, 0xbfefffffffffffffull }; #define _2(x) x, x template<> const double c_trig::data[] = { // cacheline 4 _2(Vc_buildDouble(1, 0x921fb54442d18ull, -1)), // π/4 _2(Vc_buildDouble(1, 0x921fb40000000ull, -1)), // π/4 - 30bits precision _2(Vc_buildDouble(1, 0x4442d00000000ull, -25)), // π/4 remainder1 - 32bits precision _2(Vc_buildDouble(1, 0x8469898cc5170ull, -49)), // π/4 remainder2 // cacheline 5 _2(0.0625), _2(16.), _2(0.), // padding _2(0.), // padding // cacheline 6 _2(Vc_buildDouble( 1, 0x555555555554bull, -5)), // ~ 1/4! _2(Vc_buildDouble(-1, 0x6c16c16c14f91ull, -10)), // ~-1/6! _2(Vc_buildDouble( 1, 0xa01a019c844f5ull, -16)), // ~ 1/8! _2(Vc_buildDouble(-1, 0x27e4f7eac4bc6ull, -22)), // ~-1/10! // cacheline 7 _2(Vc_buildDouble( 1, 0x1ee9d7b4e3f05ull, -29)), // ~ 1/12! _2(Vc_buildDouble(-1, 0x8fa49a0861a9bull, -37)), // ~-1/14! _2(Vc_buildDouble(-1, 0x5555555555548ull, -3)), // ~-1/3! _2(Vc_buildDouble( 1, 0x111111110f7d0ull, -7)), // ~ 1/5! // cacheline 8 _2(Vc_buildDouble(-1, 0xa01a019bfdf03ull, -13)), // ~-1/7! _2(Vc_buildDouble( 1, 0x71de3567d48a1ull, -19)), // ~ 1/9! _2(Vc_buildDouble(-1, 0xae5e5a9291f5dull, -26)), // ~-1/11! _2(Vc_buildDouble( 1, 0x5d8fd1fd19ccdull, -33)), // ~ 1/13! 
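    // The _2(x) wrapper expands to "x, x": each constant is stored twice in a row so that one
    // aligned 16-byte load fills a complete double_v (two doubles) without a broadcast.
    // A sketch of the resulting access pattern, assuming the Const<double> helpers in sse/const.h
    // with Stride = 16 / sizeof(double) = 2:
    //     Const<double>::_pi_4()  ~  load from &c_trig<double>::data[ 0 * 2]
    //     Const<double>::_4_pi()  ~  load from &c_trig<double>::data[21 * 2]
    // The "cacheline N" markers group constants that are used together onto the same 64-byte line.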
// cacheline 9 _2(0.), // padding (for alignment with float) _2(Vc_buildDouble(1, 0x8BE60DB939105ull, 0)), // 4/π _2(Vc_buildDouble(1, 0x921fb54442d18ull, 0)), // π/2 _2(Vc_buildDouble(1, 0x921fb54442d18ull, 1)), // π // cacheline 10 _2(Vc_buildDouble(-1, 0xc007fa1f72594ull, -1)), // atan P coefficients _2(Vc_buildDouble(-1, 0x028545b6b807aull, 4)), // atan P coefficients _2(Vc_buildDouble(-1, 0x2c08c36880273ull, 6)), // atan P coefficients _2(Vc_buildDouble(-1, 0xeb8bf2d05ba25ull, 6)), // atan P coefficients // cacheline 11 _2(Vc_buildDouble(-1, 0x03669fd28ec8eull, 6)), // atan P coefficients _2(Vc_buildDouble( 1, 0x8dbc45b14603cull, 4)), // atan Q coefficients _2(Vc_buildDouble( 1, 0x4a0dd43b8fa25ull, 7)), // atan Q coefficients _2(Vc_buildDouble( 1, 0xb0e18d2e2be3bull, 8)), // atan Q coefficients // cacheline 12 _2(Vc_buildDouble( 1, 0xe563f13b049eaull, 8)), // atan Q coefficients _2(Vc_buildDouble( 1, 0x8519efbbd62ecull, 7)), // atan Q coefficients _2(Vc_buildDouble( 1, 0x3504f333f9de6ull, 1)), // tan( 3/8 π ) _2(0.66), // lower threshold for special casing in atan // cacheline 13 _2(Vc_buildDouble(1, 0x1A62633145C07ull, -54)), // remainder of pi/2 _2(1.e-8), // small asin input threshold _2(0.625), // large asin input threshold _2(0.), // padding // cacheline 14 _2(Vc_buildDouble( 1, 0x84fc3988e9f08ull, -9)), // asinCoeff0 _2(Vc_buildDouble(-1, 0x2079259f9290full, -1)), // asinCoeff0 _2(Vc_buildDouble( 1, 0xbdff5baf33e6aull, 2)), // asinCoeff0 _2(Vc_buildDouble(-1, 0x991aaac01ab68ull, 4)), // asinCoeff0 // cacheline 15 _2(Vc_buildDouble( 1, 0xc896240f3081dull, 4)), // asinCoeff0 _2(Vc_buildDouble(-1, 0x5f2a2b6bf5d8cull, 4)), // asinCoeff1 _2(Vc_buildDouble( 1, 0x26219af6a7f42ull, 7)), // asinCoeff1 _2(Vc_buildDouble(-1, 0x7fe08959063eeull, 8)), // asinCoeff1 // cacheline 16 _2(Vc_buildDouble( 1, 0x56709b0b644beull, 8)), // asinCoeff1 _2(Vc_buildDouble( 1, 0x16b9b0bd48ad3ull, -8)), // asinCoeff2 _2(Vc_buildDouble(-1, 0x34341333e5c16ull, -1)), // asinCoeff2 _2(Vc_buildDouble( 1, 0x5c74b178a2dd9ull, 2)), // asinCoeff2 // cacheline 17 _2(Vc_buildDouble(-1, 0x04331de27907bull, 4)), // asinCoeff2 _2(Vc_buildDouble( 1, 0x39007da779259ull, 4)), // asinCoeff2 _2(Vc_buildDouble(-1, 0x0656c06ceafd5ull, 3)), // asinCoeff2 _2(Vc_buildDouble(-1, 0xd7b590b5e0eabull, 3)), // asinCoeff3 // cacheline 18 _2(Vc_buildDouble( 1, 0x19fc025fe9054ull, 6)), // asinCoeff3 _2(Vc_buildDouble(-1, 0x265bb6d3576d7ull, 7)), // asinCoeff3 _2(Vc_buildDouble( 1, 0x1705684ffbf9dull, 7)), // asinCoeff3 _2(Vc_buildDouble(-1, 0x898220a3607acull, 5)), // asinCoeff3 }; #undef _2 #define _4(x) x, x, x, x template<> const float c_trig::data[] = { // cacheline _4(Vc_buildFloat( 1, 0x490FDB, -1)), // π/4 _4(Vc_buildFloat( 1, 0x491000, -1)), // π/4 - 12 bits precision _4(Vc_buildFloat(-1, 0x157000, -19)), // π/4 remainder1 - 12 bits precision _4(Vc_buildFloat(-1, 0x6F4B9F, -32)), // π/4 remainder2 // cacheline _4(0.0625f), _4(16.f), _4(0.f), // padding _4(0.f), // padding // cacheline _4(4.166664568298827e-2f), // ~ 1/4! _4(-1.388731625493765e-3f), // ~-1/6! _4(2.443315711809948e-5f), // ~ 1/8! _4(0.f), // padding (for alignment with double) // cacheline _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) _4(-1.6666654611e-1f), // ~-1/3! _4(8.3321608736e-3f), // ~ 1/5! // cacheline _4(-1.9515295891e-4f), // ~-1/7! 
_4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) // cacheline _4(8192.f), // loss threshold _4(Vc_buildFloat(1, 0x22F983, 0)), // 1.27323949337005615234375 = 4/π _4(Vc_buildFloat(1, 0x490FDB, 0)), // π/2 _4(Vc_buildFloat(1, 0x490FDB, 1)), // π // cacheline _4(8.05374449538e-2f), // atan P coefficients _4(1.38776856032e-1f), // atan P coefficients _4(1.99777106478e-1f), // atan P coefficients _4(3.33329491539e-1f), // atan P coefficients // cacheline _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) // cacheline _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) _4(2.414213562373095f), // tan( 3/8 π ) _4(0.414213562373095f), // tan( 1/8 π ) lower threshold for special casing in atan // cacheline _4(Vc_buildFloat(-1, 0x3BBD2E, -25)), // remainder of pi/2 _4(1.e-4f), // small asin input threshold _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) // cacheline _4(4.2163199048e-2f), // asinCoeff0 _4(2.4181311049e-2f), // asinCoeff0 _4(4.5470025998e-2f), // asinCoeff0 _4(7.4953002686e-2f), // asinCoeff0 // cacheline _4(1.6666752422e-1f), // asinCoeff0 _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) // cacheline _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) // cacheline _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) // cacheline _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) _4(0.f), // padding (for alignment with double) }; #undef _4 // cacheline 8 V_ALIGN(16) extern const unsigned char _IndexesFromZero16[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; V_ALIGN(64) const unsigned long long c_log::data[21 * 2] = { /* 0*/ 0x000003ff000003ffull, 0x000003ff000003ffull // bias TODO: remove /* 1*/ , 0x7ff0000000000000ull, 0x7ff0000000000000ull // exponentMask (+inf) /* 2*/ , 0x3f1ab4c293c31bb0ull, 0x3f1ab4c293c31bb0ull // P[0] /* 3*/ , 0x3fdfd6f53f5652f2ull, 0x3fdfd6f53f5652f2ull // P[1] /* 4*/ , 0x4012d2baed926911ull, 0x4012d2baed926911ull // P[2] /* 5*/ , 0x402cff72c63eeb2eull, 0x402cff72c63eeb2eull // P[3] /* 6*/ , 0x4031efd6924bc84dull, 0x4031efd6924bc84dull // P[4] /* 7*/ , 0x401ed5637d7edcf8ull, 0x401ed5637d7edcf8ull // P[5] /* 8*/ , 0x40269320ae97ef8eull, 0x40269320ae97ef8eull // Q[0] /* 9*/ , 0x40469d2c4e19c033ull, 0x40469d2c4e19c033ull // Q[1] /*10*/ , 0x4054bf33a326bdbdull, 0x4054bf33a326bdbdull // Q[2] /*11*/ , 0x4051c9e2eb5eae21ull, 0x4051c9e2eb5eae21ull // Q[3] /*12*/ , 0x4037200a9e1f25b2ull, 0x4037200a9e1f25b2ull // Q[4] /*13*/ , 0xfff0000000000000ull, 0xfff0000000000000ull // -inf /*14*/ , 0x0010000000000000ull, 0x0010000000000000ull // min() /*15*/ , 0x3fe6a09e667f3bcdull, 0x3fe6a09e667f3bcdull // 1/sqrt(2) /*16*/ , 0x3fe6300000000000ull, 0x3fe6300000000000ull // round(ln(2) * 512) / 512 /*17*/ , 0xbf2bd0105c610ca8ull, 0xbf2bd0105c610ca8ull // ln(2) - round(ln(2) * 512) / 512 
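    // Entries 16 and 17 split ln(2) into a coarse part (round(ln(2) * 512) / 512, exact in a short
    // mantissa) and a tiny remainder. A log() built on this table can reconstruct its result as,
    // roughly,
    //     log(x) = log(m) + e * ln2_small + e * ln2_large        with x = m * 2^e,
    // adding the small product first so that the rounding error of e * ln(2) stays well below one
    // ulp (the usual Cody-Waite trick). This is a sketch of the intent only; the table is consumed
    // by the log implementation headers, not by this file.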
/*18*/ , 0x3fe0000000000000ull, 0x3fe0000000000000ull // 0.5 /*19*/ , 0x3fdbcb7b1526e50eull, 0x3fdbcb7b1526e50eull // log10(e) /*20*/ , 0x3ff71547652b82feull, 0x3ff71547652b82feull // log2(e) }; template<> V_ALIGN(64) const unsigned int c_log::data[21 * 4] = { 0x0000007fu, 0x0000007fu, 0x0000007fu, 0x0000007fu, // bias TODO: remove 0x7f800000u, 0x7f800000u, 0x7f800000u, 0x7f800000u, // exponentMask (+inf) 0x3d9021bbu, 0x3d9021bbu, 0x3d9021bbu, 0x3d9021bbu, // 7.0376836292e-2f // P[0] 0xbdebd1b8u, 0xbdebd1b8u, 0xbdebd1b8u, 0xbdebd1b8u, // -1.1514610310e-1f // P[1] 0x3def251au, 0x3def251au, 0x3def251au, 0x3def251au, // 1.1676998740e-1f // P[2] 0xbdfe5d4fu, 0xbdfe5d4fu, 0xbdfe5d4fu, 0xbdfe5d4fu, // -1.2420140846e-1f // P[3] 0x3e11e9bfu, 0x3e11e9bfu, 0x3e11e9bfu, 0x3e11e9bfu, // 1.4249322787e-1f // P[4] 0xbe2aae50u, 0xbe2aae50u, 0xbe2aae50u, 0xbe2aae50u, // -1.6668057665e-1f // P[5] 0x3e4cceacu, 0x3e4cceacu, 0x3e4cceacu, 0x3e4cceacu, // 2.0000714765e-1f // P[6] 0xbe7ffffcu, 0xbe7ffffcu, 0xbe7ffffcu, 0xbe7ffffcu, // -2.4999993993e-1f // P[7] 0x3eaaaaaau, 0x3eaaaaaau, 0x3eaaaaaau, 0x3eaaaaaau, // 3.3333331174e-1f // P[8] 0, 0, 0, 0, // padding because of c_log 0, 0, 0, 0, // padding because of c_log 0xff800000u, 0xff800000u, 0xff800000u, 0xff800000u, // -inf 0x00800000u, 0x00800000u, 0x00800000u, 0x00800000u, // min() 0x3f3504f3u, 0x3f3504f3u, 0x3f3504f3u, 0x3f3504f3u, // 1/sqrt(2) // ln(2) = 0x3fe62e42fefa39ef // ln(2) = Vc_buildDouble( 1, 0x00062e42fefa39ef, -1) // = Vc_buildFloat( 1, 0x00317217(f7d), -1) + Vc_buildFloat( 1, 0x0077d1cd, -25) // = Vc_buildFloat( 1, 0x00318000(000), -1) + Vc_buildFloat(-1, 0x005e8083, -13) 0x3f318000u, 0x3f318000u, 0x3f318000u, 0x3f318000u, // round(ln(2) * 512) / 512 0xb95e8083u, 0xb95e8083u, 0xb95e8083u, 0xb95e8083u, // ln(2) - round(ln(2) * 512) / 512 0x3f000000u, 0x3f000000u, 0x3f000000u, 0x3f000000u, // 0.5 0x3ede5bd9u, 0x3ede5bd9u, 0x3ede5bd9u, 0x3ede5bd9u, // log10(e) 0x3fb8aa3bu, 0x3fb8aa3bu, 0x3fb8aa3bu, 0x3fb8aa3bu, // log2(e) // log10(2) = 0x3fd34413509f79ff // = Vc_buildDouble( 1, 0x00034413509f79ff, -2) // = Vc_buildFloat( 1, 0x001a209a(84fbcff8), -2) + Vc_buildFloat( 1, 0x0004fbcff(8), -26) //Vc_buildFloat( 1, 0x001a209a, -2), // log10(2) //Vc_buildFloat( 1, 0x001a209a, -2), // log10(2) //Vc_buildFloat( 1, 0x001a209a, -2), // log10(2) //Vc_buildFloat( 1, 0x001a209a, -2), // log10(2) }; } // namespace SSE V_ALIGN(64) unsigned int RandomState[16] = { 0x5a383a4fu, 0xc68bd45eu, 0x691d6d86u, 0xb367e14fu, 0xd689dbaau, 0xfde442aau, 0x3d265423u, 0x1a77885cu, 0x36ed2684u, 0xfb1f049du, 0x19e52f31u, 0x821e4dd7u, 0x23996d25u, 0x5962725au, 0x6aced4ceu, 0xd4c610f3u }; // dummy symbol to emit warnings with GCC 4.3 namespace Warnings { void _operator_bracket_warning() {} } // namespace Warnings const char LIBRARY_VERSION[] = VC_VERSION_STRING; const unsigned int LIBRARY_VERSION_NUMBER = VC_VERSION_NUMBER; const unsigned int LIBRARY_ABI_VERSION = VC_LIBRARY_ABI_VERSION; void checkLibraryAbi(unsigned int compileTimeAbi, unsigned int versionNumber, const char *compileTimeVersion) { if (LIBRARY_ABI_VERSION != compileTimeAbi || LIBRARY_VERSION_NUMBER < versionNumber) { printf("The versions of libVc.a (%s) and Vc/version.h (%s) are incompatible. Aborting.\n", LIBRARY_VERSION, compileTimeVersion); abort(); } } } // namespace Vc /*OUTER_NAMESPACE_END*/ #undef V_ALIGN Vc-0.7.4/src/cpuid.cpp000066400000000000000000000427741233512346000145060ustar00rootroot00000000000000/* This file is part of the Vc library. 
Copyright (C) 2011-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #include #include /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { CpuId::uint CpuId::s_ecx0 = 0; CpuId::uint CpuId::s_logicalProcessors = 0; CpuId::uint CpuId::s_processorFeaturesC = 0; CpuId::uint CpuId::s_processorFeaturesD = 0; CpuId::uint CpuId::s_processorFeatures8C = 0; CpuId::uint CpuId::s_processorFeatures8D = 0; CpuId::uint CpuId::s_L1Instruction = 0; CpuId::uint CpuId::s_L1Data = 0; CpuId::uint CpuId::s_L2Data = 0; CpuId::uint CpuId::s_L3Data = 0; CpuId::ushort CpuId::s_L1InstructionLineSize = 0; CpuId::ushort CpuId::s_L1DataLineSize = 0; CpuId::ushort CpuId::s_L2DataLineSize = 0; CpuId::ushort CpuId::s_L3DataLineSize = 0; CpuId::uint CpuId::s_L1Associativity = 0; CpuId::uint CpuId::s_L2Associativity = 0; CpuId::uint CpuId::s_L3Associativity = 0; CpuId::ushort CpuId::s_prefetch = 32; // The Intel ORM says that if CPUID(2) doesn't set the prefetch size it is 32 CpuId::uchar CpuId::s_brandIndex = 0; CpuId::uchar CpuId::s_cacheLineSize = 0; CpuId::uchar CpuId::s_processorModel = 0; CpuId::uchar CpuId::s_processorFamily = 0; CpuId::ProcessorType CpuId::s_processorType = CpuId::IntelReserved; bool CpuId::s_noL2orL3 = false; #ifdef VC_MSVC } // better not include intrin.h inside the Vc namespace :) /*OUTER_NAMESPACE_END*/ #include /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { #define CPUID(leaf) \ do { \ int out[4]; \ __cpuid(out, leaf); \ eax = out[0]; \ ebx = out[1]; \ ecx = out[2]; \ edx = out[3]; \ } while (false) #define CPUID_C(leaf, _ecx_) \ do { \ int out[4]; \ __cpuidex(out, leaf, _ecx_); \ eax = out[0]; \ ebx = out[1]; \ ecx = out[2]; \ edx = out[3]; \ } while (false) #elif defined(__i386__) && defined(__PIC__) // %ebx may be the PIC register. 
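// On 32-bit PIC builds %ebx holds the GOT pointer, so older GCC refuses to hand it out as an asm
// operand. The wrapper below therefore saves %ebx to a temporary around the cpuid instruction and
// returns that register's value through memory instead of naming "=b" directly. All three variants
// end up providing the same interface, e.g. (sketch):
//     uint eax, ebx, ecx, edx;
//     CPUID(1);        // eax: family/model/stepping, ecx/edx: feature flag words
//     CPUID_C(4, 0);   // leaf 4, subleaf 0: first deterministic cache descriptor (Intel)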
static inline void _Vc_cpuid(int leaf, unsigned int &eax, unsigned int &ebx, unsigned int &ecx, unsigned int &edx) { int tmpb; asm("mov %%ebx, %[tmpb]\n\t" "cpuid\n\t" "mov %%ebx, %[ebx]\n\t" "mov %[tmpb], %%ebx\n\t" : [tmpb]"=m"(tmpb), "=a"(eax), [ebx] "=m"(ebx), "+c"(ecx), "=d"(edx) : [leaf] "a"(leaf) ); } #define CPUID(leaf) \ ecx = 0; \ _Vc_cpuid(leaf, eax, ebx, ecx, edx) #define CPUID_C(leaf, _ecx_) \ ecx = _ecx_; \ _Vc_cpuid(leaf, eax, ebx, ecx, edx) #else #define CPUID(leaf) \ __asm__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) : "a"(leaf)) #define CPUID_C(leaf, _ecx_) \ __asm__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) : "a"(leaf), "c"(_ecx_)) #endif static unsigned int CpuIdAmdAssociativityTable(int bits) { switch (bits) { case 0x0: return 0; case 0x1: return 1; case 0x2: return 2; case 0x4: return 4; case 0x6: return 8; case 0x8: return 16; case 0xA: return 32; case 0xB: return 48; case 0xC: return 64; case 0xD: return 96; case 0xE: return 128; case 0xF: return 0xff; } return 0xffffffffu; } void CpuId::init() { { static bool done = false; if (done) return; done = true; } uint eax, ebx, ecx, edx; CPUID(0); s_ecx0 = ecx; CPUID(1); s_processorFeaturesC = ecx; s_processorFeaturesD = edx; s_processorModel = (eax & 0x000000f0) >> 4; s_processorFamily = (eax & 0x00000f00) >> 8; if (isAmd()) { if (s_processorFamily >= 0xf) { const uchar processorFamilyExt = (eax & 0x0ff00000) >> 20; s_processorFamily += processorFamilyExt; const uchar processorModelExt = (eax & 0x000f0000) >> 12; s_processorModel += processorModelExt; } } else if (s_processorFamily == 0xf) { const uchar processorFamilyExt = (eax & 0x0ff00000) >> 20; s_processorFamily += processorFamilyExt; const uchar processorModelExt = (eax & 0x000f0000) >> 12; s_processorModel += processorModelExt; } else if (s_processorFamily == 0x6) { const uchar processorModelExt = (eax & 0x000f0000) >> 12; s_processorModel += processorModelExt; } s_processorType = static_cast((eax & 0x00003000) >> 12); s_brandIndex = ebx & 0xff; ebx >>= 8; s_cacheLineSize = ebx & 0xff; ebx >>= 8; s_logicalProcessors = ebx & 0xff; CPUID(0x80000001); s_processorFeatures8C = ecx; s_processorFeatures8D = edx; if (isAmd()) { s_prefetch = cacheLineSize(); CPUID(0x80000005); s_L1DataLineSize = ecx & 0xff; s_L1Data = (ecx >> 24) * 1024; s_L1Associativity = (ecx >> 16) & 0xff; s_L1InstructionLineSize = edx & 0xff; s_L1Instruction = (edx >> 24) * 1024; CPUID(0x80000006); s_L2DataLineSize = ecx & 0xff; s_L2Data = (ecx >> 16) * 1024; s_L2Associativity = CpuIdAmdAssociativityTable((ecx >> 12) & 0xf); s_L3DataLineSize = edx & 0xff; s_L3Data = (edx >> 18) * 512 * 1024; s_L3Associativity = CpuIdAmdAssociativityTable((ecx >> 12) & 0xf); return; } // Intel only int repeat = 0; bool checkLeaf4 = false; do { CPUID(2); if (repeat == 0) { repeat = eax & 0xff; } if (0 == (0x80000000u & eax)) { for (int i = 0; i < 3; ++i) { eax >>= 8; interpret(eax & 0xff, &checkLeaf4); } } if (0 == (0x80000000u & ebx)) { for (int i = 0; i < 4; ++i) { interpret(ebx & 0xff, &checkLeaf4); ebx >>= 8; } } if (0 == (0x80000000u & ecx)) { for (int i = 0; i < 4; ++i) { interpret(ecx & 0xff, &checkLeaf4); ecx >>= 8; } } if (0 == (0x80000000u & edx)) { for (int i = 0; i < 4; ++i) { interpret(edx & 0xff, &checkLeaf4); edx >>= 8; } } } while (--repeat > 0); if (checkLeaf4) { s_prefetch = cacheLineSize(); if (s_prefetch == 0) { s_prefetch = 64; } eax = 1; for (int i = 0; eax & 0x1f; ++i) { CPUID_C(4, i); const int cacheLevel = (eax >> 5) & 7; //const int sharedBy = 1 + ((eax >> 14) & 0xfff); 
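        // Leaf 4 ("deterministic cache parameters") encodes each field as its value minus one,
        // hence the "1 +" below: EBX[11:0] = line size - 1, EBX[21:12] = line partitions - 1,
        // EBX[31:22] = ways - 1, and ECX = number of sets - 1. The cache size in bytes is
        //     ways * partitions * line size * sets,
        // which is what the following lines compute before filing the result into the s_L{1,2,3}*
        // members according to cache level and cache type (EAX[4:0]).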
const int linesize = 1 + (ebx & 0xfff); ebx >>= 12; const int partitions = 1 + (ebx & 0x3ff); ebx >>= 10; const int ways = 1 + (ebx & 0x3ff); const int sets = 1 + ecx; const int size = ways * partitions * linesize * sets; switch (eax & 0x1f) { case 1: // data cache switch (cacheLevel) { case 1: s_L1Data = size; s_L1DataLineSize = linesize; s_L1Associativity = ways; break; case 2: s_L2Data = size; s_L2DataLineSize = linesize; s_L2Associativity = ways; break; case 3: s_L3Data = size; s_L3DataLineSize = linesize; s_L3Associativity = ways; break; } break; case 2: // instruction cache switch (cacheLevel) { case 1: s_L1Instruction = size; s_L1InstructionLineSize = linesize; break; } break; case 3: // unified cache switch (cacheLevel) { case 1: s_L1Data = size;// / sharedBy; s_L1DataLineSize = linesize; s_L1Associativity = ways; break; case 2: s_L2Data = size;// / sharedBy; s_L2DataLineSize = linesize; s_L2Associativity = ways; break; case 3: s_L3Data = size;// / sharedBy; s_L3DataLineSize = linesize; s_L3Associativity = ways; break; } break; case 0: // no more caches break; default: // reserved break; } } } } void CpuId::interpret(uchar byte, bool *checkLeaf4) { switch (byte) { case 0x06: s_L1Instruction = 8 * 1024; s_L1InstructionLineSize = 32; s_L1Associativity = 4; break; case 0x08: s_L1Instruction = 16 * 1024; s_L1InstructionLineSize = 32; s_L1Associativity = 4; break; case 0x09: s_L1Instruction = 32 * 1024; s_L1InstructionLineSize = 64; s_L1Associativity = 4; break; case 0x0A: s_L1Data = 8 * 1024; s_L1DataLineSize = 32; s_L1Associativity = 2; break; case 0x0C: s_L1Data = 16 * 1024; s_L1DataLineSize = 32; s_L1Associativity = 4; break; case 0x0D: s_L1Data = 16 * 1024; s_L1DataLineSize = 64; s_L1Associativity = 4; break; case 0x0E: s_L1Data = 24 * 1024; s_L1DataLineSize = 64; s_L1Associativity = 6; break; case 0x21: s_L2Data = 256 * 1024; s_L2DataLineSize = 64; s_L2Associativity = 8; break; case 0x22: s_L3Data = 512 * 1024; s_L3DataLineSize = 64; s_L3Associativity = 4; break; case 0x23: s_L3Data = 1024 * 1024; s_L3DataLineSize = 64; s_L3Associativity = 8; break; case 0x25: s_L3Data = 2 * 1024 * 1024; s_L3DataLineSize = 64; s_L3Associativity = 8; break; case 0x29: s_L3Data = 4 * 1024 * 1024; s_L3DataLineSize = 64; s_L3Associativity = 8; break; case 0x2C: s_L1Data = 32 * 1024; s_L1DataLineSize = 64; s_L1Associativity = 8; break; case 0x30: s_L1Data = 32 * 1024; s_L1DataLineSize = 64; s_L1Associativity = 8; break; case 0x40: s_noL2orL3 = true; break; case 0x41: s_L2Data = 128 * 1024; s_L2DataLineSize = 32; s_L2Associativity = 4; break; case 0x42: s_L2Data = 256 * 1024; s_L2DataLineSize = 32; s_L2Associativity = 4; break; case 0x43: s_L2Data = 512 * 1024; s_L2DataLineSize = 32; s_L2Associativity = 4; break; case 0x44: s_L2Data = 1024 * 1024; s_L2DataLineSize = 32; s_L2Associativity = 4; break; case 0x45: s_L2Data = 2 * 1024 * 1024; s_L2DataLineSize = 32; s_L2Associativity = 4; break; case 0x46: s_L3Data = 4 * 1024 * 1024; s_L3DataLineSize = 64; s_L3Associativity = 4; break; case 0x47: s_L3Data = 8 * 1024 * 1024; s_L3DataLineSize = 64; s_L3Associativity = 8; break; case 0x48: s_L2Data = 3 * 1024 * 1024; s_L2DataLineSize = 64; s_L2Associativity = 12; break; case 0x49: if (s_processorFamily == 0xf && s_processorModel == 0x6) { s_L3Data = 4 * 1024 * 1024; s_L3DataLineSize = 64; s_L3Associativity = 16; } else { s_L2Data = 4 * 1024 * 1024; s_L2DataLineSize = 64; s_L2Associativity = 16; } break; case 0x4A: s_L3Data = 6 * 1024 * 1024; s_L3DataLineSize = 64; s_L3Associativity = 12; break; case 0x4B: 
s_L3Data = 8 * 1024 * 1024; s_L3DataLineSize = 64; s_L3Associativity = 16; break; case 0x4C: s_L3Data = 12 * 1024 * 1024; s_L3DataLineSize = 64; s_L3Associativity = 12; break; case 0x4D: s_L3Data = 16 * 1024 * 1024; s_L3DataLineSize = 64; s_L3Associativity = 16; break; case 0x4E: s_L2Data = 6 * 1024 * 1024; s_L2DataLineSize = 64; s_L2Associativity = 24; break; case 0x60: s_L1Data = 16 * 1024; s_L1DataLineSize = 64; s_L1Associativity = 8; break; case 0x66: s_L1Data = 8 * 1024; s_L1DataLineSize = 64; s_L1Associativity = 4; break; case 0x67: s_L1Data = 16 * 1024; s_L1DataLineSize = 64; s_L1Associativity = 4; break; case 0x68: s_L1Data = 32 * 1024; s_L1DataLineSize = 64; s_L1Associativity = 4; break; case 0x78: s_L2Data = 1024 * 1024; s_L2DataLineSize = 64; s_L2Associativity = 4; break; case 0x79: s_L2Data = 128 * 1024; s_L2DataLineSize = 64; s_L2Associativity = 8; break; case 0x7A: s_L2Data = 256 * 1024; s_L2DataLineSize = 64; s_L2Associativity = 8; break; case 0x7B: s_L2Data = 512 * 1024; s_L2DataLineSize = 64; s_L2Associativity = 8; break; case 0x7C: s_L2Data = 1024 * 1024; s_L2DataLineSize = 64; s_L2Associativity = 8; break; case 0x7D: s_L2Data = 2 * 1024 * 1024; s_L2DataLineSize = 64; s_L2Associativity = 8; break; case 0x7F: s_L2Data = 512 * 1024; s_L2DataLineSize = 64; s_L2Associativity = 2; break; case 0x80: s_L2Data = 512 * 1024; s_L2DataLineSize = 64; s_L2Associativity = 8; break; case 0x82: s_L2Data = 256 * 1024; s_L2DataLineSize = 32; s_L2Associativity = 8; break; case 0x83: s_L2Data = 512 * 1024; s_L2DataLineSize = 32; s_L2Associativity = 8; break; case 0x84: s_L2Data = 1024 * 1024; s_L2DataLineSize = 32; s_L2Associativity = 8; break; case 0x85: s_L2Data = 2 * 1024 * 1024; s_L2DataLineSize = 32; s_L2Associativity = 8; break; case 0x86: s_L2Data = 512 * 1024; s_L2DataLineSize = 64; s_L2Associativity = 4; break; case 0x87: s_L2Data = 1024 * 1024; s_L2DataLineSize = 64; s_L2Associativity = 8; break; case 0xD0: s_L3Data = 512 * 1024; s_L3DataLineSize = 64; s_L3Associativity = 4; break; case 0xD1: s_L3Data = 1024 * 1024; s_L3DataLineSize = 64; s_L3Associativity = 4; break; case 0xD2: s_L3Data = 2 * 1024 * 1024; s_L3DataLineSize = 64; s_L3Associativity = 4; break; case 0xD6: s_L3Data = 1024 * 1024; s_L3DataLineSize = 64; s_L3Associativity = 8; break; case 0xD7: s_L3Data = 2 * 1024 * 1024; s_L3DataLineSize = 64; s_L3Associativity = 8; break; case 0xD8: s_L3Data = 4 * 1024 * 1024; s_L3DataLineSize = 64; s_L3Associativity = 8; break; case 0xDC: s_L3Data = 3 * 512 * 1024; s_L3DataLineSize = 64; s_L3Associativity = 12; break; case 0xDD: s_L3Data = 3 * 1024 * 1024; s_L3DataLineSize = 64; s_L3Associativity = 12; break; case 0xDE: s_L3Data = 6 * 1024 * 1024; s_L3DataLineSize = 64; s_L3Associativity = 12; break; case 0xE2: s_L3Data = 2 * 1024 * 1024; s_L3DataLineSize = 64; s_L3Associativity = 16; break; case 0xE3: s_L3Data = 4 * 1024 * 1024; s_L3DataLineSize = 64; s_L3Associativity = 16; break; case 0xE4: s_L3Data = 8 * 1024 * 1024; s_L3DataLineSize = 64; s_L3Associativity = 16; break; case 0xEA: s_L3Data = 12 * 1024 * 1024; s_L3DataLineSize = 64; s_L3Associativity = 24; break; case 0xEB: s_L3Data = 18 * 1024 * 1024; s_L3DataLineSize = 64; s_L3Associativity = 24; break; case 0xEC: s_L3Data = 24 * 1024 * 1024; s_L3DataLineSize = 64; s_L3Associativity = 24; break; case 0xF0: s_prefetch = 64; break; case 0xF1: s_prefetch = 128; break; case 0xFF: // we have to use CPUID(4) to find out *checkLeaf4 = true; break; default: break; } } } // namespace Vc /*OUTER_NAMESPACE_END*/ // vim: sw=4 sts=4 et tw=100 
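// A minimal usage sketch (not part of the library sources): client code initializes the CPUID
// tables once and then queries the cached results. Only calls that appear elsewhere in this
// archive are used; the header name <Vc/cpuid.h> and public access to these static member
// functions are assumptions.
#include <Vc/cpuid.h>
#include <cstdio>

int main()
{
    Vc::CpuId::init();   // fills the static tables above; later calls return immediately
    std::printf("cache line size: %u bytes\n",
                static_cast<unsigned int>(Vc::CpuId::cacheLineSize()));
    if (Vc::CpuId::hasAvx() && Vc::CpuId::hasOsxsave()) {
        std::printf("AVX present (usability still depends on the xgetbv check in support.cpp)\n");
    }
    return 0;
}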
Vc-0.7.4/src/support.cpp000066400000000000000000000066221233512346000151060ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2010-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #include #include #include #ifdef VC_MSVC #include #endif #if defined(VC_GCC) && VC_GCC >= 0x40400 #define VC_TARGET_NO_SIMD __attribute__((target("no-sse2,no-avx"))) #else #define VC_TARGET_NO_SIMD #endif /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { VC_TARGET_NO_SIMD static inline bool xgetbvCheck(unsigned int bits) { #if defined(VC_MSVC) && VC_MSVC >= 160040219 // MSVC 2010 SP1 introduced _xgetbv unsigned long long xcrFeatureMask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK); return (xcrFeatureMask & bits) == bits; #elif defined(VC_GNU_ASM) && !defined(VC_NO_XGETBV) unsigned int eax; asm("xgetbv" : "=a"(eax) : "c"(0) : "edx"); return (eax & bits) == bits; #else // can't check, but if OSXSAVE is true let's assume it'll work return bits > 0; // ignore 'warning: unused parameter' #endif } VC_TARGET_NO_SIMD bool isImplementationSupported(Implementation impl) { CpuId::init(); switch (impl) { case ScalarImpl: return true; case SSE2Impl: return CpuId::hasSse2(); case SSE3Impl: return CpuId::hasSse3(); case SSSE3Impl: return CpuId::hasSsse3(); case SSE41Impl: return CpuId::hasSse41(); case SSE42Impl: return CpuId::hasSse42(); case AVXImpl: return CpuId::hasOsxsave() && CpuId::hasAvx() && xgetbvCheck(0x6); case AVX2Impl: return false; case ImplementationMask: return false; } return false; } VC_TARGET_NO_SIMD Vc::Implementation bestImplementationSupported() { CpuId::init(); if (!CpuId::hasSse2 ()) return Vc::ScalarImpl; if (!CpuId::hasSse3 ()) return Vc::SSE2Impl; if (!CpuId::hasSsse3()) return Vc::SSE3Impl; if (!CpuId::hasSse41()) return Vc::SSSE3Impl; if (!CpuId::hasSse42()) return Vc::SSE41Impl; if (CpuId::hasAvx() && CpuId::hasOsxsave() && xgetbvCheck(0x6)) { return Vc::AVXImpl; } return Vc::SSE42Impl; } VC_TARGET_NO_SIMD unsigned int extraInstructionsSupported() { unsigned int flags = 0; if (CpuId::hasF16c()) flags |= Vc::Float16cInstructions; if (CpuId::hasFma4()) flags |= Vc::Fma4Instructions; if (CpuId::hasXop ()) flags |= Vc::XopInstructions; if (CpuId::hasPopcnt()) flags |= Vc::PopcntInstructions; if (CpuId::hasSse4a()) flags |= Vc::Sse4aInstructions; if (CpuId::hasFma ()) flags |= Vc::FmaInstructions; //if (CpuId::hasPclmulqdq()) flags |= Vc::PclmulqdqInstructions; //if (CpuId::hasAes()) flags |= Vc::AesInstructions; //if (CpuId::hasRdrand()) flags |= Vc::RdrandInstructions; return flags; } } // namespace Vc /*OUTER_NAMESPACE_END*/ #undef VC_TARGET_NO_SIMD // vim: sw=4 sts=4 et tw=100 Vc-0.7.4/src/trigonometric.cpp000066400000000000000000000373101233512346000162550ustar00rootroot00000000000000/* This file is part of the Vc library. 
{{{ Copyright (C) 2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . }}}*/ #include #if defined(VC_IMPL_SSE) || defined(VC_IMPL_AVX) #include /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace { using Vc::Vector; using Vc::float_v; using Vc::double_v; using Vc::sfloat_v; template static Vc_ALWAYS_INLINE Vector cosSeries(const Vector &x) { typedef Const C; const Vector x2 = x * x; return ((C::cosCoeff(2) * x2 + C::cosCoeff(1)) * x2 + C::cosCoeff(0)) * (x2 * x2) - C::_1_2() * x2 + Vector::One(); } static Vc_ALWAYS_INLINE double_v cosSeries(const double_v &x) { typedef Const C; const double_v x2 = x * x; return (((((C::cosCoeff(5) * x2 + C::cosCoeff(4)) * x2 + C::cosCoeff(3)) * x2 + C::cosCoeff(2)) * x2 + C::cosCoeff(1)) * x2 + C::cosCoeff(0)) * (x2 * x2) - C::_1_2() * x2 + double_v::One(); } template static Vc_ALWAYS_INLINE Vector sinSeries(const Vector &x) { typedef Const C; const Vector x2 = x * x; return ((C::sinCoeff(2) * x2 + C::sinCoeff(1)) * x2 + C::sinCoeff(0)) * (x2 * x) + x; } static Vc_ALWAYS_INLINE double_v sinSeries(const double_v &x) { typedef Const C; const double_v x2 = x * x; return (((((C::sinCoeff(5) * x2 + C::sinCoeff(4)) * x2 + C::sinCoeff(3)) * x2 + C::sinCoeff(2)) * x2 + C::sinCoeff(1)) * x2 + C::sinCoeff(0)) * (x2 * x) + x; } template struct signed_integer { typedef int_v type; }; template<> struct signed_integer { typedef short_v type; }; template static Vc_ALWAYS_INLINE Vector<_T> foldInput(const Vector<_T> &_x, IV &quadrant) { typedef Vector<_T> V; typedef Const<_T> C; const V x = abs(_x); #if defined(VC_IMPL_FMA4) || defined(VC_IMPL_FMA) quadrant = static_cast(x * C::_4_pi() + V::One()); // prefer the fma here quadrant &= ~IV::One(); #else quadrant = static_cast(x * C::_4_pi()); quadrant += quadrant & IV::One(); #endif const V y = static_cast(quadrant); quadrant &= 7; return ((x - y * C::_pi_4_hi()) - y * C::_pi_4_rem1()) - y * C::_pi_4_rem2(); } static Vc_ALWAYS_INLINE double_v foldInput(const double_v &_x, int_v &quadrant) { typedef double_v V; typedef Const C; const V x = abs(_x); V y = trunc(x / C::_pi_4()); // * C::_4_pi() would work, but is >twice as imprecise V z = y - trunc(y * C::_1_16()) * C::_16(); // y modulo 16 quadrant = static_cast(z); int_m mask = (quadrant & int_v::One()) != int_v::Zero(); ++quadrant(mask); y(static_cast(mask)) += V::One(); quadrant &= 7; // since y is an integer we don't need to split y into low and high parts until the integer // requires more bits than there are zero bits at the end of _pi_4_hi (30 bits -> 1e9) return ((x - y * C::_pi_4_hi()) - y * C::_pi_4_rem1()) - y * C::_pi_4_rem2(); } } // anonymous namespace /* * algorithm for sine and cosine: * * The result can be calculated with sine or cosine depending on the π/4 section the input is * in. * sine ≈ x + x³ * cosine ≈ 1 - x² * * sine: * Map -x to x and invert the output * Extend precision of x - n * π/4 by calculating * ((x - n * p1) - n * p2) - n * p3 (p1 + p2 + p3 = π/4) * * Calculate Taylor series with tuned coefficients. * Fix sign. 
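 *
 * Worked example (single-precision path, as a sanity check): for x = 2.0
 *   quadrant = int(x * 4/π) = int(2.546) = 2          (already even, so no rounding step)
 *   z = ((x - 2·p1) - 2·p2) - 2·p3 ≈ 2 - π/2 ≈ 0.4292
 *   quadrant == 2 selects the cosine series: cosSeries(0.4292) ≈ 0.9093 ≈ sin(2.0)
 *   quadrant ≤ 3 and x ≥ 0, so the sign is left unchanged.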
*/ template<> template Vector<_T> Trigonometric::sin(const Vector<_T> &_x) { typedef Vector<_T> V; typedef typename V::Mask M; typedef typename signed_integer::type IV; IV quadrant; const V z = foldInput(_x, quadrant); const M sign = (_x < V::Zero()) ^ static_cast(quadrant > 3); quadrant(quadrant > 3) -= 4; V y = sinSeries(z); y(quadrant == IV::One() || quadrant == 2) = cosSeries(z); y(sign) = -y; return y; } template<> template<> double_v Trigonometric::sin(const double_v &_x) { typedef double_v V; typedef V::Mask M; int_v quadrant; M sign = _x < V::Zero(); const V x = foldInput(_x, quadrant); sign ^= static_cast(quadrant > 3); quadrant(quadrant > 3) -= 4; V y = sinSeries(x); y(static_cast(quadrant == int_v::One() || quadrant == 2)) = cosSeries(x); y(sign) = -y; return y; } template<> template Vector<_T> Trigonometric::cos(const Vector<_T> &_x) { typedef Vector<_T> V; typedef typename V::Mask M; typedef typename signed_integer::type IV; IV quadrant; const V x = foldInput(_x, quadrant); M sign = quadrant > 3; quadrant(quadrant > 3) -= 4; sign ^= quadrant > IV::One(); V y = cosSeries(x); y(quadrant == IV::One() || quadrant == 2) = sinSeries(x); y(sign) = -y; return y; } template<> template<> double_v Trigonometric::cos(const double_v &_x) { typedef double_v V; typedef V::Mask M; int_v quadrant; const V x = foldInput(_x, quadrant); M sign = static_cast(quadrant > 3); quadrant(quadrant > 3) -= 4; sign ^= static_cast(quadrant > int_v::One()); V y = cosSeries(x); y(static_cast(quadrant == int_v::One() || quadrant == 2)) = sinSeries(x); y(sign) = -y; return y; } template<> template void Trigonometric::sincos(const Vector<_T> &_x, Vector<_T> *_sin, Vector<_T> *_cos) { typedef Vector<_T> V; typedef typename V::Mask M; typedef typename signed_integer::type IV; IV quadrant; const V x = foldInput(_x, quadrant); M sign = static_cast(quadrant > 3); quadrant(quadrant > 3) -= 4; const V cos_s = cosSeries(x); const V sin_s = sinSeries(x); V c = cos_s; c(static_cast(quadrant == IV::One() || quadrant == 2)) = sin_s; c(sign ^ static_cast(quadrant > IV::One())) = -c; *_cos = c; V s = sin_s; s(static_cast(quadrant == IV::One() || quadrant == 2)) = cos_s; s(sign ^ static_cast(_x < V::Zero())) = -s; *_sin = s; } template<> template<> void Trigonometric::sincos(const double_v &_x, double_v *_sin, double_v *_cos) { typedef double_v V; typedef V::Mask M; int_v quadrant; const V x = foldInput(_x, quadrant); M sign = static_cast(quadrant > 3); quadrant(quadrant > 3) -= 4; const V cos_s = cosSeries(x); const V sin_s = sinSeries(x); V c = cos_s; c(static_cast(quadrant == int_v::One() || quadrant == 2)) = sin_s; c(sign ^ static_cast(quadrant > int_v::One())) = -c; *_cos = c; V s = sin_s; s(static_cast(quadrant == int_v::One() || quadrant == 2)) = cos_s; s(sign ^ static_cast(_x < V::Zero())) = -s; *_sin = s; } template<> template Vector<_T> Trigonometric::asin (const Vector<_T> &_x) { typedef Const<_T> C; typedef Vector<_T> V; typedef typename V::Mask M; const M &negative = _x < V::Zero(); const V &a = abs(_x); const M outOfRange = a > V::One(); const M &small = a < C::smallAsinInput(); const M >_0_5 = a > C::_1_2(); V x = a; V z = a * a; z(gt_0_5) = (V::One() - a) * C::_1_2(); x(gt_0_5) = sqrt(z); z = ((((C::asinCoeff0(0) * z + C::asinCoeff0(1)) * z + C::asinCoeff0(2)) * z + C::asinCoeff0(3)) * z + C::asinCoeff0(4)) * z * x + x; z(gt_0_5) = C::_pi_2() - (z + z); z(small) = a; z(negative) = -z; z.setQnan(outOfRange); return z; } template<> template<> double_v Trigonometric::asin (const double_v &_x) { typedef Const C; 
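    // Double precision follows the classic Cephes-style split (a sketch of the idea; the
    // coefficients are the asinCoeff0..3 rows of the constant table): for |x| > 0.625 it uses
    //     asin(x) = π/2 - 2*asin(sqrt((1 - |x|) / 2)),
    // evaluating the inner asin via the rational correction r/s in zz = 1 - |x|; for smaller
    // inputs it evaluates asin(x) ≈ x + x³ * p(x²)/q(x²) directly. Tiny inputs fall back to
    // asin(x) ≈ x, and |x| > 1 is turned into NaN at the end.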
typedef double_v V; typedef V::Mask M; const M negative = _x < V::Zero(); const V a = abs(_x); const M outOfRange = a > V::One(); const M small = a < C::smallAsinInput(); const M large = a > C::largeAsinInput(); V zz = V::One() - a; const V r = (((C::asinCoeff0(0) * zz + C::asinCoeff0(1)) * zz + C::asinCoeff0(2)) * zz + C::asinCoeff0(3)) * zz + C::asinCoeff0(4); const V s = (((zz + C::asinCoeff1(0)) * zz + C::asinCoeff1(1)) * zz + C::asinCoeff1(2)) * zz + C::asinCoeff1(3); V sqrtzz = sqrt(zz + zz); V z = C::_pi_4() - sqrtzz; z -= sqrtzz * (zz * r / s) - C::_pi_2_rem(); z += C::_pi_4(); V a2 = a * a; const V p = ((((C::asinCoeff2(0) * a2 + C::asinCoeff2(1)) * a2 + C::asinCoeff2(2)) * a2 + C::asinCoeff2(3)) * a2 + C::asinCoeff2(4)) * a2 + C::asinCoeff2(5); const V q = ((((a2 + C::asinCoeff3(0)) * a2 + C::asinCoeff3(1)) * a2 + C::asinCoeff3(2)) * a2 + C::asinCoeff3(3)) * a2 + C::asinCoeff3(4); z(!large) = a * (a2 * p / q) + a; z(negative) = -z; z(small) = _x; z.setQnan(outOfRange); return z; } template<> template Vector<_T> Trigonometric::atan (const Vector<_T> &_x) { typedef Const<_T> C; typedef Vector<_T> V; typedef typename V::Mask M; V x = abs(_x); const M >_tan_3pi_8 = x > C::atanThrsHi(); const M >_tan_pi_8 = x > C::atanThrsLo() && !gt_tan_3pi_8; V y = V::Zero(); y(gt_tan_3pi_8) = C::_pi_2(); y(gt_tan_pi_8) = C::_pi_4(); x(gt_tan_3pi_8) = -V::One() / x; x(gt_tan_pi_8) = (x - V::One()) / (x + V::One()); const V &x2 = x * x; y += (((C::atanP(0) * x2 - C::atanP(1)) * x2 + C::atanP(2)) * x2 - C::atanP(3)) * x2 * x + x; y(_x < V::Zero()) = -y; y.setQnan(isnan(_x)); return y; } template<> template<> double_v Trigonometric::atan (const double_v &_x) { typedef Const C; typedef double_v V; typedef V::Mask M; M sign = _x < V::Zero(); V x = abs(_x); M finite = isfinite(_x); V ret = C::_pi_2(); V y = V::Zero(); const M large = x > C::atanThrsHi(); const M gt_06 = x > C::atanThrsLo(); V tmp = (x - V::One()) / (x + V::One()); tmp(large) = -V::One() / x; x(gt_06) = tmp; y(gt_06) = C::_pi_4(); y(large) = C::_pi_2(); V z = x * x; const V p = (((C::atanP(0) * z + C::atanP(1)) * z + C::atanP(2)) * z + C::atanP(3)) * z + C::atanP(4); const V q = ((((z + C::atanQ(0)) * z + C::atanQ(1)) * z + C::atanQ(2)) * z + C::atanQ(3)) * z + C::atanQ(4); z = z * p / q; z = x * z + x; V morebits = C::_pi_2_rem(); morebits(!large) *= C::_1_2(); z(gt_06) += morebits; ret(finite) = y + z; ret(sign) = -ret; ret.setQnan(isnan(_x)); return ret; } template<> template Vector<_T> Trigonometric::atan2(const Vector<_T> &y, const Vector<_T> &x) { typedef Const<_T> C; typedef Vector<_T> V; typedef typename V::Mask M; const M xZero = x == V::Zero(); const M yZero = y == V::Zero(); const M xMinusZero = xZero && x.isNegative(); const M yNeg = y < V::Zero(); const M xInf = !isfinite(x); const M yInf = !isfinite(y); V a = C::_pi().copySign(y); a.setZero(x >= V::Zero()); // setting x to any finite value will have atan(y/x) return sign(y/x)*pi/2, just in case x is inf V _x = x; _x(yInf) = V::One().copySign(x); a += atan(y / _x); // if x is +0 and y is +/-0 the result is +0 a.setZero(xZero && yZero); // for x = -0 we add/subtract pi to get the correct result a(xMinusZero) += C::_pi().copySign(y); // atan2(-Y, +/-0) = -pi/2 a(xZero && yNeg) = -C::_pi_2(); // if both inputs are inf the output is +/- (3)pi/4 a(xInf && yInf) += C::_pi_4().copySign(x ^ ~y); // correct the sign of y if the result is 0 a(a == V::Zero()) = a.copySign(y); // any NaN input will lead to NaN output a.setQnan(isnan(y) || isnan(x)); return a; } template<> template<> 
double_v Trigonometric::atan2 (const double_v &y, const double_v &x) { typedef Const C; typedef double_v V; typedef V::Mask M; const M xZero = x == V::Zero(); const M yZero = y == V::Zero(); const M xMinusZero = xZero && x.isNegative(); const M yNeg = y < V::Zero(); const M xInf = !isfinite(x); const M yInf = !isfinite(y); V a = V(C::_pi()).copySign(y); a.setZero(x >= V::Zero()); // setting x to any finite value will have atan(y/x) return sign(y/x)*pi/2, just in case x is inf V _x = x; _x(yInf) = V::One().copySign(x); a += atan(y / _x); // if x is +0 and y is +/-0 the result is +0 a.setZero(xZero && yZero); // for x = -0 we add/subtract pi to get the correct result a(xMinusZero) += C::_pi().copySign(y); // atan2(-Y, +/-0) = -pi/2 a(xZero && yNeg) = -C::_pi_2(); // if both inputs are inf the output is +/- (3)pi/4 a(xInf && yInf) += C::_pi_4().copySign(x ^ ~y); // correct the sign of y if the result is 0 a(a == V::Zero()) = a.copySign(y); // any NaN input will lead to NaN output a.setQnan(isnan(y) || isnan(x)); return a; } } // namespace Vc /*OUTER_NAMESPACE_END*/ #include // instantiate the non-specialized template functions above template Vc::float_v Vc::Trigonometric::sin(const Vc::float_v &); template Vc::sfloat_v Vc::Trigonometric::sin(const Vc::sfloat_v &); template Vc::float_v Vc::Trigonometric::cos(const Vc::float_v &); template Vc::sfloat_v Vc::Trigonometric::cos(const Vc::sfloat_v &); template void Vc::Trigonometric::sincos(const Vc::float_v &, Vc::float_v *, Vc::float_v *); template void Vc::Trigonometric::sincos(const Vc::sfloat_v &, Vc::sfloat_v *, Vc::sfloat_v *); template Vc::float_v Vc::Trigonometric::asin(const Vc::float_v &); template Vc::sfloat_v Vc::Trigonometric::asin(const Vc::sfloat_v &); template Vc::float_v Vc::Trigonometric::atan(const Vc::float_v &); template Vc::sfloat_v Vc::Trigonometric::atan(const Vc::sfloat_v &); template Vc::float_v Vc::Trigonometric::atan2(const Vc::float_v &, const Vc::float_v &); template Vc::sfloat_v Vc::Trigonometric::atan2(const Vc::sfloat_v &, const Vc::sfloat_v &); #endif Vc-0.7.4/sse/000077500000000000000000000000001233512346000126635ustar00rootroot00000000000000Vc-0.7.4/sse/casts.h000066400000000000000000000170001233512346000141470ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2011 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
*/ #ifndef SSE_CASTS_H #define SSE_CASTS_H #include "intrinsics.h" #include "types.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace SSE { template static Vc_ALWAYS_INLINE To Vc_CONST mm128_reinterpret_cast(VC_ALIGNED_PARAMETER(From) v) { return v; } template<> Vc_ALWAYS_INLINE _M128I Vc_CONST mm128_reinterpret_cast<_M128I, _M128 >(VC_ALIGNED_PARAMETER(_M128 ) v) { return _mm_castps_si128(v); } template<> Vc_ALWAYS_INLINE _M128I Vc_CONST mm128_reinterpret_cast<_M128I, _M128D>(VC_ALIGNED_PARAMETER(_M128D) v) { return _mm_castpd_si128(v); } template<> Vc_ALWAYS_INLINE _M128 Vc_CONST mm128_reinterpret_cast<_M128 , _M128D>(VC_ALIGNED_PARAMETER(_M128D) v) { return _mm_castpd_ps(v); } template<> Vc_ALWAYS_INLINE _M128 Vc_CONST mm128_reinterpret_cast<_M128 , _M128I>(VC_ALIGNED_PARAMETER(_M128I) v) { return _mm_castsi128_ps(v); } template<> Vc_ALWAYS_INLINE _M128D Vc_CONST mm128_reinterpret_cast<_M128D, _M128I>(VC_ALIGNED_PARAMETER(_M128I) v) { return _mm_castsi128_pd(v); } template<> Vc_ALWAYS_INLINE _M128D Vc_CONST mm128_reinterpret_cast<_M128D, _M128 >(VC_ALIGNED_PARAMETER(_M128 ) v) { return _mm_castps_pd(v); } template static Vc_ALWAYS_INLINE To Vc_CONST sse_cast(VC_ALIGNED_PARAMETER(From) v) { return mm128_reinterpret_cast(v); } template struct StaticCastHelper {}; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128I cast(const _M128 &v) { return _mm_cvttps_epi32(v); } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128I cast(const _M128D &v) { return _mm_cvttpd_epi32(v); } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128I cast(const _M128I &v) { return v; } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128I cast(const _M128I &v) { return v; } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128I cast(const _M128 &v) { return _mm_castps_si128(mm_blendv_ps( _mm_castsi128_ps(_mm_cvttps_epi32(v)), _mm_castsi128_ps(_mm_add_epi32(_mm_cvttps_epi32(_mm_sub_ps(v, _mm_set1_ps(1u << 31))), _mm_set1_epi32(1 << 31))), _mm_cmpge_ps(v, _mm_set1_ps(1u << 31)) )); } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128I cast(const _M128D &v) { return _mm_cvttpd_epi32(v); } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128I cast(const _M128I &v) { return v; } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128I cast(const _M128I &v) { return v; } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128 cast(const _M128 &v) { return v; } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128 cast(const _M128D &v) { return _mm_cvtpd_ps(v); } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128 cast(const _M128I &v) { return _mm_cvtepi32_ps(v); } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128 cast(const _M128I &v) { return mm_blendv_ps( _mm_cvtepi32_ps(v), _mm_add_ps(_mm_cvtepi32_ps(_mm_sub_epi32(v, _mm_set1_epi32(1 << 31))), _mm_set1_ps(1u << 31)), _mm_castsi128_ps(_mm_cmplt_epi32(v, _mm_setzero_si128())) ); } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128D cast(const _M128 &v) { return _mm_cvtps_pd(v); } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128D cast(const _M128D &v) { return v; } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128D cast(const _M128I &v) { return _mm_cvtepi32_pd(v); } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128D cast(const _M128I &v) { return _mm_cvtepi32_pd(v); } }; template<> struct 
StaticCastHelper { static Vc_ALWAYS_INLINE M256 cast(const _M128I &v) { return M256::create(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v, _mm_setzero_si128())), _mm_cvtepi32_ps(_mm_unpackhi_epi16(v, _mm_setzero_si128()))); } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE M256 cast(const _M128I &v) { const _M128I neg = _mm_cmplt_epi16(v, _mm_setzero_si128()); return M256::create(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v, neg)), _mm_cvtepi32_ps(_mm_unpackhi_epi16(v, neg))); } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128I cast(const M256 &v) { return _mm_packs_epi32(_mm_cvttps_epi32(v[0]), _mm_cvttps_epi32(v[1])); } }; #ifdef VC_IMPL_SSE4_1 template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128I cast(const M256 &v) { return _mm_packus_epi32(_mm_cvttps_epi32(v[0]), _mm_cvttps_epi32(v[1])); } }; #else template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128I cast(const M256 &v) { return _mm_add_epi16(_mm_set1_epi16(-32768), _mm_packs_epi32( _mm_add_epi32(_mm_set1_epi32(-32768), _mm_cvttps_epi32(v[0])), _mm_add_epi32(_mm_set1_epi32(-32768), _mm_cvttps_epi32(v[1])) ) ); } }; #endif template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128I cast(const _M128 &v) { return _mm_packs_epi32(_mm_cvttps_epi32(v), _mm_setzero_si128()); } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128I cast(const _M128I &v) { return v; } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128I cast(const _M128I &v) { return v; } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128I cast(const _M128 &v) { return _mm_packs_epi32(_mm_cvttps_epi32(v), _mm_setzero_si128()); } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128I cast(const _M128I &v) { return v; } }; template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128I cast(const _M128I &v) { return v; } }; } // namespace SSE } // namespace Vc /*OUTER_NAMESPACE_END*/ #endif // SSE_CASTS_H Vc-0.7.4/sse/const.h000066400000000000000000000147601233512346000141720ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
*/ #ifndef VC_SSE_CONST_H #define VC_SSE_CONST_H #include "const_data.h" #include "vector.h" #include "macros.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace SSE { template class Vector; template struct Const { typedef Vector<_T> V; typedef typename V::EntryType T; typedef typename V::Mask M; enum Constants { Stride = 16 / sizeof(T) }; static Vc_ALWAYS_INLINE Vc_CONST V _pi_4() { return load(&c_trig::data[0 * Stride]); } static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_hi() { return load(&c_trig::data[1 * Stride]); } static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_rem1() { return load(&c_trig::data[2 * Stride]); } static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_rem2() { return load(&c_trig::data[3 * Stride]); } static Vc_ALWAYS_INLINE Vc_CONST V _1_16() { return load(&c_trig::data[4 * Stride]); } static Vc_ALWAYS_INLINE Vc_CONST V _16() { return load(&c_trig::data[5 * Stride]); } static Vc_ALWAYS_INLINE Vc_CONST V cosCoeff(int i) { return load(&c_trig::data[( 8 + i) * Stride]); } static Vc_ALWAYS_INLINE Vc_CONST V sinCoeff(int i) { return load(&c_trig::data[(14 + i) * Stride]); } static Vc_ALWAYS_INLINE Vc_CONST V atanP(int i) { return load(&c_trig::data[(24 + i) * Stride]); } static Vc_ALWAYS_INLINE Vc_CONST V atanQ(int i) { return load(&c_trig::data[(29 + i) * Stride]); } static Vc_ALWAYS_INLINE Vc_CONST V atanThrsHi() { return load(&c_trig::data[34 * Stride]); } static Vc_ALWAYS_INLINE Vc_CONST V atanThrsLo() { return load(&c_trig::data[35 * Stride]); } static Vc_ALWAYS_INLINE Vc_CONST V _pi_2_rem() { return load(&c_trig::data[36 * Stride]); } static Vc_ALWAYS_INLINE Vc_CONST V lossThreshold() { return load(&c_trig::data[20 * Stride]); } static Vc_ALWAYS_INLINE Vc_CONST V _4_pi() { return load(&c_trig::data[21 * Stride]); } static Vc_ALWAYS_INLINE Vc_CONST V _pi_2() { return load(&c_trig::data[22 * Stride]); } static Vc_ALWAYS_INLINE Vc_CONST V _pi() { return load(&c_trig::data[23 * Stride]); } static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff0(int i) { return load(&c_trig::data[(40 + i) * Stride]); } static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff1(int i) { return load(&c_trig::data[(45 + i) * Stride]); } static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff2(int i) { return load(&c_trig::data[(49 + i) * Stride]); } static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff3(int i) { return load(&c_trig::data[(55 + i) * Stride]); } static Vc_ALWAYS_INLINE Vc_CONST V smallAsinInput() { return load(&c_trig::data[37 * Stride]); } static Vc_ALWAYS_INLINE Vc_CONST V largeAsinInput() { return load(&c_trig::data[38 * Stride]); } static Vc_ALWAYS_INLINE Vc_CONST M exponentMask() { return M(load(c_log::d(1)).data()); } static Vc_ALWAYS_INLINE Vc_CONST V _1_2() { return load(c_log::d(18)); } static Vc_ALWAYS_INLINE Vc_CONST V _1_sqrt2() { return load(c_log::d(15)); } static Vc_ALWAYS_INLINE Vc_CONST V P(int i) { return load(c_log::d(2 + i)); } static Vc_ALWAYS_INLINE Vc_CONST V Q(int i) { return load(c_log::d(8 + i)); } static Vc_ALWAYS_INLINE Vc_CONST V min() { return load(c_log::d(14)); } static Vc_ALWAYS_INLINE Vc_CONST V ln2_small() { return load(c_log::d(17)); } static Vc_ALWAYS_INLINE Vc_CONST V ln2_large() { return load(c_log::d(16)); } static Vc_ALWAYS_INLINE Vc_CONST V neginf() { return load(c_log::d(13)); } static Vc_ALWAYS_INLINE Vc_CONST V log10_e() { return load(c_log::d(19)); } static Vc_ALWAYS_INLINE Vc_CONST V log2_e() { return load(c_log::d(20)); } static Vc_ALWAYS_INLINE_L Vc_CONST_L V highMask() Vc_ALWAYS_INLINE_R Vc_CONST_R; static Vc_ALWAYS_INLINE_L Vc_CONST_L V highMask(int bits) Vc_ALWAYS_INLINE_R Vc_CONST_R; private: 
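// load() fetches one vector worth of constants from the aligned tables in
// const_data.h; the sfloat_v specialization below duplicates the float_v
// load into both halves of the M256 register pair.
// Illustrative use of the public accessors (e.g. in the trigonometric
// argument reduction): float_v y = x * Const<float>::_4_pi();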
static Vc_ALWAYS_INLINE_L Vc_CONST_L V load(const T *mem) Vc_ALWAYS_INLINE_R Vc_CONST_R; }; template Vc_ALWAYS_INLINE Vc_CONST Vector Const::load(const T *mem) { return V(mem); } template<> Vc_ALWAYS_INLINE Vc_CONST sfloat_v Const::load(const float *mem) { return M256::dup(float_v(mem).data()); } template<> Vc_ALWAYS_INLINE Vc_CONST Vector Const::highMask() { return Vector(reinterpret_cast(&c_general::highMaskFloat)); } template<> Vc_ALWAYS_INLINE Vc_CONST Vector Const::highMask() { return Vector(reinterpret_cast(&c_general::highMaskDouble)); } template<> Vc_ALWAYS_INLINE Vc_CONST Vector Const::highMask(int bits) { return _mm_castsi128_ps(_mm_slli_epi32(_mm_setallone_si128(), bits)); } template<> Vc_ALWAYS_INLINE Vc_CONST Vector Const::highMask(int bits) { return _mm_castsi128_pd(_mm_slli_epi64(_mm_setallone_si128(), bits)); } template<> Vc_ALWAYS_INLINE Vc_CONST Vector Const::highMask(int bits) { return M256::dup(Const::highMask(bits).data()); } template<> Vc_ALWAYS_INLINE Vc_CONST Vector Const::P(int i) { return M256::dup(Const::P(i).data()); } template<> Vc_ALWAYS_INLINE Vc_CONST Vector Const::Q(int i) { return M256::dup(Const::Q(i).data()); } template<> Vc_ALWAYS_INLINE Vc_CONST Vector::Mask Const::exponentMask() { return M256::dup(Const::exponentMask().data()); } } // namespace SSE } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "undomacros.h" #endif // VC_SSE_CONST_H Vc-0.7.4/sse/const_data.h000066400000000000000000000047211233512346000151570ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright (C) 2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
}}}*/ #ifndef VC_SSE_CONST_DATA_H #define VC_SSE_CONST_DATA_H #include "macros.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace SSE { ALIGN(16) extern const unsigned int _IndexesFromZero4[4]; ALIGN(16) extern const unsigned short _IndexesFromZero8[8]; ALIGN(16) extern const unsigned char _IndexesFromZero16[16]; struct c_general { ALIGN(64) static const unsigned int allone[4]; ALIGN(16) static const unsigned short one16[8]; ALIGN(16) static const unsigned int one32[4]; ALIGN(16) static const float oneFloat[4]; ALIGN(16) static const double oneDouble[2]; ALIGN(16) static const int absMaskFloat[4]; ALIGN(16) static const long long absMaskDouble[2]; ALIGN(16) static const unsigned int signMaskFloat[4]; ALIGN(16) static const unsigned int highMaskFloat[4]; ALIGN(16) static const unsigned long long signMaskDouble[2]; ALIGN(16) static const unsigned long long highMaskDouble[2]; ALIGN(16) static const short minShort[8]; ALIGN(16) static const unsigned long long frexpMask[2]; }; template struct c_trig { ALIGN(64) static const T data[]; }; template struct c_log { enum VectorSize { Size = 16 / sizeof(T) }; static Vc_ALWAYS_INLINE Vc_CONST const float *d(int i) { return reinterpret_cast(&data[i * Size]); } ALIGN(64) static const unsigned int data[]; }; template<> struct c_log { enum VectorSize { Size = 16 / sizeof(double) }; static Vc_ALWAYS_INLINE Vc_CONST const double *d(int i) { return reinterpret_cast(&data[i * Size]); } ALIGN(64) static const unsigned long long data[]; }; } // namespace SSE } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "undomacros.h" #endif // VC_SSE_CONST_DATA_H Vc-0.7.4/sse/debug.h000066400000000000000000000047441233512346000141330ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2011-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
*/ #ifndef VC_SSE_DEBUG_H #define VC_SSE_DEBUG_H #ifndef NDEBUG #include "types.h" #include #include #endif /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace SSE { #ifdef NDEBUG class DebugStream { public: DebugStream(const char *, const char *, int) {} template inline DebugStream &operator<<(const T &) { return *this; } }; #else class DebugStream { private: template static void printVector(V _x) { enum { Size = sizeof(V) / sizeof(T) }; union { V v; T m[Size]; } x = { _x }; std::cerr << '[' << std::setprecision(24) << x.m[0]; for (int i = 1; i < Size; ++i) { std::cerr << ", " << std::setprecision(24) << x.m[i]; } std::cerr << ']'; } public: DebugStream(const char *func, const char *file, int line) { std::cerr << "\033[1;40;33mDEBUG: " << file << ':' << line << ' ' << func << ' '; } template DebugStream &operator<<(const T &x) { std::cerr << x; return *this; } DebugStream &operator<<(__m128 x) { printVector(x); return *this; } DebugStream &operator<<(__m128d x) { printVector(x); return *this; } DebugStream &operator<<(__m128i x) { printVector(x); return *this; } ~DebugStream() { std::cerr << "\033[0m" << std::endl; } }; #endif #define VC_DEBUG ::Vc::SSE::DebugStream(__PRETTY_FUNCTION__, __FILE__, __LINE__) } // namespace SSE } // namespace Vc /*OUTER_NAMESPACE_END*/ #endif // VC_SSE_DEBUG_H Vc-0.7.4/sse/deinterleave.tcc000066400000000000000000000200131233512346000160210ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2010-2011 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
*/ /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace SSE { inline void deinterleave(Vector &a, Vector &b) { const _M128 tmp0 = _mm_unpacklo_ps(a.data(), b.data()); const _M128 tmp1 = _mm_unpackhi_ps(a.data(), b.data()); a.data() = _mm_unpacklo_ps(tmp0, tmp1); b.data() = _mm_unpackhi_ps(tmp0, tmp1); } inline void deinterleave(Vector &a, Vector &b, Vector::AsArg tmp) { a.data() = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(tmp.data(), 16), 16)); b.data() = _mm_cvtepi32_ps(_mm_srai_epi32(tmp.data(), 16)); } inline void deinterleave(Vector &a, Vector &b, Vector::AsArg tmp) { a.data() = _mm_cvtepi32_ps(_mm_srli_epi32(_mm_slli_epi32(tmp.data(), 16), 16)); b.data() = _mm_cvtepi32_ps(_mm_srli_epi32(tmp.data(), 16)); } inline void deinterleave(Vector &a, Vector &b) { _M128 tmp0 = _mm_unpacklo_ps(a.data()[0], a.data()[1]); _M128 tmp1 = _mm_unpackhi_ps(a.data()[0], a.data()[1]); _M128 tmp2 = _mm_unpacklo_ps(b.data()[0], b.data()[1]); _M128 tmp3 = _mm_unpackhi_ps(b.data()[0], b.data()[1]); a.data()[0] = _mm_unpacklo_ps(tmp0, tmp1); b.data()[0] = _mm_unpackhi_ps(tmp0, tmp1); a.data()[1] = _mm_unpacklo_ps(tmp2, tmp3); b.data()[1] = _mm_unpackhi_ps(tmp2, tmp3); } inline void deinterleave(Vector &a, Vector &b, Vector::AsArg tmp0, Vector::AsArg tmp1) { a.data()[0] = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(tmp0.data(), 16), 16)); b.data()[0] = _mm_cvtepi32_ps(_mm_srai_epi32(tmp0.data(), 16)); a.data()[1] = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(tmp1.data(), 16), 16)); b.data()[1] = _mm_cvtepi32_ps(_mm_srai_epi32(tmp1.data(), 16)); } inline void deinterleave(Vector &a, Vector &b, Vector::AsArg tmp0, Vector::AsArg tmp1) { a.data()[0] = _mm_cvtepi32_ps(_mm_srli_epi32(_mm_slli_epi32(tmp0.data(), 16), 16)); b.data()[0] = _mm_cvtepi32_ps(_mm_srli_epi32(tmp0.data(), 16)); a.data()[1] = _mm_cvtepi32_ps(_mm_srli_epi32(_mm_slli_epi32(tmp1.data(), 16), 16)); b.data()[1] = _mm_cvtepi32_ps(_mm_srli_epi32(tmp1.data(), 16)); } inline void deinterleave(Vector &a, Vector &b) { _M128D tmp = _mm_unpacklo_pd(a.data(), b.data()); b.data() = _mm_unpackhi_pd(a.data(), b.data()); a.data() = tmp; } inline void deinterleave(Vector &a, Vector &b) { const _M128I tmp0 = _mm_unpacklo_epi32(a.data(), b.data()); const _M128I tmp1 = _mm_unpackhi_epi32(a.data(), b.data()); a.data() = _mm_unpacklo_epi32(tmp0, tmp1); b.data() = _mm_unpackhi_epi32(tmp0, tmp1); } inline void deinterleave(Vector &a, Vector &b) { const _M128I tmp0 = _mm_unpacklo_epi32(a.data(), b.data()); const _M128I tmp1 = _mm_unpackhi_epi32(a.data(), b.data()); a.data() = _mm_unpacklo_epi32(tmp0, tmp1); b.data() = _mm_unpackhi_epi32(tmp0, tmp1); } inline void deinterleave(Vector &a, Vector &b) { _M128I tmp0 = _mm_unpacklo_epi16(a.data(), b.data()); // a0 a4 b0 b4 a1 a5 b1 b5 _M128I tmp1 = _mm_unpackhi_epi16(a.data(), b.data()); // a2 a6 b2 b6 a3 a7 b3 b7 _M128I tmp2 = _mm_unpacklo_epi16(tmp0, tmp1); // a0 a2 a4 a6 b0 b2 b4 b6 _M128I tmp3 = _mm_unpackhi_epi16(tmp0, tmp1); // a1 a3 a5 a7 b1 b3 b5 b7 a.data() = _mm_unpacklo_epi16(tmp2, tmp3); b.data() = _mm_unpackhi_epi16(tmp2, tmp3); } inline void deinterleave(Vector &a, Vector &b) { _M128I tmp0 = _mm_unpacklo_epi16(a.data(), b.data()); // a0 a4 b0 b4 a1 a5 b1 b5 _M128I tmp1 = _mm_unpackhi_epi16(a.data(), b.data()); // a2 a6 b2 b6 a3 a7 b3 b7 _M128I tmp2 = _mm_unpacklo_epi16(tmp0, tmp1); // a0 a2 a4 a6 b0 b2 b4 b6 _M128I tmp3 = _mm_unpackhi_epi16(tmp0, tmp1); // a1 a3 a5 a7 b1 b3 b5 b7 a.data() = _mm_unpacklo_epi16(tmp2, tmp3); b.data() = _mm_unpackhi_epi16(tmp2, tmp3); } inline void deinterleave(Vector &a, 
Vector &b, Vector::AsArg tmp) { a.data() = _mm_srai_epi32(_mm_slli_epi32(tmp.data(), 16), 16); b.data() = _mm_srai_epi32(tmp.data(), 16); } inline void deinterleave(Vector &a, Vector &b, Vector::AsArg tmp) { a.data() = _mm_srli_epi32(_mm_slli_epi32(tmp.data(), 16), 16); b.data() = _mm_srli_epi32(tmp.data(), 16); } } // namespace SSE namespace Internal { template inline void HelperImpl::deinterleave( float_v &a, float_v &b, const float *m, A align) { a.load(m, align); b.load(m + float_v::Size, align); Vc::SSE::deinterleave(a, b); } template inline void HelperImpl::deinterleave( float_v &a, float_v &b, const short *m, A align) { short_v tmp(m, align); Vc::SSE::deinterleave(a, b, tmp); } template inline void HelperImpl::deinterleave( float_v &a, float_v &b, const unsigned short *m, A align) { ushort_v tmp(m, align); Vc::SSE::deinterleave(a, b, tmp); } template inline void HelperImpl::deinterleave( sfloat_v &a, sfloat_v &b, const float *m, A align) { a.load(m, align); b.load(m + sfloat_v::Size, align); Vc::SSE::deinterleave(a, b); } template inline void HelperImpl::deinterleave( sfloat_v &a, sfloat_v &b, const short *m, A align) { short_v tmp0(m, align); short_v tmp1(m + short_v::Size, align); Vc::SSE::deinterleave(a, b, tmp0, tmp1); } template inline void HelperImpl::deinterleave( sfloat_v &a, sfloat_v &b, const unsigned short *m, A align) { ushort_v tmp0(m, align); ushort_v tmp1(m + short_v::Size, align); Vc::SSE::deinterleave(a, b, tmp0, tmp1); } template inline void HelperImpl::deinterleave( double_v &a, double_v &b, const double *m, A align) { a.load(m, align); b.load(m + double_v::Size, align); Vc::SSE::deinterleave(a, b); } template inline void HelperImpl::deinterleave( int_v &a, int_v &b, const int *m, A align) { a.load(m, align); b.load(m + int_v::Size, align); Vc::SSE::deinterleave(a, b); } template inline void HelperImpl::deinterleave( int_v &a, int_v &b, const short *m, A align) { short_v tmp(m, align); Vc::SSE::deinterleave(a, b, tmp); } template inline void HelperImpl::deinterleave( uint_v &a, uint_v &b, const unsigned int *m, A align) { a.load(m, align); b.load(m + uint_v::Size, align); Vc::SSE::deinterleave(a, b); } template inline void HelperImpl::deinterleave( uint_v &a, uint_v &b, const unsigned short *m, A align) { ushort_v tmp(m, align); Vc::SSE::deinterleave(a, b, tmp); } template inline void HelperImpl::deinterleave( short_v &a, short_v &b, const short *m, A align) { a.load(m, align); b.load(m + short_v::Size, align); Vc::SSE::deinterleave(a, b); } template inline void HelperImpl::deinterleave( ushort_v &a, ushort_v &b, const unsigned short *m, A align) { a.load(m, align); b.load(m + ushort_v::Size, align); Vc::SSE::deinterleave(a, b); } } // namespace Internal } // namespace Vc /*OUTER_NAMESPACE_END*/ Vc-0.7.4/sse/forceToRegisters.tcc000066400000000000000000000220311233512346000166450ustar00rootroot00000000000000#ifdef VC_GNU_ASM template static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x1) { __asm__ __volatile__(""::"x"(x1.data())); } template static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x1) { __asm__ __volatile__("":"+x"(x1.data())); } template static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x2, const Vector &x1) { __asm__ __volatile__(""::"x"(x2.data()), "x"(x1.data())); } template static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x2, Vector &x1) { __asm__ __volatile__("":"+x"(x2.data()), "+x"(x1.data())); } template static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x3, const Vector &x2, const Vector &x1) { 
__asm__ __volatile__(""::"x"(x3.data()), "x"(x2.data()), "x"(x1.data())); } template static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x3, Vector &x2, Vector &x1) { __asm__ __volatile__("":"+x"(x3.data()), "+x"(x2.data()), "+x"(x1.data())); } template static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x4, const Vector &x3, const Vector &x2, const Vector &x1) { __asm__ __volatile__(""::"x"(x4.data()), "x"(x3.data()), "x"(x2.data()), "x"(x1.data())); } template static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x4, Vector &x3, Vector &x2, Vector &x1) { __asm__ __volatile__("":"+x"(x4.data()), "+x"(x3.data()), "+x"(x2.data()), "+x"(x1.data())); } template static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x5, const Vector &x4, const Vector &x3, const Vector &x2, const Vector &x1) { __asm__ __volatile__(""::"x"(x5.data()), "x"(x4.data()), "x"(x3.data()), "x"(x2.data()), "x"(x1.data())); } template static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x5, Vector &x4, Vector &x3, Vector &x2, Vector &x1) { __asm__ __volatile__("":"+x"(x5.data()), "+x"(x4.data()), "+x"(x3.data()), "+x"(x2.data()), "+x"(x1.data())); } template static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x6, const Vector &x5, const Vector &x4, const Vector &x3, const Vector &x2, const Vector &x1) { __asm__ __volatile__(""::"x"(x6.data()), "x"(x5.data()), "x"(x4.data()), "x"(x3.data()), "x"(x2.data()), "x"(x1.data())); } template static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x6, Vector &x5, Vector &x4, Vector &x3, Vector &x2, Vector &x1) { __asm__ __volatile__("":"+x"(x6.data()), "+x"(x5.data()), "+x"(x4.data()), "+x"(x3.data()), "+x"(x2.data()), "+x"(x1.data())); } template static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x7, const Vector &x6, const Vector &x5, const Vector &x4, const Vector &x3, const Vector &x2, const Vector &x1) { __asm__ __volatile__(""::"x"(x7.data()), "x"(x6.data()), "x"(x5.data()), "x"(x4.data()), "x"(x3.data()), "x"(x2.data()), "x"(x1.data())); } template static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x7, Vector &x6, Vector &x5, Vector &x4, Vector &x3, Vector &x2, Vector &x1) { __asm__ __volatile__("":"+x"(x7.data()), "+x"(x6.data()), "+x"(x5.data()), "+x"(x4.data()), "+x"(x3.data()), "+x"(x2.data()), "+x"(x1.data())); } template static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x8, const Vector &x7, const Vector &x6, const Vector &x5, const Vector &x4, const Vector &x3, const Vector &x2, const Vector &x1) { __asm__ __volatile__(""::"x"(x8.data()), "x"(x7.data()), "x"(x6.data()), "x"(x5.data()), "x"(x4.data()), "x"(x3.data()), "x"(x2.data()), "x"(x1.data())); } template static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x8, Vector &x7, Vector &x6, Vector &x5, Vector &x4, Vector &x3, Vector &x2, Vector &x1) { __asm__ __volatile__("":"+x"(x8.data()), "+x"(x7.data()), "+x"(x6.data()), "+x"(x5.data()), "+x"(x4.data()), "+x"(x3.data()), "+x"(x2.data()), "+x"(x1.data())); } #elif defined(VC_MSVC) #pragma optimize("g", off) template static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x1*/) { } #pragma optimize("g", off) template static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x1*/) { } #pragma optimize("g", on) #pragma optimize("g", off) template static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x2*/, const Vector &/*x1*/) { } #pragma optimize("g", off) template static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x2*/, Vector &/*x1*/) { } #pragma optimize("g", on) 
#pragma optimize("g", off) template static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x3*/, const Vector &/*x2*/, const Vector &/*x1*/) { } #pragma optimize("g", off) template static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x3*/, Vector &/*x2*/, Vector &/*x1*/) { } #pragma optimize("g", on) #pragma optimize("g", off) template static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x4*/, const Vector &/*x3*/, const Vector &/*x2*/, const Vector &/*x1*/) { } #pragma optimize("g", off) template static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x4*/, Vector &/*x3*/, Vector &/*x2*/, Vector &/*x1*/) { } #pragma optimize("g", on) #pragma optimize("g", off) template static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x5*/, const Vector &/*x4*/, const Vector &/*x3*/, const Vector &/*x2*/, const Vector &/*x1*/) { } #pragma optimize("g", off) template static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x5*/, Vector &/*x4*/, Vector &/*x3*/, Vector &/*x2*/, Vector &/*x1*/) { } #pragma optimize("g", on) #pragma optimize("g", off) template static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x6*/, const Vector &/*x5*/, const Vector &/*x4*/, const Vector &/*x3*/, const Vector &/*x2*/, const Vector &/*x1*/) { } #pragma optimize("g", off) template static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x6*/, Vector &/*x5*/, Vector &/*x4*/, Vector &/*x3*/, Vector &/*x2*/, Vector &/*x1*/) { } #pragma optimize("g", on) #pragma optimize("g", off) template static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x7*/, const Vector &/*x6*/, const Vector &/*x5*/, const Vector &/*x4*/, const Vector &/*x3*/, const Vector &/*x2*/, const Vector &/*x1*/) { } #pragma optimize("g", off) template static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x7*/, Vector &/*x6*/, Vector &/*x5*/, Vector &/*x4*/, Vector &/*x3*/, Vector &/*x2*/, Vector &/*x1*/) { } #pragma optimize("g", on) #pragma optimize("g", off) template static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x8*/, const Vector &/*x7*/, const Vector &/*x6*/, const Vector &/*x5*/, const Vector &/*x4*/, const Vector &/*x3*/, const Vector &/*x2*/, const Vector &/*x1*/) { } #pragma optimize("g", off) template static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x8*/, Vector &/*x7*/, Vector &/*x6*/, Vector &/*x5*/, Vector &/*x4*/, Vector &/*x3*/, Vector &/*x2*/, Vector &/*x1*/) { } #pragma optimize("g", on) #else #error "forceToRegisters unsupported on this compiler" #endif Vc-0.7.4/sse/helperimpl.h000066400000000000000000000070161233512346000152010ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2010-2011 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
*/ #ifndef VC_SSE_DEINTERLEAVE_H #define VC_SSE_DEINTERLEAVE_H #include "macros.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace Internal { template<> struct HelperImpl { typedef SSE::Vector float_v; typedef SSE::Vector sfloat_v; typedef SSE::Vector double_v; typedef SSE::Vector int_v; typedef SSE::Vector uint_v; typedef SSE::Vector short_v; typedef SSE::Vector ushort_v; template static void deinterleave(float_v &, float_v &, const float *, A); template static void deinterleave(float_v &, float_v &, const short *, A); template static void deinterleave(float_v &, float_v &, const unsigned short *, A); template static void deinterleave(sfloat_v &, sfloat_v &, const float *, A); template static void deinterleave(sfloat_v &, sfloat_v &, const short *, A); template static void deinterleave(sfloat_v &, sfloat_v &, const unsigned short *, A); template static void deinterleave(double_v &, double_v &, const double *, A); template static void deinterleave(int_v &, int_v &, const int *, A); template static void deinterleave(int_v &, int_v &, const short *, A); template static void deinterleave(uint_v &, uint_v &, const unsigned int *, A); template static void deinterleave(uint_v &, uint_v &, const unsigned short *, A); template static void deinterleave(short_v &, short_v &, const short *, A); template static void deinterleave(ushort_v &, ushort_v &, const unsigned short *, A); static Vc_ALWAYS_INLINE_L void prefetchForOneRead(const void *addr) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void prefetchForModify(const void *addr) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void prefetchClose(const void *addr) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void prefetchMid(const void *addr) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void prefetchFar(const void *addr) Vc_ALWAYS_INLINE_R; template static Vc_ALWAYS_INLINE_L void *malloc(size_t n) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void free(void *p) Vc_ALWAYS_INLINE_R; }; template<> struct HelperImpl : public HelperImpl {}; template<> struct HelperImpl : public HelperImpl {}; template<> struct HelperImpl : public HelperImpl {}; template<> struct HelperImpl : public HelperImpl {}; } // namespace Internal } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "deinterleave.tcc" #include "prefetches.tcc" #include "helperimpl.tcc" #include "undomacros.h" #endif // VC_SSE_DEINTERLEAVE_H Vc-0.7.4/sse/helperimpl.tcc000066400000000000000000000035231233512346000155220ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2010-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef VC_SSE_HELPERIMPL_TCC #define VC_SSE_HELPERIMPL_TCC #include /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace Internal { template static _VC_CONSTEXPR size_t nextMultipleOf(size_t value) { return (value % X) > 0 ? 
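// rounds up to the next multiple of X, e.g. nextMultipleOf<16>(20) == 32
// and nextMultipleOf<16>(32) == 32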
value + X - (value % X) : value; } template Vc_ALWAYS_INLINE void *HelperImpl::malloc(size_t n) { switch (A) { case Vc::AlignOnVector: return _mm_malloc(nextMultipleOf(n), Vc::SSE::VectorAlignment); case Vc::AlignOnCacheline: // TODO: hardcoding 64 is not such a great idea return _mm_malloc(nextMultipleOf<64>(n), 64); case Vc::AlignOnPage: // TODO: hardcoding 4096 is not such a great idea return _mm_malloc(nextMultipleOf<4096>(n), 4096); default: #ifndef NDEBUG abort(); #endif return _mm_malloc(n, 8); } } Vc_ALWAYS_INLINE void HelperImpl::free(void *p) { _mm_free(p); } } // namespace Internal } // namespace Vc /*OUTER_NAMESPACE_END*/ #endif // VC_SSE_HELPERIMPL_TCC Vc-0.7.4/sse/interleavedmemory.tcc000066400000000000000000001675701233512346000171310ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright (C) 2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . }}}*/ #ifndef VC_SSE_INTERLEAVEDMEMORY_TCC #define VC_SSE_INTERLEAVEDMEMORY_TCC #include "macros.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace Common { namespace { template struct InterleaveImpl; template<> struct InterleaveImpl { static inline void interleave(float *const data, const SSE::sfloat_v::IndexType &i,/*{{{*/ const SSE::sfloat_v::AsArg v0, const SSE::sfloat_v::AsArg v1) { const __m128 tmp0 = _mm_unpacklo_ps(v0.data()[0], v1.data()[0]); const __m128 tmp1 = _mm_unpackhi_ps(v0.data()[0], v1.data()[0]); const __m128 tmp2 = _mm_unpacklo_ps(v0.data()[1], v1.data()[1]); const __m128 tmp3 = _mm_unpackhi_ps(v0.data()[1], v1.data()[1]); _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), tmp0); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), tmp0); _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), tmp1); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), tmp1); _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[4]]), tmp2); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[5]]), tmp2); _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[6]]), tmp3); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[7]]), tmp3); }/*}}}*/ static inline void interleave(float *const data, const SSE::sfloat_v::IndexType &i,/*{{{*/ const SSE::sfloat_v::AsArg v0, const SSE::sfloat_v::AsArg v1, const SSE::sfloat_v::AsArg v2) { #ifdef VC_USE_MASKMOV_SCATTER const __m128i mask = _mm_set_epi32(0, -1, -1, -1); const __m128 tmp0 = _mm_unpacklo_ps(v0.data()[0], v1.data()[0]); const __m128 tmp1 = _mm_unpackhi_ps(v0.data()[0], v1.data()[0]); const __m128 tmp2 = _mm_unpacklo_ps(v2.data()[0], v2.data()[0]); const __m128 tmp3 = _mm_unpackhi_ps(v2.data()[0], v2.data()[0]); _mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp0, tmp2)), mask, reinterpret_cast(&data[i[0]])); _mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp2, tmp0)), mask, reinterpret_cast(&data[i[1]])); _mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp1, tmp3)), mask, reinterpret_cast(&data[i[2]])); _mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp3, tmp1)), mask, reinterpret_cast(&data[i[3]])); const __m128 tmp8 = 
_mm_unpacklo_ps(v0.data()[1], v1.data()[1]); const __m128 tmp9 = _mm_unpackhi_ps(v0.data()[1], v1.data()[1]); const __m128 tmp10 = _mm_unpacklo_ps(v2.data()[1], v2.data()[1]); const __m128 tmp11 = _mm_unpackhi_ps(v2.data()[1], v2.data()[1]); _mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp8, tmp10)), mask, reinterpret_cast(&data[i[4]])); _mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp10, tmp8)), mask, reinterpret_cast(&data[i[5]])); _mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp9, tmp11)), mask, reinterpret_cast(&data[i[6]])); _mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp11, tmp9)), mask, reinterpret_cast(&data[i[7]])); #else interleave(data, i, v0, v1); v2.scatter(data + 2, i); #endif }/*}}}*/ static inline void interleave(float *const data, const SSE::sfloat_v::IndexType &i,/*{{{*/ const SSE::sfloat_v::AsArg v0, const SSE::sfloat_v::AsArg v1, const SSE::sfloat_v::AsArg v2, const SSE::sfloat_v::AsArg v3) { const __m128 tmp0 = _mm_unpacklo_ps(v0.data()[0], v1.data()[0]); const __m128 tmp1 = _mm_unpackhi_ps(v0.data()[0], v1.data()[0]); const __m128 tmp2 = _mm_unpacklo_ps(v2.data()[0], v3.data()[0]); const __m128 tmp3 = _mm_unpackhi_ps(v2.data()[0], v3.data()[0]); _mm_storeu_ps(&data[i[0]], _mm_movelh_ps(tmp0, tmp2)); _mm_storeu_ps(&data[i[1]], _mm_movehl_ps(tmp2, tmp0)); _mm_storeu_ps(&data[i[2]], _mm_movelh_ps(tmp1, tmp3)); _mm_storeu_ps(&data[i[3]], _mm_movehl_ps(tmp3, tmp1)); const __m128 tmp8 = _mm_unpacklo_ps(v0.data()[1], v1.data()[1]); const __m128 tmp9 = _mm_unpackhi_ps(v0.data()[1], v1.data()[1]); const __m128 tmp10 = _mm_unpacklo_ps(v2.data()[1], v3.data()[1]); const __m128 tmp11 = _mm_unpackhi_ps(v2.data()[1], v3.data()[1]); _mm_storeu_ps(&data[i[4]], _mm_movelh_ps(tmp8, tmp10)); _mm_storeu_ps(&data[i[5]], _mm_movehl_ps(tmp10, tmp8)); _mm_storeu_ps(&data[i[6]], _mm_movelh_ps(tmp9, tmp11)); _mm_storeu_ps(&data[i[7]], _mm_movehl_ps(tmp11, tmp9)); }/*}}}*/ }; template struct InterleaveImpl { static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ const typename V::AsArg v0, const typename V::AsArg v1) { const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v1.data()); const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v1.data()); #ifdef __x86_64__ const long long tmp00 = _mm_cvtsi128_si64(tmp0); const long long tmp01 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(tmp0, tmp0)); const long long tmp10 = _mm_cvtsi128_si64(tmp1); const long long tmp11 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(tmp1, tmp1)); *reinterpret_cast(&data[i[0]]) = tmp00; *reinterpret_cast(&data[i[1]]) = tmp00 >> 32; *reinterpret_cast(&data[i[2]]) = tmp01; *reinterpret_cast(&data[i[3]]) = tmp01 >> 32; *reinterpret_cast(&data[i[4]]) = tmp10; *reinterpret_cast(&data[i[5]]) = tmp10 >> 32; *reinterpret_cast(&data[i[6]]) = tmp11; *reinterpret_cast(&data[i[7]]) = tmp11 >> 32; #elif defined(VC_IMPL_SSE4_1) *reinterpret_cast(&data[i[0]]) = _mm_cvtsi128_si32(tmp0); *reinterpret_cast(&data[i[1]]) = _mm_extract_epi32(tmp0, 1); *reinterpret_cast(&data[i[2]]) = _mm_extract_epi32(tmp0, 2); *reinterpret_cast(&data[i[3]]) = _mm_extract_epi32(tmp0, 3); *reinterpret_cast(&data[i[4]]) = _mm_cvtsi128_si32(tmp1); *reinterpret_cast(&data[i[5]]) = _mm_extract_epi32(tmp1, 1); *reinterpret_cast(&data[i[6]]) = _mm_extract_epi32(tmp1, 2); *reinterpret_cast(&data[i[7]]) = _mm_extract_epi32(tmp1, 3); #else *reinterpret_cast(&data[i[0]]) = _mm_cvtsi128_si32(tmp0); *reinterpret_cast(&data[i[1]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp0, 4)); *reinterpret_cast(&data[i[2]]) = 
_mm_cvtsi128_si32(_mm_srli_si128(tmp0, 8)); *reinterpret_cast(&data[i[3]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp0, 12)); *reinterpret_cast(&data[i[4]]) = _mm_cvtsi128_si32(tmp1); *reinterpret_cast(&data[i[5]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp1, 4)); *reinterpret_cast(&data[i[6]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp1, 8)); *reinterpret_cast(&data[i[7]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp1, 12)); #endif }/*}}}*/ static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2) { #ifdef VC_USE_MASKMOV_SCATTER const __m128i maskLo = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); const __m128i maskHi = _mm_set_epi16(0, -1, -1, -1, 0, 0, 0, 0); typename V::EntryType *const dataHi = data - 4; const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v2.data()); const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v2.data()); const __m128i tmp2 = _mm_unpacklo_epi16(v1.data(), v1.data()); const __m128i tmp3 = _mm_unpackhi_epi16(v1.data(), v1.data()); const __m128i tmp4 = _mm_unpacklo_epi16(tmp0, tmp2); const __m128i tmp5 = _mm_unpackhi_epi16(tmp0, tmp2); const __m128i tmp6 = _mm_unpacklo_epi16(tmp1, tmp3); const __m128i tmp7 = _mm_unpackhi_epi16(tmp1, tmp3); _mm_maskmoveu_si128(tmp4, maskLo, reinterpret_cast(&data[i[0]])); _mm_maskmoveu_si128(tmp4, maskHi, reinterpret_cast(&dataHi[i[1]])); _mm_maskmoveu_si128(tmp5, maskLo, reinterpret_cast(&data[i[2]])); _mm_maskmoveu_si128(tmp5, maskHi, reinterpret_cast(&dataHi[i[3]])); _mm_maskmoveu_si128(tmp6, maskLo, reinterpret_cast(&data[i[4]])); _mm_maskmoveu_si128(tmp6, maskHi, reinterpret_cast(&dataHi[i[5]])); _mm_maskmoveu_si128(tmp7, maskLo, reinterpret_cast(&data[i[6]])); _mm_maskmoveu_si128(tmp7, maskHi, reinterpret_cast(&dataHi[i[7]])); #else interleave(data, i, v0, v1); v2.scatter(data + 2, i); #endif }/*}}}*/ static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3) { const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v2.data()); const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v2.data()); const __m128i tmp2 = _mm_unpacklo_epi16(v1.data(), v3.data()); const __m128i tmp3 = _mm_unpackhi_epi16(v1.data(), v3.data()); const __m128i tmp4 = _mm_unpacklo_epi16(tmp0, tmp2); const __m128i tmp5 = _mm_unpackhi_epi16(tmp0, tmp2); const __m128i tmp6 = _mm_unpacklo_epi16(tmp1, tmp3); const __m128i tmp7 = _mm_unpackhi_epi16(tmp1, tmp3); _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[0]]), tmp4); _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[2]]), tmp5); _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[4]]), tmp6); _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[6]]), tmp7); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), _mm_castsi128_ps(tmp4)); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), _mm_castsi128_ps(tmp5)); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[5]]), _mm_castsi128_ps(tmp6)); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[7]]), _mm_castsi128_ps(tmp7)); }/*}}}*/ }; template struct InterleaveImpl { static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ const typename V::AsArg v0, const typename V::AsArg v1) { const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data())); const __m128 tmp1 = 
_mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data())); _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), tmp0); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), tmp0); _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), tmp1); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), tmp1); }/*}}}*/ static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2) { #ifdef VC_USE_MASKMOV_SCATTER const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()), SSE::sse_cast<__m128>(v1.data())); const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()), SSE::sse_cast<__m128>(v1.data())); const __m128 tmp2 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v2.data()), SSE::sse_cast<__m128>(v2.data())); const __m128 tmp3 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v2.data()), SSE::sse_cast<__m128>(v2.data())); const __m128i mask = _mm_set_epi32(0, -1, -1, -1); _mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp0, tmp2)), mask, reinterpret_cast(&data[i[0]])); _mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp2, tmp0)), mask, reinterpret_cast(&data[i[1]])); _mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp1, tmp3)), mask, reinterpret_cast(&data[i[2]])); _mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp3, tmp1)), mask, reinterpret_cast(&data[i[3]])); #else const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data())); const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data())); _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), tmp0); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), tmp0); _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), tmp1); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), tmp1); v2.scatter(data + 2, i); #endif }/*}}}*/ static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3) { const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data())); const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data())); const __m128 tmp2 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v2.data()),SSE::sse_cast<__m128>(v3.data())); const __m128 tmp3 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v2.data()),SSE::sse_cast<__m128>(v3.data())); _mm_storeu_ps(reinterpret_cast(&data[i[0]]), _mm_movelh_ps(tmp0, tmp2)); _mm_storeu_ps(reinterpret_cast(&data[i[1]]), _mm_movehl_ps(tmp2, tmp0)); _mm_storeu_ps(reinterpret_cast(&data[i[2]]), _mm_movelh_ps(tmp1, tmp3)); _mm_storeu_ps(reinterpret_cast(&data[i[3]]), _mm_movehl_ps(tmp3, tmp1)); }/*}}}*/ }; template struct InterleaveImpl { static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ const typename V::AsArg v0, const typename V::AsArg v1) { const __m128d tmp0 = _mm_unpacklo_pd(v0.data(), v1.data()); const __m128d tmp1 = _mm_unpackhi_pd(v0.data(), v1.data()); _mm_storeu_pd(&data[i[0]], tmp0); _mm_storeu_pd(&data[i[1]], tmp1); }/*}}}*/ static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2) { interleave(data, i, v0, v1); v2.scatter(data + 2, i); }/*}}}*/ static inline void 
interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3) { interleave(data, i, v0, v1); interleave(data + 2, i, v2, v3); }/*}}}*/ }; } // anonymous namespace template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ const typename V::AsArg v1) { InterleaveImpl::interleave(m_data, m_indexes, v0, v1); }/*}}}*/ template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ const typename V::AsArg v1, const typename V::AsArg v2) { InterleaveImpl::interleave(m_data, m_indexes, v0, v1, v2); }/*}}}*/ template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3) { InterleaveImpl::interleave(m_data, m_indexes, v0, v1, v2, v3); }/*}}}*/ template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4) { InterleaveImpl::interleave(m_data, m_indexes, v0, v1, v2, v3); v4.scatter(m_data + 4, m_indexes); }/*}}}*/ template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5) { InterleaveImpl::interleave(m_data , m_indexes, v0, v1, v2, v3); InterleaveImpl::interleave(m_data + 4, m_indexes, v4, v5); }/*}}}*/ template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5, const typename V::AsArg v6) { InterleaveImpl::interleave(m_data + 0, m_indexes, v0, v1, v2, v3); InterleaveImpl::interleave(m_data + 4, m_indexes, v4, v5, v6); }/*}}}*/ template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5, const typename V::AsArg v6, const typename V::AsArg v7) { InterleaveImpl::interleave(m_data + 0, m_indexes, v0, v1, v2, v3); InterleaveImpl::interleave(m_data + 4, m_indexes, v4, v5, v6, v7); }/*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(float_v &v0, float_v &v1) const/*{{{*/ { const __m128 a = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[0]]))); const __m128 b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[1]]))); const __m128 c = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[2]]))); const __m128 d = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[3]]))); const __m128 tmp0 = _mm_unpacklo_ps(a, b); // [a0 a1 b0 b1] const __m128 tmp1 = _mm_unpacklo_ps(c, d); // [a2 a3 b2 b3] v0.data() = _mm_movelh_ps(tmp0, tmp1); v1.data() = _mm_movehl_ps(tmp1, tmp0); } /*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(float_v &v0, float_v &v1, float_v &v2) const/*{{{*/ { const __m128 a = _mm_loadu_ps(&m_data[m_indexes[0]]); const __m128 b = _mm_loadu_ps(&m_data[m_indexes[1]]); const __m128 c = _mm_loadu_ps(&m_data[m_indexes[2]]); const __m128 d = 
_mm_loadu_ps(&m_data[m_indexes[3]]); const __m128 tmp0 = _mm_unpacklo_ps(a, b); // [a0 a1 b0 b1] const __m128 tmp1 = _mm_unpacklo_ps(c, d); // [a2 a3 b2 b3] const __m128 tmp2 = _mm_unpackhi_ps(a, b); // [c0 c1 XX XX] const __m128 tmp3 = _mm_unpackhi_ps(c, d); // [c2 c3 XX XX] v0.data() = _mm_movelh_ps(tmp0, tmp1); v1.data() = _mm_movehl_ps(tmp1, tmp0); v2.data() = _mm_movelh_ps(tmp2, tmp3); } /*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(float_v &v0, float_v &v1, float_v &v2, float_v &v3) const/*{{{*/ { const __m128 a = _mm_loadu_ps(&m_data[m_indexes[0]]); const __m128 b = _mm_loadu_ps(&m_data[m_indexes[1]]); const __m128 c = _mm_loadu_ps(&m_data[m_indexes[2]]); const __m128 d = _mm_loadu_ps(&m_data[m_indexes[3]]); const __m128 tmp0 = _mm_unpacklo_ps(a, b); // [a0 a1 b0 b1] const __m128 tmp1 = _mm_unpacklo_ps(c, d); // [a2 a3 b2 b3] const __m128 tmp2 = _mm_unpackhi_ps(a, b); // [c0 c1 d0 d1] const __m128 tmp3 = _mm_unpackhi_ps(c, d); // [c2 c3 d2 d3] v0.data() = _mm_movelh_ps(tmp0, tmp1); v1.data() = _mm_movehl_ps(tmp1, tmp0); v2.data() = _mm_movelh_ps(tmp2, tmp3); v3.data() = _mm_movehl_ps(tmp3, tmp2); } /*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(float_v &v0, float_v &v1, float_v &v2, float_v &v3, float_v &v4) const/*{{{*/ { v4.gather(m_data, m_indexes + I(4)); deinterleave(v0, v1, v2, v3); } /*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(float_v &v0, float_v &v1, float_v &v2, float_v &v3, float_v &v4, float_v &v5) const/*{{{*/ { const __m128 a = _mm_loadu_ps(&m_data[m_indexes[0]]); const __m128 e = _mm_loadu_ps(&m_data[4 + m_indexes[0]]); const __m128 b = _mm_loadu_ps(&m_data[m_indexes[1]]); const __m128 f = _mm_loadu_ps(&m_data[4 + m_indexes[1]]); const __m128 tmp0 = _mm_unpacklo_ps(a, b); // [a0 a1 b0 b1] const __m128 tmp2 = _mm_unpackhi_ps(a, b); // [c0 c1 d0 d1] const __m128 tmp4 = _mm_unpacklo_ps(e, f); // [a0 a1 b0 b1] const __m128 c = _mm_loadu_ps(&m_data[m_indexes[2]]); const __m128 g = _mm_loadu_ps(&m_data[4 + m_indexes[2]]); const __m128 d = _mm_loadu_ps(&m_data[m_indexes[3]]); const __m128 h = _mm_loadu_ps(&m_data[4 + m_indexes[3]]); const __m128 tmp1 = _mm_unpacklo_ps(c, d); // [a2 a3 b2 b3] v0.data() = _mm_movelh_ps(tmp0, tmp1); v1.data() = _mm_movehl_ps(tmp1, tmp0); const __m128 tmp3 = _mm_unpackhi_ps(c, d); // [c2 c3 d2 d3] v2.data() = _mm_movelh_ps(tmp2, tmp3); v3.data() = _mm_movehl_ps(tmp3, tmp2); const __m128 tmp5 = _mm_unpacklo_ps(g, h); // [a2 a3 b2 b3] v4.data() = _mm_movelh_ps(tmp4, tmp5); v5.data() = _mm_movehl_ps(tmp5, tmp4); } /*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(float_v &v0, float_v &v1, float_v &v2, float_v &v3, float_v &v4, float_v &v5, float_v &v6) const/*{{{*/ { const __m128 a = _mm_loadu_ps(&m_data[m_indexes[0]]); const __m128 e = _mm_loadu_ps(&m_data[4 + m_indexes[0]]); const __m128 b = _mm_loadu_ps(&m_data[m_indexes[1]]); const __m128 f = _mm_loadu_ps(&m_data[4 + m_indexes[1]]); const __m128 tmp0 = _mm_unpacklo_ps(a, b); // [a0 a1 b0 b1] const __m128 tmp2 = _mm_unpackhi_ps(a, b); // [c0 c1 d0 d1] const __m128 tmp4 = _mm_unpacklo_ps(e, f); // [a0 a1 b0 b1] const __m128 tmp6 = _mm_unpackhi_ps(e, f); // [c0 c1 d0 d1] const __m128 c = _mm_loadu_ps(&m_data[m_indexes[2]]); const __m128 g = _mm_loadu_ps(&m_data[4 + m_indexes[2]]); const __m128 d = _mm_loadu_ps(&m_data[m_indexes[3]]); const __m128 h = _mm_loadu_ps(&m_data[4 + m_indexes[3]]); const __m128 tmp1 = _mm_unpacklo_ps(c, d); // [a2 a3 b2 b3] v0.data() = _mm_movelh_ps(tmp0, tmp1); 
v1.data() = _mm_movehl_ps(tmp1, tmp0); const __m128 tmp3 = _mm_unpackhi_ps(c, d); // [c2 c3 d2 d3] v2.data() = _mm_movelh_ps(tmp2, tmp3); v3.data() = _mm_movehl_ps(tmp3, tmp2); const __m128 tmp5 = _mm_unpacklo_ps(g, h); // [a2 a3 b2 b3] v4.data() = _mm_movelh_ps(tmp4, tmp5); v5.data() = _mm_movehl_ps(tmp5, tmp4); const __m128 tmp7 = _mm_unpackhi_ps(g, h); // [c2 c3 d2 d3] v6.data() = _mm_movelh_ps(tmp6, tmp7); } /*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(float_v &v0, float_v &v1, float_v &v2, float_v &v3, float_v &v4, float_v &v5, float_v &v6, float_v &v7) const/*{{{*/ { const __m128 a = _mm_loadu_ps(&m_data[m_indexes[0]]); const __m128 e = _mm_loadu_ps(&m_data[4 + m_indexes[0]]); const __m128 b = _mm_loadu_ps(&m_data[m_indexes[1]]); const __m128 f = _mm_loadu_ps(&m_data[4 + m_indexes[1]]); const __m128 tmp0 = _mm_unpacklo_ps(a, b); // [a0 a1 b0 b1] const __m128 tmp2 = _mm_unpackhi_ps(a, b); // [c0 c1 d0 d1] const __m128 tmp4 = _mm_unpacklo_ps(e, f); // [a0 a1 b0 b1] const __m128 tmp6 = _mm_unpackhi_ps(e, f); // [c0 c1 d0 d1] const __m128 c = _mm_loadu_ps(&m_data[m_indexes[2]]); const __m128 g = _mm_loadu_ps(&m_data[4 + m_indexes[2]]); const __m128 d = _mm_loadu_ps(&m_data[m_indexes[3]]); const __m128 h = _mm_loadu_ps(&m_data[4 + m_indexes[3]]); const __m128 tmp1 = _mm_unpacklo_ps(c, d); // [a2 a3 b2 b3] v0.data() = _mm_movelh_ps(tmp0, tmp1); v1.data() = _mm_movehl_ps(tmp1, tmp0); const __m128 tmp3 = _mm_unpackhi_ps(c, d); // [c2 c3 d2 d3] v2.data() = _mm_movelh_ps(tmp2, tmp3); v3.data() = _mm_movehl_ps(tmp3, tmp2); const __m128 tmp5 = _mm_unpacklo_ps(g, h); // [a2 a3 b2 b3] v4.data() = _mm_movelh_ps(tmp4, tmp5); v5.data() = _mm_movehl_ps(tmp5, tmp4); const __m128 tmp7 = _mm_unpackhi_ps(g, h); // [c2 c3 d2 d3] v6.data() = _mm_movelh_ps(tmp6, tmp7); v7.data() = _mm_movehl_ps(tmp7, tmp6); }/*}}}*/ static inline void _sse_deinterleave_double(const double *VC_RESTRICT data, const uint_v &indexes, double_v &v0, double_v &v1)/*{{{*/ { const __m128d a = _mm_loadu_pd(&data[indexes[0]]); const __m128d b = _mm_loadu_pd(&data[indexes[1]]); v0.data() = _mm_unpacklo_pd(a, b); v1.data() = _mm_unpackhi_pd(a, b); }/*}}}*/ template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(double_v &v0, double_v &v1) const {/*{{{*/ _sse_deinterleave_double(m_data, m_indexes, v0, v1); } /*}}}*/ template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(double_v &v0, double_v &v1,/*{{{*/ double_v &v2) const { v2.gather(m_data + 2, m_indexes); _sse_deinterleave_double(m_data, m_indexes, v0, v1); } /*}}}*/ template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(double_v &v0, double_v &v1,/*{{{*/ double_v &v2, double_v &v3) const { _sse_deinterleave_double(m_data , m_indexes, v0, v1); _sse_deinterleave_double(m_data + 2, m_indexes, v2, v3); } /*}}}*/ template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(double_v &v0, double_v &v1,/*{{{*/ double_v &v2, double_v &v3, double_v &v4) const { v4.gather(m_data + 4, m_indexes); _sse_deinterleave_double(m_data , m_indexes, v0, v1); _sse_deinterleave_double(m_data + 2, m_indexes, v2, v3); } /*}}}*/ template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(double_v &v0, double_v &v1,/*{{{*/ double_v &v2, double_v &v3, double_v &v4, double_v &v5) const { _sse_deinterleave_double(m_data , m_indexes, v0, v1); _sse_deinterleave_double(m_data + 2, m_indexes, v2, v3); _sse_deinterleave_double(m_data + 4, m_indexes, v4, v5); } /*}}}*/ template<> Vc_ALWAYS_INLINE void 
InterleavedMemoryAccessBase::deinterleave(double_v &v0, double_v &v1,/*{{{*/ double_v &v2, double_v &v3, double_v &v4, double_v &v5, double_v &v6) const { v6.gather(m_data + 6, m_indexes); _sse_deinterleave_double(m_data , m_indexes, v0, v1); _sse_deinterleave_double(m_data + 2, m_indexes, v2, v3); _sse_deinterleave_double(m_data + 4, m_indexes, v4, v5); } /*}}}*/ template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(double_v &v0, double_v &v1,/*{{{*/ double_v &v2, double_v &v3, double_v &v4, double_v &v5, double_v &v6, double_v &v7) const { _sse_deinterleave_double(m_data , m_indexes, v0, v1); _sse_deinterleave_double(m_data + 2, m_indexes, v2, v3); _sse_deinterleave_double(m_data + 4, m_indexes, v4, v5); _sse_deinterleave_double(m_data + 6, m_indexes, v6, v7); }/*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(short_v &v0, short_v &v1) const {/*{{{*/ const __m128i a = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[0]])); const __m128i b = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[1]])); const __m128i c = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[2]])); const __m128i d = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[3]])); const __m128i e = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[4]])); const __m128i f = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[5]])); const __m128i g = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[6]])); const __m128i h = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[7]])); const __m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 const __m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 const __m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 const __m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); } /*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(short_v &v0, short_v &v1,/*{{{*/ short_v &v2) const { const __m128i a = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[0]])); const __m128i b = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[1]])); const __m128i c = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[2]])); const __m128i d = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[3]])); const __m128i e = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[4]])); const __m128i f = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[5]])); const __m128i g = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[6]])); const __m128i h = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[7]])); const __m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 const __m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 const __m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 const __m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); v2.data() = 
_mm_unpacklo_epi16(tmp6, tmp7); }/*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(short_v &v0, short_v &v1,/*{{{*/ short_v &v2, short_v &v3) const { const __m128i a = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[0]])); const __m128i b = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[1]])); const __m128i c = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[2]])); const __m128i d = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[3]])); const __m128i e = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[4]])); const __m128i f = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[5]])); const __m128i g = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[6]])); const __m128i h = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[7]])); const __m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 const __m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 const __m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 const __m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); v3.data() = _mm_unpackhi_epi16(tmp6, tmp7); }/*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(short_v &v0, short_v &v1,/*{{{*/ short_v &v2, short_v &v3, short_v &v4) const { const __m128i a = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[0]])); const __m128i b = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[1]])); const __m128i c = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[2]])); const __m128i d = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[3]])); const __m128i e = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[4]])); const __m128i f = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[5]])); const __m128i g = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[6]])); const __m128i h = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[7]])); const __m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 const __m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 const __m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 const __m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 const __m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4 const __m128i tmp11 = _mm_unpackhi_epi16(c, g); // e1 e5 f1 f5 g1 g5 h1 h5 const __m128i tmp12 = _mm_unpackhi_epi16(b, f); // e2 e6 f2 f6 g2 g6 h2 h6 const __m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7 const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 const __m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6 const __m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7 v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); 
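// (three levels of _mm_unpacklo/hi_epi16 effectively transpose the gathered
//  8x8 block of 16-bit values; each final unpack yields one member vector)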
v3.data() = _mm_unpackhi_epi16(tmp6, tmp7); v4.data() = _mm_unpacklo_epi16(tmp8, tmp9); }/*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(short_v &v0, short_v &v1,/*{{{*/ short_v &v2, short_v &v3, short_v &v4, short_v &v5) const { const __m128i a = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[0]])); const __m128i b = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[1]])); const __m128i c = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[2]])); const __m128i d = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[3]])); const __m128i e = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[4]])); const __m128i f = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[5]])); const __m128i g = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[6]])); const __m128i h = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[7]])); const __m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 const __m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 const __m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 const __m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 const __m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4 const __m128i tmp11 = _mm_unpackhi_epi16(c, g); // e1 e5 f1 f5 g1 g5 h1 h5 const __m128i tmp12 = _mm_unpackhi_epi16(b, f); // e2 e6 f2 f6 g2 g6 h2 h6 const __m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7 const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 const __m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6 const __m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7 v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); v3.data() = _mm_unpackhi_epi16(tmp6, tmp7); v4.data() = _mm_unpacklo_epi16(tmp8, tmp9); v5.data() = _mm_unpackhi_epi16(tmp8, tmp9); }/*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(short_v &v0, short_v &v1,/*{{{*/ short_v &v2, short_v &v3, short_v &v4, short_v &v5, short_v &v6) const { const __m128i a = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[0]])); const __m128i b = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[1]])); const __m128i c = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[2]])); const __m128i d = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[3]])); const __m128i e = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[4]])); const __m128i f = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[5]])); const __m128i g = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[6]])); const __m128i h = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[7]])); const __m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 const __m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 const __m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 const __m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 const __m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4 const __m128i tmp11 = _mm_unpackhi_epi16(c, g); // e1 e5 f1 f5 g1 g5 h1 h5 const __m128i tmp12 = _mm_unpackhi_epi16(b, f); // e2 e6 f2 f6 g2 g6 h2 h6 const __m128i tmp13 = 
_mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7 const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 const __m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6 const __m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7 const __m128i tmp14 = _mm_unpackhi_epi16(tmp10, tmp11); // g0 g2 g4 g6 h0 h2 h4 h6 const __m128i tmp15 = _mm_unpackhi_epi16(tmp12, tmp13); // g1 g3 g5 g7 h1 h3 h5 h7 v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); v3.data() = _mm_unpackhi_epi16(tmp6, tmp7); v4.data() = _mm_unpacklo_epi16(tmp8, tmp9); v5.data() = _mm_unpackhi_epi16(tmp8, tmp9); v6.data() = _mm_unpacklo_epi16(tmp14, tmp15); }/*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(short_v &v0, short_v &v1,/*{{{*/ short_v &v2, short_v &v3, short_v &v4, short_v &v5, short_v &v6, short_v &v7) const { const __m128i a = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[0]])); const __m128i b = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[1]])); const __m128i c = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[2]])); const __m128i d = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[3]])); const __m128i e = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[4]])); const __m128i f = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[5]])); const __m128i g = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[6]])); const __m128i h = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[7]])); const __m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 const __m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 const __m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 const __m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 const __m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4 const __m128i tmp11 = _mm_unpackhi_epi16(c, g); // e1 e5 f1 f5 g1 g5 h1 h5 const __m128i tmp12 = _mm_unpackhi_epi16(b, f); // e2 e6 f2 f6 g2 g6 h2 h6 const __m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7 const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 const __m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6 const __m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7 const __m128i tmp14 = _mm_unpackhi_epi16(tmp10, tmp11); // g0 g2 g4 g6 h0 h2 h4 h6 const __m128i tmp15 = _mm_unpackhi_epi16(tmp12, tmp13); // g1 g3 g5 g7 h1 h3 h5 h7 v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); v3.data() = _mm_unpackhi_epi16(tmp6, tmp7); v4.data() = _mm_unpacklo_epi16(tmp8, tmp9); v5.data() = _mm_unpackhi_epi16(tmp8, tmp9); v6.data() = _mm_unpacklo_epi16(tmp14, tmp15); v7.data() = _mm_unpackhi_epi16(tmp14, tmp15); }/*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(sfloat_v &v0, sfloat_v &v1) const/*{{{*/ { const __m128 i0a = 
_mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[0]]))); const __m128 i1a = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[1]]))); const __m128 i2a = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[2]]))); const __m128 i3a = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[3]]))); const __m128 i4a = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[4]]))); const __m128 i5a = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[5]]))); const __m128 i6a = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[6]]))); const __m128 i7a = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[7]]))); const __m128 ab01 = _mm_unpacklo_ps(i0a, i1a); // [a0 a1 b0 b1] const __m128 ab23 = _mm_unpacklo_ps(i2a, i3a); // [a2 a3 b2 b3] const __m128 ab45 = _mm_unpacklo_ps(i4a, i5a); // [a4 a5 b4 b5] const __m128 ab67 = _mm_unpacklo_ps(i6a, i7a); // [a6 a7 b6 b7] v0.data() = Vc::SSE::M256::create(_mm_movelh_ps(ab01, ab23), _mm_movelh_ps(ab45, ab67)); v1.data() = Vc::SSE::M256::create(_mm_movehl_ps(ab23, ab01), _mm_movehl_ps(ab67, ab45)); } /*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2) const/*{{{*/ { const __m128 i0a = _mm_loadu_ps(&m_data[m_indexes[0]]); const __m128 i1a = _mm_loadu_ps(&m_data[m_indexes[1]]); const __m128 i2a = _mm_loadu_ps(&m_data[m_indexes[2]]); const __m128 i3a = _mm_loadu_ps(&m_data[m_indexes[3]]); const __m128 i4a = _mm_loadu_ps(&m_data[m_indexes[4]]); const __m128 i5a = _mm_loadu_ps(&m_data[m_indexes[5]]); const __m128 i6a = _mm_loadu_ps(&m_data[m_indexes[6]]); const __m128 i7a = _mm_loadu_ps(&m_data[m_indexes[7]]); const __m128 ab01 = _mm_unpacklo_ps(i0a, i1a); // [a0 a1 b0 b1] const __m128 ab23 = _mm_unpacklo_ps(i2a, i3a); // [a2 a3 b2 b3] const __m128 ab45 = _mm_unpacklo_ps(i4a, i5a); // [a4 a5 b4 b5] const __m128 ab67 = _mm_unpacklo_ps(i6a, i7a); // [a6 a7 b6 b7] v0.data() = Vc::SSE::M256::create(_mm_movelh_ps(ab01, ab23), _mm_movelh_ps(ab45, ab67)); v1.data() = Vc::SSE::M256::create(_mm_movehl_ps(ab23, ab01), _mm_movehl_ps(ab67, ab45)); const __m128 cd01 = _mm_unpackhi_ps(i0a, i1a); // [c0 c1 d0 d1] const __m128 cd23 = _mm_unpackhi_ps(i2a, i3a); // [c2 c3 d2 d3] const __m128 cd45 = _mm_unpackhi_ps(i4a, i5a); // [c4 c5 d4 d5] const __m128 cd67 = _mm_unpackhi_ps(i6a, i7a); // [c6 c7 d6 d7] v2.data() = Vc::SSE::M256::create(_mm_movelh_ps(cd01, cd23), _mm_movelh_ps(cd45, cd67)); } /*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2, sfloat_v &v3) const/*{{{*/ { const __m128 i0a = _mm_loadu_ps(&m_data[m_indexes[0]]); const __m128 i1a = _mm_loadu_ps(&m_data[m_indexes[1]]); const __m128 i2a = _mm_loadu_ps(&m_data[m_indexes[2]]); const __m128 i3a = _mm_loadu_ps(&m_data[m_indexes[3]]); const __m128 i4a = _mm_loadu_ps(&m_data[m_indexes[4]]); const __m128 i5a = _mm_loadu_ps(&m_data[m_indexes[5]]); const __m128 i6a = _mm_loadu_ps(&m_data[m_indexes[6]]); const __m128 i7a = _mm_loadu_ps(&m_data[m_indexes[7]]); const __m128 ab01 = _mm_unpacklo_ps(i0a, i1a); // [a0 a1 b0 b1] const __m128 ab23 = _mm_unpacklo_ps(i2a, i3a); // [a2 a3 b2 b3] const __m128 ab45 = _mm_unpacklo_ps(i4a, i5a); // [a4 a5 b4 b5] const __m128 ab67 = _mm_unpacklo_ps(i6a, i7a); // [a6 a7 b6 b7] v0.data() = Vc::SSE::M256::create(_mm_movelh_ps(ab01, ab23), _mm_movelh_ps(ab45, ab67)); v1.data() = Vc::SSE::M256::create(_mm_movehl_ps(ab23, ab01), _mm_movehl_ps(ab67, ab45)); const __m128 cd01 = _mm_unpackhi_ps(i0a, i1a); 
// [c0 c1 d0 d1] const __m128 cd23 = _mm_unpackhi_ps(i2a, i3a); // [c2 c3 d2 d3] const __m128 cd45 = _mm_unpackhi_ps(i4a, i5a); // [c4 c5 d4 d5] const __m128 cd67 = _mm_unpackhi_ps(i6a, i7a); // [c6 c7 d6 d7] v2.data() = Vc::SSE::M256::create(_mm_movelh_ps(cd01, cd23), _mm_movelh_ps(cd45, cd67)); v3.data() = Vc::SSE::M256::create(_mm_movehl_ps(cd23, cd01), _mm_movehl_ps(cd67, cd45)); } /*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2, sfloat_v &v3, sfloat_v &v4) const/*{{{*/ { const __m128 i0a = _mm_loadu_ps(&m_data[m_indexes[0]]); const __m128 i1a = _mm_loadu_ps(&m_data[m_indexes[1]]); const __m128 i2a = _mm_loadu_ps(&m_data[m_indexes[2]]); const __m128 i3a = _mm_loadu_ps(&m_data[m_indexes[3]]); const __m128 i4a = _mm_loadu_ps(&m_data[m_indexes[4]]); const __m128 i5a = _mm_loadu_ps(&m_data[m_indexes[5]]); const __m128 i6a = _mm_loadu_ps(&m_data[m_indexes[6]]); const __m128 i7a = _mm_loadu_ps(&m_data[m_indexes[7]]); v4.gather(m_data + float_v::Size, m_indexes); const __m128 ab01 = _mm_unpacklo_ps(i0a, i1a); // [a0 a1 b0 b1] const __m128 ab23 = _mm_unpacklo_ps(i2a, i3a); // [a2 a3 b2 b3] const __m128 ab45 = _mm_unpacklo_ps(i4a, i5a); // [a4 a5 b4 b5] const __m128 ab67 = _mm_unpacklo_ps(i6a, i7a); // [a6 a7 b6 b7] v0.data() = Vc::SSE::M256::create(_mm_movelh_ps(ab01, ab23), _mm_movelh_ps(ab45, ab67)); v1.data() = Vc::SSE::M256::create(_mm_movehl_ps(ab23, ab01), _mm_movehl_ps(ab67, ab45)); const __m128 cd01 = _mm_unpackhi_ps(i0a, i1a); // [c0 c1 d0 d1] const __m128 cd23 = _mm_unpackhi_ps(i2a, i3a); // [c2 c3 d2 d3] const __m128 cd45 = _mm_unpackhi_ps(i4a, i5a); // [c4 c5 d4 d5] const __m128 cd67 = _mm_unpackhi_ps(i6a, i7a); // [c6 c7 d6 d7] v2.data() = Vc::SSE::M256::create(_mm_movelh_ps(cd01, cd23), _mm_movelh_ps(cd45, cd67)); v3.data() = Vc::SSE::M256::create(_mm_movehl_ps(cd23, cd01), _mm_movehl_ps(cd67, cd45)); } /*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2, sfloat_v &v3, sfloat_v &v4, sfloat_v &v5) const/*{{{*/ { const __m128 i0a = _mm_loadu_ps(&m_data[m_indexes[0]]); const __m128 i1a = _mm_loadu_ps(&m_data[m_indexes[1]]); const __m128 i2a = _mm_loadu_ps(&m_data[m_indexes[2]]); const __m128 i3a = _mm_loadu_ps(&m_data[m_indexes[3]]); const __m128 i4a = _mm_loadu_ps(&m_data[m_indexes[4]]); const __m128 i5a = _mm_loadu_ps(&m_data[m_indexes[5]]); const __m128 i6a = _mm_loadu_ps(&m_data[m_indexes[6]]); const __m128 i7a = _mm_loadu_ps(&m_data[m_indexes[7]]); const __m128 i0b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[0] + float_v::Size]))); const __m128 i1b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[1] + float_v::Size]))); const __m128 i2b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[2] + float_v::Size]))); const __m128 i3b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[3] + float_v::Size]))); const __m128 i4b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[4] + float_v::Size]))); const __m128 i5b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[5] + float_v::Size]))); const __m128 i6b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[6] + float_v::Size]))); const __m128 i7b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[7] + float_v::Size]))); const __m128 ab01 = _mm_unpacklo_ps(i0a, i1a); // [a0 a1 b0 b1] const __m128 ab23 = _mm_unpacklo_ps(i2a, i3a); // [a2 a3 b2 b3] const __m128 ab45 = _mm_unpacklo_ps(i4a, i5a); // [a4 a5 b4 b5] 
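// The sfloat_v overloads (8-wide float emulated as two __m128 halves) reuse the classic SSE
// 4x4 float transpose building block: _mm_unpacklo_ps/_mm_unpackhi_ps interleave elements
// from two structs, and _mm_movelh_ps/_mm_movehl_ps then combine those pairs into one
// member's vector. Indexes 0-3 fill the low half of the M256 result, indexes 4-7 the high half.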
const __m128 ab67 = _mm_unpacklo_ps(i6a, i7a); // [a6 a7 b6 b7] v0.data() = Vc::SSE::M256::create(_mm_movelh_ps(ab01, ab23), _mm_movelh_ps(ab45, ab67)); v1.data() = Vc::SSE::M256::create(_mm_movehl_ps(ab23, ab01), _mm_movehl_ps(ab67, ab45)); const __m128 cd01 = _mm_unpackhi_ps(i0a, i1a); // [c0 c1 d0 d1] const __m128 cd23 = _mm_unpackhi_ps(i2a, i3a); // [c2 c3 d2 d3] const __m128 cd45 = _mm_unpackhi_ps(i4a, i5a); // [c4 c5 d4 d5] const __m128 cd67 = _mm_unpackhi_ps(i6a, i7a); // [c6 c7 d6 d7] v2.data() = Vc::SSE::M256::create(_mm_movelh_ps(cd01, cd23), _mm_movelh_ps(cd45, cd67)); v3.data() = Vc::SSE::M256::create(_mm_movehl_ps(cd23, cd01), _mm_movehl_ps(cd67, cd45)); const __m128 ef01 = _mm_unpacklo_ps(i0b, i1b); // [e0 e1 f0 f1] const __m128 ef23 = _mm_unpacklo_ps(i2b, i3b); // [e2 e3 f2 f3] const __m128 ef45 = _mm_unpacklo_ps(i4b, i5b); // [e4 e5 f4 f5] const __m128 ef67 = _mm_unpacklo_ps(i6b, i7b); // [e6 e7 f6 f7] v4.data() = Vc::SSE::M256::create(_mm_movelh_ps(ef01, ef23), _mm_movelh_ps(ef45, ef67)); v5.data() = Vc::SSE::M256::create(_mm_movehl_ps(ef23, ef01), _mm_movehl_ps(ef67, ef45)); } /*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2, sfloat_v &v3, sfloat_v &v4, sfloat_v &v5, sfloat_v &v6) const/*{{{*/ { const __m128 i0a = _mm_loadu_ps(&m_data[m_indexes[0]]); const __m128 i0b = _mm_loadu_ps(&m_data[m_indexes[0] + float_v::Size]); const __m128 i1a = _mm_loadu_ps(&m_data[m_indexes[1]]); const __m128 i1b = _mm_loadu_ps(&m_data[m_indexes[1] + float_v::Size]); const __m128 i2a = _mm_loadu_ps(&m_data[m_indexes[2]]); const __m128 i2b = _mm_loadu_ps(&m_data[m_indexes[2] + float_v::Size]); const __m128 i3a = _mm_loadu_ps(&m_data[m_indexes[3]]); const __m128 i3b = _mm_loadu_ps(&m_data[m_indexes[3] + float_v::Size]); const __m128 i4a = _mm_loadu_ps(&m_data[m_indexes[4]]); const __m128 i4b = _mm_loadu_ps(&m_data[m_indexes[4] + float_v::Size]); const __m128 i5a = _mm_loadu_ps(&m_data[m_indexes[5]]); const __m128 i5b = _mm_loadu_ps(&m_data[m_indexes[5] + float_v::Size]); const __m128 i6a = _mm_loadu_ps(&m_data[m_indexes[6]]); const __m128 i6b = _mm_loadu_ps(&m_data[m_indexes[6] + float_v::Size]); const __m128 i7a = _mm_loadu_ps(&m_data[m_indexes[7]]); const __m128 i7b = _mm_loadu_ps(&m_data[m_indexes[7] + float_v::Size]); const __m128 ab01 = _mm_unpacklo_ps(i0a, i1a); // [a0 a1 b0 b1] const __m128 ab23 = _mm_unpacklo_ps(i2a, i3a); // [a2 a3 b2 b3] const __m128 ab45 = _mm_unpacklo_ps(i4a, i5a); // [a4 a5 b4 b5] const __m128 ab67 = _mm_unpacklo_ps(i6a, i7a); // [a6 a7 b6 b7] v0.data() = Vc::SSE::M256::create(_mm_movelh_ps(ab01, ab23), _mm_movelh_ps(ab45, ab67)); v1.data() = Vc::SSE::M256::create(_mm_movehl_ps(ab23, ab01), _mm_movehl_ps(ab67, ab45)); const __m128 cd01 = _mm_unpackhi_ps(i0a, i1a); // [c0 c1 d0 d1] const __m128 cd23 = _mm_unpackhi_ps(i2a, i3a); // [c2 c3 d2 d3] const __m128 cd45 = _mm_unpackhi_ps(i4a, i5a); // [c4 c5 d4 d5] const __m128 cd67 = _mm_unpackhi_ps(i6a, i7a); // [c6 c7 d6 d7] v2.data() = Vc::SSE::M256::create(_mm_movelh_ps(cd01, cd23), _mm_movelh_ps(cd45, cd67)); v3.data() = Vc::SSE::M256::create(_mm_movehl_ps(cd23, cd01), _mm_movehl_ps(cd67, cd45)); const __m128 ef01 = _mm_unpacklo_ps(i0b, i1b); // [e0 e1 f0 f1] const __m128 ef23 = _mm_unpacklo_ps(i2b, i3b); // [e2 e3 f2 f3] const __m128 ef45 = _mm_unpacklo_ps(i4b, i5b); // [e4 e5 f4 f5] const __m128 ef67 = _mm_unpacklo_ps(i6b, i7b); // [e6 e7 f6 f7] v4.data() = Vc::SSE::M256::create(_mm_movelh_ps(ef01, ef23), _mm_movelh_ps(ef45, ef67)); v5.data() = 
Vc::SSE::M256::create(_mm_movehl_ps(ef23, ef01), _mm_movehl_ps(ef67, ef45)); const __m128 gh01 = _mm_unpackhi_ps(i0b, i1b); // [g0 g1 h0 h1] const __m128 gh23 = _mm_unpackhi_ps(i2b, i3b); // [g2 g3 h2 h3] const __m128 gh45 = _mm_unpackhi_ps(i4b, i5b); // [g4 g5 h4 h5] const __m128 gh67 = _mm_unpackhi_ps(i6b, i7b); // [g6 g7 h6 h7] v6.data() = Vc::SSE::M256::create(_mm_movelh_ps(gh01, gh23), _mm_movelh_ps(gh45, gh67)); } /*}}}*/ template<> inline void InterleavedMemoryAccessBase::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2, sfloat_v &v3, sfloat_v &v4, sfloat_v &v5, sfloat_v &v6, sfloat_v &v7) const/*{{{*/ { const __m128 i0a = _mm_loadu_ps(&m_data[m_indexes[0]]); const __m128 i0b = _mm_loadu_ps(&m_data[m_indexes[0] + float_v::Size]); const __m128 i1a = _mm_loadu_ps(&m_data[m_indexes[1]]); const __m128 i1b = _mm_loadu_ps(&m_data[m_indexes[1] + float_v::Size]); const __m128 i2a = _mm_loadu_ps(&m_data[m_indexes[2]]); const __m128 i2b = _mm_loadu_ps(&m_data[m_indexes[2] + float_v::Size]); const __m128 i3a = _mm_loadu_ps(&m_data[m_indexes[3]]); const __m128 i3b = _mm_loadu_ps(&m_data[m_indexes[3] + float_v::Size]); const __m128 i4a = _mm_loadu_ps(&m_data[m_indexes[4]]); const __m128 i4b = _mm_loadu_ps(&m_data[m_indexes[4] + float_v::Size]); const __m128 i5a = _mm_loadu_ps(&m_data[m_indexes[5]]); const __m128 i5b = _mm_loadu_ps(&m_data[m_indexes[5] + float_v::Size]); const __m128 i6a = _mm_loadu_ps(&m_data[m_indexes[6]]); const __m128 i6b = _mm_loadu_ps(&m_data[m_indexes[6] + float_v::Size]); const __m128 i7a = _mm_loadu_ps(&m_data[m_indexes[7]]); const __m128 i7b = _mm_loadu_ps(&m_data[m_indexes[7] + float_v::Size]); const __m128 ab01 = _mm_unpacklo_ps(i0a, i1a); // [a0 a1 b0 b1] const __m128 ab23 = _mm_unpacklo_ps(i2a, i3a); // [a2 a3 b2 b3] const __m128 ab45 = _mm_unpacklo_ps(i4a, i5a); // [a4 a5 b4 b5] const __m128 ab67 = _mm_unpacklo_ps(i6a, i7a); // [a6 a7 b6 b7] v0.data() = Vc::SSE::M256::create(_mm_movelh_ps(ab01, ab23), _mm_movelh_ps(ab45, ab67)); v1.data() = Vc::SSE::M256::create(_mm_movehl_ps(ab23, ab01), _mm_movehl_ps(ab67, ab45)); const __m128 cd01 = _mm_unpackhi_ps(i0a, i1a); // [c0 c1 d0 d1] const __m128 cd23 = _mm_unpackhi_ps(i2a, i3a); // [c2 c3 d2 d3] const __m128 cd45 = _mm_unpackhi_ps(i4a, i5a); // [c4 c5 d4 d5] const __m128 cd67 = _mm_unpackhi_ps(i6a, i7a); // [c6 c7 d6 d7] v2.data() = Vc::SSE::M256::create(_mm_movelh_ps(cd01, cd23), _mm_movelh_ps(cd45, cd67)); v3.data() = Vc::SSE::M256::create(_mm_movehl_ps(cd23, cd01), _mm_movehl_ps(cd67, cd45)); const __m128 ef01 = _mm_unpacklo_ps(i0b, i1b); // [e0 e1 f0 f1] const __m128 ef23 = _mm_unpacklo_ps(i2b, i3b); // [e2 e3 f2 f3] const __m128 ef45 = _mm_unpacklo_ps(i4b, i5b); // [e4 e5 f4 f5] const __m128 ef67 = _mm_unpacklo_ps(i6b, i7b); // [e6 e7 f6 f7] v4.data() = Vc::SSE::M256::create(_mm_movelh_ps(ef01, ef23), _mm_movelh_ps(ef45, ef67)); v5.data() = Vc::SSE::M256::create(_mm_movehl_ps(ef23, ef01), _mm_movehl_ps(ef67, ef45)); const __m128 gh01 = _mm_unpackhi_ps(i0b, i1b); // [g0 g1 h0 h1] const __m128 gh23 = _mm_unpackhi_ps(i2b, i3b); // [g2 g3 h2 h3] const __m128 gh45 = _mm_unpackhi_ps(i4b, i5b); // [g4 g5 h4 h5] const __m128 gh67 = _mm_unpackhi_ps(i6b, i7b); // [g6 g7 h6 h7] v6.data() = Vc::SSE::M256::create(_mm_movelh_ps(gh01, gh23), _mm_movelh_ps(gh45, gh67)); v7.data() = Vc::SSE::M256::create(_mm_movehl_ps(gh23, gh01), _mm_movehl_ps(gh67, gh45)); }/*}}}*/ // forward types of equal size - ugly, but it works/*{{{*/ #define _forward(V, V2) \ template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V 
&v0, V &v1) const { \ reinterpret_cast *>(this)->deinterleave(reinterpret_cast(v0), reinterpret_cast(v1)); \ } \ template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2) const { \ reinterpret_cast *>(this)->deinterleave(reinterpret_cast(v0), reinterpret_cast(v1), \ reinterpret_cast(v2)); \ } \ template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2, V &v3) const { \ reinterpret_cast *>(this)->deinterleave(reinterpret_cast(v0), reinterpret_cast(v1), \ reinterpret_cast(v2), reinterpret_cast(v3)); \ } \ template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2, V &v3, \ V &v4) const { \ reinterpret_cast *>(this)->deinterleave(reinterpret_cast(v0), reinterpret_cast(v1), \ reinterpret_cast(v2), reinterpret_cast(v3), reinterpret_cast(v4)); \ } \ template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2, V &v3, \ V &v4, V &v5) const { \ reinterpret_cast *>(this)->deinterleave(reinterpret_cast(v0), reinterpret_cast(v1), \ reinterpret_cast(v2), reinterpret_cast(v3), reinterpret_cast(v4), \ reinterpret_cast(v5)); \ } \ template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2, V &v3, \ V &v4, V &v5, V &v6) const { \ reinterpret_cast *>(this)->deinterleave(reinterpret_cast(v0), reinterpret_cast(v1), \ reinterpret_cast(v2), reinterpret_cast(v3), reinterpret_cast(v4), \ reinterpret_cast(v5), reinterpret_cast(v6)); \ } \ template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2, V &v3, \ V &v4, V &v5, V &v6, V &v7) const { \ reinterpret_cast *>(this)->deinterleave(reinterpret_cast(v0), reinterpret_cast(v1), \ reinterpret_cast(v2), reinterpret_cast(v3), reinterpret_cast(v4), \ reinterpret_cast(v5), reinterpret_cast(v6), reinterpret_cast(v7)); \ } _forward( int_v, float_v) _forward(uint_v, float_v) _forward(ushort_v, short_v) #undef _forward/*}}}*/ } // namespace Common } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "undomacros.h" #endif // VC_SSE_INTERLEAVEDMEMORY_TCC // vim: foldmethod=marker Vc-0.7.4/sse/intrinsics.h000066400000000000000000000577641233512346000152440ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef SSE_INTRINSICS_H #define SSE_INTRINSICS_H #include "../common/windows_fix_intrin.h" // The GCC xxxintrin.h headers do not make sure that the intrinsics have C linkage. This not really // a problem, unless there is another place where the exact same functions are declared. Then the // linkage must be the same, otherwise it won't compile. Such a case occurs on Windows, where the // intrin.h header (included indirectly via unistd.h) declares many SSE intrinsics again. 
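// Wrapping the includes in an explicit extern "C" block forces C linkage for these
// declarations, so a later redeclaration of the same intrinsics with C linkage (as happens
// with MSVC's intrin.h on Windows) no longer clashes with a C++-linkage declaration.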
extern "C" { // MMX #include // SSE #include // SSE2 #include } #include "../common/fix_clang_emmintrin.h" #include "const_data.h" #include #include "macros.h" #ifdef __3dNOW__ extern "C" { #include } #endif /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace SSE { enum VectorAlignmentEnum { VectorAlignment = 16 }; #if defined(VC_GCC) && VC_GCC < 0x40600 && !defined(VC_DONT_FIX_SSE_SHIFT) static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi16(__m128i a, __m128i count) { __asm__("psllw %1,%0" : "+x"(a) : "x"(count)); return a; } static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi32(__m128i a, __m128i count) { __asm__("pslld %1,%0" : "+x"(a) : "x"(count)); return a; } static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi64(__m128i a, __m128i count) { __asm__("psllq %1,%0" : "+x"(a) : "x"(count)); return a; } static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi16(__m128i a, __m128i count) { __asm__("psrlw %1,%0" : "+x"(a) : "x"(count)); return a; } static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi32(__m128i a, __m128i count) { __asm__("psrld %1,%0" : "+x"(a) : "x"(count)); return a; } static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi64(__m128i a, __m128i count) { __asm__("psrlq %1,%0" : "+x"(a) : "x"(count)); return a; } #endif #ifdef VC_GCC // Redefine the mul/add/sub intrinsics to use GCC-specific operators instead of builtin // functions. This way the fp-contraction optimization step kicks in and creates FMAs! :) static Vc_INTRINSIC Vc_CONST __m128d _mm_mul_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) * static_cast<__v2df>(b)); } static Vc_INTRINSIC Vc_CONST __m128d _mm_add_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) + static_cast<__v2df>(b)); } static Vc_INTRINSIC Vc_CONST __m128d _mm_sub_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) - static_cast<__v2df>(b)); } static Vc_INTRINSIC Vc_CONST __m128 _mm_mul_ps(__m128 a, __m128 b) { return static_cast<__m128 >(static_cast<__v4sf>(a) * static_cast<__v4sf>(b)); } static Vc_INTRINSIC Vc_CONST __m128 _mm_add_ps(__m128 a, __m128 b) { return static_cast<__m128 >(static_cast<__v4sf>(a) + static_cast<__v4sf>(b)); } static Vc_INTRINSIC Vc_CONST __m128 _mm_sub_ps(__m128 a, __m128 b) { return static_cast<__m128 >(static_cast<__v4sf>(a) - static_cast<__v4sf>(b)); } #endif #if defined(VC_GNU_ASM) && !defined(NVALGRIND) static Vc_INTRINSIC __m128i Vc_CONST _mm_setallone() { __m128i r; __asm__("pcmpeqb %0,%0":"=x"(r)); return r; } #else static Vc_INTRINSIC __m128i Vc_CONST _mm_setallone() { __m128i r = _mm_setzero_si128(); return _mm_cmpeq_epi8(r, r); } #endif static Vc_INTRINSIC __m128i Vc_CONST _mm_setallone_si128() { return _mm_setallone(); } static Vc_INTRINSIC __m128d Vc_CONST _mm_setallone_pd() { return _mm_castsi128_pd(_mm_setallone()); } static Vc_INTRINSIC __m128 Vc_CONST _mm_setallone_ps() { return _mm_castsi128_ps(_mm_setallone()); } static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi8 () { return _mm_set1_epi8(1); } static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu8 () { return _mm_setone_epi8(); } static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi16() { return _mm_load_si128(reinterpret_cast(c_general::one16)); } static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu16() { return _mm_setone_epi16(); } static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi32() { return _mm_load_si128(reinterpret_cast(c_general::one32)); } static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu32() { return _mm_setone_epi32(); } static Vc_INTRINSIC __m128 Vc_CONST _mm_setone_ps() { return 
_mm_load_ps(c_general::oneFloat); } static Vc_INTRINSIC __m128d Vc_CONST _mm_setone_pd() { return _mm_load_pd(c_general::oneDouble); } static Vc_INTRINSIC __m128d Vc_CONST _mm_setabsmask_pd() { return _mm_load_pd(reinterpret_cast(c_general::absMaskDouble)); } static Vc_INTRINSIC __m128 Vc_CONST _mm_setabsmask_ps() { return _mm_load_ps(reinterpret_cast(c_general::absMaskFloat)); } static Vc_INTRINSIC __m128d Vc_CONST _mm_setsignmask_pd(){ return _mm_load_pd(reinterpret_cast(c_general::signMaskDouble)); } static Vc_INTRINSIC __m128 Vc_CONST _mm_setsignmask_ps(){ return _mm_load_ps(reinterpret_cast(c_general::signMaskFloat)); } //X static Vc_INTRINSIC __m128i Vc_CONST _mm_setmin_epi8 () { return _mm_slli_epi8 (_mm_setallone_si128(), 7); } static Vc_INTRINSIC __m128i Vc_CONST _mm_setmin_epi16() { return _mm_load_si128(reinterpret_cast(c_general::minShort)); } static Vc_INTRINSIC __m128i Vc_CONST _mm_setmin_epi32() { return _mm_load_si128(reinterpret_cast(c_general::signMaskFloat)); } //X static Vc_INTRINSIC __m128i Vc_CONST _mm_cmplt_epu8 (__m128i a, __m128i b) { return _mm_cmplt_epi8 ( //X _mm_xor_si128(a, _mm_setmin_epi8 ()), _mm_xor_si128(b, _mm_setmin_epi8 ())); } //X static Vc_INTRINSIC __m128i Vc_CONST _mm_cmpgt_epu8 (__m128i a, __m128i b) { return _mm_cmpgt_epi8 ( //X _mm_xor_si128(a, _mm_setmin_epi8 ()), _mm_xor_si128(b, _mm_setmin_epi8 ())); } static Vc_INTRINSIC __m128i Vc_CONST _mm_cmplt_epu16(__m128i a, __m128i b) { return _mm_cmplt_epi16( _mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16())); } static Vc_INTRINSIC __m128i Vc_CONST _mm_cmpgt_epu16(__m128i a, __m128i b) { return _mm_cmpgt_epi16( _mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16())); } static Vc_INTRINSIC __m128i Vc_CONST _mm_cmplt_epu32(__m128i a, __m128i b) { return _mm_cmplt_epi32( _mm_xor_si128(a, _mm_setmin_epi32()), _mm_xor_si128(b, _mm_setmin_epi32())); } static Vc_INTRINSIC __m128i Vc_CONST _mm_cmpgt_epu32(__m128i a, __m128i b) { return _mm_cmpgt_epi32( _mm_xor_si128(a, _mm_setmin_epi32()), _mm_xor_si128(b, _mm_setmin_epi32())); } } // namespace SSE } // namespace Vc /*OUTER_NAMESPACE_END*/ // SSE3 #ifdef VC_IMPL_SSE3 extern "C" { #include } #endif // SSSE3 #ifdef VC_IMPL_SSSE3 extern "C" { #include } #define mm_abs_epi8 _mm_abs_epi8 #define mm_abs_epi16 _mm_abs_epi16 #define mm_abs_epi32 _mm_abs_epi32 #define mm_alignr_epi8 _mm_alignr_epi8 /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace SSE { // not overriding _mm_set1_epi8 because this one should only be used for non-constants static Vc_INTRINSIC __m128i Vc_CONST set1_epi8(int a) { #if defined(VC_GCC) && VC_GCC < 0x40500 return _mm_shuffle_epi8(_mm_cvtsi32_si128(a), _mm_setzero_si128()); #else // GCC 4.5 nows about the pshufb improvement return _mm_set1_epi8(a); #endif } } // namespace SSE } // namespace Vc /*OUTER_NAMESPACE_END*/ #else /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace SSE { static Vc_INTRINSIC __m128i Vc_CONST mm_abs_epi8 (__m128i a) { __m128i negative = _mm_cmplt_epi8 (a, _mm_setzero_si128()); return _mm_add_epi8 (_mm_xor_si128(a, negative), _mm_and_si128(negative, _mm_setone_epi8())); } // positive value: // negative == 0 // a unchanged after xor // 0 >> 31 -> 0 // a + 0 -> a // negative value: // negative == -1 // a xor -1 -> -a - 1 // -1 >> 31 -> 1 // -a - 1 + 1 -> -a static Vc_INTRINSIC __m128i Vc_CONST mm_abs_epi16(__m128i a) { __m128i negative = _mm_cmplt_epi16(a, _mm_setzero_si128()); return _mm_add_epi16(_mm_xor_si128(a, negative), _mm_srli_epi16(negative, 15)); } static Vc_INTRINSIC 
__m128i Vc_CONST mm_abs_epi32(__m128i a) { __m128i negative = _mm_cmplt_epi32(a, _mm_setzero_si128()); return _mm_add_epi32(_mm_xor_si128(a, negative), _mm_srli_epi32(negative, 31)); } static Vc_INTRINSIC __m128i Vc_CONST set1_epi8(int a) { return _mm_set1_epi8(a); } static Vc_INTRINSIC __m128i Vc_CONST mm_alignr_epi8(__m128i a, __m128i b, const int s) { switch (s) { case 0: return b; case 1: return _mm_or_si128(_mm_slli_si128(a, 15), _mm_srli_si128(b, 1)); case 2: return _mm_or_si128(_mm_slli_si128(a, 14), _mm_srli_si128(b, 2)); case 3: return _mm_or_si128(_mm_slli_si128(a, 13), _mm_srli_si128(b, 3)); case 4: return _mm_or_si128(_mm_slli_si128(a, 12), _mm_srli_si128(b, 4)); case 5: return _mm_or_si128(_mm_slli_si128(a, 11), _mm_srli_si128(b, 5)); case 6: return _mm_or_si128(_mm_slli_si128(a, 10), _mm_srli_si128(b, 6)); case 7: return _mm_or_si128(_mm_slli_si128(a, 9), _mm_srli_si128(b, 7)); case 8: return _mm_or_si128(_mm_slli_si128(a, 8), _mm_srli_si128(b, 8)); case 9: return _mm_or_si128(_mm_slli_si128(a, 7), _mm_srli_si128(b, 9)); case 10: return _mm_or_si128(_mm_slli_si128(a, 6), _mm_srli_si128(b, 10)); case 11: return _mm_or_si128(_mm_slli_si128(a, 5), _mm_srli_si128(b, 11)); case 12: return _mm_or_si128(_mm_slli_si128(a, 4), _mm_srli_si128(b, 12)); case 13: return _mm_or_si128(_mm_slli_si128(a, 3), _mm_srli_si128(b, 13)); case 14: return _mm_or_si128(_mm_slli_si128(a, 2), _mm_srli_si128(b, 14)); case 15: return _mm_or_si128(_mm_slli_si128(a, 1), _mm_srli_si128(b, 15)); case 16: return a; case 17: return _mm_srli_si128(a, 1); case 18: return _mm_srli_si128(a, 2); case 19: return _mm_srli_si128(a, 3); case 20: return _mm_srli_si128(a, 4); case 21: return _mm_srli_si128(a, 5); case 22: return _mm_srli_si128(a, 6); case 23: return _mm_srli_si128(a, 7); case 24: return _mm_srli_si128(a, 8); case 25: return _mm_srli_si128(a, 9); case 26: return _mm_srli_si128(a, 10); case 27: return _mm_srli_si128(a, 11); case 28: return _mm_srli_si128(a, 12); case 29: return _mm_srli_si128(a, 13); case 30: return _mm_srli_si128(a, 14); case 31: return _mm_srli_si128(a, 15); } return _mm_setzero_si128(); } } // namespace SSE } // namespace Vc /*OUTER_NAMESPACE_END*/ #endif // SSE4.1 #ifdef VC_IMPL_SSE4_1 extern "C" { #include } /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace SSE { #define mm_blendv_pd _mm_blendv_pd #define mm_blendv_ps _mm_blendv_ps #define mm_blendv_epi8 _mm_blendv_epi8 #define mm_blend_epi16 _mm_blend_epi16 #define mm_blend_ps _mm_blend_ps #define mm_blend_pd _mm_blend_pd #define mm_min_epi32 _mm_min_epi32 #define mm_max_epi32 _mm_max_epi32 #define mm_min_epu32 _mm_min_epu32 #define mm_max_epu32 _mm_max_epu32 //#define mm_min_epi16 _mm_min_epi16 //#define mm_max_epi16 _mm_max_epi16 #define mm_min_epu16 _mm_min_epu16 #define mm_max_epu16 _mm_max_epu16 #define mm_min_epi8 _mm_min_epi8 #define mm_max_epi8 _mm_max_epi8 #define mm_cvtepu16_epi32 _mm_cvtepu16_epi32 #define mm_cvtepu8_epi16 _mm_cvtepu8_epi16 #define mm_cvtepi8_epi16 _mm_cvtepi8_epi16 #define mm_cvtepu16_epi32 _mm_cvtepu16_epi32 #define mm_cvtepi16_epi32 _mm_cvtepi16_epi32 #define mm_cvtepu8_epi32 _mm_cvtepu8_epi32 #define mm_cvtepi8_epi32 _mm_cvtepi8_epi32 #define mm_stream_load_si128 _mm_stream_load_si128 // TODO } // namespace SSE } // namespace Vc /*OUTER_NAMESPACE_END*/ #else /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace SSE { static Vc_INTRINSIC __m128d mm_blendv_pd(__m128d a, __m128d b, __m128d c) { return _mm_or_pd(_mm_andnot_pd(c, a), _mm_and_pd(c, b)); } static Vc_INTRINSIC __m128 mm_blendv_ps(__m128 a, __m128 
b, __m128 c) { return _mm_or_ps(_mm_andnot_ps(c, a), _mm_and_ps(c, b)); } static Vc_INTRINSIC __m128i mm_blendv_epi8(__m128i a, __m128i b, __m128i c) { return _mm_or_si128(_mm_andnot_si128(c, a), _mm_and_si128(c, b)); } // only use the following blend functions with immediates as mask and, of course, compiling // with optimization static Vc_INTRINSIC __m128d mm_blend_pd(__m128d a, __m128d b, const int mask) { switch (mask) { case 0x0: return a; case 0x1: return _mm_shuffle_pd(b, a, 2); case 0x2: return _mm_shuffle_pd(a, b, 2); case 0x3: return b; default: abort(); return a; // should never be reached, but MSVC needs it else it warns about 'not all control paths return a value' } } static Vc_INTRINSIC __m128 mm_blend_ps(__m128 a, __m128 b, const int mask) { __m128i c; switch (mask) { case 0x0: return a; case 0x1: c = _mm_srli_si128(_mm_setallone_si128(), 12); break; case 0x2: c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 12), 4); break; case 0x3: c = _mm_srli_si128(_mm_setallone_si128(), 8); break; case 0x4: c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 12), 8); break; case 0x5: c = _mm_set_epi32(0, -1, 0, -1); break; case 0x6: c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 8), 4); break; case 0x7: c = _mm_srli_si128(_mm_setallone_si128(), 4); break; case 0x8: c = _mm_slli_si128(_mm_setallone_si128(), 12); break; case 0x9: c = _mm_set_epi32(-1, 0, 0, -1); break; case 0xa: c = _mm_set_epi32(-1, 0, -1, 0); break; case 0xb: c = _mm_set_epi32(-1, 0, -1, -1); break; case 0xc: c = _mm_slli_si128(_mm_setallone_si128(), 8); break; case 0xd: c = _mm_set_epi32(-1, -1, 0, -1); break; case 0xe: c = _mm_slli_si128(_mm_setallone_si128(), 4); break; case 0xf: return b; default: // may not happen abort(); c = _mm_setzero_si128(); break; } __m128 _c = _mm_castsi128_ps(c); return _mm_or_ps(_mm_andnot_ps(_c, a), _mm_and_ps(_c, b)); } static Vc_INTRINSIC __m128i mm_blend_epi16(__m128i a, __m128i b, const int mask) { __m128i c; switch (mask) { case 0x00: return a; case 0x01: c = _mm_srli_si128(_mm_setallone_si128(), 14); break; case 0x03: c = _mm_srli_si128(_mm_setallone_si128(), 12); break; case 0x07: c = _mm_srli_si128(_mm_setallone_si128(), 10); break; case 0x0f: return _mm_unpackhi_epi64(_mm_slli_si128(b, 8), a); case 0x1f: c = _mm_srli_si128(_mm_setallone_si128(), 6); break; case 0x3f: c = _mm_srli_si128(_mm_setallone_si128(), 4); break; case 0x7f: c = _mm_srli_si128(_mm_setallone_si128(), 2); break; case 0x80: c = _mm_slli_si128(_mm_setallone_si128(), 14); break; case 0xc0: c = _mm_slli_si128(_mm_setallone_si128(), 12); break; case 0xe0: c = _mm_slli_si128(_mm_setallone_si128(), 10); break; case 0xf0: c = _mm_slli_si128(_mm_setallone_si128(), 8); break; case 0xf8: c = _mm_slli_si128(_mm_setallone_si128(), 6); break; case 0xfc: c = _mm_slli_si128(_mm_setallone_si128(), 4); break; case 0xfe: c = _mm_slli_si128(_mm_setallone_si128(), 2); break; case 0xff: return b; case 0xcc: return _mm_unpacklo_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 1, 3, 1))); case 0x33: return _mm_unpacklo_epi32(_mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 3, 1))); default: const __m128i shift = _mm_set_epi16(0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, -0x7fff); c = _mm_srai_epi16(_mm_mullo_epi16(_mm_set1_epi16(mask), shift), 15); break; } return _mm_or_si128(_mm_andnot_si128(c, a), _mm_and_si128(c, b)); } static Vc_INTRINSIC __m128i Vc_CONST mm_max_epi8 (__m128i a, __m128i b) { return mm_blendv_epi8(b, a, 
_mm_cmpgt_epi8 (a, b)); } static Vc_INTRINSIC __m128i Vc_CONST mm_max_epi32(__m128i a, __m128i b) { return mm_blendv_epi8(b, a, _mm_cmpgt_epi32(a, b)); } //X static Vc_INTRINSIC __m128i Vc_CONST mm_max_epu8 (__m128i a, __m128i b) { //X return mm_blendv_epi8(b, a, _mm_cmpgt_epu8 (a, b)); //X } static Vc_INTRINSIC __m128i Vc_CONST mm_max_epu16(__m128i a, __m128i b) { return mm_blendv_epi8(b, a, _mm_cmpgt_epu16(a, b)); } static Vc_INTRINSIC __m128i Vc_CONST mm_max_epu32(__m128i a, __m128i b) { return mm_blendv_epi8(b, a, _mm_cmpgt_epu32(a, b)); } //X static Vc_INTRINSIC __m128i Vc_CONST mm_min_epu8 (__m128i a, __m128i b) { //X return mm_blendv_epi8(a, b, _mm_cmpgt_epu8 (a, b)); //X } static Vc_INTRINSIC __m128i Vc_CONST mm_min_epu16(__m128i a, __m128i b) { return mm_blendv_epi8(a, b, _mm_cmpgt_epu16(a, b)); } static Vc_INTRINSIC __m128i Vc_CONST mm_min_epu32(__m128i a, __m128i b) { return mm_blendv_epi8(a, b, _mm_cmpgt_epu32(a, b)); } static Vc_INTRINSIC __m128i Vc_CONST mm_min_epi8 (__m128i a, __m128i b) { return mm_blendv_epi8(a, b, _mm_cmpgt_epi8 (a, b)); } static Vc_INTRINSIC __m128i Vc_CONST mm_min_epi32(__m128i a, __m128i b) { return mm_blendv_epi8(a, b, _mm_cmpgt_epi32(a, b)); } static Vc_INTRINSIC Vc_CONST __m128i mm_cvtepu8_epi16(__m128i epu8) { return _mm_unpacklo_epi8(epu8, _mm_setzero_si128()); } static Vc_INTRINSIC Vc_CONST __m128i mm_cvtepi8_epi16(__m128i epi8) { return _mm_unpacklo_epi8(epi8, _mm_cmplt_epi8(epi8, _mm_setzero_si128())); } static Vc_INTRINSIC Vc_CONST __m128i mm_cvtepu16_epi32(__m128i epu16) { return _mm_unpacklo_epi16(epu16, _mm_setzero_si128()); } static Vc_INTRINSIC Vc_CONST __m128i mm_cvtepi16_epi32(__m128i epu16) { return _mm_unpacklo_epi16(epu16, _mm_cmplt_epi16(epu16, _mm_setzero_si128())); } static Vc_INTRINSIC Vc_CONST __m128i mm_cvtepu8_epi32(__m128i epu8) { return mm_cvtepu16_epi32(mm_cvtepu8_epi16(epu8)); } static Vc_INTRINSIC Vc_CONST __m128i mm_cvtepi8_epi32(__m128i epi8) { const __m128i neg = _mm_cmplt_epi8(epi8, _mm_setzero_si128()); const __m128i epi16 = _mm_unpacklo_epi8(epi8, neg); return _mm_unpacklo_epi16(epi16, _mm_unpacklo_epi8(neg, neg)); } static Vc_INTRINSIC Vc_PURE __m128i mm_stream_load_si128(__m128i *mem) { return _mm_load_si128(mem); } } // namespace SSE } // namespace Vc /*OUTER_NAMESPACE_END*/ #endif #ifdef VC_IMPL_POPCNT #include #endif // SSE4.2 #ifdef VC_IMPL_SSE4_2 extern "C" { #include } #endif /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace SSE { static Vc_INTRINSIC Vc_CONST float extract_float_imm(const __m128 v, const size_t i) { float f; switch (i) { case 0: f = _mm_cvtss_f32(v); break; #if defined VC_IMPL_SSE4_1 && !defined VC_MSVC default: #ifdef VC_GCC f = __builtin_ia32_vec_ext_v4sf(static_cast<__v4sf>(v), (i)); #else // MSVC fails to compile this because it can't optimize i to an immediate _MM_EXTRACT_FLOAT(f, v, i); #endif break; #else case 1: f = _mm_cvtss_f32(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v), 4))); break; case 2: f = _mm_cvtss_f32(_mm_movehl_ps(v, v)); break; case 3: f = _mm_cvtss_f32(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v), 12))); break; #endif } return f; } static Vc_INTRINSIC Vc_CONST double extract_double_imm(const __m128d v, const size_t i) { if (i == 0) { return _mm_cvtsd_f64(v); } return _mm_cvtsd_f64(_mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(v), _mm_castpd_ps(v)))); } static Vc_INTRINSIC Vc_CONST float extract_float(const __m128 v, const size_t i) { #ifdef VC_GCC if (__builtin_constant_p(i)) { return extract_float_imm(v, i); //X if (index <= 1) { //X unsigned long long tmp 
= _mm_cvtsi128_si64(_mm_castps_si128(v)); //X if (index == 0) tmp &= 0xFFFFFFFFull; //X if (index == 1) tmp >>= 32; //X return Common::AliasingEntryHelper(tmp); //X } } else { typedef float float4[4] Vc_MAY_ALIAS; const float4 &data = reinterpret_cast(v); return data[i]; } #else union { __m128 v; float m[4]; } u; u.v = v; return u.m[i]; #endif } static Vc_INTRINSIC Vc_PURE __m128 _mm_stream_load(const float *mem) { #ifdef VC_IMPL_SSE4_1 return _mm_castsi128_ps(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast(mem)))); #else return _mm_load_ps(mem); #endif } static Vc_INTRINSIC Vc_PURE __m128d _mm_stream_load(const double *mem) { #ifdef VC_IMPL_SSE4_1 return _mm_castsi128_pd(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast(mem)))); #else return _mm_load_pd(mem); #endif } static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const int *mem) { #ifdef VC_IMPL_SSE4_1 return _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast(mem))); #else return _mm_load_si128(reinterpret_cast(mem)); #endif } static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned int *mem) { return _mm_stream_load(reinterpret_cast(mem)); } static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const short *mem) { return _mm_stream_load(reinterpret_cast(mem)); } static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned short *mem) { return _mm_stream_load(reinterpret_cast(mem)); } static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const signed char *mem) { return _mm_stream_load(reinterpret_cast(mem)); } static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned char *mem) { return _mm_stream_load(reinterpret_cast(mem)); } } // namespace SSE } // namespace Vc /*OUTER_NAMESPACE_END*/ // XOP / FMA4 #if defined(VC_IMPL_XOP) || defined(VC_IMPL_FMA4) extern "C" { #include } #endif #include "undomacros.h" #include "shuffle.h" #endif // SSE_INTRINSICS_H Vc-0.7.4/sse/limits.h000066400000000000000000000127271233512346000143460ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
*/ #ifndef VC_SSE_LIMITS_H #define VC_SSE_LIMITS_H #include "intrinsics.h" #include "types.h" #include "macros.h" namespace std { template<> struct numeric_limits< ::Vc::SSE::ushort_v> : public numeric_limits { static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v max() _VC_NOEXCEPT { return ::Vc::SSE::_mm_setallone_si128(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v min() _VC_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v lowest() _VC_NOEXCEPT { return min(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v epsilon() _VC_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v round_error() _VC_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v infinity() _VC_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v quiet_NaN() _VC_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v signaling_NaN() _VC_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v denorm_min() _VC_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); } }; template<> struct numeric_limits< ::Vc::SSE::short_v> : public numeric_limits { static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v max() _VC_NOEXCEPT { return _mm_srli_epi16(::Vc::SSE::_mm_setallone_si128(), 1); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v min() _VC_NOEXCEPT { return ::Vc::SSE::_mm_setmin_epi16(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v lowest() _VC_NOEXCEPT { return min(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v epsilon() _VC_NOEXCEPT { return ::Vc::SSE::short_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v round_error() _VC_NOEXCEPT { return ::Vc::SSE::short_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v infinity() _VC_NOEXCEPT { return ::Vc::SSE::short_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v quiet_NaN() _VC_NOEXCEPT { return ::Vc::SSE::short_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v signaling_NaN() _VC_NOEXCEPT { return ::Vc::SSE::short_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v denorm_min() _VC_NOEXCEPT { return ::Vc::SSE::short_v::Zero(); } }; template<> struct numeric_limits< ::Vc::SSE::uint_v> : public numeric_limits { static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v max() _VC_NOEXCEPT { return ::Vc::SSE::_mm_setallone_si128(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v min() _VC_NOEXCEPT { return ::Vc::SSE::uint_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v lowest() _VC_NOEXCEPT { return min(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v epsilon() _VC_NOEXCEPT { return ::Vc::SSE::uint_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v round_error() _VC_NOEXCEPT { return ::Vc::SSE::uint_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v infinity() _VC_NOEXCEPT { return ::Vc::SSE::uint_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v quiet_NaN() _VC_NOEXCEPT { return ::Vc::SSE::uint_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v signaling_NaN() _VC_NOEXCEPT { return ::Vc::SSE::uint_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v denorm_min() _VC_NOEXCEPT { return ::Vc::SSE::uint_v::Zero(); } }; template<> struct numeric_limits< ::Vc::SSE::int_v> : public numeric_limits { static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v max() _VC_NOEXCEPT { return _mm_srli_epi32(::Vc::SSE::_mm_setallone_si128(), 1); } 
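// max() is built without a memory constant: an all-ones register shifted right by one yields
// 0x7fffffff in every 32-bit lane, i.e. INT_MAX; min() below loads the sign-bit pattern
// 0x80000000 (INT_MIN) per lane. The short_v specialization above uses the same trick with a
// 16-bit shift.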
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v min() _VC_NOEXCEPT { return ::Vc::SSE::_mm_setmin_epi32(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v lowest() _VC_NOEXCEPT { return min(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v epsilon() _VC_NOEXCEPT { return ::Vc::SSE::int_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v round_error() _VC_NOEXCEPT { return ::Vc::SSE::int_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v infinity() _VC_NOEXCEPT { return ::Vc::SSE::int_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v quiet_NaN() _VC_NOEXCEPT { return ::Vc::SSE::int_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v signaling_NaN() _VC_NOEXCEPT { return ::Vc::SSE::int_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v denorm_min() _VC_NOEXCEPT { return ::Vc::SSE::int_v::Zero(); } }; } // namespace std #include "undomacros.h" #endif // VC_SSE_LIMITS_H Vc-0.7.4/sse/macros.h000066400000000000000000000024521233512346000143230ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2010 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #include "../common/macros.h" #ifndef VC_SSE_MACROS_H #define VC_SSE_MACROS_H #undef VC_SSE_UNDOMACROS_H #ifndef _M128 # define _M128 __m128 #endif #ifndef _M128I # define _M128I __m128i #endif #ifndef _M128D # define _M128D __m128d #endif #define STORE_VECTOR(type, name, vec) \ union { __m128i p; type v[16 / sizeof(type)]; } CAT(u, __LINE__); \ _mm_store_si128(&CAT(u, __LINE__).p, vec); \ const type *const name = &CAT(u, __LINE__).v[0] #if defined(VC_IMPL_SSE4_1) && !defined(VC_DISABLE_PTEST) #define VC_USE_PTEST #endif #endif // VC_SSE_MACROS_H Vc-0.7.4/sse/mask.h000066400000000000000000000603621233512346000137760ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
*/ #ifndef SSE_MASK_H #define SSE_MASK_H #include "intrinsics.h" #include "macros.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace SSE { template struct MaskHelper; template<> struct MaskHelper<2> { static Vc_ALWAYS_INLINE Vc_CONST bool cmpeq (_M128 k1, _M128 k2) { return _mm_movemask_pd(_mm_castps_pd(k1)) == _mm_movemask_pd(_mm_castps_pd(k2)); } static Vc_ALWAYS_INLINE Vc_CONST bool cmpneq(_M128 k1, _M128 k2) { return _mm_movemask_pd(_mm_castps_pd(k1)) != _mm_movemask_pd(_mm_castps_pd(k2)); } }; template<> struct MaskHelper<4> { static Vc_ALWAYS_INLINE Vc_CONST bool cmpeq (_M128 k1, _M128 k2) { return _mm_movemask_ps(k1) == _mm_movemask_ps(k2); } static Vc_ALWAYS_INLINE Vc_CONST bool cmpneq(_M128 k1, _M128 k2) { return _mm_movemask_ps(k1) != _mm_movemask_ps(k2); } }; template<> struct MaskHelper<8> { static Vc_ALWAYS_INLINE Vc_CONST bool cmpeq (_M128 k1, _M128 k2) { return _mm_movemask_epi8(_mm_castps_si128(k1)) == _mm_movemask_epi8(_mm_castps_si128(k2)); } static Vc_ALWAYS_INLINE Vc_CONST bool cmpneq(_M128 k1, _M128 k2) { return _mm_movemask_epi8(_mm_castps_si128(k1)) != _mm_movemask_epi8(_mm_castps_si128(k2)); } }; class Float8Mask; template class Mask { friend class Mask<2u>; friend class Mask<4u>; friend class Mask<8u>; friend class Mask<16u>; friend class Float8Mask; public: FREE_STORE_OPERATORS_ALIGNED(16) // abstracts the way Masks are passed to functions, it can easily be changed to const ref here // Also Float8Mask requires const ref on MSVC 32bit. #if defined VC_MSVC && defined _WIN32 typedef const Mask &Argument; #else typedef Mask Argument; #endif Vc_ALWAYS_INLINE Mask() {} Vc_ALWAYS_INLINE Mask(const __m128 &x) : k(x) {} Vc_ALWAYS_INLINE Mask(const __m128d &x) : k(_mm_castpd_ps(x)) {} Vc_ALWAYS_INLINE Mask(const __m128i &x) : k(_mm_castsi128_ps(x)) {} Vc_ALWAYS_INLINE explicit Mask(VectorSpecialInitializerZero::ZEnum) : k(_mm_setzero_ps()) {} Vc_ALWAYS_INLINE explicit Mask(VectorSpecialInitializerOne::OEnum) : k(_mm_setallone_ps()) {} Vc_ALWAYS_INLINE explicit Mask(bool b) : k(b ? 
_mm_setallone_ps() : _mm_setzero_ps()) {} Vc_ALWAYS_INLINE Mask(const Mask &rhs) : k(rhs.k) {} Vc_ALWAYS_INLINE Mask(const Mask *a) : k(_mm_castsi128_ps(_mm_packs_epi16(a[0].dataI(), a[1].dataI()))) {} Vc_ALWAYS_INLINE explicit Mask(const Float8Mask &m); template Vc_ALWAYS_INLINE_L explicit Mask(const Mask &x) Vc_ALWAYS_INLINE_R; //X { //X _M128I tmp = x.dataI(); //X if (OtherSize < VectorSize) { //X tmp = _mm_packs_epi16(tmp, _mm_setzero_si128()); //X if (VectorSize / OtherSize >= 4u) { tmp = _mm_packs_epi16(tmp, _mm_setzero_si128()); } //X if (VectorSize / OtherSize >= 8u) { tmp = _mm_packs_epi16(tmp, _mm_setzero_si128()); } //X } else if (OtherSize > VectorSize) { //X tmp = _mm_unpacklo_epi8(tmp, tmp); //X if (OtherSize / VectorSize >= 4u) { tmp = _mm_unpacklo_epi8(tmp, tmp); } //X if (OtherSize / VectorSize >= 8u) { tmp = _mm_unpacklo_epi8(tmp, tmp); } //X } //X k = _mm_castsi128_ps(tmp); //X } inline void expand(Mask *x) const; Vc_ALWAYS_INLINE Vc_PURE bool operator==(const Mask &rhs) const { return MaskHelper::cmpeq (k, rhs.k); } Vc_ALWAYS_INLINE Vc_PURE bool operator!=(const Mask &rhs) const { return MaskHelper::cmpneq(k, rhs.k); } Vc_ALWAYS_INLINE Vc_PURE Mask operator!() const { return _mm_andnot_si128(dataI(), _mm_setallone_si128()); } Vc_ALWAYS_INLINE Mask &operator&=(const Mask &rhs) { k = _mm_and_ps(k, rhs.k); return *this; } Vc_ALWAYS_INLINE Mask &operator|=(const Mask &rhs) { k = _mm_or_ps (k, rhs.k); return *this; } Vc_ALWAYS_INLINE Mask &operator^=(const Mask &rhs) { k = _mm_xor_ps(k, rhs.k); return *this; } Vc_ALWAYS_INLINE Vc_PURE bool isFull () const { return #ifdef VC_USE_PTEST _mm_testc_si128(dataI(), _mm_setallone_si128()); // return 1 if (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff) == (~0 & k) #else _mm_movemask_epi8(dataI()) == 0xffff; #endif } Vc_ALWAYS_INLINE Vc_PURE bool isEmpty() const { return #ifdef VC_USE_PTEST _mm_testz_si128(dataI(), dataI()); // return 1 if (0, 0, 0, 0) == (k & k) #else _mm_movemask_epi8(dataI()) == 0x0000; #endif } Vc_ALWAYS_INLINE Vc_PURE bool isMix() const { #ifdef VC_USE_PTEST return _mm_test_mix_ones_zeros(dataI(), _mm_setallone_si128()); #else const int tmp = _mm_movemask_epi8(dataI()); return tmp != 0 && (tmp ^ 0xffff) != 0; #endif } #ifndef VC_NO_AUTOMATIC_BOOL_FROM_MASK Vc_ALWAYS_INLINE Vc_PURE operator bool() const { return isFull(); } #endif Vc_ALWAYS_INLINE_L Vc_PURE_L int shiftMask() const Vc_ALWAYS_INLINE_R Vc_PURE_R; Vc_ALWAYS_INLINE_L Vc_PURE_L int toInt() const Vc_ALWAYS_INLINE_R Vc_PURE_R; Vc_ALWAYS_INLINE Vc_PURE _M128 data () const { return k; } Vc_ALWAYS_INLINE Vc_PURE _M128I dataI() const { return _mm_castps_si128(k); } Vc_ALWAYS_INLINE Vc_PURE _M128D dataD() const { return _mm_castps_pd(k); } template Vc_ALWAYS_INLINE Vc_PURE Mask cast() const { return Mask(k); } Vc_ALWAYS_INLINE_L Vc_PURE_L bool operator[](int index) const Vc_ALWAYS_INLINE_R Vc_PURE_R; Vc_ALWAYS_INLINE_L Vc_PURE_L int count() const Vc_ALWAYS_INLINE_R Vc_PURE_R; /** * Returns the index of the first one in the mask. * * The return value is undefined if the mask is empty. 
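     * Equivalently, the result is the position of the least significant set bit in toInt().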
*/ Vc_ALWAYS_INLINE_L Vc_PURE_L int firstOne() const Vc_ALWAYS_INLINE_R Vc_PURE_R; private: #ifdef VC_COMPILE_BENCHMARKS public: #endif _M128 k; }; struct ForeachHelper { _long mask; bool brk; bool outerBreak; Vc_ALWAYS_INLINE ForeachHelper(_long _mask) : mask(_mask), brk(false), outerBreak(false) {} Vc_ALWAYS_INLINE bool outer() const { return (mask != 0) && !outerBreak; } Vc_ALWAYS_INLINE bool inner() { return (brk = !brk); } Vc_ALWAYS_INLINE void noBreak() { outerBreak = false; } Vc_ALWAYS_INLINE _long next() { outerBreak = true; #ifdef VC_GNU_ASM const _long bit = __builtin_ctzl(mask); __asm__("btr %1,%0" : "+r"(mask) : "r"(bit)); #elif defined(_WIN64) unsigned long bit; _BitScanForward64(&bit, mask); _bittestandreset64(&mask, bit); #elif defined(_WIN32) unsigned long bit; _BitScanForward(&bit, mask); _bittestandreset(&mask, bit); #else #error "Not implemented yet. Please contact vc-devel@compeng.uni-frankfurt.de" #endif return bit; } }; #define Vc_foreach_bit(_it_, _mask_) \ for (Vc::SSE::ForeachHelper Vc__make_unique(foreach_bit_obj)((_mask_).toInt()); Vc__make_unique(foreach_bit_obj).outer(); ) \ for (_it_ = Vc__make_unique(foreach_bit_obj).next(); Vc__make_unique(foreach_bit_obj).inner(); Vc__make_unique(foreach_bit_obj).noBreak()) template Vc_ALWAYS_INLINE Vc_PURE int Mask::shiftMask() const { return _mm_movemask_epi8(dataI()); } template<> template<> Vc_ALWAYS_INLINE Mask<2>::Mask(const Mask<4> &x) { k = _mm_unpacklo_ps(x.data(), x.data()); } template<> template<> Vc_ALWAYS_INLINE Mask<2>::Mask(const Mask<8> &x) { _M128I tmp = _mm_unpacklo_epi16(x.dataI(), x.dataI()); k = _mm_castsi128_ps(_mm_unpacklo_epi32(tmp, tmp)); } template<> template<> Vc_ALWAYS_INLINE Mask<2>::Mask(const Mask<16> &x) { _M128I tmp = _mm_unpacklo_epi8(x.dataI(), x.dataI()); tmp = _mm_unpacklo_epi16(tmp, tmp); k = _mm_castsi128_ps(_mm_unpacklo_epi32(tmp, tmp)); } template<> template<> Vc_ALWAYS_INLINE Mask<4>::Mask(const Mask<2> &x) { k = _mm_castsi128_ps(_mm_packs_epi16(x.dataI(), _mm_setzero_si128())); } template<> template<> Vc_ALWAYS_INLINE Mask<4>::Mask(const Mask<8> &x) { k = _mm_castsi128_ps(_mm_unpacklo_epi16(x.dataI(), x.dataI())); } template<> template<> Vc_ALWAYS_INLINE Mask<4>::Mask(const Mask<16> &x) { _M128I tmp = _mm_unpacklo_epi8(x.dataI(), x.dataI()); k = _mm_castsi128_ps(_mm_unpacklo_epi16(tmp, tmp)); } template<> template<> Vc_ALWAYS_INLINE Mask<8>::Mask(const Mask<2> &x) { _M128I tmp = _mm_packs_epi16(x.dataI(), x.dataI()); k = _mm_castsi128_ps(_mm_packs_epi16(tmp, tmp)); } template<> template<> Vc_ALWAYS_INLINE Mask<8>::Mask(const Mask<4> &x) { k = _mm_castsi128_ps(_mm_packs_epi16(x.dataI(), x.dataI())); } template<> template<> Vc_ALWAYS_INLINE Mask<8>::Mask(const Mask<16> &x) { k = _mm_castsi128_ps(_mm_unpacklo_epi8(x.dataI(), x.dataI())); } template<> inline void Mask< 4>::expand(Mask<2> *x) const { x[0].k = _mm_unpacklo_ps(data(), data()); x[1].k = _mm_unpackhi_ps(data(), data()); } template<> inline void Mask< 8>::expand(Mask<4> *x) const { x[0].k = _mm_castsi128_ps(_mm_unpacklo_epi16(dataI(), dataI())); x[1].k = _mm_castsi128_ps(_mm_unpackhi_epi16(dataI(), dataI())); } template<> inline void Mask<16>::expand(Mask<8> *x) const { x[0].k = _mm_castsi128_ps(_mm_unpacklo_epi8 (dataI(), dataI())); x[1].k = _mm_castsi128_ps(_mm_unpackhi_epi8 (dataI(), dataI())); } template<> Vc_ALWAYS_INLINE Vc_PURE int Mask< 2>::toInt() const { return _mm_movemask_pd(dataD()); } template<> Vc_ALWAYS_INLINE Vc_PURE int Mask< 4>::toInt() const { return _mm_movemask_ps(data ()); } template<> Vc_ALWAYS_INLINE 
Vc_PURE int Mask< 8>::toInt() const { return _mm_movemask_epi8(_mm_packs_epi16(dataI(), _mm_setzero_si128())); } template<> Vc_ALWAYS_INLINE Vc_PURE int Mask<16>::toInt() const { return _mm_movemask_epi8(dataI()); } template<> Vc_ALWAYS_INLINE Vc_PURE bool Mask< 2>::operator[](int index) const { return toInt() & (1 << index); } template<> Vc_ALWAYS_INLINE Vc_PURE bool Mask< 4>::operator[](int index) const { return toInt() & (1 << index); } template<> Vc_ALWAYS_INLINE Vc_PURE bool Mask< 8>::operator[](int index) const { return shiftMask() & (1 << 2 * index); } template<> Vc_ALWAYS_INLINE Vc_PURE bool Mask<16>::operator[](int index) const { return toInt() & (1 << index); } template<> Vc_ALWAYS_INLINE Vc_PURE int Mask<2>::count() const { int mask = _mm_movemask_pd(dataD()); return (mask & 1) + (mask >> 1); } template<> Vc_ALWAYS_INLINE Vc_PURE int Mask<4>::count() const { #ifdef VC_IMPL_POPCNT return _mm_popcnt_u32(_mm_movemask_ps(data())); //X tmp = (tmp & 5) + ((tmp >> 1) & 5); //X return (tmp & 3) + ((tmp >> 2) & 3); #else _M128I x = _mm_srli_epi32(dataI(), 31); x = _mm_add_epi32(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(0, 1, 2, 3))); x = _mm_add_epi32(x, _mm_shufflelo_epi16(x, _MM_SHUFFLE(1, 0, 3, 2))); return _mm_cvtsi128_si32(x); #endif } template<> Vc_ALWAYS_INLINE Vc_PURE int Mask<8>::count() const { #ifdef VC_IMPL_POPCNT return _mm_popcnt_u32(_mm_movemask_epi8(dataI())) / 2; #else //X int tmp = _mm_movemask_epi8(dataI()); //X tmp = (tmp & 0x1111) + ((tmp >> 2) & 0x1111); //X tmp = (tmp & 0x0303) + ((tmp >> 4) & 0x0303); //X return (tmp & 0x000f) + ((tmp >> 8) & 0x000f); _M128I x = _mm_srli_epi16(dataI(), 15); x = _mm_add_epi16(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(0, 1, 2, 3))); x = _mm_add_epi16(x, _mm_shufflelo_epi16(x, _MM_SHUFFLE(0, 1, 2, 3))); x = _mm_add_epi16(x, _mm_shufflelo_epi16(x, _MM_SHUFFLE(2, 3, 0, 1))); return _mm_extract_epi16(x, 0); #endif } template<> Vc_ALWAYS_INLINE Vc_PURE int Mask<16>::count() const { int tmp = _mm_movemask_epi8(dataI()); #ifdef VC_IMPL_POPCNT return _mm_popcnt_u32(tmp); #else tmp = (tmp & 0x5555) + ((tmp >> 1) & 0x5555); tmp = (tmp & 0x3333) + ((tmp >> 2) & 0x3333); tmp = (tmp & 0x0f0f) + ((tmp >> 4) & 0x0f0f); return (tmp & 0x00ff) + ((tmp >> 8) & 0x00ff); #endif } class Float8Mask { enum Constants { PartialSize = 4, VectorSize = 8 }; public: FREE_STORE_OPERATORS_ALIGNED(16) // abstracts the way Masks are passed to functions, it can easily be changed to const ref here // Also Float8Mask requires const ref on MSVC 32bit. #if defined VC_MSVC && defined _WIN32 typedef const Float8Mask & Argument; #else typedef Float8Mask Argument; #endif Vc_ALWAYS_INLINE Float8Mask() {} Vc_ALWAYS_INLINE Float8Mask(const M256 &x) : k(x) {} Vc_ALWAYS_INLINE explicit Float8Mask(VectorSpecialInitializerZero::ZEnum) { k[0] = _mm_setzero_ps(); k[1] = _mm_setzero_ps(); } Vc_ALWAYS_INLINE explicit Float8Mask(VectorSpecialInitializerOne::OEnum) { k[0] = _mm_setallone_ps(); k[1] = _mm_setallone_ps(); } Vc_ALWAYS_INLINE explicit Float8Mask(bool b) { const __m128 tmp = b ? 
_mm_setallone_ps() : _mm_setzero_ps(); k[0] = tmp; k[1] = tmp; } Vc_ALWAYS_INLINE Float8Mask(const Mask &a) { k[0] = _mm_castsi128_ps(_mm_unpacklo_epi16(a.dataI(), a.dataI())); k[1] = _mm_castsi128_ps(_mm_unpackhi_epi16(a.dataI(), a.dataI())); } Vc_ALWAYS_INLINE Vc_PURE bool operator==(const Float8Mask &rhs) const { return MaskHelper::cmpeq (k[0], rhs.k[0]) && MaskHelper::cmpeq (k[1], rhs.k[1]); } Vc_ALWAYS_INLINE Vc_PURE bool operator!=(const Float8Mask &rhs) const { return MaskHelper::cmpneq(k[0], rhs.k[0]) || MaskHelper::cmpneq(k[1], rhs.k[1]); } Vc_ALWAYS_INLINE Vc_PURE Float8Mask operator&&(const Float8Mask &rhs) const { Float8Mask r; r.k[0] = _mm_and_ps(k[0], rhs.k[0]); r.k[1] = _mm_and_ps(k[1], rhs.k[1]); return r; } Vc_ALWAYS_INLINE Vc_PURE Float8Mask operator& (const Float8Mask &rhs) const { Float8Mask r; r.k[0] = _mm_and_ps(k[0], rhs.k[0]); r.k[1] = _mm_and_ps(k[1], rhs.k[1]); return r; } Vc_ALWAYS_INLINE Vc_PURE Float8Mask operator||(const Float8Mask &rhs) const { Float8Mask r; r.k[0] = _mm_or_ps(k[0], rhs.k[0]); r.k[1] = _mm_or_ps(k[1], rhs.k[1]); return r; } Vc_ALWAYS_INLINE Vc_PURE Float8Mask operator| (const Float8Mask &rhs) const { Float8Mask r; r.k[0] = _mm_or_ps(k[0], rhs.k[0]); r.k[1] = _mm_or_ps(k[1], rhs.k[1]); return r; } Vc_ALWAYS_INLINE Vc_PURE Float8Mask operator^ (const Float8Mask &rhs) const { Float8Mask r; r.k[0] = _mm_xor_ps(k[0], rhs.k[0]); r.k[1] = _mm_xor_ps(k[1], rhs.k[1]); return r; } Vc_ALWAYS_INLINE Vc_PURE Float8Mask operator!() const { Float8Mask r; r.k[0] = _mm_andnot_ps(k[0], _mm_setallone_ps()); r.k[1] = _mm_andnot_ps(k[1], _mm_setallone_ps()); return r; } Vc_ALWAYS_INLINE Float8Mask &operator&=(const Float8Mask &rhs) { k[0] = _mm_and_ps(k[0], rhs.k[0]); k[1] = _mm_and_ps(k[1], rhs.k[1]); return *this; } Vc_ALWAYS_INLINE Float8Mask &operator|=(const Float8Mask &rhs) { k[0] = _mm_or_ps (k[0], rhs.k[0]); k[1] = _mm_or_ps (k[1], rhs.k[1]); return *this; } Vc_ALWAYS_INLINE Float8Mask &operator^=(const Float8Mask &rhs) { k[0] = _mm_xor_ps(k[0], rhs.k[0]); k[1] = _mm_xor_ps(k[1], rhs.k[1]); return *this; } Vc_ALWAYS_INLINE Vc_PURE bool isFull () const { const _M128 tmp = _mm_and_ps(k[0], k[1]); #ifdef VC_USE_PTEST return _mm_testc_si128(_mm_castps_si128(tmp), _mm_setallone_si128()); #else return _mm_movemask_ps(tmp) == 0xf; //_mm_movemask_ps(k[0]) == 0xf && //_mm_movemask_ps(k[1]) == 0xf; #endif } Vc_ALWAYS_INLINE Vc_PURE bool isEmpty() const { const _M128 tmp = _mm_or_ps(k[0], k[1]); #ifdef VC_USE_PTEST return _mm_testz_si128(_mm_castps_si128(tmp), _mm_castps_si128(tmp)); #else return _mm_movemask_ps(tmp) == 0x0; //_mm_movemask_ps(k[0]) == 0x0 && //_mm_movemask_ps(k[1]) == 0x0; #endif } Vc_ALWAYS_INLINE Vc_PURE bool isMix() const { // consider [1111 0000] // solution: // if k[0] != k[1] => return true // if k[0] == k[1] => return k[0].isMix #ifdef VC_USE_PTEST __m128i tmp = _mm_castps_si128(_mm_xor_ps(k[0], k[1])); // tmp == 0 <=> k[0] == k[1] return !_mm_testz_si128(tmp, tmp) || _mm_test_mix_ones_zeros(_mm_castps_si128(k[0]), _mm_setallone_si128()); #else const int tmp = _mm_movemask_ps(k[0]) + _mm_movemask_ps(k[1]); return tmp > 0x0 && tmp < (0xf + 0xf); #endif } #ifndef VC_NO_AUTOMATIC_BOOL_FROM_MASK Vc_ALWAYS_INLINE Vc_PURE operator bool() const { return isFull(); } #endif Vc_ALWAYS_INLINE Vc_PURE int shiftMask() const { return (_mm_movemask_ps(k[1]) << 4) + _mm_movemask_ps(k[0]); } Vc_ALWAYS_INLINE Vc_PURE int toInt() const { return (_mm_movemask_ps(k[1]) << 4) + _mm_movemask_ps(k[0]); } Vc_ALWAYS_INLINE Vc_PURE const M256 &data () const { return k; 
} Vc_ALWAYS_INLINE Vc_PURE bool operator[](int index) const { return (toInt() & (1 << index)) != 0; } Vc_ALWAYS_INLINE Vc_PURE int count() const { #ifdef VC_IMPL_POPCNT return _mm_popcnt_u32(toInt()); #else //X int tmp1 = _mm_movemask_ps(k[0]); //X int tmp2 = _mm_movemask_ps(k[1]); //X tmp1 = (tmp1 & 5) + ((tmp1 >> 1) & 5); //X tmp2 = (tmp2 & 5) + ((tmp2 >> 1) & 5); //X return (tmp1 & 3) + (tmp2 & 3) + ((tmp1 >> 2) & 3) + ((tmp2 >> 2) & 3); _M128I x = _mm_add_epi32(_mm_srli_epi32(_mm_castps_si128(k[0]), 31), _mm_srli_epi32(_mm_castps_si128(k[1]), 31)); x = _mm_add_epi32(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(0, 1, 2, 3))); x = _mm_add_epi32(x, _mm_shufflelo_epi16(x, _MM_SHUFFLE(1, 0, 3, 2))); return _mm_cvtsi128_si32(x); #endif } Vc_ALWAYS_INLINE_L Vc_PURE_L int firstOne() const Vc_ALWAYS_INLINE_R Vc_PURE_R; private: #ifdef VC_COMPILE_BENCHMARKS public: #endif M256 k; }; template Vc_ALWAYS_INLINE Vc_PURE int Mask::firstOne() const { const int mask = toInt(); #ifdef _MSC_VER unsigned long bit; _BitScanForward(&bit, mask); #else int bit; __asm__("bsf %1,%0" : "=&r"(bit) : "r"(mask)); #endif return bit; } Vc_ALWAYS_INLINE Vc_PURE int Float8Mask::firstOne() const { const int mask = toInt(); #ifdef _MSC_VER unsigned long bit; _BitScanForward(&bit, mask); #else int bit; __asm__("bsf %1,%0" : "=&r"(bit) : "r"(mask)); #endif return bit; } template Vc_ALWAYS_INLINE Mask::Mask(const Float8Mask &m) : k(_mm_castsi128_ps(_mm_packs_epi32(_mm_castps_si128(m.data()[0]), _mm_castps_si128(m.data()[1])))) {} class Float8GatherMask { public: Float8GatherMask(const Mask<8u> &k) : mask(k.toInt()) {} Float8GatherMask(const Float8Mask &k) : mask(k.toInt()) {} int toInt() const { return mask; } private: const int mask; }; /** * Loop over all set bits in the mask. The iterator variable will be set to the position of the set * bits. A mask of e.g. 00011010 would result in the loop being called with the iterator being set to * 1, 3, and 4. * * This allows you to write: * \code * float_v a = ...; * foreach_bit(int i, a < 0.f) { * std::cout << a[i] << "\n"; * } * \endcode * The example prints all the values in \p a that are negative, and only those. * * \param it The iterator variable. For example "int i". * \param mask The mask to iterate over. You can also just write a vector operation that returns a * mask. 
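*
* The SSE-internal \c Vc_foreach_bit macro defined above follows the same pattern with an
* explicit mask object; a minimal sketch:
* \code
* float_v v = ...;
* float_m m = v > 1.f;
* int n = 0;
* Vc_foreach_bit(int i, m) {
*     ++n; // i is set to the index of each set bit in m, in ascending order
* }
* // afterwards n == m.count()
* \endcode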
*/ //X #define foreach_bit(it, mask) //X for (int _sse_vector_foreach_inner = 1, ForeachScope _sse_vector_foreach_scope(mask.toInt()), int it = _sse_vector_foreach_scope.bit(); _sse_vector_foreach_inner; --_sse_vector_foreach_inner) //X for (int _sse_vector_foreach_mask = (mask).toInt(), int _sse_vector_foreach_it = _sse_bitscan(mask.toInt()); //X _sse_vector_foreach_it > 0; //X _sse_vector_foreach_it = _sse_bitscan_initialized(_sse_vector_foreach_it, mask.data())) //X for (int _sse_vector_foreach_inner = 1, it = _sse_vector_foreach_it; _sse_vector_foreach_inner; --_sse_vector_foreach_inner) // Operators // let binary and/or/xor work for any combination of masks (as long as they have the same sizeof) template Mask operator& (const Mask &lhs, const Mask &rhs) { return _mm_and_ps(lhs.data(), rhs.data()); } template Mask operator| (const Mask &lhs, const Mask &rhs) { return _mm_or_ps (lhs.data(), rhs.data()); } template Mask operator^ (const Mask &lhs, const Mask &rhs) { return _mm_xor_ps(lhs.data(), rhs.data()); } // binary and/or/xor cannot work with one operand larger than the other template void operator& (const Mask &lhs, const Float8Mask &rhs); template void operator| (const Mask &lhs, const Float8Mask &rhs); template void operator^ (const Mask &lhs, const Float8Mask &rhs); template void operator& (const Float8Mask &rhs, const Mask &lhs); template void operator| (const Float8Mask &rhs, const Mask &lhs); template void operator^ (const Float8Mask &rhs, const Mask &lhs); // disable logical and/or for incompatible masks template void operator&&(const Mask &lhs, const Mask &rhs); template void operator||(const Mask &lhs, const Mask &rhs); template void operator&&(const Mask &lhs, const Float8Mask &rhs); template void operator||(const Mask &lhs, const Float8Mask &rhs); template void operator&&(const Float8Mask &rhs, const Mask &lhs); template void operator||(const Float8Mask &rhs, const Mask &lhs); // logical and/or for compatible masks template Vc_ALWAYS_INLINE Vc_PURE Mask operator&&(const Mask &lhs, const Mask &rhs) { return _mm_and_ps(lhs.data(), rhs.data()); } template Vc_ALWAYS_INLINE Vc_PURE Mask operator||(const Mask &lhs, const Mask &rhs) { return _mm_or_ps (lhs.data(), rhs.data()); } Vc_ALWAYS_INLINE Vc_PURE Mask<8> operator&&(const Float8Mask &rhs, const Mask<8> &lhs) { return static_cast >(rhs) && lhs; } Vc_ALWAYS_INLINE Vc_PURE Mask<8> operator||(const Float8Mask &rhs, const Mask<8> &lhs) { return static_cast >(rhs) || lhs; } Vc_ALWAYS_INLINE Vc_PURE Mask<8> operator&&(const Mask<8> &rhs, const Float8Mask &lhs) { return rhs && static_cast >(lhs); } Vc_ALWAYS_INLINE Vc_PURE Mask<8> operator||(const Mask<8> &rhs, const Float8Mask &lhs) { return rhs || static_cast >(lhs); } } // namespace SSE } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "undomacros.h" #endif // SSE_MASK_H Vc-0.7.4/sse/math.h000066400000000000000000000204731233512346000137730ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef VC_SSE_MATH_H #define VC_SSE_MATH_H #include "const.h" #include "macros.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace SSE { /** * splits \p v into exponent and mantissa, the sign is kept with the mantissa * * The return value will be in the range [0.5, 1.0[ * The \p e value will be an integer defining the power-of-two exponent */ inline double_v frexp(const double_v &v, int_v *e) { const __m128i exponentBits = Const::exponentMask().dataI(); const __m128i exponentPart = _mm_and_si128(_mm_castpd_si128(v.data()), exponentBits); *e = _mm_sub_epi32(_mm_srli_epi64(exponentPart, 52), _mm_set1_epi32(0x3fe)); const __m128d exponentMaximized = _mm_or_pd(v.data(), _mm_castsi128_pd(exponentBits)); double_v ret = _mm_and_pd(exponentMaximized, _mm_load_pd(reinterpret_cast(&c_general::frexpMask[0]))); double_m zeroMask = v == double_v::Zero(); ret(isnan(v) || !isfinite(v) || zeroMask) = v; e->setZero(zeroMask.data()); return ret; } inline float_v frexp(const float_v &v, int_v *e) { const __m128i exponentBits = Const::exponentMask().dataI(); const __m128i exponentPart = _mm_and_si128(_mm_castps_si128(v.data()), exponentBits); *e = _mm_sub_epi32(_mm_srli_epi32(exponentPart, 23), _mm_set1_epi32(0x7e)); const __m128 exponentMaximized = _mm_or_ps(v.data(), _mm_castsi128_ps(exponentBits)); float_v ret = _mm_and_ps(exponentMaximized, _mm_castsi128_ps(_mm_set1_epi32(0xbf7fffffu))); ret(isnan(v) || !isfinite(v) || v == float_v::Zero()) = v; e->setZero(v == float_v::Zero()); return ret; } inline sfloat_v frexp(const sfloat_v &v, short_v *e) { const __m128i exponentBits = Const::exponentMask().dataI(); const __m128i exponentPart0 = _mm_and_si128(_mm_castps_si128(v.data()[0]), exponentBits); const __m128i exponentPart1 = _mm_and_si128(_mm_castps_si128(v.data()[1]), exponentBits); *e = _mm_sub_epi16(_mm_packs_epi32(_mm_srli_epi32(exponentPart0, 23), _mm_srli_epi32(exponentPart1, 23)), _mm_set1_epi16(0x7e)); const __m128 exponentMaximized0 = _mm_or_ps(v.data()[0], _mm_castsi128_ps(exponentBits)); const __m128 exponentMaximized1 = _mm_or_ps(v.data()[1], _mm_castsi128_ps(exponentBits)); sfloat_v ret = M256::create( _mm_and_ps(exponentMaximized0, _mm_castsi128_ps(_mm_set1_epi32(0xbf7fffffu))), _mm_and_ps(exponentMaximized1, _mm_castsi128_ps(_mm_set1_epi32(0xbf7fffffu))) ); sfloat_m zeroMask = v == sfloat_v::Zero(); ret(isnan(v) || !isfinite(v) || zeroMask) = v; e->setZero(static_cast(zeroMask)); return ret; } /* -> x * 2^e * x == NaN -> NaN * x == (-)inf -> (-)inf */ inline double_v ldexp(double_v::AsArg v, int_v::AsArg _e) { int_v e = _e; e.setZero((v == double_v::Zero()).dataI()); const __m128i exponentBits = _mm_slli_epi64(e.data(), 52); return _mm_castsi128_pd(_mm_add_epi64(_mm_castpd_si128(v.data()), exponentBits)); } inline float_v ldexp(float_v::AsArg v, int_v::AsArg _e) { int_v e = _e; e.setZero(static_cast(v == float_v::Zero())); return (v.reinterpretCast() + (e << 23)).reinterpretCast(); } inline sfloat_v ldexp(sfloat_v::AsArg v, short_v::AsArg _e) { short_v e = _e; e.setZero(static_cast(v == sfloat_v::Zero())); e <<= (23 - 16); const __m128i exponentBits0 = _mm_unpacklo_epi16(_mm_setzero_si128(), e.data()); const __m128i exponentBits1 = _mm_unpackhi_epi16(_mm_setzero_si128(), e.data()); return M256::create(_mm_castsi128_ps(_mm_add_epi32(_mm_castps_si128(v.data()[0]), exponentBits0)), _mm_castsi128_ps(_mm_add_epi32(_mm_castps_si128(v.data()[1]), exponentBits1))); } #ifdef VC_IMPL_SSE4_1 
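// Rounding: with SSE4.1 trunc/floor/ceil map directly onto _mm_round_ps/_mm_round_pd
// (immediate 0x3 = round toward zero) and _mm_floor_*/_mm_ceil_*; without SSE4.1 the
// mantissa-masking fallback (floor_shift) further below is used instead.
// Minimal usage sketch (illustrative values):
//   float_v x(-2.5f);
//   trunc(x); // -2.0f in every lane (round toward zero)
//   floor(x); // -3.0f in every lane (round toward -infinity)
//   ceil(x);  // -2.0f in every lane (round toward +infinity)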
inline double_v trunc(double_v::AsArg v) { return _mm_round_pd(v.data(), 0x3); } inline float_v trunc(float_v::AsArg v) { return _mm_round_ps(v.data(), 0x3); } inline sfloat_v trunc(sfloat_v::AsArg v) { return M256::create(_mm_round_ps(v.data()[0], 0x3), _mm_round_ps(v.data()[1], 0x3)); } inline double_v floor(double_v::AsArg v) { return _mm_floor_pd(v.data()); } inline float_v floor(float_v::AsArg v) { return _mm_floor_ps(v.data()); } inline sfloat_v floor(sfloat_v::AsArg v) { return M256::create(_mm_floor_ps(v.data()[0]), _mm_floor_ps(v.data()[1])); } inline double_v ceil(double_v::AsArg v) { return _mm_ceil_pd(v.data()); } inline float_v ceil(float_v::AsArg v) { return _mm_ceil_ps(v.data()); } inline sfloat_v ceil(sfloat_v::AsArg v) { return M256::create(_mm_ceil_ps(v.data()[0]), _mm_ceil_ps(v.data()[1])); } #else static inline void floor_shift(float_v &v, float_v::AsArg e) { int_v x = _mm_setallone_si128(); x <<= 23; x >>= static_cast(e); v &= x.reinterpretCast(); } static inline void floor_shift(sfloat_v &v, sfloat_v::AsArg e) { int_v x = _mm_setallone_si128(); x <<= 23; int_v y = x; x >>= _mm_cvttps_epi32(e.data()[0]); y >>= _mm_cvttps_epi32(e.data()[1]); v.data()[0] = _mm_and_ps(v.data()[0], _mm_castsi128_ps(x.data())); v.data()[1] = _mm_and_ps(v.data()[1], _mm_castsi128_ps(y.data())); } static inline void floor_shift(double_v &v, double_v::AsArg e) { const long long initialMask = 0xfff0000000000000ull; const uint_v shifts = static_cast(e); union d_ll { long long ll; double d; }; d_ll mask0 = { initialMask >> shifts[0] }; d_ll mask1 = { initialMask >> shifts[1] }; v &= double_v(_mm_setr_pd(mask0.d, mask1.d)); } template inline Vector trunc(VC_ALIGNED_PARAMETER(Vector) _v) { typedef Vector V; typedef typename V::Mask M; V v = _v; V e = abs(v).exponent(); const M negativeExponent = e < 0; e.setZero(negativeExponent); //const M negativeInput = v < V::Zero(); floor_shift(v, e); v.setZero(negativeExponent); //v(negativeInput && _v != v) -= V::One(); return v; } template inline Vector floor(VC_ALIGNED_PARAMETER(Vector) _v) { typedef Vector V; typedef typename V::Mask M; V v = _v; V e = abs(v).exponent(); const M negativeExponent = e < 0; e.setZero(negativeExponent); const M negativeInput = v < V::Zero(); floor_shift(v, e); v.setZero(negativeExponent); v(negativeInput && _v != v) -= V::One(); return v; } template inline Vector ceil(VC_ALIGNED_PARAMETER(Vector) _v) { typedef Vector V; typedef typename V::Mask M; V v = _v; V e = abs(v).exponent(); const M negativeExponent = e < 0; e.setZero(negativeExponent); const M positiveInput = v > V::Zero(); floor_shift(v, e); v.setZero(negativeExponent); v(positiveInput && _v != v) += V::One(); return v; } #endif } // namespace SSE } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "undomacros.h" #define VC__USE_NAMESPACE SSE #include "../common/trigonometric.h" #define VC__USE_NAMESPACE SSE #include "../common/logarithm.h" #define VC__USE_NAMESPACE SSE #include "../common/exponential.h" #undef VC__USE_NAMESPACE #endif // VC_SSE_MATH_H Vc-0.7.4/sse/prefetches.tcc000066400000000000000000000035701233512346000155130ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2010 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef VC_SSE_PREFETCHES_TCC #define VC_SSE_PREFETCHES_TCC /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace Internal { Vc_ALWAYS_INLINE void HelperImpl::prefetchForOneRead(const void *addr) { _mm_prefetch(static_cast(const_cast(addr)), _MM_HINT_NTA); } Vc_ALWAYS_INLINE void HelperImpl::prefetchClose(const void *addr) { _mm_prefetch(static_cast(const_cast(addr)), _MM_HINT_T0); } Vc_ALWAYS_INLINE void HelperImpl::prefetchMid(const void *addr) { _mm_prefetch(static_cast(const_cast(addr)), _MM_HINT_T1); } Vc_ALWAYS_INLINE void HelperImpl::prefetchFar(const void *addr) { _mm_prefetch(static_cast(const_cast(addr)), _MM_HINT_T2); } Vc_ALWAYS_INLINE void HelperImpl::prefetchForModify(const void *addr) { #if defined(__3dNOW__) && (!defined(VC_CLANG) || VC_CLANG >= 0x30200) _m_prefetchw(const_cast(addr)); #else _mm_prefetch(static_cast(const_cast(addr)), _MM_HINT_T0); #endif } } // namespace Internal } // namespace Vc /*OUTER_NAMESPACE_END*/ #endif // VC_SSE_PREFETCHES_TCC Vc-0.7.4/sse/shuffle.h000066400000000000000000000223071233512346000144740ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2011-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
*/ #ifndef VC_SSE_SHUFFLE_H #define VC_SSE_SHUFFLE_H #include "macros.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { enum VecPos { X0, X1, X2, X3, X4, X5, X6, X7, Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7 }; namespace Mem { // shuffle([x0 x1 x2 x3], [y0 y1 y2 y3]) = [x1 x2 y0 y2] template static Vc_ALWAYS_INLINE __m128 Vc_CONST shuffle(__m128 x, __m128 y) { VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, Incorrect_Range); VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, Incorrect_Range); return _mm_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64); } // shuffle([x0 x1], [y0 y1]) = [x1 y0] template static Vc_ALWAYS_INLINE __m128d Vc_CONST shuffle(__m128d x, __m128d y) { VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= Y0, Incorrect_Range); VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= Y1, Incorrect_Range); return _mm_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2); } #if !defined(VC_IMPL_SSE4_1) && !defined(VC_IMPL_AVX) #define Vc_MAKE_INTRINSIC__(name__) Vc::SSE::_VC_CAT(m,m,_,name__) #else #define Vc_MAKE_INTRINSIC__(name__) _VC_CAT(_,mm,_,name__) #endif // blend([x0 x1], [y0, y1]) = [x0 y1] template static Vc_ALWAYS_INLINE __m128d Vc_CONST blend(__m128d x, __m128d y) { VC_STATIC_ASSERT(Dst0 == X0 || Dst0 == Y0, Incorrect_Range); VC_STATIC_ASSERT(Dst1 == X1 || Dst1 == Y1, Incorrect_Range); return Vc_MAKE_INTRINSIC__(blend_pd)(x, y, (Dst0 / Y0) + (Dst1 / Y0) * 2); } // blend([x0 x1], [y0, y1]) = [x0 y1] template static Vc_ALWAYS_INLINE __m128 Vc_CONST blend(__m128 x, __m128 y) { VC_STATIC_ASSERT(Dst0 == X0 || Dst0 == Y0, Incorrect_Range); VC_STATIC_ASSERT(Dst1 == X1 || Dst1 == Y1, Incorrect_Range); VC_STATIC_ASSERT(Dst2 == X2 || Dst2 == Y2, Incorrect_Range); VC_STATIC_ASSERT(Dst3 == X3 || Dst3 == Y3, Incorrect_Range); return Vc_MAKE_INTRINSIC__(blend_ps)(x, y, (Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 + (Dst2 / Y2) * 4 + (Dst3 / Y3) * 8); } template static Vc_ALWAYS_INLINE __m128i Vc_CONST blend(__m128i x, __m128i y) { VC_STATIC_ASSERT(Dst0 == X0 || Dst0 == Y0, Incorrect_Range); VC_STATIC_ASSERT(Dst1 == X1 || Dst1 == Y1, Incorrect_Range); VC_STATIC_ASSERT(Dst2 == X2 || Dst2 == Y2, Incorrect_Range); VC_STATIC_ASSERT(Dst3 == X3 || Dst3 == Y3, Incorrect_Range); VC_STATIC_ASSERT(Dst4 == X4 || Dst4 == Y4, Incorrect_Range); VC_STATIC_ASSERT(Dst5 == X5 || Dst5 == Y5, Incorrect_Range); VC_STATIC_ASSERT(Dst6 == X6 || Dst6 == Y6, Incorrect_Range); VC_STATIC_ASSERT(Dst7 == X7 || Dst7 == Y7, Incorrect_Range); return Vc_MAKE_INTRINSIC__(blend_epi16)(x, y, (Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 + (Dst2 / Y2) * 4 + (Dst3 / Y3) * 8 + (Dst4 / Y4) * 16 + (Dst5 / Y5) * 32 + (Dst6 / Y6) * 64 + (Dst7 / Y7) *128 ); } // permute([x0 x1 x2 x3], [y0 y1 y2 y3]) = [x1 x2 y0 y2] template static Vc_ALWAYS_INLINE __m128 Vc_CONST permute(__m128 x) { VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range); VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range); return _mm_shuffle_ps(x, x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); } template static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) { VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range); VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range); return _mm_shuffle_epi32(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); } template static Vc_ALWAYS_INLINE __m128i Vc_CONST permuteLo(__m128i x) { VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range); VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 
<= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range); return _mm_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); } template static Vc_ALWAYS_INLINE __m128i Vc_CONST permuteHi(__m128i x) { VC_STATIC_ASSERT(Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4, Incorrect_Range); VC_STATIC_ASSERT(Dst0 <= X7 && Dst1 <= X7 && Dst2 <= X7 && Dst3 <= X7, Incorrect_Range); return _mm_shufflehi_epi16(x, (Dst0 - X4) + (Dst1 - X4) * 4 + (Dst2 - X4) * 16 + (Dst3 - X4) * 64); } template static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) { VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range); VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range); VC_STATIC_ASSERT(Dst4 >= X4 && Dst5 >= X4 && Dst6 >= X4 && Dst7 >= X4, Incorrect_Range); VC_STATIC_ASSERT(Dst4 <= X7 && Dst5 <= X7 && Dst6 <= X7 && Dst7 <= X7, Incorrect_Range); if (Dst0 != X0 || Dst1 != X1 || Dst2 != X2 || Dst3 != X3) { x = _mm_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); } if (Dst4 != X4 || Dst5 != X5 || Dst6 != X6 || Dst7 != X7) { x = _mm_shufflehi_epi16(x, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64); } return x; } } // namespace Mem // The shuffles and permutes above use memory ordering. The ones below use register ordering: namespace Reg { // shuffle([x3 x2 x1 x0], [y3 y2 y1 y0]) = [y2 y0 x2 x1] template static Vc_ALWAYS_INLINE __m128 Vc_CONST shuffle(__m128 x, __m128 y) { return Mem::shuffle(x, y); } // shuffle([x1 x0], [y1 y0]) = [y0 x1] template static Vc_ALWAYS_INLINE __m128d Vc_CONST shuffle(__m128d x, __m128d y) { return Mem::shuffle(x, y); } // shuffle([x3 x2 x1 x0]) = [x3 x0 x2 x1] template static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) { VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range); VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range); return _mm_shuffle_epi32(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); } // shuffle([x3 x2 x1 x0], [y3 y2 y1 y0]) = [y2 y0 x2 x1] template static Vc_ALWAYS_INLINE __m128i Vc_CONST shuffle(__m128i x, __m128i y) { VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, Incorrect_Range); VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, Incorrect_Range); return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x), _mm_castsi128_ps(y), Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64)); } // blend([x1 x0], [y1, y0]) = [x1 y0] template static Vc_ALWAYS_INLINE __m128d Vc_CONST blend(__m128d x, __m128d y) { return Mem::blend(x, y); } template static Vc_ALWAYS_INLINE __m128 Vc_CONST blend(__m128 x, __m128 y) { return Mem::blend(x, y); } } // namespace Reg } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "undomacros.h" #endif // VC_SSE_SHUFFLE_H Vc-0.7.4/sse/types.h000066400000000000000000000134231233512346000142030ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2011 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef SSE_TYPES_H #define SSE_TYPES_H #include "intrinsics.h" #include "../common/storage.h" #define VC_DOUBLE_V_SIZE 2 #define VC_FLOAT_V_SIZE 4 #define VC_SFLOAT_V_SIZE 8 #define VC_INT_V_SIZE 4 #define VC_UINT_V_SIZE 4 #define VC_SHORT_V_SIZE 8 #define VC_USHORT_V_SIZE 8 #include "../common/types.h" #include "macros.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace SSE { template class Vector; template class WriteMaskedVector; // define our own long because on Windows64 long == int while on Linux long == max. register width // since we want to have a type that depends on 32 vs. 64 bit we need to do some special casing on Windows #ifdef _WIN64 typedef __int64 _long; typedef unsigned __int64 _ulong; #else typedef long _long; typedef unsigned long _ulong; #endif class Float8Mask; class Float8GatherMask; template class Mask; /* * Hack to create a vector object with 8 floats */ typedef Vc::sfloat float8; class M256 { public: //Vc_INTRINSIC M256() {} //Vc_INTRINSIC M256(_M128 a, _M128 b) { d[0] = a; d[1] = b; } static Vc_INTRINSIC Vc_CONST M256 dup(_M128 a) { M256 r; r.d[0] = a; r.d[1] = a; return r; } static Vc_INTRINSIC Vc_CONST M256 create(_M128 a, _M128 b) { M256 r; r.d[0] = a; r.d[1] = b; return r; } Vc_INTRINSIC _M128 &operator[](int i) { return d[i]; } Vc_INTRINSIC const _M128 &operator[](int i) const { return d[i]; } private: #ifdef VC_COMPILE_BENCHMARKS public: #endif _M128 d[2]; }; #ifdef VC_CHECK_ALIGNMENT static Vc_ALWAYS_INLINE void assertCorrectAlignment(const M256 *ptr) { const size_t s = sizeof(__m128); if((reinterpret_cast(ptr) & ((s ^ (s & (s - 1))) - 1)) != 0) { fprintf(stderr, "A vector with incorrect alignment has just been created. Look at the stacktrace to find the guilty object.\n"); abort(); } } #endif template struct ParameterHelper { typedef T ByValue; typedef T & Reference; typedef const T & ConstRef; }; #if defined VC_MSVC && !defined _WIN64 // The calling convention on WIN32 can't guarantee alignment. // An exception are the first three arguments, which may be passed in a register. 
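// The specialization below therefore passes M256 "by value" as a const reference on
// 32-bit MSVC. Sketch of the intended use (f is a hypothetical function):
//   void f(ParameterHelper<M256>::ByValue v); // const M256 & here, plain M256 elsewhere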
template<> struct ParameterHelper { typedef const M256 & ByValue; typedef M256 & Reference; typedef const M256 & ConstRef; }; #endif template struct VectorHelper {}; template struct IndexTypeHelper; template<> struct IndexTypeHelper<2u> { typedef unsigned int Type; }; template<> struct IndexTypeHelper<4u> { typedef unsigned int Type; }; template<> struct IndexTypeHelper<8u> { typedef unsigned short Type; }; template<> struct IndexTypeHelper<16u>{ typedef unsigned char Type; }; template struct CtorTypeHelper { typedef T Type; }; template<> struct CtorTypeHelper { typedef int Type; }; template<> struct CtorTypeHelper { typedef unsigned int Type; }; template<> struct CtorTypeHelper { typedef double Type; }; template struct ExpandTypeHelper { typedef T Type; }; template<> struct ExpandTypeHelper { typedef int Type; }; template<> struct ExpandTypeHelper { typedef unsigned int Type; }; template<> struct ExpandTypeHelper { typedef double Type; }; template struct VectorTypeHelper { typedef __m128i Type; }; template<> struct VectorTypeHelper { typedef __m128d Type; }; template<> struct VectorTypeHelper< float> { typedef __m128 Type; }; template<> struct VectorTypeHelper { typedef M256 Type; }; template struct DetermineMask { typedef Mask Type; }; template<> struct DetermineMask { typedef Float8Mask Type; }; template struct DetermineGatherMask { typedef T Type; }; template<> struct DetermineGatherMask { typedef Float8GatherMask Type; }; template struct VectorTraits { typedef typename VectorTypeHelper::Type VectorType; typedef typename DetermineEntryType::Type EntryType; enum Constants { Size = sizeof(VectorType) / sizeof(EntryType), HasVectorDivision = !IsInteger::Value }; typedef typename DetermineMask::Type MaskType; typedef typename DetermineGatherMask::Type GatherMaskType; typedef Vector::Type> IndexType; typedef Common::VectorMemoryUnion StorageType; }; template struct VectorHelperSize; template > class STRUCT_ALIGN1(16) VectorAlignedBaseT { public: FREE_STORE_OPERATORS_ALIGNED(16) } STRUCT_ALIGN2(16); } // namespace SSE } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "undomacros.h" #endif // SSE_TYPES_H Vc-0.7.4/sse/undomacros.h000066400000000000000000000017041233512346000152100ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2010 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef VC_SSE_UNDOMACROS_H #define VC_SSE_UNDOMACROS_H #undef VC_SSE_MACROS_H #undef STORE_VECTOR #ifdef VC_USE_PTEST #undef VC_USE_PTEST #endif #endif // VC_SSE_UNDOMACROS_H #include "../common/undomacros.h" Vc-0.7.4/sse/vector.h000066400000000000000000000702101233512346000143360ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef SSE_VECTOR_H #define SSE_VECTOR_H #include "intrinsics.h" #include "types.h" #include "vectorhelper.h" #include "mask.h" #include "../common/aliasingentryhelper.h" #include "../common/memoryfwd.h" #include #include #include "macros.h" #ifdef isfinite #undef isfinite #endif #ifdef isnan #undef isnan #endif /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace SSE { template class WriteMaskedVector { friend class Vector; typedef typename VectorTraits::MaskType Mask; typedef typename Vector::EntryType EntryType; public: FREE_STORE_OPERATORS_ALIGNED(16) //prefix Vc_INTRINSIC Vector &operator++() { vec->data() = VectorHelper::add(vec->data(), VectorHelper::notMaskedToZero(VectorHelper::one(), mask.data()) ); return *vec; } Vc_INTRINSIC Vector &operator--() { vec->data() = VectorHelper::sub(vec->data(), VectorHelper::notMaskedToZero(VectorHelper::one(), mask.data()) ); return *vec; } //postfix Vc_INTRINSIC Vector operator++(int) { Vector ret(*vec); vec->data() = VectorHelper::add(vec->data(), VectorHelper::notMaskedToZero(VectorHelper::one(), mask.data()) ); return ret; } Vc_INTRINSIC Vector operator--(int) { Vector ret(*vec); vec->data() = VectorHelper::sub(vec->data(), VectorHelper::notMaskedToZero(VectorHelper::one(), mask.data()) ); return ret; } Vc_INTRINSIC Vector &operator+=(const Vector &x) { vec->data() = VectorHelper::add(vec->data(), VectorHelper::notMaskedToZero(x.data(), mask.data())); return *vec; } Vc_INTRINSIC Vector &operator-=(const Vector &x) { vec->data() = VectorHelper::sub(vec->data(), VectorHelper::notMaskedToZero(x.data(), mask.data())); return *vec; } Vc_INTRINSIC Vector &operator*=(const Vector &x) { vec->assign(VectorHelper::mul(vec->data(), x.data()), mask); return *vec; } Vc_INTRINSIC Vector &operator/=(const Vector &x); Vc_INTRINSIC Vector &operator+=(EntryType x) { return operator+=(Vector(x)); } Vc_INTRINSIC Vector &operator-=(EntryType x) { return operator-=(Vector(x)); } Vc_INTRINSIC Vector &operator*=(EntryType x) { return operator*=(Vector(x)); } Vc_INTRINSIC Vector &operator/=(EntryType x) { return operator/=(Vector(x)); } Vc_INTRINSIC Vector &operator=(const Vector &x) { vec->assign(x, mask); return *vec; } Vc_INTRINSIC Vector &operator=(EntryType x) { vec->assign(Vector(x), mask); return *vec; } template Vc_INTRINSIC void call(const F &f) const { return vec->call(f, mask); } template Vc_INTRINSIC void call(F &f) const { return vec->call(f, mask); } template Vc_INTRINSIC Vector apply(const F &f) const { return vec->apply(f, mask); } template Vc_INTRINSIC Vector apply(F &f) const { return vec->apply(f, mask); } private: Vc_ALWAYS_INLINE WriteMaskedVector(Vector *v, const Mask &k) : vec(v), mask(k) {} Vector *const vec; Mask mask; }; template class Vector { friend class WriteMaskedVector; protected: #ifdef VC_COMPILE_BENCHMARKS public: #endif typedef typename VectorTraits::StorageType StorageType; StorageType d; typedef typename VectorTraits::GatherMaskType GatherMask; typedef VectorHelper::VectorType> HV; typedef VectorHelper HT; public: FREE_STORE_OPERATORS_ALIGNED(16) enum Constants { Size = VectorTraits::Size }; typedef typename VectorTraits::VectorType VectorType; typedef typename VectorTraits::EntryType EntryType; 
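// Illustrative values of these traits under SSE (cf. the VC_*_V_SIZE macros in types.h):
//   float_v::EntryType == float, float_v::Size == 4
//   short_v::EntryType == short, short_v::Size == 8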
typedef typename VectorTraits::IndexType IndexType; typedef typename VectorTraits::MaskType Mask; typedef typename Mask::Argument MaskArg; typedef Vc::Memory, Size> Memory; #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN typedef const Vector &AsArg; #else typedef const Vector AsArg; #endif typedef T _T; /////////////////////////////////////////////////////////////////////////////////////////// // uninitialized Vc_ALWAYS_INLINE Vector() {} /////////////////////////////////////////////////////////////////////////////////////////// // constants explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerZero::ZEnum) Vc_INTRINSIC_R; explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerOne::OEnum) Vc_INTRINSIC_R; explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerIndexesFromZero::IEnum) Vc_INTRINSIC_R; static Vc_INTRINSIC_L Vector Zero() Vc_INTRINSIC_R; static Vc_INTRINSIC_L Vector One() Vc_INTRINSIC_R; static Vc_INTRINSIC_L Vector IndexesFromZero() Vc_INTRINSIC_R; static Vc_INTRINSIC_L Vector Random() Vc_INTRINSIC_R; /////////////////////////////////////////////////////////////////////////////////////////// // internal: required to enable returning objects of VectorType Vc_ALWAYS_INLINE Vector(const VectorType &x) : d(x) {} /////////////////////////////////////////////////////////////////////////////////////////// // static_cast / copy ctor template explicit Vc_INTRINSIC_L Vector(const Vector &x) Vc_INTRINSIC_R; // implicit cast template Vc_INTRINSIC_L Vector &operator=(const Vector &x) Vc_INTRINSIC_R; // copy assignment Vc_ALWAYS_INLINE Vector &operator=(AsArg v) { d.v() = v.d.v(); return *this; } /////////////////////////////////////////////////////////////////////////////////////////// // broadcast explicit Vc_INTRINSIC_L Vector(EntryType a) Vc_INTRINSIC_R; template Vc_INTRINSIC Vector(TT x, VC_EXACT_TYPE(TT, EntryType, void *) = 0) : d(HT::set(x)) {} static Vc_INTRINSIC Vector broadcast4(const EntryType *x) { return Vector(x); } Vc_ALWAYS_INLINE Vector &operator=(EntryType a) { d.v() = HT::set(a); return *this; } /////////////////////////////////////////////////////////////////////////////////////////// // load ctors explicit Vc_INTRINSIC_L Vector(const EntryType *x) Vc_INTRINSIC_R; template Vc_INTRINSIC_L Vector(const EntryType *x, Alignment align) Vc_INTRINSIC_R; template explicit Vc_INTRINSIC_L Vector(const OtherT *x) Vc_INTRINSIC_R; template Vc_INTRINSIC_L Vector(const OtherT *x, Alignment align) Vc_INTRINSIC_R; /////////////////////////////////////////////////////////////////////////////////////////// // load member functions Vc_INTRINSIC_L void load(const EntryType *mem) Vc_INTRINSIC_R; template Vc_INTRINSIC_L void load(const EntryType *mem, Alignment align) Vc_INTRINSIC_R; template Vc_INTRINSIC_L void load(const OtherT *mem) Vc_INTRINSIC_R; template Vc_INTRINSIC_L void load(const OtherT *mem, Alignment align) Vc_INTRINSIC_R; /////////////////////////////////////////////////////////////////////////////////////////// // expand 1 float_v to 2 double_v XXX rationale? remove it for release? 
XXX explicit Vc_INTRINSIC_L Vector(const Vector::Type> *a) Vc_INTRINSIC_R; inline void expand(Vector::Type> *x) const; /////////////////////////////////////////////////////////////////////////////////////////// // zeroing Vc_INTRINSIC_L void setZero() Vc_INTRINSIC_R; Vc_INTRINSIC_L void setZero(const Mask &k) Vc_INTRINSIC_R; Vc_INTRINSIC_L void setQnan() Vc_INTRINSIC_R; Vc_INTRINSIC_L void setQnan(typename Mask::Argument k) Vc_INTRINSIC_R; /////////////////////////////////////////////////////////////////////////////////////////// // stores Vc_INTRINSIC_L void store(EntryType *mem) const Vc_INTRINSIC_R; Vc_INTRINSIC_L void store(EntryType *mem, const Mask &mask) const Vc_INTRINSIC_R; template Vc_INTRINSIC_L void store(EntryType *mem, A align) const Vc_INTRINSIC_R; template Vc_INTRINSIC_L void store(EntryType *mem, const Mask &mask, A align) const Vc_INTRINSIC_R; /////////////////////////////////////////////////////////////////////////////////////////// // swizzles Vc_INTRINSIC_L Vc_PURE_L const Vector &abcd() const Vc_INTRINSIC_R Vc_PURE_R; Vc_INTRINSIC_L Vc_PURE_L const Vector cdab() const Vc_INTRINSIC_R Vc_PURE_R; Vc_INTRINSIC_L Vc_PURE_L const Vector badc() const Vc_INTRINSIC_R Vc_PURE_R; Vc_INTRINSIC_L Vc_PURE_L const Vector aaaa() const Vc_INTRINSIC_R Vc_PURE_R; Vc_INTRINSIC_L Vc_PURE_L const Vector bbbb() const Vc_INTRINSIC_R Vc_PURE_R; Vc_INTRINSIC_L Vc_PURE_L const Vector cccc() const Vc_INTRINSIC_R Vc_PURE_R; Vc_INTRINSIC_L Vc_PURE_L const Vector dddd() const Vc_INTRINSIC_R Vc_PURE_R; Vc_INTRINSIC_L Vc_PURE_L const Vector bcad() const Vc_INTRINSIC_R Vc_PURE_R; Vc_INTRINSIC_L Vc_PURE_L const Vector bcda() const Vc_INTRINSIC_R Vc_PURE_R; Vc_INTRINSIC_L Vc_PURE_L const Vector dabc() const Vc_INTRINSIC_R Vc_PURE_R; Vc_INTRINSIC_L Vc_PURE_L const Vector acbd() const Vc_INTRINSIC_R Vc_PURE_R; Vc_INTRINSIC_L Vc_PURE_L const Vector dbca() const Vc_INTRINSIC_R Vc_PURE_R; Vc_INTRINSIC_L Vc_PURE_L const Vector dcba() const Vc_INTRINSIC_R Vc_PURE_R; /////////////////////////////////////////////////////////////////////////////////////////// // gathers template Vector(const EntryType *mem, const IndexT *indexes); template Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes); template Vector(const EntryType *mem, const IndexT *indexes, MaskArg mask); template Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes, MaskArg mask); template Vector(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes); template Vector(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask); template Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes); template Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask); template Vector(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes); template Vector(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask); template void gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes); template void gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask); #ifdef VC_USE_SET_GATHERS template void gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes, MaskArg mask); #endif template void gather(const S1 *array, const EntryType S1::* 
member1, VC_ALIGNED_PARAMETER(IT) indexes); template void gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask); template void gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes); template void gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask); template void gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes); template void gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask); /////////////////////////////////////////////////////////////////////////////////////////// // scatters template void scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) const; template void scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask) const; template void scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) const; template void scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const; template void scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) const; template void scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const; template void scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) const; template void scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask) const; //prefix Vc_INTRINSIC Vector &operator++() { data() = VectorHelper::add(data(), VectorHelper::one()); return *this; } Vc_INTRINSIC Vector &operator--() { data() = VectorHelper::sub(data(), VectorHelper::one()); return *this; } //postfix Vc_INTRINSIC Vector operator++(int) { const Vector r = *this; data() = VectorHelper::add(data(), VectorHelper::one()); return r; } Vc_INTRINSIC Vector operator--(int) { const Vector r = *this; data() = VectorHelper::sub(data(), VectorHelper::one()); return r; } Vc_INTRINSIC Common::AliasingEntryHelper operator[](size_t index) { #if defined(VC_GCC) && VC_GCC >= 0x40300 && VC_GCC < 0x40400 ::Vc::Warnings::_operator_bracket_warning(); #endif return d.m(index); } Vc_INTRINSIC_L EntryType operator[](size_t index) const Vc_PURE Vc_INTRINSIC_R; Vc_INTRINSIC Vector Vc_PURE operator~() const { return VectorHelper::andnot_(data(), VectorHelper::allone()); } Vc_ALWAYS_INLINE_L Vector::Type> operator-() const Vc_ALWAYS_INLINE_R; Vc_INTRINSIC Vector Vc_PURE operator+() const { return *this; } #define OP(symbol, fun) \ Vc_INTRINSIC Vector &operator symbol##=(const Vector &x) { data() = VectorHelper::fun(data(), x.data()); return *this; } \ Vc_INTRINSIC Vector &operator symbol##=(EntryType x) { return operator symbol##=(Vector(x)); } \ Vc_INTRINSIC Vector Vc_PURE operator symbol(const Vector &x) const { return HT::fun(data(), x.data()); } \ template Vc_INTRINSIC VC_EXACT_TYPE(TT, EntryType, Vector) Vc_PURE operator symbol(TT x) const { return operator symbol(Vector(x)); } OP(+, add) OP(-, sub) OP(*, mul) #undef OP Vc_INTRINSIC_L Vector &operator<<=(AsArg shift) Vc_INTRINSIC_R; Vc_INTRINSIC_L Vector operator<< (AsArg shift) const Vc_INTRINSIC_R; Vc_INTRINSIC_L Vector &operator<<=( int 
shift) Vc_INTRINSIC_R; Vc_INTRINSIC_L Vector operator<< ( int shift) const Vc_INTRINSIC_R; Vc_INTRINSIC_L Vector &operator>>=(AsArg shift) Vc_INTRINSIC_R; Vc_INTRINSIC_L Vector operator>> (AsArg shift) const Vc_INTRINSIC_R; Vc_INTRINSIC_L Vector &operator>>=( int shift) Vc_INTRINSIC_R; Vc_INTRINSIC_L Vector operator>> ( int shift) const Vc_INTRINSIC_R; Vc_INTRINSIC_L Vector &operator/=(const Vector &x) Vc_INTRINSIC_R; Vc_INTRINSIC_L Vector operator/ (const Vector &x) const Vc_PURE Vc_INTRINSIC_R; Vc_INTRINSIC_L Vector &operator/=(EntryType x) Vc_INTRINSIC_R; template Vc_INTRINSIC_L VC_EXACT_TYPE(TT, typename DetermineEntryType::Type, Vector) operator/(TT x) const Vc_PURE Vc_INTRINSIC_R; #define OP(symbol, fun) \ Vc_INTRINSIC_L Vector &operator symbol##=(const Vector &x) Vc_INTRINSIC_R; \ Vc_INTRINSIC_L Vector operator symbol(const Vector &x) const Vc_PURE Vc_INTRINSIC_R; \ Vc_INTRINSIC Vector &operator symbol##=(EntryType x) { return operator symbol##=(Vector(x)); } \ template Vc_INTRINSIC VC_EXACT_TYPE(TT, EntryType, Vector) Vc_PURE operator symbol(TT x) const { return operator symbol(Vector(x)); } OP(|, or_) OP(&, and_) OP(^, xor_) #undef OP #define OPcmp(symbol, fun) \ Vc_INTRINSIC Mask Vc_PURE operator symbol(const Vector &x) const { return VectorHelper::fun(data(), x.data()); } \ template Vc_INTRINSIC VC_EXACT_TYPE(TT, EntryType, Mask) Vc_PURE operator symbol(TT x) const { return operator symbol(Vector(x)); } OPcmp(==, cmpeq) OPcmp(!=, cmpneq) OPcmp(>=, cmpnlt) OPcmp(>, cmpnle) OPcmp(<, cmplt) OPcmp(<=, cmple) #undef OPcmp Vc_INTRINSIC_L Vc_PURE_L Mask isNegative() const Vc_PURE_R Vc_INTRINSIC_R; Vc_ALWAYS_INLINE void fusedMultiplyAdd(const Vector &factor, const Vector &summand) { VectorHelper::fma(data(), factor.data(), summand.data()); } Vc_ALWAYS_INLINE void assign( const Vector &v, const Mask &mask ) { const VectorType k = mm128_reinterpret_cast(mask.data()); data() = VectorHelper::blend(data(), v.data(), k); } template Vc_ALWAYS_INLINE Vc_PURE V2 staticCast() const { return StaticCastHelper::cast(data()); } template Vc_ALWAYS_INLINE Vc_PURE V2 reinterpretCast() const { return mm128_reinterpret_cast(data()); } Vc_INTRINSIC WriteMaskedVector operator()(const Mask &k) { return WriteMaskedVector(this, k); } /** * \return \p true This vector was completely filled. m2 might be 0 or != 0. You still have * to test this. * \p false This vector was not completely filled. m2 is all 0. 
*/ //inline bool pack(Mask &m1, Vector &v2, Mask &m2) { //return VectorHelper::pack(data(), m1.data, v2.data(), m2.data); //} Vc_ALWAYS_INLINE Vc_PURE VectorType &data() { return d.v(); } Vc_ALWAYS_INLINE Vc_PURE const VectorType &data() const { return d.v(); } Vc_INTRINSIC EntryType min() const { return VectorHelper::min(data()); } Vc_INTRINSIC EntryType max() const { return VectorHelper::max(data()); } Vc_INTRINSIC EntryType product() const { return VectorHelper::mul(data()); } Vc_INTRINSIC EntryType sum() const { return VectorHelper::add(data()); } Vc_INTRINSIC_L EntryType min(MaskArg m) const Vc_INTRINSIC_R; Vc_INTRINSIC_L EntryType max(MaskArg m) const Vc_INTRINSIC_R; Vc_INTRINSIC_L EntryType product(MaskArg m) const Vc_INTRINSIC_R; Vc_INTRINSIC_L EntryType sum(MaskArg m) const Vc_INTRINSIC_R; Vc_INTRINSIC_L Vector shifted(int amount) const Vc_INTRINSIC_R; Vc_INTRINSIC_L Vector rotated(int amount) const Vc_INTRINSIC_R; inline Vc_PURE Vector sorted() const { return SortHelper::sort(data()); } template void callWithValuesSorted(F &f) { EntryType value = d.m(0); f(value); for (int i = 1; i < Size; ++i) { if (d.m(i) != value) { value = d.m(i); f(value); } } } template Vc_INTRINSIC void call(const F &f) const { for_all_vector_entries(i, f(EntryType(d.m(i))); ); } template Vc_INTRINSIC void call(F &f) const { for_all_vector_entries(i, f(EntryType(d.m(i))); ); } template Vc_INTRINSIC void call(const F &f, const Mask &mask) const { Vc_foreach_bit(size_t i, mask) { f(EntryType(d.m(i))); } } template Vc_INTRINSIC void call(F &f, const Mask &mask) const { Vc_foreach_bit(size_t i, mask) { f(EntryType(d.m(i))); } } template Vc_INTRINSIC Vector apply(const F &f) const { Vector r; for_all_vector_entries(i, r.d.m(i) = f(EntryType(d.m(i))); ); return r; } template Vc_INTRINSIC Vector apply(F &f) const { Vector r; for_all_vector_entries(i, r.d.m(i) = f(EntryType(d.m(i))); ); return r; } template Vc_INTRINSIC Vector apply(const F &f, const Mask &mask) const { Vector r(*this); Vc_foreach_bit (size_t i, mask) { r.d.m(i) = f(EntryType(r.d.m(i))); } return r; } template Vc_INTRINSIC Vector apply(F &f, const Mask &mask) const { Vector r(*this); Vc_foreach_bit (size_t i, mask) { r.d.m(i) = f(EntryType(r.d.m(i))); } return r; } template Vc_INTRINSIC void fill(EntryType (&f)(IndexT)) { for_all_vector_entries(i, d.m(i) = f(i); ); } Vc_INTRINSIC void fill(EntryType (&f)()) { for_all_vector_entries(i, d.m(i) = f(); ); } Vc_INTRINSIC_L Vector copySign(typename Vector::AsArg reference) const Vc_INTRINSIC_R; Vc_INTRINSIC_L Vector exponent() const Vc_INTRINSIC_R; }; typedef Vector double_v; typedef Vector float_v; typedef Vector sfloat_v; typedef Vector int_v; typedef Vector uint_v; typedef Vector short_v; typedef Vector ushort_v; typedef double_v::Mask double_m; typedef float_v::Mask float_m; typedef sfloat_v::Mask sfloat_m; typedef int_v::Mask int_m; typedef uint_v::Mask uint_m; typedef short_v::Mask short_m; typedef ushort_v::Mask ushort_m; template<> Vc_ALWAYS_INLINE Vc_PURE Vector Vector::broadcast4(const float *x) { const _M128 &v = VectorHelper<_M128>::load(x, Aligned); return Vector(M256::create(v, v)); } template class SwizzledVector : public Vector {}; static Vc_ALWAYS_INLINE Vc_PURE int_v min(const int_v &x, const int_v &y) { return mm_min_epi32(x.data(), y.data()); } static Vc_ALWAYS_INLINE Vc_PURE uint_v min(const uint_v &x, const uint_v &y) { return mm_min_epu32(x.data(), y.data()); } static Vc_ALWAYS_INLINE Vc_PURE short_v min(const short_v &x, const short_v &y) { return _mm_min_epi16(x.data(), 
y.data()); } static Vc_ALWAYS_INLINE Vc_PURE ushort_v min(const ushort_v &x, const ushort_v &y) { return mm_min_epu16(x.data(), y.data()); } static Vc_ALWAYS_INLINE Vc_PURE float_v min(const float_v &x, const float_v &y) { return _mm_min_ps(x.data(), y.data()); } static Vc_ALWAYS_INLINE Vc_PURE double_v min(const double_v &x, const double_v &y) { return _mm_min_pd(x.data(), y.data()); } static Vc_ALWAYS_INLINE Vc_PURE int_v max(const int_v &x, const int_v &y) { return mm_max_epi32(x.data(), y.data()); } static Vc_ALWAYS_INLINE Vc_PURE uint_v max(const uint_v &x, const uint_v &y) { return mm_max_epu32(x.data(), y.data()); } static Vc_ALWAYS_INLINE Vc_PURE short_v max(const short_v &x, const short_v &y) { return _mm_max_epi16(x.data(), y.data()); } static Vc_ALWAYS_INLINE Vc_PURE ushort_v max(const ushort_v &x, const ushort_v &y) { return mm_max_epu16(x.data(), y.data()); } static Vc_ALWAYS_INLINE Vc_PURE float_v max(const float_v &x, const float_v &y) { return _mm_max_ps(x.data(), y.data()); } static Vc_ALWAYS_INLINE Vc_PURE double_v max(const double_v &x, const double_v &y) { return _mm_max_pd(x.data(), y.data()); } static Vc_ALWAYS_INLINE Vc_PURE sfloat_v min(const sfloat_v &x, const sfloat_v &y) { return M256::create(_mm_min_ps(x.data()[0], y.data()[0]), _mm_min_ps(x.data()[1], y.data()[1])); } static Vc_ALWAYS_INLINE Vc_PURE sfloat_v max(const sfloat_v &x, const sfloat_v &y) { return M256::create(_mm_max_ps(x.data()[0], y.data()[0]), _mm_max_ps(x.data()[1], y.data()[1])); } template static Vc_ALWAYS_INLINE Vc_PURE Vector sqrt (const Vector &x) { return VectorHelper::sqrt(x.data()); } template static Vc_ALWAYS_INLINE Vc_PURE Vector rsqrt(const Vector &x) { return VectorHelper::rsqrt(x.data()); } template static Vc_ALWAYS_INLINE Vc_PURE Vector abs (const Vector &x) { return VectorHelper::abs(x.data()); } template static Vc_ALWAYS_INLINE Vc_PURE Vector reciprocal(const Vector &x) { return VectorHelper::reciprocal(x.data()); } template static Vc_ALWAYS_INLINE Vc_PURE Vector round(const Vector &x) { return VectorHelper::round(x.data()); } template static Vc_ALWAYS_INLINE Vc_PURE typename Vector::Mask isfinite(const Vector &x) { return VectorHelper::isFinite(x.data()); } template static Vc_ALWAYS_INLINE Vc_PURE typename Vector::Mask isnan(const Vector &x) { return VectorHelper::isNaN(x.data()); } #include "forceToRegisters.tcc" #ifdef VC_GNU_ASM template<> Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x1) { __asm__ __volatile__(""::"x"(x1.data()[0]), "x"(x1.data()[1])); } #elif defined(VC_MSVC) #pragma optimize("g", off) template<> Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x1*/) { } #endif } // namespace SSE } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "undomacros.h" #include "vector.tcc" #include "math.h" #endif // SSE_VECTOR_H Vc-0.7.4/sse/vector.tcc000066400000000000000000002167311233512346000146720ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2010-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
*/ #include "limits.h" #include "../common/bitscanintrinsics.h" #include "macros.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { ALIGN(64) extern unsigned int RandomState[16]; namespace SSE { template static Vc_ALWAYS_INLINE Vc_CONST const T *_IndexesFromZero() { if (Size == 4) { return reinterpret_cast(_IndexesFromZero4); } else if (Size == 8) { return reinterpret_cast(_IndexesFromZero8); } else if (Size == 16) { return reinterpret_cast(_IndexesFromZero16); } return 0; } /////////////////////////////////////////////////////////////////////////////////////////// // constants {{{1 template Vc_INTRINSIC Vector::Vector(VectorSpecialInitializerZero::ZEnum) : d(VectorHelper::zero()) { } template Vc_INTRINSIC Vector::Vector(VectorSpecialInitializerOne::OEnum) : d(VectorHelper::one()) { } template Vc_INTRINSIC Vector::Vector(VectorSpecialInitializerIndexesFromZero::IEnum) : d(VectorHelper::load(_IndexesFromZero(), Aligned)) { } template Vc_INTRINSIC Vc_CONST Vector Vector::Zero() { return VectorHelper::zero(); } template Vc_INTRINSIC Vc_CONST Vector Vector::One() { return VectorHelper::one(); } template Vc_INTRINSIC Vc_CONST Vector Vector::IndexesFromZero() { return VectorHelper::load(_IndexesFromZero(), Aligned); } // conversion/casts {{{1 template template Vc_INTRINSIC Vector::Vector(const Vector &x) : d(StaticCastHelper::cast(x.data())) { } template<> template<> Vc_INTRINSIC short_v &Vector::operator=(const ushort_v &x) { data() = StaticCastHelper::cast(x.data()); return *this; } template<> template<> Vc_INTRINSIC ushort_v &Vector::operator=(const short_v &x) { data() = StaticCastHelper::cast(x.data()); return *this; } template<> template<> Vc_INTRINSIC int_v &Vector::operator=(const uint_v &x) { data() = StaticCastHelper::cast(x.data()); return *this; } template<> template<> Vc_INTRINSIC uint_v &Vector::operator=(const int_v &x) { data() = StaticCastHelper::cast(x.data()); return *this; } // broadcasts {{{1 template Vc_INTRINSIC Vector::Vector(EntryType a) : d(VectorHelper::set(a)) { } /////////////////////////////////////////////////////////////////////////////////////////// // load ctors {{{1 template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *x) { load(x); } template template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *x, A a) { load(x, a); } template template Vc_ALWAYS_INLINE Vector::Vector(const OtherT *x) { load(x); } template template Vc_ALWAYS_INLINE Vector::Vector(const OtherT *x, A a) { load(x, a); } /////////////////////////////////////////////////////////////////////////////////////////// // load member functions {{{1 template Vc_INTRINSIC void Vector::load(const EntryType *mem) { load(mem, Aligned); } template template Vc_INTRINSIC void Vector::load(const EntryType *mem, A align) { d.v() = VectorHelper::load(mem, align); } template template Vc_INTRINSIC void Vector::load(const OtherT *mem) { load(mem, Aligned); } // float8: simply use the float implementation twice {{{2 template<> template Vc_INTRINSIC void Vector::load(const OtherT *x, A a) { d.v() = M256::create( Vector(&x[0], a).data(), Vector(&x[4], a).data() ); } // LoadHelper {{{2 template struct LoadHelper; // float {{{2 template struct LoadHelper { static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const double *mem, Flags f) { return _mm_movelh_ps(_mm_cvtpd_ps(VectorHelper<__m128d>::load(&mem[0], f)), _mm_cvtpd_ps(VectorHelper<__m128d>::load(&mem[2], f))); } }; template struct LoadHelper { static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const unsigned int *mem, Flags f) { return 
StaticCastHelper::cast(VectorHelper<__m128i>::load(mem, f)); } }; template struct LoadHelper { static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const int *mem, Flags f) { return StaticCastHelper::cast(VectorHelper<__m128i>::load(mem, f)); } }; template struct LoadHelper { static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const unsigned short *mem, Flags f) { return _mm_cvtepi32_ps(LoadHelper::load(mem, f)); } }; template struct LoadHelper { static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const short *mem, Flags f) { return _mm_cvtepi32_ps(LoadHelper::load(mem, f)); } }; template struct LoadHelper { static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const unsigned char *mem, Flags f) { return _mm_cvtepi32_ps(LoadHelper::load(mem, f)); } }; template struct LoadHelper { static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const signed char *mem, Flags f) { return _mm_cvtepi32_ps(LoadHelper::load(mem, f)); } }; // int {{{2 template struct LoadHelper { static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned int *mem, Flags f) { return VectorHelper<__m128i>::load(mem, f); } }; // no difference between streaming and alignment, because the // 32/64 bit loads are not available as streaming loads, and can always be unaligned template struct LoadHelper { static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned short *mem, Flags) { return mm_cvtepu16_epi32( _mm_loadl_epi64(reinterpret_cast(mem))); } }; template struct LoadHelper { static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const short *mem, Flags) { return mm_cvtepi16_epi32(_mm_loadl_epi64(reinterpret_cast(mem))); } }; template struct LoadHelper { static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned char *mem, Flags) { return mm_cvtepu8_epi32(_mm_cvtsi32_si128(*reinterpret_cast(mem))); } }; template struct LoadHelper { static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const signed char *mem, Flags) { return mm_cvtepi8_epi32(_mm_cvtsi32_si128(*reinterpret_cast(mem))); } }; // unsigned int {{{2 template struct LoadHelper { static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned short *mem, Flags) { return mm_cvtepu16_epi32(_mm_loadl_epi64(reinterpret_cast(mem))); } }; template struct LoadHelper { static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned char *mem, Flags) { return mm_cvtepu8_epi32(_mm_cvtsi32_si128(*reinterpret_cast(mem))); } }; // short {{{2 template struct LoadHelper { static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned short *mem, Flags f) { return VectorHelper<__m128i>::load(mem, f); } }; template struct LoadHelper { static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned char *mem, Flags) { return mm_cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast(mem))); } }; template struct LoadHelper { static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const signed char *mem, Flags) { return mm_cvtepi8_epi16(_mm_loadl_epi64(reinterpret_cast(mem))); } }; // unsigned short {{{2 template struct LoadHelper { static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned char *mem, Flags) { return mm_cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast(mem))); } }; // general load, implemented via LoadHelper {{{2 template template Vc_INTRINSIC void Vector::load(const SrcT *x, Flags f) { d.v() = LoadHelper::load(x, f); } /////////////////////////////////////////////////////////////////////////////////////////// // expand/combine {{{1 template Vc_INTRINSIC Vector::Vector(const Vector::Type> *a) : d(VectorHelper::concat(a[0].data(), a[1].data())) { } template inline void Vector::expand(Vector::Type> *x) const { if (Size == 8u) { x[0].data() = 
VectorHelper::expand0(data()); x[1].data() = VectorHelper::expand1(data()); } } /////////////////////////////////////////////////////////////////////////////////////////// // zeroing {{{1 template Vc_INTRINSIC void Vector::setZero() { data() = VectorHelper::zero(); } template Vc_INTRINSIC void Vector::setZero(const Mask &k) { data() = VectorHelper::andnot_(mm128_reinterpret_cast(k.data()), data()); } template<> Vc_INTRINSIC void Vector::setQnan() { data() = _mm_setallone_pd(); } template<> Vc_INTRINSIC void Vector::setQnan(Mask::Argument k) { data() = _mm_or_pd(data(), k.dataD()); } template<> Vc_INTRINSIC void Vector::setQnan() { data() = _mm_setallone_ps(); } template<> Vc_INTRINSIC void Vector::setQnan(Mask::Argument k) { data() = _mm_or_ps(data(), k.data()); } template<> Vc_INTRINSIC void Vector::setQnan() { d.v()[0] = _mm_setallone_ps(); d.v()[1] = _mm_setallone_ps(); } template<> Vc_INTRINSIC void Vector::setQnan(Mask::Argument k) { d.v()[0] = _mm_or_ps(d.v()[0], k.data()[0]); d.v()[1] = _mm_or_ps(d.v()[1], k.data()[1]); } /////////////////////////////////////////////////////////////////////////////////////////// // stores {{{1 template Vc_INTRINSIC void Vector::store(EntryType *mem) const { VectorHelper::store(mem, data(), Aligned); } template Vc_INTRINSIC void Vector::store(EntryType *mem, const Mask &mask) const { VectorHelper::store(mem, data(), mm128_reinterpret_cast(mask.data()), Aligned); } template template Vc_INTRINSIC void Vector::store(EntryType *mem, A align) const { VectorHelper::store(mem, data(), align); } template template Vc_INTRINSIC void Vector::store(EntryType *mem, const Mask &mask, A align) const { HV::store(mem, data(), mm128_reinterpret_cast(mask.data()), align); } /////////////////////////////////////////////////////////////////////////////////////////// // division {{{1 template Vc_INTRINSIC Vector &WriteMaskedVector::operator/=(const Vector &x) { return operator=(*vec / x); } template<> Vc_INTRINSIC int_v &WriteMaskedVector::operator/=(const int_v &x) { Vc_foreach_bit (int i, mask) { vec->d.m(i) /= x.d.m(i); } return *vec; } template<> Vc_INTRINSIC uint_v &WriteMaskedVector::operator/=(const uint_v &x) { Vc_foreach_bit (int i, mask) { vec->d.m(i) /= x.d.m(i); } return *vec; } template<> Vc_INTRINSIC short_v &WriteMaskedVector::operator/=(const short_v &x) { Vc_foreach_bit (int i, mask) { vec->d.m(i) /= x.d.m(i); } return *vec; } template<> Vc_INTRINSIC ushort_v &WriteMaskedVector::operator/=(const ushort_v &x) { Vc_foreach_bit (int i, mask) { vec->d.m(i) /= x.d.m(i); } return *vec; } template inline Vector &Vector::operator/=(EntryType x) { if (VectorTraits::HasVectorDivision) { return operator/=(Vector(x)); } for_all_vector_entries(i, d.m(i) /= x; ); return *this; } template template Vc_INTRINSIC Vc_PURE VC_EXACT_TYPE(TT, typename DetermineEntryType::Type, Vector) Vector::operator/(TT x) const { if (VectorTraits::HasVectorDivision) { return operator/(Vector(x)); } Vector r; for_all_vector_entries(i, r.d.m(i) = d.m(i) / x; ); return r; } template inline Vector &Vector::operator/=(const Vector &x) { for_all_vector_entries(i, d.m(i) /= x.d.m(i); ); return *this; } template inline Vc_PURE Vector Vector::operator/(const Vector &x) const { Vector r; for_all_vector_entries(i, r.d.m(i) = d.m(i) / x.d.m(i); ); return r; } template<> inline Vector &Vector::operator/=(const Vector &x) { __m128 lo = _mm_cvtepi32_ps(VectorHelper::expand0(d.v())); __m128 hi = _mm_cvtepi32_ps(VectorHelper::expand1(d.v())); lo = _mm_div_ps(lo, 
_mm_cvtepi32_ps(VectorHelper::expand0(x.d.v()))); hi = _mm_div_ps(hi, _mm_cvtepi32_ps(VectorHelper::expand1(x.d.v()))); d.v() = _mm_packs_epi32(_mm_cvtps_epi32(lo), _mm_cvtps_epi32(hi)); return *this; } template<> inline Vc_PURE Vector Vector::operator/(const Vector &x) const { __m128 lo = _mm_cvtepi32_ps(VectorHelper::expand0(d.v())); __m128 hi = _mm_cvtepi32_ps(VectorHelper::expand1(d.v())); lo = _mm_div_ps(lo, _mm_cvtepi32_ps(VectorHelper::expand0(x.d.v()))); hi = _mm_div_ps(hi, _mm_cvtepi32_ps(VectorHelper::expand1(x.d.v()))); return _mm_packs_epi32(_mm_cvtps_epi32(lo), _mm_cvtps_epi32(hi)); } template<> inline Vector &Vector::operator/=(const Vector &x) { __m128 lo = _mm_cvtepi32_ps(VectorHelper::expand0(d.v())); __m128 hi = _mm_cvtepi32_ps(VectorHelper::expand1(d.v())); lo = _mm_div_ps(lo, _mm_cvtepi32_ps(VectorHelper::expand0(x.d.v()))); hi = _mm_div_ps(hi, _mm_cvtepi32_ps(VectorHelper::expand1(x.d.v()))); d.v() = _mm_packs_epi32(_mm_cvtps_epi32(lo), _mm_cvtps_epi32(hi)); return *this; } template<> Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator/(const Vector &x) const { __m128 lo = _mm_cvtepi32_ps(VectorHelper::expand0(d.v())); __m128 hi = _mm_cvtepi32_ps(VectorHelper::expand1(d.v())); lo = _mm_div_ps(lo, _mm_cvtepi32_ps(VectorHelper::expand0(x.d.v()))); hi = _mm_div_ps(hi, _mm_cvtepi32_ps(VectorHelper::expand1(x.d.v()))); return _mm_packs_epi32(_mm_cvtps_epi32(lo), _mm_cvtps_epi32(hi)); } template<> Vc_ALWAYS_INLINE Vector &Vector::operator/=(const Vector &x) { d.v() = _mm_div_ps(d.v(), x.d.v()); return *this; } template<> Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator/(const Vector &x) const { return _mm_div_ps(d.v(), x.d.v()); } template<> Vc_ALWAYS_INLINE Vector &Vector::operator/=(const Vector &x) { d.v()[0] = _mm_div_ps(d.v()[0], x.d.v()[0]); d.v()[1] = _mm_div_ps(d.v()[1], x.d.v()[1]); return *this; } template<> Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator/(const Vector &x) const { Vector r; r.d.v()[0] = _mm_div_ps(d.v()[0], x.d.v()[0]); r.d.v()[1] = _mm_div_ps(d.v()[1], x.d.v()[1]); return r; } template<> Vc_ALWAYS_INLINE Vector &Vector::operator/=(const Vector &x) { d.v() = _mm_div_pd(d.v(), x.d.v()); return *this; } template<> Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator/(const Vector &x) const { return _mm_div_pd(d.v(), x.d.v()); } /////////////////////////////////////////////////////////////////////////////////////////// // operator- {{{1 template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const { return _mm_xor_pd(d.v(), _mm_setsignmask_pd()); } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const { return _mm_xor_ps(d.v(), _mm_setsignmask_ps()); } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const { return M256::create( _mm_xor_ps(d.v()[0], _mm_setsignmask_ps()), _mm_xor_ps(d.v()[1], _mm_setsignmask_ps())); } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const { #ifdef VC_IMPL_SSSE3 return _mm_sign_epi32(d.v(), _mm_setallone_si128()); #else return _mm_add_epi32(_mm_xor_si128(d.v(), _mm_setallone_si128()), _mm_setone_epi32()); #endif } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const { #ifdef VC_IMPL_SSSE3 return _mm_sign_epi32(d.v(), _mm_setallone_si128()); #else return _mm_add_epi32(_mm_xor_si128(d.v(), _mm_setallone_si128()), _mm_setone_epi32()); #endif } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const { #ifdef VC_IMPL_SSSE3 return _mm_sign_epi16(d.v(), _mm_setallone_si128()); #else return 
_mm_mullo_epi16(d.v(), _mm_setallone_si128()); #endif } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const { #ifdef VC_IMPL_SSSE3 return _mm_sign_epi16(d.v(), _mm_setallone_si128()); #else return _mm_mullo_epi16(d.v(), _mm_setallone_si128()); #endif } /////////////////////////////////////////////////////////////////////////////////////////// // integer ops {{{1 #define OP_IMPL(T, symbol, fun) \ template<> Vc_ALWAYS_INLINE Vector &Vector::operator symbol##=(const Vector &x) \ { \ d.v() = VectorHelper::fun(d.v(), x.d.v()); \ return *this; \ } \ template<> Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator symbol(const Vector &x) const \ { \ return VectorHelper::fun(d.v(), x.d.v()); \ } OP_IMPL(int, &, and_) OP_IMPL(int, |, or_) OP_IMPL(int, ^, xor_) OP_IMPL(unsigned int, &, and_) OP_IMPL(unsigned int, |, or_) OP_IMPL(unsigned int, ^, xor_) OP_IMPL(short, &, and_) OP_IMPL(short, |, or_) OP_IMPL(short, ^, xor_) OP_IMPL(unsigned short, &, and_) OP_IMPL(unsigned short, |, or_) OP_IMPL(unsigned short, ^, xor_) OP_IMPL(float, &, and_) OP_IMPL(float, |, or_) OP_IMPL(float, ^, xor_) OP_IMPL(float8, &, and_) OP_IMPL(float8, |, or_) OP_IMPL(float8, ^, xor_) OP_IMPL(double, &, and_) OP_IMPL(double, |, or_) OP_IMPL(double, ^, xor_) #undef OP_IMPL #ifdef VC_IMPL_XOP static Vc_INTRINSIC Vc_CONST __m128i shiftLeft (const int_v &value, const int_v &count) { return _mm_sha_epi32(value.data(), count.data()); } static Vc_INTRINSIC Vc_CONST __m128i shiftLeft (const uint_v &value, const uint_v &count) { return _mm_shl_epi32(value.data(), count.data()); } static Vc_INTRINSIC Vc_CONST __m128i shiftLeft (const short_v &value, const short_v &count) { return _mm_sha_epi16(value.data(), count.data()); } static Vc_INTRINSIC Vc_CONST __m128i shiftLeft (const ushort_v &value, const ushort_v &count) { return _mm_shl_epi16(value.data(), count.data()); } static Vc_INTRINSIC Vc_CONST __m128i shiftRight(const int_v &value, const int_v &count) { return shiftLeft(value, -count ); } static Vc_INTRINSIC Vc_CONST __m128i shiftRight(const uint_v &value, const uint_v &count) { return shiftLeft(value, uint_v(-count)); } static Vc_INTRINSIC Vc_CONST __m128i shiftRight(const short_v &value, const short_v &count) { return shiftLeft(value, -count ); } static Vc_INTRINSIC Vc_CONST __m128i shiftRight(const ushort_v &value, const ushort_v &count) { return shiftLeft(value, ushort_v(-count)); } #define _VC_OP(T, symbol, impl) \ template<> Vc_INTRINSIC T &T::operator symbol##=(T::AsArg shift) \ { \ d.v() = impl(*this, shift); \ return *this; \ } \ template<> Vc_INTRINSIC Vc_PURE T T::operator symbol (T::AsArg shift) const \ { \ return impl(*this, shift); \ } VC_APPLY_2(VC_LIST_INT_VECTOR_TYPES, _VC_OP, <<, shiftLeft) VC_APPLY_2(VC_LIST_INT_VECTOR_TYPES, _VC_OP, >>, shiftRight) #undef _VC_OP #else #if defined(VC_GCC) && VC_GCC == 0x40600 && defined(VC_IMPL_XOP) #define VC_WORKAROUND __attribute__((optimize("no-tree-vectorize"),weak)) #else #define VC_WORKAROUND Vc_INTRINSIC #endif #define OP_IMPL(T, symbol) \ template<> VC_WORKAROUND Vector &Vector::operator symbol##=(Vector::AsArg x) \ { \ for_all_vector_entries(i, \ d.m(i) symbol##= x.d.m(i); \ ); \ return *this; \ } \ template<> inline Vc_PURE Vector Vector::operator symbol(Vector::AsArg x) const \ { \ Vector r; \ for_all_vector_entries(i, \ r.d.m(i) = d.m(i) symbol x.d.m(i); \ ); \ return r; \ } OP_IMPL(int, <<) OP_IMPL(int, >>) OP_IMPL(unsigned int, <<) OP_IMPL(unsigned int, >>) OP_IMPL(short, <<) OP_IMPL(short, >>) OP_IMPL(unsigned short, <<) OP_IMPL(unsigned 
short, >>) #undef OP_IMPL #undef VC_WORKAROUND #endif template Vc_ALWAYS_INLINE Vector &Vector::operator>>=(int shift) { d.v() = VectorHelper::shiftRight(d.v(), shift); return *this; } template Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator>>(int shift) const { return VectorHelper::shiftRight(d.v(), shift); } template Vc_ALWAYS_INLINE Vector &Vector::operator<<=(int shift) { d.v() = VectorHelper::shiftLeft(d.v(), shift); return *this; } template Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator<<(int shift) const { return VectorHelper::shiftLeft(d.v(), shift); } /////////////////////////////////////////////////////////////////////////////////////////// // swizzles {{{1 template Vc_INTRINSIC Vc_PURE const Vector &Vector::abcd() const { return *this; } template Vc_INTRINSIC Vc_PURE const Vector Vector::cdab() const { return Mem::permute(data()); } template Vc_INTRINSIC Vc_PURE const Vector Vector::badc() const { return Mem::permute(data()); } template Vc_INTRINSIC Vc_PURE const Vector Vector::aaaa() const { return Mem::permute(data()); } template Vc_INTRINSIC Vc_PURE const Vector Vector::bbbb() const { return Mem::permute(data()); } template Vc_INTRINSIC Vc_PURE const Vector Vector::cccc() const { return Mem::permute(data()); } template Vc_INTRINSIC Vc_PURE const Vector Vector::dddd() const { return Mem::permute(data()); } template Vc_INTRINSIC Vc_PURE const Vector Vector::bcad() const { return Mem::permute(data()); } template Vc_INTRINSIC Vc_PURE const Vector Vector::bcda() const { return Mem::permute(data()); } template Vc_INTRINSIC Vc_PURE const Vector Vector::dabc() const { return Mem::permute(data()); } template Vc_INTRINSIC Vc_PURE const Vector Vector::acbd() const { return Mem::permute(data()); } template Vc_INTRINSIC Vc_PURE const Vector Vector::dbca() const { return Mem::permute(data()); } template Vc_INTRINSIC Vc_PURE const Vector Vector::dcba() const { return Mem::permute(data()); } template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector::cdab() const { return M256::create(Mem::permute(d.v()[0]), Mem::permute(d.v()[1])); } template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector::badc() const { return M256::create(Mem::permute(d.v()[0]), Mem::permute(d.v()[1])); } template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector::aaaa() const { return M256::create(Mem::permute(d.v()[0]), Mem::permute(d.v()[1])); } template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector::bbbb() const { return M256::create(Mem::permute(d.v()[0]), Mem::permute(d.v()[1])); } template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector::cccc() const { return M256::create(Mem::permute(d.v()[0]), Mem::permute(d.v()[1])); } template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector::dddd() const { return M256::create(Mem::permute(d.v()[0]), Mem::permute(d.v()[1])); } template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector::bcad() const { return M256::create(Mem::permute(d.v()[0]), Mem::permute(d.v()[1])); } template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector::bcda() const { return M256::create(Mem::permute(d.v()[0]), Mem::permute(d.v()[1])); } template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector::dabc() const { return M256::create(Mem::permute(d.v()[0]), Mem::permute(d.v()[1])); } template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector::acbd() const { return M256::create(Mem::permute(d.v()[0]), Mem::permute(d.v()[1])); } template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector::dbca() const { return M256::create(Mem::permute(d.v()[0]), Mem::permute(d.v()[1])); } template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector::dcba() const { return 
M256::create(Mem::permute(d.v()[0]), Mem::permute(d.v()[1])); } #define VC_SWIZZLES_16BIT_IMPL(T) \ template<> Vc_INTRINSIC Vc_PURE const Vector Vector::cdab() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC Vc_PURE const Vector Vector::badc() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC Vc_PURE const Vector Vector::aaaa() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC Vc_PURE const Vector Vector::bbbb() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC Vc_PURE const Vector Vector::cccc() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC Vc_PURE const Vector Vector::dddd() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC Vc_PURE const Vector Vector::bcad() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC Vc_PURE const Vector Vector::bcda() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC Vc_PURE const Vector Vector::dabc() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC Vc_PURE const Vector Vector::acbd() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC Vc_PURE const Vector Vector::dbca() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC Vc_PURE const Vector Vector::dcba() const { return Mem::permute(data()); } VC_SWIZZLES_16BIT_IMPL(short) VC_SWIZZLES_16BIT_IMPL(unsigned short) #undef VC_SWIZZLES_16BIT_IMPL // operators {{{1 #include "../common/operators.h" // isNegative {{{1 template<> Vc_INTRINSIC Vc_PURE float_m float_v::isNegative() const { return sse_cast<__m128>(_mm_srai_epi32(sse_cast<__m128i>(_mm_and_ps(_mm_setsignmask_ps(), d.v())), 31)); } template<> Vc_INTRINSIC Vc_PURE sfloat_m sfloat_v::isNegative() const { return M256::create( sse_cast<__m128>(_mm_srai_epi32(sse_cast<__m128i>(_mm_and_ps(_mm_setsignmask_ps(), d.v()[0])), 31)), sse_cast<__m128>(_mm_srai_epi32(sse_cast<__m128i>(_mm_and_ps(_mm_setsignmask_ps(), d.v()[1])), 31)) ); } template<> Vc_INTRINSIC Vc_PURE double_m double_v::isNegative() const { return Mem::permute(sse_cast<__m128>( _mm_srai_epi32(sse_cast<__m128i>(_mm_and_pd(_mm_setsignmask_pd(), d.v())), 31) )); } // gathers {{{1 template template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *mem, const IndexT *indexes) { gather(mem, indexes); } template template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes) { gather(mem, indexes); } template template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *mem, const IndexT *indexes, MaskArg mask) : d(HT::zero()) { gather(mem, indexes, mask); } template template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes, MaskArg mask) : d(HT::zero()) { gather(mem, indexes, mask); } template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { gather(array, member1, indexes); } template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) : d(HT::zero()) { gather(array, member1, indexes, mask); } template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { gather(array, member1, member2, indexes); } template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) : d(HT::zero()) { gather(array, member1, member2, 
indexes, mask); } template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { gather(array, ptrMember1, outerIndexes, innerIndexes); } template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask) : d(HT::zero()) { gather(array, ptrMember1, outerIndexes, innerIndexes, mask); } template struct IndexSizeChecker { static void check() {} }; template struct IndexSizeChecker, Size> { static void check() { VC_STATIC_ASSERT(Vector::Size >= Size, IndexVector_must_have_greater_or_equal_number_of_entries); } }; template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_pd(mem[indexes[0]], mem[indexes[1]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) { IndexSizeChecker::check(); d.v()[0] = _mm_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]); d.v()[1] = _mm_setr_ps(mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); } #ifdef VC_USE_SET_GATHERS template template Vc_ALWAYS_INLINE void Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes, MaskArg mask) { IndexSizeChecker, Size>::check(); Vector indexesTmp = indexes; indexesTmp.setZero(!static_cast::Mask>(mask)); (*this)(mask) = Vector(mem, indexesTmp); } #endif #ifdef VC_USE_BSF_GATHERS #define VC_MASKED_GATHER \ int bits = mask.toInt(); \ while (bits) { \ const int i = _bit_scan_forward(bits); \ bits &= ~(1 << i); /* btr? 
*/ \ d.m(i) = ith_value(i); \ } #elif defined(VC_USE_POPCNT_BSF_GATHERS) #define VC_MASKED_GATHER \ unsigned int bits = mask.toInt(); \ unsigned int low, high = 0; \ switch (mask.count()) { \ case 8: \ high = _bit_scan_reverse(bits); \ d.m(high) = ith_value(high); \ high = (1 << high); \ case 7: \ low = _bit_scan_forward(bits); \ bits ^= high | (1 << low); \ d.m(low) = ith_value(low); \ case 6: \ high = _bit_scan_reverse(bits); \ d.m(high) = ith_value(high); \ high = (1 << high); \ case 5: \ low = _bit_scan_forward(bits); \ bits ^= high | (1 << low); \ d.m(low) = ith_value(low); \ case 4: \ high = _bit_scan_reverse(bits); \ d.m(high) = ith_value(high); \ high = (1 << high); \ case 3: \ low = _bit_scan_forward(bits); \ bits ^= high | (1 << low); \ d.m(low) = ith_value(low); \ case 2: \ high = _bit_scan_reverse(bits); \ d.m(high) = ith_value(high); \ case 1: \ low = _bit_scan_forward(bits); \ d.m(low) = ith_value(low); \ case 0: \ break; \ } #else #define VC_MASKED_GATHER \ if (mask.isEmpty()) { \ return; \ } \ for_all_vector_entries(i, \ if (mask[i]) d.m(i) = ith_value(i); \ ); #endif template template Vc_INTRINSIC void Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask) { IndexSizeChecker::check(); #define ith_value(_i_) (mem[indexes[_i_]]) VC_MASKED_GATHER #undef ith_value } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_pd(array[indexes[0]].*(member1), array[indexes[1]].*(member1)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_ps(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), array[indexes[3]].*(member1)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v()[0] = _mm_setr_ps(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), array[indexes[3]].*(member1)); d.v()[1] = _mm_setr_ps(array[indexes[4]].*(member1), array[indexes[5]].*(member1), array[indexes[6]].*(member1), array[indexes[7]].*(member1)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi32(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), array[indexes[3]].*(member1)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi32(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), array[indexes[3]].*(member1)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi16(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1), array[indexes[6]].*(member1), array[indexes[7]].*(member1)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const 
S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi16(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1), array[indexes[6]].*(member1), array[indexes[7]].*(member1)); } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) { IndexSizeChecker::check(); #define ith_value(_i_) (array[indexes[_i_]].*(member1)) VC_MASKED_GATHER #undef ith_value } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_pd(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_ps(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v()[0] = _mm_setr_ps(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2)); d.v()[1] = _mm_setr_ps(array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi32(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi32(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi16(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi16(array[indexes[0]].*(member1).*(member2), 
array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) { IndexSizeChecker::check(); #define ith_value(_i_) (array[indexes[_i_]].*(member1).*(member2)) VC_MASKED_GATHER #undef ith_value } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { IndexSizeChecker::check(); IndexSizeChecker::check(); d.v() = _mm_setr_pd((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { IndexSizeChecker::check(); IndexSizeChecker::check(); d.v() = _mm_setr_ps((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { IndexSizeChecker::check(); IndexSizeChecker::check(); d.v()[0] = _mm_setr_ps((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]); d.v()[1] = _mm_setr_ps((array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { IndexSizeChecker::check(); IndexSizeChecker::check(); d.v() = _mm_setr_epi32((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { IndexSizeChecker::check(); IndexSizeChecker::check(); d.v() = _mm_setr_epi32((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { IndexSizeChecker::check(); 
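/* A brief usage sketch for the member-pointer gathers implemented in this block
 * (illustrative only; 'Point', 'points' and 'idx' are made-up names, assuming the
 * public Vc 0.7 API from <Vc/Vc>):
 *
 *   struct Point { float x, y, z; };
 *
 *   Vc::float_v gatherX(const Point *points, const Vc::uint_v &idx)
 *   {
 *       // collects points[idx[0]].x, points[idx[1]].x, ... into one SIMD vector
 *       return Vc::float_v(points, &Point::x, idx);
 *   }
 */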
IndexSizeChecker::check(); d.v() = _mm_setr_epi16((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]], (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { IndexSizeChecker::check(); IndexSizeChecker::check(); d.v() = _mm_setr_epi16((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]], (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask) { IndexSizeChecker::check(); IndexSizeChecker::check(); #define ith_value(_i_) (array[outerIndexes[_i_]].*(ptrMember1))[innerIndexes[_i_]] VC_MASKED_GATHER #undef ith_value } // scatters {{{1 #undef VC_MASKED_GATHER #ifdef VC_USE_BSF_SCATTERS #define VC_MASKED_SCATTER \ int bits = mask.toInt(); \ while (bits) { \ const int i = _bit_scan_forward(bits); \ bits ^= (1 << i); /* btr? 
*/ \ ith_value(i) = d.m(i); \ } #elif defined(VC_USE_POPCNT_BSF_SCATTERS) #define VC_MASKED_SCATTER \ unsigned int bits = mask.toInt(); \ unsigned int low, high = 0; \ switch (mask.count()) { \ case 8: \ high = _bit_scan_reverse(bits); \ ith_value(high) = d.m(high); \ high = (1 << high); \ case 7: \ low = _bit_scan_forward(bits); \ bits ^= high | (1 << low); \ ith_value(low) = d.m(low); \ case 6: \ high = _bit_scan_reverse(bits); \ ith_value(high) = d.m(high); \ high = (1 << high); \ case 5: \ low = _bit_scan_forward(bits); \ bits ^= high | (1 << low); \ ith_value(low) = d.m(low); \ case 4: \ high = _bit_scan_reverse(bits); \ ith_value(high) = d.m(high); \ high = (1 << high); \ case 3: \ low = _bit_scan_forward(bits); \ bits ^= high | (1 << low); \ ith_value(low) = d.m(low); \ case 2: \ high = _bit_scan_reverse(bits); \ ith_value(high) = d.m(high); \ case 1: \ low = _bit_scan_forward(bits); \ ith_value(low) = d.m(low); \ case 0: \ break; \ } #else #define VC_MASKED_SCATTER \ if (mask.isEmpty()) { \ return; \ } \ for_all_vector_entries(i, \ if (mask[i]) ith_value(i) = d.m(i); \ ); #endif template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) const { for_all_vector_entries(i, mem[indexes[i]] = d.m(i); ); } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask) const { #define ith_value(_i_) mem[indexes[_i_]] VC_MASKED_SCATTER #undef ith_value } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) const { for_all_vector_entries(i, array[indexes[i]].*(member1) = d.m(i); ); } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const { #define ith_value(_i_) array[indexes[_i_]].*(member1) VC_MASKED_SCATTER #undef ith_value } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) const { for_all_vector_entries(i, array[indexes[i]].*(member1).*(member2) = d.m(i); ); } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const { #define ith_value(_i_) array[indexes[_i_]].*(member1).*(member2) VC_MASKED_SCATTER #undef ith_value } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) const { for_all_vector_entries(i, (array[innerIndexes[i]].*(ptrMember1))[outerIndexes[i]] = d.m(i); ); } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask) const { #define ith_value(_i_) (array[outerIndexes[_i_]].*(ptrMember1))[innerIndexes[_i_]] VC_MASKED_SCATTER #undef ith_value } /////////////////////////////////////////////////////////////////////////////////////////// // operator[] {{{1 template Vc_INTRINSIC typename Vector::EntryType Vc_PURE Vector::operator[](size_t index) const { return d.m(index); } #ifdef VC_GCC template<> Vc_INTRINSIC double Vc_PURE Vector::operator[](size_t index) const { if (__builtin_constant_p(index)) { return extract_double_imm(d.v(), index); } return d.m(index); } template<> 
Vc_INTRINSIC float Vc_PURE Vector::operator[](size_t index) const { return extract_float(d.v(), index); } template<> Vc_INTRINSIC float Vc_PURE Vector::operator[](size_t index) const { if (__builtin_constant_p(index)) { if (index < 4) { return extract_float_imm(d.v()[0], index); } return extract_float_imm(d.v()[1], index - 4); } return d.m(index); } template<> Vc_INTRINSIC int Vc_PURE Vector::operator[](size_t index) const { if (__builtin_constant_p(index)) { #if VC_GCC >= 0x40601 || !defined(VC_USE_VEX_CODING) // GCC < 4.6.1 incorrectly uses vmovq instead of movq for the following #ifdef __x86_64__ if (index == 0) return _mm_cvtsi128_si64(d.v()) & 0xFFFFFFFFull; if (index == 1) return _mm_cvtsi128_si64(d.v()) >> 32; #else if (index == 0) return _mm_cvtsi128_si32(d.v()); #endif #endif #ifdef VC_IMPL_SSE4_1 return _mm_extract_epi32(d.v(), index); #else return _mm_cvtsi128_si32(_mm_srli_si128(d.v(), index * 4)); #endif } return d.m(index); } template<> Vc_INTRINSIC unsigned int Vc_PURE Vector::operator[](size_t index) const { if (__builtin_constant_p(index)) { #if VC_GCC >= 0x40601 || !defined(VC_USE_VEX_CODING) // GCC < 4.6.1 incorrectly uses vmovq instead of movq for the following #ifdef __x86_64__ if (index == 0) return _mm_cvtsi128_si64(d.v()) & 0xFFFFFFFFull; if (index == 1) return _mm_cvtsi128_si64(d.v()) >> 32; #else if (index == 0) return _mm_cvtsi128_si32(d.v()); #endif #endif #ifdef VC_IMPL_SSE4_1 return _mm_extract_epi32(d.v(), index); #else return _mm_cvtsi128_si32(_mm_srli_si128(d.v(), index * 4)); #endif } return d.m(index); } template<> Vc_INTRINSIC short Vc_PURE Vector::operator[](size_t index) const { if (__builtin_constant_p(index)) { return _mm_extract_epi16(d.v(), index); } return d.m(index); } template<> Vc_INTRINSIC unsigned short Vc_PURE Vector::operator[](size_t index) const { if (__builtin_constant_p(index)) { return _mm_extract_epi16(d.v(), index); } return d.m(index); } #endif // GCC /////////////////////////////////////////////////////////////////////////////////////////// // horizontal ops {{{1 #ifndef VC_IMPL_SSE4_1 // without SSE4.1 integer multiplication is slow and we rather multiply the scalars template<> Vc_INTRINSIC Vc_PURE int Vector::product() const { return (d.m(0) * d.m(1)) * (d.m(2) * d.m(3)); } template<> Vc_INTRINSIC Vc_PURE unsigned int Vector::product() const { return (d.m(0) * d.m(1)) * (d.m(2) * d.m(3)); } #endif template Vc_ALWAYS_INLINE Vc_PURE typename Vector::EntryType Vector::min(MaskArg m) const { Vector tmp = std::numeric_limits >::max(); tmp(m) = *this; return tmp.min(); } template Vc_ALWAYS_INLINE Vc_PURE typename Vector::EntryType Vector::max(MaskArg m) const { Vector tmp = std::numeric_limits >::min(); tmp(m) = *this; return tmp.max(); } template Vc_ALWAYS_INLINE Vc_PURE typename Vector::EntryType Vector::product(MaskArg m) const { Vector tmp(VectorSpecialInitializerOne::One); tmp(m) = *this; return tmp.product(); } template Vc_ALWAYS_INLINE Vc_PURE typename Vector::EntryType Vector::sum(MaskArg m) const { Vector tmp(VectorSpecialInitializerZero::Zero); tmp(m) = *this; return tmp.sum(); } /////////////////////////////////////////////////////////////////////////////////////////// // copySign {{{1 template<> Vc_INTRINSIC Vc_PURE Vector Vector::copySign(Vector::AsArg reference) const { return _mm_or_ps( _mm_and_ps(reference.d.v(), _mm_setsignmask_ps()), _mm_and_ps(d.v(), _mm_setabsmask_ps()) ); } template<> Vc_INTRINSIC Vc_PURE Vector Vector::copySign(Vector::AsArg reference) const { return M256::create( _mm_or_ps( 
_mm_and_ps(reference.d.v()[0], _mm_setsignmask_ps()), _mm_and_ps(d.v()[0], _mm_setabsmask_ps()) ), _mm_or_ps( _mm_and_ps(reference.d.v()[1], _mm_setsignmask_ps()), _mm_and_ps(d.v()[1], _mm_setabsmask_ps()) ) ); } template<> Vc_INTRINSIC Vc_PURE Vector Vector::copySign(Vector::AsArg reference) const { return _mm_or_pd( _mm_and_pd(reference.d.v(), _mm_setsignmask_pd()), _mm_and_pd(d.v(), _mm_setabsmask_pd()) ); }//}}}1 // exponent {{{1 template<> Vc_INTRINSIC Vc_PURE Vector Vector::exponent() const { VC_ASSERT((*this >= 0.f).isFull()); return Internal::exponent(d.v()); } template<> Vc_INTRINSIC Vc_PURE Vector Vector::exponent() const { VC_ASSERT((*this >= 0.f).isFull()); return Internal::exponent(d.v()); } template<> Vc_INTRINSIC Vc_PURE Vector Vector::exponent() const { VC_ASSERT((*this >= 0.).isFull()); return Internal::exponent(d.v()); } // }}}1 // Random {{{1 static void _doRandomStep(Vector &state0, Vector &state1) { state0.load(&Vc::RandomState[0]); state1.load(&Vc::RandomState[uint_v::Size]); (state1 * 0xdeece66du + 11).store(&Vc::RandomState[uint_v::Size]); uint_v(_mm_xor_si128((state0 * 0xdeece66du + 11).data(), _mm_srli_epi32(state1.data(), 16))).store(&Vc::RandomState[0]); } template Vc_ALWAYS_INLINE Vector Vector::Random() { Vector state0, state1; _doRandomStep(state0, state1); return state0.reinterpretCast >(); } template<> Vc_ALWAYS_INLINE Vector Vector::Random() { Vector state0, state1; _doRandomStep(state0, state1); return _mm_sub_ps(_mm_or_ps(_mm_castsi128_ps(_mm_srli_epi32(state0.data(), 2)), HT::one()), HT::one()); } template<> Vc_ALWAYS_INLINE Vector Vector::Random() { Vector state0, state1; _doRandomStep(state0, state1); state1 ^= state0 >> 16; return M256::create( _mm_sub_ps(_mm_or_ps(_mm_castsi128_ps(_mm_srli_epi32(state0.data(), 2)), VectorHelper::one()), VectorHelper::one()), _mm_sub_ps(_mm_or_ps(_mm_castsi128_ps(_mm_srli_epi32(state1.data(), 2)), VectorHelper::one()), VectorHelper::one()) ); } template<> Vc_ALWAYS_INLINE Vector Vector::Random() { typedef unsigned long long uint64 Vc_MAY_ALIAS; uint64 state0 = *reinterpret_cast(&Vc::RandomState[8]); uint64 state1 = *reinterpret_cast(&Vc::RandomState[10]); const __m128i state = _mm_load_si128(reinterpret_cast(&Vc::RandomState[8])); *reinterpret_cast(&Vc::RandomState[ 8]) = (state0 * 0x5deece66dull + 11); *reinterpret_cast(&Vc::RandomState[10]) = (state1 * 0x5deece66dull + 11); return (Vector(_mm_castsi128_pd(_mm_srli_epi64(state, 12))) | One()) - One(); } // shifted / rotated {{{1 template Vc_INTRINSIC Vc_PURE Vector Vector::shifted(int amount) const { enum { EntryTypeSizeof = sizeof(EntryType) }; switch (amount) { case 0: return *this; case 1: return mm128_reinterpret_cast(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 1 * EntryTypeSizeof)); case 2: return mm128_reinterpret_cast(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 2 * EntryTypeSizeof)); case 3: return mm128_reinterpret_cast(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 3 * EntryTypeSizeof)); case 4: return mm128_reinterpret_cast(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 4 * EntryTypeSizeof)); case 5: return mm128_reinterpret_cast(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 5 * EntryTypeSizeof)); case 6: return mm128_reinterpret_cast(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 6 * EntryTypeSizeof)); case 7: return mm128_reinterpret_cast(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 7 * EntryTypeSizeof)); case 8: return 
mm128_reinterpret_cast(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 8 * EntryTypeSizeof)); case -1: return mm128_reinterpret_cast(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 1 * EntryTypeSizeof)); case -2: return mm128_reinterpret_cast(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 2 * EntryTypeSizeof)); case -3: return mm128_reinterpret_cast(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 3 * EntryTypeSizeof)); case -4: return mm128_reinterpret_cast(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 4 * EntryTypeSizeof)); case -5: return mm128_reinterpret_cast(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 5 * EntryTypeSizeof)); case -6: return mm128_reinterpret_cast(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 6 * EntryTypeSizeof)); case -7: return mm128_reinterpret_cast(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 7 * EntryTypeSizeof)); case -8: return mm128_reinterpret_cast(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 8 * EntryTypeSizeof)); } return Zero(); } template<> Vc_INTRINSIC Vc_PURE sfloat_v sfloat_v::shifted(int amount) const { enum { EntryTypeSizeof = sizeof(EntryType) }; switch (amount) { case -7: return M256::create(_mm_setzero_ps(), _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 3 * EntryTypeSizeof))); case -6: return M256::create(_mm_setzero_ps(), _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 2 * EntryTypeSizeof))); case -5: return M256::create(_mm_setzero_ps(), _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 1 * EntryTypeSizeof))); case -4: return M256::create(_mm_setzero_ps(), d.v()[0]); case -3: return M256::create(_mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 3 * EntryTypeSizeof)), _mm_castsi128_ps(mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 1 * EntryTypeSizeof))); case -2: return M256::create(_mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 2 * EntryTypeSizeof)), _mm_castsi128_ps(mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 2 * EntryTypeSizeof))); case -1: return M256::create(_mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 1 * EntryTypeSizeof)), _mm_castsi128_ps(mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 3 * EntryTypeSizeof))); case 0: return *this; case 1: return M256::create(_mm_castsi128_ps(mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 1 * EntryTypeSizeof)), _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 1 * EntryTypeSizeof))); case 2: return M256::create(_mm_castsi128_ps(mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 2 * EntryTypeSizeof)), _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 2 * EntryTypeSizeof))); case 3: return M256::create(_mm_castsi128_ps(mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 3 * EntryTypeSizeof)), _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 3 * EntryTypeSizeof))); case 4: return M256::create(d.v()[1], _mm_setzero_ps()); case 5: return M256::create(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 1 * EntryTypeSizeof)), _mm_setzero_ps()); case 6: return M256::create(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 2 * EntryTypeSizeof)), _mm_setzero_ps()); case 7: return M256::create(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 3 * EntryTypeSizeof)), _mm_setzero_ps()); } return Zero(); } template Vc_INTRINSIC Vc_PURE Vector Vector::rotated(int amount) const { 
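/* shifted() moves the entries towards lower indexes and fills the freed slots with
 * zeros, while rotated() wraps the shifted-out entries around. A short sketch against
 * the public API (illustrative values, assuming float_v::Size == 4):
 *
 *   Vc::float_v v = Vc::float_v::IndexesFromZero() + 1.f;  // [1 2 3 4]
 *   Vc::float_v a = v.shifted(1);                          // [2 3 4 0]
 *   Vc::float_v b = v.rotated(1);                          // [2 3 4 1]
 */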
enum { EntryTypeSizeof = sizeof(EntryType) }; const __m128i v = mm128_reinterpret_cast<__m128i>(d.v()); switch (static_cast(amount) % Size) { case 0: return *this; case 1: return mm128_reinterpret_cast(mm_alignr_epi8(v, v, 1 * EntryTypeSizeof)); case 2: return mm128_reinterpret_cast(mm_alignr_epi8(v, v, 2 * EntryTypeSizeof)); case 3: return mm128_reinterpret_cast(mm_alignr_epi8(v, v, 3 * EntryTypeSizeof)); // warning "Immediate parameter to intrinsic call too large" disabled in VcMacros.cmake. // ICC fails to see that the modulo operation (Size == sizeof(VectorType) / sizeof(EntryType)) // disables the following four calls unless sizeof(EntryType) == 2. case 4: return mm128_reinterpret_cast(mm_alignr_epi8(v, v, 4 * EntryTypeSizeof)); case 5: return mm128_reinterpret_cast(mm_alignr_epi8(v, v, 5 * EntryTypeSizeof)); case 6: return mm128_reinterpret_cast(mm_alignr_epi8(v, v, 6 * EntryTypeSizeof)); case 7: return mm128_reinterpret_cast(mm_alignr_epi8(v, v, 7 * EntryTypeSizeof)); } return Zero(); } template<> Vc_INTRINSIC Vc_PURE sfloat_v sfloat_v::rotated(int amount) const { enum { EntryTypeSizeof = sizeof(EntryType) }; const __m128i v0 = sse_cast<__m128i>(d.v()[0]); const __m128i v1 = sse_cast<__m128i>(d.v()[1]); switch (static_cast(amount) % Size) { case 0: return *this; case 1: return M256::create(sse_cast<__m128>(mm_alignr_epi8(v1, v0, 1 * EntryTypeSizeof)), sse_cast<__m128>(mm_alignr_epi8(v0, v1, 1 * EntryTypeSizeof))); case 2: return M256::create(sse_cast<__m128>(mm_alignr_epi8(v1, v0, 2 * EntryTypeSizeof)), sse_cast<__m128>(mm_alignr_epi8(v0, v1, 2 * EntryTypeSizeof))); case 3: return M256::create(sse_cast<__m128>(mm_alignr_epi8(v1, v0, 3 * EntryTypeSizeof)), sse_cast<__m128>(mm_alignr_epi8(v0, v1, 3 * EntryTypeSizeof))); case 4: return M256::create(d.v()[1], d.v()[0]); case 5: return M256::create(sse_cast<__m128>(mm_alignr_epi8(v0, v1, 1 * EntryTypeSizeof)), sse_cast<__m128>(mm_alignr_epi8(v1, v0, 1 * EntryTypeSizeof))); case 6: return M256::create(sse_cast<__m128>(mm_alignr_epi8(v0, v1, 2 * EntryTypeSizeof)), sse_cast<__m128>(mm_alignr_epi8(v1, v0, 2 * EntryTypeSizeof))); case 7: return M256::create(sse_cast<__m128>(mm_alignr_epi8(v0, v1, 3 * EntryTypeSizeof)), sse_cast<__m128>(mm_alignr_epi8(v1, v0, 3 * EntryTypeSizeof))); } return Zero(); } // }}}1 // sorted specializations {{{1 template<> inline Vc_PURE uint_v uint_v::sorted() const { __m128i x = data(); __m128i y = _mm_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1)); __m128i l = mm_min_epu32(x, y); __m128i h = mm_max_epu32(x, y); x = _mm_unpacklo_epi32(l, h); y = _mm_unpackhi_epi32(h, l); // sort quads l = mm_min_epu32(x, y); h = mm_max_epu32(x, y); x = _mm_unpacklo_epi32(l, h); y = _mm_unpackhi_epi64(x, x); l = mm_min_epu32(x, y); h = mm_max_epu32(x, y); return _mm_unpacklo_epi32(l, h); } template<> inline Vc_PURE ushort_v ushort_v::sorted() const { __m128i lo, hi, y, x = data(); // sort pairs y = Mem::permute(x); lo = mm_min_epu16(x, y); hi = mm_max_epu16(x, y); x = mm_blend_epi16(lo, hi, 0xaa); // merge left and right quads y = Mem::permute(x); lo = mm_min_epu16(x, y); hi = mm_max_epu16(x, y); x = mm_blend_epi16(lo, hi, 0xcc); y = _mm_srli_si128(x, 2); lo = mm_min_epu16(x, y); hi = mm_max_epu16(x, y); x = mm_blend_epi16(lo, _mm_slli_si128(hi, 2), 0xaa); // merge quads into octs y = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)); y = _mm_shufflelo_epi16(y, _MM_SHUFFLE(0, 1, 2, 3)); lo = mm_min_epu16(x, y); hi = mm_max_epu16(x, y); x = _mm_unpacklo_epi16(lo, hi); y = _mm_srli_si128(x, 8); lo = mm_min_epu16(x, y); hi = mm_max_epu16(x, 
y); x = _mm_unpacklo_epi16(lo, hi); y = _mm_srli_si128(x, 8); lo = mm_min_epu16(x, y); hi = mm_max_epu16(x, y); return _mm_unpacklo_epi16(lo, hi); } // }}}1 } // namespace SSE } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "undomacros.h" // vim: foldmethod=marker Vc-0.7.4/sse/vectorhelper.h000066400000000000000000001302061233512346000155400ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2011 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #ifndef SSE_VECTORHELPER_H #define SSE_VECTORHELPER_H #include "types.h" #include #include "macros.h" /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace SSE { namespace Internal { Vc_INTRINSIC Vc_CONST __m128 exponent(__m128 v) { __m128i tmp = _mm_srli_epi32(_mm_castps_si128(v), 23); tmp = _mm_sub_epi32(tmp, _mm_set1_epi32(0x7f)); return _mm_cvtepi32_ps(tmp); } Vc_INTRINSIC Vc_CONST M256 exponent(VC_ALIGNED_PARAMETER(M256) v) { __m128i tmp0 = _mm_srli_epi32(_mm_castps_si128(v[0]), 23); __m128i tmp1 = _mm_srli_epi32(_mm_castps_si128(v[1]), 23); tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x7f)); tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x7f)); return M256::create( _mm_cvtepi32_ps(tmp0), _mm_cvtepi32_ps(tmp1)); } Vc_INTRINSIC Vc_CONST __m128d exponent(__m128d v) { __m128i tmp = _mm_srli_epi64(_mm_castpd_si128(v), 52); tmp = _mm_sub_epi32(tmp, _mm_set1_epi32(0x3ff)); return _mm_cvtepi32_pd(_mm_shuffle_epi32(tmp, 0x08)); } } // namespace Internal template struct SortHelper { static inline Vc_CONST_L VectorType sort(VectorType) Vc_CONST_R; }; template struct SortHelper { static inline Vc_PURE_L M256 sort(const M256 &) Vc_PURE_R; }; #define OP0(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name() { return code; } #define OP2(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(VectorTypeArg a, VectorTypeArg b) { return code; } #define OP3(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(VectorTypeArg a, VectorTypeArg b, VectorTypeArg c) { return code; } template<> struct VectorHelper { typedef M256 VectorType; #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN typedef const VectorType &VectorTypeArg; #else typedef const VectorType VectorTypeArg; #endif template static Vc_ALWAYS_INLINE_L Vc_PURE_L VectorType load(const float *x, A) Vc_ALWAYS_INLINE_R Vc_PURE_R; static Vc_ALWAYS_INLINE_L void store(float *mem, VectorTypeArg x, AlignedFlag) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void store(float *mem, VectorTypeArg x, UnalignedFlag) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void store(float *mem, VectorTypeArg x, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void store(float *mem, VectorTypeArg x, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void store(float *mem, VectorTypeArg x, VectorTypeArg m, AlignedFlag) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void store(float *mem, VectorTypeArg x, VectorTypeArg m, UnalignedFlag) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void store(float *mem, VectorTypeArg 
x, VectorTypeArg m, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void store(float *mem, VectorTypeArg x, VectorTypeArg m, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R; OP0(allone, VectorType::create(_mm_setallone_ps(), _mm_setallone_ps())) OP0(zero, VectorType::create(_mm_setzero_ps(), _mm_setzero_ps())) OP2(or_, VectorType::create(_mm_or_ps(a[0], b[0]), _mm_or_ps(a[1], b[1]))) OP2(xor_, VectorType::create(_mm_xor_ps(a[0], b[0]), _mm_xor_ps(a[1], b[1]))) OP2(and_, VectorType::create(_mm_and_ps(a[0], b[0]), _mm_and_ps(a[1], b[1]))) OP2(andnot_, VectorType::create(_mm_andnot_ps(a[0], b[0]), _mm_andnot_ps(a[1], b[1]))) OP3(blend, VectorType::create(mm_blendv_ps(a[0], b[0], c[0]), mm_blendv_ps(a[1], b[1], c[1]))) }; #undef OP0 #undef OP2 #undef OP3 #define OP0(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name() { return code; } #define OP1(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a) { return code; } #define OP2(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a, const VectorType b) { return code; } #define OP3(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a, const VectorType b, const VectorType c) { return code; } template<> struct VectorHelper<_M128> { typedef _M128 VectorType; template static Vc_ALWAYS_INLINE_L Vc_PURE_L VectorType load(const float *x, A) Vc_ALWAYS_INLINE_R Vc_PURE_R; static Vc_ALWAYS_INLINE_L void store(float *mem, const VectorType x, AlignedFlag) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void store(float *mem, const VectorType x, UnalignedFlag) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void store(float *mem, const VectorType x, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void store(float *mem, const VectorType x, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void store(float *mem, const VectorType x, const VectorType m, AlignedFlag) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void store(float *mem, const VectorType x, const VectorType m, UnalignedFlag) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void store(float *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void store(float *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R; OP0(allone, _mm_setallone_ps()) OP0(zero, _mm_setzero_ps()) OP2(or_, _mm_or_ps(a, b)) OP2(xor_, _mm_xor_ps(a, b)) OP2(and_, _mm_and_ps(a, b)) OP2(andnot_, _mm_andnot_ps(a, b)) OP3(blend, mm_blendv_ps(a, b, c)) }; template<> struct VectorHelper<_M128D> { typedef _M128D VectorType; template static Vc_ALWAYS_INLINE_L Vc_PURE_L VectorType load(const double *x, A) Vc_ALWAYS_INLINE_R Vc_PURE_R; static Vc_ALWAYS_INLINE_L void store(double *mem, const VectorType x, AlignedFlag) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void store(double *mem, const VectorType x, UnalignedFlag) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void store(double *mem, const VectorType x, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void store(double *mem, const VectorType x, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void store(double *mem, const VectorType x, const VectorType m, AlignedFlag) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void store(double *mem, const VectorType x, const VectorType m, UnalignedFlag) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void store(double *mem, const VectorType x, const 
VectorType m, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R; static Vc_ALWAYS_INLINE_L void store(double *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R; OP0(allone, _mm_setallone_pd()) OP0(zero, _mm_setzero_pd()) OP2(or_, _mm_or_pd(a, b)) OP2(xor_, _mm_xor_pd(a, b)) OP2(and_, _mm_and_pd(a, b)) OP2(andnot_, _mm_andnot_pd(a, b)) OP3(blend, mm_blendv_pd(a, b, c)) }; template<> struct VectorHelper<_M128I> { typedef _M128I VectorType; template static Vc_ALWAYS_INLINE_L Vc_PURE_L VectorType load(const T *x, AlignedFlag) Vc_ALWAYS_INLINE_R Vc_PURE_R; template static Vc_ALWAYS_INLINE_L Vc_PURE_L VectorType load(const T *x, UnalignedFlag) Vc_ALWAYS_INLINE_R Vc_PURE_R; template static Vc_ALWAYS_INLINE_L Vc_PURE_L VectorType load(const T *x, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R Vc_PURE_R; template static Vc_ALWAYS_INLINE_L Vc_PURE_L VectorType load(const T *x, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R Vc_PURE_R; template static Vc_ALWAYS_INLINE_L void store(T *mem, const VectorType x, AlignedFlag) Vc_ALWAYS_INLINE_R; template static Vc_ALWAYS_INLINE_L void store(T *mem, const VectorType x, UnalignedFlag) Vc_ALWAYS_INLINE_R; template static Vc_ALWAYS_INLINE_L void store(T *mem, const VectorType x, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R; template static Vc_ALWAYS_INLINE_L void store(T *mem, const VectorType x, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R; template static Vc_ALWAYS_INLINE_L void store(T *mem, const VectorType x, const VectorType m, AlignedFlag) Vc_ALWAYS_INLINE_R; template static Vc_ALWAYS_INLINE_L void store(T *mem, const VectorType x, const VectorType m, UnalignedFlag) Vc_ALWAYS_INLINE_R; template static Vc_ALWAYS_INLINE_L void store(T *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R; template static Vc_ALWAYS_INLINE_L void store(T *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R; OP0(allone, _mm_setallone_si128()) OP0(zero, _mm_setzero_si128()) OP2(or_, _mm_or_si128(a, b)) OP2(xor_, _mm_xor_si128(a, b)) OP2(and_, _mm_and_si128(a, b)) OP2(andnot_, _mm_andnot_si128(a, b)) OP3(blend, mm_blendv_epi8(a, b, c)) }; #undef OP1 #undef OP2 #undef OP3 #define OP1(op) \ static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a) { return CAT(_mm_##op##_, SUFFIX)(a); } #define OP(op) \ static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return CAT(_mm_##op##_ , SUFFIX)(a, b); } #define OP_(op) \ static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return CAT(_mm_##op , SUFFIX)(a, b); } #define OPx(op, op2) \ static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return CAT(_mm_##op2##_, SUFFIX)(a, b); } #define OPcmp(op) \ static Vc_ALWAYS_INLINE Vc_CONST VectorType cmp##op(const VectorType a, const VectorType b) { return CAT(_mm_cmp##op##_, SUFFIX)(a, b); } #define OP_CAST_(op) \ static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return CAT(_mm_castps_, SUFFIX)( \ _mm_##op##ps(CAT(CAT(_mm_cast, SUFFIX), _ps)(a), \ CAT(CAT(_mm_cast, SUFFIX), _ps)(b))); \ } #define MINMAX \ static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return CAT(_mm_min_, SUFFIX)(a, b); } \ static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return CAT(_mm_max_, SUFFIX)(a, b); } template<> struct VectorHelper { typedef _M128D VectorType; typedef double EntryType; #define SUFFIX pd 
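// Note on the OP*/OPcmp helper macros defined above: they token-paste the operation name with the
// current SUFFIX, so inside this double specialization (SUFFIX == pd) the one-line uses such as
//   OP(add)   expand to a wrapper returning _mm_add_pd(a, b)
//   OPcmp(lt) expands to cmplt(a, b) returning _mm_cmplt_pd(a, b)
// (illustrative expansion only; CAT is the token-pasting helper pulled in from the macros header).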
OP_(or_) OP_(and_) OP_(xor_) static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, _M128 mask) { return CAT(_mm_and_, SUFFIX)(_mm_castps_pd(mask), a); } static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const double a) { return CAT(_mm_set1_, SUFFIX)(a); } static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const double a, const double b) { return CAT(_mm_set_, SUFFIX)(a, b); } static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return CAT(_mm_setzero_, SUFFIX)(); } static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return CAT(_mm_setone_, SUFFIX)(); }// set(1.); } #ifdef VC_IMPL_FMA4 static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = _mm_macc_pd(v1, v2, v3); } #else static inline void fma(VectorType &v1, VectorType v2, VectorType v3) { VectorType h1 = _mm_and_pd(v1, _mm_load_pd(reinterpret_cast(&c_general::highMaskDouble))); VectorType h2 = _mm_and_pd(v2, _mm_load_pd(reinterpret_cast(&c_general::highMaskDouble))); #if defined(VC_GCC) && VC_GCC < 0x40703 // GCC before 4.7.3 uses an incorrect optimization where it replaces the subtraction with an andnot // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=54703 asm("":"+x"(h1), "+x"(h2)); #endif const VectorType l1 = _mm_sub_pd(v1, h1); const VectorType l2 = _mm_sub_pd(v2, h2); const VectorType ll = mul(l1, l2); const VectorType lh = add(mul(l1, h2), mul(h1, l2)); const VectorType hh = mul(h1, h2); // ll < lh < hh for all entries is certain const VectorType lh_lt_v3 = cmplt(abs(lh), abs(v3)); // |lh| < |v3| const VectorType b = mm_blendv_pd(v3, lh, lh_lt_v3); const VectorType c = mm_blendv_pd(lh, v3, lh_lt_v3); v1 = add(add(ll, b), add(c, hh)); } #endif OP(add) OP(sub) OP(mul) OPcmp(eq) OPcmp(neq) OPcmp(lt) OPcmp(nlt) OPcmp(le) OPcmp(nle) OP1(sqrt) static Vc_ALWAYS_INLINE Vc_CONST VectorType rsqrt(VectorType x) { return _mm_div_pd(one(), sqrt(x)); } static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VectorType x) { return _mm_div_pd(one(), x); } static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VectorType x) { return _mm_cmpunord_pd(x, x); } static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VectorType x) { return _mm_cmpord_pd(x, _mm_mul_pd(zero(), x)); } static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) { return CAT(_mm_and_, SUFFIX)(a, _mm_setabsmask_pd()); } MINMAX static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) { a = _mm_min_sd(a, _mm_unpackhi_pd(a, a)); return _mm_cvtsd_f64(a); } static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) { a = _mm_max_sd(a, _mm_unpackhi_pd(a, a)); return _mm_cvtsd_f64(a); } static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) { a = _mm_mul_sd(a, _mm_shuffle_pd(a, a, _MM_SHUFFLE2(0, 1))); return _mm_cvtsd_f64(a); } static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) { a = _mm_add_sd(a, _mm_shuffle_pd(a, a, _MM_SHUFFLE2(0, 1))); return _mm_cvtsd_f64(a); } #undef SUFFIX static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { #ifdef VC_IMPL_SSE4_1 return _mm_round_pd(a, _MM_FROUND_NINT); #else //XXX: slow: _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); return _mm_cvtepi32_pd(_mm_cvtpd_epi32(a)); #endif } }; template<> struct VectorHelper { typedef float EntryType; typedef _M128 VectorType; #define SUFFIX ps OP_(or_) OP_(and_) OP_(xor_) static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, _M128 mask) { return CAT(_mm_and_, SUFFIX)(mask, a); } static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a) { return CAT(_mm_set1_, SUFFIX)(a); } static Vc_ALWAYS_INLINE Vc_CONST 
VectorType set(const float a, const float b, const float c, const float d) { return CAT(_mm_set_, SUFFIX)(a, b, c, d); } static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return CAT(_mm_setzero_, SUFFIX)(); } static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return CAT(_mm_setone_, SUFFIX)(); }// set(1.f); } static Vc_ALWAYS_INLINE Vc_CONST _M128 concat(_M128D a, _M128D b) { return _mm_movelh_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b)); } #ifdef VC_IMPL_FMA4 static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = _mm_macc_ps(v1, v2, v3); } #else static inline void fma(VectorType &v1, VectorType v2, VectorType v3) { __m128d v1_0 = _mm_cvtps_pd(v1); __m128d v1_1 = _mm_cvtps_pd(_mm_movehl_ps(v1, v1)); __m128d v2_0 = _mm_cvtps_pd(v2); __m128d v2_1 = _mm_cvtps_pd(_mm_movehl_ps(v2, v2)); __m128d v3_0 = _mm_cvtps_pd(v3); __m128d v3_1 = _mm_cvtps_pd(_mm_movehl_ps(v3, v3)); v1 = _mm_movelh_ps( _mm_cvtpd_ps(_mm_add_pd(_mm_mul_pd(v1_0, v2_0), v3_0)), _mm_cvtpd_ps(_mm_add_pd(_mm_mul_pd(v1_1, v2_1), v3_1))); } #endif OP(add) OP(sub) OP(mul) OPcmp(eq) OPcmp(neq) OPcmp(lt) OPcmp(nlt) OPcmp(le) OPcmp(nle) OP1(sqrt) OP1(rsqrt) static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VectorType x) { return _mm_cmpunord_ps(x, x); } static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VectorType x) { return _mm_cmpord_ps(x, _mm_mul_ps(zero(), x)); } static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VectorType x) { return _mm_rcp_ps(x); } static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) { return CAT(_mm_and_, SUFFIX)(a, _mm_setabsmask_ps()); } MINMAX static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) { a = _mm_min_ps(a, _mm_movehl_ps(a, a)); // a = min(a0, a2), min(a1, a3), min(a2, a2), min(a3, a3) a = _mm_min_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1))); // a = min(a0, a1), a1, a2, a3 return _mm_cvtss_f32(a); } static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) { a = _mm_max_ps(a, _mm_movehl_ps(a, a)); // a = max(a0, a2), max(a1, a3), max(a2, a2), max(a3, a3) a = _mm_max_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1))); // a = max(a0, a1), a1, a2, a3 return _mm_cvtss_f32(a); } static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) { a = _mm_mul_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3))); a = _mm_mul_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 0, 1))); return _mm_cvtss_f32(a); } static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) { a = _mm_add_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3))); a = _mm_add_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 0, 1))); return _mm_cvtss_f32(a); } #undef SUFFIX static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { #ifdef VC_IMPL_SSE4_1 return _mm_round_ps(a, _MM_FROUND_NINT); #else //XXX slow: _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); return _mm_cvtepi32_ps(_mm_cvtps_epi32(a)); #endif } }; template<> struct VectorHelper { typedef float EntryType; typedef M256 VectorType; #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN typedef const VectorType &VectorTypeArg; #else typedef const VectorType VectorTypeArg; #endif static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a) { const _M128 x = _mm_set1_ps(a); return VectorType::create(x, x); } static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d) { const _M128 x = _mm_set_ps(a, b, c, d); return VectorType::create(x, x); } static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d, const float e, const float f, 
const float g, const float h) { return VectorType::create(_mm_set_ps(a, b, c, d), _mm_set_ps(e, f, g, h)); } static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return VectorType::create(_mm_setzero_ps(), _mm_setzero_ps()); } static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return set(1.f); } #define REUSE_FLOAT_IMPL1(fun) \ static Vc_ALWAYS_INLINE Vc_CONST VectorType fun(VectorTypeArg x) { \ return VectorType::create(VectorHelper::fun(x[0]), VectorHelper::fun(x[1])); \ } #define REUSE_FLOAT_IMPL2(fun) \ static Vc_ALWAYS_INLINE Vc_CONST VectorType fun(VectorTypeArg x, VectorTypeArg y) { \ return VectorType::create(VectorHelper::fun(x[0], y[0]), VectorHelper::fun(x[1], y[1])); \ } REUSE_FLOAT_IMPL1(reciprocal) REUSE_FLOAT_IMPL1(sqrt) REUSE_FLOAT_IMPL1(rsqrt) REUSE_FLOAT_IMPL1(isNaN) REUSE_FLOAT_IMPL1(isFinite) REUSE_FLOAT_IMPL1(abs) REUSE_FLOAT_IMPL1(round) REUSE_FLOAT_IMPL2(and_) REUSE_FLOAT_IMPL2(or_) REUSE_FLOAT_IMPL2(xor_) REUSE_FLOAT_IMPL2(notMaskedToZero) REUSE_FLOAT_IMPL2(add) REUSE_FLOAT_IMPL2(sub) REUSE_FLOAT_IMPL2(mul) REUSE_FLOAT_IMPL2(cmple) REUSE_FLOAT_IMPL2(cmpnle) REUSE_FLOAT_IMPL2(cmplt) REUSE_FLOAT_IMPL2(cmpnlt) REUSE_FLOAT_IMPL2(cmpeq) REUSE_FLOAT_IMPL2(cmpneq) REUSE_FLOAT_IMPL2(min) REUSE_FLOAT_IMPL2(max) static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorTypeArg a) { return VectorHelper::min(VectorHelper::min(a[0], a[1])); } static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorTypeArg a) { return VectorHelper::max(VectorHelper::max(a[0], a[1])); } static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorTypeArg a) { return VectorHelper::mul(VectorHelper::mul(a[0], a[1])); } static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorTypeArg a) { return VectorHelper::add(VectorHelper::add(a[0], a[1])); } static inline void fma(VectorType &a, VectorTypeArg b, VectorTypeArg c) { VectorHelper::fma(a[0], b[0], c[0]); VectorHelper::fma(a[1], b[1], c[1]); } #undef REUSE_FLOAT_IMPL2 #undef REUSE_FLOAT_IMPL1 }; template<> struct VectorHelper { typedef int EntryType; typedef _M128I VectorType; #define SUFFIX si128 OP_(or_) OP_(and_) OP_(xor_) static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return CAT(_mm_setzero_, SUFFIX)(); } static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, _M128 mask) { return CAT(_mm_and_, SUFFIX)(_mm_castps_si128(mask), a); } #undef SUFFIX #define SUFFIX epi32 static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return CAT(_mm_setone_, SUFFIX)(); } static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const int a) { return CAT(_mm_set1_, SUFFIX)(a); } static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const int a, const int b, const int c, const int d) { return CAT(_mm_set_, SUFFIX)(a, b, c, d); } static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); } static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) { return CAT(_mm_slli_, SUFFIX)(a, shift); } static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) { return CAT(_mm_srai_, SUFFIX)(a, shift); } static Vc_INTRINSIC Vc_CONST VectorType abs(const VectorType a) { return mm_abs_epi32(a); } static Vc_INTRINSIC Vc_CONST VectorType min(const VectorType a, const VectorType b) { return mm_min_epi32(a, b); } static Vc_INTRINSIC Vc_CONST VectorType max(const VectorType a, const VectorType b) { return mm_max_epi32(a, b); } static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) { a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here a = min(a, 
_mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); return _mm_cvtsi128_si32(a); } static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) { a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); return _mm_cvtsi128_si32(a); } static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) { a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); return _mm_cvtsi128_si32(a); } #ifdef VC_IMPL_SSE4_1 static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(VectorType a, VectorType b) { return _mm_mullo_epi32(a, b); } static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) { a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); return _mm_cvtsi128_si32(a); } #else static inline Vc_CONST VectorType mul(const VectorType a, const VectorType b) { const VectorType aShift = _mm_srli_si128(a, 4); const VectorType ab02 = _mm_mul_epu32(a, b); // [a0 * b0, a2 * b2] const VectorType bShift = _mm_srli_si128(b, 4); const VectorType ab13 = _mm_mul_epu32(aShift, bShift); // [a1 * b1, a3 * b3] return _mm_unpacklo_epi32(_mm_shuffle_epi32(ab02, 8), _mm_shuffle_epi32(ab13, 8)); } #endif OP(add) OP(sub) OPcmp(eq) OPcmp(lt) OPcmp(gt) static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpneq(const VectorType a, const VectorType b) { _M128I x = cmpeq(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); } static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnlt(const VectorType a, const VectorType b) { _M128I x = cmplt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); } static Vc_ALWAYS_INLINE Vc_CONST VectorType cmple (const VectorType a, const VectorType b) { _M128I x = cmpgt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); } static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnle(const VectorType a, const VectorType b) { return cmpgt(a, b); } #undef SUFFIX static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; } }; template<> struct VectorHelper { typedef unsigned int EntryType; typedef _M128I VectorType; #define SUFFIX si128 OP_CAST_(or_) OP_CAST_(and_) OP_CAST_(xor_) static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return CAT(_mm_setzero_, SUFFIX)(); } static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, _M128 mask) { return CAT(_mm_and_, SUFFIX)(_mm_castps_si128(mask), a); } #undef SUFFIX #define SUFFIX epu32 static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return CAT(_mm_setone_, SUFFIX)(); } static Vc_INTRINSIC Vc_CONST VectorType min(const VectorType a, const VectorType b) { return mm_min_epu32(a, b); } static Vc_INTRINSIC Vc_CONST VectorType max(const VectorType a, const VectorType b) { return mm_max_epu32(a, b); } static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) { a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); return _mm_cvtsi128_si32(a); } static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) { a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); return _mm_cvtsi128_si32(a); } static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) { a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); return 
_mm_cvtsi128_si32(a); } static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) { a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); return _mm_cvtsi128_si32(a); } static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); } static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(const VectorType a, const VectorType b) { return VectorHelper::mul(a, b); } //X template static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(const VectorType a) { //X switch (b) { //X case 0: return zero(); //X case 1: return a; //X case 2: return _mm_slli_epi32(a, 1); //X case 4: return _mm_slli_epi32(a, 2); //X case 8: return _mm_slli_epi32(a, 3); //X case 16: return _mm_slli_epi32(a, 4); //X case 32: return _mm_slli_epi32(a, 5); //X case 64: return _mm_slli_epi32(a, 6); //X case 128: return _mm_slli_epi32(a, 7); //X case 256: return _mm_slli_epi32(a, 8); //X case 512: return _mm_slli_epi32(a, 9); //X case 1024: return _mm_slli_epi32(a, 10); //X case 2048: return _mm_slli_epi32(a, 11); //X } //X return mul(a, set(b)); //X } #undef SUFFIX #define SUFFIX epi32 static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) { return CAT(_mm_slli_, SUFFIX)(a, shift); } static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) { return CAT(_mm_srli_, SUFFIX)(a, shift); } static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const unsigned int a) { return CAT(_mm_set1_, SUFFIX)(a); } static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const unsigned int a, const unsigned int b, const unsigned int c, const unsigned int d) { return CAT(_mm_set_, SUFFIX)(a, b, c, d); } OP(add) OP(sub) OPcmp(eq) static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpneq(const VectorType a, const VectorType b) { return _mm_andnot_si128(cmpeq(a, b), _mm_setallone_si128()); } #ifndef USE_INCORRECT_UNSIGNED_COMPARE static Vc_ALWAYS_INLINE Vc_CONST VectorType cmplt(const VectorType a, const VectorType b) { return _mm_cmplt_epu32(a, b); } static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpgt(const VectorType a, const VectorType b) { return _mm_cmpgt_epu32(a, b); } #else OPcmp(lt) OPcmp(gt) #endif static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnlt(const VectorType a, const VectorType b) { return _mm_andnot_si128(cmplt(a, b), _mm_setallone_si128()); } static Vc_ALWAYS_INLINE Vc_CONST VectorType cmple (const VectorType a, const VectorType b) { return _mm_andnot_si128(cmpgt(a, b), _mm_setallone_si128()); } static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnle(const VectorType a, const VectorType b) { return cmpgt(a, b); } #undef SUFFIX static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; } }; template<> struct VectorHelper { typedef _M128I VectorType; typedef signed short EntryType; #define SUFFIX si128 OP_(or_) OP_(and_) OP_(xor_) static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return CAT(_mm_setzero_, SUFFIX)(); } static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, _M128 mask) { return CAT(_mm_and_, SUFFIX)(_mm_castps_si128(mask), a); } static Vc_ALWAYS_INLINE Vc_CONST _M128I concat(_M128I a, _M128I b) { return _mm_packs_epi32(a, b); } static Vc_ALWAYS_INLINE Vc_CONST _M128I expand0(_M128I x) { return _mm_srai_epi32(_mm_unpacklo_epi16(x, x), 16); } static Vc_ALWAYS_INLINE Vc_CONST _M128I expand1(_M128I x) { return _mm_srai_epi32(_mm_unpackhi_epi16(x, x), 16); } #undef SUFFIX #define SUFFIX epi16 static Vc_ALWAYS_INLINE Vc_CONST VectorType one() 
{ return CAT(_mm_setone_, SUFFIX)(); } static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) { return CAT(_mm_slli_, SUFFIX)(a, shift); } static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) { return CAT(_mm_srai_, SUFFIX)(a, shift); } static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a) { return CAT(_mm_set1_, SUFFIX)(a); } static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a, const EntryType b, const EntryType c, const EntryType d, const EntryType e, const EntryType f, const EntryType g, const EntryType h) { return CAT(_mm_set_, SUFFIX)(a, b, c, d, e, f, g, h); } static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); } static Vc_INTRINSIC Vc_CONST VectorType abs(const VectorType a) { return mm_abs_epi16(a); } OPx(mul, mullo) OP(min) OP(max) static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) { // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change" a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); return _mm_cvtsi128_si32(a); // & 0xffff is implicit } static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) { // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change" a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); return _mm_cvtsi128_si32(a); // & 0xffff is implicit } static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) { a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); return _mm_cvtsi128_si32(a); // & 0xffff is implicit } static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) { a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); return _mm_cvtsi128_si32(a); // & 0xffff is implicit } OP(add) OP(sub) OPcmp(eq) OPcmp(lt) OPcmp(gt) static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpneq(const VectorType a, const VectorType b) { _M128I x = cmpeq(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); } static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnlt(const VectorType a, const VectorType b) { _M128I x = cmplt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); } static Vc_ALWAYS_INLINE Vc_CONST VectorType cmple (const VectorType a, const VectorType b) { _M128I x = cmpgt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); } static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnle(const VectorType a, const VectorType b) { return cmpgt(a, b); } #undef SUFFIX static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; } }; template<> struct VectorHelper { typedef _M128I VectorType; typedef unsigned short EntryType; #define SUFFIX si128 OP_CAST_(or_) OP_CAST_(and_) OP_CAST_(xor_) static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return CAT(_mm_setzero_, SUFFIX)(); } static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, _M128 mask) { return CAT(_mm_and_, SUFFIX)(_mm_castps_si128(mask), a); } #ifdef VC_IMPL_SSE4_1 static Vc_ALWAYS_INLINE Vc_CONST _M128I concat(_M128I a, _M128I b) { return _mm_packus_epi32(a, b); } #else // XXX too bad, but this is broken without SSE 4.1 static 
Vc_ALWAYS_INLINE Vc_CONST _M128I concat(_M128I a, _M128I b) { return _mm_packs_epi32(a, b); } #endif static Vc_ALWAYS_INLINE Vc_CONST _M128I expand0(_M128I x) { return _mm_srli_epi32(_mm_unpacklo_epi16(x, x), 16); } static Vc_ALWAYS_INLINE Vc_CONST _M128I expand1(_M128I x) { return _mm_srli_epi32(_mm_unpackhi_epi16(x, x), 16); } #undef SUFFIX #define SUFFIX epu16 static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return CAT(_mm_setone_, SUFFIX)(); } //X template static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(const VectorType a) { //X switch (b) { //X case 0: return zero(); //X case 1: return a; //X case 2: return _mm_slli_epi16(a, 1); //X case 4: return _mm_slli_epi16(a, 2); //X case 8: return _mm_slli_epi16(a, 3); //X case 16: return _mm_slli_epi16(a, 4); //X case 32: return _mm_slli_epi16(a, 5); //X case 64: return _mm_slli_epi16(a, 6); //X case 128: return _mm_slli_epi16(a, 7); //X case 256: return _mm_slli_epi16(a, 8); //X case 512: return _mm_slli_epi16(a, 9); //X case 1024: return _mm_slli_epi16(a, 10); //X case 2048: return _mm_slli_epi16(a, 11); //X } //X return mul(a, set(b)); //X } #if !defined(USE_INCORRECT_UNSIGNED_COMPARE) || VC_IMPL_SSE4_1 static Vc_INTRINSIC Vc_CONST VectorType min(const VectorType a, const VectorType b) { return CAT(mm_min_, SUFFIX)(a, b); } static Vc_INTRINSIC Vc_CONST VectorType max(const VectorType a, const VectorType b) { return CAT(mm_max_, SUFFIX)(a, b); } #endif #undef SUFFIX #define SUFFIX epi16 static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) { return CAT(_mm_slli_, SUFFIX)(a, shift); } static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) { return CAT(_mm_srli_, SUFFIX)(a, shift); } static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); } OPx(mul, mullo) // should work correctly for all values #if defined(USE_INCORRECT_UNSIGNED_COMPARE) && !defined(VC_IMPL_SSE4_1) OP(min) OP(max) // XXX breaks for values with MSB set #endif static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) { // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change" a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); return _mm_cvtsi128_si32(a); // & 0xffff is implicit } static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) { // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change" a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); return _mm_cvtsi128_si32(a); // & 0xffff is implicit } static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) { // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change" a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); return _mm_cvtsi128_si32(a); // & 0xffff is implicit } static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) { // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change" a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); return _mm_cvtsi128_si32(a); // & 0xffff is implicit } static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a) { return CAT(_mm_set1_, SUFFIX)(a); } static Vc_ALWAYS_INLINE 
Vc_CONST VectorType set(const EntryType a, const EntryType b, const EntryType c, const EntryType d, const EntryType e, const EntryType f, const EntryType g, const EntryType h) { return CAT(_mm_set_, SUFFIX)(a, b, c, d, e, f, g, h); } OP(add) OP(sub) OPcmp(eq) static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpneq(const VectorType a, const VectorType b) { return _mm_andnot_si128(cmpeq(a, b), _mm_setallone_si128()); } #ifndef USE_INCORRECT_UNSIGNED_COMPARE static Vc_ALWAYS_INLINE Vc_CONST VectorType cmplt(const VectorType a, const VectorType b) { return _mm_cmplt_epu16(a, b); } static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpgt(const VectorType a, const VectorType b) { return _mm_cmpgt_epu16(a, b); } #else OPcmp(lt) OPcmp(gt) #endif static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnlt(const VectorType a, const VectorType b) { return _mm_andnot_si128(cmplt(a, b), _mm_setallone_si128()); } static Vc_ALWAYS_INLINE Vc_CONST VectorType cmple (const VectorType a, const VectorType b) { return _mm_andnot_si128(cmpgt(a, b), _mm_setallone_si128()); } static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnle(const VectorType a, const VectorType b) { return cmpgt(a, b); } #undef SUFFIX static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; } }; #undef OP1 #undef OP #undef OP_ #undef OPx #undef OPcmp } // namespace SSE } // namespace Vc /*OUTER_NAMESPACE_END*/ #include "vectorhelper.tcc" #include "undomacros.h" #endif // SSE_VECTORHELPER_H Vc-0.7.4/sse/vectorhelper.tcc000066400000000000000000000532011233512346000160610ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2011 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
*/ #include "casts.h" #include /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { namespace SSE { //////////////////////////////////////////////////////////////////////////////////////////////////// // float_v template<> Vc_ALWAYS_INLINE Vc_PURE _M128 VectorHelper<_M128>::load(const float *x, AlignedFlag) { return _mm_load_ps(x); } template<> Vc_ALWAYS_INLINE Vc_PURE _M128 VectorHelper<_M128>::load(const float *x, UnalignedFlag) { return _mm_loadu_ps(x); } template<> Vc_ALWAYS_INLINE Vc_PURE _M128 VectorHelper<_M128>::load(const float *x, StreamingAndAlignedFlag) { return _mm_stream_load(x); } template<> Vc_ALWAYS_INLINE Vc_PURE _M128 VectorHelper<_M128>::load(const float *x, StreamingAndUnalignedFlag) { return load(x, Unaligned); } //////////////////////////////////////////////////////////////////////////////////////////////////// // stores Vc_ALWAYS_INLINE void VectorHelper<_M128>::store(float *mem, const VectorType x, AlignedFlag) { _mm_store_ps(mem, x); } Vc_ALWAYS_INLINE void VectorHelper<_M128>::store(float *mem, const VectorType x, UnalignedFlag) { _mm_storeu_ps(mem, x); } Vc_ALWAYS_INLINE void VectorHelper<_M128>::store(float *mem, const VectorType x, StreamingAndAlignedFlag) { _mm_stream_ps(mem, x); } Vc_ALWAYS_INLINE void VectorHelper<_M128>::store(float *mem, const VectorType x, StreamingAndUnalignedFlag) { _mm_maskmoveu_si128(_mm_castps_si128(x), _mm_setallone_si128(), reinterpret_cast(mem)); } Vc_ALWAYS_INLINE void VectorHelper<_M128>::store(float *mem, const VectorType x, const VectorType m, AlignedFlag) { _mm_store_ps(mem, mm_blendv_ps(_mm_load_ps(mem), x, m)); } Vc_ALWAYS_INLINE void VectorHelper<_M128>::store(float *mem, const VectorType x, const VectorType m, UnalignedFlag) { _mm_storeu_ps(mem, mm_blendv_ps(_mm_loadu_ps(mem), x, m)); } Vc_ALWAYS_INLINE void VectorHelper<_M128>::store(float *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag) { _mm_maskmoveu_si128(_mm_castps_si128(x), _mm_castps_si128(m), reinterpret_cast(mem)); } Vc_ALWAYS_INLINE void VectorHelper<_M128>::store(float *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag) { _mm_maskmoveu_si128(_mm_castps_si128(x), _mm_castps_si128(m), reinterpret_cast(mem)); } //////////////////////////////////////////////////////////////////////////////////////////////////// // sfloat_v template<> Vc_ALWAYS_INLINE Vc_PURE M256 VectorHelper::load(const float *x, AlignedFlag) { return VectorType::create(_mm_load_ps(x), _mm_load_ps(x + 4)); } template<> Vc_ALWAYS_INLINE Vc_PURE M256 VectorHelper::load(const float *x, UnalignedFlag) { return VectorType::create(_mm_loadu_ps(x), _mm_loadu_ps(x + 4)); } template<> Vc_ALWAYS_INLINE Vc_PURE M256 VectorHelper::load(const float *x, StreamingAndAlignedFlag) { return VectorType::create(_mm_stream_load(&x[0]), _mm_stream_load(&x[4])); } template<> Vc_ALWAYS_INLINE Vc_PURE M256 VectorHelper::load(const float *x, StreamingAndUnalignedFlag) { return load(x, Unaligned); } //////////////////////////////////////////////////////////////////////////////////////////////////// // stores Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VectorTypeArg x, AlignedFlag) { _mm_store_ps(mem, x[0]); _mm_store_ps(mem + 4, x[1]); } Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VectorTypeArg x, UnalignedFlag) { _mm_storeu_ps(mem, x[0]); _mm_storeu_ps(mem + 4, x[1]); } Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VectorTypeArg x, StreamingAndAlignedFlag) { _mm_stream_ps(mem, x[0]); _mm_stream_ps(mem + 4, x[1]); } Vc_ALWAYS_INLINE void VectorHelper::store(float 
*mem, VectorTypeArg x, StreamingAndUnalignedFlag) { _mm_maskmoveu_si128(_mm_castps_si128(x[0]), _mm_setallone_si128(), reinterpret_cast(mem)); _mm_maskmoveu_si128(_mm_castps_si128(x[1]), _mm_setallone_si128(), reinterpret_cast(mem + 4)); } Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VectorTypeArg x, VectorTypeArg m, AlignedFlag) { _mm_store_ps(mem, mm_blendv_ps(_mm_load_ps(mem), x[0], m[0])); _mm_store_ps(mem + 4, mm_blendv_ps(_mm_load_ps(mem + 4), x[1], m[1])); } Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VectorTypeArg x, VectorTypeArg m, UnalignedFlag) { _mm_storeu_ps(mem, mm_blendv_ps(_mm_loadu_ps(mem), x[0], m[0])); _mm_storeu_ps(mem + 4, mm_blendv_ps(_mm_loadu_ps(mem + 4), x[1], m[1])); } Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VectorTypeArg x, VectorTypeArg m, StreamingAndAlignedFlag) { _mm_maskmoveu_si128(_mm_castps_si128(x[0]), _mm_castps_si128(m[0]), reinterpret_cast(mem)); _mm_maskmoveu_si128(_mm_castps_si128(x[1]), _mm_castps_si128(m[1]), reinterpret_cast(mem + 4)); } Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VectorTypeArg x, VectorTypeArg m, StreamingAndUnalignedFlag) { _mm_maskmoveu_si128(_mm_castps_si128(x[0]), _mm_castps_si128(m[0]), reinterpret_cast(mem)); _mm_maskmoveu_si128(_mm_castps_si128(x[1]), _mm_castps_si128(m[1]), reinterpret_cast(mem + 4)); } //////////////////////////////////////////////////////////////////////////////////////////////////// // double_v template<> Vc_ALWAYS_INLINE Vc_PURE _M128D VectorHelper<_M128D>::load(const double *x, AlignedFlag) { return _mm_load_pd(x); } template<> Vc_ALWAYS_INLINE Vc_PURE _M128D VectorHelper<_M128D>::load(const double *x, UnalignedFlag) { return _mm_loadu_pd(x); } template<> Vc_ALWAYS_INLINE Vc_PURE _M128D VectorHelper<_M128D>::load(const double *x, StreamingAndAlignedFlag) { return _mm_stream_load(x); } template<> Vc_ALWAYS_INLINE Vc_PURE _M128D VectorHelper<_M128D>::load(const double *x, StreamingAndUnalignedFlag) { return load(x, Unaligned); } //////////////////////////////////////////////////////////////////////////////////////////////////// // stores Vc_ALWAYS_INLINE void VectorHelper<_M128D>::store(double *mem, const VectorType x, AlignedFlag) { _mm_store_pd(mem, x); } Vc_ALWAYS_INLINE void VectorHelper<_M128D>::store(double *mem, const VectorType x, UnalignedFlag) { _mm_storeu_pd(mem, x); } Vc_ALWAYS_INLINE void VectorHelper<_M128D>::store(double *mem, const VectorType x, StreamingAndAlignedFlag) { _mm_stream_pd(mem, x); } Vc_ALWAYS_INLINE void VectorHelper<_M128D>::store(double *mem, const VectorType x, StreamingAndUnalignedFlag) { _mm_maskmoveu_si128(_mm_castpd_si128(x), _mm_setallone_si128(), reinterpret_cast(mem)); } Vc_ALWAYS_INLINE void VectorHelper<_M128D>::store(double *mem, const VectorType x, const VectorType m, AlignedFlag) { _mm_store_pd(mem, mm_blendv_pd(_mm_load_pd(mem), x, m)); } Vc_ALWAYS_INLINE void VectorHelper<_M128D>::store(double *mem, const VectorType x, const VectorType m, UnalignedFlag) { _mm_storeu_pd(mem, mm_blendv_pd(_mm_loadu_pd(mem), x, m)); } Vc_ALWAYS_INLINE void VectorHelper<_M128D>::store(double *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag) { _mm_maskmoveu_si128(_mm_castpd_si128(x), _mm_castpd_si128(m), reinterpret_cast(mem)); } Vc_ALWAYS_INLINE void VectorHelper<_M128D>::store(double *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag) { _mm_maskmoveu_si128(_mm_castpd_si128(x), _mm_castpd_si128(m), reinterpret_cast(mem)); } 
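// Note on the store() overloads above: the masked, non-streaming variants are read-modify-write
// (load, mm_blendv_*, store), while the streaming variants use _mm_maskmoveu_si128, which writes
// only the bytes whose mask MSB is set and bypasses the cache (non-temporal stores). A hedged
// usage sketch in terms of the public API (flag and member names assumed, not defined here):
//   Vc::SSE::double_v v = ...;
//   v.store(mem, Vc::Streaming);        // non-temporal store of all entries
//   v.store(mem, mask, Vc::Unaligned);  // blends only the selected entries into memory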
//////////////////////////////////////////////////////////////////////////////////////////////////// // int_v, uint_v, short_v, ushort_v template Vc_ALWAYS_INLINE Vc_PURE _M128I VectorHelper<_M128I>::load(const T *x, AlignedFlag) { return _mm_load_si128(reinterpret_cast(x)); } template Vc_ALWAYS_INLINE Vc_PURE _M128I VectorHelper<_M128I>::load(const T *x, UnalignedFlag) { return _mm_loadu_si128(reinterpret_cast(x)); } template Vc_ALWAYS_INLINE Vc_PURE _M128I VectorHelper<_M128I>::load(const T *x, StreamingAndAlignedFlag) { return _mm_stream_load(x); } template Vc_ALWAYS_INLINE Vc_PURE _M128I VectorHelper<_M128I>::load(const T *x, StreamingAndUnalignedFlag) { return load(x, Unaligned); } //////////////////////////////////////////////////////////////////////////////////////////////////// // stores template Vc_ALWAYS_INLINE void VectorHelper<_M128I>::store(T *mem, const VectorType x, AlignedFlag) { _mm_store_si128(reinterpret_cast(mem), x); } template Vc_ALWAYS_INLINE void VectorHelper<_M128I>::store(T *mem, const VectorType x, UnalignedFlag) { _mm_storeu_si128(reinterpret_cast(mem), x); } template Vc_ALWAYS_INLINE void VectorHelper<_M128I>::store(T *mem, const VectorType x, StreamingAndAlignedFlag) { _mm_stream_si128(reinterpret_cast(mem), x); } template Vc_ALWAYS_INLINE void VectorHelper<_M128I>::store(T *mem, const VectorType x, StreamingAndUnalignedFlag) { _mm_maskmoveu_si128(x, _mm_setallone_si128(), reinterpret_cast(mem)); } template Vc_ALWAYS_INLINE void VectorHelper<_M128I>::store(T *mem, const VectorType x, const VectorType m, AlignedFlag align) { store(mem, mm_blendv_epi8(load(mem, align), x, m), align); } template Vc_ALWAYS_INLINE void VectorHelper<_M128I>::store(T *mem, const VectorType x, const VectorType m, UnalignedFlag align) { store(mem, mm_blendv_epi8(load(mem, align), x, m), align); } template Vc_ALWAYS_INLINE void VectorHelper<_M128I>::store(T *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag) { _mm_maskmoveu_si128(x, m, reinterpret_cast(mem)); } template Vc_ALWAYS_INLINE void VectorHelper<_M128I>::store(T *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag) { _mm_maskmoveu_si128(x, m, reinterpret_cast(mem)); } template<> inline Vc_CONST _M128I SortHelper<_M128I, 8>::sort(_M128I x) { _M128I lo, hi, y; // sort pairs y = Mem::permute(x); lo = _mm_min_epi16(x, y); hi = _mm_max_epi16(x, y); x = mm_blend_epi16(lo, hi, 0xaa); // merge left and right quads y = Mem::permute(x); lo = _mm_min_epi16(x, y); hi = _mm_max_epi16(x, y); x = mm_blend_epi16(lo, hi, 0xcc); y = _mm_srli_si128(x, 2); lo = _mm_min_epi16(x, y); hi = _mm_max_epi16(x, y); x = mm_blend_epi16(lo, _mm_slli_si128(hi, 2), 0xaa); // merge quads into octs y = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)); y = _mm_shufflelo_epi16(y, _MM_SHUFFLE(0, 1, 2, 3)); lo = _mm_min_epi16(x, y); hi = _mm_max_epi16(x, y); x = _mm_unpacklo_epi16(lo, hi); y = _mm_srli_si128(x, 8); lo = _mm_min_epi16(x, y); hi = _mm_max_epi16(x, y); x = _mm_unpacklo_epi16(lo, hi); y = _mm_srli_si128(x, 8); lo = _mm_min_epi16(x, y); hi = _mm_max_epi16(x, y); return _mm_unpacklo_epi16(lo, hi); } template<> inline Vc_CONST _M128I SortHelper<_M128I, 4>::sort(_M128I x) { /* // in 16,67% of the cases the merge can be replaced by an append // x = [a b c d] // y = [c d a b] _M128I y = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)); _M128I l = mm_min_epi32(x, y); // min[ac bd ac bd] _M128I h = mm_max_epi32(x, y); // max[ac bd ac bd] if (IS_UNLIKELY(_mm_cvtsi128_si32(h) <= l[1])) { // l[0] < h[0] < l[1] < h[1] return 
_mm_unpacklo_epi32(l, h); } // h[0] > l[1] */ // sort pairs _M128I y = _mm_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1)); _M128I l = mm_min_epi32(x, y); _M128I h = mm_max_epi32(x, y); x = _mm_unpacklo_epi32(l, h); y = _mm_unpackhi_epi32(h, l); // sort quads l = mm_min_epi32(x, y); h = mm_max_epi32(x, y); x = _mm_unpacklo_epi32(l, h); y = _mm_unpackhi_epi64(x, x); l = mm_min_epi32(x, y); h = mm_max_epi32(x, y); return _mm_unpacklo_epi32(l, h); } template<> inline Vc_CONST _M128 SortHelper<_M128, 4>::sort(_M128 x) { _M128 y = _mm_shuffle_ps(x, x, _MM_SHUFFLE(2, 3, 0, 1)); _M128 l = _mm_min_ps(x, y); _M128 h = _mm_max_ps(x, y); x = _mm_unpacklo_ps(l, h); y = _mm_unpackhi_ps(h, l); l = _mm_min_ps(x, y); h = _mm_max_ps(x, y); x = _mm_unpacklo_ps(l, h); y = _mm_movehl_ps(x, x); l = _mm_min_ps(x, y); h = _mm_max_ps(x, y); return _mm_unpacklo_ps(l, h); //X _M128 k = _mm_cmpgt_ps(x, y); //X k = _mm_shuffle_ps(k, k, _MM_SHUFFLE(2, 2, 0, 0)); //X x = mm_blendv_ps(x, y, k); //X y = _mm_shuffle_ps(x, x, _MM_SHUFFLE(1, 0, 3, 2)); //X k = _mm_cmpgt_ps(x, y); //X k = _mm_shuffle_ps(k, k, _MM_SHUFFLE(1, 0, 1, 0)); //X x = mm_blendv_ps(x, y, k); //X y = _mm_shuffle_ps(x, x, _MM_SHUFFLE(3, 1, 2, 0)); //X k = _mm_cmpgt_ps(x, y); //X k = _mm_shuffle_ps(k, k, _MM_SHUFFLE(0, 1, 1, 0)); //X return mm_blendv_ps(x, y, k); } template<> inline Vc_PURE M256 SortHelper::sort(const M256 &_x) { M256 x = _x; typedef SortHelper<_M128, 4> H; _M128 a, b, l, h; a = H::sort(x[0]); b = H::sort(x[1]); // merge b = _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3)); l = _mm_min_ps(a, b); h = _mm_max_ps(a, b); a = _mm_unpacklo_ps(l, h); b = _mm_unpackhi_ps(l, h); l = _mm_min_ps(a, b); h = _mm_max_ps(a, b); a = _mm_unpacklo_ps(l, h); b = _mm_unpackhi_ps(l, h); l = _mm_min_ps(a, b); h = _mm_max_ps(a, b); x[0] = _mm_unpacklo_ps(l, h); x[1] = _mm_unpackhi_ps(l, h); return x; } template<> inline Vc_CONST _M128D SortHelper<_M128D, 2>::sort(_M128D x) { const _M128D y = _mm_shuffle_pd(x, x, _MM_SHUFFLE2(0, 1)); return _mm_unpacklo_pd(_mm_min_sd(x, y), _mm_max_sd(x, y)); } // can be used to multiply with a constant. For some special constants it doesn't need an extra // vector but can use a shift instead, basically encoding the factor in the instruction. 
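// Illustrative example of the decomposition used below: for constant == 5 the generic integer
// multiply is replaced by H::add(x.data(), H::slli(x.data(), 2)), i.e. x + (x << 2); for a pure
// power of two such as 8 a single shift H::slli(x.data(), 3) suffices. Only constants matching
// none of the handled patterns fall back to a real vector multiply via H::mul.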
template Vc_ALWAYS_INLINE Vc_CONST IndexType mulConst(const IndexType x) { typedef VectorHelper H; switch (constant) { case 0: return H::zero(); case 1: return x; case 2: return H::slli(x.data(), 1); case 4: return H::slli(x.data(), 2); case 8: return H::slli(x.data(), 3); case 16: return H::slli(x.data(), 4); case 32: return H::slli(x.data(), 5); case 64: return H::slli(x.data(), 6); case 128: return H::slli(x.data(), 7); case 256: return H::slli(x.data(), 8); case 512: return H::slli(x.data(), 9); case 1024: return H::slli(x.data(), 10); case 2048: return H::slli(x.data(), 11); } #ifndef VC_IMPL_SSE4_1 // without SSE 4.1 int multiplication is not so nice if (sizeof(typename IndexType::EntryType) == 4) { switch (constant) { case 3: return H::add( x.data() , H::slli(x.data(), 1)); case 5: return H::add( x.data() , H::slli(x.data(), 2)); case 9: return H::add( x.data() , H::slli(x.data(), 3)); case 17: return H::add( x.data() , H::slli(x.data(), 4)); case 33: return H::add( x.data() , H::slli(x.data(), 5)); case 65: return H::add( x.data() , H::slli(x.data(), 6)); case 129: return H::add( x.data() , H::slli(x.data(), 7)); case 257: return H::add( x.data() , H::slli(x.data(), 8)); case 513: return H::add( x.data() , H::slli(x.data(), 9)); case 1025: return H::add( x.data() , H::slli(x.data(), 10)); case 2049: return H::add( x.data() , H::slli(x.data(), 11)); case 6: return H::add(H::slli(x.data(), 1), H::slli(x.data(), 2)); case 10: return H::add(H::slli(x.data(), 1), H::slli(x.data(), 3)); case 18: return H::add(H::slli(x.data(), 1), H::slli(x.data(), 4)); case 34: return H::add(H::slli(x.data(), 1), H::slli(x.data(), 5)); case 66: return H::add(H::slli(x.data(), 1), H::slli(x.data(), 6)); case 130: return H::add(H::slli(x.data(), 1), H::slli(x.data(), 7)); case 258: return H::add(H::slli(x.data(), 1), H::slli(x.data(), 8)); case 514: return H::add(H::slli(x.data(), 1), H::slli(x.data(), 9)); case 1026: return H::add(H::slli(x.data(), 1), H::slli(x.data(), 10)); case 2050: return H::add(H::slli(x.data(), 1), H::slli(x.data(), 11)); case 12: return H::add(H::slli(x.data(), 2), H::slli(x.data(), 3)); case 20: return H::add(H::slli(x.data(), 2), H::slli(x.data(), 4)); case 36: return H::add(H::slli(x.data(), 2), H::slli(x.data(), 5)); case 68: return H::add(H::slli(x.data(), 2), H::slli(x.data(), 6)); case 132: return H::add(H::slli(x.data(), 2), H::slli(x.data(), 7)); case 260: return H::add(H::slli(x.data(), 2), H::slli(x.data(), 8)); case 516: return H::add(H::slli(x.data(), 2), H::slli(x.data(), 9)); case 1028: return H::add(H::slli(x.data(), 2), H::slli(x.data(), 10)); case 2052: return H::add(H::slli(x.data(), 2), H::slli(x.data(), 11)); case 24: return H::add(H::slli(x.data(), 3), H::slli(x.data(), 4)); case 40: return H::add(H::slli(x.data(), 3), H::slli(x.data(), 5)); case 72: return H::add(H::slli(x.data(), 3), H::slli(x.data(), 6)); case 136: return H::add(H::slli(x.data(), 3), H::slli(x.data(), 7)); case 264: return H::add(H::slli(x.data(), 3), H::slli(x.data(), 8)); case 520: return H::add(H::slli(x.data(), 3), H::slli(x.data(), 9)); case 1032: return H::add(H::slli(x.data(), 3), H::slli(x.data(), 10)); case 2056: return H::add(H::slli(x.data(), 3), H::slli(x.data(), 11)); case 48: return H::add(H::slli(x.data(), 4), H::slli(x.data(), 5)); case 80: return H::add(H::slli(x.data(), 4), H::slli(x.data(), 6)); case 144: return H::add(H::slli(x.data(), 4), H::slli(x.data(), 7)); case 272: return H::add(H::slli(x.data(), 4), H::slli(x.data(), 8)); case 528: return 
H::add(H::slli(x.data(), 4), H::slli(x.data(), 9)); case 1040: return H::add(H::slli(x.data(), 4), H::slli(x.data(), 10)); case 2064: return H::add(H::slli(x.data(), 4), H::slli(x.data(), 11)); case 96: return H::add(H::slli(x.data(), 5), H::slli(x.data(), 6)); case 160: return H::add(H::slli(x.data(), 5), H::slli(x.data(), 7)); case 288: return H::add(H::slli(x.data(), 5), H::slli(x.data(), 8)); case 544: return H::add(H::slli(x.data(), 5), H::slli(x.data(), 9)); case 1056: return H::add(H::slli(x.data(), 5), H::slli(x.data(), 10)); case 2080: return H::add(H::slli(x.data(), 5), H::slli(x.data(), 11)); case 192: return H::add(H::slli(x.data(), 6), H::slli(x.data(), 7)); case 320: return H::add(H::slli(x.data(), 6), H::slli(x.data(), 8)); case 576: return H::add(H::slli(x.data(), 6), H::slli(x.data(), 9)); case 1088: return H::add(H::slli(x.data(), 6), H::slli(x.data(), 10)); case 2112: return H::add(H::slli(x.data(), 6), H::slli(x.data(), 11)); case 384: return H::add(H::slli(x.data(), 7), H::slli(x.data(), 8)); case 640: return H::add(H::slli(x.data(), 7), H::slli(x.data(), 9)); case 1152: return H::add(H::slli(x.data(), 7), H::slli(x.data(), 10)); case 2176: return H::add(H::slli(x.data(), 7), H::slli(x.data(), 11)); case 768: return H::add(H::slli(x.data(), 8), H::slli(x.data(), 9)); case 1280: return H::add(H::slli(x.data(), 8), H::slli(x.data(), 10)); case 2304: return H::add(H::slli(x.data(), 8), H::slli(x.data(), 11)); case 1536: return H::add(H::slli(x.data(), 9), H::slli(x.data(), 10)); case 2560: return H::add(H::slli(x.data(), 9), H::slli(x.data(), 11)); case 3072: return H::add(H::slli(x.data(),10), H::slli(x.data(), 11)); } } #endif return H::mul(x.data(), H::set(constant)); } } // namespace SSE } // namespace Vc /*OUTER_NAMESPACE_END*/ Vc-0.7.4/test.cmake000066400000000000000000000237271233512346000140650ustar00rootroot00000000000000if(NOT CTEST_SOURCE_DIRECTORY) get_filename_component(CTEST_SOURCE_DIRECTORY "${CMAKE_CURRENT_LIST_FILE}" PATH) endif() set(dashboard_model "$ENV{dashboard_model}") if(NOT dashboard_model) set(dashboard_model "Experimental") endif() set(target_architecture "$ENV{target_architecture}") set(skip_tests "$ENV{skip_tests}") set(build_type "$ENV{build_type}") if(NOT build_type) set(build_type "Release") endif() # better make sure we get english output (this is vital for the implicit_type_conversion_failures tests) set(ENV{LANG} "en_US") find_program(UNAME uname) if(UNAME) execute_process(COMMAND ${UNAME} -s OUTPUT_VARIABLE arch OUTPUT_STRIP_TRAILING_WHITESPACE) string(TOLOWER "${arch}" arch) execute_process(COMMAND ${UNAME} -m OUTPUT_VARIABLE chip OUTPUT_STRIP_TRAILING_WHITESPACE) string(TOLOWER "${chip}" chip) else() find_program(CMD cmd) if(CMD) execute_process(COMMAND cmd /D /Q /C ver OUTPUT_VARIABLE arch OUTPUT_STRIP_TRAILING_WHITESPACE) string(REGEX REPLACE "^.*Windows[^0-9]*([.0-9]+).*$" "Windows \\1" arch "${arch}") else() string(TOLOWER "$ENV{TARGET_PLATFORM}" arch) if(arch) if("$ENV{WindowsSDKVersionOverride}") set(arch "${arch} SDK $ENV{WindowsSDKVersionOverride}") endif() else() string(TOLOWER "${CMAKE_SYSTEM_NAME}" arch) endif() endif() execute_process(COMMAND reg query "HKLM\\HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0" /v Identifier OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE processorId) if("${processorId}" MATCHES "AMD64") set(chip "x86_64") elseif("${processorId}" MATCHES "x86") set(chip "x86") else() set(chip "unknown") endif() endif() if("${arch}" MATCHES "[Ww]indows" OR "${arch}" MATCHES "win7") find_program(CL cl) 
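# On Windows/MSVC the compiler version string is obtained by preprocessing cmake/msvc_version.c
# with "cl /nologo -EP" (next command below); the path of the cl executable is then inspected for
# "amd64"/"ia64" to label the build as x86 64bit, Itanium or x86 32bit.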
execute_process(COMMAND ${CL} /nologo -EP "${CTEST_SOURCE_DIRECTORY}/cmake/msvc_version.c" OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE COMPILER_VERSION) string(STRIP "${COMPILER_VERSION}" COMPILER_VERSION) if("${CL}" MATCHES "amd64") set(COMPILER_VERSION "${COMPILER_VERSION} x86 64bit") elseif("${CL}" MATCHES "ia64") set(COMPILER_VERSION "${COMPILER_VERSION} Itanium") else() set(COMPILER_VERSION "${COMPILER_VERSION} x86 32bit") endif() set(number_of_processors "$ENV{NUMBER_OF_PROCESSORS}") if(NOT number_of_processors) execute_process(COMMAND reg query "HKLM\\HARDWARE\\DESCRIPTION\\System\\CentralProcessor" OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE number_of_processors) string(REGEX REPLACE "[^0-9]+" "," number_of_processors "${number_of_processors}") string(REGEX REPLACE "^.*," "" number_of_processors "${number_of_processors}") math(EXPR number_of_processors "1 + ${number_of_processors}") endif() elseif(arch MATCHES "mingw") find_program(CL cl) find_program(GXX "g++") if("$ENV{CXX}" MATCHES "g\\+\\+") set(GXX "$ENV{CXX}") endif() if(GXX) execute_process(COMMAND "${GXX}" --version OUTPUT_VARIABLE COMPILER_VERSION ERROR_VARIABLE COMPILER_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE) string(REPLACE "\n" ";" COMPILER_VERSION "${COMPILER_VERSION}") list(GET COMPILER_VERSION 0 COMPILER_VERSION) elseif(CL) execute_process(COMMAND ${CL} /nologo -EP "${CTEST_SOURCE_DIRECTORY}/cmake/msvc_version.c" OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE COMPILER_VERSION) string(STRIP "${COMPILER_VERSION}" COMPILER_VERSION) else() message(FATAL_ERROR "unknown compiler") endif() execute_process(COMMAND reg query "HKLM\\HARDWARE\\DESCRIPTION\\System\\CentralProcessor" COMMAND grep -c CentralProcessor OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE number_of_processors) else() set(_cxx "$ENV{CXX}") if(NOT _cxx) set(_cxx "c++") endif() execute_process(COMMAND ${_cxx} --version OUTPUT_VARIABLE COMPILER_VERSION ERROR_VARIABLE COMPILER_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE) string(REPLACE "\n" ";" COMPILER_VERSION "${COMPILER_VERSION}") list(GET COMPILER_VERSION 0 COMPILER_VERSION) string(REPLACE "Open64 Compiler Suite: Version" "Open64" COMPILER_VERSION "${COMPILER_VERSION}") if(arch STREQUAL "darwin") execute_process(COMMAND sysctl -n hw.ncpu OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE number_of_processors) else() execute_process(COMMAND grep -c processor /proc/cpuinfo OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE number_of_processors) endif() endif() file(READ "${CTEST_SOURCE_DIRECTORY}/.git/HEAD" git_branch) string(STRIP "${git_branch}" git_branch) # -> ref: refs/heads/master string(REGEX REPLACE "^.*/" "" git_branch "${git_branch}") # -> master if(arch STREQUAL "linux") execute_process(COMMAND lsb_release -d COMMAND cut -f2 OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE lsbRelease) set(CTEST_BUILD_NAME "${lsbRelease}") else() set(CTEST_BUILD_NAME "${arch}") endif() string(STRIP "${CTEST_BUILD_NAME} ${chip} ${COMPILER_VERSION} $ENV{CXXFLAGS}" CTEST_BUILD_NAME) set(CTEST_BUILD_NAME "${CTEST_BUILD_NAME} ${build_type}") if(target_architecture) set(CTEST_BUILD_NAME "${CTEST_BUILD_NAME} ${target_architecture}") else() execute_process(COMMAND cmake -Darch=${arch} -P ${CTEST_SOURCE_DIRECTORY}/print_target_architecture.cmake OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE auto_target_arch ERROR_VARIABLE auto_target_arch) set(CTEST_BUILD_NAME "${CTEST_BUILD_NAME} ${auto_target_arch}") endif() string(REPLACE "/" "_" CTEST_BUILD_NAME "${git_branch}: ${CTEST_BUILD_NAME}") string(REPLACE "+" "x" 
CTEST_BUILD_NAME "${CTEST_BUILD_NAME}") # CDash fails to escape '+' correctly in URIs string(REGEX REPLACE "[][ ():]" "_" CTEST_BINARY_DIRECTORY "${CTEST_BUILD_NAME}") set(CTEST_BINARY_DIRECTORY "${CTEST_SOURCE_DIRECTORY}/build-${dashboard_model}-${CTEST_BINARY_DIRECTORY}") file(MAKE_DIRECTORY "${CTEST_BINARY_DIRECTORY}") execute_process(COMMAND hostname -s RESULT_VARIABLE ok OUTPUT_VARIABLE CTEST_SITE ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) if(NOT ok EQUAL 0) execute_process(COMMAND hostname OUTPUT_VARIABLE CTEST_SITE ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) endif() Set(CTEST_START_WITH_EMPTY_BINARY_DIRECTORY_ONCE TRUE) set(CTEST_NOTES_FILES "${CTEST_SOURCE_DIRECTORY}/.git/HEAD" "${CTEST_SOURCE_DIRECTORY}/.git/refs/heads/${git_branch}") set(compiler) if(COMPILER_VERSION MATCHES "clang") set(compiler "clang") elseif(COMPILER_VERSION MATCHES "g\\+\\+") if(WIN32) set(compiler "MinGW") else() set(compiler "GCC") endif() elseif(COMPILER_VERSION MATCHES "MSVC") set(compiler "MSVC") elseif(COMPILER_VERSION MATCHES "ICC") set(compiler "ICC") elseif(COMPILER_VERSION MATCHES "Open64") set(compiler "Open64") endif() if(COMPILER_VERSION MATCHES "\\((experimental|prerelease)\\)" OR COMPILER_VERSION MATCHES "clang version 3.5") set(compiler "experimental") endif() include(${CTEST_SOURCE_DIRECTORY}/CTestCustom.cmake) include(${CTEST_SOURCE_DIRECTORY}/CTestConfig.cmake) set(CTEST_USE_LAUNCHERS 1) # much improved error/warning message logging if(WIN32) set(MAKE_ARGS "-k") else() set(MAKE_ARGS "-j${number_of_processors} -k") endif() message("********************************") #message("src: ${CTEST_SOURCE_DIRECTORY}") #message("obj: ${CTEST_BINARY_DIRECTORY}") message("build name: ${CTEST_BUILD_NAME}") message("site: ${CTEST_SITE}") message("model: ${dashboard_model}") message("********************************") if(WIN32) if("${compiler}" STREQUAL "MSVC") find_program(JOM jom) if(JOM) set(CTEST_CMAKE_GENERATOR "NMake Makefiles JOM") set(CMAKE_MAKE_PROGRAM "jom") else() set(CTEST_CMAKE_GENERATOR "NMake Makefiles") set(CMAKE_MAKE_PROGRAM "nmake") set(MAKE_ARGS "-I") endif() elseif("${compiler}" STREQUAL "MinGW") set(CTEST_CMAKE_GENERATOR "MSYS Makefiles") set(CMAKE_MAKE_PROGRAM "make") else() message(FATAL_ERROR "unknown cmake generator required (compiler: ${compiler})") endif() else() set(CTEST_CMAKE_GENERATOR "Unix Makefiles") set(CMAKE_MAKE_PROGRAM "make") endif() set(configure_options "-DCTEST_USE_LAUNCHERS=${CTEST_USE_LAUNCHERS};-DCMAKE_BUILD_TYPE=${build_type};-DBUILD_EXAMPLES=TRUE;-DTEST_OPERATOR_FAILURES=TRUE") if(target_architecture) set(configure_options "${configure_options};-DTARGET_ARCHITECTURE=${target_architecture}") endif() macro(go) set_property(GLOBAL PROPERTY SubProject ${compiler}) set_property(GLOBAL PROPERTY Label other) CTEST_START (${dashboard_model}) set(res 0) if(NOT ${dashboard_model} STREQUAL "Experimental") CTEST_UPDATE (SOURCE "${CTEST_SOURCE_DIRECTORY}" RETURN_VALUE res) if(res GREATER 0) ctest_submit(PARTS Update) endif() endif() if(NOT ${dashboard_model} STREQUAL "Continuous" OR res GREATER 0) CTEST_CONFIGURE (BUILD "${CTEST_BINARY_DIRECTORY}" OPTIONS "${configure_options}" APPEND RETURN_VALUE res) ctest_submit(PARTS Notes Configure) if(res EQUAL 0) foreach(label other Scalar SSE AVX) set_property(GLOBAL PROPERTY Label ${label}) set(CTEST_BUILD_TARGET "${label}") set(CTEST_BUILD_COMMAND "${CMAKE_MAKE_PROGRAM} ${MAKE_ARGS} ${CTEST_BUILD_TARGET}") ctest_build( BUILD "${CTEST_BINARY_DIRECTORY}" APPEND RETURN_VALUE res) ctest_submit(PARTS Build) if(res EQUAL 0 AND NOT 
skip_tests) ctest_test( BUILD "${CTEST_BINARY_DIRECTORY}" APPEND RETURN_VALUE res PARALLEL_LEVEL ${number_of_processors} INCLUDE_LABEL "${label}") ctest_submit(PARTS Test) endif() endforeach() endif() endif() endmacro() if(${dashboard_model} STREQUAL "Continuous") while(${CTEST_ELAPSED_TIME} LESS 64800) set(START_TIME ${CTEST_ELAPSED_TIME}) go() ctest_sleep(${START_TIME} 1200 ${CTEST_ELAPSED_TIME}) endwhile() else() CTEST_EMPTY_BINARY_DIRECTORY(${CTEST_BINARY_DIRECTORY}) go() endif() Vc-0.7.4/tests/000077500000000000000000000000001233512346000132335ustar00rootroot00000000000000Vc-0.7.4/tests/CMakeLists.txt000066400000000000000000000331441233512346000160000ustar00rootroot00000000000000include(AddFileDependencies) add_definitions(-DCOMPILE_FOR_UNIT_TESTS) # -DVC_CHECK_ALIGNMENT) if(Vc_COMPILER_IS_MSVC) AddCompilerFlag("/wd4267") # Disable warning "conversion from 'size_t' to 'int', possible loss of data" AddCompilerFlag("/wd4723") # Disable warning "potential divide by 0" (suppress doesn't work) endif() if(DEFINED Vc_INSIDE_ROOT) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "") # Reset the ROOT default executable destination set(Vc_TEST_TARGET_PREFIX "vc-") else() set(Vc_TEST_TARGET_PREFIX "") endif() set(CXX11_FLAG) if(NOT DEFINED Vc_INSIDE_ROOT) set(_cxx11_flags "-std=c++11" "-std=c++0x") if(Vc_COMPILER_IS_GCC AND WIN32) # MinGW fails to compile POSIX code unless gnu++11 is used set(_cxx11_flags "-std=gnu++11" "-std=gnu++0x") endif() foreach(_flag ${_cxx11_flags}) string(REGEX REPLACE "[-+/:= ]" "_" _flag_esc "${_flag}") check_cxx_compiler_flag("${_flag}" check_cxx_compiler_flag_${_flag_esc}) if(check_cxx_compiler_flag_${_flag_esc}) set(CXX11_FLAG ${_flag}) break() endif() endforeach() endif() macro(vc_add_run_target _target) add_custom_target(run_${_target} ${_target} DEPENDS ${_target} COMMENT "Execute ${_target} test" VERBATIM ) endmacro() macro(vc_add_test _name) foreach(_std cxx98 cxx11) set(_extra_flags) set(name ${_name}) foreach(_arg ${ARGN}) set(_extra_flags "${_extra_flags} -D${_arg}") set(name "${name}_${_arg}") endforeach() if("${_std}" STREQUAL "cxx11") if(NOT CXX11_FLAG) break() endif() set(SAVE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") if(CMAKE_CXX_FLAGS MATCHES " -ansi ") string(REPLACE " -ansi " " " CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") endif() set(_extra_flags "${_extra_flags} ${CXX11_FLAG}") set(name "c++11_${name}") endif() set(_target "${name}_scalar") list(FIND disabled_targets ${_target} _disabled) if(_disabled EQUAL -1) file(GLOB _extra_deps "${CMAKE_SOURCE_DIR}/scalar/*.tcc" "${CMAKE_SOURCE_DIR}/scalar/*.h" "${CMAKE_SOURCE_DIR}/common/*.h") add_file_dependencies(${_name}.cpp "${_extra_deps}") add_executable(${_target} EXCLUDE_FROM_ALL ${_name}.cpp) target_link_libraries(${_target} Vc) add_target_property(${_target} COMPILE_FLAGS "-DVC_IMPL=Scalar ${_extra_flags}") add_target_property(${_target} LABELS "Scalar") add_dependencies(build_tests ${_target}) add_dependencies(Scalar ${_target}) add_test(${Vc_TEST_TARGET_PREFIX}${_target} "${CMAKE_CURRENT_BINARY_DIR}/${_target}") set_property(TEST ${Vc_TEST_TARGET_PREFIX}${_target} PROPERTY LABELS "Scalar") vc_add_run_target(${_target}) endif() if(USE_SSE2 AND NOT Vc_SSE_INTRINSICS_BROKEN) set(DVC_IMPL "-DVC_IMPL=SSE") if(USE_XOP) set(DVC_IMPL "${DVC_IMPL}+XOP") endif() if(USE_FMA) set(DVC_IMPL "${DVC_IMPL}+FMA") elseif(USE_FMA4) set(DVC_IMPL "${DVC_IMPL}+FMA4") endif() set(_target "${name}_sse") list(FIND disabled_targets ${_target} _disabled) if(_disabled EQUAL -1) file(GLOB _extra_deps "${CMAKE_SOURCE_DIR}/sse/*.tcc" 
"${CMAKE_SOURCE_DIR}/sse/*.h" "${CMAKE_SOURCE_DIR}/common/*.h") add_file_dependencies(${_name}.cpp "${_extra_deps}") add_executable(${_target} EXCLUDE_FROM_ALL ${_name}.cpp) target_link_libraries(${_target} Vc) add_target_property(${_target} COMPILE_FLAGS "${DVC_IMPL} ${_extra_flags}") add_target_property(${_target} LABELS "SSE") add_dependencies(build_tests ${_target}) add_dependencies(SSE ${_target}) add_test(${Vc_TEST_TARGET_PREFIX}${_target} "${CMAKE_CURRENT_BINARY_DIR}/${_target}") set_property(TEST ${Vc_TEST_TARGET_PREFIX}${_target} PROPERTY LABELS "SSE") vc_add_run_target(${_target}) endif() endif() if(USE_AVX) set(DVC_IMPL "-DVC_IMPL=AVX") if(USE_XOP) set(DVC_IMPL "${DVC_IMPL}+XOP") endif() if(USE_FMA) set(DVC_IMPL "${DVC_IMPL}+FMA") elseif(USE_FMA4) set(DVC_IMPL "${DVC_IMPL}+FMA4") endif() set(_target "${name}_avx") list(FIND disabled_targets ${_target} _disabled) if(_disabled EQUAL -1) file(GLOB _extra_deps "${CMAKE_SOURCE_DIR}/avx/*.tcc" "${CMAKE_SOURCE_DIR}/avx/*.h" "${CMAKE_SOURCE_DIR}/common/*.h") add_file_dependencies(${_name}.cpp "${_extra_deps}") add_executable(${_target} EXCLUDE_FROM_ALL ${_name}.cpp) target_link_libraries(${_target} Vc) add_target_property(${_target} COMPILE_FLAGS "${DVC_IMPL} ${_extra_flags}") add_target_property(${_target} LABELS "AVX") add_dependencies(build_tests ${_target}) add_dependencies(AVX ${_target}) add_test(${Vc_TEST_TARGET_PREFIX}${_target} "${CMAKE_CURRENT_BINARY_DIR}/${_target}") set_property(TEST ${Vc_TEST_TARGET_PREFIX}${_target} PROPERTY LABELS "AVX") vc_add_run_target(${_target}) endif() endif(USE_AVX) if("${_std}" STREQUAL "cxx11") set(CMAKE_CXX_FLAGS "${SAVE_CXX_FLAGS}") endif() endforeach() endmacro(vc_add_test) vc_add_test(stlcontainer) vc_add_test(scalaraccess) vc_add_test(memory) vc_add_test(arithmetics) vc_add_test(implicit_type_conversion) vc_add_test(expandandmerge) vc_add_test(load) vc_add_test(store) vc_add_test(gather) vc_add_test(gather VC_USE_BSF_GATHERS) vc_add_test(gather VC_USE_POPCNT_BSF_GATHERS) vc_add_test(gather VC_USE_SET_GATHERS) vc_add_test(scatter) vc_add_test(scatter VC_USE_BSF_SCATTERS) vc_add_test(scatter VC_USE_POPCNT_BSF_SCATTERS) vc_add_test(math) vc_add_test(math VC_LOG_ILP) vc_add_test(math VC_LOG_ILP2) vc_add_test(mask) vc_add_test(utils) vc_add_test(deinterleave) vc_add_test(deinterleave VC_USE_MASKMOV_SCATTER) vc_add_test(casts) vc_add_test(swizzles) if(USE_SSE2 AND NOT Vc_SSE_INTRINSICS_BROKEN) list(FIND disabled_targets sse_blend _disabled) if(_disabled EQUAL -1) add_executable(sse2_blend EXCLUDE_FROM_ALL sse_blend.cpp) add_target_property(sse2_blend COMPILE_FLAGS "-DVC_IMPL=SSE2") add_target_property(sse2_blend LABELS "SSE") add_dependencies(build_tests ${_target}) add_dependencies(SSE sse2_blend) add_test(${Vc_TEST_TARGET_PREFIX}sse2_blend "${CMAKE_CURRENT_BINARY_DIR}/sse2_blend") set_property(TEST ${Vc_TEST_TARGET_PREFIX}sse2_blend PROPERTY LABELS "SSE") target_link_libraries(sse2_blend Vc) if(USE_SSE4_1) add_executable(sse4_blend EXCLUDE_FROM_ALL sse_blend.cpp) add_target_property(sse4_blend COMPILE_FLAGS "-DVC_IMPL=SSE4_1") add_target_property(sse4_blend LABELS "SSE") add_dependencies(build_tests ${_target}) add_dependencies(SSE sse4_blend) add_test(${Vc_TEST_TARGET_PREFIX}sse4_blend "${CMAKE_CURRENT_BINARY_DIR}/sse4_blend") set_property(TEST ${Vc_TEST_TARGET_PREFIX}sse4_blend PROPERTY LABELS "SSE") target_link_libraries(sse4_blend Vc) endif() endif() endif() add_executable(supportfunctions EXCLUDE_FROM_ALL supportfunctions.cpp) target_link_libraries(supportfunctions Vc) 
add_target_property(supportfunctions LABELS "other") add_dependencies(build_tests ${_target}) add_dependencies(other supportfunctions) add_test(${Vc_TEST_TARGET_PREFIX}supportfunctions "${CMAKE_CURRENT_BINARY_DIR}/supportfunctions") set_property(TEST ${Vc_TEST_TARGET_PREFIX}supportfunctions PROPERTY LABELS "other") vc_add_run_target(supportfunctions) get_property(_incdirs DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES) set(incdirs) foreach(_d ${_incdirs}) list(APPEND incdirs "-I${_d}") endforeach() separate_arguments(_flags UNIX_COMMAND ${CMAKE_CXX_FLAGS}) foreach(_f ${_flags}) if(_f MATCHES "^-m" OR _f MATCHES "^/arch:" OR _f MATCHES "^-x") list(REMOVE_ITEM _flags "${_f}") endif() endforeach() set(TEST_OPERATOR_FAILURES FALSE CACHE BOOL "Run implicit type conversion operator tests.") if(TEST_OPERATOR_FAILURES) macro(vc_test_implicit_type_conversion_failures A B) foreach(impl Scalar SSE AVX) if("${impl}" STREQUAL "Scalar") set(_implFlags) elseif("${impl}" STREQUAL "SSE") if(Vc_COMPILER_IS_MSVC) AddCompilerFlag("/arch:SSE2" CXX_FLAGS _implFlags) string(STRIP "${_implFlags}" _implFlags) elseif(Vc_COMPILER_IS_INTEL) set(_implFlags "-xSSE2") else() set(_implFlags "-msse2") endif() elseif("${impl}" STREQUAL "AVX") if(Vc_AVX_INTRINSICS_BROKEN) break() endif() if(Vc_COMPILER_IS_MSVC) set(_implFlags "/arch:AVX") elseif(Vc_COMPILER_IS_INTEL) set(_implFlags "-xAVX") else() set(_implFlags "-mavx") endif() endif() set(type_b ${B}) foreach(type_a ${A} ${B}) foreach(op "^" "==" "*") # "/" "+" "-" "&" "|" "!=" "<=" ">=" "<" ">") set(name "implicit_type_conversion_failures_${type_a}_${op}_${type_b}_${impl}") add_test(NAME "${Vc_TEST_TARGET_PREFIX}${name}" WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} COMMAND ${CMAKE_CXX_COMPILER} ${_flags} ${_implFlags} ${incdirs} -o "${name}.tmp" "-DTYPE_A=${type_a}" "-DTEST_OP=${op}" "-DTYPE_B=${type_b}" "-DVC_IMPL=${impl}" ${CMAKE_CURRENT_SOURCE_DIR}/implicit_type_conversion_failures.cpp ) set_property(TEST "${name}" PROPERTY LABELS "${impl}") set_tests_properties("${name}" PROPERTIES PASS_REGULAR_EXPRESSION "invalid operands to binary expression;error: no match for .*operator\\${op};error C267[789]: binary .*\\${op}.* no (global )?operator found;error: no operator \"\\${op}\" matches these operands" FAIL_REGULAR_EXPRESSION "no such file or directory;undefined reference to" ) endforeach() set(type_b ${A}) endforeach() endforeach() endmacro() vc_test_implicit_type_conversion_failures("double_v" "float_v") vc_test_implicit_type_conversion_failures("double_v" "short_v") vc_test_implicit_type_conversion_failures("double_v" "ushort_v") vc_test_implicit_type_conversion_failures("double_v" "int_v") vc_test_implicit_type_conversion_failures("double_v" "uint_v") vc_test_implicit_type_conversion_failures( "float_v" "double") vc_test_implicit_type_conversion_failures( "float_v" "short_v") vc_test_implicit_type_conversion_failures( "float_v" "ushort_v") vc_test_implicit_type_conversion_failures("sfloat_v" "double_v") vc_test_implicit_type_conversion_failures("sfloat_v" "double") vc_test_implicit_type_conversion_failures("sfloat_v" "float_v") vc_test_implicit_type_conversion_failures("sfloat_v" "int_v") vc_test_implicit_type_conversion_failures("sfloat_v" "uint_v") vc_test_implicit_type_conversion_failures( "short_v" "int_v") vc_test_implicit_type_conversion_failures( "short_v" "uint_v") vc_test_implicit_type_conversion_failures("ushort_v" "int_v") vc_test_implicit_type_conversion_failures("ushort_v" "uint_v") vc_test_implicit_type_conversion_failures("double_v" 
"bool") vc_test_implicit_type_conversion_failures("sfloat_v" "bool") vc_test_implicit_type_conversion_failures( "float_v" "bool") vc_test_implicit_type_conversion_failures( "int_v" "bool") vc_test_implicit_type_conversion_failures( "uint_v" "bool") vc_test_implicit_type_conversion_failures( "short_v" "bool") vc_test_implicit_type_conversion_failures("ushort_v" "bool") endif() # compile and link test for targets that need to link lots of stuff together add_library(linkTestLibDynamic1 SHARED EXCLUDE_FROM_ALL linkTestLib0.cpp linkTestLib1.cpp) add_library(linkTestLibDynamic2 SHARED EXCLUDE_FROM_ALL linkTestLib0.cpp linkTestLib1.cpp) add_library(linkTestLibStatic STATIC EXCLUDE_FROM_ALL linkTestLib2.cpp linkTestLib3.cpp) add_executable(linkTest EXCLUDE_FROM_ALL linkTest0.cpp linkTest1.cpp) add_dependencies(build_tests linkTest) add_dependencies(other linkTest) target_link_libraries(linkTestLibDynamic1 Vc) target_link_libraries(linkTestLibDynamic2 Vc) add_target_property(linkTestLibDynamic1 COMPILE_FLAGS "-DPOSTFIX=A") add_target_property(linkTestLibDynamic2 COMPILE_FLAGS "-DPOSTFIX=B") target_link_libraries(linkTestLibStatic Vc) target_link_libraries(linkTest Vc linkTestLibDynamic1 linkTestLibDynamic2 linkTestLibStatic) # Use the following program to generate the sincos-reference-*.dat files #add_executable(convert-sincos-reference EXCLUDE_FROM_ALL convert-sincos-reference.cpp) set(_deps) foreach(fun sincos asin acos atan ln log2 log10) foreach(filename reference-${fun}-sp.dat reference-${fun}-dp.dat) add_custom_command(OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${filename}" COMMAND ${CMAKE_COMMAND} -Dfilename=${filename} -P ${CMAKE_CURRENT_SOURCE_DIR}/download.cmake DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/download.cmake WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} COMMENT "Downloading Test Data: ${filename}" VERBATIM ) list(APPEND _deps "${CMAKE_CURRENT_BINARY_DIR}/${filename}") endforeach() endforeach() add_custom_target(download-testdata ALL DEPENDS ${_deps} ) add_dependencies(other download-testdata) Vc-0.7.4/tests/arithmetics.cpp000066400000000000000000000400461233512346000162570ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
*/ #include "unittest.h" #include #include #include #include using namespace Vc; template void testZero() { Vec a(Zero), b(Zero); COMPARE(a, b); Vec c, d(1); c.setZero(); COMPARE(a, c); d.setZero(); COMPARE(a, d); d = static_cast(0); COMPARE(a, d); const typename Vec::EntryType zero = 0; COMPARE(a, Vec(zero)); COMPARE(b, Vec(zero)); COMPARE(c, Vec(zero)); COMPARE(d, Vec(zero)); } template void testCmp() { typedef typename Vec::EntryType T; Vec a(Zero), b(Zero); COMPARE(a, b); if (!(a != b).isEmpty()) { std::cerr << a << " != " << b << ", (a != b) = " << (a != b) << ", (a == b) = " << (a == b) << std::endl; } VERIFY((a != b).isEmpty()); Vec c(1); VERIFY((a < c).isFull()); VERIFY((c > a).isFull()); VERIFY((a <= b).isFull()); VERIFY((a <= c).isFull()); VERIFY((b >= a).isFull()); VERIFY((c >= a).isFull()); { const T max = static_cast(std::numeric_limits::max() * 0.95); const T min = 0; const T step = max / 200; T j = min; VERIFY(Vec(Zero) == Vec(j)); VERIFY(!(Vec(Zero) < Vec(j))); VERIFY(!(Vec(Zero) > Vec(j))); VERIFY(!(Vec(Zero) != Vec(j))); j += step; for (int i = 0; i < 200; ++i, j += step) { if(Vec(Zero) >= Vec(j)) { std::cout << j << " " << Vec(j) << " " << (Vec(Zero) >= Vec(j)) << std::endl; } VERIFY(Vec(Zero) < Vec(j)); VERIFY(Vec(j) > Vec(Zero)); VERIFY(!(Vec(Zero) >= Vec(j))); VERIFY(!(Vec(j) <= Vec(Zero))); VERIFY(!static_cast(Vec(Zero) >= Vec(j))); VERIFY(!static_cast(Vec(j) <= Vec(Zero))); } } if (std::numeric_limits::min() <= 0) { const T min = static_cast(std::numeric_limits::min() * 0.95); if (min == 0) { return; } const T step = min / T(-201); T j = min; for (int i = 0; i < 200; ++i, j += step) { VERIFY(Vec(j) < Vec(Zero)); VERIFY(Vec(Zero) > Vec(j)); VERIFY(!(Vec(Zero) <= Vec(j))); VERIFY(!(Vec(j) >= Vec(Zero))); } } } template void testIsMix() { Vec a(IndexesFromZero); Vec b(Zero); Vec c(One); if (Vec::Size > 1) { VERIFY((a == b).isMix()); VERIFY((a != b).isMix()); VERIFY((a == c).isMix()); VERIFY((a != c).isMix()); VERIFY(!(a == a).isMix()); VERIFY(!(a != a).isMix()); } else { // masks of size 1 can never be a mix of 0 and 1 VERIFY(!(a == b).isMix()); VERIFY(!(a != b).isMix()); VERIFY(!(a == c).isMix()); VERIFY(!(a != c).isMix()); VERIFY(!(a == a).isMix()); VERIFY(!(a != a).isMix()); } } template void testAdd() { Vec a(Zero), b(Zero); COMPARE(a, b); a += 1; Vec c(1); COMPARE(a, c); COMPARE(a, b + 1); COMPARE(a, b + c); Vec x(Zero); } template void testSub() { Vec a(2), b(2); COMPARE(a, b); a -= 1; Vec c(1); COMPARE(a, c); COMPARE(a, b - 1); COMPARE(a, b - c); } template void testMul() { for (int i = 0; i < 10000; ++i) { V a = V::Random(); V b = V::Random(); V reference = a; for (int j = 0; j < V::Size; ++j) { // this could overflow - but at least the compiler can't know about it so it doesn't // matter that it's undefined behavior in C++. The only thing that matters is what the // hardware does... 
reference[j] *= b[j]; } COMPARE(a * b, reference) << a << " * " << b; } } template void testMulAdd() { for (unsigned int i = 0; i < 0xffff; ++i) { const Vec i2(i * i + 1); Vec a(i); FUZZY_COMPARE(a * a + 1, i2); } } template void testMulSub() { typedef typename Vec::EntryType T; for (unsigned int i = 0; i < 0xffff; ++i) { const T j = static_cast(i); const Vec test(j); FUZZY_COMPARE(test * test - test, Vec(j * j - j)); } } template void testDiv() { typedef typename Vec::EntryType T; // If this test fails for ICC see here: // http://software.intel.com/en-us/forums/topic/488995 const T stepsize = std::max(T(1), T(std::numeric_limits::max() / 1024)); for (T divisor = 1; divisor < 5; ++divisor) { for (T scalar = std::numeric_limits::min(); scalar < std::numeric_limits::max() - stepsize + 1; scalar += stepsize) { Vec vector(scalar); Vec reference(scalar / divisor); COMPARE(vector / divisor, reference) << '\n' << vector << " / " << divisor << ", reference: " << scalar << " / " << divisor << " = " << scalar / divisor; vector /= divisor; COMPARE(vector, reference); } } } template void testAnd() { Vec a(0x7fff); Vec b(0xf); COMPARE((a & 0xf), b); Vec c(IndexesFromZero); COMPARE(c, (c & 0xf)); const typename Vec::EntryType zero = 0; COMPARE((c & 0x7ff0), Vec(zero)); } template void testShift() { typedef typename Vec::EntryType T; const T step = std::max(1, std::numeric_limits::max() / 1000); enum { NShifts = sizeof(T) * 8 }; for (Vec x = std::numeric_limits::min() + Vec::IndexesFromZero(); x < std::numeric_limits::max() - step; x += step) { for (size_t shift = 0; shift < NShifts; ++shift) { const Vec rightShift = x >> shift; const Vec leftShift = x << shift; for (size_t k = 0; k < Vec::Size; ++k) { COMPARE(rightShift[k], T(x[k] >> shift)) << ", x[k] = " << x[k] << ", shift = " << shift; COMPARE(leftShift [k], T(x[k] << shift)) << ", x[k] = " << x[k] << ", shift = " << shift; } } } Vec a(1); Vec b(2); // left shifts COMPARE((a << 1), b); COMPARE((a << 2), (a << 2)); COMPARE((a << 2), (b << 1)); Vec shifts(IndexesFromZero); a <<= shifts; for (typename Vec::EntryType i = 0, x = 1; i < Vec::Size; ++i, x <<= 1) { COMPARE(a[i], x); } // right shifts a = Vec(4); COMPARE((a >> 1), b); COMPARE((a >> 2), (a >> 2)); COMPARE((a >> 2), (b >> 1)); a = Vec(16); a >>= shifts; for (typename Vec::EntryType i = 0, x = 16; i < Vec::Size; ++i, x >>= 1) { COMPARE(a[i], x); } } template void testOnesComplement() { Vec a(One); Vec b = ~a; COMPARE(~a, b); COMPARE(~b, a); COMPARE(~(a + b), Vec(Zero)); } template struct NegateRangeHelper { typedef int Iterator; static const Iterator Start; static const Iterator End; }; template<> struct NegateRangeHelper { typedef unsigned int Iterator; static const Iterator Start; static const Iterator End; }; template<> const int NegateRangeHelper::Start = -0xffffff; template<> const int NegateRangeHelper::End = 0xffffff - 133; template<> const int NegateRangeHelper::Start = -0xffffff; template<> const int NegateRangeHelper::End = 0xffffff - 133; template<> const int NegateRangeHelper::Start = -0x7fffffff; template<> const int NegateRangeHelper::End = 0x7fffffff - 0xee; const unsigned int NegateRangeHelper::Start = 0; const unsigned int NegateRangeHelper::End = 0xffffffff - 0xee; template<> const int NegateRangeHelper::Start = -0x7fff; template<> const int NegateRangeHelper::End = 0x7fff - 0xee; template<> const int NegateRangeHelper::Start = 0; template<> const int NegateRangeHelper::End = 0xffff - 0xee; template void testNegate() { typedef typename Vec::EntryType T; typedef 
NegateRangeHelper Range; for (typename Range::Iterator i = Range::Start; i < Range::End; i += 0xef) { T i2 = static_cast(i); Vec a(i2); COMPARE(static_cast(-a), Vec(-i2)) << " i2: " << i2; } } template void testMin() { typedef typename Vec::EntryType T; typedef typename Vec::Mask Mask; typedef typename Vec::IndexType I; Vec v(I::IndexesFromZero()); COMPARE(v.min(), static_cast(0)); COMPARE((T(Vec::Size) - v).min(), static_cast(1)); int j = 0; Mask m; do { m = allMasks(j++); if (m.isEmpty()) { break; } COMPARE(v.min(m), static_cast(m.firstOne())) << m << v; } while (true); } template void testMax() { typedef typename Vec::EntryType T; typedef typename Vec::Mask Mask; typedef typename Vec::IndexType I; Vec v(I::IndexesFromZero()); COMPARE(v.max(), static_cast(Vec::Size - 1)); v = T(Vec::Size) - v; COMPARE(v.max(), static_cast(Vec::Size)); int j = 0; Mask m; do { m = allMasks(j++); if (m.isEmpty()) { break; } COMPARE(v.max(m), static_cast(Vec::Size - m.firstOne())) << m << v; } while (true); } template void testProduct() { typedef typename Vec::EntryType T; typedef typename Vec::Mask Mask; for (int i = 0; i < 10; ++i) { T x = static_cast(i); Vec v(x); T x2 = x; for (int k = Vec::Size; k > 1; k /= 2) { x2 *= x2; } COMPARE(v.product(), x2); int j = 0; Mask m; do { m = allMasks(j++); if (m.isEmpty()) { break; } if (std::numeric_limits::is_exact) { x2 = x; for (int k = m.count(); k > 1; --k) { x2 *= x; } } else { x2 = static_cast(pow(static_cast(x), m.count())); } COMPARE(v.product(m), x2) << m << v; } while (true); } } template void testSum() { typedef typename Vec::EntryType T; typedef typename Vec::Mask Mask; for (int i = 0; i < 10; ++i) { T x = static_cast(i); Vec v(x); COMPARE(v.sum(), x * Vec::Size); int j = 0; Mask m; do { m = allMasks(j++); COMPARE(v.sum(m), x * m.count()) << m << v; } while (!m.isEmpty()); } } template void fma() { for (int i = 0; i < 1000; ++i) { V a = V::Random(); const V b = V::Random(); const V c = V::Random(); const V reference = a * b + c; a.fusedMultiplyAdd(b, c); COMPARE(a, reference) << ", a = " << a << ", b = " << b << ", c = " << c; } } template<> void fma() { float_v b = Vc_buildFloat(1, 0x000001, 0); float_v c = Vc_buildFloat(1, 0x000000, -24); float_v a = b; /*a *= b; a += c; COMPARE(a, float_v(Vc_buildFloat(1, 0x000002, 0))); a = b;*/ a.fusedMultiplyAdd(b, c); COMPARE(a, float_v(Vc_buildFloat(1, 0x000003, 0))); a = Vc_buildFloat(1, 0x000002, 0); b = Vc_buildFloat(1, 0x000002, 0); c = Vc_buildFloat(-1, 0x000000, 0); /*a *= b; a += c; COMPARE(a, float_v(Vc_buildFloat(1, 0x000000, -21))); a = b;*/ a.fusedMultiplyAdd(b, c); // 1 + 2^-21 + 2^-44 - 1 == (1 + 2^-20)*2^-18 COMPARE(a, float_v(Vc_buildFloat(1, 0x000001, -21))); } template<> void fma() { sfloat_v b = Vc_buildFloat(1, 0x000001, 0); sfloat_v c = Vc_buildFloat(1, 0x000000, -24); sfloat_v a = b; /*a *= b; a += c; COMPARE(a, sfloat_v(Vc_buildFloat(1, 0x000002, 0))); a = b;*/ a.fusedMultiplyAdd(b, c); COMPARE(a, sfloat_v(Vc_buildFloat(1, 0x000003, 0))); a = Vc_buildFloat(1, 0x000002, 0); b = Vc_buildFloat(1, 0x000002, 0); c = Vc_buildFloat(-1, 0x000000, 0); /*a *= b; a += c; COMPARE(a, sfloat_v(Vc_buildFloat(1, 0x000000, -21))); a = b;*/ a.fusedMultiplyAdd(b, c); // 1 + 2^-21 + 2^-44 - 1 == (1 + 2^-20)*2^-18 COMPARE(a, sfloat_v(Vc_buildFloat(1, 0x000001, -21))); } template<> void fma() { double_v b = Vc_buildDouble(1, 0x0000000000001, 0); double_v c = Vc_buildDouble(1, 0x0000000000000, -53); double_v a = b; a.fusedMultiplyAdd(b, c); COMPARE(a, double_v(Vc_buildDouble(1, 0x0000000000003, 0))); a = 
Vc_buildDouble(1, 0x0000000000002, 0); b = Vc_buildDouble(1, 0x0000000000002, 0); c = Vc_buildDouble(-1, 0x0000000000000, 0); a.fusedMultiplyAdd(b, c); // 1 + 2^-50 + 2^-102 - 1 COMPARE(a, double_v(Vc_buildDouble(1, 0x0000000000001, -50))); } int main(int argc, char **argv) { initTest(argc, argv); testAllTypes(fma); runTest(testZero); runTest(testZero); runTest(testZero); runTest(testZero); runTest(testZero); runTest(testZero); runTest(testZero); runTest(testCmp); runTest(testCmp); runTest(testCmp); runTest(testCmp); runTest(testCmp); runTest(testCmp); runTest(testCmp); runTest(testIsMix); runTest(testIsMix); //runTest(testIsMix); //runTest(testIsMix); runTest(testIsMix); runTest(testIsMix); //runTest(testIsMix); runTest(testAdd); runTest(testAdd); runTest(testAdd); runTest(testAdd); runTest(testAdd); runTest(testAdd); runTest(testAdd); runTest(testSub); runTest(testSub); runTest(testSub); runTest(testSub); runTest(testSub); runTest(testSub); runTest(testSub); runTest(testMul); runTest(testMul); runTest(testMul); runTest(testMul); runTest(testMul); runTest(testMul); runTest(testMul); runTest(testDiv); runTest(testDiv); runTest(testDiv); runTest(testDiv); runTest(testDiv); runTest(testDiv); runTest(testDiv); runTest(testAnd); runTest(testAnd); runTest(testAnd); runTest(testAnd); // no operator& for float/double runTest(testShift); runTest(testShift); runTest(testShift); runTest(testShift); runTest(testMulAdd); runTest(testMulAdd); runTest(testMulAdd); runTest(testMulAdd); runTest(testMulAdd); runTest(testMulAdd); runTest(testMulAdd); runTest(testMulSub); runTest(testMulSub); runTest(testMulSub); runTest(testMulSub); runTest(testMulSub); runTest(testMulSub); runTest(testMulSub); runTest(testOnesComplement); runTest(testOnesComplement); runTest(testOnesComplement); runTest(testOnesComplement); testAllTypes(testNegate); testAllTypes(testMin); testAllTypes(testMax); testAllTypes(testProduct); testAllTypes(testSum); return 0; } Vc-0.7.4/tests/casts.cpp000066400000000000000000000100161233512346000150520ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2010-2011 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
*/ #include "unittest.h" #include #include using namespace Vc; template void testNumber(double n) { typedef typename V1::EntryType T1; typedef typename V2::EntryType T2; // compare casts from T1 -> T2 with casts from V1 -> V2 const T1 n1 = static_cast(n); //std::cerr << "n1 = " << n1 << ", static_cast(n1) = " << static_cast(n1) << std::endl; COMPARE(static_cast(V1(n1)), V2(static_cast(n1))) << "\n n1: " << n1; } template double maxHelper() { return static_cast(std::numeric_limits::max()); } template<> double maxHelper() { const int intDigits = std::numeric_limits::digits; const int floatDigits = std::numeric_limits::digits; return static_cast(((int(1) << floatDigits) - 1) << (intDigits - floatDigits)); } template<> double maxHelper() { const int intDigits = std::numeric_limits::digits; const int floatDigits = std::numeric_limits::digits; return static_cast(((unsigned(1) << floatDigits) - 1) << (intDigits - floatDigits)); } template void testCast2() { typedef typename V1::EntryType T1; typedef typename V2::EntryType T2; const double max = std::min(maxHelper(), maxHelper()); const double min = std::max( std::numeric_limits::is_integer ? static_cast(std::numeric_limits::min()) : static_cast(-std::numeric_limits::max()), std::numeric_limits::is_integer ? static_cast(std::numeric_limits::min()) : static_cast(-std::numeric_limits::max()) ); testNumber(0.); testNumber(1.); testNumber(2.); testNumber(max); testNumber(max / 4 + max / 2); testNumber(max / 2); testNumber(max / 4); testNumber(min); } template void testCast() { testCast2(); } #define _CONCAT(A, B) A ## _ ## B #define CONCAT(A, B) _CONCAT(A, B) template struct T2Helper { typedef T1 V1; typedef T2 V2; }; void testFloatIndexesFromZero() { Vc::float_v test(Vc::int_v::IndexesFromZero()); for (int i = 0; i < float_v::Size; ++i) { COMPARE(test[i], float(i)); } } int main(int argc, char **argv) { initTest(argc, argv); #define TEST(v1, v2) \ typedef T2Helper CONCAT(v1, v2); \ runTest(testCast) TEST(float_v, float_v); TEST(float_v, int_v); TEST(float_v, uint_v); // needs special handling for different Size: //TEST(float_v, double_v); //TEST(float_v, short_v); //TEST(float_v, ushort_v); TEST(int_v, float_v); TEST(int_v, int_v); TEST(int_v, uint_v); TEST(uint_v, float_v); TEST(uint_v, int_v); TEST(uint_v, uint_v); TEST(ushort_v, sfloat_v); TEST(ushort_v, short_v); TEST(ushort_v, ushort_v); TEST(short_v, sfloat_v); TEST(short_v, short_v); TEST(short_v, ushort_v); TEST(sfloat_v, sfloat_v); TEST(sfloat_v, short_v); TEST(sfloat_v, ushort_v); #undef TEST runTest(testFloatIndexesFromZero); return 0; } Vc-0.7.4/tests/const.h000066400000000000000000000072031233512346000145340ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright (C) 2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
}}}*/ #ifndef VC_TESTS_CONST_H_ #define VC_TESTS_CONST_H_ #include /*OUTER_NAMESPACE_BEGIN*/ namespace Vc { template struct Math; template<> struct Math { static _VC_CONSTEXPR float e() { return 2.7182818284590452353602874713526625f; } static _VC_CONSTEXPR float log2e() { return 1.4426950408889634073599246810018921f; } static _VC_CONSTEXPR float log10e() { return 0.4342944819032518276511289189166051f; } static _VC_CONSTEXPR float ln2() { return Vc_buildFloat(1, 0x317218, -1); } // .693147182464599609375 static _VC_CONSTEXPR float ln10() { return 2.3025850929940456840179914546843642f; } static _VC_CONSTEXPR float pi() { return 3.1415926535897932384626433832795029f; } static _VC_CONSTEXPR float pi_2() { return 1.5707963267948966192313216916397514f; } static _VC_CONSTEXPR float pi_4() { return 0.7853981633974483096156608458198757f; } static _VC_CONSTEXPR float _1_pi() { return 0.3183098861837906715377675267450287f; } static _VC_CONSTEXPR float _2_pi() { return 0.6366197723675813430755350534900574f; } static _VC_CONSTEXPR float _2_sqrtpi() { return 1.1283791670955125738961589031215452f; } static _VC_CONSTEXPR float sqrt2() { return 1.4142135623730950488016887242096981f; } static _VC_CONSTEXPR float sqrt1_2() { return 0.7071067811865475244008443621048490f; } }; template<> struct Math { static _VC_CONSTEXPR double e() { return 2.7182818284590452353602874713526625; } static _VC_CONSTEXPR double log2e() { return 1.4426950408889634073599246810018921; } static _VC_CONSTEXPR double log10e() { return 0.4342944819032518276511289189166051; } static _VC_CONSTEXPR double ln2() { return Vc_buildDouble(1, 0x62E42FEFA39EFull, -1); } // .69314718055994528622676398299518041312694549560546875 static _VC_CONSTEXPR double ln10() { return 2.3025850929940456840179914546843642; } static _VC_CONSTEXPR double pi() { return 3.1415926535897932384626433832795029; } static _VC_CONSTEXPR double pi_2() { return 1.5707963267948966192313216916397514; } static _VC_CONSTEXPR double pi_4() { return 0.7853981633974483096156608458198757; } static _VC_CONSTEXPR double _1_pi() { return 0.3183098861837906715377675267450287; } static _VC_CONSTEXPR double _2_pi() { return 0.6366197723675813430755350534900574; } static _VC_CONSTEXPR double _2_sqrtpi() { return 1.1283791670955125738961589031215452; } static _VC_CONSTEXPR double sqrt2() { return 1.4142135623730950488016887242096981; } static _VC_CONSTEXPR double sqrt1_2() { return 0.7071067811865475244008443621048490; } }; } // namespace Vc /*OUTER_NAMESPACE_END*/ #include #endif // VC_TESTS_CONST_H_ Vc-0.7.4/tests/convert-sincos-reference.cpp000066400000000000000000000122131233512346000206460ustar00rootroot00000000000000#include template struct SincosReference { const T x, s, c; }; template struct Reference { const T x, ref; }; template struct Data { static const SincosReference sincosReference[]; static const Reference asinReference[]; static const Reference acosReference[]; static const Reference atanReference[]; static const Reference lnReference[]; static const Reference log2Reference[]; static const Reference log10Reference[]; }; namespace Function { enum Function { sincos, atan, asin, acos, ln, log2, log10 }; } template static inline const char *filenameOut(); template<> inline const char *filenameOut() { return "sincos-reference-single.dat"; } template<> inline const char *filenameOut() { return "sincos-reference-double.dat"; } template<> inline const char *filenameOut() { return "atan-reference-single.dat"; } template<> inline const char *filenameOut() { return 
"atan-reference-double.dat"; } template<> inline const char *filenameOut() { return "asin-reference-single.dat"; } template<> inline const char *filenameOut() { return "asin-reference-double.dat"; } template<> inline const char *filenameOut() { return "acos-reference-single.dat"; } template<> inline const char *filenameOut() { return "acos-reference-double.dat"; } template<> inline const char *filenameOut() { return "reference-ln-sp.dat"; } template<> inline const char *filenameOut() { return "reference-ln-dp.dat"; } template<> inline const char *filenameOut() { return "reference-log2-sp.dat"; } template<> inline const char *filenameOut() { return "reference-log2-dp.dat"; } template<> inline const char *filenameOut() { return "reference-log10-sp.dat"; } template<> inline const char *filenameOut() { return "reference-log10-dp.dat"; } template<> const SincosReference Data::sincosReference[] = { #include "sincos-reference-single.h" }; template<> const SincosReference Data::sincosReference[] = { #include "sincos-reference-double.h" }; template<> const Reference Data::asinReference[] = { #include "asin-reference-single.h" }; template<> const Reference Data::asinReference[] = { #include "asin-reference-double.h" }; template<> const Reference Data::acosReference[] = { #include "acos-reference-single.h" }; template<> const Reference Data::acosReference[] = { #include "acos-reference-double.h" }; template<> const Reference Data::atanReference[] = { #include "atan-reference-single.h" }; template<> const Reference Data::atanReference[] = { #include "atan-reference-double.h" }; template<> const Reference Data::lnReference[] = { #include "reference-ln-sp.h" }; template<> const Reference Data::lnReference[] = { #include "reference-ln-dp.h" }; template<> const Reference Data::log2Reference[] = { #include "reference-log2-sp.h" }; template<> const Reference Data::log2Reference[] = { #include "reference-log2-dp.h" }; template<> const Reference Data::log10Reference[] = { #include "reference-log10-sp.h" }; template<> const Reference Data::log10Reference[] = { #include "reference-log10-dp.h" }; template static void convert() { FILE *file; file = fopen(filenameOut(), "wb"); fwrite(&Data::sincosReference[0], sizeof(SincosReference), sizeof(Data::sincosReference) / sizeof(SincosReference), file); fclose(file); file = fopen(filenameOut(), "wb"); fwrite(&Data::atanReference[0], sizeof(Reference), sizeof(Data::atanReference) / sizeof(Reference), file); fclose(file); file = fopen(filenameOut(), "wb"); fwrite(&Data::asinReference[0], sizeof(Reference), sizeof(Data::asinReference) / sizeof(Reference), file); fclose(file); file = fopen(filenameOut(), "wb"); fwrite(&Data::acosReference[0], sizeof(Reference), sizeof(Data::acosReference) / sizeof(Reference), file); fclose(file); file = fopen(filenameOut(), "wb"); fwrite(&Data::lnReference[0], sizeof(Reference), sizeof(Data::lnReference) / sizeof(Reference), file); fclose(file); file = fopen(filenameOut(), "wb"); fwrite(&Data::log2Reference[0], sizeof(Reference), sizeof(Data::log2Reference) / sizeof(Reference), file); fclose(file); file = fopen(filenameOut(), "wb"); fwrite(&Data::log10Reference[0], sizeof(Reference), sizeof(Data::log10Reference) / sizeof(Reference), file); fclose(file); } int main() { convert(); convert(); return 0; } Vc-0.7.4/tests/deinterleave.cpp000066400000000000000000000360441233512346000164150ustar00rootroot00000000000000/* This file is part of the Vc library. 
Copyright (C) 2010-2011 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #include "unittest.h" #include #include using namespace Vc; /* * V \ M | float | double | ushort | short | uint | int * ---------+---------------------------------------------- * float_v | X | | X | X | | * sfloat_v | X | | X | X | | * double_v | | X | | | | * int_v | | | | X | | X * uint_v | | | X | | X | * short_v | | | | X | | * ushort_v | | | X | | | */ template struct TPair { typedef A V; typedef B M; }; typedef TPair float_float; typedef TPair float_ushort; typedef TPair float_short; typedef TPair sfloat_float; typedef TPair sfloat_ushort; typedef TPair sfloat_short; typedef TPair double_double; typedef TPair short_short; typedef TPair ushort_ushort; typedef TPair int_int; typedef TPair int_short; typedef TPair uint_uint; typedef TPair uint_ushort; template void testDeinterleave() { typedef typename Pair::V V; typedef typename Pair::M M; typedef typename V::IndexType I; const bool isSigned = std::numeric_limits::is_signed; const typename V::EntryType offset = isSigned ? -512 : 0; const V _0246 = static_cast(I::IndexesFromZero()) * 2 + offset; M memory[1024]; for (int i = 0; i < 1024; ++i) { memory[i] = static_cast(i + offset); } V a, b; for (int i = 0; i < 1024 - 2 * V::Size; ++i) { // note that a 32 bit integer is certainly enough to decide on alignment... // ... but uintptr_t is C99 but not C++ yet // ... 
and GCC refuses to do the cast, even if I know what I'm doing if (reinterpret_cast(&memory[i]) & (VectorAlignment - 1)) { Vc::deinterleave(&a, &b, &memory[i], Unaligned); } else { Vc::deinterleave(&a, &b, &memory[i]); } COMPARE(_0246 + i, a); COMPARE(_0246 + i + 1, b); } } template struct SomeStruct { T d[N]; }; template struct Types { typedef typename V::EntryType T; typedef typename V::IndexType I; typedef typename V::AsArg VArg; typedef typename I::AsArg IArg; typedef SomeStruct S; typedef const Vc::InterleavedMemoryWrapper &Wrapper; }; template struct TestDeinterleaveGatherCompare; template struct TestDeinterleaveGatherCompare { static void test(typename Types::Wrapper data_v, typename Types::IArg indexes, const typename V::AsArg reference) { V v0, v1, v2, v3, v4, v5, v6, v7; (v0, v1, v2, v3, v4, v5, v6, v7) = data_v[indexes]; COMPARE(v0, reference + 0) << "N = 8"; COMPARE(v1, reference + 1) << "N = 8"; COMPARE(v2, reference + 2) << "N = 8"; COMPARE(v3, reference + 3) << "N = 8"; COMPARE(v4, reference + 4) << "N = 8"; COMPARE(v5, reference + 5) << "N = 8"; COMPARE(v6, reference + 6) << "N = 8"; COMPARE(v7, reference + 7) << "N = 8"; TestDeinterleaveGatherCompare::test(data_v, indexes, reference); } }; template struct TestDeinterleaveGatherCompare { static void test(typename Types::Wrapper data_v, typename Types::IArg indexes, const typename V::AsArg reference) { V v0, v1, v2, v3, v4, v5, v6; (v0, v1, v2, v3, v4, v5, v6) = data_v[indexes]; COMPARE(v0, reference + 0) << "N = 7"; COMPARE(v1, reference + 1) << "N = 7"; COMPARE(v2, reference + 2) << "N = 7"; COMPARE(v3, reference + 3) << "N = 7"; COMPARE(v4, reference + 4) << "N = 7"; COMPARE(v5, reference + 5) << "N = 7"; COMPARE(v6, reference + 6) << "N = 7"; TestDeinterleaveGatherCompare::test(data_v, indexes, reference); } }; template struct TestDeinterleaveGatherCompare { static void test(typename Types::Wrapper data_v, typename Types::IArg indexes, const typename V::AsArg reference) { V v0, v1, v2, v3, v4, v5; (v0, v1, v2, v3, v4, v5) = data_v[indexes]; COMPARE(v0, reference + 0) << "N = 6"; COMPARE(v1, reference + 1) << "N = 6"; COMPARE(v2, reference + 2) << "N = 6"; COMPARE(v3, reference + 3) << "N = 6"; COMPARE(v4, reference + 4) << "N = 6"; COMPARE(v5, reference + 5) << "N = 6"; TestDeinterleaveGatherCompare::test(data_v, indexes, reference); } }; template struct TestDeinterleaveGatherCompare { static void test(typename Types::Wrapper data_v, typename Types::IArg indexes, const typename V::AsArg reference) { V v0, v1, v2, v3, v4; (v0, v1, v2, v3, v4) = data_v[indexes]; COMPARE(v0, reference + 0) << "N = 5"; COMPARE(v1, reference + 1) << "N = 5"; COMPARE(v2, reference + 2) << "N = 5"; COMPARE(v3, reference + 3) << "N = 5"; COMPARE(v4, reference + 4) << "N = 5"; TestDeinterleaveGatherCompare::test(data_v, indexes, reference); } }; template struct TestDeinterleaveGatherCompare { static void test(typename Types::Wrapper data_v, typename Types::IArg indexes, const typename V::AsArg reference) { V a, b, c, d; (a, b, c, d) = data_v[indexes]; COMPARE(a, reference + 0) << "N = 4"; COMPARE(b, reference + 1) << "N = 4"; COMPARE(c, reference + 2) << "N = 4"; COMPARE(d, reference + 3) << "N = 4"; TestDeinterleaveGatherCompare::test(data_v, indexes, reference); } }; template struct TestDeinterleaveGatherCompare { static void test(typename Types::Wrapper data_v, typename Types::IArg indexes, const typename V::AsArg reference) { V a, b, c; (a, b, c) = data_v[indexes]; COMPARE(a, reference + 0) << "N = 3"; COMPARE(b, reference + 1) << "N = 3"; 
COMPARE(c, reference + 2) << "N = 3"; TestDeinterleaveGatherCompare::test(data_v, indexes, reference); } }; template struct TestDeinterleaveGatherCompare { static void test(typename Types::Wrapper data_v, typename Types::IArg indexes, const typename V::AsArg reference) { V a, b; (a, b) = data_v[indexes]; COMPARE(a, reference + 0) << "N = 2"; COMPARE(b, reference + 1) << "N = 2"; } }; size_t createNMask(size_t N) { size_t NMask = (N >> 1) | (N >> 2); for (size_t shift = 2; shift < sizeof(size_t) * 8; shift *= 2) { NMask |= NMask >> shift; } return NMask; } template void testDeinterleaveGatherImpl() { typedef typename V::EntryType T; typedef typename V::IndexType I; typedef SomeStruct S; typedef Vc::InterleavedMemoryWrapper Wrapper; const size_t N = std::min(std::numeric_limits::max(), 1024 * 1024 / sizeof(S)); const size_t NMask = createNMask(N); S *data = Vc::malloc(N); for (size_t i = 0; i < N; ++i) { for (size_t j = 0; j < StructSize; ++j) { data[i].d[j] = i * StructSize + j; } } const Wrapper data_v(data); for (int retest = 0; retest < 10000; ++retest) { I indexes = (I::Random() >> 10) & I(NMask); VERIFY(indexes >= 0); VERIFY(indexes < N); const V reference = static_cast(indexes) * V(StructSize); TestDeinterleaveGatherCompare::test(data_v, indexes, reference); } } template void testDeinterleaveGather() { testDeinterleaveGatherImpl(); testDeinterleaveGatherImpl(); testDeinterleaveGatherImpl(); testDeinterleaveGatherImpl(); testDeinterleaveGatherImpl(); testDeinterleaveGatherImpl(); testDeinterleaveGatherImpl(); } template struct TestInterleavingScatterCompare; #define _IMPL(STRUCTSIZE, _code_) \ template struct TestInterleavingScatterCompare { \ typedef TestInterleavingScatterCompare NextTest; \ template static void test(Wrapper &data, const typename V::IndexType &i) { \ _code_ \ } \ } _IMPL(2, const V v0 = V::Random(); const V v1 = V::Random(); V t0; V t1; data[i] = (v0, v1); (t0, t1) = data[i]; COMPARE(t0, v0) << 2; COMPARE(t1, v1) << 2; ); _IMPL(3, const V v0 = V::Random(); const V v1 = V::Random(); const V v2 = V::Random(); V t0; V t1; V t2; data[i] = (v0, v1, v2); (t0, t1, t2) = data[i]; COMPARE(t0, v0) << 3; COMPARE(t1, v1) << 3; COMPARE(t2, v2) << 3; NextTest::test(data, i); ); _IMPL(4, const V v0 = V::Random(); const V v1 = V::Random(); const V v2 = V::Random(); const V v3 = V::Random(); V t0; V t1; V t2; V t3; data[i] = (v0, v1, v2, v3); (t0, t1, t2, t3) = data[i]; COMPARE(t0, v0) << 4; COMPARE(t1, v1) << 4; COMPARE(t2, v2) << 4; COMPARE(t3, v3) << 4; NextTest::test(data, i); ); _IMPL(5, const V v0 = V::Random(); const V v1 = V::Random(); const V v2 = V::Random(); const V v3 = V::Random(); const V v4 = V::Random(); V t0; V t1; V t2; V t3; V t4; data[i] = (v0, v1, v2, v3, v4); (t0, t1, t2, t3, t4) = data[i]; COMPARE(t0, v0) << 5; COMPARE(t1, v1) << 5; COMPARE(t2, v2) << 5; COMPARE(t3, v3) << 5; COMPARE(t4, v4) << 5; NextTest::test(data, i); ); _IMPL(6, const V v0 = V::Random(); const V v1 = V::Random(); const V v2 = V::Random(); const V v3 = V::Random(); const V v4 = V::Random(); const V v5 = V::Random(); V t0; V t1; V t2; V t3; V t4; V t5; data[i] = (v0, v1, v2, v3, v4, v5); (t0, t1, t2, t3, t4, t5) = data[i]; COMPARE(t0, v0) << 6; COMPARE(t1, v1) << 6; COMPARE(t2, v2) << 6; COMPARE(t3, v3) << 6; COMPARE(t4, v4) << 6; COMPARE(t5, v5) << 6; NextTest::test(data, i); ); _IMPL(7, const V v0 = V::Random(); const V v1 = V::Random(); const V v2 = V::Random(); const V v3 = V::Random(); const V v4 = V::Random(); const V v5 = V::Random(); const V v6 = V::Random(); V t0; V t1; V t2; V t3; 
V t4; V t5; V t6; data[i] = (v0, v1, v2, v3, v4, v5, v6); (t0, t1, t2, t3, t4, t5, t6) = data[i]; COMPARE(t0, v0) << 7; COMPARE(t1, v1) << 7; COMPARE(t2, v2) << 7; COMPARE(t3, v3) << 7; COMPARE(t4, v4) << 7; COMPARE(t5, v5) << 7; COMPARE(t6, v6) << 7; NextTest::test(data, i); ); _IMPL(8, const V v0 = V::Random(); const V v1 = V::Random(); const V v2 = V::Random(); const V v3 = V::Random(); const V v4 = V::Random(); const V v5 = V::Random(); const V v6 = V::Random(); const V v7 = V::Random(); V t0; V t1; V t2; V t3; V t4; V t5; V t6; V t7; data[i] = (v0, v1, v2, v3, v4, v5, v6, v7); (t0, t1, t2, t3, t4, t5, t6, t7) = data[i]; COMPARE(t0, v0) << 8; COMPARE(t1, v1) << 8; COMPARE(t2, v2) << 8; COMPARE(t3, v3) << 8; COMPARE(t4, v4) << 8; COMPARE(t5, v5) << 8; COMPARE(t6, v6) << 8; COMPARE(t7, v7) << 8; NextTest::test(data, i); ); template void testInterleavingScatterImpl() { typedef typename V::EntryType T; typedef typename V::IndexType I; typedef SomeStruct S; typedef Vc::InterleavedMemoryWrapper Wrapper; const size_t N = std::min(std::numeric_limits::max(), 1024 * 1024 / sizeof(S)); const size_t NMask = createNMask(N); S *data = Vc::malloc(N); std::memset(data, 0, sizeof(S) * N); Wrapper data_v(data); for (int retest = 0; retest < 10000; ++retest) { I indexes = (I::Random() >> 10) & I(NMask); if (I::Size != 1) { // ensure the indexes are unique while(!(indexes.sorted() == indexes.sorted().rotated(1)).isEmpty()) { indexes = (I::Random() >> 10) & I(NMask); } } VERIFY(indexes >= 0); VERIFY(indexes < N); TestInterleavingScatterCompare::test(data_v, indexes); } } template void testInterleavingScatter() { testInterleavingScatterImpl(); testInterleavingScatterImpl(); testInterleavingScatterImpl(); testInterleavingScatterImpl(); testInterleavingScatterImpl(); testInterleavingScatterImpl(); testInterleavingScatterImpl(); } int main() { runTest(testDeinterleave); runTest(testDeinterleave); runTest(testDeinterleave); runTest(testDeinterleave); runTest(testDeinterleave); runTest(testDeinterleave); runTest(testDeinterleave); runTest(testDeinterleave); runTest(testDeinterleave); runTest(testDeinterleave); runTest(testDeinterleave); runTest(testDeinterleave); runTest(testDeinterleave); testAllTypes(testDeinterleaveGather); testAllTypes(testInterleavingScatter); } Vc-0.7.4/tests/download.cmake000066400000000000000000000001401233512346000160370ustar00rootroot00000000000000file(DOWNLOAD "http://compeng.uni-frankfurt.de/~kretz/Vc-testdata/${filename}" "./${filename}") Vc-0.7.4/tests/expandandmerge.cpp000066400000000000000000000054001233512346000167200ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
*/ #include "unittest.h" #include using namespace Vc; enum { VectorSizeFactor = short_v::Size / int_v::Size }; void testSigned() { for (int start = -32000; start < 32000; start += 5) { int_v a[VectorSizeFactor]; for (int i = 0; i < VectorSizeFactor; ++i) { a[i] = int_v(IndexesFromZero) + int_v::Size * i + start; } short_v b(a); COMPARE(b, short_v(IndexesFromZero) + start); // false positive: warning: ‘c’ is used uninitialized in this function int_v c[VectorSizeFactor]; b.expand(c); for (int i = 0; i < VectorSizeFactor; ++i) { COMPARE(c[i], int_v(IndexesFromZero) + int_v::Size * i + start); } } } void testUnsigned() { #if defined(VC_IMPL_SSE4_1) || defined(VC_IMPL_AVX) for (unsigned int start = 0; start < 64000; start += 5) { #else for (unsigned int start = 0; start < 32000; start += 5) { #endif uint_v a[VectorSizeFactor]; for (unsigned int i = 0; i < VectorSizeFactor; ++i) { a[i] = uint_v(IndexesFromZero) + uint_v::Size * i + start; } ushort_v b(a); COMPARE(b, ushort_v(IndexesFromZero) + start); // false positive: warning: ‘c’ is used uninitialized in this function uint_v c[VectorSizeFactor]; b.expand(c); for (unsigned int i = 0; i < VectorSizeFactor; ++i) { COMPARE(c[i], uint_v(IndexesFromZero) + uint_v::Size * i + start); } } for (unsigned int start = 32000; start < 64000; start += 5) { ushort_v b(IndexesFromZero); b += start; COMPARE(b, ushort_v(IndexesFromZero) + start); // false positive: warning: ‘c’ may be used uninitialized in this function uint_v c[VectorSizeFactor]; b.expand(c); for (unsigned int i = 0; i < VectorSizeFactor; ++i) { COMPARE(c[i], uint_v(IndexesFromZero) + uint_v::Size * i + start); } } } int main() { runTest(testSigned); runTest(testUnsigned); return 0; } Vc-0.7.4/tests/gather.cpp000066400000000000000000000142471233512346000152210ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #include "unittest.h" #include using namespace Vc; template void maskedGatherArray() { typedef typename Vec::IndexType It; typedef typename Vec::EntryType T; T mem[Vec::Size]; for (int i = 0; i < Vec::Size; ++i) { mem[i] = i + 1; } It indexes = It::IndexesFromZero(); for_all_masks(Vec, m) { const Vec a(mem, indexes, m); for (int i = 0; i < Vec::Size; ++i) { COMPARE(a[i], m[i] ? mem[i] : 0) << " i = " << i << ", m = " << m; } T x = Vec::Size + 1; Vec b = x; b.gather(mem, indexes, m); for (int i = 0; i < Vec::Size; ++i) { COMPARE(b[i], m[i] ? 
mem[i] : x) << " i = " << i << ", m = " << m; } } } template void gatherArray() { typedef typename Vec::IndexType It; typedef typename Vec::EntryType T; typedef typename It::Mask M; const int count = 39999; T array[count]; for (int i = 0; i < count; ++i) { array[i] = i + 1; } M mask; for (It i = It(IndexesFromZero); !(mask = (i < count)).isEmpty(); i += Vec::Size) { const Vec ii(i + 1); const typename Vec::Mask castedMask = static_cast(mask); if (castedMask.isFull()) { Vec a(array, i); COMPARE(a, ii) << "\n i: " << i; Vec b(Zero); b.gather(array, i); COMPARE(b, ii); COMPARE(a, b); } Vec b(Zero); b.gather(array, i, castedMask); COMPARE(castedMask, (b == ii)) << ", b = " << b << ", ii = " << ii << ", i = " << i; if (!castedMask.isFull()) { COMPARE(!castedMask, b == Vec(Zero)); } } const typename Vec::Mask k(Zero); Vec a(One); a.gather(array, It(IndexesFromZero), k); COMPARE(a, Vec(One)); } template struct Struct { T a; char x; T b; short y; T c; char z; }; template void gatherStruct() { typedef typename Vec::IndexType It; typedef typename Vec::EntryType T; typedef Struct S; const int count = 3999; S array[count]; for (int i = 0; i < count; ++i) { array[i].a = i; array[i].b = i + 1; array[i].c = i + 2; } typename It::Mask mask; for (It i = It(IndexesFromZero); !(mask = (i < count)).isEmpty(); i += Vec::Size) { // if Vec is double_v the cast keeps only the lower two values, which is why the == // comparison works const Vec i0(i); const Vec i1(i + 1); const Vec i2(i + 2); const typename Vec::Mask castedMask(mask); if (castedMask.isFull()) { Vec a(array, &S::a, i); COMPARE(a, i0) << "\ni: " << i; a.gather(array, &S::b, i); COMPARE(a, i1); a.gather(array, &S::c, i); COMPARE(a, i2); } Vec b(Zero); b.gather(array, &S::a, i, castedMask); COMPARE(castedMask, (b == i0)); if (!castedMask.isFull()) { COMPARE(!castedMask, b == Vec(Zero)); } b.gather(array, &S::b, i, castedMask); COMPARE(castedMask, (b == i1)); if (!castedMask.isFull()) { COMPARE(!castedMask, b == Vec(Zero)); } b.gather(array, &S::c, i, castedMask); COMPARE(castedMask, (b == i2)); if (!castedMask.isFull()) { COMPARE(!castedMask, b == Vec(Zero)); } } } template struct Row { T *data; }; template void gather2dim() { typedef typename Vec::IndexType It; typedef typename Vec::EntryType T; const int count = 399; typedef Row S; S array[count]; for (int i = 0; i < count; ++i) { array[i].data = new T[count]; for (int j = 0; j < count; ++j) { array[i].data[j] = 2 * i + j + 1; } } typename It::Mask mask; for (It i = It(IndexesFromZero); !(mask = (i < count)).isEmpty(); i += Vec::Size) { for (It j = It(IndexesFromZero); !(mask &= (j < count)).isEmpty(); j += Vec::Size) { const Vec i0(i * 2 + j + 1); const typename Vec::Mask castedMask(mask); Vec a(array, &S::data, i, j, castedMask); COMPARE(castedMask, castedMask && (a == i0)) << ", a = " << a << ", i0 = " << i0 << ", i = " << i << ", j = " << j; Vec b(Zero); b.gather(array, &S::data, i, j, castedMask); COMPARE(castedMask, (b == i0)); if (!castedMask.isFull()) { COMPARE(!castedMask, b == Vec(Zero)); } else { Vec c(array, &S::data, i, j); VERIFY((c == i0).isFull()); Vec d(Zero); d.gather(array, &S::data, i, j); VERIFY((d == i0).isFull()); } } } for (int i = 0; i < count; ++i) { delete[] array[i].data; } } int main(int argc, char **argv) { initTest(argc, argv); testAllTypes(gatherArray); testAllTypes(maskedGatherArray); #if defined(VC_CLANG) && VC_CLANG <= 0x030000 // clang fails with: // candidate template ignored: failed template argument deduction // template inline Vector(const S1 *array, const T 
S1::* member1, IT indexes, Mask mask = true) #warning "Skipping compilation of tests gatherStruct and gather2dim because of clang bug" #else testAllTypes(gatherStruct); testAllTypes(gather2dim); #endif return 0; } Vc-0.7.4/tests/implicit_type_conversion.cpp000066400000000000000000000276761233512346000211010ustar00rootroot00000000000000/*{{{ Copyright (C) 2012 Matthias Kretz This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . }}}*/ #include "unittest.h" //#define QUICK 1 using namespace Vc; typedef unsigned short ushort; typedef unsigned int uint; typedef unsigned long ulong; typedef long long longlong; typedef unsigned long long ulonglong; #ifdef QUICK #define _TYPE_TEST(a, b, c) #define _TYPE_TEST_ERR(a, b) #else #if defined(VC_GCC) && VC_GCC == 0x40801 // Skipping tests involving operator& because of a bug in GCC 4.8.1 (http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57532) #define _TYPE_TEST(a, b, c) \ COMPARE(typeid(a() * b()), typeid(c)); \ COMPARE(typeid(a() / b()), typeid(c)); \ COMPARE(typeid(a() + b()), typeid(c)); \ COMPARE(typeid(a() - b()), typeid(c)); \ COMPARE(typeid(a() | b()), typeid(c)); \ COMPARE(typeid(a() ^ b()), typeid(c)); \ COMPARE(typeid(a() == b()), typeid(c::Mask)); \ COMPARE(typeid(a() != b()), typeid(c::Mask)); \ COMPARE(typeid(a() <= b()), typeid(c::Mask)); \ COMPARE(typeid(a() >= b()), typeid(c::Mask)); \ COMPARE(typeid(a() < b()), typeid(c::Mask)); #else #define _TYPE_TEST(a, b, c) \ COMPARE(typeid(a() * b()), typeid(c)); \ COMPARE(typeid(a() / b()), typeid(c)); \ COMPARE(typeid(a() + b()), typeid(c)); \ COMPARE(typeid(a() - b()), typeid(c)); \ COMPARE(typeid(a() & b()), typeid(c)); \ COMPARE(typeid(a() | b()), typeid(c)); \ COMPARE(typeid(a() ^ b()), typeid(c)); \ COMPARE(typeid(a() == b()), typeid(c::Mask)); \ COMPARE(typeid(a() != b()), typeid(c::Mask)); \ COMPARE(typeid(a() <= b()), typeid(c::Mask)); \ COMPARE(typeid(a() >= b()), typeid(c::Mask)); \ COMPARE(typeid(a() < b()), typeid(c::Mask)); #endif #define _TYPE_TEST_ERR(a, b) \ COMPARE(typeid(a() * b()), typeid(Vc::Error::invalid_operands_of_types)); \ COMPARE(typeid(a() / b()), typeid(Vc::Error::invalid_operands_of_types)); \ COMPARE(typeid(a() + b()), typeid(Vc::Error::invalid_operands_of_types)); \ COMPARE(typeid(a() - b()), typeid(Vc::Error::invalid_operands_of_types)); \ COMPARE(typeid(a() & b()), typeid(Vc::Error::invalid_operands_of_types)); \ COMPARE(typeid(a() | b()), typeid(Vc::Error::invalid_operands_of_types)); \ COMPARE(typeid(a() ^ b()), typeid(Vc::Error::invalid_operands_of_types)); \ COMPARE(typeid(a() == b()), typeid(Vc::Error::invalid_operands_of_types)); \ COMPARE(typeid(a() != b()), typeid(Vc::Error::invalid_operands_of_types)); \ COMPARE(typeid(a() <= b()), typeid(Vc::Error::invalid_operands_of_types)); \ COMPARE(typeid(a() >= b()), typeid(Vc::Error::invalid_operands_of_types)); \ COMPARE(typeid(a() < b()), typeid(Vc::Error::invalid_operands_of_types)); #endif #define TYPE_TEST(a, b, c) \ _TYPE_TEST(a, b, c) \ COMPARE(typeid(a() > b()), typeid(c::Mask)) template struct 
TestImplicitCast { static bool test(const T &) { return true; } static bool test( ... ) { return false; } }; enum SomeEnum { EnumValue = 0 }; SomeEnum Enum() { return EnumValue; } void testImplicitTypeConversions() { VERIFY( TestImplicitCast< int>::test(double())); VERIFY( TestImplicitCast< int>::test( float())); VERIFY( TestImplicitCast< int>::test( Enum())); VERIFY( TestImplicitCast< int>::test( short())); VERIFY( TestImplicitCast< int>::test(ushort())); VERIFY( TestImplicitCast< int>::test( char())); VERIFY( TestImplicitCast< int>::test( uint())); VERIFY( TestImplicitCast< int>::test( long())); VERIFY( TestImplicitCast< int>::test( ulong())); VERIFY( TestImplicitCast< int>::test( bool())); VERIFY( TestImplicitCast::test(double())); VERIFY(!TestImplicitCast::test( float())); VERIFY(!TestImplicitCast::test( int())); VERIFY( TestImplicitCast< float_v>::test( float())); VERIFY( TestImplicitCast::test( float())); VERIFY( TestImplicitCast< int_v>::test( int())); VERIFY( TestImplicitCast< uint_v>::test( uint())); VERIFY( TestImplicitCast< short_v>::test( short())); VERIFY( TestImplicitCast::test(ushort())); TYPE_TEST( double_v, double_v, double_v); TYPE_TEST( double_v, double, double_v); TYPE_TEST( double_v, float, double_v); TYPE_TEST( double_v, short, double_v); TYPE_TEST( double_v, ushort, double_v); TYPE_TEST( double_v, int, double_v); TYPE_TEST( double_v, uint, double_v); TYPE_TEST( double_v, long, double_v); TYPE_TEST( double_v, ulong, double_v); TYPE_TEST( double_v, longlong, double_v); TYPE_TEST( double_v, ulonglong, double_v); TYPE_TEST( double_v, Enum, double_v); TYPE_TEST( double, double_v, double_v); TYPE_TEST( float, double_v, double_v); TYPE_TEST( short, double_v, double_v); TYPE_TEST( ushort, double_v, double_v); TYPE_TEST( int, double_v, double_v); TYPE_TEST( uint, double_v, double_v); TYPE_TEST( long, double_v, double_v); TYPE_TEST( ulong, double_v, double_v); TYPE_TEST( longlong, double_v, double_v); TYPE_TEST(ulonglong, double_v, double_v); // double_v done TYPE_TEST( float_v, float_v, float_v); TYPE_TEST( float_v, float, float_v); TYPE_TEST( float_v, short, float_v); TYPE_TEST( float_v, ushort, float_v); TYPE_TEST( float_v, int_v, float_v); TYPE_TEST( float_v, int, float_v); TYPE_TEST( float_v, uint_v, float_v); TYPE_TEST( float_v, uint, float_v); TYPE_TEST( float_v, long, float_v); TYPE_TEST( float_v, ulong, float_v); TYPE_TEST( float_v, longlong, float_v); TYPE_TEST( float_v, ulonglong, float_v); TYPE_TEST( float, float_v, float_v); TYPE_TEST( short, float_v, float_v); TYPE_TEST( ushort, float_v, float_v); TYPE_TEST( int_v, float_v, float_v); TYPE_TEST( int, float_v, float_v); TYPE_TEST( uint_v, float_v, float_v); TYPE_TEST( uint, float_v, float_v); TYPE_TEST( long, float_v, float_v); TYPE_TEST( ulong, float_v, float_v); TYPE_TEST( longlong, float_v, float_v); TYPE_TEST(ulonglong, float_v, float_v); // double_v + float_v done TYPE_TEST( sfloat_v, sfloat_v, sfloat_v); TYPE_TEST( sfloat_v, float, sfloat_v); TYPE_TEST( sfloat_v, short_v, sfloat_v); TYPE_TEST( sfloat_v, short, sfloat_v); TYPE_TEST( sfloat_v, ushort_v, sfloat_v); TYPE_TEST( sfloat_v, ushort, sfloat_v); TYPE_TEST( sfloat_v, int, sfloat_v); TYPE_TEST( sfloat_v, uint, sfloat_v); TYPE_TEST( sfloat_v, long, sfloat_v); TYPE_TEST( sfloat_v, ulong, sfloat_v); TYPE_TEST( sfloat_v, longlong, sfloat_v); TYPE_TEST( sfloat_v, ulonglong, sfloat_v); TYPE_TEST( sfloat_v, sfloat_v, sfloat_v); TYPE_TEST( float, sfloat_v, sfloat_v); TYPE_TEST( short_v, sfloat_v, sfloat_v); TYPE_TEST( short, sfloat_v, sfloat_v); TYPE_TEST( 
ushort_v, sfloat_v, sfloat_v); TYPE_TEST( ushort, sfloat_v, sfloat_v); TYPE_TEST( int, sfloat_v, sfloat_v); TYPE_TEST( uint, sfloat_v, sfloat_v); TYPE_TEST( long, sfloat_v, sfloat_v); TYPE_TEST( ulong, sfloat_v, sfloat_v); TYPE_TEST( longlong, sfloat_v, sfloat_v); TYPE_TEST(ulonglong, sfloat_v, sfloat_v); // double_v + float_v + sfloat_v done TYPE_TEST( short_v, short_v, short_v); TYPE_TEST( short_v, short, short_v); TYPE_TEST( short_v, ushort_v, ushort_v); TYPE_TEST( short_v, ushort, ushort_v); TYPE_TEST( short_v, int, short_v); TYPE_TEST( short_v, uint, ushort_v); TYPE_TEST( short_v, long, short_v); TYPE_TEST( short_v, ulong, ushort_v); TYPE_TEST( short_v, longlong, short_v); TYPE_TEST( short_v, ulonglong, ushort_v); TYPE_TEST( short, short_v, short_v); TYPE_TEST( ushort_v, short_v, ushort_v); TYPE_TEST( ushort, short_v, ushort_v); TYPE_TEST( int, short_v, short_v); TYPE_TEST( uint, short_v, ushort_v); TYPE_TEST( long, short_v, short_v); TYPE_TEST( ulong, short_v, ushort_v); TYPE_TEST( longlong, short_v, short_v); TYPE_TEST(ulonglong, short_v, ushort_v); // double_v + float_v + sfloat_v + short_v done TYPE_TEST( ushort_v, short, ushort_v); TYPE_TEST( ushort_v, ushort_v, ushort_v); TYPE_TEST( ushort_v, ushort, ushort_v); TYPE_TEST( ushort_v, int, ushort_v); TYPE_TEST( ushort_v, uint, ushort_v); TYPE_TEST( ushort_v, long, ushort_v); TYPE_TEST( ushort_v, ulong, ushort_v); TYPE_TEST( ushort_v, longlong, ushort_v); TYPE_TEST( ushort_v, ulonglong, ushort_v); TYPE_TEST( short, ushort_v, ushort_v); TYPE_TEST( ushort, ushort_v, ushort_v); TYPE_TEST( int, ushort_v, ushort_v); TYPE_TEST( uint, ushort_v, ushort_v); TYPE_TEST( long, ushort_v, ushort_v); TYPE_TEST( ulong, ushort_v, ushort_v); TYPE_TEST( longlong, ushort_v, ushort_v); TYPE_TEST(ulonglong, ushort_v, ushort_v); // double_v + float_v + sfloat_v + short_v + ushort_v done TYPE_TEST( int_v, ushort, uint_v); TYPE_TEST( int_v, short, int_v); TYPE_TEST( int_v, int_v, int_v); TYPE_TEST( int_v, int, int_v); TYPE_TEST( int_v, uint_v, uint_v); TYPE_TEST( int_v, uint, uint_v); TYPE_TEST( int_v, long, int_v); TYPE_TEST( int_v, ulong, uint_v); TYPE_TEST( int_v, longlong, int_v); TYPE_TEST( int_v, ulonglong, uint_v); TYPE_TEST( ushort, int_v, uint_v); TYPE_TEST( short, int_v, int_v); TYPE_TEST( int, int_v, int_v); TYPE_TEST( uint_v, int_v, uint_v); TYPE_TEST( uint, int_v, uint_v); TYPE_TEST( long, int_v, int_v); TYPE_TEST( ulong, int_v, uint_v); TYPE_TEST( longlong, int_v, int_v); TYPE_TEST(ulonglong, int_v, uint_v); TYPE_TEST( uint_v, short, uint_v); TYPE_TEST( uint_v, ushort, uint_v); TYPE_TEST( uint_v, int_v, uint_v); TYPE_TEST( uint_v, int, uint_v); TYPE_TEST( uint_v, uint_v, uint_v); TYPE_TEST( uint_v, uint, uint_v); TYPE_TEST( uint_v, long, uint_v); TYPE_TEST( uint_v, ulong, uint_v); TYPE_TEST( uint_v, longlong, uint_v); TYPE_TEST( uint_v, ulonglong, uint_v); TYPE_TEST( short, uint_v, uint_v); TYPE_TEST( ushort, uint_v, uint_v); TYPE_TEST( int_v, uint_v, uint_v); TYPE_TEST( int, uint_v, uint_v); TYPE_TEST( uint, uint_v, uint_v); TYPE_TEST( long, uint_v, uint_v); TYPE_TEST( ulong, uint_v, uint_v); TYPE_TEST( longlong, uint_v, uint_v); TYPE_TEST(ulonglong, uint_v, uint_v); } int main(int argc, char **argv) { initTest(argc, argv); runTest(testImplicitTypeConversions); return 0; } Vc-0.7.4/tests/implicit_type_conversion_failures.cpp000066400000000000000000000003431233512346000227510ustar00rootroot00000000000000#include #if !defined(TYPE_A) || !defined(TEST_OP) || !defined(TYPE_B) #error "Need to define TYPE_A, TEST_OP, and TYPE_B" #endif using 
namespace Vc; int main() { TYPE_A() TEST_OP TYPE_B(); return 0; } Vc-0.7.4/tests/linkTest0.cpp000066400000000000000000000004661233512346000156220ustar00rootroot00000000000000#include #if !(defined VC_GCC && VC_GCC < 0x40400) && !defined VC_MSVC #include #endif #include #include #include using namespace Vc; float_v foo0(float_v::AsArg a) { const float_v b = sin(a + float_v::One()); std::cerr << b; return b; } Vc-0.7.4/tests/linkTest1.cpp000066400000000000000000000011441233512346000156150ustar00rootroot00000000000000#include #include #include using namespace Vc; float_v fooLib0A(float_v::AsArg a); float_v fooLib1A(float_v::AsArg a); float_v fooLib0B(float_v::AsArg a); float_v fooLib1B(float_v::AsArg a); float_v fooLib2(float_v::AsArg a); float_v fooLib3(float_v::AsArg a); float_v foo0(float_v::AsArg a); float_v foo1(float_v::AsArg a) { const float_v b = sin(a + float_v::One()); std::cerr << b; return b; } int main() { float_v x = float_v::Random(); x = fooLib0A(fooLib0B(fooLib1A(fooLib1B(fooLib2(fooLib3(foo0(foo1(x)))))))); return static_cast(x.sum()); } Vc-0.7.4/tests/linkTestLib0.cpp000066400000000000000000000004361233512346000162460ustar00rootroot00000000000000#include #include #define CAT(a, b) a##b #define name(a, b) CAT(a, b) using namespace Vc; float_v #ifdef VC_MSVC __declspec(dllexport) #endif name(fooLib0, POSTFIX)(float_v::AsArg a) { const float_v b = sin(a + float_v::One()); std::cerr << b; return b; } Vc-0.7.4/tests/linkTestLib1.cpp000066400000000000000000000004361233512346000162470ustar00rootroot00000000000000#include #include #define CAT(a, b) a##b #define name(a, b) CAT(a, b) using namespace Vc; float_v #ifdef VC_MSVC __declspec(dllexport) #endif name(fooLib1, POSTFIX)(float_v::AsArg a) { const float_v b = sin(a + float_v::One()); std::cerr << b; return b; } Vc-0.7.4/tests/linkTestLib2.cpp000066400000000000000000000002561233512346000162500ustar00rootroot00000000000000#include #include using namespace Vc; float_v fooLib2(float_v::AsArg a) { const float_v b = sin(a + float_v::One()); std::cerr << b; return b; } Vc-0.7.4/tests/linkTestLib3.cpp000066400000000000000000000002561233512346000162510ustar00rootroot00000000000000#include #include using namespace Vc; float_v fooLib3(float_v::AsArg a) { const float_v b = sin(a + float_v::One()); std::cerr << b; return b; } Vc-0.7.4/tests/load.cpp000066400000000000000000000252131233512346000146610ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #include "unittest.h" #include using namespace Vc; template unsigned long alignmentMask() { if (Vec::Size == 1) { // on 32bit the maximal alignment is 4 Bytes, even for 8-Byte doubles. 
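/* A minimal sketch of how this alignment guarantee is typically consumed in user
 * code, assuming the Vc 0.7 allocation and load interfaces exercised by the tests
 * below (Vc::Memory, Vc::malloc/Vc::free with AlignOnVector, and the
 * Aligned/Unaligned load flags); buffer sizes and variable names are illustrative:
 *
 *     Vc::Memory<float_v, 256> buf;                        // storage aligned for float_v
 *     float_v a(buf.vector(0));                            // aligned vector access
 *
 *     float *raw = Vc::malloc<float, Vc::AlignOnVector>(256);
 *     float_v b(raw, Vc::Aligned);                         // pointer known to be aligned
 *     float_v c(raw + 1, Vc::Unaligned);                   // arbitrary offset: use Unaligned
 *     Vc::free(raw);
 */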
return std::min(sizeof(void*), sizeof(typename Vec::EntryType)) - 1; } // sizeof(SSE::sfloat_v) is too large // AVX::VectorAlignment is too large return std::min(sizeof(Vec), VectorAlignment) - 1; } template void checkAlignment() { unsigned char i = 1; Vec a[10]; unsigned long mask = alignmentMask(); for (i = 0; i < 10; ++i) { VERIFY((reinterpret_cast(&a[i]) & mask) == 0) << "a = " << a << ", mask = " << mask; } const char *data = reinterpret_cast(&a[0]); for (i = 0; i < 10; ++i) { VERIFY(&data[i * Vec::Size * sizeof(typename Vec::EntryType)] == reinterpret_cast(&a[i])); } } void *hack_to_put_b_on_the_stack = 0; template void checkMemoryAlignment() { typedef typename Vec::EntryType T; const T *b = 0; Vc::Memory a; b = a; hack_to_put_b_on_the_stack = &b; unsigned long mask = alignmentMask(); for (int i = 0; i < 10; ++i) { VERIFY((reinterpret_cast(&b[i * Vec::Size]) & mask) == 0) << "b = " << b << ", mask = " << mask; } } template void loadArray() { typedef typename Vec::EntryType T; typedef typename Vec::IndexType I; enum loadArrayEnum { count = 256 * 1024 / sizeof(T) }; Vc::Memory array; for (int i = 0; i < count; ++i) { array[i] = i; } const I indexesFromZero(IndexesFromZero); const Vec offsets(indexesFromZero); for (int i = 0; i < count; i += Vec::Size) { const T *const addr = &array[i]; Vec ii(i); ii += offsets; Vec a(addr); COMPARE(a, ii); Vec b = Vec::Zero(); b.load(addr); COMPARE(b, ii); } } enum Enum { loadArrayShortCount = 32 * 1024, streamingLoadCount = 1024 }; template void loadArrayShort() { typedef typename Vec::EntryType T; Vc::Memory array; for (int i = 0; i < loadArrayShortCount; ++i) { array[i] = i; } const Vec &offsets = static_cast(ushort_v::IndexesFromZero()); for (int i = 0; i < loadArrayShortCount; i += Vec::Size) { const T *const addr = &array[i]; Vec ii(i); ii += offsets; Vec a(addr); COMPARE(a, ii); Vec b = Vec::Zero(); b.load(addr); COMPARE(b, ii); } } template void streamingLoad() { typedef typename Vec::EntryType T; Vc::Memory data; data[0] = static_cast(-streamingLoadCount/2); for (int i = 1; i < streamingLoadCount; ++i) { data[i] = data[i - 1]; ++data[i]; } Vec ref = data.firstVector(); for (int i = 0; i < streamingLoadCount - Vec::Size; ++i, ++ref) { Vec v1, v2; if (0 == i % Vec::Size) { v1 = Vec(&data[i], Vc::Streaming | Vc::Aligned); v2.load (&data[i], Vc::Streaming | Vc::Aligned); } else { v1 = Vec(&data[i], Vc::Streaming | Vc::Unaligned); v2.load (&data[i], Vc::Streaming | Vc::Unaligned); } COMPARE(v1, ref); COMPARE(v2, ref); } } template struct TypeInfo; template<> struct TypeInfo { static const char *string() { return "double"; } }; template<> struct TypeInfo { static const char *string() { return "float"; } }; template<> struct TypeInfo { static const char *string() { return "int"; } }; template<> struct TypeInfo { static const char *string() { return "uint"; } }; template<> struct TypeInfo { static const char *string() { return "short"; } }; template<> struct TypeInfo { static const char *string() { return "ushort"; } }; template<> struct TypeInfo { static const char *string() { return "schar"; } }; template<> struct TypeInfo { static const char *string() { return "uchar"; } }; template<> struct TypeInfo { static const char *string() { return "double_v"; } }; template<> struct TypeInfo { static const char *string() { return "float_v"; } }; template<> struct TypeInfo { static const char *string() { return "sfloat_v"; } }; template<> struct TypeInfo { static const char *string() { return "int_v"; } }; template<> struct TypeInfo { static const char 
*string() { return "uint_v"; } }; template<> struct TypeInfo { static const char *string() { return "short_v"; } }; template<> struct TypeInfo { static const char *string() { return "ushort_v"; } }; template struct SupportedConversions { typedef void Next; }; template<> struct SupportedConversions { typedef double Next; }; template<> struct SupportedConversions { typedef int Next; }; template<> struct SupportedConversions { typedef unsigned int Next; }; template<> struct SupportedConversions { typedef short Next; }; template<> struct SupportedConversions { typedef unsigned short Next; }; template<> struct SupportedConversions { typedef signed char Next; }; template<> struct SupportedConversions { typedef unsigned char Next; }; template<> struct SupportedConversions { typedef void Next; }; template<> struct SupportedConversions { typedef unsigned int Next; }; template<> struct SupportedConversions { typedef short Next; }; template<> struct SupportedConversions { typedef unsigned short Next; }; template<> struct SupportedConversions { typedef signed char Next; }; template<> struct SupportedConversions { typedef unsigned char Next; }; template<> struct SupportedConversions { typedef void Next; }; template<> struct SupportedConversions { typedef unsigned short Next; }; template<> struct SupportedConversions { typedef unsigned char Next; }; template<> struct SupportedConversions { typedef void Next; }; template<> struct SupportedConversions { typedef unsigned char Next; }; template<> struct SupportedConversions { typedef void Next; }; template<> struct SupportedConversions< short, void > { typedef unsigned char Next; }; template<> struct SupportedConversions< short, unsigned char > { typedef signed char Next; }; template<> struct SupportedConversions< short, signed char > { typedef void Next; }; template struct LoadCvt { static void test() { typedef typename Vec::EntryType VecT; MemT *data = Vc::malloc(128); for (size_t i = 0; i < 128; ++i) { data[i] = static_cast(i - 64); } for (size_t i = 0; i < 128 - Vec::Size + 1; ++i) { Vec v; if (i % (2 * Vec::Size) == 0) { v = Vec(&data[i]); } else if (i % Vec::Size == 0) { v = Vec(&data[i], Vc::Aligned); } else { v = Vec(&data[i], Vc::Unaligned); } for (size_t j = 0; j < Vec::Size; ++j) { COMPARE(v[j], static_cast(data[i + j])) << " " << TypeInfo::string(); } } for (size_t i = 0; i < 128 - Vec::Size + 1; ++i) { Vec v; if (i % (2 * Vec::Size) == 0) { v.load(&data[i]); } else if (i % Vec::Size == 0) { v.load(&data[i], Vc::Aligned); } else { v.load(&data[i], Vc::Unaligned); } for (size_t j = 0; j < Vec::Size; ++j) { COMPARE(v[j], static_cast(data[i + j])) << " " << TypeInfo::string(); } } for (size_t i = 0; i < 128 - Vec::Size + 1; ++i) { Vec v; if (i % (2 * Vec::Size) == 0) { v = Vec(&data[i], Vc::Streaming); } else if (i % Vec::Size == 0) { v = Vec(&data[i], Vc::Streaming | Vc::Aligned); } else { v = Vec(&data[i], Vc::Streaming | Vc::Unaligned); } for (size_t j = 0; j < Vec::Size; ++j) { COMPARE(v[j], static_cast(data[i + j])) << " " << TypeInfo::string(); } } ADD_PASS() << "loadCvt: load " << TypeInfo::string() << "* as " << TypeInfo::string(); LoadCvt::Next>::test(); } }; template struct LoadCvt { static void test() {} }; template void loadCvt() { typedef typename Vec::EntryType T; LoadCvt::Next>::test(); } int main() { runTest(checkAlignment); runTest(checkAlignment); runTest(checkAlignment); runTest(checkAlignment); runTest(checkAlignment); runTest(checkAlignment); runTest(checkAlignment); testAllTypes(checkMemoryAlignment); runTest(loadArray); 
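    // A short, self-contained illustration of the load flags that the surrounding
    // loadArray, streamingLoad and loadCvt runs exercise; it assumes the same Vc 0.7
    // interface used throughout this file (Vc::Memory plus the Aligned, Unaligned and
    // Streaming flags) and performs no checks of its own.
    {
        Vc::Memory<float_v, 64> buf;
        for (size_t i = 0; i < 64; ++i) {
            buf[i] = static_cast<float>(i);
        }
        float_v a(&buf[0], Vc::Aligned);                  // the start of the buffer is aligned
        float_v b(&buf[1], Vc::Unaligned);                // offset by one entry: must be Unaligned
        float_v c(&buf[0], Vc::Streaming | Vc::Aligned);  // non-temporal (streaming) load
        float_v d;
        d.load(&buf[0], Vc::Aligned);                     // member-function form of the same load
        (void)(a + b + c + d);                            // silence unused-variable warnings
    }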
runTest(loadArray); runTest(loadArray); runTest(loadArray); runTest(loadArray); runTest(loadArrayShort); runTest(loadArrayShort); testAllTypes(streamingLoad); testAllTypes(loadCvt); return 0; } Vc-0.7.4/tests/mask.cpp000066400000000000000000000223171233512346000146770ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2011 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #include "unittest.h" #include #include "vectormemoryhelper.h" #include using namespace Vc; template void testInc() { VectorMemoryHelper mem(2); typedef typename Vec::EntryType T; typedef typename Vec::Mask Mask; T *data = mem; for (int borderI = 0; borderI < Vec::Size; ++borderI) { const T border = static_cast(borderI); for (int i = 0; i < Vec::Size; ++i) { data[i] = static_cast(i); data[i + Vec::Size] = data[i] + static_cast(data[i] < border ? 1 : 0); } Vec a(&data[0]); Vec b(&data[Vec::Size]); Mask m = a < border; Vec aa(a); COMPARE(aa(m)++, a) << ", border: " << border << ", m: " << m; COMPARE(aa, b) << ", border: " << border << ", m: " << m; COMPARE(++a(m), b) << ", border: " << border << ", m: " << m; COMPARE(a, b) << ", border: " << border << ", m: " << m; } } template void testDec() { VectorMemoryHelper mem(2); typedef typename Vec::EntryType T; typedef typename Vec::Mask Mask; T *data = mem; for (int borderI = 0; borderI < Vec::Size; ++borderI) { const T border = static_cast(borderI); for (int i = 0; i < Vec::Size; ++i) { data[i] = static_cast(i + 1); data[i + Vec::Size] = data[i] - static_cast(data[i] < border ? 1 : 0); } Vec a(&data[0]); Vec b(&data[Vec::Size]); Mask m = a < border; Vec aa(a); COMPARE(aa(m)--, a); COMPARE(--a(m), b); COMPARE(a, b); COMPARE(aa, b); } } template void testPlusEq() { VectorMemoryHelper mem(2); typedef typename Vec::EntryType T; typedef typename Vec::Mask Mask; T *data = mem; for (int borderI = 0; borderI < Vec::Size; ++borderI) { const T border = static_cast(borderI); for (int i = 0; i < Vec::Size; ++i) { data[i] = static_cast(i + 1); data[i + Vec::Size] = data[i] + static_cast(data[i] < border ? 2 : 0); } Vec a(&data[0]); Vec b(&data[Vec::Size]); Mask m = a < border; COMPARE(a(m) += static_cast(2), b); COMPARE(a, b); } } template void testMinusEq() { VectorMemoryHelper mem(2); typedef typename Vec::EntryType T; typedef typename Vec::Mask Mask; T *data = mem; for (int borderI = 0; borderI < Vec::Size; ++borderI) { const T border = static_cast(borderI); for (int i = 0; i < Vec::Size; ++i) { data[i] = static_cast(i + 2); data[i + Vec::Size] = data[i] - static_cast(data[i] < border ? 
2 : 0); } Vec a(&data[0]); Vec b(&data[Vec::Size]); Mask m = a < border; COMPARE(a(m) -= static_cast(2), b); COMPARE(a, b); } } template void testTimesEq() { VectorMemoryHelper mem(2); typedef typename Vec::EntryType T; typedef typename Vec::Mask Mask; T *data = mem; for (int borderI = 0; borderI < Vec::Size; ++borderI) { const T border = static_cast(borderI); for (int i = 0; i < Vec::Size; ++i) { data[i] = static_cast(i); data[i + Vec::Size] = data[i] * static_cast(data[i] < border ? 2 : 1); } Vec a(&data[0]); Vec b(&data[Vec::Size]); Mask m = a < border; COMPARE(a(m) *= static_cast(2), b); COMPARE(a, b); } } template void testDivEq() { VectorMemoryHelper mem(2); typedef typename Vec::EntryType T; typedef typename Vec::Mask Mask; T *data = mem; for (int borderI = 0; borderI < Vec::Size; ++borderI) { const T border = static_cast(borderI); for (int i = 0; i < Vec::Size; ++i) { data[i] = static_cast(5 * i); data[i + Vec::Size] = data[i] / static_cast(data[i] < border ? 3 : 1); } Vec a(&data[0]); Vec b(&data[Vec::Size]); Mask m = a < border; COMPARE(a(m) /= static_cast(3), b); COMPARE(a, b); } } template void testAssign() { VectorMemoryHelper mem(2); typedef typename Vec::EntryType T; typedef typename Vec::Mask Mask; T *data = mem; for (int borderI = 0; borderI < Vec::Size; ++borderI) { const T border = static_cast(borderI); for (int i = 0; i < Vec::Size; ++i) { data[i] = static_cast(i); data[i + Vec::Size] = data[i] + static_cast(data[i] < border ? 2 : 0); } Vec a(&data[0]); Vec b(&data[Vec::Size]); Mask m = a < border; COMPARE(a(m) = b, b); COMPARE(a, b); } } template void testZero() { typedef typename Vec::EntryType T; typedef typename Vec::Mask Mask; typedef typename Vec::IndexType I; for (int cut = 0; cut < Vec::Size; ++cut) { const Mask mask(I(Vc::IndexesFromZero) < cut); //std::cout << mask << std::endl; const T aa = 4; Vec a(aa); Vec b(Vc::Zero); b(!mask) = a; a.setZero(mask); COMPARE(a, b); } } template void testCount() { for_all_masks(Vec, m) { int count = 0; for (int i = 0; i < Vec::Size; ++i) { if (m[i]) { ++count; } } COMPARE(m.count(), count) << ", m = " << m; } } template void testFirstOne() { typedef typename Vec::IndexType I; typedef typename Vec::Mask M; for (int i = 0; i < Vec::Size; ++i) { const M mask(I(Vc::IndexesFromZero) == i); COMPARE(mask.firstOne(), i); } } template void testLogicalOperatorsImpl() { VERIFY((M1(true) && M2(true)).isFull()); VERIFY((M1(true) && M2(false)).isEmpty()); VERIFY((M1(true) || M2(true)).isFull()); VERIFY((M1(true) || M2(false)).isFull()); VERIFY((M1(false) || M2(false)).isEmpty()); } template void testBinaryOperatorsImpl() { testLogicalOperatorsImpl(); VERIFY((M1(true) & M2(true)).isFull()); VERIFY((M1(true) & M2(false)).isEmpty()); VERIFY((M1(true) | M2(true)).isFull()); VERIFY((M1(true) | M2(false)).isFull()); VERIFY((M1(false) | M2(false)).isEmpty()); VERIFY((M1(true) ^ M2(true)).isEmpty()); VERIFY((M1(true) ^ M2(false)).isFull()); } void testBinaryOperators() { testLogicalOperatorsImpl< short_m, sfloat_m>(); testLogicalOperatorsImpl(); testLogicalOperatorsImpl(); testLogicalOperatorsImpl(); testBinaryOperatorsImpl< short_m, short_m>(); testBinaryOperatorsImpl< short_m, ushort_m>(); testBinaryOperatorsImpl(); testBinaryOperatorsImpl(); testBinaryOperatorsImpl(); testBinaryOperatorsImpl< int_m, int_m>(); testBinaryOperatorsImpl< int_m, uint_m>(); testBinaryOperatorsImpl< int_m, float_m>(); testBinaryOperatorsImpl< uint_m, int_m>(); testBinaryOperatorsImpl< uint_m, uint_m>(); testBinaryOperatorsImpl< uint_m, float_m>(); 
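    // The combinations above and below only check the result types of the mask
    // operators; the following sketch shows how such masks are normally produced and
    // consumed, using only operations already tested in this file (comparisons,
    // write-masking, count() and firstOne()). The values involved are illustrative.
    {
        float_v x = float_v::Random() - float_v(0.5f);
        float_m negative = x < float_v::Zero();    // comparisons yield masks
        x(negative) = -x;                          // write-masked assignment: abs() via a mask
        x.setZero(x < float_v(0.25f));             // zero all entries below a threshold
        if (!negative.isEmpty()) {
            const int n = negative.count();        // how many lanes were negative
            const int first = negative.firstOne(); // index of the first negative lane
            (void)n; (void)first;
        }
    }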
testBinaryOperatorsImpl< float_m, int_m>(); testBinaryOperatorsImpl< float_m, uint_m>(); testBinaryOperatorsImpl< float_m, float_m>(); testBinaryOperatorsImpl(); } #ifdef VC_IMPL_SSE void testFloat8GatherMask() { Memory data; short_v::Memory andMemory; for (int i = 0; i < short_v::Size; ++i) { andMemory[i] = 1 << i; } const short_v andMask(andMemory); for (unsigned int i = 0; i < data.vectorsCount(); ++i) { data.vector(i) = andMask & i; } for (unsigned int i = 0; i < data.vectorsCount(); ++i) { const short_m mask = data.vector(i) == short_v::Zero(); SSE::Float8GatherMask gatherMaskA(mask), gatherMaskB(static_cast(mask)); COMPARE(gatherMaskA.toInt(), gatherMaskB.toInt()); } } #endif int main(int argc, char **argv) { initTest(argc, argv); testAllTypes(testInc); testAllTypes(testDec); testAllTypes(testPlusEq); testAllTypes(testMinusEq); testAllTypes(testTimesEq); testAllTypes(testDivEq); testAllTypes(testAssign); testAllTypes(testZero); testAllTypes(testCount); testAllTypes(testFirstOne); runTest(testBinaryOperators); #ifdef VC_IMPL_SSE runTest(testFloat8GatherMask); #endif return 0; } Vc-0.7.4/tests/math.cpp000066400000000000000000001043421233512346000146740ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
}}}*/ /*includes {{{*/ #include "unittest.h" #include #include "vectormemoryhelper.h" #include "const.h" #include #include #include /*}}}*/ using namespace Vc; /*fix isfinite and isnan{{{*/ #ifdef isfinite #undef isfinite #endif #ifdef isnan #undef isnan #endif /*}}}*/ template struct SincosReference/*{{{*/ { T x, s, c; }; template struct Reference { T x, ref; }; template struct Array { size_t size; T *data; Array() : size(0), data(0) {} }; template struct StaticDeleter { T *ptr; StaticDeleter(T *p) : ptr(p) {} ~StaticDeleter() { delete[] ptr; } }; enum Function { Sincos, Atan, Asin, Acos, Log, Log2, Log10 }; template static inline const char *filename(); template<> inline const char *filename() { return "reference-sincos-sp.dat"; } template<> inline const char *filename() { return "reference-sincos-dp.dat"; } template<> inline const char *filename() { return "reference-atan-sp.dat"; } template<> inline const char *filename() { return "reference-atan-dp.dat"; } template<> inline const char *filename() { return "reference-asin-sp.dat"; } template<> inline const char *filename() { return "reference-asin-dp.dat"; } // template<> inline const char *filename() { return "reference-acos-sp.dat"; } // template<> inline const char *filename() { return "reference-acos-dp.dat"; } template<> inline const char *filename() { return "reference-ln-sp.dat"; } template<> inline const char *filename() { return "reference-ln-dp.dat"; } template<> inline const char *filename() { return "reference-log2-sp.dat"; } template<> inline const char *filename() { return "reference-log2-dp.dat"; } template<> inline const char *filename() { return "reference-log10-sp.dat"; } template<> inline const char *filename() { return "reference-log10-dp.dat"; } template static Array > sincosReference() { static Array > data; if (data.data == 0) { FILE *file = fopen(filename(), "rb"); if (file) { fseek(file, 0, SEEK_END); const size_t size = ftell(file) / sizeof(SincosReference); rewind(file); data.data = new SincosReference[size]; static StaticDeleter > _cleanup(data.data); data.size = fread(data.data, sizeof(SincosReference), size, file); fclose(file); } else { FAIL() << "the reference data " << filename() << " does not exist in the current working directory."; } } return data; } template static Array > referenceData() { static Array > data; if (data.data == 0) { FILE *file = fopen(filename(), "rb"); if (file) { fseek(file, 0, SEEK_END); const size_t size = ftell(file) / sizeof(Reference); rewind(file); data.data = new Reference[size]; static StaticDeleter > _cleanup(data.data); data.size = fread(data.data, sizeof(Reference), size, file); fclose(file); } else { FAIL() << "the reference data " << filename() << " does not exist in the current working directory."; } } return data; }/*}}}*/ template struct Denormals { static T *data; };/*{{{*/ template<> float *Denormals::data = 0; template<> double *Denormals::data = 0; enum { NDenormals = 64 }; /*}}}*/ template V apply_v(VC_ALIGNED_PARAMETER(V) x, typename V::EntryType (func)(typename V::EntryType))/*{{{*/ { V r; for (size_t i = 0; i < V::Size; ++i) { r[i] = func(x[i]); } return r; } /*}}}*/ template void testAbs()/*{{{*/ { for (int i = 0; i < 0x7fff; ++i) { Vec a(i); Vec b(-i); COMPARE(a, Vc::abs(a)); COMPARE(a, Vc::abs(b)); } } /*}}}*/ static inline float my_trunc(float x)/*{{{*/ { #if __cplusplus >= 201103 /*C++11*/ return std::trunc(x); #elif defined(_ISOC99_SOURCE) return truncf(x); #else return x > 0 ? 
std::floor(x) : std::ceil(x); #endif } static inline double my_trunc(double x) { #if __cplusplus >= 201103 /*C++11*/ return std::trunc(x); #elif defined(_ISOC99_SOURCE) return trunc(x); #else return x > 0 ? std::floor(x) : std::ceil(x); #endif } /*}}}*/ template void testTrunc()/*{{{*/ { typedef typename V::EntryType T; typedef typename V::IndexType I; for (size_t i = 0; i < 100000 / V::Size; ++i) { V x = (V::Random() - T(0.5)) * T(100); V reference = apply_v(x, my_trunc); COMPARE(Vc::trunc(x), reference) << ", x = " << x << ", i = " << i; } V x = static_cast(I::IndexesFromZero()); V reference = apply_v(x, my_trunc); COMPARE(Vc::trunc(x), reference) << ", x = " << x; } /*}}}*/ template void testFloor()/*{{{*/ { typedef typename V::EntryType T; typedef typename V::IndexType I; for (size_t i = 0; i < 100000 / V::Size; ++i) { V x = (V::Random() - T(0.5)) * T(100); V reference = apply_v(x, std::floor); COMPARE(Vc::floor(x), reference) << ", x = " << x << ", i = " << i; } V x = static_cast(I::IndexesFromZero()); V reference = apply_v(x, std::floor); COMPARE(Vc::floor(x), reference) << ", x = " << x; } /*}}}*/ template void testCeil()/*{{{*/ { typedef typename V::EntryType T; typedef typename V::IndexType I; for (size_t i = 0; i < 100000 / V::Size; ++i) { V x = (V::Random() - T(0.5)) * T(100); V reference = apply_v(x, std::ceil); COMPARE(Vc::ceil(x), reference) << ", x = " << x << ", i = " << i; } V x = static_cast(I::IndexesFromZero()); V reference = apply_v(x, std::ceil); COMPARE(Vc::ceil(x), reference) << ", x = " << x; } /*}}}*/ template void testExp()/*{{{*/ { setFuzzyness(1); setFuzzyness(2); typedef typename V::EntryType T; for (size_t i = 0; i < 100000 / V::Size; ++i) { V x = (V::Random() - T(0.5)) * T(20); V reference = apply_v(x, std::exp); FUZZY_COMPARE(Vc::exp(x), reference) << ", x = " << x << ", i = " << i; } COMPARE(Vc::exp(V::Zero()), V::One()); } /*}}}*/ template void testLog()/*{{{*/ { setFuzzyness(1); typedef typename V::EntryType T; Array > reference = referenceData(); for (size_t i = 0; i + V::Size - 1 < reference.size; i += V::Size) { V x, ref; for (int j = 0; j < V::Size; ++j) { x[j] = reference.data[i + j].x; ref[j] = reference.data[i + j].ref; } FUZZY_COMPARE(Vc::log(x), ref) << " x = " << x << ", i = " << i; } COMPARE(Vc::log(V::Zero()), V(std::log(T(0)))); for (int i = 0; i < NDenormals; i += V::Size) { V x(&Denormals::data[i]); V ref = apply_v(x, std::log); FUZZY_COMPARE(Vc::log(x), ref) << ", x = " << x << ", i = " << i; } } /*}}}*/ #if (defined(_XOPEN_SOURCE) && _XOPEN_SOURCE >= 600) || defined(_ISOC99_SOURCE) || (defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112L) static inline float my_log2(float x) { return ::log2f(x); } /* I need to make sure whether the log2 that I compare against is really precise to <0.5ulp. At * least I get different results when I use "double log2(double)", which is somewhat unexpected. * Well, conversion from double to float goes via truncation, so if the most significant truncated * mantissa bit is set the resulting float is incorrect by 1 ulp static inline float my_log2(float x) { return ::log2(static_cast(x)); } static inline float my_log2(float x) { double tmp = ::log2(static_cast(x)); int e; frexp(tmp, &e); // frexp(0.5) -> e = 0 return tmp + ldexp(tmp < 0 ? 
-0.5 : 0.5, e - 24); } */ static inline double my_log2(double x) { return ::log2(x); } #else static inline float my_log2(float x) { return ::logf(x) / Vc::Math::ln2(); } static inline double my_log2(double x) { return ::log(x) / Vc::Math::ln2(); } #endif /*}}}*/ template void testLog2()/*{{{*/ { #if defined(VC_LOG_ILP) || defined(VC_LOG_ILP2) setFuzzyness(3); #else setFuzzyness(1); #endif #if (defined(VC_MSVC) || defined(__APPLE__)) && defined(VC_IMPL_Scalar) setFuzzyness(2); #else setFuzzyness(1); #endif typedef typename V::EntryType T; Array > reference = referenceData(); for (size_t i = 0; i + V::Size - 1 < reference.size; i += V::Size) { V x, ref; for (int j = 0; j < V::Size; ++j) { x[j] = reference.data[i + j].x; ref[j] = reference.data[i + j].ref; } FUZZY_COMPARE(Vc::log2(x), ref) << " x = " << x << ", i = " << i; } COMPARE(Vc::log2(V::Zero()), V(my_log2(T(0)))); for (int i = 0; i < NDenormals; i += V::Size) { V x(&Denormals::data[i]); V ref = apply_v(x, my_log2); FUZZY_COMPARE(Vc::log2(x), ref) << ", x = " << x << ", i = " << i; } } /*}}}*/ template void testLog10()/*{{{*/ { setFuzzyness(2); setFuzzyness(2); typedef typename V::EntryType T; Array > reference = referenceData(); for (size_t i = 0; i + V::Size - 1 < reference.size; i += V::Size) { V x, ref; for (int j = 0; j < V::Size; ++j) { x[j] = reference.data[i + j].x; ref[j] = reference.data[i + j].ref; } FUZZY_COMPARE(Vc::log10(x), ref) << " x = " << x << ", i = " << i; } COMPARE(Vc::log10(V::Zero()), V(std::log10(T(0)))); for (int i = 0; i < NDenormals; i += V::Size) { V x(&Denormals::data[i]); V ref = apply_v(x, std::log10); FUZZY_COMPARE(Vc::log10(x), ref) << ", x = " << x << ", i = " << i; } } /*}}}*/ template void testMax()/*{{{*/ { typedef typename Vec::EntryType T; VectorMemoryHelper mem(3); T *data = mem; for (int i = 0; i < Vec::Size; ++i) { data[i] = i; data[i + Vec::Size] = Vec::Size + 1 - i; data[i + 2 * Vec::Size] = std::max(data[i], data[i + Vec::Size]); } Vec a(&data[0]); Vec b(&data[Vec::Size]); Vec c(&data[2 * Vec::Size]); COMPARE(Vc::max(a, b), c); } /*}}}*/ /*{{{*/ #define FillHelperMemory(code) \ typename V::Memory data; \ typename V::Memory reference; \ for (int ii = 0; ii < V::Size; ++ii) { \ const T i = static_cast(ii); \ data[ii] = i; \ reference[ii] = code; \ } do {} while (false) /*}}}*/ template void testSqrt()/*{{{*/ { typedef typename V::EntryType T; FillHelperMemory(std::sqrt(i)); V a(data); V b(reference); FUZZY_COMPARE(Vc::sqrt(a), b); } /*}}}*/ template void testRSqrt()/*{{{*/ { typedef typename V::EntryType T; for (size_t i = 0; i < 1024 / V::Size; ++i) { const V x = V::Random() * T(1000); // RSQRTPS is documented as having a relative error <= 1.5 * 2^-12 VERIFY(Vc::abs(Vc::rsqrt(x) * Vc::sqrt(x) - V::One()) < static_cast(std::ldexp(1.5, -12))); } } /*}}}*/ template void testSincos()/*{{{*/ { typedef typename V::EntryType T; setFuzzyness(2); setFuzzyness(1e7); Array > reference = sincosReference(); for (size_t i = 0; i + V::Size - 1 < reference.size; i += V::Size) { V x, sref, cref; for (int j = 0; j < V::Size; ++j) { x[j] = reference.data[i + j].x; sref[j] = reference.data[i + j].s; cref[j] = reference.data[i + j].c; } V sin, cos; Vc::sincos(x, &sin, &cos); FUZZY_COMPARE(sin, sref) << " x = " << x << ", i = " << i; FUZZY_COMPARE(cos, cref) << " x = " << x << ", i = " << i; Vc::sincos(-x, &sin, &cos); FUZZY_COMPARE(sin, -sref) << " x = " << -x << ", i = " << i; FUZZY_COMPARE(cos, cref) << " x = " << -x << ", i = " << i; } } /*}}}*/ template void testSin()/*{{{*/ { typedef typename V::EntryType 
T; setFuzzyness(2); setFuzzyness(1e7); Array > reference = sincosReference(); for (size_t i = 0; i + V::Size - 1 < reference.size; i += V::Size) { V x, sref; for (int j = 0; j < V::Size; ++j) { x[j] = reference.data[i + j].x; sref[j] = reference.data[i + j].s; } FUZZY_COMPARE(Vc::sin(x), sref) << " x = " << x << ", i = " << i; FUZZY_COMPARE(Vc::sin(-x), -sref) << " x = " << x << ", i = " << i; } } /*}}}*/ template void testCos()/*{{{*/ { typedef typename V::EntryType T; setFuzzyness(2); setFuzzyness(1e7); Array > reference = sincosReference(); for (size_t i = 0; i + V::Size - 1 < reference.size; i += V::Size) { V x, cref; for (int j = 0; j < V::Size; ++j) { x[j] = reference.data[i + j].x; cref[j] = reference.data[i + j].c; } FUZZY_COMPARE(Vc::cos(x), cref) << " x = " << x << ", i = " << i; FUZZY_COMPARE(Vc::cos(-x), cref) << " x = " << x << ", i = " << i; } } /*}}}*/ template void testAsin()/*{{{*/ { typedef typename V::EntryType T; setFuzzyness(2); setFuzzyness(36); Array > reference = referenceData(); for (size_t i = 0; i + V::Size - 1 < reference.size; i += V::Size) { V x, ref; for (int j = 0; j < V::Size; ++j) { x[j] = reference.data[i + j].x; ref[j] = reference.data[i + j].ref; } FUZZY_COMPARE(Vc::asin(x), ref) << " x = " << x << ", i = " << i; FUZZY_COMPARE(Vc::asin(-x), -ref) << " -x = " << -x << ", i = " << i; } } /*}}}*/ const union { unsigned int hex; float value; } INF = { 0x7f800000 }; #if defined(__APPLE__) && defined(VC_IMPL_Scalar) #define ATAN_COMPARE FUZZY_COMPARE #else #define ATAN_COMPARE COMPARE #endif template void testAtan()/*{{{*/ { typedef typename V::EntryType T; setFuzzyness(3); setFuzzyness(2); { const V Pi_2 = T(Vc_buildDouble(1, 0x921fb54442d18ull, 0)); V nan; nan.setQnan(); const V inf = T(INF.value); VERIFY(Vc::isnan(Vc::atan(nan))); ATAN_COMPARE(Vc::atan(+inf), +Pi_2); #ifdef VC_MSVC #pragma warning(suppress: 4756) // overflow in constant arithmetic #endif ATAN_COMPARE(Vc::atan(-inf), -Pi_2); } Array > reference = referenceData(); for (size_t i = 0; i + V::Size - 1 < reference.size; i += V::Size) { V x, ref; for (int j = 0; j < V::Size; ++j) { x[j] = reference.data[i + j].x; ref[j] = reference.data[i + j].ref; } FUZZY_COMPARE(Vc::atan(x), ref) << " x = " << x << ", i = " << i; FUZZY_COMPARE(Vc::atan(-x), -ref) << " -x = " << -x << ", i = " << i; } } /*}}}*/ template void testAtan2()/*{{{*/ { typedef typename V::EntryType T; setFuzzyness(3); setFuzzyness(2); { const V Pi = T(Vc_buildDouble(1, 0x921fb54442d18ull, 1)); const V Pi_2 = T(Vc_buildDouble(1, 0x921fb54442d18ull, 0)); V nan; nan.setQnan(); const V inf = T(INF.value); // If y is +0 (-0) and x is less than 0, +pi (-pi) is returned. ATAN_COMPARE(Vc::atan2(V(T(+0.)), V(T(-3.))), +Pi); ATAN_COMPARE(Vc::atan2(V(T(-0.)), V(T(-3.))), -Pi); // If y is +0 (-0) and x is greater than 0, +0 (-0) is returned. COMPARE(Vc::atan2(V(T(+0.)), V(T(+3.))), V(T(+0.))); VERIFY(!Vc::atan2(V(T(+0.)), V(T(+3.))).isNegative()); COMPARE(Vc::atan2(V(T(-0.)), V(T(+3.))), V(T(-0.))); VERIFY (Vc::atan2(V(T(-0.)), V(T(+3.))).isNegative()); // If y is less than 0 and x is +0 or -0, -pi/2 is returned. COMPARE(Vc::atan2(V(T(-3.)), V(T(+0.))), -Pi_2); COMPARE(Vc::atan2(V(T(-3.)), V(T(-0.))), -Pi_2); // If y is greater than 0 and x is +0 or -0, pi/2 is returned. COMPARE(Vc::atan2(V(T(+3.)), V(T(+0.))), +Pi_2); COMPARE(Vc::atan2(V(T(+3.)), V(T(-0.))), +Pi_2); // If either x or y is NaN, a NaN is returned. 
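    /* Taken together, the special cases checked in this block mirror the C99 atan2
     * contract. A tiny polar-coordinates sketch of the intended use, assuming only
     * Vc::sqrt and Vc::atan2 as exercised in this file (names are illustrative):
     *
     *     V x = ..., y = ...;
     *     V r   = Vc::sqrt(x * x + y * y);
     *     V phi = Vc::atan2(y, x);   // per-lane angle in [-pi, +pi]
     */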
VERIFY(Vc::isnan(Vc::atan2(nan, V(T(3.))))); VERIFY(Vc::isnan(Vc::atan2(V(T(3.)), nan))); VERIFY(Vc::isnan(Vc::atan2(nan, nan))); // If y is +0 (-0) and x is -0, +pi (-pi) is returned. ATAN_COMPARE(Vc::atan2(V(T(+0.)), V(T(-0.))), +Pi); ATAN_COMPARE(Vc::atan2(V(T(-0.)), V(T(-0.))), -Pi); // If y is +0 (-0) and x is +0, +0 (-0) is returned. COMPARE(Vc::atan2(V(T(+0.)), V(T(+0.))), V(T(+0.))); COMPARE(Vc::atan2(V(T(-0.)), V(T(+0.))), V(T(-0.))); VERIFY(!Vc::atan2(V(T(+0.)), V(T(+0.))).isNegative()); VERIFY( Vc::atan2(V(T(-0.)), V(T(+0.))).isNegative()); // If y is a finite value greater (less) than 0, and x is negative infinity, +pi (-pi) is returned. ATAN_COMPARE(Vc::atan2(V(T(+1.)), -inf), +Pi); ATAN_COMPARE(Vc::atan2(V(T(-1.)), -inf), -Pi); // If y is a finite value greater (less) than 0, and x is positive infinity, +0 (-0) is returned. COMPARE(Vc::atan2(V(T(+3.)), +inf), V(T(+0.))); VERIFY(!Vc::atan2(V(T(+3.)), +inf).isNegative()); COMPARE(Vc::atan2(V(T(-3.)), +inf), V(T(-0.))); VERIFY (Vc::atan2(V(T(-3.)), +inf).isNegative()); // If y is positive infinity (negative infinity), and x is finite, pi/2 (-pi/2) is returned. COMPARE(Vc::atan2(+inf, V(T(+3.))), +Pi_2); COMPARE(Vc::atan2(-inf, V(T(+3.))), -Pi_2); COMPARE(Vc::atan2(+inf, V(T(-3.))), +Pi_2); COMPARE(Vc::atan2(-inf, V(T(-3.))), -Pi_2); #ifndef _WIN32 // the Microsoft implementation of atan2 fails this test const V Pi_4 = T(Vc_buildDouble(1, 0x921fb54442d18ull, -1)); // If y is positive infinity (negative infinity) and x is negative infinity, +3*pi/4 (-3*pi/4) is returned. COMPARE(Vc::atan2(+inf, -inf), T(+3.) * Pi_4); COMPARE(Vc::atan2(-inf, -inf), T(-3.) * Pi_4); // If y is positive infinity (negative infinity) and x is positive infinity, +pi/4 (-pi/4) is returned. COMPARE(Vc::atan2(+inf, +inf), +Pi_4); COMPARE(Vc::atan2(-inf, +inf), -Pi_4); #endif } for (int xoffset = -100; xoffset < 54613; xoffset += 47 * V::Size) { for (int yoffset = -100; yoffset < 54613; yoffset += 47 * V::Size) { FillHelperMemory(std::atan2((i + xoffset) * T(0.15), (i + yoffset) * T(0.15))); const V a(data); const V b(reference); const V x = (a + xoffset) * T(0.15); const V y = (a + yoffset) * T(0.15); FUZZY_COMPARE(Vc::atan2(x, y), b) << ", x = " << x << ", y = " << y; } } } /*}}}*/ template void testReciprocal()/*{{{*/ { typedef typename Vec::EntryType T; setFuzzyness(1.258295e+07); setFuzzyness(0); const T one = 1; for (int offset = -1000; offset < 1000; offset += 10) { const T scale = T(0.1); typename Vec::Memory data; typename Vec::Memory reference; for (int ii = 0; ii < Vec::Size; ++ii) { const T i = static_cast(ii); data[ii] = i; T tmp = (i + offset) * scale; reference[ii] = one / tmp; } Vec a(data); Vec b(reference); FUZZY_COMPARE(Vc::reciprocal((a + offset) * scale), b); } } /*}}}*/ template void isNegative()/*{{{*/ { typedef typename V::EntryType T; VERIFY(V::One().isNegative().isEmpty()); VERIFY(V::Zero().isNegative().isEmpty()); VERIFY((-V::One()).isNegative().isFull()); VERIFY(V(T(-0.)).isNegative().isFull()); } /*}}}*/ template void testInf()/*{{{*/ { typedef typename Vec::EntryType T; const T one = 1; const Vec zero(Zero); VERIFY(Vc::isfinite(zero)); VERIFY(Vc::isfinite(Vec(one))); VERIFY(!Vc::isfinite(one / zero)); } /*}}}*/ template void testNaN()/*{{{*/ { typedef typename Vec::EntryType T; typedef typename Vec::IndexType I; typedef typename Vec::Mask M; const T one = 1; const Vec zero(Zero); VERIFY(!Vc::isnan(zero)); VERIFY(!Vc::isnan(Vec(one))); const Vec inf = one / zero; VERIFY(Vc::isnan(Vec(inf * zero))); Vec nan = Vec::Zero(); const M 
mask(I::IndexesFromZero() == I::Zero()); nan.setQnan(mask); COMPARE(Vc::isnan(nan), mask); nan.setQnan(); VERIFY(Vc::isnan(nan)); } /*}}}*/ template void testRound()/*{{{*/ { typedef typename Vec::EntryType T; enum { Count = (16 + Vec::Size) / Vec::Size }; VectorMemoryHelper mem1(Count); VectorMemoryHelper mem2(Count); T *data = mem1; T *reference = mem2; for (int i = 0; i < Count * Vec::Size; ++i) { data[i] = i * 0.25 - 2.0; reference[i] = std::floor(i * 0.25 - 2.0 + 0.5); if (i % 8 == 2) { reference[i] -= 1.; } //std::cout << reference[i] << " "; } //std::cout << std::endl; for (int i = 0; i < Count; ++i) { const Vec a(&data[i * Vec::Size]); const Vec ref(&reference[i * Vec::Size]); //std::cout << a << ref << std::endl; COMPARE(Vc::round(a), ref); } } /*}}}*/ template void testReduceMin()/*{{{*/ { typedef typename Vec::EntryType T; const T one = 1; VectorMemoryHelper mem(Vec::Size); T *data = mem; for (int i = 0; i < Vec::Size * Vec::Size; ++i) { data[i] = i % (Vec::Size + 1) + one; } for (int i = 0; i < Vec::Size; ++i, data += Vec::Size) { const Vec a(&data[0]); //std::cout << a << std::endl; COMPARE(a.min(), one); } } /*}}}*/ template void testReduceMax()/*{{{*/ { typedef typename Vec::EntryType T; const T max = Vec::Size + 1; VectorMemoryHelper mem(Vec::Size); T *data = mem; for (int i = 0; i < Vec::Size * Vec::Size; ++i) { data[i] = (i + Vec::Size) % (Vec::Size + 1) + 1; } for (int i = 0; i < Vec::Size; ++i, data += Vec::Size) { const Vec a(&data[0]); //std::cout << a << std::endl; COMPARE(a.max(), max); } } /*}}}*/ template void testReduceProduct()/*{{{*/ { enum { Max = Vec::Size > 8 ? Vec::Size / 2 : Vec::Size }; typedef typename Vec::EntryType T; int _product = 1; for (int i = 1; i < Vec::Size; ++i) { _product *= (i % Max) + 1; } const T product = _product; VectorMemoryHelper mem(Vec::Size); T *data = mem; for (int i = 0; i < Vec::Size * Vec::Size; ++i) { data[i] = ((i + (i / Vec::Size)) % Max) + 1; } for (int i = 0; i < Vec::Size; ++i, data += Vec::Size) { const Vec a(&data[0]); //std::cout << a << std::endl; COMPARE(a.product(), product); } } /*}}}*/ template void testReduceSum()/*{{{*/ { typedef typename Vec::EntryType T; int _sum = 1; for (int i = 2; i <= Vec::Size; ++i) { _sum += i; } const T sum = _sum; VectorMemoryHelper mem(Vec::Size); T *data = mem; for (int i = 0; i < Vec::Size * Vec::Size; ++i) { data[i] = (i + i / Vec::Size) % Vec::Size + 1; } for (int i = 0; i < Vec::Size; ++i, data += Vec::Size) { const Vec a(&data[0]); //std::cout << a << std::endl; COMPARE(a.sum(), sum); } } /*}}}*/ template void testExponent()/*{{{*/ { typedef typename V::EntryType T; Vc::Memory input; Vc::Memory expected; input[ 0] = T(0.25); expected[ 0] = T(-2); input[ 1] = T( 1); expected[ 1] = T( 0); input[ 2] = T( 2); expected[ 2] = T( 1); input[ 3] = T( 3); expected[ 3] = T( 1); input[ 4] = T( 4); expected[ 4] = T( 2); input[ 5] = T( 0.5); expected[ 5] = T(-1); input[ 6] = T( 6); expected[ 6] = T( 2); input[ 7] = T( 7); expected[ 7] = T( 2); input[ 8] = T( 8); expected[ 8] = T( 3); input[ 9] = T( 9); expected[ 9] = T( 3); input[10] = T( 10); expected[10] = T( 3); input[11] = T( 11); expected[11] = T( 3); input[12] = T( 12); expected[12] = T( 3); input[13] = T( 13); expected[13] = T( 3); input[14] = T( 14); expected[14] = T( 3); input[15] = T( 15); expected[15] = T( 3); input[16] = T( 16); expected[16] = T( 4); input[17] = T( 17); expected[17] = T( 4); input[18] = T( 18); expected[18] = T( 4); input[19] = T( 19); expected[19] = T( 4); input[20] = T( 20); expected[20] = T( 4); input[21] = T( 
21); expected[21] = T( 4); input[22] = T( 22); expected[22] = T( 4); input[23] = T( 23); expected[23] = T( 4); input[24] = T( 24); expected[24] = T( 4); input[25] = T( 25); expected[25] = T( 4); input[26] = T( 26); expected[26] = T( 4); input[27] = T( 27); expected[27] = T( 4); input[28] = T( 28); expected[28] = T( 4); input[29] = T( 29); expected[29] = T( 4); input[30] = T( 32); expected[30] = T( 5); input[31] = T( 31); expected[31] = T( 4); for (size_t i = 0; i < input.vectorsCount(); ++i) { COMPARE(V(input.vector(i)).exponent(), V(expected.vector(i))); } } /*}}}*/ template struct _ExponentVector { typedef int_v Type; }; template<> struct _ExponentVector { typedef short_v Type; }; template void testFrexp()/*{{{*/ { typedef typename V::EntryType T; typedef typename _ExponentVector::Type ExpV; Vc::Memory input; Vc::Memory expectedFraction; Vc::Memory expectedExponent; input[ 0] = T(0.25); expectedFraction[ 0] = T(.5 ); expectedExponent[ 0] = -1; input[ 1] = T( 1); expectedFraction[ 1] = T(.5 ); expectedExponent[ 1] = 1; input[ 2] = T( 0); expectedFraction[ 2] = T(0. ); expectedExponent[ 2] = 0; input[ 3] = T( 3); expectedFraction[ 3] = T(.75 ); expectedExponent[ 3] = 2; input[ 4] = T( 4); expectedFraction[ 4] = T(.5 ); expectedExponent[ 4] = 3; input[ 5] = T( 0.5); expectedFraction[ 5] = T(.5 ); expectedExponent[ 5] = 0; input[ 6] = T( 6); expectedFraction[ 6] = T( 6./8. ); expectedExponent[ 6] = 3; input[ 7] = T( 7); expectedFraction[ 7] = T( 7./8. ); expectedExponent[ 7] = 3; input[ 8] = T( 8); expectedFraction[ 8] = T( 8./16.); expectedExponent[ 8] = 4; input[ 9] = T( 9); expectedFraction[ 9] = T( 9./16.); expectedExponent[ 9] = 4; input[10] = T( 10); expectedFraction[10] = T(10./16.); expectedExponent[10] = 4; input[11] = T( 11); expectedFraction[11] = T(11./16.); expectedExponent[11] = 4; input[12] = T( 12); expectedFraction[12] = T(12./16.); expectedExponent[12] = 4; input[13] = T( 13); expectedFraction[13] = T(13./16.); expectedExponent[13] = 4; input[14] = T( 14); expectedFraction[14] = T(14./16.); expectedExponent[14] = 4; input[15] = T( 15); expectedFraction[15] = T(15./16.); expectedExponent[15] = 4; input[16] = T( 16); expectedFraction[16] = T(16./32.); expectedExponent[16] = 5; input[17] = T( 17); expectedFraction[17] = T(17./32.); expectedExponent[17] = 5; input[18] = T( 18); expectedFraction[18] = T(18./32.); expectedExponent[18] = 5; input[19] = T( 19); expectedFraction[19] = T(19./32.); expectedExponent[19] = 5; input[20] = T( 20); expectedFraction[20] = T(20./32.); expectedExponent[20] = 5; input[21] = T( 21); expectedFraction[21] = T(21./32.); expectedExponent[21] = 5; input[22] = T( 22); expectedFraction[22] = T(22./32.); expectedExponent[22] = 5; input[23] = T( 23); expectedFraction[23] = T(23./32.); expectedExponent[23] = 5; input[24] = T( 24); expectedFraction[24] = T(24./32.); expectedExponent[24] = 5; input[25] = T( 25); expectedFraction[25] = T(25./32.); expectedExponent[25] = 5; input[26] = T( 26); expectedFraction[26] = T(26./32.); expectedExponent[26] = 5; input[27] = T( 27); expectedFraction[27] = T(27./32.); expectedExponent[27] = 5; input[28] = T( 28); expectedFraction[28] = T(28./32.); expectedExponent[28] = 5; input[29] = T( 29); expectedFraction[29] = T(29./32.); expectedExponent[29] = 5; input[30] = T( 32); expectedFraction[30] = T(32./64.); expectedExponent[30] = 6; input[31] = T( 31); expectedFraction[31] = T(31./32.); expectedExponent[31] = 5; for (size_t i = 0; i < input.vectorsCount(); ++i) { const V v = input.vector(i); ExpV exp; COMPARE(frexp(v, 
&exp), V(expectedFraction.vector(i))); if (V::Size * 2 == ExpV::Size) { for (size_t j = 0; j < V::Size; ++j) { COMPARE(exp[j * 2], expectedExponent[i * V::Size + j]); } } else { COMPARE(exp, ExpV(expectedExponent.vector(i))); } } } /*}}}*/ template void testLdexp()/*{{{*/ { typedef typename V::EntryType T; typedef typename _ExponentVector::Type ExpV; for (size_t i = 0; i < 1024 / V::Size; ++i) { const V v = (V::Random() - T(0.5)) * T(1000); ExpV e; const V m = frexp(v, &e); COMPARE(ldexp(m, e), v) << ", m = " << m << ", e = " << e; } } /*}}}*/ #include "ulp.h" template void testUlpDiff()/*{{{*/ { typedef typename V::EntryType T; COMPARE(ulpDiffToReference(V::Zero(), V::Zero()), V::Zero()); COMPARE(ulpDiffToReference(std::numeric_limits::min(), V::Zero()), V::One()); COMPARE(ulpDiffToReference(V::Zero(), std::numeric_limits::min()), V::One()); for (size_t count = 0; count < 1024 / V::Size; ++count) { const V base = (V::Random() - T(0.5)) * T(1000); typename _Ulp_ExponentVector::Type exp; Vc::frexp(base, &exp); const V eps = ldexp(V(std::numeric_limits::epsilon()), exp - 1); //std::cout << base << ", " << exp << ", " << eps << std::endl; for (int i = -10000; i <= 10000; ++i) { const V i_v = V(T(i)); const V diff = base + i_v * eps; // if diff and base have a different exponent then ulpDiffToReference has an uncertainty // of +/-1 const V ulpDifference = ulpDiffToReference(diff, base); const V expectedDifference = Vc::abs(i_v); const V maxUncertainty = Vc::abs(abs(diff).exponent() - abs(base).exponent()); VERIFY(Vc::abs(ulpDifference - expectedDifference) <= maxUncertainty) << ", base = " << base << ", epsilon = " << eps << ", diff = " << diff; for (int k = 0; k < V::Size; ++k) { VERIFY(std::abs(ulpDifference[k] - expectedDifference[k]) <= maxUncertainty[k]); } } } }/*}}}*/ int main(int argc, char **argv)/*{{{*/ { initTest(argc, argv); Denormals::data = Vc::malloc(NDenormals);/*{{{*/ Denormals::data[0] = std::numeric_limits::denorm_min(); for (int i = 1; i < NDenormals; ++i) { Denormals::data[i] = Denormals::data[i - 1] * 2.173f; } Denormals::data = Vc::malloc(NDenormals); Denormals::data[0] = std::numeric_limits::denorm_min(); for (int i = 1; i < NDenormals; ++i) { Denormals::data[i] = Denormals::data[i - 1] * 2.173; }/*}}}*/ testRealTypes(isNegative); testRealTypes(testFrexp); testRealTypes(testLdexp); runTest(testAbs); runTest(testAbs); runTest(testAbs); runTest(testAbs); runTest(testAbs); testRealTypes(testUlpDiff); testRealTypes(testTrunc); testRealTypes(testFloor); testRealTypes(testCeil); testRealTypes(testExp); testRealTypes(testLog); testRealTypes(testLog2); testRealTypes(testLog10); runTest(testMax); runTest(testMax); runTest(testMax); runTest(testMax); runTest(testMax); runTest(testMax); runTest(testMax); testRealTypes(testSqrt); testRealTypes(testRSqrt); testRealTypes(testSin); testRealTypes(testCos); testRealTypes(testAsin); testRealTypes(testAtan); testRealTypes(testAtan2); testRealTypes(testReciprocal); testRealTypes(testInf); testRealTypes(testNaN); testRealTypes(testRound); runTest(testReduceMin); runTest(testReduceMin); runTest(testReduceMin); runTest(testReduceMin); runTest(testReduceMin); runTest(testReduceMin); runTest(testReduceMin); runTest(testReduceMax); runTest(testReduceMax); runTest(testReduceMax); runTest(testReduceMax); runTest(testReduceMax); runTest(testReduceMax); runTest(testReduceMax); runTest(testReduceProduct); runTest(testReduceProduct); runTest(testReduceProduct); runTest(testReduceProduct); runTest(testReduceProduct); runTest(testReduceProduct); 
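    // ------------------------------------------------------------------
    // Illustration only (not part of the original test suite): the
    // reduction tests scheduled above and below verify Vc's horizontal
    // reductions, i.e. v.min(), v.max(), v.product() and v.sum() collapse
    // all lanes of a vector into one scalar. A minimal scalar sketch of
    // the same idea, assuming a plain array stands in for the vector
    // lanes; kept inside "#if 0" so it is never compiled:
#if 0
    {
        const int lanes[4] = { 3, 1, 4, 2 };
        int sum = 0, product = 1, minimum = lanes[0], maximum = lanes[0];
        for (int i = 0; i < 4; ++i) {
            sum     += lanes[i];                                // v.sum()
            product *= lanes[i];                                // v.product()
            minimum  = lanes[i] < minimum ? lanes[i] : minimum; // v.min()
            maximum  = lanes[i] > maximum ? lanes[i] : maximum; // v.max()
        }
        // sum == 10, product == 24, minimum == 1, maximum == 4
    }
#endif
    // ------------------------------------------------------------------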
runTest(testReduceProduct); runTest(testReduceSum); runTest(testReduceSum); runTest(testReduceSum); runTest(testReduceSum); runTest(testReduceSum); runTest(testReduceSum); runTest(testReduceSum); testRealTypes(testSincos); testRealTypes(testExponent); return 0; }/*}}}*/ // vim: foldmethod=marker Vc-0.7.4/tests/memory.cpp000066400000000000000000000207511233512346000152540ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #include "unittest.h" using namespace Vc; template class TestClass> struct TestWrapper { static inline void run() { TestWrapper::run(); TestClass::test(); TestClass::test(); } }; template class TestClass> struct TestWrapper { static inline void run() {} }; template struct TestEntries { static void test() { typedef typename V::EntryType T; const T x = Size; Memory m; const Memory &m2 = m; Memory m3(Size); for (unsigned int i = 0; i < Size; ++i) { m[i] = x; m3[i] = x; } for (unsigned int i = 0; i < Size; ++i) { COMPARE(m[i], x); COMPARE(m2[i], x); COMPARE(m3[i], x); } for (unsigned int i = 0; i < Size; ++i) { COMPARE(m.entries()[i], x); COMPARE(m2.entries()[i], x); COMPARE(m3.entries()[i], x); } const T *ptr = m2; for (unsigned int i = 0; i < Size; ++i) { COMPARE(ptr[i], x); } ptr = m3; for (unsigned int i = 0; i < Size; ++i) { COMPARE(ptr[i], x); } }}; template struct TestEntries2D { static void test() { typedef typename V::EntryType T; const T x = Size; Memory m; const Memory &m2 = m; for (size_t i = 0; i < Size; ++i) { for (size_t j = 0; j < Size; ++j) { m[i][j] = x + i + j; } } for (size_t i = 0; i < Size; ++i) { for (size_t j = 0; j < Size; ++j) { COMPARE(m[i][j], T(x + i + j)); COMPARE(m2[i][j], T(x + i + j)); } } for (size_t i = 0; i < Size; ++i) { for (size_t j = 0; j < Size; ++j) { COMPARE(m[i].entries()[j], T(x + i + j)); COMPARE(m2[i].entries()[j], T(x + i + j)); } } for (size_t i = 0; i < Size; ++i) { const T *ptr = m2[i]; for (size_t j = 0; j < Size; ++j) { COMPARE(ptr[j], T(x + i + j)); } } }}; template struct TestVectors { static void test() { const V startX(V::IndexType::IndexesFromZero() + Size); Memory m; const Memory &m2 = m; Memory m3(Size); V x = startX; for (unsigned int i = 0; i < m.vectorsCount(); ++i, x += V::Size) { m.vector(i) = x; m3.vector(i) = x; } x = startX; unsigned int i; for (i = 0; i + 1 < m.vectorsCount(); ++i) { COMPARE(V(m.vector(i)), x); COMPARE(V(m2.vector(i)), x); COMPARE(V(m3.vector(i)), x); for (int shift = 0; shift < V::Size; ++shift, ++x) { COMPARE(V(m.vector(i, shift)), x); COMPARE(V(m2.vector(i, shift)), x); COMPARE(V(m3.vector(i, shift)), x); } } COMPARE(V(m.vector(i)), x); COMPARE(V(m2.vector(i)), x); COMPARE(V(m3.vector(i)), x); }}; template struct TestVectors2D { static void test() { const V startX(V::IndexType::IndexesFromZero() + Size); Memory m; const Memory &m2 = m; V x = startX; for (size_t i = 0; i < m.rowsCount(); ++i, x += V::Size) { Memory &mrow = m[i]; for (size_t j = 0; j < mrow.vectorsCount(); 
++j, x += V::Size) { mrow.vector(j) = x; } } x = startX; for (size_t i = 0; i < m.rowsCount(); ++i, x += V::Size) { Memory &mrow = m[i]; const Memory &m2row = m2[i]; size_t j; for (j = 0; j < mrow.vectorsCount() - 1; ++j) { COMPARE(V(mrow.vector(j)), x); COMPARE(V(m2row.vector(j)), x); for (int shift = 0; shift < V::Size; ++shift, ++x) { COMPARE(V(mrow.vector(j, shift)), x); COMPARE(V(m2row.vector(j, shift)), x); } } COMPARE(V(mrow.vector(j)), x) << i << " " << j; COMPARE(V(m2row.vector(j)), x); x += V::Size; } }}; template struct TestVectorReorganization { static void test() { typename V::Memory init; for (unsigned int i = 0; i < V::Size; ++i) { init[i] = i; } V x(init); Memory m; Memory m3(Size); for (unsigned int i = 0; i < m.vectorsCount(); ++i) { m.vector(i) = x; m3.vector(i) = x; x += V::Size; } /////////////////////////////////////////////////////////////////////////// x = V(init); for (unsigned int i = 0; i < m.vectorsCount(); ++i) { COMPARE(V(m.vector(i)), x); COMPARE(V(m3.vector(i)), x); x += V::Size; } /////////////////////////////////////////////////////////////////////////// x = V(init); unsigned int indexes[Size]; for (unsigned int i = 0; i < Size; ++i) { indexes[i] = i; } for (unsigned int i = 0; i + V::Size < Size; ++i) { COMPARE(m.gather(&indexes[i]), x); COMPARE(m3.gather(&indexes[i]), x); x += 1; } /////////////////////////////////////////////////////////////////////////// for (unsigned int i = 0; i < V::Size; ++i) { init[i] = i * 2; } x = V(init); for (unsigned int i = 0; i < Size; ++i) { indexes[i] = (i * 2) % Size; } for (unsigned int i = 0; i + V::Size < Size; ++i) { COMPARE(m.gather(&indexes[i]), x); COMPARE(m3.gather(&indexes[i]), x); x += 2; x(x >= Size) -= Size; } }}; template void testEntries() { TestWrapper::run(); } template void testEntries2D() { TestWrapper::run(); } template void testVectors() { TestWrapper::run(); } template void testVectors2D() { TestWrapper::run(); } template void testVectorReorganization() { TestWrapper::run(); } template void memoryOperators() { Memory m1, m2; m1.setZero(); m2.setZero(); VERIFY(m1 == m2); VERIFY(!(m1 != m2)); VERIFY(!(m1 < m2)); VERIFY(!(m1 > m2)); m1 += m2; VERIFY(m1 == m2); VERIFY(m1 <= m2); VERIFY(m1 >= m2); m1 += 1; VERIFY(m1 != m2); VERIFY(m1 > m2); VERIFY(m1 >= m2); VERIFY(m2 < m1); VERIFY(m2 <= m1); VERIFY(!(m1 == m2)); VERIFY(!(m1 <= m2)); VERIFY(!(m2 >= m1)); m2 += m1; VERIFY(m1 == m2); m2 *= 2; m1 += 1; VERIFY(m1 == m2); m2 /= 2; m1 -= 1; VERIFY(m1 == m2); m1 *= m2; VERIFY(m1 == m2); m1 /= m2; VERIFY(m1 == m2); m1 -= m2; m2 -= m2; VERIFY(m1 == m2); } template void testCCtor() { Memory m1(5); for (size_t i = 0; i < m1.entriesCount(); ++i) { m1[i] = i; } Memory m2(m1); for (size_t i = 0; i < m1.entriesCount(); ++i) { m1[i] += 1; } for (size_t i = 0; i < m1.entriesCount(); ++i) { COMPARE(m1[i], m2[i] + 1); } } template void testCopyAssignment() { typedef typename V::EntryType T; Memory m1; m1.setZero(); Memory m2(m1); for (size_t i = 0; i < m2.entriesCount(); ++i) { COMPARE(m2[i], T(0)); m2[i] += 1; } m1 = m2; for (size_t i = 0; i < m2.entriesCount(); ++i) { COMPARE(m1[i], T(1)); } } int main() { testAllTypes(testEntries); testAllTypes(testEntries2D); testAllTypes(testVectors); testAllTypes(testVectors2D); testAllTypes(testVectorReorganization); testAllTypes(memoryOperators); testAllTypes(testCCtor); testAllTypes(testCopyAssignment); return 0; } Vc-0.7.4/tests/scalaraccess.cpp000066400000000000000000000072041233512346000163710ustar00rootroot00000000000000/* This file is part of the Vc library. 
Copyright (C) 2010-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #include "unittest.h" using namespace Vc; template void reads() { typedef typename V::EntryType T; typedef typename V::IndexType I; V a = V::Zero(); const T zero = 0; for (int i = 0; i < V::Size; ++i) { const T x = a[i]; COMPARE(x, zero); } a = static_cast(I::IndexesFromZero()); for (int i = 0; i < V::Size; ++i) { const T x = a[i]; const T y = i; COMPARE(x, y); } } template inline void readsConstantIndexTest(VC_ALIGNED_PARAMETER(V) a, VC_ALIGNED_PARAMETER(V) b) { typedef typename V::EntryType T; { const T x = a[Index]; const T zero = 0; COMPARE(x, zero) << Index; }{ const T x = b[Index]; const T y = Index; COMPARE(x, y) << Index; } } template struct ReadsConstantIndex { ReadsConstantIndex(VC_ALIGNED_PARAMETER(V) a, VC_ALIGNED_PARAMETER(V) b) { readsConstantIndexTest(a, b); ReadsConstantIndex(a, b); } }; template struct ReadsConstantIndex { ReadsConstantIndex(VC_ALIGNED_PARAMETER(V) a, VC_ALIGNED_PARAMETER(V) b) { readsConstantIndexTest(a, b); } }; template void readsConstantIndex() { typedef typename V::IndexType I; V a = V::Zero(); V b = static_cast(I::IndexesFromZero()); ReadsConstantIndex(a, b); } template void writes() { typedef typename V::EntryType T; typedef typename V::IndexType I; V a; for (int i = 0; i < V::Size; ++i) { a[i] = static_cast(i); } V b = static_cast(I::IndexesFromZero()); COMPARE(a, b); const T one = 1; const T two = 2; if (V::Size == 1) { a(a == 0) += one; a[0] += one; a(a == 0) += one; COMPARE(a, V(2)); } else if (V::Size == 4) { a(a == 1) += two; a[2] += one; a(a == 3) += one; b(b == 1) += one; b(b == 2) += one; b(b == 3) += one; COMPARE(a, b); } else if (V::Size == 8 || V::Size == 16) { a(a == 2) += two; a[3] += one; a(a == 4) += one; b(b == 2) += one; b(b == 3) += one; b(b == 4) += one; COMPARE(a, b); } else if (V::Size == 2) { // a = [0, 1]; b = [0, 1] a(a == 0) += two; // a = [2, 1] a[1] += one; // a = [2, 2] a(a == 2) += one; // a = [3, 3] b(b == 0) += one; // b = [1, 1] b(b == 1) += one; // b = [2, 2] b(b == 2) += one; // b = [3, 3] COMPARE(a, b); } else { FAIL() << "unsupported Vector::Size"; } } int main(int argc, char **argv) { initTest(argc, argv); testAllTypes(reads); testAllTypes(writes); testAllTypes(readsConstantIndex); //testAllTypes(writesConstantIndex); return 0; } Vc-0.7.4/tests/scatter.cpp000066400000000000000000000124101233512346000154020ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ // includes {{{1 #include "unittest.h" #include #include using namespace Vc; template void maskedScatterArray() //{{{1 { typedef typename Vec::IndexType It; typedef typename Vec::EntryType T; T mem[Vec::Size]; const Vec v(It::IndexesFromZero() + 1); for_all_masks(Vec, m) { Vec::Zero().store(mem, Vc::Unaligned); v.scatter(&mem[0], It::IndexesFromZero(), m); for (int i = 0; i < Vec::Size; ++i) { COMPARE(mem[i], m[i] ? v[i] : T(0)) << " i = " << i << ", m = " << m; } } } template void scatterArray() //{{{1 { typedef typename Vec::IndexType It; const int count = 31999; typename Vec::EntryType array[count], out[count]; for (int i = 0; i < count; ++i) { array[i] = i - 100; } typename It::Mask mask; for (It i(IndexesFromZero); !(mask = (i < count)).isEmpty(); i += Vec::Size) { typename Vec::Mask castedMask(mask); if (castedMask.isFull()) { Vec a(array, i); a += Vec(One); a.scatter(out, i); } else { Vec a(array, i, castedMask); a += Vec(One); a.scatter(out, i, castedMask); } } for (int i = 0; i < count; ++i) { array[i] += 1; COMPARE(array[i], out[i]); } COMPARE(0, std::memcmp(array, out, count * sizeof(typename Vec::EntryType))); } template struct Struct //{{{1 { T a; char x; T b; short y; T c; char z; }; template void scatterStruct() //{{{1 { typedef typename Vec::IndexType It; typedef Struct S; const int count = 3999; S array[count], out[count]; memset(array, 0, count * sizeof(S)); memset(out, 0, count * sizeof(S)); for (int i = 0; i < count; ++i) { array[i].a = i; array[i].b = i + 1; array[i].c = i + 2; } typename It::Mask mask; for (It i(IndexesFromZero); !(mask = (i < count)).isEmpty(); i += Vec::Size) { typename Vec::Mask castedMask(mask); Vec a(array, &S::a, i, castedMask); Vec b(array, &S::b, i, castedMask); Vec c(array, &S::c, i, castedMask); a.scatter(out, &S::a, i, castedMask); b.scatter(out, &S::b, i, castedMask); c.scatter(out, &S::c, i, castedMask); } VERIFY(0 == memcmp(array, out, count * sizeof(S))); } template struct Struct2 //{{{1 { char x; Struct b; short y; }; template void scatterStruct2() //{{{1 { typedef typename Vec::IndexType It; typedef Struct2 S1; typedef Struct S2; const int count = 97; S1 array[count], out[count]; memset(array, 0, count * sizeof(S1)); memset(out, 0, count * sizeof(S1)); for (int i = 0; i < count; ++i) { array[i].b.a = i + 0; array[i].b.b = i + 1; array[i].b.c = i + 2; } typename It::Mask mask; for (It i(IndexesFromZero); !(mask = (i < count)).isEmpty(); i += Vec::Size) { typename Vec::Mask castedMask(mask); Vec a(array, &S1::b, &S2::a, i, castedMask); Vec b(array, &S1::b, &S2::b, i, castedMask); Vec c(array, &S1::b, &S2::c, i, castedMask); a.scatter(out, &S1::b, &S2::a, i, castedMask); b.scatter(out, &S1::b, &S2::b, i, castedMask); c.scatter(out, &S1::b, &S2::c, i, castedMask); } VERIFY(0 == memcmp(array, out, count * sizeof(S1))); } int main(int argc, char **argv) //{{{1 { initTest(argc, argv); runTest(scatterArray); runTest(scatterArray); runTest(scatterArray); runTest(scatterArray); runTest(scatterArray); runTest(scatterArray); runTest(scatterArray); testAllTypes(maskedScatterArray); #if defined(VC_CLANG) && VC_CLANG <= 0x030000 // clang fails with: // candidate template ignored: failed template argument deduction // template inline Vector(const S1 *array, const T S1::* // member1, IT indexes, Mask mask = true) #warning "Skipping compilation of tests scatterStruct and scatterStruct2 because of clang bug" #else runTest(scatterStruct); 
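    // ------------------------------------------------------------------
    // Illustration only (not part of the original test suite): a scatter
    // writes lane i of a vector to mem[indexes[i]]; with a mask only the
    // selected lanes are written, which is what maskedScatterArray()
    // checks. A minimal scalar sketch with hypothetical plain arrays,
    // kept inside "#if 0" so it is never compiled:
#if 0
    {
        const float v[4]       = { 1.f, 2.f, 3.f, 4.f };
        const int   indexes[4] = { 3, 2, 1, 0 };
        const bool  mask[4]    = { true, false, true, false };
        float mem[4]           = { 0.f, 0.f, 0.f, 0.f };
        for (int i = 0; i < 4; ++i) {
            if (mask[i]) {
                mem[indexes[i]] = v[i]; // only lanes selected by the mask are stored
            }
        }
        // mem is now { 0, 3, 0, 1 }
    }
#endif
    // ------------------------------------------------------------------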
runTest(scatterStruct); runTest(scatterStruct); runTest(scatterStruct); runTest(scatterStruct); runTest(scatterStruct); runTest(scatterStruct); testAllTypes(scatterStruct2); #endif return 0; } // vim: foldmethod=marker Vc-0.7.4/tests/sse_blend.cpp000066400000000000000000000111321233512346000156730ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . */ #include "unittest.h" #include namespace std { ostream &operator<<(ostream &out, const __m128i &v) { union { __m128i v; short m[8]; } x = { v }; out << "[" << x.m[0]; for (int i = 1; i < 8; ++i) { out << ", " << x.m[i]; } return out << "]"; } } // namespace std template<> inline bool unittest_compareHelper<__m128i, __m128i>(const __m128i &a, const __m128i &b) { return _mm_movemask_epi8(_mm_cmpeq_epi16(a, b)) == 0xffff; } void blendpd() { #ifdef VC_IMPL_SSE4_1 #define blend _mm_blend_pd #else #define blend Vc::SSE::mm_blend_pd #endif __m128d a = _mm_set_pd(11, 10); __m128d b = _mm_set_pd(21, 20); COMPARE(_mm_movemask_pd(_mm_cmpeq_pd(blend(a, b, 0x0), a)), 0x3); COMPARE(_mm_movemask_pd(_mm_cmpeq_pd(blend(a, b, 0x1), _mm_set_pd(11, 20))), 0x3); COMPARE(_mm_movemask_pd(_mm_cmpeq_pd(blend(a, b, 0x2), _mm_set_pd(21, 10))), 0x3); COMPARE(_mm_movemask_pd(_mm_cmpeq_pd(blend(a, b, 0x3), b)), 0x3); #undef blend } void blendps() { #ifdef VC_IMPL_SSE4_1 #define blend _mm_blend_ps #else #define blend Vc::SSE::mm_blend_ps #endif __m128 a = _mm_set_ps(13, 12, 11, 10); __m128 b = _mm_set_ps(23, 22, 21, 20); COMPARE(_mm_movemask_ps(_mm_cmpeq_ps(blend(a, b, 0x0), a)), 0xf); COMPARE(_mm_movemask_ps(_mm_cmpeq_ps(blend(a, b, 0x1), _mm_set_ps(13, 12, 11, 20))), 0xf); COMPARE(_mm_movemask_ps(_mm_cmpeq_ps(blend(a, b, 0x2), _mm_set_ps(13, 12, 21, 10))), 0xf); COMPARE(_mm_movemask_ps(_mm_cmpeq_ps(blend(a, b, 0x3), _mm_set_ps(13, 12, 21, 20))), 0xf); COMPARE(_mm_movemask_ps(_mm_cmpeq_ps(blend(a, b, 0x4), _mm_set_ps(13, 22, 11, 10))), 0xf); COMPARE(_mm_movemask_ps(_mm_cmpeq_ps(blend(a, b, 0x5), _mm_set_ps(13, 22, 11, 20))), 0xf); COMPARE(_mm_movemask_ps(_mm_cmpeq_ps(blend(a, b, 0x6), _mm_set_ps(13, 22, 21, 10))), 0xf); COMPARE(_mm_movemask_ps(_mm_cmpeq_ps(blend(a, b, 0x7), _mm_set_ps(13, 22, 21, 20))), 0xf); COMPARE(_mm_movemask_ps(_mm_cmpeq_ps(blend(a, b, 0x8), _mm_set_ps(23, 12, 11, 10))), 0xf); COMPARE(_mm_movemask_ps(_mm_cmpeq_ps(blend(a, b, 0x9), _mm_set_ps(23, 12, 11, 20))), 0xf); COMPARE(_mm_movemask_ps(_mm_cmpeq_ps(blend(a, b, 0xa), _mm_set_ps(23, 12, 21, 10))), 0xf); COMPARE(_mm_movemask_ps(_mm_cmpeq_ps(blend(a, b, 0xb), _mm_set_ps(23, 12, 21, 20))), 0xf); COMPARE(_mm_movemask_ps(_mm_cmpeq_ps(blend(a, b, 0xc), _mm_set_ps(23, 22, 11, 10))), 0xf); COMPARE(_mm_movemask_ps(_mm_cmpeq_ps(blend(a, b, 0xd), _mm_set_ps(23, 22, 11, 20))), 0xf); COMPARE(_mm_movemask_ps(_mm_cmpeq_ps(blend(a, b, 0xe), _mm_set_ps(23, 22, 21, 10))), 0xf); COMPARE(_mm_movemask_ps(_mm_cmpeq_ps(blend(a, b, 0xf), b)), 0xf); #undef blend } void blendepi16() { #ifdef VC_IMPL_SSE4_1 
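// --------------------------------------------------------------------------
// Illustration only (not part of the original test suite): the checks below
// rely on the usual blend semantics, i.e. lane j of the result is taken from
// b when bit j of the immediate mask is set and from a otherwise. A scalar
// sketch of that rule with illustrative values mirroring the test data,
// kept inside "#if 0" so it is never compiled:
#if 0
    {
        const short a[8] = { 10, 11, 12, 13, 14, 15, 16, 17 };
        const short b[8] = { 20, 21, 22, 23, 24, 25, 26, 27 };
        const int mask = 0x5; // bits 0 and 2 set
        short out[8];
        for (int j = 0; j < 8; ++j) {
            out[j] = ((mask >> j) & 1) ? b[j] : a[j]; // bit j selects the source lane
        }
        // out == { 20, 11, 22, 13, 14, 15, 16, 17 }
    }
#endif
// --------------------------------------------------------------------------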
#define blend _mm_blend_epi16
#else
#define blend Vc::SSE::mm_blend_epi16
#endif
    __m128i a = _mm_set_epi16(17, 16, 15, 14, 13, 12, 11, 10);
    __m128i b = _mm_set_epi16(27, 26, 25, 24, 23, 22, 21, 20);
#define CALL_2(_i, code) { enum { i = _i }; code } { enum { i = _i + 1 }; code }
#define CALL_4(_i, code) CALL_2(_i, code) CALL_2(_i + 2, code)
#define CALL_8(_i, code) CALL_4(_i, code) CALL_4(_i + 4, code)
#define CALL_16(_i, code) CALL_8(_i, code) CALL_8(_i + 8, code)
#define CALL_32(_i, code) CALL_16(_i, code) CALL_16(_i + 16, code)
#define CALL_64(_i, code) CALL_32(_i, code) CALL_32(_i + 32, code)
#define CALL_128(_i, code) CALL_64(_i, code) CALL_64(_i + 64, code)
#define CALL_256(code) CALL_128(0, code) CALL_128(128, code)
#define CALL_100(code) CALL_64(0, code) CALL_32(64, code) CALL_4(96, code)
    CALL_256(
        short r[8];
        for (int j = 0; j < 8; ++j) {
            r[j] = j + ((((i >> j) & 1) == 0) ? 10 : 20);
        }
        __m128i reference = _mm_set_epi16(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]);
        COMPARE_NOEQ(blend(a, b, i), reference);
    )
#undef blend
}

int main()
{
    runTest(blendpd);
    runTest(blendps);
    runTest(blendepi16);
}

Vc-0.7.4/tests/stlcontainer.cpp

/*{{{
    Copyright (C) 2012 Matthias Kretz

    Permission to use, copy, modify, and distribute this software and its
    documentation for any purpose and without fee is hereby granted, provided
    that the above copyright notice appear in all copies and that both that
    the copyright notice and this permission notice and warranty disclaimer
    appear in supporting documentation, and that the name of the author not be
    used in advertising or publicity pertaining to distribution of the software
    without specific, written prior permission.

    The author disclaim all warranties with regard to this software, including
    all implied warranties of merchantability and fitness. In no event shall
    the author be liable for any special, indirect or consequential damages or
    any damages whatsoever resulting from loss of use, data or profits, whether
    in an action of contract, negligence or other tortious action, arising out
    of or in connection with the use or performance of this software.

}}}*/

#include
#include
#include "unittest.h"
#include "common/macros.h"

template<typename Vec> size_t alignmentMask()
{
    if (Vec::Size == 1) {
        // on 32bit the maximal alignment is 4 Bytes, even for 8-Byte doubles.
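        // (The value returned from this function is used as a bit mask by
        //  stdVectorAlignment() below: a pointer p is taken to be sufficiently
        //  aligned when (p & mask) == 0.)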
        return std::min(sizeof(void*), sizeof(typename Vec::EntryType)) - 1;
    }
    // sizeof(SSE::sfloat_v) is too large
    // AVX::VectorAlignment is too large
    return std::min(sizeof(Vec), Vc::VectorAlignment) - 1;
}

template<typename T> struct SomeStruct { char a; T x; };

template<typename V> void stdVectorAlignment()
{
    const size_t mask = alignmentMask<V>();
    const char *const null = 0;

    std::vector<V> v(11);
    for (int i = 0; i < 11; ++i) {
        COMPARE((reinterpret_cast<const char *>(&v[i]) - null) & mask, 0u)
            << "&v[i] = " << &v[i] << ", mask = " << mask << ", i = " << i;
    }

    std::vector<SomeStruct<V>, Vc::Allocator<SomeStruct<V> > > v2(11);
    for (int i = 0; i < 11; ++i) {
        COMPARE((reinterpret_cast<const char *>(&v2[i]) - null) & mask, 0u)
            << "&v2[i] = " << &v2[i] << ", mask = " << mask << ", i = " << i;
    }

    std::vector<V> v3(v);
    std::vector<SomeStruct<V>, Vc::Allocator<SomeStruct<V> > > v4(v2);

    typedef typename V::EntryType T;
    for (int i = 1; i < 100; ++i) {
        std::vector<T, Vc::Allocator<T> > v5(i);
        const size_t expectedAlignment = Vc_ALIGNOF(V);
        COMPARE((&v5[0] - static_cast<T *>(0)) * sizeof(T) & (expectedAlignment - 1), 0u);
    }
}

int main(int argc, char **argv)
{
    initTest(argc, argv);
    using namespace Vc;
    testAllTypes(stdVectorAlignment);
}

Vc-0.7.4/tests/store.cpp

/* This file is part of the Vc library.

    Copyright (C) 2009-2011 Matthias Kretz

    Vc is free software: you can redistribute it and/or modify it under the
    terms of the GNU Lesser General Public License as published by the Free
    Software Foundation, either version 3 of the License, or (at your option)
    any later version.

    Vc is distributed in the hope that it will be useful, but WITHOUT ANY
    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
    FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
    more details.

    You should have received a copy of the GNU Lesser General Public License
    along with Vc. If not, see .
*/ #include "unittest.h" #include #include using namespace Vc; template void alignedStore() { typedef typename Vec::EntryType T; enum { Count = 256 * 1024 / sizeof(T) }; Memory array; // do the memset to make sure the array doesn't have the old data from a previous call which // would mask a real problem std::memset(array, 0xff, Count * sizeof(T)); T xValue = 1; const Vec x(xValue); for (int i = 0; i < Count; i += Vec::Size) { x.store(&array[i]); } for (int i = 0; i < Count; ++i) { COMPARE(array[i], xValue); } } template void unalignedStore() { typedef typename Vec::EntryType T; enum { Count = 256 * 1024 / sizeof(T) }; Memory array; // do the memset to make sure the array doesn't have the old data from a previous call which // would mask a real problem std::memset(array, 0xff, Count * sizeof(T)); T xValue = 1; const Vec x(xValue); for (int i = 1; i < Count - Vec::Size + 1; i += Vec::Size) { x.store(&array[i], Unaligned); } for (int i = 1; i < Count - Vec::Size + 1; ++i) { COMPARE(array[i], xValue); } } template void streamingAndAlignedStore() { typedef typename Vec::EntryType T; enum { Count = 256 * 1024 / sizeof(T) }; Memory array; // do the memset to make sure the array doesn't have the old data from a previous call which // would mask a real problem std::memset(array, 0xff, Count * sizeof(T)); T xValue = 1; const Vec x(xValue); for (int i = 0; i < Count; i += Vec::Size) { x.store(&array[i], Streaming | Aligned); } for (int i = 0; i < Count; ++i) { COMPARE(array[i], xValue); } } template void streamingAndUnalignedStore() { typedef typename Vec::EntryType T; enum { Count = 256 * 1024 / sizeof(T) }; Memory array; // do the memset to make sure the array doesn't have the old data from a previous call which // would mask a real problem std::memset(array, 0xff, Count * sizeof(T)); T xValue = 1; const Vec x(xValue); for (int i = 1; i < Count - Vec::Size + 1; i += Vec::Size) { x.store(&array[i], Streaming | Unaligned); } for (int i = 1; i < Count - Vec::Size + 1; ++i) { COMPARE(array[i], xValue); } } template void maskedStore() { typedef typename Vec::EntryType T; typedef typename Vec::Mask M; M mask; { typedef typename Vec::IndexType I; const I tmp(IndexesFromZero); const typename I::Mask k = (tmp & I(One)) > 0; mask = M(k); } const int count = 256 * 1024 / sizeof(T); const int outerCount = count / Vec::Size; Vc::Memory array(count); array.setZero(); const T nullValue = 0; const T setValue = 170; const Vec x(setValue); for (int i = 0; i < count; i += Vec::Size) { x.store(&array[i], mask); } for (int i = 1; i < count; i += 2) { COMPARE(array[i], setValue) << ", i: " << i << ", count: " << count << ", outer: " << outerCount; } for (int i = 0; i < count; i += 2) { COMPARE(array[i], nullValue) << ", i: " << i << ", count: " << count << ", outer: " << outerCount; } } int main(int argc, char **argv) { initTest(argc, argv); testAllTypes(alignedStore); testAllTypes(unalignedStore); testAllTypes(streamingAndAlignedStore); testAllTypes(streamingAndUnalignedStore); if (float_v::Size > 1) { runTest(maskedStore); runTest(maskedStore); runTest(maskedStore); runTest(maskedStore); runTest(maskedStore); runTest(maskedStore); runTest(maskedStore); } return 0; } Vc-0.7.4/tests/supportfunctions.cpp000066400000000000000000000046661233512346000174200ustar00rootroot00000000000000/*{{{ Copyright (C) 2013 Matthias Kretz This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, 
or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . }}}*/ #include "unittest.h" void testCompiledImplementation() { VERIFY(Vc::currentImplementationSupported()); } void testIsSupported() { using Vc::CpuId; VERIFY(Vc::isImplementationSupported(Vc::ScalarImpl)); COMPARE(Vc::isImplementationSupported(Vc::SSE2Impl ), CpuId::hasSse2()); COMPARE(Vc::isImplementationSupported(Vc::SSE3Impl ), CpuId::hasSse3()); COMPARE(Vc::isImplementationSupported(Vc::SSSE3Impl), CpuId::hasSsse3()); COMPARE(Vc::isImplementationSupported(Vc::SSE41Impl), CpuId::hasSse41()); COMPARE(Vc::isImplementationSupported(Vc::SSE42Impl), CpuId::hasSse42()); COMPARE(Vc::isImplementationSupported(Vc::AVXImpl ), CpuId::hasOsxsave() && CpuId::hasAvx()); COMPARE(Vc::isImplementationSupported(Vc::AVX2Impl ), false); } void testBestImplementation() { // when building with a recent and fully featured compiler the following should pass // but - old GCC versions have to fall back to Scalar, even though SSE is supported by the CPU // - ICC/MSVC can't use XOP/FMA4 //COMPARE(Vc::bestImplementationSupported(), VC_IMPL); } void testExtraInstructions() { using Vc::CpuId; unsigned int extra = Vc::extraInstructionsSupported(); COMPARE(!(extra & Vc::Float16cInstructions), !CpuId::hasF16c()); COMPARE(!(extra & Vc::XopInstructions), !CpuId::hasXop()); COMPARE(!(extra & Vc::Fma4Instructions), !CpuId::hasFma4()); COMPARE(!(extra & Vc::PopcntInstructions), !CpuId::hasPopcnt()); COMPARE(!(extra & Vc::Sse4aInstructions), !CpuId::hasSse4a()); } int main(int argc, char **argv) { initTest(argc, argv); runTest(testCompiledImplementation); runTest(testIsSupported); runTest(testBestImplementation); runTest(testExtraInstructions); return 0; } Vc-0.7.4/tests/swizzles.cpp000066400000000000000000000103721233512346000156340ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright (C) 2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
}}}*/ #include "unittest.h" using namespace Vc; enum Swizzle { BADC, CDAB, AAAA, BBBB, CCCC, DDDD, BCAD, BCDA, DABC, ACBD, DBCA, DCBA }; template V scalarSwizzle(VC_ALIGNED_PARAMETER(V) v, Swizzle s) { V r = v; for (int i = 0; i + 4 <= V::Size; i += 4) { switch (s) { case BADC: r[i + 0] = v[i + 1]; r[i + 1] = v[i + 0]; r[i + 2] = v[i + 3]; r[i + 3] = v[i + 2]; break; case CDAB: r[i + 0] = v[i + 2]; r[i + 1] = v[i + 3]; r[i + 2] = v[i + 0]; r[i + 3] = v[i + 1]; break; case AAAA: r[i + 0] = v[i + 0]; r[i + 1] = v[i + 0]; r[i + 2] = v[i + 0]; r[i + 3] = v[i + 0]; break; case BBBB: r[i + 0] = v[i + 1]; r[i + 1] = v[i + 1]; r[i + 2] = v[i + 1]; r[i + 3] = v[i + 1]; break; case CCCC: r[i + 0] = v[i + 2]; r[i + 1] = v[i + 2]; r[i + 2] = v[i + 2]; r[i + 3] = v[i + 2]; break; case DDDD: r[i + 0] = v[i + 3]; r[i + 1] = v[i + 3]; r[i + 2] = v[i + 3]; r[i + 3] = v[i + 3]; break; case BCAD: r[i + 0] = v[i + 1]; r[i + 1] = v[i + 2]; r[i + 2] = v[i + 0]; r[i + 3] = v[i + 3]; break; case BCDA: r[i + 0] = v[i + 1]; r[i + 1] = v[i + 2]; r[i + 2] = v[i + 3]; r[i + 3] = v[i + 0]; break; case DABC: r[i + 0] = v[i + 3]; r[i + 1] = v[i + 0]; r[i + 2] = v[i + 1]; r[i + 3] = v[i + 2]; break; case ACBD: r[i + 0] = v[i + 0]; r[i + 1] = v[i + 2]; r[i + 2] = v[i + 1]; r[i + 3] = v[i + 3]; break; case DBCA: r[i + 0] = v[i + 3]; r[i + 1] = v[i + 1]; r[i + 2] = v[i + 2]; r[i + 3] = v[i + 0]; break; case DCBA: r[i + 0] = v[i + 3]; r[i + 1] = v[i + 2]; r[i + 2] = v[i + 1]; r[i + 3] = v[i + 0]; break; } } return r; } template void testSwizzle() { for (int i = 0; i < 100; ++i) { const V test = V::Random(); COMPARE(test.abcd(), test); COMPARE(test.badc(), scalarSwizzle(test, BADC)); COMPARE(test.cdab(), scalarSwizzle(test, CDAB)); COMPARE(test.aaaa(), scalarSwizzle(test, AAAA)); COMPARE(test.bbbb(), scalarSwizzle(test, BBBB)); COMPARE(test.cccc(), scalarSwizzle(test, CCCC)); COMPARE(test.dddd(), scalarSwizzle(test, DDDD)); COMPARE(test.bcad(), scalarSwizzle(test, BCAD)); COMPARE(test.bcda(), scalarSwizzle(test, BCDA)); COMPARE(test.dabc(), scalarSwizzle(test, DABC)); COMPARE(test.acbd(), scalarSwizzle(test, ACBD)); COMPARE(test.dbca(), scalarSwizzle(test, DBCA)); COMPARE(test.dcba(), scalarSwizzle(test, DCBA)); } } int main(int argc, char **argv) { initTest(argc, argv); #if VC_DOUBLE_V_SIZE >= 4 || VC_DOUBLE_V_SIZE == 1 runTest(testSwizzle); #endif runTest(testSwizzle); runTest(testSwizzle); runTest(testSwizzle); runTest(testSwizzle); runTest(testSwizzle); runTest(testSwizzle); return 0; } Vc-0.7.4/tests/ulp.h000066400000000000000000000060431233512346000142070ustar00rootroot00000000000000/* This file is part of the Vc library. {{{ Copyright (C) 2011-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
}}}*/ #ifndef TESTS_ULP_H #define TESTS_ULP_H #include #include #ifdef VC_MSVC namespace std { static inline bool isnan(float x) { return _isnan(x); } static inline bool isnan(double x) { return _isnan(x); } } // namespace std #endif template static T ulpDiffToReference(T val, T ref) { if (val == ref || (std::isnan(val) && std::isnan(ref))) { return 0; } if (ref == T(0)) { return 1 + ulpDiffToReference(std::abs(val), std::numeric_limits::min()); } if (val == T(0)) { return 1 + ulpDiffToReference(std::numeric_limits::min(), std::abs(ref)); } int exp; /*tmp = */ frexp(ref, &exp); // ref == tmp * 2 ^ exp => tmp == ref * 2 ^ -exp // tmp is now in the range [0.5, 1.0[ // now we want to know how many times we can fit 2^-numeric_limits::digits between tmp and // val * 2 ^ -exp return ldexp(std::abs(ref - val), std::numeric_limits::digits - exp); } template static T ulpDiffToReferenceSigned(T val, T ref) { return ulpDiffToReference(val, ref) * (val - ref < 0 ? -1 : 1); } template struct _Ulp_ExponentVector { typedef Vc::int_v Type; }; template<> struct _Ulp_ExponentVector { typedef Vc::short_v Type; }; template static Vc::Vector<_T> ulpDiffToReference(const Vc::Vector<_T> &_val, const Vc::Vector<_T> &_ref) { using namespace Vc; typedef Vector<_T> V; typedef typename V::EntryType T; typedef typename V::Mask M; V val = _val; V ref = _ref; V diff = V::Zero(); M zeroMask = ref == V::Zero(); val (zeroMask)= abs(val); ref (zeroMask)= std::numeric_limits::min(); diff (zeroMask)= V::One(); zeroMask = val == V::Zero(); ref (zeroMask)= abs(ref); val (zeroMask)= std::numeric_limits::min(); diff (zeroMask)+= V::One(); typename _Ulp_ExponentVector::Type exp; frexp(ref, &exp); diff += ldexp(abs(ref - val), std::numeric_limits::digits - exp); diff.setZero(_val == _ref || (isnan(_val) && isnan(_ref))); return diff; } template static Vc::Vector<_T> ulpDiffToReferenceSigned(const Vc::Vector<_T> &_val, const Vc::Vector<_T> &_ref) { return ulpDiffToReference(_val, _ref).copySign(_val - _ref); } #endif // TESTS_ULP_H // vim: foldmethod=marker Vc-0.7.4/tests/unittest.h000066400000000000000000000656401233512346000152760ustar00rootroot00000000000000/* This file is part of the Vc library. Copyright (C) 2009-2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . 
*/ #ifndef UNITTEST_H #define UNITTEST_H #ifdef VC_ASSERT #error "include unittest.h before any Vc header" #endif inline void unittest_assert(bool cond, const char *code, const char *file, int line); #define VC_ASSERT(cond) unittest_assert(cond, #cond, __FILE__, __LINE__); #include #include #include #include #include #include #include #include #include "ulp.h" #include #include #define _expand(name) #name #define runTest(name) _unit_test_global.runTestInt(&name, _expand(name)) #define testAllTypes(name) \ _unit_test_global.runTestInt(&name, #name ""); \ _unit_test_global.runTestInt(&name, #name ""); \ _unit_test_global.runTestInt(&name, #name ""); \ _unit_test_global.runTestInt(&name, #name ""); \ _unit_test_global.runTestInt(&name, #name ""); \ _unit_test_global.runTestInt(&name, #name ""); \ _unit_test_global.runTestInt(&name, #name "") #define testRealTypes(name) \ _unit_test_global.runTestInt(&name, #name ""); \ _unit_test_global.runTestInt(&name, #name ""); \ _unit_test_global.runTestInt(&name, #name ""); template struct isEqualType { operator bool() const { return false; } }; template struct isEqualType { operator bool() const { return true; } }; inline void printPass() { std::cout << AnsiColor::green << " PASS: " << AnsiColor::normal; } bool _UnitTest_verify_vector_unit_supported() { bool s = Vc::currentImplementationSupported(); if (!s) { std::cerr << "CPU or OS requirements not met for the compiled in vector unit!\n"; exit(-1); } return s; } static bool _UnitTest_verify_vector_unit_supported_result = _UnitTest_verify_vector_unit_supported(); class _UnitTest_Failure { }; typedef void (*testFunction)(); class _UnitTest_Global_Object { public: _UnitTest_Global_Object() : status(true), expect_failure(false), assert_failure(0), expect_assert_failure(false), float_fuzzyness( 1.f ), double_fuzzyness( 1. ), only_name(0), m_finalized(false), failedTests(0), passedTests(0), findMaximumDistance(false), maximumDistance(0), meanDistance(0), meanCount(0) { } ~_UnitTest_Global_Object() { if (m_finalized) { // on windows std::exit will call the dtor again, leading to infinite recursion return; } if (plotFile.is_open()) { plotFile.flush(); plotFile.close(); } std::cout << "\n Testing done. " << passedTests << " tests passed. " << failedTests << " tests failed." 
<< std::endl; m_finalized = true; std::exit(failedTests); } void runTestInt(testFunction fun, const char *name); bool status; bool expect_failure; int assert_failure; bool expect_assert_failure; float float_fuzzyness; double double_fuzzyness; const char *only_name; std::fstream plotFile; private: bool m_finalized; int failedTests; public: int passedTests; bool findMaximumDistance; double maximumDistance; double meanDistance; int meanCount; }; static _UnitTest_Global_Object _unit_test_global; void EXPECT_FAILURE() { _unit_test_global.expect_failure = true; } inline const char *_unittest_fail() { if (_unit_test_global.expect_failure) { return "XFAIL: "; } static const char *str = 0; if (str == 0) { if (mayUseColor(std::cout)) { static const char *fail = " \033[1;40;31mFAIL:\033[0m "; str = fail; } else { static const char *fail = " FAIL: "; str = fail; } } return str; } void initTest(int argc, char **argv) { for (int i = 1; i < argc; ++i) { if (0 == std::strcmp(argv[i], "--help") || 0 == std::strcmp(argv[i], "-h")) { std::cout << "Usage: " << argv[0] << " [-h|--help] [--only ] [--maxdist] [--plotdist ]\n"; exit(0); } if (0 == std::strcmp(argv[i], "--only") && i + 1 < argc) { _unit_test_global.only_name = argv[i + 1]; } else if (0 == std::strcmp(argv[i], "--maxdist")) { _unit_test_global.findMaximumDistance = true; } else if (0 == std::strcmp(argv[i], "--plotdist") && i + 1 < argc) { _unit_test_global.plotFile.open(argv[i + 1], std::ios_base::out); _unit_test_global.plotFile << "# reference\tdistance\n"; } } } template inline void setFuzzyness( T ); template<> inline void setFuzzyness( float fuzz ) { _unit_test_global.float_fuzzyness = fuzz; } template<> inline void setFuzzyness( double fuzz ) { _unit_test_global.double_fuzzyness = fuzz; } void _UnitTest_Global_Object::runTestInt(testFunction fun, const char *name) { if (_unit_test_global.only_name && 0 != std::strcmp(name, _unit_test_global.only_name)) { return; } _unit_test_global.status = true; _unit_test_global.expect_failure = false; try { setFuzzyness(1); setFuzzyness(1); maximumDistance = 0.; meanDistance = 0.; meanCount = 0; fun(); } catch(_UnitTest_Failure) { } if (_unit_test_global.expect_failure) { if (!_unit_test_global.status) { std::cout << "XFAIL: " << name << std::endl; } else { std::cout << "unexpected PASS: " << name << "\n This test should have failed but didn't. Check the code!" << std::endl; ++failedTests; } } else { if (!_unit_test_global.status) { if (findMaximumDistance) { std::cout << _unittest_fail() << "│ with a maximal distance of " << maximumDistance << " to the reference (mean: " << meanDistance / meanCount << ").\n"; } std::cout << _unittest_fail() << "┕ " << name << std::endl; ++failedTests; } else { printPass(); std::cout << name; if (findMaximumDistance) { if (maximumDistance > 0.) 
{ std::cout << " with a maximal distance of " << maximumDistance << " to the reference (mean: " << meanDistance / meanCount << ")."; } else { std::cout << " all values matched the reference precisely."; } } std::cout << std::endl; ++passedTests; } } } template inline bool unittest_compareHelper( const T1 &a, const T2 &b ) { return a == b; } template<> inline bool unittest_compareHelper( const Vc::int_v &a, const Vc::int_v &b ) { return (a == b).isFull(); } template<> inline bool unittest_compareHelper( const Vc::uint_v &a, const Vc::uint_v &b ) { return (a == b).isFull(); } template<> inline bool unittest_compareHelper( const Vc::float_v &a, const Vc::float_v &b ) { return (a == b).isFull(); } template<> inline bool unittest_compareHelper( const Vc::sfloat_v &a, const Vc::sfloat_v &b ) { return (a == b).isFull(); } template<> inline bool unittest_compareHelper( const Vc::double_v &a, const Vc::double_v &b ) { return (a == b).isFull(); } template<> inline bool unittest_compareHelper( const Vc::ushort_v &a, const Vc::ushort_v &b ) { return (a == b).isFull(); } template<> inline bool unittest_compareHelper( const Vc::short_v &a, const Vc::short_v &b ) { return (a == b).isFull(); } template<> inline bool unittest_compareHelper(const std::type_info &a, const std::type_info &b ) { return &a == &b; } template T ulpDiffToReferenceWrapper(T a, T b) { const T diff = ulpDiffToReference(a, b); if (VC_IS_UNLIKELY(_unit_test_global.findMaximumDistance)) { _unit_test_global.maximumDistance = std::max(std::abs(diff), _unit_test_global.maximumDistance); _unit_test_global.meanDistance += std::abs(diff); ++_unit_test_global.meanCount; } return diff; } template Vc::Vector ulpDiffToReferenceWrapper(VC_ALIGNED_PARAMETER(Vc::Vector) a, VC_ALIGNED_PARAMETER(Vc::Vector) b) { const Vc::Vector diff = ulpDiffToReference(a, b); if (VC_IS_UNLIKELY(_unit_test_global.findMaximumDistance)) { _unit_test_global.maximumDistance = std::max(Vc::abs(diff).max(), _unit_test_global.maximumDistance); _unit_test_global.meanDistance += Vc::abs(diff).sum(); _unit_test_global.meanCount += Vc::Vector::Size; } return diff; } template inline bool unittest_fuzzyCompareHelper( const T &a, const T &b ) { return a == b; } template<> inline bool unittest_fuzzyCompareHelper( const float &a, const float &b ) { return ulpDiffToReferenceWrapper(a, b) <= _unit_test_global.float_fuzzyness; } template<> inline bool unittest_fuzzyCompareHelper( const Vc::float_v &a, const Vc::float_v &b ) { return (ulpDiffToReferenceWrapper(a, b) <= _unit_test_global.float_fuzzyness).isFull(); } template<> inline bool unittest_fuzzyCompareHelper( const Vc::sfloat_v &a, const Vc::sfloat_v &b ) { return (ulpDiffToReferenceWrapper(a, b) <= _unit_test_global.float_fuzzyness).isFull(); } template<> inline bool unittest_fuzzyCompareHelper( const double &a, const double &b ) { return ulpDiffToReferenceWrapper(a, b) <= _unit_test_global.double_fuzzyness; } template<> inline bool unittest_fuzzyCompareHelper( const Vc::double_v &a, const Vc::double_v &b ) { return (ulpDiffToReferenceWrapper(a, b) <= _unit_test_global.double_fuzzyness).isFull(); } template inline void unitttest_comparePrintHelper(const T1 &a, const T2 &b, const M &m, const char *aa, const char *bb, const char *file, int line, double fuzzyness = 0.) { std::cout << " " << aa << " (" << std::setprecision(10) << a << std::setprecision(6) << ") == " << bb << " (" << std::setprecision(10) << b << std::setprecision(6) << ") -> " << m; if (fuzzyness > 0.) 
{ std::cout << " with fuzzyness " << fuzzyness; } std::cout << " at " << file << ":" << line << " failed.\n"; } template inline double unittest_fuzzynessHelper(const T &) { return 0.; } template<> inline double unittest_fuzzynessHelper(const float &) { return _unit_test_global.float_fuzzyness; } template<> inline double unittest_fuzzynessHelper(const Vc::float_v &) { return _unit_test_global.float_fuzzyness; } template<> inline double unittest_fuzzynessHelper(const double &) { return _unit_test_global.double_fuzzyness; } template<> inline double unittest_fuzzynessHelper(const Vc::double_v &) { return _unit_test_global.double_fuzzyness; } class _UnitTest_Compare { public: enum OptionFuzzy { Fuzzy }; enum OptionNoEq { NoEq }; template Vc_ALWAYS_INLINE _UnitTest_Compare(const T1 &a, const T2 &b, const char *_a, const char *_b, const char *_file, int _line) : m_ip(getIp()), m_failed(!unittest_compareHelper(a, b)) { if (VC_IS_UNLIKELY(m_failed)) { printFirst(); printPosition(_file, _line); print(":\n"); print(_a); print(" ("); print(std::setprecision(10)); print(a); print(") == "); print(_b); print(" ("); print(std::setprecision(10)); print(b); print(std::setprecision(6)); print(") -> "); print(a == b); } } template Vc_ALWAYS_INLINE _UnitTest_Compare(const T1 &a, const T2 &b, const char *_a, const char *_b, const char *_file, int _line, OptionNoEq) : m_ip(getIp()), m_failed(!unittest_compareHelper(a, b)) { if (VC_IS_UNLIKELY(m_failed)) { printFirst(); printPosition(_file, _line); print(":\n"); print(_a); print(" ("); print(std::setprecision(10)); print(a); print(") == "); print(_b); print(" ("); print(std::setprecision(10)); print(b); print(std::setprecision(6)); print(')'); } } template Vc_ALWAYS_INLINE _UnitTest_Compare(const T &a, const T &b, const char *_a, const char *_b, const char *_file, int _line, OptionFuzzy) : m_ip(getIp()), m_failed(!unittest_fuzzyCompareHelper(a, b)) { if (VC_IS_UNLIKELY(m_failed)) { printFirst(); printPosition(_file, _line); print(":\n"); print(_a); print(" ("); print(std::setprecision(10)); print(a); print(") ≈ "); print(_b); print(" ("); print(std::setprecision(10)); print(b); print(std::setprecision(6)); print(") -> "); print(a == b); printFuzzyInfo(a, b); } if (_unit_test_global.plotFile.is_open()) { writePlotData(_unit_test_global.plotFile, a, b); } } Vc_ALWAYS_INLINE _UnitTest_Compare(bool good, const char *cond, const char *_file, int _line) : m_ip(getIp()), m_failed(!good) { if (VC_IS_UNLIKELY(m_failed)) { printFirst(); printPosition(_file, _line); print(": "); print(cond); } } Vc_ALWAYS_INLINE _UnitTest_Compare(const char *_file, int _line) : m_ip(getIp()), m_failed(true) { printFirst(); printPosition(_file, _line); print(":\n"); } template Vc_ALWAYS_INLINE const _UnitTest_Compare &operator<<(const T &x) const { if (VC_IS_UNLIKELY(m_failed)) { print(x); } return *this; } Vc_ALWAYS_INLINE const _UnitTest_Compare &operator<<(const char *str) const { if (VC_IS_UNLIKELY(m_failed)) { print(str); } return *this; } Vc_ALWAYS_INLINE const _UnitTest_Compare &operator<<(const char ch) const { if (VC_IS_UNLIKELY(m_failed)) { print(ch); } return *this; } Vc_ALWAYS_INLINE const _UnitTest_Compare &operator<<(bool b) const { if (VC_IS_UNLIKELY(m_failed)) { print(b); } return *this; } Vc_ALWAYS_INLINE ~_UnitTest_Compare() throw(_UnitTest_Failure) { if (VC_IS_UNLIKELY(m_failed)) { printLast(); } } private: static Vc_ALWAYS_INLINE size_t getIp() { size_t _ip; #if defined(__x86_64__) && defined(VC_GNU_ASM) asm("lea 0(%%rip),%0" : "=r"(_ip)); #else _ip = 0; #endif return 
_ip; } static void printFirst() { std::cout << _unittest_fail() << "┍ "; } template static void print(const T &x) { std::cout << x; } static void print(const std::type_info &x) { std::cout << x.name(); } static void print(const char *str) { const char *pos = 0; if (0 != (pos = std::strchr(str, '\n'))) { if (pos == str) { std::cout << '\n' << _unittest_fail() << "│ " << &str[1]; } else { char *left = strdup(str); left[pos - str] = '\0'; std::cout << left << '\n' << _unittest_fail() << "│ " << &pos[1]; free(left); } } else { std::cout << str; } } static void print(const char ch) { if (ch == '\n') { std::cout << '\n' << _unittest_fail() << "│ "; } else { std::cout << ch; } } static void print(bool b) { std::cout << (b ? "true" : "false"); } static void printLast() { std::cout << std::endl; _unit_test_global.status = false; //if (!_unit_test_global.plotFile.is_open()) { throw _UnitTest_Failure(); //} } void printPosition(const char *_file, int _line) { std::cout << "at " << _file << ':' << _line << " (0x" << std::hex << m_ip << std::dec << ')'; } template static inline void writePlotData(std::fstream &file, VC_ALIGNED_PARAMETER(T) a, VC_ALIGNED_PARAMETER(T) b); template static inline void printFuzzyInfo(VC_ALIGNED_PARAMETER(T) a, VC_ALIGNED_PARAMETER(T) b); template static inline void printFuzzyInfoImpl(VC_ALIGNED_PARAMETER(T) a, VC_ALIGNED_PARAMETER(T) b, double fuzzyness) { print("\ndistance: "); print(ulpDiffToReferenceSigned(a, b)); print(", allowed distance: "); print(fuzzyness); } const size_t m_ip; const bool m_failed; }; template inline void _UnitTest_Compare::printFuzzyInfo(VC_ALIGNED_PARAMETER(T), VC_ALIGNED_PARAMETER(T)) {} template<> inline void _UnitTest_Compare::printFuzzyInfo(VC_ALIGNED_PARAMETER(float) a, VC_ALIGNED_PARAMETER(float) b) { printFuzzyInfoImpl(a, b, _unit_test_global.float_fuzzyness); } template<> inline void _UnitTest_Compare::printFuzzyInfo(VC_ALIGNED_PARAMETER(double) a, VC_ALIGNED_PARAMETER(double) b) { printFuzzyInfoImpl(a, b, _unit_test_global.double_fuzzyness); } template<> inline void _UnitTest_Compare::printFuzzyInfo(VC_ALIGNED_PARAMETER(Vc::float_v) a, VC_ALIGNED_PARAMETER(Vc::float_v) b) { printFuzzyInfoImpl(a, b, _unit_test_global.float_fuzzyness); } template<> inline void _UnitTest_Compare::printFuzzyInfo(VC_ALIGNED_PARAMETER(Vc::double_v) a, VC_ALIGNED_PARAMETER(Vc::double_v) b) { printFuzzyInfoImpl(a, b, _unit_test_global.double_fuzzyness); } template<> inline void _UnitTest_Compare::printFuzzyInfo(VC_ALIGNED_PARAMETER(Vc::sfloat_v) a, VC_ALIGNED_PARAMETER(Vc::sfloat_v) b) { printFuzzyInfoImpl(a, b, _unit_test_global.float_fuzzyness); } template inline void _UnitTest_Compare::writePlotData(std::fstream &, VC_ALIGNED_PARAMETER(T), VC_ALIGNED_PARAMETER(T)) {} template<> inline void _UnitTest_Compare::writePlotData(std::fstream &file, VC_ALIGNED_PARAMETER(float) a, VC_ALIGNED_PARAMETER(float) b) { file << std::setprecision(12) << b << "\t" << ulpDiffToReferenceSigned(a, b) << "\n"; } template<> inline void _UnitTest_Compare::writePlotData(std::fstream &file, VC_ALIGNED_PARAMETER(double) a, VC_ALIGNED_PARAMETER(double) b) { file << std::setprecision(12) << b << "\t" << ulpDiffToReferenceSigned(a, b) << "\n"; } template<> inline void _UnitTest_Compare::writePlotData(std::fstream &file, VC_ALIGNED_PARAMETER(Vc::float_v) a, VC_ALIGNED_PARAMETER(Vc::float_v) b) { const Vc::float_v ref = b; const Vc::float_v dist = ulpDiffToReferenceSigned(a, b); for (int i = 0; i < Vc::float_v::Size; ++i) { file << std::setprecision(12) << ref[i] << "\t" << dist[i] << "\n"; 
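        // Each line of the plot file is "reference value <TAB> signed ULP
        // distance", matching the "# reference\tdistance" header written by
        // initTest() when --plotdist is given.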
    }
}
template <> inline void _UnitTest_Compare::writePlotData(std::fstream &file,
        VC_ALIGNED_PARAMETER(Vc::double_v) a, VC_ALIGNED_PARAMETER(Vc::double_v) b)
{
    const Vc::double_v ref = b;
    const Vc::double_v dist = ulpDiffToReferenceSigned(a, b);
    for (int i = 0; i < Vc::double_v::Size; ++i) {
        file << std::setprecision(12) << ref[i] << "\t" << dist[i] << "\n";
    }
}
template <> inline void _UnitTest_Compare::writePlotData(std::fstream &file,
        VC_ALIGNED_PARAMETER(Vc::sfloat_v) a, VC_ALIGNED_PARAMETER(Vc::sfloat_v) b)
{
    const Vc::sfloat_v ref = b;
    const Vc::sfloat_v dist = ulpDiffToReferenceSigned(a, b);
    for (int i = 0; i < Vc::sfloat_v::Size; ++i) {
        file << std::setprecision(12) << ref[i] << "\t" << dist[i] << "\n";
    }
}

// Workaround for clang: The "<< ' '" is only added to silence the warnings about unused return
// values.
#define FUZZY_COMPARE( a, b ) \
    _UnitTest_Compare(a, b, #a, #b, __FILE__, __LINE__, _UnitTest_Compare::Fuzzy) << ' '
#define COMPARE( a, b ) \
    _UnitTest_Compare(a, b, #a, #b, __FILE__, __LINE__) << ' '
#define COMPARE_NOEQ( a, b ) \
    _UnitTest_Compare(a, b, #a, #b, __FILE__, __LINE__, _UnitTest_Compare::NoEq) << ' '
#define VERIFY(cond) \
    _UnitTest_Compare(cond, #cond, __FILE__, __LINE__) << ' '
#define FAIL() \
    _UnitTest_Compare(__FILE__, __LINE__) << ' '

class ADD_PASS
{
    public:
        ADD_PASS() { ++_unit_test_global.passedTests; printPass(); }
        ~ADD_PASS() { std::cout << std::endl; }
        template <typename T> ADD_PASS &operator<<(const T &x) { std::cout << x; return *this; }
};

inline void unittest_assert(bool cond, const char *code, const char *file, int line)
{
    if (!cond) {
        if (_unit_test_global.expect_assert_failure) {
            ++_unit_test_global.assert_failure;
        } else {
            _UnitTest_Compare(file, line) << "assert(" << code << ") failed.";
        }
    }
}
#ifdef assert
#undef assert
#endif
#define assert(cond) unittest_assert(cond, #cond, __FILE__, __LINE__)

#define EXPECT_ASSERT_FAILURE(code) \
    _unit_test_global.expect_assert_failure = true; \
    _unit_test_global.assert_failure = 0; \
    code; \
    if (_unit_test_global.assert_failure == 0) { \
        /* failure expected but it didn't fail */ \
        std::cout << " " << #code << " at " << __FILE__ << ":" << __LINE__ << \
            " did not fail as was expected.\n"; \
        _unit_test_global.status = false; \
        throw _UnitTest_Failure(); \
        return; \
    } \
    _unit_test_global.expect_assert_failure = false

template <typename Vec> inline typename Vec::Mask allMasks(int i)
{
    typedef typename Vec::IndexType I;
    typedef typename Vec::Mask M;
    if (i == 0) {
        return M(true);
    }
    --i;
    if (i < Vec::Size) {
        return M(I(Vc::IndexesFromZero) == i);
    }
    i -= Vec::Size;
    if (Vec::Size < 3) { return M(false); }
    for (int a = 0; a < Vec::Size - 1; ++a)
    for (int b = a + 1; b < Vec::Size; ++b) {
        if (i == 0) {
            I indexes(Vc::IndexesFromZero);
            return M(indexes == a || indexes == b);
        }
        --i;
    }
    if (Vec::Size < 4) { return M(false); }
    for (int a = 0; a < Vec::Size - 1; ++a)
    for (int b = a + 1; b < Vec::Size; ++b)
    for (int c = b + 1; c < Vec::Size; ++c) {
        if (i == 0) {
            I indexes(Vc::IndexesFromZero);
            return M(indexes == a || indexes == b || indexes == c);
        }
        --i;
    }
    if (Vec::Size < 5) { return M(false); }
    for (int a = 0; a < Vec::Size - 1; ++a)
    for (int b = a + 1; b < Vec::Size; ++b)
    for (int c = b + 1; c < Vec::Size; ++c)
    for (int d = c + 1; d < Vec::Size; ++d) {
        if (i == 0) {
            I indexes(Vc::IndexesFromZero);
            return M(indexes == a || indexes == b || indexes == c || indexes == d);
        }
        --i;
    }
    if (Vec::Size < 6) { return M(false); }
    for (int a = 0; a < Vec::Size - 1; ++a)
    for (int b = a + 1; b < Vec::Size; ++b)
    for (int c = b + 1; c < Vec::Size; ++c)
    for (int d = c + 1; d < Vec::Size; ++d)
    for (int e = d + 1; e < Vec::Size; ++e) {
        if (i == 0) {
            I indexes(Vc::IndexesFromZero);
            return M(indexes == a || indexes == b || indexes == c || indexes == d || indexes == e);
        }
        --i;
    }
    if (Vec::Size < 7) { return M(false); }
    for (int a = 0; a < Vec::Size - 1; ++a)
    for (int b = a + 1; b < Vec::Size; ++b)
    for (int c = b + 1; c < Vec::Size; ++c)
    for (int d = c + 1; d < Vec::Size; ++d)
    for (int e = d + 1; e < Vec::Size; ++e)
    for (int f = e + 1; f < Vec::Size; ++f) {
        if (i == 0) {
            I indexes(Vc::IndexesFromZero);
            return M(indexes == a || indexes == b || indexes == c || indexes == d ||
                     indexes == e || indexes == f);
        }
        --i;
    }
    if (Vec::Size < 8) { return M(false); }
    for (int a = 0; a < Vec::Size - 1; ++a)
    for (int b = a + 1; b < Vec::Size; ++b)
    for (int c = b + 1; c < Vec::Size; ++c)
    for (int d = c + 1; d < Vec::Size; ++d)
    for (int e = d + 1; e < Vec::Size; ++e)
    for (int f = e + 1; f < Vec::Size; ++f)
    for (int g = f + 1; g < Vec::Size; ++g) {
        if (i == 0) {
            I indexes(Vc::IndexesFromZero);
            return M(indexes == a || indexes == b || indexes == c || indexes == d ||
                     indexes == e || indexes == f || indexes == g);
        }
        --i;
    }
    return M(false);
}

#define for_all_masks(VecType, _mask_) \
    for (int _Vc_for_all_masks_i = 0; _Vc_for_all_masks_i == 0; ++_Vc_for_all_masks_i) \
        for (typename VecType::Mask _mask_ = allMasks<VecType>(_Vc_for_all_masks_i++); \
                !_mask_.isEmpty(); _mask_ = allMasks<VecType>(_Vc_for_all_masks_i++))

#endif // UNITTEST_H
Vc-0.7.4/tests/utils.cpp000066400000000000000000000274501233512346000151070ustar00rootroot00000000000000
/*  This file is part of the Vc library.

    Copyright (C) 2009-2012 Matthias Kretz <kretz@kde.org>

    Vc is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as
    published by the Free Software Foundation, either version 3 of
    the License, or (at your option) any later version.

    Vc is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
    See the GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with Vc.  If not, see <http://www.gnu.org/licenses/>.
*/

#include "unittest.h"
#include <iostream>
#include "vectormemoryhelper.h"
#include <algorithm>

using namespace Vc;

template <typename Vec> void testSort()
{
    typedef typename Vec::IndexType IndexType;
    const IndexType _ref(IndexesFromZero);
    Vec ref(_ref);
    Vec a;
    int maxPerm = 1;
    for (int x = Vec::Size; x > 0; --x) {
        maxPerm *= x;
    }
    for (int perm = 0; perm < maxPerm; ++perm) {
        int rest = perm;
        for (int i = 0; i < Vec::Size; ++i) {
            a[i] = 0;
            for (int j = 0; j < i; ++j) {
                if (a[i] == a[j]) {
                    ++(a[i]);
                    j = -1;
                }
            }
            a[i] += rest % (Vec::Size - i);
            rest /= (Vec::Size - i);
            for (int j = 0; j < i; ++j) {
                if (a[i] == a[j]) {
                    ++(a[i]);
                    j = -1;
                }
            }
        }
        //std::cout << a << a.sorted() << std::endl;
        COMPARE(ref, a.sorted()) << ", a: " << a;
    }
    for (int repetition = 0; repetition < 1000; ++repetition) {
        Vec test = Vec::Random();
        Vc::Memory<Vec, Vec::Size> reference;
        reference.vector(0) = test;
        std::sort(&reference[0], &reference[Vec::Size]);
        ref = reference.vector(0);
        COMPARE(ref, test.sorted());
    }
}

template <typename T, typename Mem> struct Foo
{
    Foo() : i(0) {}
    void reset() { i = 0; }
    void operator()(T v) { d[i++] = v; }
    Mem d;
    int i;
};

template <typename V> void testCall()
{
    typedef typename V::EntryType T;
    typedef typename V::IndexType I;
    typedef typename V::Mask M;
    typedef typename I::Mask MI;
    const I _indexes(IndexesFromZero);
    const MI _odd = (_indexes & I(One)) > 0;
    const M odd(_odd);
    V a(_indexes);
    Foo<T, typename V::Memory> f;
    a.callWithValuesSorted(f);
    V b(f.d);
    COMPARE(b, a);
    f.reset();
    a(odd) -= 1;
    a.callWithValuesSorted(f);
    V c(f.d);
    for (int i = 0; i < V::Size / 2; ++i) {
        COMPARE(a[i * 2], c[i]);
    }
    for (int i = V::Size / 2; i < V::Size; ++i) {
        COMPARE(b[i], c[i]);
    }
}

template <typename V> void testForeachBit()
{
    typedef typename V::EntryType T;
    typedef typename V::IndexType I;
    const I indexes(IndexesFromZero);
    for_all_masks(V, mask) {
        V tmp = V::Zero();
        foreach_bit(int j, mask) {
            tmp[j] = T(1);
        }
        COMPARE(tmp == V::One(), mask);

        int count = 0;
        foreach_bit(int j, mask) {
            ++count;
            if (j >= 0) {
                continue;
            }
        }
        COMPARE(count, mask.count());

        count = 0;
        foreach_bit(int j, mask) {
            if (j >= 0) {
                break;
            }
            ++count;
        }
        COMPARE(count, 0);
    }
}

template <typename V> void copySign()
{
    V v(One);
    V positive(One);
    V negative = -positive;
    COMPARE(v, v.copySign(positive));
    COMPARE(-v, v.copySign(negative));
}

#ifdef _WIN32
void bzero(void *p, size_t n) { memset(p, 0, n); }
#else
#include <strings.h>
#endif

template <typename V> void Random()
{
    typedef typename V::EntryType T;
    enum {
        NBits = 3,
        NBins = 1 << NBits,                         //        short   int
        TotalBits = sizeof(T) * 8,                  //           16    32
        RightShift = TotalBits - NBits,             //           13    29
        NHistograms = TotalBits - NBits + 1,        //           14    30
        LeftShift = (RightShift + 1) / NHistograms, //            1     1
        Mean = 135791,
        MinGood = Mean - Mean/10,
        MaxGood = Mean + Mean/10
    };
    const V mask((1 << NBits) - 1);
    int histogram[NHistograms][NBins];
    bzero(&histogram[0][0], sizeof(histogram));
    for (size_t i = 0; i < NBins * Mean / V::Size; ++i) {
        const V rand = V::Random();
        for (size_t hist = 0; hist < NHistograms; ++hist) {
            const V bin = ((rand << (hist * LeftShift)) >> RightShift) & mask;
            for (size_t k = 0; k < V::Size; ++k) {
                ++histogram[hist][bin[k]];
            }
        }
    }
//#define PRINT_RANDOM_HISTOGRAM
#ifdef PRINT_RANDOM_HISTOGRAM
    for (size_t hist = 0; hist < NHistograms; ++hist) {
        std::cout << "histogram[" << std::setw(2) << hist << "]: ";
        for (size_t bin = 0; bin < NBins; ++bin) {
            std::cout << std::setw(3) << (histogram[hist][bin] - Mean) * 1000 / Mean << "|";
        }
        std::cout << std::endl;
    }
#endif
    for (size_t hist = 0; hist < NHistograms; ++hist) {
        for (size_t bin = 0; bin < NBins; ++bin) {
            VERIFY(histogram[hist][bin] > MinGood) << " bin = " << bin << " is " << histogram[0][bin];
            VERIFY(histogram[hist][bin] < MaxGood) << " bin = " << bin << " is " << histogram[0][bin];
        }
    }
}

template <typename V> void FloatRandom()
{
    typedef typename V::EntryType T;
    typedef typename V::IndexType I;
    enum {
        NBins = 64,
        NHistograms = 1,
        Mean = 135791,
        MinGood = Mean - Mean/10,
        MaxGood = Mean + Mean/10
    };
    int histogram[NHistograms][NBins];
    bzero(&histogram[0][0], sizeof(histogram));
    for (size_t i = 0; i < NBins * Mean / V::Size; ++i) {
        const V rand = V::Random();
        const I bin = static_cast<I>(rand * T(NBins));
        for (size_t k = 0; k < V::Size; ++k) {
            ++histogram[0][bin[k]];
        }
    }
#ifdef PRINT_RANDOM_HISTOGRAM
    for (size_t hist = 0; hist < NHistograms; ++hist) {
        std::cout << "histogram[" << std::setw(2) << hist << "]: ";
        for (size_t bin = 0; bin < NBins; ++bin) {
            std::cout << std::setw(3) << (histogram[hist][bin] - Mean) * 1000 / Mean << "|";
        }
        std::cout << std::endl;
    }
#endif
    for (size_t hist = 0; hist < NHistograms; ++hist) {
        for (size_t bin = 0; bin < NBins; ++bin) {
            VERIFY(histogram[hist][bin] > MinGood) << " bin = " << bin << " is " << histogram[0][bin];
            VERIFY(histogram[hist][bin] < MaxGood) << " bin = " << bin << " is " << histogram[0][bin];
        }
    }
}

template <> void Random<float_v>() { FloatRandom<float_v>(); }
template <> void Random<double_v>() { FloatRandom<double_v>(); }
template <> void Random<sfloat_v>() { FloatRandom<sfloat_v>(); }

template <typename T> T add2(T x) { return x + T(2); }

template <typename T, typename V> class CallTester
{
    public:
        CallTester() : v(Vc::Zero), i(0) {}
        void operator()(T x) { v[i] = x; ++i; }
        void reset() { v.setZero(); i = 0; }
        int callCount() const { return i; }
        V callValues() const { return v; }
    private:
        V v;
        int i;
};

#if __cplusplus >= 201103 && (!defined(VC_CLANG) || VC_CLANG > 0x30000)
#define DO_LAMBDA_TESTS 1
#endif

template <typename V> void applyAndCall()
{
    typedef typename V::EntryType T;
    const V two(T(2));
    for (int i = 0; i < 1000; ++i) {
        const V rand = V::Random();
        COMPARE(rand.apply(add2<T>), rand + two);
#ifdef DO_LAMBDA_TESTS
        COMPARE(rand.apply([](T x) { return x + T(2); }), rand + two);
#endif
        CallTester<T, V> callTester;
        rand.call(callTester);
        COMPARE(callTester.callCount(), int(V::Size));
        COMPARE(callTester.callValues(), rand);
        for_all_masks(V, mask) {
            V copy1 = rand;
            V copy2 = rand;
            copy1(mask) += two;
            COMPARE(copy2(mask).apply(add2<T>), copy1) << mask;
            COMPARE(rand.apply(add2<T>, mask), copy1) << mask;
#ifdef DO_LAMBDA_TESTS
            COMPARE(copy2(mask).apply([](T x) { return x + T(2); }), copy1) << mask;
            COMPARE(rand.apply([](T x) { return x + T(2); }, mask), copy1) << mask;
#endif
            callTester.reset();
            copy2(mask).call(callTester);
            COMPARE(callTester.callCount(), mask.count());

            callTester.reset();
            rand.call(callTester, mask);
            COMPARE(callTester.callCount(), mask.count());
        }
    }
}

template <typename T, int value> T returnConstant() { return T(value); }
template <typename T, int value> T returnConstantOffset(int i) { return T(value) + T(i); }
template <typename T, int value> T returnConstantOffset2(unsigned short i) { return T(value) + T(i); }

template <typename V> void fill()
{
    typedef typename V::EntryType T;
    typedef typename V::IndexType I;
    V test = V::Random();
    test.fill(returnConstant<T, 2>);
    COMPARE(test, V(T(2)));

    test = V::Random();
    test.fill(returnConstantOffset<T, 0>);
    COMPARE(test, static_cast<V>(I::IndexesFromZero()));

    test = V::Random();
    test.fill(returnConstantOffset2<T, 0>);
    COMPARE(test, static_cast<V>(I::IndexesFromZero()));
}

template <typename V> void shifted()
{
    typedef typename V::EntryType T;
    for (int shift = -2 * V::Size; shift <= 2 * V::Size; ++shift) {
        const V reference = V::Random();
        const V test = reference.shifted(shift);
        for (int i = 0; i < V::Size; ++i) {
            if (i + shift >= 0 && i + shift < V::Size) {
                COMPARE(test[i], reference[i + shift]) << "shift: " << shift << ", i: " << i
                    << ", test: " << test << ", reference: " << reference;
            }
            else {
                COMPARE(test[i], T(0)) << "shift: " << shift << ", i: " << i
                    << ", test: " << test << ", reference: " << reference;
            }
        }
    }
}

template <typename V> void rotated()
{
    for (int shift = -2 * V::Size; shift <= 2 * V::Size; ++shift) {
        //std::cout << "amount = " << shift % V::Size << std::endl;
        const V reference = V::Random();
        const V test = reference.rotated(shift);
        for (int i = 0; i < V::Size; ++i) {
            unsigned int refShift = i + shift;
            COMPARE(test[i], reference[refShift % V::Size]) << "shift: " << shift << ", i: " << i
                << ", test: " << test << ", reference: " << reference;
        }
    }
}

void testMallocAlignment()
{
    int_v *a = Vc::malloc<int_v, Vc::AlignOnVector>(10);

    unsigned long mask = VectorAlignment - 1;
    for (int i = 0; i < 10; ++i) {
        VERIFY((reinterpret_cast<unsigned long>(&a[i]) & mask) == 0);
    }
    const char *data = reinterpret_cast<const char *>(&a[0]);
    for (int i = 0; i < 10; ++i) {
        VERIFY(&data[i * int_v::Size * sizeof(int_v::EntryType)] == reinterpret_cast<const char *>(&a[i]));
    }

    a = Vc::malloc<int_v, Vc::AlignOnCacheline>(10);
    mask = CpuId::cacheLineSize() - 1;
    COMPARE((reinterpret_cast<unsigned long>(&a[0]) & mask), 0ul);

    // I don't know how to properly check page alignment. So we check for 4 KiB alignment as this
    // is the minimum page size on x86
    a = Vc::malloc<int_v, Vc::AlignOnPage>(10);
    mask = 4096 - 1;
    COMPARE((reinterpret_cast<unsigned long>(&a[0]) & mask), 0ul);
}

int main()
{
    testAllTypes(testCall);
    testAllTypes(testForeachBit);
    testAllTypes(testSort);
    testRealTypes(copySign);
    testAllTypes(shifted);
    testAllTypes(rotated);
    testAllTypes(Random);
    testAllTypes(applyAndCall);
    testAllTypes(fill);
    runTest(testMallocAlignment);
    return 0;
}
Vc-0.7.4/tests/vectormemoryhelper.h000066400000000000000000000025461233512346000173460ustar00rootroot00000000000000
/*  This file is part of the Vc library.

    Copyright (C) 2009 Matthias Kretz <kretz@kde.org>

    Vc is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as
    published by the Free Software Foundation, either version 3 of
    the License, or (at your option) any later version.

    Vc is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
    See the GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with Vc.  If not, see <http://www.gnu.org/licenses/>.

*/

#ifndef VECTORMEMORYHELPER_H
#define VECTORMEMORYHELPER_H

#include <Vc/Vc>

template <typename Vec> class VectorMemoryHelper
{
    char *const mem;
    char *const aligned;
    public:
        VectorMemoryHelper(int count)
            : mem(new char[count * sizeof(Vec) + Vc::VectorAlignment]),
            aligned(mem + (Vc::VectorAlignment -
                    (reinterpret_cast<unsigned long>(mem) & (Vc::VectorAlignment - 1))))
        {
        }
        ~VectorMemoryHelper() { delete[] mem; }

        operator typename Vec::EntryType *()
        {
            return reinterpret_cast<typename Vec::EntryType *>(aligned);
        }
};

#endif // VECTORMEMORYHELPER_H
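The sketch below is not part of the Vc 0.7.4 sources; it is a minimal, hypothetical test
translation unit showing how the pieces above are meant to be combined: VectorMemoryHelper
supplies suitably aligned scalar storage for a store/load round trip, and for_all_masks together
with COMPARE exercises write-masked assignment for every mask pattern produced by allMasks. The
test name testMaskedAssign and the checked values are illustrative assumptions, not existing Vc
test code.

#include "unittest.h"
#include "vectormemoryhelper.h"

using namespace Vc;

template <typename V> void testMaskedAssign()
{
    typedef typename V::EntryType T;
    VectorMemoryHelper<V> mem(1);        // one vector's worth of aligned scalar storage
    T *data = mem;
    const V reference(One);              // deterministic, non-zero reference values
    reference.store(data);               // store into the aligned buffer ...
    COMPARE(V(data), reference);         // ... loading it back must reproduce the values
    for_all_masks(V, mask) {             // iterate over the mask patterns from allMasks
        V copy = V::Zero();
        copy(mask) = reference;          // write-masked assignment
        COMPARE(copy == reference, mask) << "mask: " << mask;  // exactly the masked lanes match
    }
}

int main()
{
    testAllTypes(testMaskedAssign);
    return 0;
}

Because the reference vector holds ones and copy starts out as zero, the element-wise comparison
copy == reference reproduces the mask exactly, which keeps the check deterministic for every
vector type that testAllTypes instantiates.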